
Commit 8d81895

refactor: allow custom evaluator aggregation (#1396)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e262dcb commit 8d81895

7 files changed: 752 additions & 491 deletions

src/uipath/eval/evaluators/base_evaluator.py

Lines changed: 21 additions & 1 deletion
@@ -10,6 +10,7 @@
 from .._helpers.helpers import track_evaluation_metrics
 from ..models import AgentExecution, EvaluationResult
 from ..models.models import (
+    EvaluationResultDto,
     UiPathEvaluationError,
     UiPathEvaluationErrorCategory,
 )
@@ -141,7 +142,8 @@ def validate_model(cls, values: Any) -> Any:

         # Validate and create the config object if config dict is provided
         try:
-            validated_config = config_type.model_validate(values.get("config", {}))
+            raw_config = values.get("config") or values.get("evaluatorConfig") or {}
+            validated_config = config_type.model_validate(raw_config)
             values["evaluator_config"] = validated_config
         except Exception as e:
             raise UiPathEvaluationError(
@@ -553,6 +555,24 @@ def generate_json_type(cls) -> dict[str, Any]:
             "justificationSchema": cls.get_justification_schema(),
         }

+    def reduce_scores(self, results: list[EvaluationResultDto]) -> float:
+        """Reduce per-datapoint results into a single aggregated score.
+
+        Default implementation computes a simple average of scores. Subclasses
+        can override this to implement custom aggregation logic (e.g., precision,
+        recall) using the rich per-datapoint data in EvaluationResultDto.
+
+        Args:
+            results: List of per-datapoint results, each containing the score
+                and evaluation details/justification.
+
+        Returns:
+            The aggregated score.
+        """
+        if not results:
+            return 0.0
+        return sum(r.score for r in results) / len(results)
+
     @abstractmethod
     async def validate_and_evaluate_criteria(
         self, agent_execution: AgentExecution, evaluation_criteria: Any
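
The new reduce_scores hook is the extension point this refactor introduces: an evaluator receives the full list of per-datapoint EvaluationResultDto objects and decides how to roll them up. A minimal sketch of an override is shown below; the ToolPrecisionAggregation class name and the matched_calls/expected_calls keys read from details are hypothetical (the evaluator base class in base_evaluator.py is not shown in this diff), while the reduce_scores signature and the EvaluationResultDto fields come from this change.

    from uipath.eval.models import EvaluationResultDto  # exported by this commit


    class ToolPrecisionAggregation:
        # In practice this method would be defined on a subclass of the
        # evaluator base class from base_evaluator.py; it is shown standalone
        # here only to illustrate the override.
        def reduce_scores(self, results: list[EvaluationResultDto]) -> float:
            # Pull hypothetical per-datapoint counters out of `details` and
            # compute precision over the whole evaluation set instead of a
            # plain average of scores.
            matched = sum(
                r.details.get("matched_calls", 0)
                for r in results
                if isinstance(r.details, dict)
            )
            expected = sum(
                r.details.get("expected_calls", 0)
                for r in results
                if isinstance(r.details, dict)
            )
            return 100.0 * matched / expected if expected else 0.0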

src/uipath/eval/models/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@
     ErrorEvaluationResult,
     EvalItemResult,
     EvaluationResult,
+    EvaluationResultDto,
     EvaluatorType,
     LegacyEvaluatorCategory,
     LegacyEvaluatorType,
@@ -19,6 +20,7 @@
 __all__ = [
     "AgentExecution",
     "EvaluationResult",
+    "EvaluationResultDto",
     "LLMResponse",
     "LegacyEvaluatorCategory",
     "LegacyEvaluatorType",

src/uipath/eval/models/models.py

Lines changed: 53 additions & 1 deletion
@@ -6,7 +6,9 @@
 from typing import Annotated, Any, Literal, Union

 from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, model_serializer
+from pydantic.alias_generators import to_camel
+from pydantic_core import core_schema


 class AgentExecution(BaseModel):
@@ -71,6 +73,56 @@ class ErrorEvaluationResult(BaseEvaluationResult):
 ]


+class EvaluationResultDto(BaseModel):
+    """Serializable evaluation result used for aggregation and transport."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    score: float
+    details: str | dict[str, Any] | None = None
+    evaluation_time: float | None = None
+
+    @model_serializer(mode="wrap")
+    def serialize_model(
+        self,
+        serializer: core_schema.SerializerFunctionWrapHandler,
+        info: core_schema.SerializationInfo,
+    ) -> Any:
+        """Omit 'details' key from serialized output when it is None."""
+        data = serializer(self)
+        if self.details is None and isinstance(data, dict):
+            data.pop("details", None)
+        return data
+
+    @classmethod
+    def from_evaluation_result(
+        cls, evaluation_result: EvaluationResult
+    ) -> "EvaluationResultDto":
+        """Convert an EvaluationResult to a serializable DTO."""
+        score_type = evaluation_result.score_type
+        score: float
+        if score_type == ScoreType.BOOLEAN:
+            score = 100 if evaluation_result.score else 0
+        elif score_type == ScoreType.ERROR:
+            score = 0
+        else:
+            score = evaluation_result.score
+
+        # Convert BaseModel details to dict so Pydantic doesn't lose subclass fields
+        if isinstance(evaluation_result.details, BaseModel):
+            details: str | dict[str, Any] | None = (
+                evaluation_result.details.model_dump()
+            )
+        else:
+            details = evaluation_result.details
+
+        return cls(
+            score=score,
+            details=details,
+            evaluation_time=evaluation_result.evaluation_time,
+        )
+
+
 class EvalItemResult(BaseModel):
     """Result of a single evaluation item."""
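
Worth noting about the relocated DTO: from_evaluation_result normalizes boolean results to 100/0 and error results to 0, and the wrap serializer drops the details key when it is None. A small usage sketch of the serialization behavior (the score and timing values are made up):

    from uipath.eval.models import EvaluationResultDto

    dto = EvaluationResultDto(score=87.5, evaluation_time=1.2)  # details left as None

    # to_camel aliases the field names and the wrap serializer removes the
    # None `details` entry, so this should print:
    # {'score': 87.5, 'evaluationTime': 1.2}
    print(dto.model_dump(by_alias=True))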

src/uipath/eval/runtime/_types.py

Lines changed: 2 additions & 130 deletions
@@ -1,17 +1,13 @@
 import logging
-from collections import defaultdict
-from typing import Any

 from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict, model_serializer
+from pydantic import BaseModel, ConfigDict
 from pydantic.alias_generators import to_camel
-from pydantic_core import core_schema

 from uipath.runtime import UiPathRuntimeResult

 from ..models.models import (
-    EvaluationResult,
-    ScoreType,
+    EvaluationResultDto,
     TrajectoryEvaluationTrace,
 )

@@ -51,52 +47,6 @@ def convert_eval_execution_output_to_serializable(
     )


-class EvaluationResultDto(BaseModel):
-    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
-
-    score: float
-    details: str | dict[str, Any] | None = None
-    evaluation_time: float | None = None
-
-    @model_serializer(mode="wrap")
-    def serialize_model(
-        self,
-        serializer: core_schema.SerializerFunctionWrapHandler,
-        info: core_schema.SerializationInfo,
-    ) -> Any:
-        data = serializer(self)
-        if self.details is None and isinstance(data, dict):
-            data.pop("details", None)
-        return data
-
-    @classmethod
-    def from_evaluation_result(
-        cls, evaluation_result: EvaluationResult
-    ) -> "EvaluationResultDto":
-        score_type = evaluation_result.score_type
-        score: float
-        if score_type == ScoreType.BOOLEAN:
-            score = 100 if evaluation_result.score else 0
-        elif score_type == ScoreType.ERROR:
-            score = 0
-        else:
-            score = evaluation_result.score
-
-        # Convert BaseModel details to dict so Pydantic doesn't lose subclass fields
-        if isinstance(evaluation_result.details, BaseModel):
-            details: str | dict[str, Any] | None = (
-                evaluation_result.details.model_dump()
-            )
-        else:
-            details = evaluation_result.details
-
-        return cls(
-            score=score,
-            details=details,
-            evaluation_time=evaluation_result.evaluation_time,
-        )
-
-
 class UiPathEvalRunResultDto(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

@@ -138,81 +88,3 @@ def score(self) -> float:
             eval_result.score for eval_result in self.evaluation_set_results
         ]
         return sum(eval_item_scores) / len(eval_item_scores)
-
-    def calculate_final_score(
-        self,
-        evaluator_weights: dict[str, float] | None = None,
-        default_weight: float = 1.0,
-    ) -> tuple[float, dict[str, float]]:
-        """Aggregate evaluation results with deduplication and weighted scoring.
-
-        This function performs the following steps:
-        1. Flattens the nested evaluation_set_results structure
-        2. Deduplicates results by datapoint_id (evaluation_name) and evaluator_name (averages duplicates)
-        3. Calculates average score per evaluator across all datapoints
-        4. Computes final weighted score across evaluators
-
-        Args:
-            evaluator_weights: Optional dict mapping evaluator names to weights
-            default_weight: Default weight for evaluators not in evaluator_weights (default: 1.0)
-
-        Returns:
-            Tuple of (final_score, agg_metrics_per_evaluator)
-            - final_score: Weighted average across evaluators
-            - agg_metrics_per_evaluator: Dict mapping evaluator names to their average scores
-        """
-        if not self.evaluation_set_results:
-            return 0.0, {}
-
-        if evaluator_weights is None:
-            evaluator_weights = {}
-
-        # Step 1: Flatten the nested structure and group by datapoint_id and evaluator_name for deduplication
-        # datapoint_id = evaluation_name, evaluator_name from UiPathEvalRunResultDto
-        grouped_by_datapoint_evaluator: defaultdict[
-            str, defaultdict[str, list[float]]
-        ] = defaultdict(lambda: defaultdict(list))
-
-        for eval_run_result in self.evaluation_set_results:
-            datapoint_id = eval_run_result.evaluation_name
-            for eval_run_result_dto in eval_run_result.evaluation_run_results:
-                evaluator_name = eval_run_result_dto.evaluator_name
-                score = eval_run_result_dto.result.score
-                grouped_by_datapoint_evaluator[datapoint_id][evaluator_name].append(
-                    score
-                )
-
-        # Step 2: Deduplicate by averaging same evaluator results for same datapoint
-        dedup_scores: list[tuple[str, str, float]] = []
-        for datapoint_id, evaluators_dict in grouped_by_datapoint_evaluator.items():
-            for evaluator_name, scores_list in evaluators_dict.items():
-                if scores_list:
-                    # Average the scores for this evaluator on this datapoint
-                    avg_score = sum(scores_list) / len(scores_list)
-                    dedup_scores.append((datapoint_id, evaluator_name, avg_score))
-
-        # Step 3: Group by evaluator and calculate average score per evaluator
-        grouped_by_evaluator: defaultdict[str, list[float]] = defaultdict(list)
-        for _datapoint_id, evaluator_name, score in dedup_scores:
-            grouped_by_evaluator[evaluator_name].append(score)
-
-        agg_metrics_per_evaluator = {}
-        for evaluator_name, scores_list in grouped_by_evaluator.items():
-            avg_score = sum(scores_list) / len(scores_list)
-            agg_metrics_per_evaluator[evaluator_name] = avg_score
-
-        # Step 4: Calculate final weighted score
-        if not agg_metrics_per_evaluator:
-            return 0.0, {}
-
-        total_weighted_score = 0.0
-        total_weight = 0.0
-
-        for evaluator_name, avg_score in agg_metrics_per_evaluator.items():
-            weight = evaluator_weights.get(evaluator_name, default_weight)
-            total_weighted_score += avg_score * weight
-            total_weight += weight
-
-        final_score = total_weighted_score / total_weight if total_weight > 0 else 0.0
-
-        return final_score, agg_metrics_per_evaluator
