
Commit 8d81895

refactor: allow custom evaluator aggregation (#1396)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e262dcb commit 8d81895

7 files changed: 752 additions & 491 deletions

src/uipath/eval/evaluators/base_evaluator.py

Lines changed: 21 additions & 1 deletion
@@ -10,6 +10,7 @@
 from .._helpers.helpers import track_evaluation_metrics
 from ..models import AgentExecution, EvaluationResult
 from ..models.models import (
+    EvaluationResultDto,
     UiPathEvaluationError,
     UiPathEvaluationErrorCategory,
 )
@@ -141,7 +142,8 @@ def validate_model(cls, values: Any) -> Any:

         # Validate and create the config object if config dict is provided
         try:
-            validated_config = config_type.model_validate(values.get("config", {}))
+            raw_config = values.get("config") or values.get("evaluatorConfig") or {}
+            validated_config = config_type.model_validate(raw_config)
             values["evaluator_config"] = validated_config
         except Exception as e:
             raise UiPathEvaluationError(
@@ -553,6 +555,24 @@ def generate_json_type(cls) -> dict[str, Any]:
             "justificationSchema": cls.get_justification_schema(),
         }

+    def reduce_scores(self, results: list[EvaluationResultDto]) -> float:
+        """Reduce per-datapoint results into a single aggregated score.
+
+        Default implementation computes a simple average of scores. Subclasses
+        can override this to implement custom aggregation logic (e.g., precision,
+        recall) using the rich per-datapoint data in EvaluationResultDto.
+
+        Args:
+            results: List of per-datapoint results, each containing the score
+                and evaluation details/justification.
+
+        Returns:
+            The aggregated score.
+        """
+        if not results:
+            return 0.0
+        return sum(r.score for r in results) / len(results)
+
     @abstractmethod
     async def validate_and_evaluate_criteria(
         self, agent_execution: AgentExecution, evaluation_criteria: Any
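
The new reduce_scores hook is the extension point this refactor introduces: an evaluator receives the full list of per-datapoint EvaluationResultDto objects and decides how to roll them up. A minimal sketch of an override is shown below; the ToolPrecisionAggregation class name and the matched_calls/expected_calls keys read from details are hypothetical (the evaluator base class in base_evaluator.py is not shown in this diff), while the reduce_scores signature and the EvaluationResultDto fields come from this change.

    from uipath.eval.models import EvaluationResultDto  # exported by this commit


    class ToolPrecisionAggregation:
        # In practice this method would be defined on a subclass of the
        # evaluator base class from base_evaluator.py; it is shown standalone
        # here only to illustrate the override.
        def reduce_scores(self, results: list[EvaluationResultDto]) -> float:
            # Pull hypothetical per-datapoint counters out of `details` and
            # compute precision over the whole evaluation set instead of a
            # plain average of scores.
            matched = sum(
                r.details.get("matched_calls", 0)
                for r in results
                if isinstance(r.details, dict)
            )
            expected = sum(
                r.details.get("expected_calls", 0)
                for r in results
                if isinstance(r.details, dict)
            )
            return 100.0 * matched / expected if expected else 0.0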

src/uipath/eval/models/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@
     ErrorEvaluationResult,
     EvalItemResult,
     EvaluationResult,
+    EvaluationResultDto,
     EvaluatorType,
     LegacyEvaluatorCategory,
     LegacyEvaluatorType,
@@ -19,6 +20,7 @@
 __all__ = [
     "AgentExecution",
     "EvaluationResult",
+    "EvaluationResultDto",
     "LLMResponse",
     "LegacyEvaluatorCategory",
     "LegacyEvaluatorType",

src/uipath/eval/models/models.py

Lines changed: 53 additions & 1 deletion
@@ -6,7 +6,9 @@
 from typing import Annotated, Any, Literal, Union

 from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, model_serializer
+from pydantic.alias_generators import to_camel
+from pydantic_core import core_schema


 class AgentExecution(BaseModel):
@@ -71,6 +73,56 @@ class ErrorEvaluationResult(BaseEvaluationResult):
 ]


+class EvaluationResultDto(BaseModel):
+    """Serializable evaluation result used for aggregation and transport."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    score: float
+    details: str | dict[str, Any] | None = None
+    evaluation_time: float | None = None
+
+    @model_serializer(mode="wrap")
+    def serialize_model(
+        self,
+        serializer: core_schema.SerializerFunctionWrapHandler,
+        info: core_schema.SerializationInfo,
+    ) -> Any:
+        """Omit 'details' key from serialized output when it is None."""
+        data = serializer(self)
+        if self.details is None and isinstance(data, dict):
+            data.pop("details", None)
+        return data
+
+    @classmethod
+    def from_evaluation_result(
+        cls, evaluation_result: EvaluationResult
+    ) -> "EvaluationResultDto":
+        """Convert an EvaluationResult to a serializable DTO."""
+        score_type = evaluation_result.score_type
+        score: float
+        if score_type == ScoreType.BOOLEAN:
+            score = 100 if evaluation_result.score else 0
+        elif score_type == ScoreType.ERROR:
+            score = 0
+        else:
+            score = evaluation_result.score
+
+        # Convert BaseModel details to dict so Pydantic doesn't lose subclass fields
+        if isinstance(evaluation_result.details, BaseModel):
+            details: str | dict[str, Any] | None = (
+                evaluation_result.details.model_dump()
+            )
+        else:
+            details = evaluation_result.details
+
+        return cls(
+            score=score,
+            details=details,
+            evaluation_time=evaluation_result.evaluation_time,
+        )
+
+
 class EvalItemResult(BaseModel):
     """Result of a single evaluation item."""
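
Worth noting about the relocated DTO: from_evaluation_result normalizes boolean results to 100/0 and error results to 0, and the wrap serializer drops the details key when it is None. A small usage sketch of the serialization behavior (the score and timing values are made up):

    from uipath.eval.models import EvaluationResultDto

    dto = EvaluationResultDto(score=87.5, evaluation_time=1.2)  # details left as None

    # to_camel aliases the field names and the wrap serializer removes the
    # None `details` entry, so this should print:
    # {'score': 87.5, 'evaluationTime': 1.2}
    print(dto.model_dump(by_alias=True))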

src/uipath/eval/runtime/_types.py

Lines changed: 2 additions & 130 deletions
@@ -1,17 +1,13 @@
 import logging
-from collections import defaultdict
-from typing import Any

 from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict, model_serializer
+from pydantic import BaseModel, ConfigDict
 from pydantic.alias_generators import to_camel
-from pydantic_core import core_schema

 from uipath.runtime import UiPathRuntimeResult

 from ..models.models import (
-    EvaluationResult,
-    ScoreType,
+    EvaluationResultDto,
     TrajectoryEvaluationTrace,
 )

@@ -51,52 +47,6 @@ def convert_eval_execution_output_to_serializable(
     )


-class EvaluationResultDto(BaseModel):
-    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
-
-    score: float
-    details: str | dict[str, Any] | None = None
-    evaluation_time: float | None = None
-
-    @model_serializer(mode="wrap")
-    def serialize_model(
-        self,
-        serializer: core_schema.SerializerFunctionWrapHandler,
-        info: core_schema.SerializationInfo,
-    ) -> Any:
-        data = serializer(self)
-        if self.details is None and isinstance(data, dict):
-            data.pop("details", None)
-        return data
-
-    @classmethod
-    def from_evaluation_result(
-        cls, evaluation_result: EvaluationResult
-    ) -> "EvaluationResultDto":
-        score_type = evaluation_result.score_type
-        score: float
-        if score_type == ScoreType.BOOLEAN:
-            score = 100 if evaluation_result.score else 0
-        elif score_type == ScoreType.ERROR:
-            score = 0
-        else:
-            score = evaluation_result.score
-
-        # Convert BaseModel details to dict so Pydantic doesn't lose subclass fields
-        if isinstance(evaluation_result.details, BaseModel):
-            details: str | dict[str, Any] | None = (
-                evaluation_result.details.model_dump()
-            )
-        else:
-            details = evaluation_result.details
-
-        return cls(
-            score=score,
-            details=details,
-            evaluation_time=evaluation_result.evaluation_time,
-        )
-
-
 class UiPathEvalRunResultDto(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

@@ -138,81 +88,3 @@ def score(self) -> float:
             eval_result.score for eval_result in self.evaluation_set_results
         ]
         return sum(eval_item_scores) / len(eval_item_scores)
-
-    def calculate_final_score(
-        self,
-        evaluator_weights: dict[str, float] | None = None,
-        default_weight: float = 1.0,
-    ) -> tuple[float, dict[str, float]]:
-        """Aggregate evaluation results with deduplication and weighted scoring.
-
-        This function performs the following steps:
-        1. Flattens the nested evaluation_set_results structure
-        2. Deduplicates results by datapoint_id (evaluation_name) and evaluator_name (averages duplicates)
-        3. Calculates average score per evaluator across all datapoints
-        4. Computes final weighted score across evaluators
-
-        Args:
-            evaluator_weights: Optional dict mapping evaluator names to weights
-            default_weight: Default weight for evaluators not in evaluator_weights (default: 1.0)
-
-        Returns:
-            Tuple of (final_score, agg_metrics_per_evaluator)
-            - final_score: Weighted average across evaluators
-            - agg_metrics_per_evaluator: Dict mapping evaluator names to their average scores
-        """
-        if not self.evaluation_set_results:
-            return 0.0, {}
-
-        if evaluator_weights is None:
-            evaluator_weights = {}
-
-        # Step 1: Flatten the nested structure and group by datapoint_id and evaluator_name for deduplication
-        # datapoint_id = evaluation_name, evaluator_name from UiPathEvalRunResultDto
-        grouped_by_datapoint_evaluator: defaultdict[
-            str, defaultdict[str, list[float]]
-        ] = defaultdict(lambda: defaultdict(list))
-
-        for eval_run_result in self.evaluation_set_results:
-            datapoint_id = eval_run_result.evaluation_name
-            for eval_run_result_dto in eval_run_result.evaluation_run_results:
-                evaluator_name = eval_run_result_dto.evaluator_name
-                score = eval_run_result_dto.result.score
-                grouped_by_datapoint_evaluator[datapoint_id][evaluator_name].append(
-                    score
-                )
-
-        # Step 2: Deduplicate by averaging same evaluator results for same datapoint
-        dedup_scores: list[tuple[str, str, float]] = []
-        for datapoint_id, evaluators_dict in grouped_by_datapoint_evaluator.items():
-            for evaluator_name, scores_list in evaluators_dict.items():
-                if scores_list:
-                    # Average the scores for this evaluator on this datapoint
-                    avg_score = sum(scores_list) / len(scores_list)
-                    dedup_scores.append((datapoint_id, evaluator_name, avg_score))
-
-        # Step 3: Group by evaluator and calculate average score per evaluator
-        grouped_by_evaluator: defaultdict[str, list[float]] = defaultdict(list)
-        for _datapoint_id, evaluator_name, score in dedup_scores:
-            grouped_by_evaluator[evaluator_name].append(score)
-
-        agg_metrics_per_evaluator = {}
-        for evaluator_name, scores_list in grouped_by_evaluator.items():
-            avg_score = sum(scores_list) / len(scores_list)
-            agg_metrics_per_evaluator[evaluator_name] = avg_score
-
-        # Step 4: Calculate final weighted score
-        if not agg_metrics_per_evaluator:
-            return 0.0, {}
-
-        total_weighted_score = 0.0
-        total_weight = 0.0
-
-        for evaluator_name, avg_score in agg_metrics_per_evaluator.items():
-            weight = evaluator_weights.get(evaluator_name, default_weight)
-            total_weighted_score += avg_score * weight
-            total_weight += weight
-
-        final_score = total_weighted_score / total_weight if total_weight > 0 else 0.0
-
-        return final_score, agg_metrics_per_evaluator
