|
 import logging
-from collections import defaultdict
-from typing import Any

 from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict, model_serializer
+from pydantic import BaseModel, ConfigDict
 from pydantic.alias_generators import to_camel
-from pydantic_core import core_schema

 from uipath.runtime import UiPathRuntimeResult

 from ..models.models import (
-    EvaluationResult,
-    ScoreType,
+    EvaluationResultDto,
     TrajectoryEvaluationTrace,
 )

@@ -51,52 +47,6 @@ def convert_eval_execution_output_to_serializable( |
     )


-class EvaluationResultDto(BaseModel):
-    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
-
-    score: float
-    details: str | dict[str, Any] | None = None
-    evaluation_time: float | None = None
-
-    @model_serializer(mode="wrap")
-    def serialize_model(
-        self,
-        serializer: core_schema.SerializerFunctionWrapHandler,
-        info: core_schema.SerializationInfo,
-    ) -> Any:
-        data = serializer(self)
-        if self.details is None and isinstance(data, dict):
-            data.pop("details", None)
-        return data
-
-    @classmethod
-    def from_evaluation_result(
-        cls, evaluation_result: EvaluationResult
-    ) -> "EvaluationResultDto":
-        score_type = evaluation_result.score_type
-        score: float
-        if score_type == ScoreType.BOOLEAN:
-            score = 100 if evaluation_result.score else 0
-        elif score_type == ScoreType.ERROR:
-            score = 0
-        else:
-            score = evaluation_result.score
-
-        # Convert BaseModel details to dict so Pydantic doesn't lose subclass fields
-        if isinstance(evaluation_result.details, BaseModel):
-            details: str | dict[str, Any] | None = (
-                evaluation_result.details.model_dump()
-            )
-        else:
-            details = evaluation_result.details
-
-        return cls(
-            score=score,
-            details=details,
-            evaluation_time=evaluation_result.evaluation_time,
-        )
-
-
 class UiPathEvalRunResultDto(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

|
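The hunk above deletes the local `EvaluationResultDto`; with the import change at the top of the file, the DTO now comes from `..models.models` instead. For readers tracking what moved, the sketch below reproduces the score normalization the deleted `from_evaluation_result` applied (boolean results map to 100/0, errors to 0, numeric scores pass through). It is a minimal illustration, not the relocated implementation: the `ScoreType` members and `EvaluationResult` fields are stand-ins copied from the removed code.

```python
from enum import Enum

from pydantic import BaseModel


# Stand-in models mirroring only the fields the removed classmethod touched.
class ScoreType(str, Enum):
    BOOLEAN = "boolean"
    NUMERICAL = "numerical"
    ERROR = "error"


class EvaluationResult(BaseModel):
    score: float | bool
    score_type: ScoreType
    evaluation_time: float | None = None


def normalize_score(result: EvaluationResult) -> float:
    """Sketch of the mapping the removed from_evaluation_result performed."""
    if result.score_type == ScoreType.BOOLEAN:
        return 100.0 if result.score else 0.0  # pass/fail becomes 100 or 0
    if result.score_type == ScoreType.ERROR:
        return 0.0  # errored evaluations score 0
    return float(result.score)  # numeric scores pass through unchanged
```

For example, `normalize_score(EvaluationResult(score=True, score_type=ScoreType.BOOLEAN))` returns `100.0`.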
@@ -138,81 +88,3 @@ def score(self) -> float: |
             eval_result.score for eval_result in self.evaluation_set_results
         ]
         return sum(eval_item_scores) / len(eval_item_scores)
-
-    def calculate_final_score(
-        self,
-        evaluator_weights: dict[str, float] | None = None,
-        default_weight: float = 1.0,
-    ) -> tuple[float, dict[str, float]]:
-        """Aggregate evaluation results with deduplication and weighted scoring.
-
-        This function performs the following steps:
-        1. Flattens the nested evaluation_set_results structure
-        2. Deduplicates results by datapoint_id (evaluation_name) and evaluator_name (averages duplicates)
-        3. Calculates average score per evaluator across all datapoints
-        4. Computes final weighted score across evaluators
-
-        Args:
-            evaluator_weights: Optional dict mapping evaluator names to weights
-            default_weight: Default weight for evaluators not in evaluator_weights (default: 1.0)
-
-        Returns:
-            Tuple of (final_score, agg_metrics_per_evaluator)
-            - final_score: Weighted average across evaluators
-            - agg_metrics_per_evaluator: Dict mapping evaluator names to their average scores
-        """
-        if not self.evaluation_set_results:
-            return 0.0, {}
-
-        if evaluator_weights is None:
-            evaluator_weights = {}
-
-        # Step 1: Flatten the nested structure and group by datapoint_id and evaluator_name for deduplication
-        # datapoint_id = evaluation_name, evaluator_name from UiPathEvalRunResultDto
-        grouped_by_datapoint_evaluator: defaultdict[
-            str, defaultdict[str, list[float]]
-        ] = defaultdict(lambda: defaultdict(list))
-
-        for eval_run_result in self.evaluation_set_results:
-            datapoint_id = eval_run_result.evaluation_name
-            for eval_run_result_dto in eval_run_result.evaluation_run_results:
-                evaluator_name = eval_run_result_dto.evaluator_name
-                score = eval_run_result_dto.result.score
-                grouped_by_datapoint_evaluator[datapoint_id][evaluator_name].append(
-                    score
-                )
-
-        # Step 2: Deduplicate by averaging same evaluator results for same datapoint
-        dedup_scores: list[tuple[str, str, float]] = []
-        for datapoint_id, evaluators_dict in grouped_by_datapoint_evaluator.items():
-            for evaluator_name, scores_list in evaluators_dict.items():
-                if scores_list:
-                    # Average the scores for this evaluator on this datapoint
-                    avg_score = sum(scores_list) / len(scores_list)
-                    dedup_scores.append((datapoint_id, evaluator_name, avg_score))
-
-        # Step 3: Group by evaluator and calculate average score per evaluator
-        grouped_by_evaluator: defaultdict[str, list[float]] = defaultdict(list)
-        for _datapoint_id, evaluator_name, score in dedup_scores:
-            grouped_by_evaluator[evaluator_name].append(score)
-
-        agg_metrics_per_evaluator = {}
-        for evaluator_name, scores_list in grouped_by_evaluator.items():
-            avg_score = sum(scores_list) / len(scores_list)
-            agg_metrics_per_evaluator[evaluator_name] = avg_score
-
-        # Step 4: Calculate final weighted score
-        if not agg_metrics_per_evaluator:
-            return 0.0, {}
-
-        total_weighted_score = 0.0
-        total_weight = 0.0
-
-        for evaluator_name, avg_score in agg_metrics_per_evaluator.items():
-            weight = evaluator_weights.get(evaluator_name, default_weight)
-            total_weighted_score += avg_score * weight
-            total_weight += weight
-
-        final_score = total_weighted_score / total_weight if total_weight > 0 else 0.0
-
-        return final_score, agg_metrics_per_evaluator
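
The removed `calculate_final_score` performed four steps: flatten the nested results, average duplicate scores for the same (datapoint, evaluator) pair, average each evaluator across datapoints, then take a weighted mean across evaluators. If that aggregation is still needed after this change, the self-contained sketch below reproduces it over plain `(datapoint_id, evaluator_name, score)` tuples; the function name and tuple-based input are illustrative and are not part of the remaining API.

```python
from collections import defaultdict


def weighted_final_score(
    rows: list[tuple[str, str, float]],  # (datapoint_id, evaluator_name, score)
    evaluator_weights: dict[str, float] | None = None,
    default_weight: float = 1.0,
) -> tuple[float, dict[str, float]]:
    """Mirror of the removed aggregation: dedupe, per-evaluator average, weighted mean."""
    evaluator_weights = evaluator_weights or {}

    # Steps 1-2: collect and average duplicate scores for each (datapoint, evaluator) pair.
    per_pair: defaultdict[tuple[str, str], list[float]] = defaultdict(list)
    for datapoint_id, evaluator_name, score in rows:
        per_pair[(datapoint_id, evaluator_name)].append(score)

    # Step 3: average each evaluator's deduplicated scores across datapoints.
    per_evaluator: defaultdict[str, list[float]] = defaultdict(list)
    for (_datapoint_id, evaluator_name), scores in per_pair.items():
        per_evaluator[evaluator_name].append(sum(scores) / len(scores))
    agg_metrics = {name: sum(s) / len(s) for name, s in per_evaluator.items()}

    # Step 4: weighted mean across evaluators.
    if not agg_metrics:
        return 0.0, {}
    weighted_sum = sum(
        avg * evaluator_weights.get(name, default_weight) for name, avg in agg_metrics.items()
    )
    total_weight = sum(evaluator_weights.get(name, default_weight) for name in agg_metrics)
    final_score = weighted_sum / total_weight if total_weight > 0 else 0.0
    return final_score, agg_metrics
```

For example, with rows `[("dp1", "accuracy", 80.0), ("dp2", "accuracy", 100.0), ("dp1", "style", 60.0)]` and `evaluator_weights={"accuracy": 2.0}`, the per-evaluator averages are `{"accuracy": 90.0, "style": 60.0}` and the final score is `(90.0 * 2 + 60.0) / 3 = 80.0`.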