
Commit be12b16

Merge pull request #618 from UiPath/akshaya/evaluation_definitions
feat(EvalSchema): updating eval schema
2 parents e8417d9 + a32321b commit be12b16

11 files changed

Lines changed: 425 additions & 91 deletions


pyproject.toml

Lines changed: 3 additions & 1 deletion
@@ -72,7 +72,9 @@ dev = [
 
 [tool.hatch.build.targets.wheel]
 packages = ["src/uipath"]
-include = ["src/uipath/_resources"]
+include = [
+    "src/uipath/_resources"
+]
 
 [tool.ruff]
 line-length = 88
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+{
+  "fileName": "default.json",
+  "id": "default-eval-set-id",
+  "name": "Basic Calculator Evaluation Set",
+  "batchSize": 10,
+  "evaluatorRefs": [
+    "equality"
+  ],
+  "evaluations": [
+    {
+      "id": "test",
+      "name": "Test Addition",
+      "inputs": {"a": 1, "b": 1, "operator": "+"},
+      "expectedOutput": {"result": 2},
+      "simulationInstructions": "",
+      "expectedAgentBehavior": "",
+      "simulateInput": false,
+      "inputGenerationInstructions": "",
+      "simulateTools": false,
+      "toolsToSimulate": [],
+      "evalSetId": "default-eval-set-id",
+      "createdAt": "2025-09-04T18:54:58.378Z",
+      "updatedAt": "2025-09-04T18:55:55.416Z"
+    }
+  ],
+  "modelSettings": [],
+  "createdAt": "2025-09-04T18:54:58.379Z",
+  "updatedAt": "2025-09-04T18:55:55.416Z"
+}
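
For orientation: this evaluation set document has exactly the shape the updated EvaluationSet model in src/uipath/_cli/_evals/_models/_evaluation_set.py (further down in this diff) parses, with camelCase keys mapped through field aliases. A minimal loading sketch, assuming only standard pydantic v2 behavior; the on-disk location of default.json is not shown in this diff:

import json

from uipath._cli._evals._models._evaluation_set import EvaluationSet

# Path is illustrative; the diff does not show where default.json is packaged.
with open("default.json") as f:
    eval_set = EvaluationSet.model_validate(json.load(f))

print(eval_set.batch_size)      # 10, populated from the "batchSize" alias
print(eval_set.evaluator_refs)  # ['equality'], via the to_camel alias generator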
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+{
+  "fileName": "equality.json",
+  "id": "equality",
+  "name": "Equality Evaluator",
+  "description": "An evaluator that judges the agent based on expected output.",
+  "category": 0,
+  "type": 1,
+  "prompt": "",
+  "model": "same-as-agent",
+  "targetOutputKey": "*",
+  "createdAt": "2025-06-26T17:45:39.651Z",
+  "updatedAt": "2025-06-26T17:45:39.651Z"
+}
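
This evaluator definition exercises the new discriminated union in _evaluator.py (the last file in this diff): its category/type pair routes it to EqualsEvaluatorParams, assuming category 0 corresponds to EvaluatorCategory.Deterministic and type 1 to EvaluatorType.Equals (the enum values themselves are defined outside this diff). A sketch:

from pydantic import TypeAdapter

from uipath._cli._evals._models._evaluator import EqualsEvaluatorParams, Evaluator

definition = {
    "fileName": "equality.json",
    "id": "equality",
    "name": "Equality Evaluator",
    "description": "An evaluator that judges the agent based on expected output.",
    "category": 0,
    "type": 1,
    "prompt": "",
    "model": "same-as-agent",
    "targetOutputKey": "*",
    "createdAt": "2025-06-26T17:45:39.651Z",
    "updatedAt": "2025-06-26T17:45:39.651Z",
}

params = TypeAdapter(Evaluator).validate_python(definition)
assert isinstance(params, EqualsEvaluatorParams)
# extra="allow" keeps "prompt" and "model" around even though the equality
# params class declares no such fields; "same-as-agent" is harmless here
# because only the LLM and trajectory factory paths validate the model field.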

samples/calculator/main.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ class CalculatorOutput:
 # use InputTriggerEventArgs when called by UiPath EventTriggers
 @traced()
 def main(input: CalculatorInput) -> CalculatorOutput:
-    result = 0
+    result = 0.0
     match input.operator:
         case Operator.ADD: result = input.a + input.b
         case Operator.SUBTRACT: result = input.a - input.b
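
A note on why this one-character change matters: result previously started as the int 0, so a type checker infers int and can flag the division branch (which always yields float in Python 3) or the assignment into a float-annotated CalculatorOutput.result. Starting from 0.0 keeps every branch on float. A standalone illustration of the inference difference, not code from the PR:

def buggy() -> float:
    result = 0        # mypy infers int here
    result = 1 / 2    # error: incompatible assignment (float into int)
    return result

def fixed() -> float:
    result = 0.0      # result is float from the start
    result = 1 / 2    # fine; every branch stays float
    return result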
Lines changed: 35 additions & 73 deletions
@@ -1,5 +1,14 @@
 from typing import Any, Dict
 
+from pydantic import TypeAdapter
+
+from uipath._cli._evals._models._evaluator import (
+    EqualsEvaluatorParams,
+    Evaluator,
+    JsonSimilarityEvaluatorParams,
+    LLMEvaluatorParams,
+    TrajectoryEvaluatorParams,
+)
 from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams
 from uipath.eval.evaluators import (
     BaseEvaluator,
@@ -8,7 +17,6 @@
     LlmAsAJudgeEvaluator,
     TrajectoryEvaluator,
 )
-from uipath.eval.models.models import EvaluatorCategory, EvaluatorType
 
 
 class EvaluatorFactory:
@@ -35,110 +43,64 @@ def create_evaluator(cls, data: Dict[str, Any]) -> BaseEvaluator[Any]:
         if not id:
             raise ValueError("Evaluator configuration must include 'id' field")
 
-        category = EvaluatorCategory.from_int(data.get("category"))
-        evaluator_type = EvaluatorType.from_int(data.get("type", EvaluatorType.Unknown))
-        description = data.get("description", "")
-        created_at = data.get("createdAt", "")
-        updated_at = data.get("updatedAt", "")
-        target_output_key = data.get("targetOutputKey", "")
-
-        # Create base parameters
-        base_params = EvaluatorBaseParams(
-            id=id,
-            category=category,
-            evaluator_type=evaluator_type,
-            name=name,
-            description=description,
-            created_at=created_at,
-            updated_at=updated_at,
-            target_output_key=target_output_key,
-        )
-
-        match category:
-            case EvaluatorCategory.Deterministic:
-                if evaluator_type == evaluator_type.Equals:
-                    return EvaluatorFactory._create_exact_match_evaluator(
-                        base_params, data
-                    )
-                elif evaluator_type == evaluator_type.JsonSimilarity:
-                    return EvaluatorFactory._create_json_similarity_evaluator(
-                        base_params, data
-                    )
-                else:
-                    raise ValueError(
-                        f"Unknown evaluator type {evaluator_type} for category {category}"
-                    )
-            case EvaluatorCategory.LlmAsAJudge:
-                return EvaluatorFactory._create_llm_as_judge_evaluator(
-                    base_params, data
-                )
-            case EvaluatorCategory.AgentScorer:
-                raise NotImplementedError()
-            case EvaluatorCategory.Trajectory:
-                return EvaluatorFactory._create_trajectory_evaluator(base_params, data)
+        params: EvaluatorBaseParams = TypeAdapter(Evaluator).validate_python(data)
+
+        match params:
+            case EqualsEvaluatorParams():
+                return EvaluatorFactory._create_exact_match_evaluator(params)
+            case JsonSimilarityEvaluatorParams():
+                return EvaluatorFactory._create_json_similarity_evaluator(params)
+            case LLMEvaluatorParams():
+                return EvaluatorFactory._create_llm_as_judge_evaluator(params)
+            case TrajectoryEvaluatorParams():
+                return EvaluatorFactory._create_trajectory_evaluator(params)
             case _:
-                raise ValueError(f"Unknown evaluator category: {category}")
+                raise ValueError(f"Unknown evaluator category: {params}")
 
     @staticmethod
     def _create_exact_match_evaluator(
-        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+        params: EqualsEvaluatorParams,
     ) -> ExactMatchEvaluator:
         """Create a deterministic evaluator."""
-        return ExactMatchEvaluator(
-            **base_params.model_dump(),
-        )
+        return ExactMatchEvaluator(**params.model_dump())
 
     @staticmethod
     def _create_json_similarity_evaluator(
-        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+        params: JsonSimilarityEvaluatorParams,
     ) -> JsonSimilarityEvaluator:
         """Create a deterministic evaluator."""
-        return JsonSimilarityEvaluator(
-            **base_params.model_dump(),
-        )
+        return JsonSimilarityEvaluator(**params.model_dump())
 
     @staticmethod
     def _create_llm_as_judge_evaluator(
-        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+        params: LLMEvaluatorParams,
     ) -> LlmAsAJudgeEvaluator:
         """Create an LLM-as-a-judge evaluator."""
-        prompt = data.get("prompt", "")
-        if not prompt:
+        if not params.prompt:
             raise ValueError("LLM evaluator must include 'prompt' field")
 
-        model = data.get("model", "")
-        if not model:
+        if not params.model:
             raise ValueError("LLM evaluator must include 'model' field")
-        if model == "same-as-agent":
+        if params.model == "same-as-agent":
             raise ValueError(
                 "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
             )
 
-        return LlmAsAJudgeEvaluator(
-            **base_params.model_dump(),
-            prompt=prompt,
-            model=model,
-        )
+        return LlmAsAJudgeEvaluator(**params.model_dump())
 
     @staticmethod
     def _create_trajectory_evaluator(
-        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+        params: TrajectoryEvaluatorParams,
     ) -> TrajectoryEvaluator:
         """Create a trajectory evaluator."""
-        prompt = data.get("prompt", "")
-        if not prompt:
+        if not params.prompt:
             raise ValueError("Trajectory evaluator must include 'prompt' field")
 
-        model = data.get("model", "")
-        if not model:
+        if not params.model:
             raise ValueError("LLM evaluator must include 'model' field")
-        if model == "same-as-agent":
+        if params.model == "same-as-agent":
             raise ValueError(
                 "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
             )
 
-        return TrajectoryEvaluator(
-            **base_params.model_dump(),
-            prompt=prompt,
-            model=model,
-        )
+        return TrajectoryEvaluator(**params.model_dump())
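
The net effect of the refactor: create_evaluator no longer hand-extracts fields into EvaluatorBaseParams; TypeAdapter(Evaluator) validates the raw dict and the match statement dispatches on the resulting params class. A usage sketch feeding it the equality.json definition shown earlier; the import path of EvaluatorFactory is an assumption, since the diff header does not name this file:

import json

# Assumed module path for the factory; not shown in this diff.
from uipath._cli._evals._evaluator_factory import EvaluatorFactory

with open("equality.json") as f:
    config = json.load(f)

evaluator = EvaluatorFactory.create_evaluator(config)
# category 0 / type 1 tags the payload as EqualsEvaluatorParams, so this
# returns an ExactMatchEvaluator built from params.model_dump().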

src/uipath/_cli/_evals/_models/_evaluation_set.py

Lines changed: 25 additions & 15 deletions
@@ -5,6 +5,10 @@
 from pydantic.alias_generators import to_camel
 
 
+class EvaluationSimulationTool(BaseModel):
+    name: str = Field(..., alias="name")
+
+
 class EvaluationItem(BaseModel):
     """Individual evaluation item within an evaluation set."""
 
@@ -14,15 +18,19 @@ class EvaluationItem(BaseModel):
     name: str
     inputs: Dict[str, Any]
     expected_output: Dict[str, Any]
-    expected_agent_behavior: str = ""
-    simulation_instructions: str = ""
-    simulate_input: bool = False
-    input_generation_instructions: str = ""
-    simulate_tools: bool = False
-    tools_to_simulate: List[str] = Field(default_factory=list)
-    eval_set_id: str
-    created_at: str
-    updated_at: str
+    expected_agent_behavior: str = Field(default="", alias="expectedAgentBehavior")
+    simulation_instructions: str = Field(default="", alias="simulationInstructions")
+    simulate_input: bool = Field(default=False, alias="simulateInput")
+    input_generation_instructions: str = Field(
+        default="", alias="inputGenerationInstructions"
+    )
+    simulate_tools: bool = Field(default=False, alias="simulateTools")
+    tools_to_simulate: List[EvaluationSimulationTool] = Field(
+        default_factory=list, alias="toolsToSimulate"
+    )
+    eval_set_id: str = Field(alias="evalSetId")
+    created_at: str = Field(alias="createdAt")
+    updated_at: str = Field(alias="updatedAt")
 
 
 class EvaluationSet(BaseModel):
@@ -31,15 +39,17 @@ class EvaluationSet(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
 
     id: str
-    file_name: str
+    file_name: str = Field(..., alias="fileName")
     evaluator_refs: List[str] = Field(default_factory=list)
     evaluations: List[EvaluationItem] = Field(default_factory=list)
     name: str
-    batch_size: int = 10
-    timeout_minutes: int = 20
-    model_settings: List[Dict[str, Any]] = Field(default_factory=list)
-    created_at: str
-    updated_at: str
+    batch_size: int = Field(10, alias="batchSize")
+    timeout_minutes: int = Field(default=20, alias="timeoutMinutes")
+    model_settings: List[Dict[str, Any]] = Field(
+        default_factory=list, alias="modelSettings"
+    )
+    created_at: str = Field(alias="createdAt")
+    updated_at: str = Field(alias="updatedAt")
 
     def extract_selected_evals(self, eval_ids) -> None:
         selected_evals: list[EvaluationItem] = []
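
Because every field now carries an explicit alias, and populate_by_name=True remains in model_config, the same models accept both the camelCase JSON documents above and snake_case keyword construction in code. The tools_to_simulate change is also a schema change, not just a rename: entries are now objects with a name key rather than bare strings. A sketch, where the "calculator" tool name is invented for illustration:

from uipath._cli._evals._models._evaluation_set import (
    EvaluationItem,
    EvaluationSimulationTool,
)

item = EvaluationItem.model_validate({
    "id": "test",
    "name": "Test Addition",
    "inputs": {"a": 1, "b": 1, "operator": "+"},
    "expectedOutput": {"result": 2},
    "evalSetId": "default-eval-set-id",
    "createdAt": "2025-09-04T18:54:58.378Z",
    "updatedAt": "2025-09-04T18:55:55.416Z",
    "toolsToSimulate": [{"name": "calculator"}],  # objects, not strings
})

assert isinstance(item.tools_to_simulate[0], EvaluationSimulationTool)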
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+from typing import Annotated, Any, Literal, Union
+
+from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
+
+from uipath.eval.models.models import EvaluatorCategory, EvaluatorType
+
+
+class EvaluatorBaseParams(BaseModel):
+    """Parameters for initializing the base evaluator."""
+
+    id: str
+    name: str
+    description: str
+    evaluator_type: EvaluatorType = Field(..., alias="type")
+    created_at: str = Field(..., alias="createdAt")
+    updated_at: str = Field(..., alias="updatedAt")
+    target_output_key: str = Field(..., alias="targetOutputKey")
+    file_name: str = Field(..., alias="fileName")
+
+
+class LLMEvaluatorParams(EvaluatorBaseParams):
+    category: Literal[EvaluatorCategory.LlmAsAJudge] = Field(..., alias="category")
+    prompt: str = Field(..., alias="prompt")
+    model: str = Field(..., alias="model")
+
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class TrajectoryEvaluatorParams(EvaluatorBaseParams):
+    category: Literal[EvaluatorCategory.Trajectory] = Field(..., alias="category")
+    prompt: str = Field(..., alias="prompt")
+    model: str = Field(..., alias="model")
+
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class EqualsEvaluatorParams(EvaluatorBaseParams):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class JsonSimilarityEvaluatorParams(EvaluatorBaseParams):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class UnknownEvaluatorParams(EvaluatorBaseParams):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+def evaluator_discriminator(data: Any) -> str:
+    if isinstance(data, dict):
+        category = data.get("category")
+        evaluator_type = data.get("type")
+        match category:
+            case EvaluatorCategory.LlmAsAJudge:
+                return "LLMEvaluatorParams"
+            case EvaluatorCategory.Trajectory:
+                return "TrajectoryEvaluatorParams"
+            case EvaluatorCategory.Deterministic:
+                match evaluator_type:
+                    case EvaluatorType.Equals:
+                        return "EqualsEvaluatorParams"
+                    case EvaluatorType.JsonSimilarity:
+                        return "JsonSimilarityEvaluatorParams"
+                    case _:
+                        return "UnknownEvaluatorParams"
+            case _:
+                return "UnknownEvaluatorParams"
+    else:
+        return "UnknownEvaluatorParams"
+
+
+Evaluator = Annotated[
+    Union[
+        Annotated[
+            LLMEvaluatorParams,
+            Tag("LLMEvaluatorParams"),
+        ],
+        Annotated[
+            TrajectoryEvaluatorParams,
+            Tag("TrajectoryEvaluatorParams"),
+        ],
+        Annotated[
+            EqualsEvaluatorParams,
+            Tag("EqualsEvaluatorParams"),
+        ],
+        Annotated[
+            JsonSimilarityEvaluatorParams,
+            Tag("JsonSimilarityEvaluatorParams"),
+        ],
+        Annotated[
+            UnknownEvaluatorParams,
+            Tag("UnknownEvaluatorParams"),
+        ],
+    ],
+    Field(discriminator=Discriminator(evaluator_discriminator)),
+]
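
One property of the callable-Discriminator design worth spelling out: tag selection never hard-fails on an unrecognized category; the function returns the "UnknownEvaluatorParams" tag instead, so validation succeeds as long as the base fields are present, and it is the factory's case _ branch that ultimately raises. A sketch with an invented category value:

from pydantic import TypeAdapter

from uipath._cli._evals._models._evaluator import Evaluator, UnknownEvaluatorParams

payload = {
    "id": "mystery",
    "name": "Mystery Evaluator",
    "description": "",
    "type": 1,            # assumed to be a valid EvaluatorType value
    "category": 999,      # matches no EvaluatorCategory branch
    "targetOutputKey": "*",
    "fileName": "mystery.json",
    "createdAt": "2025-01-01T00:00:00Z",
    "updatedAt": "2025-01-01T00:00:00Z",
}

params = TypeAdapter(Evaluator).validate_python(payload)
assert isinstance(params, UnknownEvaluatorParams)
# EvaluatorFactory.create_evaluator would hit its "case _" branch for this
# payload and raise ValueError("Unknown evaluator category: ...").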
