Skip to content

Commit 9c4a0a2

Browse files
feature: add classification evaluators (#1397)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8d81895 commit 9c4a0a2

16 files changed

Lines changed: 1302 additions & 0 deletions
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"version": "2.0",
3+
"resources": []
4+
}
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
{
2+
"version": "1.0",
3+
"id": "TeleComClassificationEval",
4+
"name": "TeleCom Email Classification Evaluation",
5+
"evaluatorRefs": [
6+
"MulticlassClassificationEvaluator",
7+
"BalancedAccuracyEvaluator"
8+
],
9+
"evaluations": [
10+
{
11+
"id": "payment-invoice",
12+
"name": "Payment - Invoice reminder",
13+
"inputs": {
14+
"email_subject": "Your invoice is ready",
15+
"email_body": "Dear customer, your monthly invoice of $45.99 is now available. Payment is due by March 15."
16+
},
17+
"evaluationCriterias": {
18+
"MulticlassClassificationEvaluator": {
19+
"expectedClass": "payments"
20+
},
21+
"BalancedAccuracyEvaluator": {
22+
"expectedClass": "payments"
23+
}
24+
}
25+
},
26+
{
27+
"id": "payment-overdue",
28+
"name": "Payment - Overdue balance",
29+
"inputs": {
30+
"email_subject": "Action required: overdue balance",
31+
"email_body": "Your account has an overdue balance of $120.00. Please submit your payment as soon as possible to avoid service interruption."
32+
},
33+
"evaluationCriterias": {
34+
"MulticlassClassificationEvaluator": {
35+
"expectedClass": "payments"
36+
},
37+
"BalancedAccuracyEvaluator": {
38+
"expectedClass": "payments"
39+
}
40+
}
41+
},
42+
{
43+
"id": "payment-refund",
44+
"name": "Payment - Refund request (DELIBERATELY WRONG: ground truth set to 'spam')",
45+
"inputs": {
46+
"email_subject": "Refund for last month's charge",
47+
"email_body": "I was charged $29.99 last month for a service I cancelled. Please process a refund to my account."
48+
},
49+
"evaluationCriterias": {
50+
"MulticlassClassificationEvaluator": {
51+
"expectedClass": "spam"
52+
},
53+
"BalancedAccuracyEvaluator": {
54+
"expectedClass": "spam"
55+
}
56+
}
57+
},
58+
{
59+
"id": "payment-confirmation",
60+
"name": "Payment - Confirmation (DELIBERATELY WRONG: ground truth set to 'spam')",
61+
"inputs": {
62+
"email_subject": "Payment confirmation",
63+
"email_body": "Thank you for your payment of $85.00. Your account balance is now $0.00."
64+
},
65+
"evaluationCriterias": {
66+
"MulticlassClassificationEvaluator": {
67+
"expectedClass": "spam"
68+
},
69+
"BalancedAccuracyEvaluator": {
70+
"expectedClass": "spam"
71+
}
72+
}
73+
},
74+
{
75+
"id": "plan-upgrade",
76+
"name": "Plan - Upgrade inquiry",
77+
"inputs": {
78+
"email_subject": "How do I upgrade my plan?",
79+
"email_body": "Hi, I currently have the 10GB data plan and would like to upgrade to the unlimited plan. Can you help?"
80+
},
81+
"evaluationCriterias": {
82+
"MulticlassClassificationEvaluator": {
83+
"expectedClass": "plan_details"
84+
},
85+
"BalancedAccuracyEvaluator": {
86+
"expectedClass": "plan_details"
87+
}
88+
}
89+
},
90+
{
91+
"id": "plan-subscription",
92+
"name": "Plan - Subscription details",
93+
"inputs": {
94+
"email_subject": "Question about my subscription",
95+
"email_body": "I would like to know the details of my current plan including data limits and any upgrade options available."
96+
},
97+
"evaluationCriterias": {
98+
"MulticlassClassificationEvaluator": {
99+
"expectedClass": "plan_details"
100+
},
101+
"BalancedAccuracyEvaluator": {
102+
"expectedClass": "plan_details"
103+
}
104+
}
105+
},
106+
{
107+
"id": "spam-promo",
108+
"name": "Spam - Unsolicited promotion",
109+
"inputs": {
110+
"email_subject": "You won a FREE iPhone!!!",
111+
"email_body": "Congratulations! You have been selected as the lucky winner. Click here to claim your prize now!"
112+
},
113+
"evaluationCriterias": {
114+
"MulticlassClassificationEvaluator": {
115+
"expectedClass": "spam"
116+
},
117+
"BalancedAccuracyEvaluator": {
118+
"expectedClass": "spam"
119+
}
120+
}
121+
}
122+
]
123+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"version": "1.0",
3+
"id": "BalancedAccuracyEvaluator",
4+
"evaluatorTypeId": "file://types/balanced-accuracy-types.json",
5+
"evaluatorSchema": "file://balanced_accuracy_evaluator.py:BalancedAccuracyEvaluator",
6+
"description": "Balanced accuracy: mean of per-class recall rates. Uses weighted per-datapoint scores with a custom reduce_scores (sum instead of average).",
7+
"evaluatorConfig": {
8+
"name": "BalancedAccuracyEvaluator",
9+
"targetOutputKey": "category",
10+
"classes": ["plan_details", "payments", "spam"],
11+
"classCounts": {
12+
"payments": 2,
13+
"plan_details": 2,
14+
"spam": 3
15+
},
16+
"defaultEvaluationCriteria": {
17+
"expectedClass": "spam"
18+
}
19+
}
20+
}
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
"""Balanced accuracy evaluator — custom evaluator with non-trivial score aggregation.
2+
3+
Balanced accuracy = mean of per-class recall rates.
4+
5+
Per-datapoint scores encode class weights:
6+
- correct prediction: score = 1 / (num_classes * class_count_for_expected)
7+
- wrong prediction: score = 0
8+
9+
Then reduce_scores sums the scores, which yields:
10+
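sum = Σ_k (correct_k / (K * n_k)) = (1/K) Σ_k (correct_k / n_k) = balanced_accuracy
where K = num_classes, n_k = class count configured for class k, and correct_k = number of correctly predicted datapoints whose expected class is k.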
11+
"""
12+
13+
from uipath.eval.evaluators.base_evaluator import (
14+
BaseEvaluationCriteria,
15+
BaseEvaluatorJustification,
16+
)
17+
from uipath.eval.evaluators.output_evaluator import (
18+
BaseOutputEvaluator,
19+
OutputEvaluatorConfig,
20+
)
21+
from uipath.eval.models import (
22+
AgentExecution,
23+
EvaluationResult,
24+
NumericEvaluationResult,
25+
)
26+
from uipath.eval.models.models import (
27+
EvaluationResultDto,
28+
UiPathEvaluationError,
29+
UiPathEvaluationErrorCategory,
30+
)
31+
32+
33+
class BalancedAccuracyEvaluationCriteria(BaseEvaluationCriteria):
    """Per-datapoint criteria: which class this sample should belong to."""

    # NOTE: the docstring above is repeated verbatim as the schema "description"
    # in types/balanced-accuracy-types.json; keep the two in sync when editing.

    # Ground-truth label for this datapoint. The evaluator compares it
    # case-insensitively against the configured `classes`.
    expected_class: str
37+
38+
39+
class BalancedAccuracyEvaluatorConfig(
    OutputEvaluatorConfig[BalancedAccuracyEvaluationCriteria]
):
    """Evaluator config with class list and per-class sample counts."""

    # NOTE: the docstring above is repeated verbatim as the schema "description"
    # in types/balanced-accuracy-types.json; keep the two in sync when editing.

    # Display name of the evaluator; defaults to the evaluator's own class name.
    name: str = "BalancedAccuracyEvaluator"

    # Every label the classifier is allowed to emit. Its length is K in the
    # per-datapoint weight 1 / (K * n_k).
    classes: list[str]

    # n_k for each label: how many datapoints in the evaluation set have that
    # label as their expected class. Must be positive for any label that is
    # used as an expected class, otherwise evaluation raises INVALID_CLASS_COUNT.
    class_counts: dict[str, int]
47+
48+
49+
class BalancedAccuracyJustification(BaseEvaluatorJustification):
    """Details about how this datapoint was scored."""

    # Label produced by the agent, lower-cased by the evaluator.
    predicted_class: str

    # Ground-truth label from the evaluation criteria, lower-cased by the evaluator.
    expected_class: str

    # 1 / (num_classes * class_count[expected_class]); the score awarded on a match.
    weight: float

    # True when predicted_class == expected_class.
    is_match: bool
56+
57+
58+
class BalancedAccuracyEvaluator(
    BaseOutputEvaluator[
        BalancedAccuracyEvaluationCriteria,
        BalancedAccuracyEvaluatorConfig,
        BalancedAccuracyJustification,
    ]
):
    """Balanced accuracy: mean of per-class recall rates.

    Uses weighted per-datapoint scores so that reduce_scores = sum()
    gives the correct balanced accuracy.
    """

    @classmethod
    def get_evaluator_id(cls) -> str:
        """Get the evaluator id.

        Returns:
            The constant type id ``"custom-balanced-accuracy"``.
        """
        return "custom-balanced-accuracy"

    @staticmethod
    def reduce_scores(results: list[EvaluationResultDto]) -> float:
        """Sum of pre-weighted scores = balanced accuracy.

        Each per-datapoint score is either 0 or 1 / (K * n_k), so the sum over
        all datapoints equals (1/K) * sum_k(correct_k / n_k).

        Args:
            results: Per-datapoint results produced by ``evaluate``.

        Returns:
            The summed score as a float (``0.0`` for an empty list).
        """
        # Function-scope import keeps this block self-contained.
        import math

        # The weights are fractions such as 1/6 and 1/9 that are not exactly
        # representable; fsum tracks the lost low-order bits so a perfect run
        # is not reported as 0.9999999999999999 or 1.0000000000000002.
        return math.fsum(r.score for r in results)

    async def evaluate(
        self,
        agent_execution: AgentExecution,
        evaluation_criteria: BalancedAccuracyEvaluationCriteria,
    ) -> EvaluationResult:
        """Score one datapoint with its class-balanced weight.

        Labels are compared after trimming surrounding whitespace and
        lower-casing, so ``" Spam\\n"`` and ``"spam"`` are the same class.

        Args:
            agent_execution: Execution whose output holds the predicted class
                (read through ``_get_actual_output``).
            evaluation_criteria: Carries the expected class for this datapoint.

        Returns:
            A ``NumericEvaluationResult`` whose score is
            ``1 / (num_classes * class_count[expected])`` on a match and
            ``0.0`` otherwise, with the justification attached as details.

        Raises:
            UiPathEvaluationError: With category ``USER`` when the expected
                class or the predicted class is not one of the configured
                classes, or when ``class_counts`` has no positive count for
                the expected class.
        """
        predicted_class = (
            str(self._get_actual_output(agent_execution)).strip().lower()
        )
        expected_class = evaluation_criteria.expected_class.strip().lower()
        # De-duplicate after normalisation (order preserved) so that entries
        # differing only by case/whitespace do not inflate K.
        classes = list(
            dict.fromkeys(c.strip().lower() for c in self.evaluator_config.classes)
        )
        class_counts = {
            k.strip().lower(): v
            for k, v in self.evaluator_config.class_counts.items()
        }

        if expected_class not in classes:
            raise UiPathEvaluationError(
                code="INVALID_EXPECTED_CLASS",
                title="Expected class not in configured classes",
                detail=f"Expected class '{expected_class}' is not in the configured classes: {classes}",
                category=UiPathEvaluationErrorCategory.USER,
            )

        if predicted_class not in classes:
            raise UiPathEvaluationError(
                code="INVALID_PREDICTED_CLASS",
                title="Predicted class not in configured classes",
                detail=f"Predicted class '{predicted_class}' is not in the configured classes: {classes}",
                category=UiPathEvaluationErrorCategory.USER,
            )

        num_classes = len(classes)
        n_k = class_counts.get(expected_class)
        if n_k is None or n_k <= 0:
            raise UiPathEvaluationError(
                code="INVALID_CLASS_COUNT",
                title="Missing or invalid class count",
                detail=f"class_counts must include a positive count for '{expected_class}'",
                category=UiPathEvaluationErrorCategory.USER,
            )

        # Pre-weighting each hit by 1 / (K * n_k) is what lets reduce_scores
        # recover balanced accuracy with a plain sum.
        weight = 1.0 / (num_classes * n_k)
        is_match = predicted_class == expected_class
        score = weight if is_match else 0.0

        justification = self.validate_justification(
            {
                "expected": expected_class,
                "actual": predicted_class,
                "predicted_class": predicted_class,
                "expected_class": expected_class,
                "weight": weight,
                "is_match": is_match,
            }
        )
        return NumericEvaluationResult(score=score, details=justification)
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
{
2+
"evaluatorTypeId": "custom-balanced-accuracy",
3+
"evaluatorConfigSchema": {
4+
"$defs": {
5+
"BalancedAccuracyEvaluationCriteria": {
6+
"description": "Per-datapoint criteria: which class this sample should belong to.",
7+
"properties": {
8+
"expectedClass": {
9+
"title": "Expected Class",
10+
"type": "string"
11+
}
12+
},
13+
"required": ["expectedClass"],
14+
"title": "BalancedAccuracyEvaluationCriteria",
15+
"type": "object"
16+
}
17+
},
18+
"description": "Evaluator config with class list and per-class sample counts.",
19+
"properties": {
20+
"name": {
21+
"default": "BalancedAccuracyEvaluator",
22+
"title": "Name",
23+
"type": "string"
24+
},
25+
"targetOutputKey": {
26+
"default": "*",
27+
"title": "Target Output Key",
28+
"type": "string"
29+
},
30+
"defaultEvaluationCriteria": {
31+
"$ref": "#/$defs/BalancedAccuracyEvaluationCriteria",
32+
"default": {
33+
"expectedClass": "spam"
34+
}
35+
},
36+
"classes": {
37+
"items": { "type": "string" },
38+
"title": "Classes",
39+
"type": "array"
40+
},
41+
"classCounts": {
42+
"additionalProperties": { "type": "integer" },
43+
"title": "Class Counts",
44+
"type": "object"
45+
}
46+
},
47+
"required": ["classes", "classCounts"],
48+
"title": "BalancedAccuracyEvaluatorConfig",
49+
"type": "object"
50+
},
51+
"evaluationCriteriaSchema": {
52+
"description": "Per-datapoint criteria: which class this sample should belong to.",
53+
"properties": {
54+
"expectedClass": {
55+
"title": "Expected Class",
56+
"type": "string"
57+
}
58+
},
59+
"required": ["expectedClass"],
60+
"title": "BalancedAccuracyEvaluationCriteria",
61+
"type": "object"
62+
},
63+
"justificationSchema": {
64+
"properties": {
65+
"predictedClass": { "type": "string" },
66+
"expectedClass": { "type": "string" },
67+
"weight": { "type": "number" },
68+
"isMatch": { "type": "boolean" }
69+
}
70+
}
71+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"version": "1.0",
3+
"id": "MulticlassClassificationEvaluator",
4+
"description": "Compares the predicted email category with the expected category and reports macro-averaged precision across classes",
5+
"evaluatorTypeId": "uipath-multiclass-classification",
6+
"evaluatorConfig": {
7+
"name": "MulticlassClassificationEvaluator",
8+
"targetOutputKey": "category",
9+
"classes": ["plan_details", "payments", "spam"],
10+
"metricType": "precision",
11+
"averaging": "macro",
12+
"defaultEvaluationCriteria": {
13+
"expectedClass": "spam"
14+
}
15+
}
16+
}

0 commit comments

Comments
 (0)