From 250e065fca4ad3c14e3528da1e285edd1a54ce84 Mon Sep 17 00:00:00 2001
From: Amey
Date: Wed, 10 Jun 2026 15:06:22 -0400
Subject: [PATCH] fix(eval): emit string justification for coded evaluators

Coded evaluators (tool-call count/args/output/order, json-similarity)
stored their explanation under per-evaluator keys
(explained_tool_calls_*, lcs, matched_leaves) with no 'justification'
key, so the eval worker's d.get('justification') always resolved to
null while the structured detail was still present.

Add a computed 'justification' string field to each coded justification
model, derived from its existing structured detail via a shared
format_explained_tool_calls helper. model_dump() now emits a string
justification for every evaluator, matching LLMJudgeJustification,
without changing the structured fields or the worker.
---
NOTE(review): this section sits between the `---` separator and the first
`diff --git` line, so it is commentary only and is not part of the commit
message or the diff.
 * evaluators_helpers.py: the new helper is annotated with `Mapping`, and
   the only hunk for that file adds no import. Confirm `Mapping` is
   already imported at the top of the module.
 * json_similarity_evaluator.py: `matched_leaves` / `total_leaves` are
   declared as `float` and interpolated directly, so whole-number values
   render with a trailing `.0` (e.g. "Matched 3.0 of 4.0 JSON leaf
   values."). Confirm that is the intended user-facing text.
 * tool_call_order_evaluator.py: `{self.lcs}` interpolates a `list[str]`,
   so the sentence ends with a Python list repr (brackets and quotes).
 * tool_call_args_evaluator.py and tool_call_output_evaluator.py: the
   module docstring shown as context still reads "Tool call order
   evaluator ..."; it is unchanged by this patch and looks copy-pasted.
   Presumably a follow-up; verify.

 .../eval/_helpers/evaluators_helpers.py       | 22 +++++++++++++++++++
 .../evaluators/json_similarity_evaluator.py   |  8 +++++++
 .../evaluators/tool_call_args_evaluator.py    |  9 ++++++++
 .../evaluators/tool_call_count_evaluator.py   |  9 ++++++++
 .../evaluators/tool_call_order_evaluator.py   | 15 +++++++++++++
 .../evaluators/tool_call_output_evaluator.py  |  9 ++++++++
 6 files changed, 72 insertions(+)

diff --git a/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py b/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py
index d64954b99..40ffe420a 100644
--- a/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py
+++ b/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py
@@ -24,6 +24,28 @@
 COMMUNITY_agents_SUFFIX = "-community-agents"
 
 
+def format_explained_tool_calls(explanations: Mapping[str, str]) -> str:
+    """Render an `explained_tool_calls_*` mapping as a human-readable justification.
+
+    The score helpers produce a mapping keyed by tool name with values like
+    "Actual: X, Expected: Y, Score: Z", or a single `_result` sentinel entry
+    describing an empty/short-circuit case. This collapses either shape into a
+    single string suitable for the `justification` field surfaced to users.
+
+    Args:
+        explanations: Mapping of tool name (or `_result`) to its per-tool explanation.
+
+    Returns:
+        A human-readable justification string (empty if there is nothing to explain).
+    """
+    if not explanations:
+        return ""
+    return "; ".join(
+        value if key == "_result" else f"{key} -> {value}"
+        for key, value in explanations.items()
+    )
+
+
 def extract_tool_calls_names(spans: Sequence[ReadableSpan]) -> list[str]:
     """Extract the tool call names from execution spans IN ORDER.
 
diff --git a/packages/uipath/src/uipath/eval/evaluators/json_similarity_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/json_similarity_evaluator.py
index 552194f2e..ea4d5a1e8 100644
--- a/packages/uipath/src/uipath/eval/evaluators/json_similarity_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/json_similarity_evaluator.py
@@ -3,6 +3,8 @@
 import math
 from typing import Any, Tuple
 
+from pydantic import computed_field
+
 from ..models import (
     AgentExecution,
     EvaluationResult,
@@ -23,6 +25,12 @@ class JsonSimilarityJustification(BaseEvaluatorJustification):
     matched_leaves: float
     total_leaves: float
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:
+        """Human-readable justification derived from the matched JSON leaves."""
+        return f"Matched {self.matched_leaves} of {self.total_leaves} JSON leaf values."
+
 
 class JsonSimilarityEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]):
     """Configuration for the json similarity evaluator."""
diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_args_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_args_evaluator.py
index 2703e3c76..067791c84 100644
--- a/packages/uipath/src/uipath/eval/evaluators/tool_call_args_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/tool_call_args_evaluator.py
@@ -1,7 +1,10 @@
 """Tool call order evaluator for validating correct sequence of tool calls."""
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls,
+    format_explained_tool_calls,
     tool_calls_args_score,
 )
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult, ToolCall
@@ -34,6 +37,12 @@ class ToolCallArgsEvaluatorJustification(BaseEvaluatorJustification):
 
     explained_tool_calls_args: dict[str, str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:
+        """Human-readable justification derived from the per-tool arg matches."""
+        return format_explained_tool_calls(self.explained_tool_calls_args)
+
 
 class ToolCallArgsEvaluator(
     BaseEvaluator[
diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_count_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_count_evaluator.py
index 11d684ae1..22e9da29e 100644
--- a/packages/uipath/src/uipath/eval/evaluators/tool_call_count_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/tool_call_count_evaluator.py
@@ -2,8 +2,11 @@
 
 from collections import Counter
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls_names,
+    format_explained_tool_calls,
     tool_calls_count_score,
 )
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
@@ -37,6 +40,12 @@ class ToolCallCountEvaluatorJustification(BaseEvaluatorJustification):
 
     explained_tool_calls_count: dict[str, str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:
+        """Human-readable justification derived from the per-tool counts."""
+        return format_explained_tool_calls(self.explained_tool_calls_count)
+
 
 class ToolCallCountEvaluator(
     BaseEvaluator[
diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_order_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_order_evaluator.py
index 1050ddc76..f68be149e 100644
--- a/packages/uipath/src/uipath/eval/evaluators/tool_call_order_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/tool_call_order_evaluator.py
@@ -1,5 +1,7 @@
 """Tool call order evaluator for validating correct sequence of tool calls."""
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls_names,
     tool_calls_order_score,
@@ -35,6 +37,19 @@ class ToolCallOrderEvaluatorJustification(BaseEvaluatorJustification):
 
     lcs: list[str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:
+        """Human-readable justification derived from the matched call order."""
+        if not self.lcs:
+            return (
+                "No common ordered subsequence between expected and actual tool calls."
+            )
+        return (
+            "Longest common ordered subsequence between expected and actual "
+            f"tool calls: {self.lcs}."
+        )
+
 
 class ToolCallOrderEvaluator(
     BaseEvaluator[
diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_output_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_output_evaluator.py
index fff139daf..9f27d07f4 100644
--- a/packages/uipath/src/uipath/eval/evaluators/tool_call_output_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/tool_call_output_evaluator.py
@@ -1,7 +1,10 @@
 """Tool call order evaluator for validating correct sequence of tool calls."""
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls_outputs,
+    format_explained_tool_calls,
     tool_calls_output_score,
 )
 from ..models import (
@@ -40,6 +43,12 @@ class ToolCallOutputEvaluatorJustification(BaseEvaluatorJustification):
 
     explained_tool_calls_outputs: dict[str, str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:
+        """Human-readable justification derived from the per-tool output matches."""
+        return format_explained_tool_calls(self.explained_tool_calls_outputs)
+
 
 class ToolCallOutputEvaluator(
     BaseEvaluator[