From 250e065fca4ad3c14e3528da1e285edd1a54ce84 Mon Sep 17 00:00:00 2001
From: Amey <ameyjain.28@gmail.com>
Date: Wed, 10 Jun 2026 15:06:22 -0400
Subject: [PATCH] fix(eval): emit string justification for coded evaluators

Coded evaluators (tool-call count/args/output/order, json-similarity)
stored their explanation under per-evaluator keys (explained_tool_calls_*,
lcs, matched_leaves) with no 'justification' key, so the eval worker's
d.get('justification') always resolved to null while the structured detail
was still present.

Add a computed 'justification' string field to each coded justification
model, derived from its existing structured detail. The tool-call
count/args/output models share a new format_explained_tool_calls helper;
the tool-call order and json-similarity models format their own fields
(lcs, matched_leaves/total_leaves). model_dump() now emits a string
justification for every evaluator, matching LLMJudgeJustification, without
changing the structured fields or the worker.
---
 .../eval/_helpers/evaluators_helpers.py       | 22 +++++++++++++++++++
 .../evaluators/json_similarity_evaluator.py   |  8 +++++++
 .../evaluators/tool_call_args_evaluator.py    |  9 ++++++++
 .../evaluators/tool_call_count_evaluator.py   |  9 ++++++++
 .../evaluators/tool_call_order_evaluator.py   | 15 +++++++++++++
 .../evaluators/tool_call_output_evaluator.py  |  9 ++++++++
 6 files changed, 72 insertions(+)

diff --git a/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py b/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py
index d64954b99..40ffe420a 100644
--- a/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py
+++ b/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py
@@ -24,6 +24,28 @@
 COMMUNITY_agents_SUFFIX = "-community-agents"
 
 
+def format_explained_tool_calls(explanations: Mapping[str, str]) -> str:
+    """Flatten an `explained_tool_calls_*` mapping into one justification string.
+
+    Entries are joined with "; " in mapping order. A regular entry is rendered
+    as "<tool name> -> <explanation>"; an entry stored under the `_result`
+    sentinel key is emitted as its bare explanation text, with no prefix.
+
+    Args:
+        explanations: Per-tool explanations keyed by tool name, or by `_result`.
+
+    Returns:
+        The joined justification, or "" when `explanations` is empty.
+    """
+    if not explanations:
+        return ""
+    rendered = [
+        f"{name} -> {detail}" if name != "_result" else detail
+        for name, detail in explanations.items()
+    ]
+    return "; ".join(rendered)
+
+
 def extract_tool_calls_names(spans: Sequence[ReadableSpan]) -> list[str]:
     """Extract the tool call names from execution spans IN ORDER.
 
diff --git a/packages/uipath/src/uipath/eval/evaluators/json_similarity_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/json_similarity_evaluator.py
index 552194f2e..ea4d5a1e8 100644
--- a/packages/uipath/src/uipath/eval/evaluators/json_similarity_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/json_similarity_evaluator.py
@@ -3,6 +3,8 @@
 import math
 from typing import Any, Tuple
 
+from pydantic import computed_field
+
 from ..models import (
     AgentExecution,
     EvaluationResult,
@@ -23,6 +25,12 @@ class JsonSimilarityJustification(BaseEvaluatorJustification):
     matched_leaves: float
     total_leaves: float
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:  # NOTE(review): float fields render as e.g. "3.0"
+        """Human-readable justification derived from the matched JSON leaves."""
+        return f"Matched {self.matched_leaves} of {self.total_leaves} JSON leaf values."
+
 
 class JsonSimilarityEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]):
     """Configuration for the json similarity evaluator."""
diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_args_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_args_evaluator.py
index 2703e3c76..067791c84 100644
--- a/packages/uipath/src/uipath/eval/evaluators/tool_call_args_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/tool_call_args_evaluator.py
@@ -1,7 +1,10 @@
 """Tool call order evaluator for validating correct sequence of tool calls."""
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls,
+    format_explained_tool_calls,
     tool_calls_args_score,
 )
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult, ToolCall
@@ -34,6 +37,12 @@ class ToolCallArgsEvaluatorJustification(BaseEvaluatorJustification):
 
     explained_tool_calls_args: dict[str, str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property  # not stored: rebuilt from explained_tool_calls_args on each access
+    def justification(self) -> str:
+        """Human-readable justification derived from the per-tool arg matches."""
+        return format_explained_tool_calls(self.explained_tool_calls_args)
+
 
 class ToolCallArgsEvaluator(
     BaseEvaluator[
diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_count_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_count_evaluator.py
index 11d684ae1..22e9da29e 100644
--- a/packages/uipath/src/uipath/eval/evaluators/tool_call_count_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/tool_call_count_evaluator.py
@@ -2,8 +2,11 @@
 
 from collections import Counter
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls_names,
+    format_explained_tool_calls,
     tool_calls_count_score,
 )
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
@@ -37,6 +40,12 @@ class ToolCallCountEvaluatorJustification(BaseEvaluatorJustification):
 
     explained_tool_calls_count: dict[str, str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property  # not stored: rebuilt from explained_tool_calls_count on each access
+    def justification(self) -> str:
+        """Human-readable justification derived from the per-tool counts."""
+        return format_explained_tool_calls(self.explained_tool_calls_count)
+
 
 class ToolCallCountEvaluator(
     BaseEvaluator[
diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_order_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_order_evaluator.py
index 1050ddc76..f68be149e 100644
--- a/packages/uipath/src/uipath/eval/evaluators/tool_call_order_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/tool_call_order_evaluator.py
@@ -1,5 +1,7 @@
 """Tool call order evaluator for validating correct sequence of tool calls."""
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls_names,
     tool_calls_order_score,
@@ -35,6 +37,19 @@ class ToolCallOrderEvaluatorJustification(BaseEvaluatorJustification):
 
     lcs: list[str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:
+        """Human-readable justification derived from the matched call order."""
+        if not self.lcs:
+            return (
+                "No common ordered subsequence between expected and actual tool calls."
+            )
+        return (
+            "Longest common ordered subsequence between expected and actual "
+            f"tool calls: {', '.join(self.lcs)}."  # plain names, not a list repr
+        )
+
 
 class ToolCallOrderEvaluator(
     BaseEvaluator[
diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_output_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_output_evaluator.py
index fff139daf..9f27d07f4 100644
--- a/packages/uipath/src/uipath/eval/evaluators/tool_call_output_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/tool_call_output_evaluator.py
@@ -1,7 +1,10 @@
 """Tool call order evaluator for validating correct sequence of tool calls."""
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls_outputs,
+    format_explained_tool_calls,
     tool_calls_output_score,
 )
 from ..models import (
@@ -40,6 +43,12 @@ class ToolCallOutputEvaluatorJustification(BaseEvaluatorJustification):
 
     explained_tool_calls_outputs: dict[str, str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property  # not stored: rebuilt from explained_tool_calls_outputs on each access
+    def justification(self) -> str:
+        """Human-readable justification derived from the per-tool output matches."""
+        return format_explained_tool_calls(self.explained_tool_calls_outputs)
+
 
 class ToolCallOutputEvaluator(
     BaseEvaluator[