UiPath · ameyjain · Jun 10, 2026
diff --git a/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py b/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py
@@ -24,6 +24,28 @@
 COMMUNITY_agents_SUFFIX = "-community-agents"
 
 
+def format_explained_tool_calls(explanations: Mapping[str, str]) -> str:
+    """Collapse an `explained_tool_calls_*` mapping into one justification string.
+
+    Entries are emitted in the mapping's iteration order and joined with "; ".
+    A regular entry renders as "<tool> -> <explanation>"; the `_result`
+    sentinel entry is not a tool name, so it renders as its value alone.
+
+    Args:
+        explanations: Mapping of tool name (or `_result`) to its explanation.
+
+    Returns:
+        The joined justification, or "" when `explanations` is empty.
+    """
+    if not explanations:
+        return ""
+    fragments = [
+        detail if name == "_result" else f"{name} -> {detail}"
+        for name, detail in explanations.items()
+    ]
+    return "; ".join(fragments)
+
+
 def extract_tool_calls_names(spans: Sequence[ReadableSpan]) -> list[str]:
     """Extract the tool call names from execution spans IN ORDER.
 

diff --git a/packages/uipath/src/uipath/eval/evaluators/json_similarity_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/json_similarity_evaluator.py
@@ -3,6 +3,8 @@
 import math
 from typing import Any, Tuple
 
+from pydantic import computed_field
+
 from ..models import (
     AgentExecution,
     EvaluationResult,
@@ -23,6 +25,12 @@ class JsonSimilarityJustification(BaseEvaluatorJustification):
     matched_leaves: float
     total_leaves: float
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:
+        """Summarize matched vs. total JSON leaves; float fields render as e.g. 3.0."""
+        return f"Matched {self.matched_leaves} of {self.total_leaves} JSON leaf values."
+
 
 class JsonSimilarityEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]):
     """Configuration for the json similarity evaluator."""

diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_args_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_args_evaluator.py
@@ -1,7 +1,10 @@
 """Tool call order evaluator for validating correct sequence of tool calls."""
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls,
+    format_explained_tool_calls,
     tool_calls_args_score,
 )
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult, ToolCall
@@ -34,6 +37,12 @@ class ToolCallArgsEvaluatorJustification(BaseEvaluatorJustification):
 
     explained_tool_calls_args: dict[str, str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:
+        """Render `explained_tool_calls_args` as a single justification string."""
+        return format_explained_tool_calls(self.explained_tool_calls_args)
+
 
 class ToolCallArgsEvaluator(
     BaseEvaluator[

diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_count_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_count_evaluator.py
@@ -2,8 +2,11 @@
 
 from collections import Counter
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls_names,
+    format_explained_tool_calls,
     tool_calls_count_score,
 )
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
@@ -37,6 +40,12 @@ class ToolCallCountEvaluatorJustification(BaseEvaluatorJustification):
 
     explained_tool_calls_count: dict[str, str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:
+        """Render `explained_tool_calls_count` as a single justification string."""
+        return format_explained_tool_calls(self.explained_tool_calls_count)
+
 
 class ToolCallCountEvaluator(
     BaseEvaluator[

diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_order_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_order_evaluator.py
@@ -1,5 +1,7 @@
 """Tool call order evaluator for validating correct sequence of tool calls."""
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls_names,
     tool_calls_order_score,
@@ -35,6 +37,19 @@ class ToolCallOrderEvaluatorJustification(BaseEvaluatorJustification):
 
     lcs: list[str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:
+        """Describe the longest common subsequence, or report that none exists."""
+        # A non-empty `lcs` is interpolated as-is, i.e. using the list's repr.
+        if self.lcs:
+            return (
+                "Longest common ordered subsequence between expected and actual "
+                f"tool calls: {self.lcs}."
+            )
+        # Empty list: fall through to the no-match message.
+        return "No common ordered subsequence between expected and actual tool calls."
+
 
 class ToolCallOrderEvaluator(
     BaseEvaluator[

diff --git a/packages/uipath/src/uipath/eval/evaluators/tool_call_output_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/tool_call_output_evaluator.py
@@ -1,7 +1,10 @@
 """Tool call order evaluator for validating correct sequence of tool calls."""
 
+from pydantic import computed_field
+
 from .._helpers.evaluators_helpers import (
     extract_tool_calls_outputs,
+    format_explained_tool_calls,
     tool_calls_output_score,
 )
 from ..models import (
@@ -40,6 +43,12 @@ class ToolCallOutputEvaluatorJustification(BaseEvaluatorJustification):
 
     explained_tool_calls_outputs: dict[str, str]
 
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def justification(self) -> str:
+        """Render `explained_tool_calls_outputs` as a single justification string."""
+        return format_explained_tool_calls(self.explained_tool_calls_outputs)
+
 
 class ToolCallOutputEvaluator(
     BaseEvaluator[