Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,28 @@
# NOTE(review): the lowercase "agents" segment departs from UPPER_SNAKE_CASE; the
# name is left untouched here because its importers are not visible - confirm
# before renaming.
COMMUNITY_agents_SUFFIX = "-community-agents"


def format_explained_tool_calls(explanations: Mapping[str, str]) -> str:
    """Collapse an `explained_tool_calls_*` mapping into one justification string.

    Entries are emitted in the mapping's iteration order and joined with "; ".
    An entry keyed `_result` contributes its value verbatim; every other entry
    is rendered as "<key> -> <value>".

    Args:
        explanations: Mapping of tool name (or the `_result` sentinel) to its
            explanation text.

    Returns:
        The joined justification string, or an empty string when there is
        nothing to explain.
    """
    if not explanations:
        return ""

    fragments = []
    for name, detail in explanations.items():
        if name == "_result":
            # Sentinel entry: emitted as-is, without a tool-name prefix.
            fragments.append(detail)
        else:
            fragments.append(f"{name} -> {detail}")
    return "; ".join(fragments)


def extract_tool_calls_names(spans: Sequence[ReadableSpan]) -> list[str]:
"""Extract the tool call names from execution spans IN ORDER.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import math
from typing import Any, Tuple

from pydantic import computed_field

from ..models import (
AgentExecution,
EvaluationResult,
Expand All @@ -23,6 +25,12 @@ class JsonSimilarityJustification(BaseEvaluatorJustification):
matched_leaves: float
total_leaves: float

@computed_field  # type: ignore[prop-decorator]
@property
def justification(self) -> str:
    """Summarize the comparison as a sentence built from the two leaf counters."""
    matched, total = self.matched_leaves, self.total_leaves
    return f"Matched {matched} of {total} JSON leaf values."


class JsonSimilarityEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]):
"""Configuration for the json similarity evaluator."""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
"""Tool call args evaluator for validating the arguments passed to tool calls."""

from pydantic import computed_field

from .._helpers.evaluators_helpers import (
extract_tool_calls,
format_explained_tool_calls,
tool_calls_args_score,
)
from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult, ToolCall
Expand Down Expand Up @@ -34,6 +37,12 @@ class ToolCallArgsEvaluatorJustification(BaseEvaluatorJustification):

explained_tool_calls_args: dict[str, str]

@computed_field  # type: ignore[prop-decorator]
@property
def justification(self) -> str:
    """Render the per-tool argument explanations as one readable string."""
    per_tool_args = self.explained_tool_calls_args
    return format_explained_tool_calls(per_tool_args)


class ToolCallArgsEvaluator(
BaseEvaluator[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

from collections import Counter

from pydantic import computed_field

from .._helpers.evaluators_helpers import (
extract_tool_calls_names,
format_explained_tool_calls,
tool_calls_count_score,
)
from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
Expand Down Expand Up @@ -37,6 +40,12 @@ class ToolCallCountEvaluatorJustification(BaseEvaluatorJustification):

explained_tool_calls_count: dict[str, str]

@computed_field  # type: ignore[prop-decorator]
@property
def justification(self) -> str:
    """Render the per-tool count explanations as one readable string."""
    per_tool_counts = self.explained_tool_calls_count
    return format_explained_tool_calls(per_tool_counts)


class ToolCallCountEvaluator(
BaseEvaluator[
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Tool call order evaluator for validating correct sequence of tool calls."""

from pydantic import computed_field

from .._helpers.evaluators_helpers import (
extract_tool_calls_names,
tool_calls_order_score,
Expand Down Expand Up @@ -35,6 +37,19 @@ class ToolCallOrderEvaluatorJustification(BaseEvaluatorJustification):

lcs: list[str]

@computed_field  # type: ignore[prop-decorator]
@property
def justification(self) -> str:
    """Describe the longest common ordered subsequence, or note its absence."""
    if self.lcs:
        return (
            "Longest common ordered subsequence between expected and actual "
            f"tool calls: {self.lcs}."
        )
    # Empty `lcs`: report that no shared ordered subsequence is recorded.
    return (
        "No common ordered subsequence between expected and actual tool calls."
    )


class ToolCallOrderEvaluator(
BaseEvaluator[
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
"""Tool call output evaluator for validating the outputs produced by tool calls."""

from pydantic import computed_field

from .._helpers.evaluators_helpers import (
extract_tool_calls_outputs,
format_explained_tool_calls,
tool_calls_output_score,
)
from ..models import (
Expand Down Expand Up @@ -40,6 +43,12 @@ class ToolCallOutputEvaluatorJustification(BaseEvaluatorJustification):

explained_tool_calls_outputs: dict[str, str]

@computed_field  # type: ignore[prop-decorator]
@property
def justification(self) -> str:
    """Render the per-tool output explanations as one readable string."""
    per_tool_outputs = self.explained_tool_calls_outputs
    return format_explained_tool_calls(per_tool_outputs)


class ToolCallOutputEvaluator(
BaseEvaluator[
Expand Down
Loading