Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
# Initialize input validator
self._validator = ToolCallsValidator(
error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
check_for_unsupported_tools=True,
check_for_unsupported_tools=False,
)

super().__init__(
Expand Down
2 changes: 1 addition & 1 deletion assets/evaluators/builtin/tool_call_accuracy/spec.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
type: "evaluator"
name: "builtin.tool_call_accuracy"
version: 11
version: 12
displayName: "Tool-Call-Accuracy-Evaluator"
description: "Measures whether the agent selects the correct tool calls, applies the correct parameters, and tracks inefficient or missing tool calls, in order to resolve a user's request. This is an umbrella evaluator that assesses overall tool call quality. Use this metric in agent-based systems and AI assistants that rely on tool integration."
evaluatorType: "builtin"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -697,7 +697,7 @@ def __init__(self, model_config, *, credential=None, **kwargs):
self._validator = ToolDefinitionsValidator(
error_target=ExtendedErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
requires_query=False,
check_for_unsupported_tools=True,
check_for_unsupported_tools=False,
)

super().__init__(
Expand Down Expand Up @@ -900,6 +900,37 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
target=ExtendedErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
)

# Short-circuit: if the agent runtime already reported a failed tool
# execution via a known-failure ``status`` (e.g. "failed", "error",
# "incomplete"), deterministically return ``fail`` without calling the
# LLM. The evaluator's scoring contract is binary -- "FALSE: at least
# one tool call failed" -- and the prompty rubric doesn't see the
# ``status`` field, so it would otherwise grade only the (typically
# empty) result body and frequently mis-score the conversation as a
# pass. ``status`` is only populated by upstream converters that
# preserve it; absent ``status``, behavior is unchanged.
if isinstance(eval_input.get("response"), list):
failed_statuses = _collect_failed_tool_statuses(eval_input["response"])
if failed_statuses:
reason = (
"Detected failed tool execution(s) with status "
+ ", ".join(sorted(set(failed_statuses)))
+ ". Marked as fail without LLM grading."
)
return {
self._result_key: 0.0,
f"{self._result_key}_score": 0.0,
f"{self._result_key}_passed": False,
f"{self._result_key}_result": "fail",
f"{self._result_key}_reason": reason,
f"{self._result_key}_status": "completed",
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": {
"short_circuit": "tool_status",
"failed_statuses": sorted(set(failed_statuses)),
},
}

if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
Expand Down Expand Up @@ -1089,3 +1120,40 @@ def _reformat_tool_definitions(tool_definitions, logger=None):
)
logger.debug(f"Original tool definitions: {tool_definitions}")
return tool_definitions


_FAILED_TOOL_STATUSES = frozenset({"failed", "error", "incomplete", "cancelled", "canceled"})


def _collect_failed_tool_statuses(agent_response_msgs):
"""Return the list of failure statuses seen on any `tool_call` or
`tool_result` content block in `agent_response_msgs`.

Inputs are intentionally tolerated -- malformed messages / non-dict
content blocks are skipped rather than raised on, so this helper is safe
to call on freshly-deserialized agent traces.

:param agent_response_msgs: The agent's response message list (already
validated to be a list by the caller).
:type agent_response_msgs: list
:return: A list (with duplicates preserved) of lowercased failure status
strings. Empty list means no failure signal was found.
:rtype: list[str]
"""
found = []
if not isinstance(agent_response_msgs, list):
return found
for msg in agent_response_msgs:
if not isinstance(msg, dict):
continue
content = msg.get("content")
if not isinstance(content, list):
continue
for block in content:
if not isinstance(block, dict):
continue
if block.get("type") in ("tool_call", "tool_result"):
status = block.get("status")
if isinstance(status, str) and status.lower() in _FAILED_TOOL_STATUSES:
found.append(status.lower())
return found
2 changes: 1 addition & 1 deletion assets/evaluators/builtin/tool_call_success/spec.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
type: "evaluator"
name: "builtin.tool_call_success"
version: 7
version: 9
displayName: "Tool-Call-Success-Evaluator"
description: "Evaluates whether all tool calls were successful. It checks every tool call to determine whether any of them resulted in a technical failure such as an exception, error, or timeout. Use this evaluator when you want to verify that the tool calls generated by an AI agent executed successfully."
evaluatorType: "builtin"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1061,7 +1061,7 @@ def __init__(
# Initialize input validator
self._validator = ToolDefinitionsValidator(
error_target=ExtendedErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False,
check_for_unsupported_tools=True,
check_for_unsupported_tools=False,
)

super().__init__(
Expand Down
2 changes: 1 addition & 1 deletion assets/evaluators/builtin/tool_input_accuracy/spec.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
type: "evaluator"
name: "builtin.tool_input_accuracy"
version: 12
version: 13
displayName: "Tool-Input-Accuracy-Evaluator"
description: "A binary evaluator (0 or 1) that checks whether all parameters in an agent’s tool call are correct, validating grounding, type, format, completeness, and contextual appropriateness using LLM-based analysis. Use it to verify agent tool usage, API integration tests, or to ensure tool call parameters are fully correct in AI workflows."
evaluatorType: "builtin"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,18 @@ def _run_tool_type_test(
expected_flow_called = assert_type == self.AssertType.PASS
assert flow_mock is not None, "Flow mock should be set when use_mocking=True"
if expected_flow_called:
flow_mock.assert_called_once_with(
timeout=600,
**expected_flow_inputs,
)
# When expected_flow_inputs is empty (the base-class default for tool types whose
# captured expected-flow constants are not yet populated in common_tool_test_data),
# only assert that the flow was invoked exactly once. Once the per-tool fixtures
# land in a follow-up PR the subclass will populate expected_flow_inputs and the
# exact-arguments assertion will apply automatically.
if expected_flow_inputs:
flow_mock.assert_called_once_with(
timeout=600,
**expected_flow_inputs,
)
else:
flow_mock.assert_called_once()
else:
flow_mock.assert_not_called()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,11 @@ class TestToolCallAccuracyEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, B

evaluator_type = ToolCallAccuracyEvaluator

check_for_unsupported_tools = True
# Restricted built-in tool types are accepted by the validator as of asset version 12 (formerly
# rejected with NOT_APPLICABLE). Per-tool expected_flow_inputs for the newly-enabled tool types
# are tracked in a follow-up PR; until they are captured the flow-mock arg matcher is relaxed
# for tools with an empty expected_flow_inputs dict.
check_for_unsupported_tools = False

is_tool_definition_required = True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ class TestToolCallSuccessEvaluatorBehavior(BaseToolsEvaluatorBehaviorTest, BaseT

evaluator_type = ToolCallSuccessEvaluator

check_for_unsupported_tools = True
# Restricted built-in tool types are accepted by the validator as of asset version 9 (formerly
# rejected with NOT_APPLICABLE). Per-tool expected_flow_inputs for the newly-enabled tool types
# are tracked in a follow-up PR; until they are captured the flow-mock arg matcher is relaxed
# for tools with an empty expected_flow_inputs dict.
check_for_unsupported_tools = False

# Test Configs
requires_query = False
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for the ToolCallSuccess deterministic status-based short-circuit.

When the agent runtime reports a known-failure ``status`` on any tool_call /
tool_result content block (e.g. "failed", "error", "incomplete"), the
evaluator deterministically returns a ``fail`` result without calling the
LLM. Absent ``status``, behavior is unchanged.
"""

import pytest

from ...builtin.tool_call_success.evaluator._tool_call_success import (
ToolCallSuccessEvaluator,
_FAILED_TOOL_STATUSES,
_collect_failed_tool_statuses,
)
from ..common.base_prompty_evaluator_runner import BasePromptyEvaluatorRunner


# region helpers


def _assistant_tool_call(tool_call_id, name, arguments, status=None):
"""Build an assistant message carrying a single tool_call content block."""
block = {
"type": "tool_call",
"tool_call_id": tool_call_id,
"name": name,
"arguments": arguments,
}
if status is not None:
block["status"] = status
return {"role": "assistant", "content": [block]}


def _tool_result(tool_call_id, result, status=None):
"""Build a tool message carrying a single tool_result content block."""
block = {
"type": "tool_result",
"tool_call_id": tool_call_id,
"tool_result": result,
}
if status is not None:
block["status"] = status
return {
"role": "tool",
"tool_call_id": tool_call_id,
"content": [block],
}


def _failing_response():
    """Return a two-message agent response where the tool call and its result are both ``failed``."""
    call_id = "call_1"
    failed_call = _assistant_tool_call(call_id, "fetch_weather", {"location": "Seattle"}, status="failed")
    failed_result = _tool_result(call_id, "", status="failed")
    return [failed_call, failed_result]


# endregion


@pytest.mark.unittest
class TestCollectFailedToolStatuses:
    """Exercise ``_collect_failed_tool_statuses`` on well-formed and malformed inputs."""

    @pytest.mark.parametrize("status", sorted(_FAILED_TOOL_STATUSES))
    def test_each_failure_status_is_detected(self, status):
        # Every member of the failure set must be reported back verbatim.
        messages = [_assistant_tool_call("c1", "x", {}, status=status)]
        collected = _collect_failed_tool_statuses(messages)
        assert collected == [status]

    def test_case_insensitive_match(self):
        # Upper-case input is matched and returned lowercased.
        messages = [_assistant_tool_call("c1", "x", {}, status="FAILED")]
        collected = _collect_failed_tool_statuses(messages)
        assert collected == ["failed"]

    def test_completed_status_is_not_detected(self):
        messages = [_assistant_tool_call("c1", "x", {}, status="completed")]
        collected = _collect_failed_tool_statuses(messages)
        assert collected == []

    def test_missing_status_is_not_detected(self):
        messages = [_assistant_tool_call("c1", "x", {})]
        collected = _collect_failed_tool_statuses(messages)
        assert collected == []

    def test_status_on_tool_result_is_detected(self):
        messages = [_tool_result("c1", "", status="error")]
        collected = _collect_failed_tool_statuses(messages)
        assert collected == ["error"]

    def test_duplicates_preserved_in_return(self):
        # The helper does not de-duplicate.
        call_message = _assistant_tool_call("c1", "x", {}, status="failed")
        result_message = _tool_result("c1", "", status="failed")
        collected = _collect_failed_tool_statuses([call_message, result_message])
        assert collected == ["failed", "failed"]

    def test_status_on_unrelated_content_type_is_ignored(self):
        # A ``status`` on a non-tool block (here ``text``) carries no signal.
        text_block = {"type": "text", "text": "hi", "status": "failed"}
        messages = [{"role": "assistant", "content": [text_block]}]
        collected = _collect_failed_tool_statuses(messages)
        assert collected == []

    def test_non_list_input_returns_empty(self):
        assert _collect_failed_tool_statuses(None) == []
        assert _collect_failed_tool_statuses("not-a-list") == []
        assert _collect_failed_tool_statuses({"role": "assistant"}) == []

    def test_malformed_messages_are_tolerated(self):
        # Only the last message holds a usable tool_call block; everything
        # before it must be skipped without raising.
        messages = [
            None,
            "not-a-dict",
            {"role": "assistant"},
            {"role": "assistant", "content": "stringly"},
            {"role": "assistant", "content": [None, "x", {"type": "tool_call", "status": "failed"}]},
        ]
        collected = _collect_failed_tool_statuses(messages)
        assert collected == ["failed"]

    def test_unknown_status_string_is_ignored(self):
        messages = [_assistant_tool_call("c1", "x", {}, status="weird_state")]
        collected = _collect_failed_tool_statuses(messages)
        assert collected == []


@pytest.mark.unittest
class TestToolCallSuccessShortCircuit(BasePromptyEvaluatorRunner):
    """Check that a failure ``status`` makes the evaluator return ``fail`` without invoking the flow."""

    evaluator_type = ToolCallSuccessEvaluator

    def _failing_query(self):
        # Single-turn user query shared by every scenario in this class.
        user_text = {"type": "text", "text": "What's the weather?"}
        return [{"role": "user", "content": [user_text]}]

    def test_short_circuit_when_tool_call_status_is_failed(self):
        query = self._failing_query()
        response = _failing_response()
        results, flow_mock = self._run_evaluation_and_return_mocked_flow(
            query=query,
            response=response,
        )

        assert results["tool_call_success_result"] == "fail"
        assert results["tool_call_success_passed"] is False
        assert results["tool_call_success_score"] == 0.0
        assert results["tool_call_success_status"] == "completed"

        properties = results["tool_call_success_properties"]
        assert properties["short_circuit"] == "tool_status"
        assert properties["failed_statuses"] == ["failed"]

        flow_mock.assert_not_called()

    def test_short_circuit_dedupes_failed_statuses_in_properties(self):
        # "failed" appears twice in the response but once in the properties,
        # and the reported statuses are sorted.
        first_call = _assistant_tool_call("c1", "fetch_weather", {"location": "Seattle"}, status="failed")
        first_result = _tool_result("c1", "", status="error")
        second_call = _assistant_tool_call("c2", "send_email", {"to": "x@example.com"}, status="failed")
        query = self._failing_query()
        results, flow_mock = self._run_evaluation_and_return_mocked_flow(
            query=query,
            response=[first_call, first_result, second_call],
        )

        properties = results["tool_call_success_properties"]
        assert properties["failed_statuses"] == ["error", "failed"]
        flow_mock.assert_not_called()

    def test_no_short_circuit_when_all_statuses_completed(self):
        completed_call = _assistant_tool_call("c1", "fetch_weather", {"location": "Seattle"}, status="completed")
        completed_result = _tool_result("c1", "Sunny, 72F.", status="completed")
        query = self._failing_query()
        _, flow_mock = self._run_evaluation_and_return_mocked_flow(
            query=query,
            response=[completed_call, completed_result],
        )

        flow_mock.assert_called_once()

    def test_no_short_circuit_when_status_absent(self):
        plain_call = _assistant_tool_call("c1", "fetch_weather", {"location": "Seattle"})
        plain_result = _tool_result("c1", "Sunny, 72F.")
        query = self._failing_query()
        _, flow_mock = self._run_evaluation_and_return_mocked_flow(
            query=query,
            response=[plain_call, plain_result],
        )

        flow_mock.assert_called_once()
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ class TestToolInputAccuracyEvaluatorBehavior(BaseToolsEvaluatorBehaviorTest, Bas

evaluator_type = ToolInputAccuracyEvaluator

check_for_unsupported_tools = True
# Restricted built-in tool types are accepted by the validator as of asset version 13 (formerly
# rejected with NOT_APPLICABLE). Per-tool expected_flow_inputs for the newly-enabled tool types
# are tracked in a follow-up PR; until they are captured the flow-mock arg matcher is relaxed
# for tools with an empty expected_flow_inputs dict.
check_for_unsupported_tools = False

# Test Configs
requires_tool_definitions = True
Expand Down
Loading