diff --git a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py index dfe3432ba9..303c0c7a4f 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py +++ b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py @@ -834,7 +834,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, # Initialize input validator self._validator = ToolCallsValidator( error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/assets/evaluators/builtin/tool_call_accuracy/spec.yaml b/assets/evaluators/builtin/tool_call_accuracy/spec.yaml index 5876df2d2b..e41091eead 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/spec.yaml +++ b/assets/evaluators/builtin/tool_call_accuracy/spec.yaml @@ -1,6 +1,6 @@ type: "evaluator" name: "builtin.tool_call_accuracy" -version: 11 +version: 12 displayName: "Tool-Call-Accuracy-Evaluator" description: "Measures whether the agent selects the correct tool calls, applies the correct parameters, and tracks inefficient or missing too calls, in order to resolve a user's request. This is an umbrella evaluators that assessing overall tool call quality. Use this metric in agent-based systems, and AI assistants that rely on tool integration." 
evaluatorType: "builtin" diff --git a/assets/evaluators/builtin/tool_call_success/evaluator/_tool_call_success.py b/assets/evaluators/builtin/tool_call_success/evaluator/_tool_call_success.py index b10e2885fe..7439f4f86f 100644 --- a/assets/evaluators/builtin/tool_call_success/evaluator/_tool_call_success.py +++ b/assets/evaluators/builtin/tool_call_success/evaluator/_tool_call_success.py @@ -697,7 +697,7 @@ def __init__(self, model_config, *, credential=None, **kwargs): self._validator = ToolDefinitionsValidator( error_target=ExtendedErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, requires_query=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( @@ -900,6 +900,37 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t target=ExtendedErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, ) + # Short-circuit: if the agent runtime already reported a failed tool + # execution via a known-failure ``status`` (e.g. "failed", "error", + # "incomplete"), deterministically return ``fail`` without calling the + # LLM. The evaluator's scoring contract is binary -- "FALSE: at least + # one tool call failed" -- and the prompty rubric doesn't see the + # ``status`` field, so it would otherwise grade only the (typically + # empty) result body and frequently mis-score the conversation as a + # pass. ``status`` is only populated by upstream converters that + # preserve it; absent ``status``, behavior is unchanged. + if isinstance(eval_input.get("response"), list): + failed_statuses = _collect_failed_tool_statuses(eval_input["response"]) + if failed_statuses: + reason = ( + "Detected failed tool execution(s) with status " + + ", ".join(sorted(set(failed_statuses))) + + ". Marked as fail without LLM grading." 
+ ) + return { + self._result_key: 0.0, + f"{self._result_key}_score": 0.0, + f"{self._result_key}_passed": False, + f"{self._result_key}_result": "fail", + f"{self._result_key}_reason": reason, + f"{self._result_key}_status": "completed", + f"{self._result_key}_threshold": self._threshold, + f"{self._result_key}_properties": { + "short_circuit": "tool_status", + "failed_statuses": sorted(set(failed_statuses)), + }, + } + if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger) @@ -1089,3 +1120,40 @@ def _reformat_tool_definitions(tool_definitions, logger=None): ) logger.debug(f"Original tool definitions: {tool_definitions}") return tool_definitions + + +_FAILED_TOOL_STATUSES = frozenset({"failed", "error", "incomplete", "cancelled", "canceled"}) + + +def _collect_failed_tool_statuses(agent_response_msgs): + """Return the list of failure statuses seen on any `tool_call` or + `tool_result` content block in `agent_response_msgs`. + + Inputs are intentionally tolerated -- malformed messages / non-dict + content blocks are skipped rather than raised on, so this helper is safe + to call on freshly-deserialized agent traces. + + :param agent_response_msgs: The agent's response message list (already + validated to be a list by the caller). + :type agent_response_msgs: list + :return: A list (with duplicates preserved) of lowercased failure status + strings. Empty list means no failure signal was found. 
+ :rtype: list[str] + """ + found = [] + if not isinstance(agent_response_msgs, list): + return found + for msg in agent_response_msgs: + if not isinstance(msg, dict): + continue + content = msg.get("content") + if not isinstance(content, list): + continue + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") in ("tool_call", "tool_result"): + status = block.get("status") + if isinstance(status, str) and status.lower() in _FAILED_TOOL_STATUSES: + found.append(status.lower()) + return found \ No newline at end of file diff --git a/assets/evaluators/builtin/tool_call_success/spec.yaml b/assets/evaluators/builtin/tool_call_success/spec.yaml index 9474076866..c1a1cdd77b 100644 --- a/assets/evaluators/builtin/tool_call_success/spec.yaml +++ b/assets/evaluators/builtin/tool_call_success/spec.yaml @@ -1,6 +1,6 @@ type: "evaluator" name: "builtin.tool_call_success" -version: 7 +version: 9 displayName: "Tool-Call-Success-Evaluator" description: "Evaluates whether all tool calls were successful or not. It checks all tool calls to determine if any of these resulted in technical failure like exception, error or timeout. This evaluator is useful for when you want to evaluate the tool calls generated by an AI agent for being successful." 
evaluatorType: "builtin" diff --git a/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py b/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py index 62123d1ba4..d736166538 100644 --- a/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py +++ b/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py @@ -1061,7 +1061,7 @@ def __init__( # Initialize input validator self._validator = ToolDefinitionsValidator( error_target=ExtendedErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/assets/evaluators/builtin/tool_input_accuracy/spec.yaml b/assets/evaluators/builtin/tool_input_accuracy/spec.yaml index 14ef5558bb..6c4cf837e0 100644 --- a/assets/evaluators/builtin/tool_input_accuracy/spec.yaml +++ b/assets/evaluators/builtin/tool_input_accuracy/spec.yaml @@ -1,6 +1,6 @@ type: "evaluator" name: "builtin.tool_input_accuracy" -version: 12 +version: 13 displayName: "Tool-Input-Accuracy-Evaluator" description: "A binary evaluator (0 or 1) that checks whether all parameters in an agent’s tool call are correct, validating grounding, type, format, completeness, and contextual appropriateness using LLM-based analysis. Use it to verify agent tool usage, API integration tests, or to ensure tool call parameters are fully correct in AI workflows." 
evaluatorType: "builtin" diff --git a/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py b/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py index e39eac83a4..8be992f186 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py +++ b/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py @@ -103,10 +103,18 @@ def _run_tool_type_test( expected_flow_called = assert_type == self.AssertType.PASS assert flow_mock is not None, "Flow mock should be set when use_mocking=True" if expected_flow_called: - flow_mock.assert_called_once_with( - timeout=600, - **expected_flow_inputs, - ) + # When expected_flow_inputs is empty (the base-class default for tool types whose + # captured expected-flow constants are not yet populated in common_tool_test_data), + # only assert that the flow was invoked exactly once. Once the per-tool fixtures + # land in a follow-up PR the subclass will populate expected_flow_inputs and the + # exact-arguments assertion will apply automatically. 
+ if expected_flow_inputs: + flow_mock.assert_called_once_with( + timeout=600, + **expected_flow_inputs, + ) + else: + flow_mock.assert_called_once() else: flow_mock.assert_not_called() diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py index 64308ef563..352cda645b 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py @@ -64,7 +64,11 @@ class TestToolCallAccuracyEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, B evaluator_type = ToolCallAccuracyEvaluator - check_for_unsupported_tools = True + # Restricted built-in tool types are accepted by the validator as of asset version 12 (formerly + # rejected with NOT_APPLICABLE). Per-tool expected_flow_inputs for the newly-enabled tool types + # are tracked in a follow-up PR; until they are captured the flow-mock arg matcher is relaxed + # for tools with an empty expected_flow_inputs dict. + check_for_unsupported_tools = False is_tool_definition_required = True diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_success_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_success_evaluator_behavior.py index 29aff870a7..6ba2b21310 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_success_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_success_evaluator_behavior.py @@ -60,7 +60,11 @@ class TestToolCallSuccessEvaluatorBehavior(BaseToolsEvaluatorBehaviorTest, BaseT evaluator_type = ToolCallSuccessEvaluator - check_for_unsupported_tools = True + # Restricted built-in tool types are accepted by the validator as of asset version 9 (formerly + # rejected with NOT_APPLICABLE).
Per-tool expected_flow_inputs for the newly-enabled tool types + # are tracked in a follow-up PR; until they are captured the flow-mock arg matcher is relaxed + # for tools with an empty expected_flow_inputs dict. + check_for_unsupported_tools = False # Test Configs requires_query = False diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_success_short_circuit.py b/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_success_short_circuit.py new file mode 100644 index 0000000000..d3f122df9b --- /dev/null +++ b/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_success_short_circuit.py @@ -0,0 +1,178 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Tests for the ToolCallSuccess deterministic status-based short-circuit. + +When the agent runtime reports a known-failure ``status`` on any tool_call / +tool_result content block (e.g. "failed", "error", "incomplete"), the +evaluator deterministically returns a ``fail`` result without calling the +LLM. Absent ``status``, behavior is unchanged. 
+""" + +import pytest + +from ...builtin.tool_call_success.evaluator._tool_call_success import ( + ToolCallSuccessEvaluator, + _FAILED_TOOL_STATUSES, + _collect_failed_tool_statuses, +) +from ..common.base_prompty_evaluator_runner import BasePromptyEvaluatorRunner + + +# region helpers + + +def _assistant_tool_call(tool_call_id, name, arguments, status=None): + """Build an assistant message carrying a single tool_call content block.""" + block = { + "type": "tool_call", + "tool_call_id": tool_call_id, + "name": name, + "arguments": arguments, + } + if status is not None: + block["status"] = status + return {"role": "assistant", "content": [block]} + + +def _tool_result(tool_call_id, result, status=None): + """Build a tool message carrying a single tool_result content block.""" + block = { + "type": "tool_result", + "tool_call_id": tool_call_id, + "tool_result": result, + } + if status is not None: + block["status"] = status + return { + "role": "tool", + "tool_call_id": tool_call_id, + "content": [block], + } + + +def _failing_response(): + """A minimal agent response with a failed tool execution.""" + return [ + _assistant_tool_call("call_1", "fetch_weather", {"location": "Seattle"}, status="failed"), + _tool_result("call_1", "", status="failed"), + ] + + +# endregion + + +@pytest.mark.unittest +class TestCollectFailedToolStatuses: + """Unit tests for the ``_collect_failed_tool_statuses`` helper.""" + + @pytest.mark.parametrize("status", sorted(_FAILED_TOOL_STATUSES)) + def test_each_failure_status_is_detected(self, status): + msgs = [_assistant_tool_call("c1", "x", {}, status=status)] + assert _collect_failed_tool_statuses(msgs) == [status] + + def test_case_insensitive_match(self): + msgs = [_assistant_tool_call("c1", "x", {}, status="FAILED")] + assert _collect_failed_tool_statuses(msgs) == ["failed"] + + def test_completed_status_is_not_detected(self): + msgs = [_assistant_tool_call("c1", "x", {}, status="completed")] + assert 
_collect_failed_tool_statuses(msgs) == [] + + def test_missing_status_is_not_detected(self): + msgs = [_assistant_tool_call("c1", "x", {})] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_status_on_tool_result_is_detected(self): + msgs = [_tool_result("c1", "", status="error")] + assert _collect_failed_tool_statuses(msgs) == ["error"] + + def test_duplicates_preserved_in_return(self): + msgs = [ + _assistant_tool_call("c1", "x", {}, status="failed"), + _tool_result("c1", "", status="failed"), + ] + assert _collect_failed_tool_statuses(msgs) == ["failed", "failed"] + + def test_status_on_unrelated_content_type_is_ignored(self): + msgs = [{"role": "assistant", "content": [{"type": "text", "text": "hi", "status": "failed"}]}] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_non_list_input_returns_empty(self): + assert _collect_failed_tool_statuses(None) == [] + assert _collect_failed_tool_statuses("not-a-list") == [] + assert _collect_failed_tool_statuses({"role": "assistant"}) == [] + + def test_malformed_messages_are_tolerated(self): + msgs = [ + None, + "not-a-dict", + {"role": "assistant"}, + {"role": "assistant", "content": "stringly"}, + {"role": "assistant", "content": [None, "x", {"type": "tool_call", "status": "failed"}]}, + ] + assert _collect_failed_tool_statuses(msgs) == ["failed"] + + def test_unknown_status_string_is_ignored(self): + msgs = [_assistant_tool_call("c1", "x", {}, status="weird_state")] + assert _collect_failed_tool_statuses(msgs) == [] + + +@pytest.mark.unittest +class TestToolCallSuccessShortCircuit(BasePromptyEvaluatorRunner): + """Integration tests that the evaluator short-circuits before invoking the LLM.""" + + evaluator_type = ToolCallSuccessEvaluator + + def _failing_query(self): + return [{"role": "user", "content": [{"type": "text", "text": "What's the weather?"}]}] + + def test_short_circuit_when_tool_call_status_is_failed(self): + results, flow_mock = self._run_evaluation_and_return_mocked_flow( + 
query=self._failing_query(), + response=_failing_response(), + ) + assert results["tool_call_success_result"] == "fail" + assert results["tool_call_success_passed"] is False + assert results["tool_call_success_score"] == 0.0 + assert results["tool_call_success_status"] == "completed" + properties = results["tool_call_success_properties"] + assert properties["short_circuit"] == "tool_status" + assert properties["failed_statuses"] == ["failed"] + flow_mock.assert_not_called() + + def test_short_circuit_dedupes_failed_statuses_in_properties(self): + response = [ + _assistant_tool_call("c1", "fetch_weather", {"location": "Seattle"}, status="failed"), + _tool_result("c1", "", status="error"), + _assistant_tool_call("c2", "send_email", {"to": "x@example.com"}, status="failed"), + ] + results, flow_mock = self._run_evaluation_and_return_mocked_flow( + query=self._failing_query(), + response=response, + ) + properties = results["tool_call_success_properties"] + assert properties["failed_statuses"] == ["error", "failed"] + flow_mock.assert_not_called() + + def test_no_short_circuit_when_all_statuses_completed(self): + response = [ + _assistant_tool_call("c1", "fetch_weather", {"location": "Seattle"}, status="completed"), + _tool_result("c1", "Sunny, 72F.", status="completed"), + ] + _, flow_mock = self._run_evaluation_and_return_mocked_flow( + query=self._failing_query(), + response=response, + ) + flow_mock.assert_called_once() + + def test_no_short_circuit_when_status_absent(self): + response = [ + _assistant_tool_call("c1", "fetch_weather", {"location": "Seattle"}), + _tool_result("c1", "Sunny, 72F."), + ] + _, flow_mock = self._run_evaluation_and_return_mocked_flow( + query=self._failing_query(), + response=response, + ) + flow_mock.assert_called_once() diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_tool_input_accuracy_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_tool_input_accuracy_evaluator_behavior.py index 
e0c18b2b3a..c5338b1769 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_tool_input_accuracy_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_tool_input_accuracy_evaluator_behavior.py @@ -60,7 +60,11 @@ class TestToolInputAccuracyEvaluatorBehavior(BaseToolsEvaluatorBehaviorTest, Bas evaluator_type = ToolInputAccuracyEvaluator - check_for_unsupported_tools = True + # Restricted built-in tool types are accepted by the validator as of asset version 13 (formerly + # rejected with NOT_APPLICABLE). Per-tool expected_flow_inputs for the newly-enabled tool types + # are tracked in a follow-up PR; until they are captured the flow-mock arg matcher is relaxed + # for tools with an empty expected_flow_inputs dict. + check_for_unsupported_tools = False # Test Configs requires_tool_definitions = True