diff --git a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py index 130b595564..20c9ebbe8c 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py +++ b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from itertools import chain import os import logging from typing import Dict, List, Union, TypeVar @@ -1086,14 +1085,6 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): built_in_definitions = _get_needed_built_in_definitions(tool_calls) needed_tool_definitions.extend(built_in_definitions) - # OpenAPI tool is a collection of functions, so we need to expand it - tool_definitions_expanded = list( - chain.from_iterable( - tool.get("functions", []) if tool.get("type") == "openapi" else [tool] - for tool in needed_tool_definitions - ) - ) - # Validate that all tool calls have corresponding definitions for tool_call in tool_calls: if isinstance(tool_call, dict): @@ -1107,7 +1098,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): elif tool_name: # This is a regular function tool from converter or built-in tool from agent v2 tool_definition_exists = any( - tool.get("name") == tool_name for tool in tool_definitions_expanded + tool.get("name") == tool_name for tool in needed_tool_definitions ) if not tool_definition_exists: raise EvaluationException( diff --git a/assets/evaluators/builtin/tool_call_accuracy/spec.yaml b/assets/evaluators/builtin/tool_call_accuracy/spec.yaml index a0f1037317..1562d16280 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/spec.yaml +++ b/assets/evaluators/builtin/tool_call_accuracy/spec.yaml @@ -1,6 +1,6 @@ type: "evaluator" name: "builtin.tool_call_accuracy" -version: 7 +version: 8 displayName: "Tool-Call-Accuracy-Evaluator" 
description: "Measures whether the agent selects the correct tool calls, applies the correct parameters, and tracks inefficient or missing tool calls, in order to resolve a user's request. This is an umbrella evaluator that assesses overall tool call quality. Use this metric in agent-based systems, and AI assistants that rely on tool integration." evaluatorType: "builtin" diff --git a/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py b/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py index adaaf8087f..4138ac2d78 100644 --- a/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py +++ b/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py @@ -2,7 +2,6 @@ # Licensed under the MIT License. import os import logging -from itertools import chain from typing import Dict, List, Union, TypeVar, cast from typing_extensions import override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -633,14 +632,6 @@ def _extract_needed_tool_definitions( built_in_definitions = _get_needed_built_in_tool_definitions(tool_calls) needed_tool_definitions.extend(built_in_definitions) - # OpenAPI tool is a collection of functions, so we need to expand it - tool_definitions_expanded = list( - chain.from_iterable( - tool.get("functions", []) if tool.get("type") == "openapi" else [tool] - for tool in needed_tool_definitions - ) - ) - # Validate that all tool calls have corresponding definitions for tool_call in tool_calls: if isinstance(tool_call, dict): @@ -653,7 +644,7 @@ def _extract_needed_tool_definitions( continue elif tool_name: # This is a regular function tool from converter or built-in tool from agent v2 - tool_definition_exists = any(tool.get("name") == tool_name for tool in tool_definitions_expanded) + tool_definition_exists = any(tool.get("name") == tool_name for tool in needed_tool_definitions) if not tool_definition_exists: raise
EvaluationException( message=f"Tool definition for {tool_name} not found", diff --git a/assets/evaluators/builtin/tool_input_accuracy/spec.yaml b/assets/evaluators/builtin/tool_input_accuracy/spec.yaml index dea81aff21..dde6b58e35 100644 --- a/assets/evaluators/builtin/tool_input_accuracy/spec.yaml +++ b/assets/evaluators/builtin/tool_input_accuracy/spec.yaml @@ -1,6 +1,6 @@ type: "evaluator" name: "builtin.tool_input_accuracy" -version: 9 +version: 10 displayName: "Tool-Input-Accuracy-Evaluator" description: "A binary evaluator (0 or 1) that checks whether all parameters in an agent’s tool call are correct, validating grounding, type, format, completeness, and contextual appropriateness using LLM-based analysis. Use it to verify agent tool usage, API integration tests, or to ensure tool call parameters are fully correct in AI workflows." evaluatorType: "builtin" diff --git a/assets/evaluators/builtin/tool_selection/evaluator/_tool_selection.py b/assets/evaluators/builtin/tool_selection/evaluator/_tool_selection.py index 0c928429e8..01499f9fdb 100644 --- a/assets/evaluators/builtin/tool_selection/evaluator/_tool_selection.py +++ b/assets/evaluators/builtin/tool_selection/evaluator/_tool_selection.py @@ -4,7 +4,6 @@ import logging from typing import Dict, List, Union, TypeVar from typing_extensions import override -from itertools import chain from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase from azure.ai.evaluation._exceptions import ( ErrorBlame, @@ -731,14 +730,6 @@ def _extract_needed_tool_definitions( built_in_definitions = _get_needed_built_in_tool_definitions(tool_calls) needed_tool_definitions.extend(built_in_definitions) - # OpenAPI tool is a collection of functions, so we need to expand it - tool_definitions_expanded = list( - chain.from_iterable( - tool.get("functions", []) if tool.get("type") == "openapi" else [tool] - for tool in needed_tool_definitions - ) - ) - # Validate that all tool calls have corresponding definitions for 
tool_call in tool_calls: if isinstance(tool_call, dict): @@ -751,7 +742,7 @@ def _extract_needed_tool_definitions( continue elif tool_name: # This is a regular function tool from converter or built-in tool from agent v2 - tool_definition_exists = any(tool.get("name") == tool_name for tool in tool_definitions_expanded) + tool_definition_exists = any(tool.get("name") == tool_name for tool in needed_tool_definitions) if not tool_definition_exists: raise EvaluationException( message=f"Tool definition for {tool_name} not found", diff --git a/assets/evaluators/builtin/tool_selection/spec.yaml b/assets/evaluators/builtin/tool_selection/spec.yaml index 281c80f25f..be22d5648d 100644 --- a/assets/evaluators/builtin/tool_selection/spec.yaml +++ b/assets/evaluators/builtin/tool_selection/spec.yaml @@ -1,6 +1,6 @@ type: "evaluator" name: "builtin.tool_selection" -version: 7 +version: 8 displayName: "Tool-Selection-Evaluator" description: "Evaluates whether an AI agent selected the most appropriate and efficient tools for a given task, avoiding redundancy or missing essentials. Use it to assess tool choice quality in agent-based systems, orchestration platforms, and AI assistants that must pick the right tools from available options." evaluatorType: "builtin" diff --git a/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py b/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py index 72257a7667..e39eac83a4 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py +++ b/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py @@ -10,6 +10,7 @@ and asserts correct behavior using assert_expected_behavior and assert_called_once_with. """ +import copy from typing import Any, Dict, Optional from . import common_tool_test_data as data @@ -92,7 +93,7 @@ def _run_tool_type_test( Dictionary containing the extracted result data. 
""" results, flow_mock = self._run_evaluation_and_return_mocked_flow( - **evaluation_inputs, + **copy.deepcopy(evaluation_inputs), ) result_data = self._extract_and_print_result(results, test_label) self.assert_expected_behavior(assert_type, result_data) diff --git a/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py b/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py index cdcb98be7a..7a4c635177 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py +++ b/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py @@ -2044,8 +2044,43 @@ ) # ----- TCS expected flow response ----- -# For LOCAL_CALLS, FILE_SEARCH, IMAGE_GEN, MEMORY_SEARCH: _preprocess_messages is a no-op. -LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = LOCAL_CALLS_RESPONSE +# _preprocess_messages normalizes function_call/function_call_output types to tool_call/tool_result. +LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = [ + { + "run_id": "", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX", + "name": "get_horoscope", + "arguments": {"sign": "Aquarius"}, + } + ], + }, + { + "run_id": "", + "tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": {"horoscope": "Aquarius: Next Tuesday you will befriend a baby otter."}, + } + ], + }, + { + "role": "assistant", + "content": [ + { + "annotations": [], + "text": "Your horoscope for Aquarius is: Next Tuesday you will befriend a baby otter.", + "type": "output_text", + "logprobs": [], + } + ], + }, +] FILE_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = FILE_SEARCH_RESPONSE IMAGE_GEN_TCS_EXPECTED_FLOW_RESPONSE = IMAGE_GEN_RESPONSE MEMORY_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = MEMORY_SEARCH_RESPONSE @@ -2053,6 +2088,52 @@ KB_MCP_TCS_EXPECTED_FLOW_RESPONSE = KB_MCP_RESPONSE[2:] MCP_TCS_EXPECTED_FLOW_RESPONSE = MCP_RESPONSE[2:] +# Normalized OPENAPI_RESPONSE: 
openapi_call -> tool_call, openapi_call_output -> tool_result +OPENAPI_NORMALIZED_RESPONSE = [ + { + "run_id": "", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab", + "name": "weather_GetCurrentWeather", + "arguments": {"location": "Cairo", "format": "j1"}, + } + ], + }, + { + "run_id": "", + "tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab", + "role": "tool", + "content": [{"type": "tool_result", "tool_result": ""}], + }, + { + "role": "assistant", + "content": [ + { + "annotations": [], + "text": ( + "**Current weather in Cairo:**\n\n- **Temperature:** 26\u00b0C (feels like 25\u00b0C)\n" + "- **Condition:** Sand (likely some dusty or sandy winds)\n" + "- **Humidity:** 28%\n" + "- **Cloud Cover:** 0% (clear skies)\n" + "- **Wind:** SW at 23 km/h\n" + "- **Visibility:** Moderate (4 km)\n" + "- **No precipitation**\n" + "- **UV Index:** 2\n\n" + "**Summary:** Cairo is experiencing warm, dry, and sunny weather, but there is " + "sand or dust in the air which may reduce visibility. Skies are clear and " + "it\u2019s breezy. Make sure to protect yourself from the dust if you\u2019re " + "heading outside!" 
+ ), + "type": "output_text", + "logprobs": [], + } + ], + }, +] + # ============================================================================= # Expected flow inputs shared across multiple evaluators @@ -2734,9 +2815,6 @@ "arguments": { "sign": "Aquarius", }, - "tool_result": { - "horoscope": "Aquarius: Next Tuesday you will befriend a baby otter.", - }, }, ], }, diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py index bbf5cddd3a..fe8eb02ae9 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py @@ -61,7 +61,7 @@ class TestCoherenceEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluati test_openapi_expected_flow_inputs = { "query": data.OPENAPI_QUERY, - "response": data.OPENAPI_RESPONSE, + "response": data.OPENAPI_NORMALIZED_RESPONSE, } test_web_search_expected_flow_inputs = { diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py index 1cf9064743..e7723befce 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py @@ -20,63 +20,63 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation # region Expected flow inputs for each test test_function_tool_local_calls_expected_flow_inputs = { - "response": data.LOCAL_CALLS_COHERENCE_EXPECTED_FLOW_RESPONSE, + "response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE, } test_code_interpreter_expected_flow_inputs = { - "response": data.CODE_INTERPRETER_RESPONSE, + "response": data.CODE_INTERPRETER_IR_EXPECTED_FLOW_RESPONSE, } test_bing_grounding_expected_flow_inputs = { 
- "response": data.BING_GROUNDING_RESPONSE, + "response": data.BING_GROUNDING_IR_EXPECTED_FLOW_RESPONSE, } test_bing_custom_search_expected_flow_inputs = { - "response": data.BING_CUSTOM_SEARCH_RESPONSE, + "response": data.BING_CUSTOM_SEARCH_IR_EXPECTED_FLOW_RESPONSE, } test_file_search_expected_flow_inputs = { - "response": data.FILE_SEARCH_RESPONSE, + "response": data.FILE_SEARCH_IR_EXPECTED_FLOW_RESPONSE, } test_azure_ai_search_expected_flow_inputs = { - "response": data.AZURE_AI_SEARCH_RESPONSE, + "response": data.AZURE_AI_SEARCH_IR_EXPECTED_FLOW_RESPONSE, } test_sharepoint_grounding_expected_flow_inputs = { - "response": data.SHAREPOINT_RESPONSE, + "response": data.SHAREPOINT_IR_EXPECTED_FLOW_RESPONSE, } test_fabric_data_agent_expected_flow_inputs = { - "response": data.FABRIC_RESPONSE, + "response": data.FABRIC_IR_EXPECTED_FLOW_RESPONSE, } test_openapi_expected_flow_inputs = { - "response": data.OPENAPI_RESPONSE, + "response": data.OPENAPI_IR_EXPECTED_FLOW_RESPONSE, } test_web_search_expected_flow_inputs = { - "response": data.WEB_SEARCH_RESPONSE, + "response": data.WEB_SEARCH_IR_EXPECTED_FLOW_RESPONSE, } test_browser_automation_expected_flow_inputs = { - "response": data.BROWSER_AUTOMATION_RESPONSE, + "response": data.BROWSER_AUTOMATION_IR_EXPECTED_FLOW_RESPONSE, } test_image_generation_expected_flow_inputs = { - "response": data.IMAGE_GEN_RESPONSE, + "response": data.IMAGE_GEN_IR_EXPECTED_FLOW_RESPONSE, } test_memory_search_expected_flow_inputs = { - "response": data.MEMORY_SEARCH_RESPONSE, + "response": data.MEMORY_SEARCH_IR_EXPECTED_FLOW_RESPONSE, } test_kb_mcp_expected_flow_inputs = { - "response": data.KB_MCP_TCS_EXPECTED_FLOW_RESPONSE, + "response": data.KB_MCP_IR_EXPECTED_FLOW_RESPONSE, } test_mcp_expected_flow_inputs = { - "response": data.MCP_TCS_EXPECTED_FLOW_RESPONSE, + "response": data.MCP_IR_EXPECTED_FLOW_RESPONSE, } # endregion diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py 
b/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py index 0e392a35a2..b6aff1d337 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py @@ -73,8 +73,11 @@ class TestToolSelectionEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, Base "tool_definitions": data.FABRIC_TOOL_DEFINITIONS, } - # OpenAPI: ToolSelection flow is not called (no extractable tool calls) - # Expected flow inputs not used since the test will not reach flow assertion + test_openapi_expected_flow_inputs = { + "query": data.OPENAPI_EXPECTED_FLOW_QUERY, + "tool_calls": ["weather_GetCurrentWeather"], + "tool_definitions": data.OPENAPI_TOOL_DEFINITIONS, + } test_web_search_expected_flow_inputs = { "query": data.WEB_SEARCH_EXPECTED_FLOW_QUERY,