Update Tools Evaluation Tests (#4926)

m7md7sien · web-flow · commit 5ad0a7947121 · 2026-04-23T19:00:06.000+02:00
* Update Tools Evaluation Tests

* Remove Unneeded Special Handling of OpenAPI Tool Calls

* Fix Flake8 issue

* Bump version
diff --git a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py
@@ -1,7 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-from itertools import chain
 import os
 import logging
 from typing import Dict, List, Union, TypeVar
@@ -1086,14 +1085,6 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
         built_in_definitions = _get_needed_built_in_definitions(tool_calls)
         needed_tool_definitions.extend(built_in_definitions)
 
-        # OpenAPI tool is a collection of functions, so we need to expand it
-        tool_definitions_expanded = list(
-            chain.from_iterable(
-                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
-                for tool in needed_tool_definitions
-            )
-        )
-
         # Validate that all tool calls have corresponding definitions
         for tool_call in tool_calls:
             if isinstance(tool_call, dict):
@@ -1107,7 +1098,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
                     elif tool_name:
                         # This is a regular function tool from converter or built-in tool from agent v2
                         tool_definition_exists = any(
-                            tool.get("name") == tool_name for tool in tool_definitions_expanded
+                            tool.get("name") == tool_name for tool in needed_tool_definitions
                         )
                         if not tool_definition_exists:
                             raise EvaluationException(
diff --git a/assets/evaluators/builtin/tool_call_accuracy/spec.yaml b/assets/evaluators/builtin/tool_call_accuracy/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.tool_call_accuracy"
-version: 7
+version: 8
 displayName: "Tool-Call-Accuracy-Evaluator"
 description: "Measures whether the agent selects the correct tool calls, applies the correct parameters, and tracks inefficient or missing too calls, in order to resolve a user's request. This is an umbrella evaluators that assessing overall tool call quality. Use this metric in agent-based systems, and AI assistants that rely on tool integration."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py b/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py
@@ -2,7 +2,6 @@
 # Licensed under the MIT License.
 import os
 import logging
-from itertools import chain
 from typing import Dict, List, Union, TypeVar, cast
 from typing_extensions import override
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
@@ -633,14 +632,6 @@ def _extract_needed_tool_definitions(
     built_in_definitions = _get_needed_built_in_tool_definitions(tool_calls)
     needed_tool_definitions.extend(built_in_definitions)
 
-    # OpenAPI tool is a collection of functions, so we need to expand it
-    tool_definitions_expanded = list(
-        chain.from_iterable(
-            tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
-            for tool in needed_tool_definitions
-        )
-    )
-
     # Validate that all tool calls have corresponding definitions
     for tool_call in tool_calls:
         if isinstance(tool_call, dict):
@@ -653,7 +644,7 @@ def _extract_needed_tool_definitions(
                     continue
                 elif tool_name:
                     # This is a regular function tool from converter or built-in tool from agent v2
-                    tool_definition_exists = any(tool.get("name") == tool_name for tool in tool_definitions_expanded)
+                    tool_definition_exists = any(tool.get("name") == tool_name for tool in needed_tool_definitions)
                     if not tool_definition_exists:
                         raise EvaluationException(
                             message=f"Tool definition for {tool_name} not found",
diff --git a/assets/evaluators/builtin/tool_input_accuracy/spec.yaml b/assets/evaluators/builtin/tool_input_accuracy/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.tool_input_accuracy"
-version: 9
+version: 10
 displayName: "Tool-Input-Accuracy-Evaluator"
 description: "A binary evaluator (0 or 1) that checks whether all parameters in an agent’s tool call are correct, validating grounding, type, format, completeness, and contextual appropriateness using LLM-based analysis. Use it to verify agent tool usage, API integration tests, or to ensure tool call parameters are fully correct in AI workflows."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/tool_selection/evaluator/_tool_selection.py b/assets/evaluators/builtin/tool_selection/evaluator/_tool_selection.py
@@ -4,7 +4,6 @@
 import logging
 from typing import Dict, List, Union, TypeVar
 from typing_extensions import override
-from itertools import chain
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._exceptions import (
     ErrorBlame,
@@ -731,14 +730,6 @@ def _extract_needed_tool_definitions(
     built_in_definitions = _get_needed_built_in_tool_definitions(tool_calls)
     needed_tool_definitions.extend(built_in_definitions)
 
-    # OpenAPI tool is a collection of functions, so we need to expand it
-    tool_definitions_expanded = list(
-        chain.from_iterable(
-            tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
-            for tool in needed_tool_definitions
-        )
-    )
-
     # Validate that all tool calls have corresponding definitions
     for tool_call in tool_calls:
         if isinstance(tool_call, dict):
@@ -751,7 +742,7 @@ def _extract_needed_tool_definitions(
                     continue
                 elif tool_name:
                     # This is a regular function tool from converter or built-in tool from agent v2
-                    tool_definition_exists = any(tool.get("name") == tool_name for tool in tool_definitions_expanded)
+                    tool_definition_exists = any(tool.get("name") == tool_name for tool in needed_tool_definitions)
                     if not tool_definition_exists:
                         raise EvaluationException(
                             message=f"Tool definition for {tool_name} not found",
diff --git a/assets/evaluators/builtin/tool_selection/spec.yaml b/assets/evaluators/builtin/tool_selection/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.tool_selection"
-version: 7
+version: 8
 displayName: "Tool-Selection-Evaluator"
 description: "Evaluates whether an AI agent selected the most appropriate and efficient tools for a given task, avoiding redundancy or missing essentials. Use it to assess tool choice quality in agent-based systems, orchestration platforms, and AI assistants that must pick the right tools from available options."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py b/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py
@@ -10,6 +10,7 @@
 and asserts correct behavior using assert_expected_behavior and assert_called_once_with.
 """
 
+import copy
 from typing import Any, Dict, Optional
 
 from . import common_tool_test_data as data
@@ -92,7 +93,7 @@ def _run_tool_type_test(
             Dictionary containing the extracted result data.
         """
         results, flow_mock = self._run_evaluation_and_return_mocked_flow(
-            **evaluation_inputs,
+            **copy.deepcopy(evaluation_inputs),
         )
         result_data = self._extract_and_print_result(results, test_label)
         self.assert_expected_behavior(assert_type, result_data)
diff --git a/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py b/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py
@@ -2044,15 +2044,96 @@
 )
 
 # ----- TCS expected flow response -----
-# For LOCAL_CALLS, FILE_SEARCH, IMAGE_GEN, MEMORY_SEARCH: _preprocess_messages is a no-op.
-LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = LOCAL_CALLS_RESPONSE
+# _preprocess_messages normalizes function_call/function_call_output types to tool_call/tool_result.
+LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = [
+    {
+        "run_id": "",
+        "role": "assistant",
+        "content": [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
+                "name": "get_horoscope",
+                "arguments": {"sign": "Aquarius"},
+            }
+        ],
+    },
+    {
+        "run_id": "",
+        "tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
+        "role": "tool",
+        "content": [
+            {
+                "type": "tool_result",
+                "tool_result": {"horoscope": "Aquarius: Next Tuesday you will befriend a baby otter."},
+            }
+        ],
+    },
+    {
+        "role": "assistant",
+        "content": [
+            {
+                "annotations": [],
+                "text": "Your horoscope for Aquarius is: Next Tuesday you will befriend a baby otter.",
+                "type": "output_text",
+                "logprobs": [],
+            }
+        ],
+    },
+]
 FILE_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = FILE_SEARCH_RESPONSE
 IMAGE_GEN_TCS_EXPECTED_FLOW_RESPONSE = IMAGE_GEN_RESPONSE
 MEMORY_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = MEMORY_SEARCH_RESPONSE
 # For KB_MCP and MCP: _preprocess_messages drops the first 2 MCP approval messages.
 KB_MCP_TCS_EXPECTED_FLOW_RESPONSE = KB_MCP_RESPONSE[2:]
 MCP_TCS_EXPECTED_FLOW_RESPONSE = MCP_RESPONSE[2:]
 
+# Normalized OPENAPI_RESPONSE: openapi_call -> tool_call, openapi_call_output -> tool_result
+OPENAPI_NORMALIZED_RESPONSE = [
+    {
+        "run_id": "",
+        "role": "assistant",
+        "content": [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
+                "name": "weather_GetCurrentWeather",
+                "arguments": {"location": "Cairo", "format": "j1"},
+            }
+        ],
+    },
+    {
+        "run_id": "",
+        "tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
+        "role": "tool",
+        "content": [{"type": "tool_result", "tool_result": ""}],
+    },
+    {
+        "role": "assistant",
+        "content": [
+            {
+                "annotations": [],
+                "text": (
+                    "**Current weather in Cairo:**\n\n- **Temperature:** 26\u00b0C (feels like 25\u00b0C)\n"
+                    "- **Condition:** Sand (likely some dusty or sandy winds)\n"
+                    "- **Humidity:** 28%\n"
+                    "- **Cloud Cover:** 0% (clear skies)\n"
+                    "- **Wind:** SW at 23 km/h\n"
+                    "- **Visibility:** Moderate (4 km)\n"
+                    "- **No precipitation**\n"
+                    "- **UV Index:** 2\n\n"
+                    "**Summary:** Cairo is experiencing warm, dry, and sunny weather, but there is "
+                    "sand or dust in the air which may reduce visibility. Skies are clear and "
+                    "it\u2019s breezy. Make sure to protect yourself from the dust if you\u2019re "
+                    "heading outside!"
+                ),
+                "type": "output_text",
+                "logprobs": [],
+            }
+        ],
+    },
+]
+
 
 # =============================================================================
 # Expected flow inputs shared across multiple evaluators
@@ -2734,9 +2815,6 @@
                 "arguments": {
                     "sign": "Aquarius",
                 },
-                "tool_result": {
-                    "horoscope": "Aquarius: Next Tuesday you will befriend a baby otter.",
-                },
             },
         ],
     },
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py
@@ -61,7 +61,7 @@ class TestCoherenceEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluati
 
     test_openapi_expected_flow_inputs = {
         "query": data.OPENAPI_QUERY,
-        "response": data.OPENAPI_RESPONSE,
+        "response": data.OPENAPI_NORMALIZED_RESPONSE,
     }
 
     test_web_search_expected_flow_inputs = {
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py
@@ -20,63 +20,63 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation
 
     # region Expected flow inputs for each test
     test_function_tool_local_calls_expected_flow_inputs = {
-        "response": data.LOCAL_CALLS_COHERENCE_EXPECTED_FLOW_RESPONSE,
+        "response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_code_interpreter_expected_flow_inputs = {
-        "response": data.CODE_INTERPRETER_RESPONSE,
+        "response": data.CODE_INTERPRETER_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_bing_grounding_expected_flow_inputs = {
-        "response": data.BING_GROUNDING_RESPONSE,
+        "response": data.BING_GROUNDING_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_bing_custom_search_expected_flow_inputs = {
-        "response": data.BING_CUSTOM_SEARCH_RESPONSE,
+        "response": data.BING_CUSTOM_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_file_search_expected_flow_inputs = {
-        "response": data.FILE_SEARCH_RESPONSE,
+        "response": data.FILE_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_azure_ai_search_expected_flow_inputs = {
-        "response": data.AZURE_AI_SEARCH_RESPONSE,
+        "response": data.AZURE_AI_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_sharepoint_grounding_expected_flow_inputs = {
-        "response": data.SHAREPOINT_RESPONSE,
+        "response": data.SHAREPOINT_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_fabric_data_agent_expected_flow_inputs = {
-        "response": data.FABRIC_RESPONSE,
+        "response": data.FABRIC_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_openapi_expected_flow_inputs = {
-        "response": data.OPENAPI_RESPONSE,
+        "response": data.OPENAPI_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_web_search_expected_flow_inputs = {
-        "response": data.WEB_SEARCH_RESPONSE,
+        "response": data.WEB_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_browser_automation_expected_flow_inputs = {
-        "response": data.BROWSER_AUTOMATION_RESPONSE,
+        "response": data.BROWSER_AUTOMATION_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_image_generation_expected_flow_inputs = {
-        "response": data.IMAGE_GEN_RESPONSE,
+        "response": data.IMAGE_GEN_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_memory_search_expected_flow_inputs = {
-        "response": data.MEMORY_SEARCH_RESPONSE,
+        "response": data.MEMORY_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_kb_mcp_expected_flow_inputs = {
-        "response": data.KB_MCP_TCS_EXPECTED_FLOW_RESPONSE,
+        "response": data.KB_MCP_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_mcp_expected_flow_inputs = {
-        "response": data.MCP_TCS_EXPECTED_FLOW_RESPONSE,
+        "response": data.MCP_IR_EXPECTED_FLOW_RESPONSE,
     }
     # endregion
 
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py
@@ -73,8 +73,11 @@ class TestToolSelectionEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, Base
         "tool_definitions": data.FABRIC_TOOL_DEFINITIONS,
     }
 
-    # OpenAPI: ToolSelection flow is not called (no extractable tool calls)
-    # Expected flow inputs not used since the test will not reach flow assertion
+    test_openapi_expected_flow_inputs = {
+        "query": data.OPENAPI_EXPECTED_FLOW_QUERY,
+        "tool_calls": ["weather_GetCurrentWeather"],
+        "tool_definitions": data.OPENAPI_TOOL_DEFINITIONS,
+    }
 
     test_web_search_expected_flow_inputs = {
         "query": data.WEB_SEARCH_EXPECTED_FLOW_QUERY,

Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,7 @@ class TestCoherenceEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluati`
`61`	`61`
`62`	`62`	`test_openapi_expected_flow_inputs = {`
`63`	`63`	`"query": data.OPENAPI_QUERY,`
`64`		`- "response": data.OPENAPI_RESPONSE,`
	`64`	`+ "response": data.OPENAPI_NORMALIZED_RESPONSE,`
`65`	`65`	`}`
`66`	`66`
`67`	`67`	`test_web_search_expected_flow_inputs = {`
Original file line number	Diff line number	Diff line change
`@@ -20,63 +20,63 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation`
`20`	`20`
`21`	`21`	`# region Expected flow inputs for each test`
`22`	`22`	`test_function_tool_local_calls_expected_flow_inputs = {`
`23`		`- "response": data.LOCAL_CALLS_COHERENCE_EXPECTED_FLOW_RESPONSE,`
	`23`	`+ "response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE,`
`24`	`24`	`}`
`25`	`25`
`26`	`26`	`test_code_interpreter_expected_flow_inputs = {`
`27`		`- "response": data.CODE_INTERPRETER_RESPONSE,`
	`27`	`+ "response": data.CODE_INTERPRETER_IR_EXPECTED_FLOW_RESPONSE,`
`28`	`28`	`}`
`29`	`29`
`30`	`30`	`test_bing_grounding_expected_flow_inputs = {`
`31`		`- "response": data.BING_GROUNDING_RESPONSE,`
	`31`	`+ "response": data.BING_GROUNDING_IR_EXPECTED_FLOW_RESPONSE,`
`32`	`32`	`}`
`33`	`33`
`34`	`34`	`test_bing_custom_search_expected_flow_inputs = {`
`35`		`- "response": data.BING_CUSTOM_SEARCH_RESPONSE,`
	`35`	`+ "response": data.BING_CUSTOM_SEARCH_IR_EXPECTED_FLOW_RESPONSE,`
`36`	`36`	`}`
`37`	`37`
`38`	`38`	`test_file_search_expected_flow_inputs = {`
`39`		`- "response": data.FILE_SEARCH_RESPONSE,`
	`39`	`+ "response": data.FILE_SEARCH_IR_EXPECTED_FLOW_RESPONSE,`
`40`	`40`	`}`
`41`	`41`
`42`	`42`	`test_azure_ai_search_expected_flow_inputs = {`
`43`		`- "response": data.AZURE_AI_SEARCH_RESPONSE,`
	`43`	`+ "response": data.AZURE_AI_SEARCH_IR_EXPECTED_FLOW_RESPONSE,`
`44`	`44`	`}`
`45`	`45`
`46`	`46`	`test_sharepoint_grounding_expected_flow_inputs = {`
`47`		`- "response": data.SHAREPOINT_RESPONSE,`
	`47`	`+ "response": data.SHAREPOINT_IR_EXPECTED_FLOW_RESPONSE,`
`48`	`48`	`}`
`49`	`49`
`50`	`50`	`test_fabric_data_agent_expected_flow_inputs = {`
`51`		`- "response": data.FABRIC_RESPONSE,`
	`51`	`+ "response": data.FABRIC_IR_EXPECTED_FLOW_RESPONSE,`
`52`	`52`	`}`
`53`	`53`
`54`	`54`	`test_openapi_expected_flow_inputs = {`
`55`		`- "response": data.OPENAPI_RESPONSE,`
	`55`	`+ "response": data.OPENAPI_IR_EXPECTED_FLOW_RESPONSE,`
`56`	`56`	`}`
`57`	`57`
`58`	`58`	`test_web_search_expected_flow_inputs = {`
`59`		`- "response": data.WEB_SEARCH_RESPONSE,`
	`59`	`+ "response": data.WEB_SEARCH_IR_EXPECTED_FLOW_RESPONSE,`
`60`	`60`	`}`
`61`	`61`
`62`	`62`	`test_browser_automation_expected_flow_inputs = {`
`63`		`- "response": data.BROWSER_AUTOMATION_RESPONSE,`
	`63`	`+ "response": data.BROWSER_AUTOMATION_IR_EXPECTED_FLOW_RESPONSE,`
`64`	`64`	`}`
`65`	`65`
`66`	`66`	`test_image_generation_expected_flow_inputs = {`
`67`		`- "response": data.IMAGE_GEN_RESPONSE,`
	`67`	`+ "response": data.IMAGE_GEN_IR_EXPECTED_FLOW_RESPONSE,`
`68`	`68`	`}`
`69`	`69`
`70`	`70`	`test_memory_search_expected_flow_inputs = {`
`71`		`- "response": data.MEMORY_SEARCH_RESPONSE,`
	`71`	`+ "response": data.MEMORY_SEARCH_IR_EXPECTED_FLOW_RESPONSE,`
`72`	`72`	`}`
`73`	`73`
`74`	`74`	`test_kb_mcp_expected_flow_inputs = {`
`75`		`- "response": data.KB_MCP_TCS_EXPECTED_FLOW_RESPONSE,`
	`75`	`+ "response": data.KB_MCP_IR_EXPECTED_FLOW_RESPONSE,`
`76`	`76`	`}`
`77`	`77`
`78`	`78`	`test_mcp_expected_flow_inputs = {`
`79`		`- "response": data.MCP_TCS_EXPECTED_FLOW_RESPONSE,`
	`79`	`+ "response": data.MCP_IR_EXPECTED_FLOW_RESPONSE,`
`80`	`80`	`}`
`81`	`81`	`# endregion`
`82`	`82`