Azure · m7md7sien · Apr 23, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
@@ -1,7 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-from itertools import chain
 import os
 import logging
 from typing import Dict, List, Union, TypeVar
@@ -1086,14 +1085,6 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
         built_in_definitions = _get_needed_built_in_definitions(tool_calls)
         needed_tool_definitions.extend(built_in_definitions)
 
-        # OpenAPI tool is a collection of functions, so we need to expand it
-        tool_definitions_expanded = list(
-            chain.from_iterable(
-                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
-                for tool in needed_tool_definitions
-            )
-        )
-
         # Validate that all tool calls have corresponding definitions
         for tool_call in tool_calls:
             if isinstance(tool_call, dict):
@@ -1107,7 +1098,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
                     elif tool_name:
                         # This is a regular function tool from converter or built-in tool from agent v2
                         tool_definition_exists = any(
-                            tool.get("name") == tool_name for tool in tool_definitions_expanded
+                            tool.get("name") == tool_name for tool in needed_tool_definitions
                         )
                         if not tool_definition_exists:
                             raise EvaluationException(

@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.tool_call_accuracy"
-version: 7
+version: 8
 displayName: "Tool-Call-Accuracy-Evaluator"
 description: "Measures whether the agent selects the correct tool calls, applies the correct parameters, and tracks inefficient or missing tool calls, in order to resolve a user's request. This is an umbrella evaluator that assesses overall tool call quality. Use this metric in agent-based systems and AI assistants that rely on tool integration."
 evaluatorType: "builtin"

@@ -2,7 +2,6 @@
 # Licensed under the MIT License.
 import os
 import logging
-from itertools import chain
 from typing import Dict, List, Union, TypeVar, cast
 from typing_extensions import override
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
@@ -633,14 +632,6 @@ def _extract_needed_tool_definitions(
     built_in_definitions = _get_needed_built_in_tool_definitions(tool_calls)
     needed_tool_definitions.extend(built_in_definitions)
 
-    # OpenAPI tool is a collection of functions, so we need to expand it
-    tool_definitions_expanded = list(
-        chain.from_iterable(
-            tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
-            for tool in needed_tool_definitions
-        )
-    )
-
     # Validate that all tool calls have corresponding definitions
     for tool_call in tool_calls:
         if isinstance(tool_call, dict):
@@ -653,7 +644,7 @@ def _extract_needed_tool_definitions(
                     continue
                 elif tool_name:
                     # This is a regular function tool from converter or built-in tool from agent v2
-                    tool_definition_exists = any(tool.get("name") == tool_name for tool in tool_definitions_expanded)
+                    tool_definition_exists = any(tool.get("name") == tool_name for tool in needed_tool_definitions)
                     if not tool_definition_exists:
                         raise EvaluationException(
                             message=f"Tool definition for {tool_name} not found",

@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.tool_input_accuracy"
-version: 9
+version: 10
 displayName: "Tool-Input-Accuracy-Evaluator"
 description: "A binary evaluator (0 or 1) that checks whether all parameters in an agent’s tool call are correct, validating grounding, type, format, completeness, and contextual appropriateness using LLM-based analysis. Use it to verify agent tool usage, API integration tests, or to ensure tool call parameters are fully correct in AI workflows."
 evaluatorType: "builtin"

@@ -4,7 +4,6 @@
 import logging
 from typing import Dict, List, Union, TypeVar
 from typing_extensions import override
-from itertools import chain
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._exceptions import (
     ErrorBlame,
@@ -731,14 +730,6 @@ def _extract_needed_tool_definitions(
     built_in_definitions = _get_needed_built_in_tool_definitions(tool_calls)
     needed_tool_definitions.extend(built_in_definitions)
 
-    # OpenAPI tool is a collection of functions, so we need to expand it
-    tool_definitions_expanded = list(
-        chain.from_iterable(
-            tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
-            for tool in needed_tool_definitions
-        )
-    )
-
     # Validate that all tool calls have corresponding definitions
     for tool_call in tool_calls:
         if isinstance(tool_call, dict):
@@ -751,7 +742,7 @@ def _extract_needed_tool_definitions(
                     continue
                 elif tool_name:
                     # This is a regular function tool from converter or built-in tool from agent v2
-                    tool_definition_exists = any(tool.get("name") == tool_name for tool in tool_definitions_expanded)
+                    tool_definition_exists = any(tool.get("name") == tool_name for tool in needed_tool_definitions)
                     if not tool_definition_exists:
                         raise EvaluationException(
                             message=f"Tool definition for {tool_name} not found",

@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.tool_selection"
-version: 7
+version: 8
 displayName: "Tool-Selection-Evaluator"
 description: "Evaluates whether an AI agent selected the most appropriate and efficient tools for a given task, avoiding redundancy or missing essentials. Use it to assess tool choice quality in agent-based systems, orchestration platforms, and AI assistants that must pick the right tools from available options."
 evaluatorType: "builtin"

@@ -10,6 +10,7 @@
 and asserts correct behavior using assert_expected_behavior and assert_called_once_with.
 """
 
+import copy
 from typing import Any, Dict, Optional
 
 from . import common_tool_test_data as data
@@ -92,7 +93,7 @@ def _run_tool_type_test(
             Dictionary containing the extracted result data.
         """
         results, flow_mock = self._run_evaluation_and_return_mocked_flow(
-            **evaluation_inputs,
+            **copy.deepcopy(evaluation_inputs),
         )
         result_data = self._extract_and_print_result(results, test_label)
         self.assert_expected_behavior(assert_type, result_data)

@@ -2044,15 +2044,96 @@
 )
 
 # ----- TCS expected flow response -----
-# For LOCAL_CALLS, FILE_SEARCH, IMAGE_GEN, MEMORY_SEARCH: _preprocess_messages is a no-op.
-LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = LOCAL_CALLS_RESPONSE
+# _preprocess_messages normalizes function_call/function_call_output types to tool_call/tool_result.
+LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = [
+    {
+        "run_id": "",
+        "role": "assistant",
+        "content": [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
+                "name": "get_horoscope",
+                "arguments": {"sign": "Aquarius"},
+            }
+        ],
+    },
+    {
+        "run_id": "",
+        "tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
+        "role": "tool",
+        "content": [
+            {
+                "type": "tool_result",
+                "tool_result": {"horoscope": "Aquarius: Next Tuesday you will befriend a baby otter."},
+            }
+        ],
+    },
+    {
+        "role": "assistant",
+        "content": [
+            {
+                "annotations": [],
+                "text": "Your horoscope for Aquarius is: Next Tuesday you will befriend a baby otter.",
+                "type": "output_text",
+                "logprobs": [],
+            }
+        ],
+    },
+]
 FILE_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = FILE_SEARCH_RESPONSE
 IMAGE_GEN_TCS_EXPECTED_FLOW_RESPONSE = IMAGE_GEN_RESPONSE
 MEMORY_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = MEMORY_SEARCH_RESPONSE
 # For KB_MCP and MCP: _preprocess_messages drops the first 2 MCP approval messages.
 KB_MCP_TCS_EXPECTED_FLOW_RESPONSE = KB_MCP_RESPONSE[2:]
 MCP_TCS_EXPECTED_FLOW_RESPONSE = MCP_RESPONSE[2:]
 
+# Normalized OPENAPI_RESPONSE: openapi_call -> tool_call, openapi_call_output -> tool_result
+OPENAPI_NORMALIZED_RESPONSE = [
+    {
+        "run_id": "",
+        "role": "assistant",
+        "content": [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
+                "name": "weather_GetCurrentWeather",
+                "arguments": {"location": "Cairo", "format": "j1"},
+            }
+        ],
+    },
+    {
+        "run_id": "",
+        "tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
+        "role": "tool",
+        "content": [{"type": "tool_result", "tool_result": ""}],
+    },
+    {
+        "role": "assistant",
+        "content": [
+            {
+                "annotations": [],
+                "text": (
+                    "**Current weather in Cairo:**\n\n- **Temperature:** 26\u00b0C (feels like 25\u00b0C)\n"
+                    "- **Condition:** Sand (likely some dusty or sandy winds)\n"
+                    "- **Humidity:** 28%\n"
+                    "- **Cloud Cover:** 0% (clear skies)\n"
+                    "- **Wind:** SW at 23 km/h\n"
+                    "- **Visibility:** Moderate (4 km)\n"
+                    "- **No precipitation**\n"
+                    "- **UV Index:** 2\n\n"
+                    "**Summary:** Cairo is experiencing warm, dry, and sunny weather, but there is "
+                    "sand or dust in the air which may reduce visibility. Skies are clear and "
+                    "it\u2019s breezy. Make sure to protect yourself from the dust if you\u2019re "
+                    "heading outside!"
+                ),
+                "type": "output_text",
+                "logprobs": [],
+            }
+        ],
+    },
+]
+
 
 # =============================================================================
 # Expected flow inputs shared across multiple evaluators
@@ -2734,9 +2815,6 @@
                 "arguments": {
                     "sign": "Aquarius",
                 },
-                "tool_result": {
-                    "horoscope": "Aquarius: Next Tuesday you will befriend a baby otter.",
-                },
             },
         ],
     },

@@ -61,7 +61,7 @@ class TestCoherenceEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluati
 
     test_openapi_expected_flow_inputs = {
         "query": data.OPENAPI_QUERY,
-        "response": data.OPENAPI_RESPONSE,
+        "response": data.OPENAPI_NORMALIZED_RESPONSE,
     }
 
     test_web_search_expected_flow_inputs = {

@@ -20,63 +20,63 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation
 
     # region Expected flow inputs for each test
     test_function_tool_local_calls_expected_flow_inputs = {
-        "response": data.LOCAL_CALLS_COHERENCE_EXPECTED_FLOW_RESPONSE,
+        "response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_code_interpreter_expected_flow_inputs = {
-        "response": data.CODE_INTERPRETER_RESPONSE,
+        "response": data.CODE_INTERPRETER_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_bing_grounding_expected_flow_inputs = {
-        "response": data.BING_GROUNDING_RESPONSE,
+        "response": data.BING_GROUNDING_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_bing_custom_search_expected_flow_inputs = {
-        "response": data.BING_CUSTOM_SEARCH_RESPONSE,
+        "response": data.BING_CUSTOM_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_file_search_expected_flow_inputs = {
-        "response": data.FILE_SEARCH_RESPONSE,
+        "response": data.FILE_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_azure_ai_search_expected_flow_inputs = {
-        "response": data.AZURE_AI_SEARCH_RESPONSE,
+        "response": data.AZURE_AI_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_sharepoint_grounding_expected_flow_inputs = {
-        "response": data.SHAREPOINT_RESPONSE,
+        "response": data.SHAREPOINT_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_fabric_data_agent_expected_flow_inputs = {
-        "response": data.FABRIC_RESPONSE,
+        "response": data.FABRIC_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_openapi_expected_flow_inputs = {
-        "response": data.OPENAPI_RESPONSE,
+        "response": data.OPENAPI_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_web_search_expected_flow_inputs = {
-        "response": data.WEB_SEARCH_RESPONSE,
+        "response": data.WEB_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_browser_automation_expected_flow_inputs = {
-        "response": data.BROWSER_AUTOMATION_RESPONSE,
+        "response": data.BROWSER_AUTOMATION_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_image_generation_expected_flow_inputs = {
-        "response": data.IMAGE_GEN_RESPONSE,
+        "response": data.IMAGE_GEN_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_memory_search_expected_flow_inputs = {
-        "response": data.MEMORY_SEARCH_RESPONSE,
+        "response": data.MEMORY_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_kb_mcp_expected_flow_inputs = {
-        "response": data.KB_MCP_TCS_EXPECTED_FLOW_RESPONSE,
+        "response": data.KB_MCP_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_mcp_expected_flow_inputs = {
-        "response": data.MCP_TCS_EXPECTED_FLOW_RESPONSE,
+        "response": data.MCP_IR_EXPECTED_FLOW_RESPONSE,
     }
     # endregion
 

@@ -73,8 +73,11 @@ class TestToolSelectionEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, Base
         "tool_definitions": data.FABRIC_TOOL_DEFINITIONS,
     }
 
-    # OpenAPI: ToolSelection flow is not called (no extractable tool calls)
-    # Expected flow inputs not used since the test will not reach flow assertion
+    test_openapi_expected_flow_inputs = {
+        "query": data.OPENAPI_EXPECTED_FLOW_QUERY,
+        "tool_calls": ["weather_GetCurrentWeather"],
+        "tool_definitions": data.OPENAPI_TOOL_DEFINITIONS,
+    }
 
     test_web_search_expected_flow_inputs = {
         "query": data.WEB_SEARCH_EXPECTED_FLOW_QUERY,