Update Tools Evaluation Tests

m7md7sien · m7md7sien · commit ea9c8ea12e7a · 2026-04-09T18:56:27.000+02:00
diff --git a/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py b/assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py
@@ -10,6 +10,7 @@
 and asserts correct behavior using assert_expected_behavior and assert_called_once_with.
 """
 
+import copy
 from typing import Any, Dict, Optional
 
 from . import common_tool_test_data as data
@@ -92,7 +93,7 @@ def _run_tool_type_test(
             Dictionary containing the extracted result data.
         """
         results, flow_mock = self._run_evaluation_and_return_mocked_flow(
-            **evaluation_inputs,
+            **copy.deepcopy(evaluation_inputs),
         )
         result_data = self._extract_and_print_result(results, test_label)
         self.assert_expected_behavior(assert_type, result_data)
diff --git a/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py b/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py
@@ -2044,15 +2044,96 @@
 )
 
 # ----- TCS expected flow response -----
-# For LOCAL_CALLS, FILE_SEARCH, IMAGE_GEN, MEMORY_SEARCH: _preprocess_messages is a no-op.
-LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = LOCAL_CALLS_RESPONSE
+# LOCAL_CALLS: _preprocess_messages normalizes function_call/function_call_output types to tool_call/tool_result; FILE_SEARCH, IMAGE_GEN and MEMORY_SEARCH below are expected unchanged.
+LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = [
+    {
+        "run_id": "",
+        "role": "assistant",
+        "content": [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
+                "name": "get_horoscope",
+                "arguments": {"sign": "Aquarius"},
+            }
+        ],
+    },
+    {
+        "run_id": "",
+        "tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
+        "role": "tool",
+        "content": [
+            {
+                "type": "tool_result",
+                "tool_result": {"horoscope": "Aquarius: Next Tuesday you will befriend a baby otter."},
+            }
+        ],
+    },
+    {
+        "role": "assistant",
+        "content": [
+            {
+                "annotations": [],
+                "text": "Your horoscope for Aquarius is: Next Tuesday you will befriend a baby otter.",
+                "type": "output_text",
+                "logprobs": [],
+            }
+        ],
+    },
+]
 FILE_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = FILE_SEARCH_RESPONSE
 IMAGE_GEN_TCS_EXPECTED_FLOW_RESPONSE = IMAGE_GEN_RESPONSE
 MEMORY_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = MEMORY_SEARCH_RESPONSE
 # For KB_MCP and MCP: _preprocess_messages drops the first 2 MCP approval messages.
 KB_MCP_TCS_EXPECTED_FLOW_RESPONSE = KB_MCP_RESPONSE[2:]
 MCP_TCS_EXPECTED_FLOW_RESPONSE = MCP_RESPONSE[2:]
 
+# Normalized OPENAPI_RESPONSE: openapi_call -> tool_call, openapi_call_output -> tool_result
+OPENAPI_NORMALIZED_RESPONSE = [
+    {
+        "run_id": "",
+        "role": "assistant",
+        "content": [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
+                "name": "weather_GetCurrentWeather",
+                "arguments": {"location": "Cairo", "format": "j1"},
+            }
+        ],
+    },
+    {
+        "run_id": "",
+        "tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
+        "role": "tool",
+        "content": [{"type": "tool_result", "tool_result": ""}],
+    },
+    {
+        "role": "assistant",
+        "content": [
+            {
+                "annotations": [],
+                "text": (
+                    "**Current weather in Cairo:**\n\n- **Temperature:** 26\u00b0C (feels like 25\u00b0C)\n"
+                    "- **Condition:** Sand (likely some dusty or sandy winds)\n"
+                    "- **Humidity:** 28%\n"
+                    "- **Cloud Cover:** 0% (clear skies)\n"
+                    "- **Wind:** SW at 23 km/h\n"
+                    "- **Visibility:** Moderate (4 km)\n"
+                    "- **No precipitation**\n"
+                    "- **UV Index:** 2\n\n"
+                    "**Summary:** Cairo is experiencing warm, dry, and sunny weather, but there is "
+                    "sand or dust in the air which may reduce visibility. Skies are clear and "
+                    "it\u2019s breezy. Make sure to protect yourself from the dust if you\u2019re "
+                    "heading outside!"
+                ),
+                "type": "output_text",
+                "logprobs": [],
+            }
+        ],
+    },
+]
+
 
 # =============================================================================
 # Expected flow inputs shared across multiple evaluators
@@ -2314,6 +2395,8 @@
 )
 
 OPENAPI_EXPECTED_FLOW_RESPONSE = (
+                                 '[TOOL_CALL] weather_GetCurrentWeather(location="Cairo", format="j1")\n'
+                                 '[TOOL_RESULT] \n'
                                  "**Current weather in Cairo:**\n\n- **Temperature:** 26°C (feels like 25°C)\n- "
                                  "**Condition:** Sand (likely some dusty or sandy winds)\n- **Humidity:** 28%\n- "
                                  "**Cloud Cover:** 0% (clear skies)\n- **Wind:** SW at 23 km/h\n- **Visibility:** "
@@ -2732,9 +2815,6 @@
                 "arguments": {
                     "sign": "Aquarius",
                 },
-                "tool_result": {
-                    "horoscope": "Aquarius: Next Tuesday you will befriend a baby otter.",
-                },
             },
         ],
     },
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py
@@ -61,7 +61,7 @@ class TestCoherenceEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluati
 
     test_openapi_expected_flow_inputs = {
         "query": data.OPENAPI_QUERY,
-        "response": data.OPENAPI_RESPONSE,
+        "response": data.OPENAPI_NORMALIZED_RESPONSE,
     }
 
     test_web_search_expected_flow_inputs = {
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py
@@ -20,63 +20,63 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation
 
     # region Expected flow inputs for each test
     test_function_tool_local_calls_expected_flow_inputs = {
-        "response": data.LOCAL_CALLS_COHERENCE_EXPECTED_FLOW_RESPONSE,
+        "response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_code_interpreter_expected_flow_inputs = {
-        "response": data.CODE_INTERPRETER_RESPONSE,
+        "response": data.CODE_INTERPRETER_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_bing_grounding_expected_flow_inputs = {
-        "response": data.BING_GROUNDING_RESPONSE,
+        "response": data.BING_GROUNDING_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_bing_custom_search_expected_flow_inputs = {
-        "response": data.BING_CUSTOM_SEARCH_RESPONSE,
+        "response": data.BING_CUSTOM_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_file_search_expected_flow_inputs = {
-        "response": data.FILE_SEARCH_RESPONSE,
+        "response": data.FILE_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_azure_ai_search_expected_flow_inputs = {
-        "response": data.AZURE_AI_SEARCH_RESPONSE,
+        "response": data.AZURE_AI_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_sharepoint_grounding_expected_flow_inputs = {
-        "response": data.SHAREPOINT_RESPONSE,
+        "response": data.SHAREPOINT_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_fabric_data_agent_expected_flow_inputs = {
-        "response": data.FABRIC_RESPONSE,
+        "response": data.FABRIC_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_openapi_expected_flow_inputs = {
-        "response": data.OPENAPI_RESPONSE,
+        "response": data.OPENAPI_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_web_search_expected_flow_inputs = {
-        "response": data.WEB_SEARCH_RESPONSE,
+        "response": data.WEB_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_browser_automation_expected_flow_inputs = {
-        "response": data.BROWSER_AUTOMATION_RESPONSE,
+        "response": data.BROWSER_AUTOMATION_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_image_generation_expected_flow_inputs = {
-        "response": data.IMAGE_GEN_RESPONSE,
+        "response": data.IMAGE_GEN_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_memory_search_expected_flow_inputs = {
-        "response": data.MEMORY_SEARCH_RESPONSE,
+        "response": data.MEMORY_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_kb_mcp_expected_flow_inputs = {
-        "response": data.KB_MCP_TCS_EXPECTED_FLOW_RESPONSE,
+        "response": data.KB_MCP_IR_EXPECTED_FLOW_RESPONSE,
     }
 
     test_mcp_expected_flow_inputs = {
-        "response": data.MCP_TCS_EXPECTED_FLOW_RESPONSE,
+        "response": data.MCP_IR_EXPECTED_FLOW_RESPONSE,
     }
     # endregion
 
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py
@@ -116,3 +116,15 @@ class TestToolSelectionEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, Base
     is_tool_definition_required = True
 
     evaluator_type = ToolSelectionEvaluator
+
+    def test_openapi(self):
+        """OpenAPI: ToolSelection flow is not called (no extractable tool calls)."""
+        results, flow_mock = self._run_evaluation_and_return_mocked_flow(
+            query=data.OPENAPI_QUERY,
+            response=data.OPENAPI_RESPONSE,
+            tool_definitions=data.OPENAPI_TOOL_DEFINITIONS,
+        )
+        result_data = self._extract_and_print_result(results, "OpenAPI")
+        self.assert_not_applicable(result_data)
+        assert flow_mock is not None, "Flow mock should be set when use_mocking=True"
+        flow_mock.assert_not_called()

Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,7 @@ class TestCoherenceEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluati`
`61`	`61`
`62`	`62`	`test_openapi_expected_flow_inputs = {`
`63`	`63`	`"query": data.OPENAPI_QUERY,`
`64`		`- "response": data.OPENAPI_RESPONSE,`
	`64`	`+ "response": data.OPENAPI_NORMALIZED_RESPONSE,`
`65`	`65`	`}`
`66`	`66`
`67`	`67`	`test_web_search_expected_flow_inputs = {`
Original file line number	Diff line number	Diff line change
`@@ -20,63 +20,63 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation`
`20`	`20`
`21`	`21`	`# region Expected flow inputs for each test`
`22`	`22`	`test_function_tool_local_calls_expected_flow_inputs = {`
`23`		`- "response": data.LOCAL_CALLS_COHERENCE_EXPECTED_FLOW_RESPONSE,`
	`23`	`+ "response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE,`
`24`	`24`	`}`
`25`	`25`
`26`	`26`	`test_code_interpreter_expected_flow_inputs = {`
`27`		`- "response": data.CODE_INTERPRETER_RESPONSE,`
	`27`	`+ "response": data.CODE_INTERPRETER_IR_EXPECTED_FLOW_RESPONSE,`
`28`	`28`	`}`
`29`	`29`
`30`	`30`	`test_bing_grounding_expected_flow_inputs = {`
`31`		`- "response": data.BING_GROUNDING_RESPONSE,`
	`31`	`+ "response": data.BING_GROUNDING_IR_EXPECTED_FLOW_RESPONSE,`
`32`	`32`	`}`
`33`	`33`
`34`	`34`	`test_bing_custom_search_expected_flow_inputs = {`
`35`		`- "response": data.BING_CUSTOM_SEARCH_RESPONSE,`
	`35`	`+ "response": data.BING_CUSTOM_SEARCH_IR_EXPECTED_FLOW_RESPONSE,`
`36`	`36`	`}`
`37`	`37`
`38`	`38`	`test_file_search_expected_flow_inputs = {`
`39`		`- "response": data.FILE_SEARCH_RESPONSE,`
	`39`	`+ "response": data.FILE_SEARCH_IR_EXPECTED_FLOW_RESPONSE,`
`40`	`40`	`}`
`41`	`41`
`42`	`42`	`test_azure_ai_search_expected_flow_inputs = {`
`43`		`- "response": data.AZURE_AI_SEARCH_RESPONSE,`
	`43`	`+ "response": data.AZURE_AI_SEARCH_IR_EXPECTED_FLOW_RESPONSE,`
`44`	`44`	`}`
`45`	`45`
`46`	`46`	`test_sharepoint_grounding_expected_flow_inputs = {`
`47`		`- "response": data.SHAREPOINT_RESPONSE,`
	`47`	`+ "response": data.SHAREPOINT_IR_EXPECTED_FLOW_RESPONSE,`
`48`	`48`	`}`
`49`	`49`
`50`	`50`	`test_fabric_data_agent_expected_flow_inputs = {`
`51`		`- "response": data.FABRIC_RESPONSE,`
	`51`	`+ "response": data.FABRIC_IR_EXPECTED_FLOW_RESPONSE,`
`52`	`52`	`}`
`53`	`53`
`54`	`54`	`test_openapi_expected_flow_inputs = {`
`55`		`- "response": data.OPENAPI_RESPONSE,`
	`55`	`+ "response": data.OPENAPI_IR_EXPECTED_FLOW_RESPONSE,`
`56`	`56`	`}`
`57`	`57`
`58`	`58`	`test_web_search_expected_flow_inputs = {`
`59`		`- "response": data.WEB_SEARCH_RESPONSE,`
	`59`	`+ "response": data.WEB_SEARCH_IR_EXPECTED_FLOW_RESPONSE,`
`60`	`60`	`}`
`61`	`61`
`62`	`62`	`test_browser_automation_expected_flow_inputs = {`
`63`		`- "response": data.BROWSER_AUTOMATION_RESPONSE,`
	`63`	`+ "response": data.BROWSER_AUTOMATION_IR_EXPECTED_FLOW_RESPONSE,`
`64`	`64`	`}`
`65`	`65`
`66`	`66`	`test_image_generation_expected_flow_inputs = {`
`67`		`- "response": data.IMAGE_GEN_RESPONSE,`
	`67`	`+ "response": data.IMAGE_GEN_IR_EXPECTED_FLOW_RESPONSE,`
`68`	`68`	`}`
`69`	`69`
`70`	`70`	`test_memory_search_expected_flow_inputs = {`
`71`		`- "response": data.MEMORY_SEARCH_RESPONSE,`
	`71`	`+ "response": data.MEMORY_SEARCH_IR_EXPECTED_FLOW_RESPONSE,`
`72`	`72`	`}`
`73`	`73`
`74`	`74`	`test_kb_mcp_expected_flow_inputs = {`
`75`		`- "response": data.KB_MCP_TCS_EXPECTED_FLOW_RESPONSE,`
	`75`	`+ "response": data.KB_MCP_IR_EXPECTED_FLOW_RESPONSE,`
`76`	`76`	`}`
`77`	`77`
`78`	`78`	`test_mcp_expected_flow_inputs = {`
`79`		`- "response": data.MCP_TCS_EXPECTED_FLOW_RESPONSE,`
	`79`	`+ "response": data.MCP_IR_EXPECTED_FLOW_RESPONSE,`
`80`	`80`	`}`
`81`	`81`	`# endregion`
`82`	`82`