Skip to content

Commit 5ad0a79

Browse files
authored
Update Tools Evaluation Tests (#4926)
* Update Tools Evaluation Tests
* Remove Unneeded Special Handling of OpenAPI Tool Calls
* Fix Flake8 issue
* Bump version
1 parent 0acbc91 commit 5ad0a79

11 files changed

Lines changed: 112 additions & 57 deletions

File tree

assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# Copyright (c) Microsoft Corporation.
22
# Licensed under the MIT License.
33

4-
from itertools import chain
54
import os
65
import logging
76
from typing import Dict, List, Union, TypeVar
@@ -1086,14 +1085,6 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
10861085
built_in_definitions = _get_needed_built_in_definitions(tool_calls)
10871086
needed_tool_definitions.extend(built_in_definitions)
10881087

1089-
# OpenAPI tool is a collection of functions, so we need to expand it
1090-
tool_definitions_expanded = list(
1091-
chain.from_iterable(
1092-
tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
1093-
for tool in needed_tool_definitions
1094-
)
1095-
)
1096-
10971088
# Validate that all tool calls have corresponding definitions
10981089
for tool_call in tool_calls:
10991090
if isinstance(tool_call, dict):
@@ -1107,7 +1098,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
11071098
elif tool_name:
11081099
# This is a regular function tool from converter or built-in tool from agent v2
11091100
tool_definition_exists = any(
1110-
tool.get("name") == tool_name for tool in tool_definitions_expanded
1101+
tool.get("name") == tool_name for tool in needed_tool_definitions
11111102
)
11121103
if not tool_definition_exists:
11131104
raise EvaluationException(

assets/evaluators/builtin/tool_call_accuracy/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.tool_call_accuracy"
3-
version: 7
3+
version: 8
44
displayName: "Tool-Call-Accuracy-Evaluator"
55
description: "Measures whether the agent selects the correct tool calls and applies the correct parameters in order to resolve a user's request, and tracks inefficient or missing tool calls. This is an umbrella evaluator that assesses overall tool call quality. Use this metric in agent-based systems and AI assistants that rely on tool integration."
66
evaluatorType: "builtin"

assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
# Licensed under the MIT License.
33
import os
44
import logging
5-
from itertools import chain
65
from typing import Dict, List, Union, TypeVar, cast
76
from typing_extensions import override
87
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
@@ -633,14 +632,6 @@ def _extract_needed_tool_definitions(
633632
built_in_definitions = _get_needed_built_in_tool_definitions(tool_calls)
634633
needed_tool_definitions.extend(built_in_definitions)
635634

636-
# OpenAPI tool is a collection of functions, so we need to expand it
637-
tool_definitions_expanded = list(
638-
chain.from_iterable(
639-
tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
640-
for tool in needed_tool_definitions
641-
)
642-
)
643-
644635
# Validate that all tool calls have corresponding definitions
645636
for tool_call in tool_calls:
646637
if isinstance(tool_call, dict):
@@ -653,7 +644,7 @@ def _extract_needed_tool_definitions(
653644
continue
654645
elif tool_name:
655646
# This is a regular function tool from converter or built-in tool from agent v2
656-
tool_definition_exists = any(tool.get("name") == tool_name for tool in tool_definitions_expanded)
647+
tool_definition_exists = any(tool.get("name") == tool_name for tool in needed_tool_definitions)
657648
if not tool_definition_exists:
658649
raise EvaluationException(
659650
message=f"Tool definition for {tool_name} not found",

assets/evaluators/builtin/tool_input_accuracy/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.tool_input_accuracy"
3-
version: 9
3+
version: 10
44
displayName: "Tool-Input-Accuracy-Evaluator"
55
description: "A binary evaluator (0 or 1) that checks whether all parameters in an agent’s tool call are correct, validating grounding, type, format, completeness, and contextual appropriateness using LLM-based analysis. Use it to verify agent tool usage, API integration tests, or to ensure tool call parameters are fully correct in AI workflows."
66
evaluatorType: "builtin"

assets/evaluators/builtin/tool_selection/evaluator/_tool_selection.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import logging
55
from typing import Dict, List, Union, TypeVar
66
from typing_extensions import override
7-
from itertools import chain
87
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
98
from azure.ai.evaluation._exceptions import (
109
ErrorBlame,
@@ -731,14 +730,6 @@ def _extract_needed_tool_definitions(
731730
built_in_definitions = _get_needed_built_in_tool_definitions(tool_calls)
732731
needed_tool_definitions.extend(built_in_definitions)
733732

734-
# OpenAPI tool is a collection of functions, so we need to expand it
735-
tool_definitions_expanded = list(
736-
chain.from_iterable(
737-
tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
738-
for tool in needed_tool_definitions
739-
)
740-
)
741-
742733
# Validate that all tool calls have corresponding definitions
743734
for tool_call in tool_calls:
744735
if isinstance(tool_call, dict):
@@ -751,7 +742,7 @@ def _extract_needed_tool_definitions(
751742
continue
752743
elif tool_name:
753744
# This is a regular function tool from converter or built-in tool from agent v2
754-
tool_definition_exists = any(tool.get("name") == tool_name for tool in tool_definitions_expanded)
745+
tool_definition_exists = any(tool.get("name") == tool_name for tool in needed_tool_definitions)
755746
if not tool_definition_exists:
756747
raise EvaluationException(
757748
message=f"Tool definition for {tool_name} not found",

assets/evaluators/builtin/tool_selection/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.tool_selection"
3-
version: 7
3+
version: 8
44
displayName: "Tool-Selection-Evaluator"
55
description: "Evaluates whether an AI agent selected the most appropriate and efficient tools for a given task, avoiding redundancy or missing essentials. Use it to assess tool choice quality in agent-based systems, orchestration platforms, and AI assistants that must pick the right tools from available options."
66
evaluatorType: "builtin"

assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
and asserts correct behavior using assert_expected_behavior and assert_called_once_with.
1111
"""
1212

13+
import copy
1314
from typing import Any, Dict, Optional
1415

1516
from . import common_tool_test_data as data
@@ -92,7 +93,7 @@ def _run_tool_type_test(
9293
Dictionary containing the extracted result data.
9394
"""
9495
results, flow_mock = self._run_evaluation_and_return_mocked_flow(
95-
**evaluation_inputs,
96+
**copy.deepcopy(evaluation_inputs),
9697
)
9798
result_data = self._extract_and_print_result(results, test_label)
9899
self.assert_expected_behavior(assert_type, result_data)

assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py

Lines changed: 83 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2044,15 +2044,96 @@
20442044
)
20452045

20462046
# ----- TCS expected flow response -----
2047-
# For LOCAL_CALLS, FILE_SEARCH, IMAGE_GEN, MEMORY_SEARCH: _preprocess_messages is a no-op.
2048-
LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = LOCAL_CALLS_RESPONSE
2047+
# _preprocess_messages normalizes function_call/function_call_output types to tool_call/tool_result.
2048+
LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = [
2049+
{
2050+
"run_id": "",
2051+
"role": "assistant",
2052+
"content": [
2053+
{
2054+
"type": "tool_call",
2055+
"tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
2056+
"name": "get_horoscope",
2057+
"arguments": {"sign": "Aquarius"},
2058+
}
2059+
],
2060+
},
2061+
{
2062+
"run_id": "",
2063+
"tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
2064+
"role": "tool",
2065+
"content": [
2066+
{
2067+
"type": "tool_result",
2068+
"tool_result": {"horoscope": "Aquarius: Next Tuesday you will befriend a baby otter."},
2069+
}
2070+
],
2071+
},
2072+
{
2073+
"role": "assistant",
2074+
"content": [
2075+
{
2076+
"annotations": [],
2077+
"text": "Your horoscope for Aquarius is: Next Tuesday you will befriend a baby otter.",
2078+
"type": "output_text",
2079+
"logprobs": [],
2080+
}
2081+
],
2082+
},
2083+
]
20492084
FILE_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = FILE_SEARCH_RESPONSE
20502085
IMAGE_GEN_TCS_EXPECTED_FLOW_RESPONSE = IMAGE_GEN_RESPONSE
20512086
MEMORY_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = MEMORY_SEARCH_RESPONSE
20522087
# For KB_MCP and MCP: _preprocess_messages drops the first 2 MCP approval messages.
20532088
KB_MCP_TCS_EXPECTED_FLOW_RESPONSE = KB_MCP_RESPONSE[2:]
20542089
MCP_TCS_EXPECTED_FLOW_RESPONSE = MCP_RESPONSE[2:]
20552090

2091+
# Normalized OPENAPI_RESPONSE: openapi_call -> tool_call, openapi_call_output -> tool_result
2092+
OPENAPI_NORMALIZED_RESPONSE = [
2093+
{
2094+
"run_id": "",
2095+
"role": "assistant",
2096+
"content": [
2097+
{
2098+
"type": "tool_call",
2099+
"tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
2100+
"name": "weather_GetCurrentWeather",
2101+
"arguments": {"location": "Cairo", "format": "j1"},
2102+
}
2103+
],
2104+
},
2105+
{
2106+
"run_id": "",
2107+
"tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
2108+
"role": "tool",
2109+
"content": [{"type": "tool_result", "tool_result": ""}],
2110+
},
2111+
{
2112+
"role": "assistant",
2113+
"content": [
2114+
{
2115+
"annotations": [],
2116+
"text": (
2117+
"**Current weather in Cairo:**\n\n- **Temperature:** 26\u00b0C (feels like 25\u00b0C)\n"
2118+
"- **Condition:** Sand (likely some dusty or sandy winds)\n"
2119+
"- **Humidity:** 28%\n"
2120+
"- **Cloud Cover:** 0% (clear skies)\n"
2121+
"- **Wind:** SW at 23 km/h\n"
2122+
"- **Visibility:** Moderate (4 km)\n"
2123+
"- **No precipitation**\n"
2124+
"- **UV Index:** 2\n\n"
2125+
"**Summary:** Cairo is experiencing warm, dry, and sunny weather, but there is "
2126+
"sand or dust in the air which may reduce visibility. Skies are clear and "
2127+
"it\u2019s breezy. Make sure to protect yourself from the dust if you\u2019re "
2128+
"heading outside!"
2129+
),
2130+
"type": "output_text",
2131+
"logprobs": [],
2132+
}
2133+
],
2134+
},
2135+
]
2136+
20562137

20572138
# =============================================================================
20582139
# Expected flow inputs shared across multiple evaluators
@@ -2734,9 +2815,6 @@
27342815
"arguments": {
27352816
"sign": "Aquarius",
27362817
},
2737-
"tool_result": {
2738-
"horoscope": "Aquarius: Next Tuesday you will befriend a baby otter.",
2739-
},
27402818
},
27412819
],
27422820
},

assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ class TestCoherenceEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluati
6161

6262
test_openapi_expected_flow_inputs = {
6363
"query": data.OPENAPI_QUERY,
64-
"response": data.OPENAPI_RESPONSE,
64+
"response": data.OPENAPI_NORMALIZED_RESPONSE,
6565
}
6666

6767
test_web_search_expected_flow_inputs = {

assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,63 +20,63 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation
2020

2121
# region Expected flow inputs for each test
2222
test_function_tool_local_calls_expected_flow_inputs = {
23-
"response": data.LOCAL_CALLS_COHERENCE_EXPECTED_FLOW_RESPONSE,
23+
"response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE,
2424
}
2525

2626
test_code_interpreter_expected_flow_inputs = {
27-
"response": data.CODE_INTERPRETER_RESPONSE,
27+
"response": data.CODE_INTERPRETER_IR_EXPECTED_FLOW_RESPONSE,
2828
}
2929

3030
test_bing_grounding_expected_flow_inputs = {
31-
"response": data.BING_GROUNDING_RESPONSE,
31+
"response": data.BING_GROUNDING_IR_EXPECTED_FLOW_RESPONSE,
3232
}
3333

3434
test_bing_custom_search_expected_flow_inputs = {
35-
"response": data.BING_CUSTOM_SEARCH_RESPONSE,
35+
"response": data.BING_CUSTOM_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
3636
}
3737

3838
test_file_search_expected_flow_inputs = {
39-
"response": data.FILE_SEARCH_RESPONSE,
39+
"response": data.FILE_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
4040
}
4141

4242
test_azure_ai_search_expected_flow_inputs = {
43-
"response": data.AZURE_AI_SEARCH_RESPONSE,
43+
"response": data.AZURE_AI_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
4444
}
4545

4646
test_sharepoint_grounding_expected_flow_inputs = {
47-
"response": data.SHAREPOINT_RESPONSE,
47+
"response": data.SHAREPOINT_IR_EXPECTED_FLOW_RESPONSE,
4848
}
4949

5050
test_fabric_data_agent_expected_flow_inputs = {
51-
"response": data.FABRIC_RESPONSE,
51+
"response": data.FABRIC_IR_EXPECTED_FLOW_RESPONSE,
5252
}
5353

5454
test_openapi_expected_flow_inputs = {
55-
"response": data.OPENAPI_RESPONSE,
55+
"response": data.OPENAPI_IR_EXPECTED_FLOW_RESPONSE,
5656
}
5757

5858
test_web_search_expected_flow_inputs = {
59-
"response": data.WEB_SEARCH_RESPONSE,
59+
"response": data.WEB_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
6060
}
6161

6262
test_browser_automation_expected_flow_inputs = {
63-
"response": data.BROWSER_AUTOMATION_RESPONSE,
63+
"response": data.BROWSER_AUTOMATION_IR_EXPECTED_FLOW_RESPONSE,
6464
}
6565

6666
test_image_generation_expected_flow_inputs = {
67-
"response": data.IMAGE_GEN_RESPONSE,
67+
"response": data.IMAGE_GEN_IR_EXPECTED_FLOW_RESPONSE,
6868
}
6969

7070
test_memory_search_expected_flow_inputs = {
71-
"response": data.MEMORY_SEARCH_RESPONSE,
71+
"response": data.MEMORY_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
7272
}
7373

7474
test_kb_mcp_expected_flow_inputs = {
75-
"response": data.KB_MCP_TCS_EXPECTED_FLOW_RESPONSE,
75+
"response": data.KB_MCP_IR_EXPECTED_FLOW_RESPONSE,
7676
}
7777

7878
test_mcp_expected_flow_inputs = {
79-
"response": data.MCP_TCS_EXPECTED_FLOW_RESPONSE,
79+
"response": data.MCP_IR_EXPECTED_FLOW_RESPONSE,
8080
}
8181
# endregion
8282

0 commit comments

Comments
 (0)