Skip to content

Commit ea9c8ea

Browse files
committed
Update Tools Evaluation Tests
1 parent a647a39 commit ea9c8ea

5 files changed

Lines changed: 115 additions & 22 deletions

File tree

assets/evaluators/tests/test_evaluators_behavior/base_tool_evaluation_test.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
and asserts correct behavior using assert_expected_behavior and assert_called_once_with.
1111
"""
1212

13+
import copy
1314
from typing import Any, Dict, Optional
1415

1516
from . import common_tool_test_data as data
@@ -92,7 +93,7 @@ def _run_tool_type_test(
9293
Dictionary containing the extracted result data.
9394
"""
9495
results, flow_mock = self._run_evaluation_and_return_mocked_flow(
95-
**evaluation_inputs,
96+
**copy.deepcopy(evaluation_inputs),
9697
)
9798
result_data = self._extract_and_print_result(results, test_label)
9899
self.assert_expected_behavior(assert_type, result_data)

assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py

Lines changed: 85 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2044,15 +2044,96 @@
20442044
)
20452045

20462046
# ----- TCS expected flow response -----
2047-
# For LOCAL_CALLS, FILE_SEARCH, IMAGE_GEN, MEMORY_SEARCH: _preprocess_messages is a no-op.
2048-
LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = LOCAL_CALLS_RESPONSE
2047+
# _preprocess_messages normalizes function_call/function_call_output types to tool_call/tool_result.
2048+
LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = [
2049+
{
2050+
"run_id": "",
2051+
"role": "assistant",
2052+
"content": [
2053+
{
2054+
"type": "tool_call",
2055+
"tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
2056+
"name": "get_horoscope",
2057+
"arguments": {"sign": "Aquarius"},
2058+
}
2059+
],
2060+
},
2061+
{
2062+
"run_id": "",
2063+
"tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
2064+
"role": "tool",
2065+
"content": [
2066+
{
2067+
"type": "tool_result",
2068+
"tool_result": {"horoscope": "Aquarius: Next Tuesday you will befriend a baby otter."},
2069+
}
2070+
],
2071+
},
2072+
{
2073+
"role": "assistant",
2074+
"content": [
2075+
{
2076+
"annotations": [],
2077+
"text": "Your horoscope for Aquarius is: Next Tuesday you will befriend a baby otter.",
2078+
"type": "output_text",
2079+
"logprobs": [],
2080+
}
2081+
],
2082+
},
2083+
]
20492084
FILE_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = FILE_SEARCH_RESPONSE
20502085
IMAGE_GEN_TCS_EXPECTED_FLOW_RESPONSE = IMAGE_GEN_RESPONSE
20512086
MEMORY_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = MEMORY_SEARCH_RESPONSE
20522087
# For KB_MCP and MCP: _preprocess_messages drops the first 2 MCP approval messages.
20532088
KB_MCP_TCS_EXPECTED_FLOW_RESPONSE = KB_MCP_RESPONSE[2:]
20542089
MCP_TCS_EXPECTED_FLOW_RESPONSE = MCP_RESPONSE[2:]
20552090

2091+
# Normalized OPENAPI_RESPONSE: openapi_call -> tool_call, openapi_call_output -> tool_result
2092+
OPENAPI_NORMALIZED_RESPONSE = [
2093+
{
2094+
"run_id": "",
2095+
"role": "assistant",
2096+
"content": [
2097+
{
2098+
"type": "tool_call",
2099+
"tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
2100+
"name": "weather_GetCurrentWeather",
2101+
"arguments": {"location": "Cairo", "format": "j1"},
2102+
}
2103+
],
2104+
},
2105+
{
2106+
"run_id": "",
2107+
"tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
2108+
"role": "tool",
2109+
"content": [{"type": "tool_result", "tool_result": ""}],
2110+
},
2111+
{
2112+
"role": "assistant",
2113+
"content": [
2114+
{
2115+
"annotations": [],
2116+
"text": (
2117+
"**Current weather in Cairo:**\n\n- **Temperature:** 26\u00b0C (feels like 25\u00b0C)\n"
2118+
"- **Condition:** Sand (likely some dusty or sandy winds)\n"
2119+
"- **Humidity:** 28%\n"
2120+
"- **Cloud Cover:** 0% (clear skies)\n"
2121+
"- **Wind:** SW at 23 km/h\n"
2122+
"- **Visibility:** Moderate (4 km)\n"
2123+
"- **No precipitation**\n"
2124+
"- **UV Index:** 2\n\n"
2125+
"**Summary:** Cairo is experiencing warm, dry, and sunny weather, but there is "
2126+
"sand or dust in the air which may reduce visibility. Skies are clear and "
2127+
"it\u2019s breezy. Make sure to protect yourself from the dust if you\u2019re "
2128+
"heading outside!"
2129+
),
2130+
"type": "output_text",
2131+
"logprobs": [],
2132+
}
2133+
],
2134+
},
2135+
]
2136+
20562137

20572138
# =============================================================================
20582139
# Expected flow inputs shared across multiple evaluators
@@ -2314,6 +2395,8 @@
23142395
)
23152396

23162397
OPENAPI_EXPECTED_FLOW_RESPONSE = (
2398+
'[TOOL_CALL] weather_GetCurrentWeather(location="Cairo", format="j1")\n'
2399+
'[TOOL_RESULT] \n'
23172400
"**Current weather in Cairo:**\n\n- **Temperature:** 26°C (feels like 25°C)\n- "
23182401
"**Condition:** Sand (likely some dusty or sandy winds)\n- **Humidity:** 28%\n- "
23192402
"**Cloud Cover:** 0% (clear skies)\n- **Wind:** SW at 23 km/h\n- **Visibility:** "
@@ -2732,9 +2815,6 @@
27322815
"arguments": {
27332816
"sign": "Aquarius",
27342817
},
2735-
"tool_result": {
2736-
"horoscope": "Aquarius: Next Tuesday you will befriend a baby otter.",
2737-
},
27382818
},
27392819
],
27402820
},

assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ class TestCoherenceEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluati
6161

6262
test_openapi_expected_flow_inputs = {
6363
"query": data.OPENAPI_QUERY,
64-
"response": data.OPENAPI_RESPONSE,
64+
"response": data.OPENAPI_NORMALIZED_RESPONSE,
6565
}
6666

6767
test_web_search_expected_flow_inputs = {

assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,63 +20,63 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation
2020

2121
# region Expected flow inputs for each test
2222
test_function_tool_local_calls_expected_flow_inputs = {
23-
"response": data.LOCAL_CALLS_COHERENCE_EXPECTED_FLOW_RESPONSE,
23+
"response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE,
2424
}
2525

2626
test_code_interpreter_expected_flow_inputs = {
27-
"response": data.CODE_INTERPRETER_RESPONSE,
27+
"response": data.CODE_INTERPRETER_IR_EXPECTED_FLOW_RESPONSE,
2828
}
2929

3030
test_bing_grounding_expected_flow_inputs = {
31-
"response": data.BING_GROUNDING_RESPONSE,
31+
"response": data.BING_GROUNDING_IR_EXPECTED_FLOW_RESPONSE,
3232
}
3333

3434
test_bing_custom_search_expected_flow_inputs = {
35-
"response": data.BING_CUSTOM_SEARCH_RESPONSE,
35+
"response": data.BING_CUSTOM_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
3636
}
3737

3838
test_file_search_expected_flow_inputs = {
39-
"response": data.FILE_SEARCH_RESPONSE,
39+
"response": data.FILE_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
4040
}
4141

4242
test_azure_ai_search_expected_flow_inputs = {
43-
"response": data.AZURE_AI_SEARCH_RESPONSE,
43+
"response": data.AZURE_AI_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
4444
}
4545

4646
test_sharepoint_grounding_expected_flow_inputs = {
47-
"response": data.SHAREPOINT_RESPONSE,
47+
"response": data.SHAREPOINT_IR_EXPECTED_FLOW_RESPONSE,
4848
}
4949

5050
test_fabric_data_agent_expected_flow_inputs = {
51-
"response": data.FABRIC_RESPONSE,
51+
"response": data.FABRIC_IR_EXPECTED_FLOW_RESPONSE,
5252
}
5353

5454
test_openapi_expected_flow_inputs = {
55-
"response": data.OPENAPI_RESPONSE,
55+
"response": data.OPENAPI_IR_EXPECTED_FLOW_RESPONSE,
5656
}
5757

5858
test_web_search_expected_flow_inputs = {
59-
"response": data.WEB_SEARCH_RESPONSE,
59+
"response": data.WEB_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
6060
}
6161

6262
test_browser_automation_expected_flow_inputs = {
63-
"response": data.BROWSER_AUTOMATION_RESPONSE,
63+
"response": data.BROWSER_AUTOMATION_IR_EXPECTED_FLOW_RESPONSE,
6464
}
6565

6666
test_image_generation_expected_flow_inputs = {
67-
"response": data.IMAGE_GEN_RESPONSE,
67+
"response": data.IMAGE_GEN_IR_EXPECTED_FLOW_RESPONSE,
6868
}
6969

7070
test_memory_search_expected_flow_inputs = {
71-
"response": data.MEMORY_SEARCH_RESPONSE,
71+
"response": data.MEMORY_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
7272
}
7373

7474
test_kb_mcp_expected_flow_inputs = {
75-
"response": data.KB_MCP_TCS_EXPECTED_FLOW_RESPONSE,
75+
"response": data.KB_MCP_IR_EXPECTED_FLOW_RESPONSE,
7676
}
7777

7878
test_mcp_expected_flow_inputs = {
79-
"response": data.MCP_TCS_EXPECTED_FLOW_RESPONSE,
79+
"response": data.MCP_IR_EXPECTED_FLOW_RESPONSE,
8080
}
8181
# endregion
8282

assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,3 +116,15 @@ class TestToolSelectionEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, Base
116116
is_tool_definition_required = True
117117

118118
evaluator_type = ToolSelectionEvaluator
119+
120+
def test_openapi(self):
121+
"""OpenAPI: ToolSelection flow is not called (no extractable tool calls)."""
122+
results, flow_mock = self._run_evaluation_and_return_mocked_flow(
123+
query=data.OPENAPI_QUERY,
124+
response=data.OPENAPI_RESPONSE,
125+
tool_definitions=data.OPENAPI_TOOL_DEFINITIONS,
126+
)
127+
result_data = self._extract_and_print_result(results, "OpenAPI")
128+
self.assert_not_applicable(result_data)
129+
assert flow_mock is not None, "Flow mock should be set when use_mocking=True"
130+
flow_mock.assert_not_called()

0 commit comments

Comments
 (0)