Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from itertools import chain
import os
import logging
from typing import Dict, List, Union, TypeVar
Expand Down Expand Up @@ -1086,14 +1085,6 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
built_in_definitions = _get_needed_built_in_definitions(tool_calls)
needed_tool_definitions.extend(built_in_definitions)

# OpenAPI tool is a collection of functions, so we need to expand it
tool_definitions_expanded = list(
chain.from_iterable(
tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
for tool in needed_tool_definitions
)
)

# Validate that all tool calls have corresponding definitions
for tool_call in tool_calls:
if isinstance(tool_call, dict):
Expand All @@ -1107,7 +1098,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
elif tool_name:
# This is a regular function tool from converter or built-in tool from agent v2
tool_definition_exists = any(
tool.get("name") == tool_name for tool in tool_definitions_expanded
tool.get("name") == tool_name for tool in needed_tool_definitions
)
if not tool_definition_exists:
raise EvaluationException(
Expand Down
2 changes: 1 addition & 1 deletion assets/evaluators/builtin/tool_call_accuracy/spec.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
type: "evaluator"
name: "builtin.tool_call_accuracy"
version: 7
version: 8
displayName: "Tool-Call-Accuracy-Evaluator"
description: "Measures whether the agent selects the correct tool calls, applies the correct parameters, and tracks inefficient or missing tool calls, in order to resolve a user's request. This is an umbrella evaluator that assesses overall tool call quality. Use this metric in agent-based systems and AI assistants that rely on tool integration."
evaluatorType: "builtin"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# Licensed under the MIT License.
import os
import logging
from itertools import chain
from typing import Dict, List, Union, TypeVar, cast
from typing_extensions import override
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
Expand Down Expand Up @@ -633,14 +632,6 @@ def _extract_needed_tool_definitions(
built_in_definitions = _get_needed_built_in_tool_definitions(tool_calls)
needed_tool_definitions.extend(built_in_definitions)

# OpenAPI tool is a collection of functions, so we need to expand it
tool_definitions_expanded = list(
chain.from_iterable(
tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
for tool in needed_tool_definitions
)
)

# Validate that all tool calls have corresponding definitions
for tool_call in tool_calls:
if isinstance(tool_call, dict):
Expand All @@ -653,7 +644,7 @@ def _extract_needed_tool_definitions(
continue
elif tool_name:
# This is a regular function tool from converter or built-in tool from agent v2
tool_definition_exists = any(tool.get("name") == tool_name for tool in tool_definitions_expanded)
tool_definition_exists = any(tool.get("name") == tool_name for tool in needed_tool_definitions)
if not tool_definition_exists:
raise EvaluationException(
message=f"Tool definition for {tool_name} not found",
Expand Down
2 changes: 1 addition & 1 deletion assets/evaluators/builtin/tool_input_accuracy/spec.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
type: "evaluator"
name: "builtin.tool_input_accuracy"
version: 9
version: 10
displayName: "Tool-Input-Accuracy-Evaluator"
description: "A binary evaluator (0 or 1) that checks whether all parameters in an agent’s tool call are correct, validating grounding, type, format, completeness, and contextual appropriateness using LLM-based analysis. Use it to verify agent tool usage, API integration tests, or to ensure tool call parameters are fully correct in AI workflows."
evaluatorType: "builtin"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import logging
from typing import Dict, List, Union, TypeVar
from typing_extensions import override
from itertools import chain
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
from azure.ai.evaluation._exceptions import (
ErrorBlame,
Expand Down Expand Up @@ -731,14 +730,6 @@ def _extract_needed_tool_definitions(
built_in_definitions = _get_needed_built_in_tool_definitions(tool_calls)
needed_tool_definitions.extend(built_in_definitions)

# OpenAPI tool is a collection of functions, so we need to expand it
tool_definitions_expanded = list(
chain.from_iterable(
tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
for tool in needed_tool_definitions
)
)

# Validate that all tool calls have corresponding definitions
for tool_call in tool_calls:
if isinstance(tool_call, dict):
Expand All @@ -751,7 +742,7 @@ def _extract_needed_tool_definitions(
continue
elif tool_name:
# This is a regular function tool from converter or built-in tool from agent v2
tool_definition_exists = any(tool.get("name") == tool_name for tool in tool_definitions_expanded)
tool_definition_exists = any(tool.get("name") == tool_name for tool in needed_tool_definitions)
if not tool_definition_exists:
raise EvaluationException(
message=f"Tool definition for {tool_name} not found",
Expand Down
2 changes: 1 addition & 1 deletion assets/evaluators/builtin/tool_selection/spec.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
type: "evaluator"
name: "builtin.tool_selection"
version: 7
version: 8
displayName: "Tool-Selection-Evaluator"
description: "Evaluates whether an AI agent selected the most appropriate and efficient tools for a given task, avoiding redundancy or missing essentials. Use it to assess tool choice quality in agent-based systems, orchestration platforms, and AI assistants that must pick the right tools from available options."
evaluatorType: "builtin"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
and asserts correct behavior using assert_expected_behavior and assert_called_once_with.
"""

import copy
from typing import Any, Dict, Optional

from . import common_tool_test_data as data
Expand Down Expand Up @@ -92,7 +93,7 @@ def _run_tool_type_test(
Dictionary containing the extracted result data.
"""
results, flow_mock = self._run_evaluation_and_return_mocked_flow(
**evaluation_inputs,
**copy.deepcopy(evaluation_inputs),
)
result_data = self._extract_and_print_result(results, test_label)
self.assert_expected_behavior(assert_type, result_data)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2044,15 +2044,96 @@
)

# ----- TCS expected flow response -----
# For LOCAL_CALLS, FILE_SEARCH, IMAGE_GEN, MEMORY_SEARCH: _preprocess_messages is a no-op.
LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = LOCAL_CALLS_RESPONSE
# _preprocess_messages normalizes function_call/function_call_output types to tool_call/tool_result.
LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = [
{
"run_id": "",
"role": "assistant",
"content": [
{
"type": "tool_call",
"tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
"name": "get_horoscope",
"arguments": {"sign": "Aquarius"},
}
],
},
{
"run_id": "",
"tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX",
"role": "tool",
"content": [
{
"type": "tool_result",
"tool_result": {"horoscope": "Aquarius: Next Tuesday you will befriend a baby otter."},
}
],
},
{
"role": "assistant",
"content": [
{
"annotations": [],
"text": "Your horoscope for Aquarius is: Next Tuesday you will befriend a baby otter.",
"type": "output_text",
"logprobs": [],
}
],
},
]
FILE_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = FILE_SEARCH_RESPONSE
IMAGE_GEN_TCS_EXPECTED_FLOW_RESPONSE = IMAGE_GEN_RESPONSE
MEMORY_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = MEMORY_SEARCH_RESPONSE
# For KB_MCP and MCP: _preprocess_messages drops the first 2 MCP approval messages.
KB_MCP_TCS_EXPECTED_FLOW_RESPONSE = KB_MCP_RESPONSE[2:]
MCP_TCS_EXPECTED_FLOW_RESPONSE = MCP_RESPONSE[2:]

# Normalized OPENAPI_RESPONSE: openapi_call -> tool_call, openapi_call_output -> tool_result
OPENAPI_NORMALIZED_RESPONSE = [
{
"run_id": "",
"role": "assistant",
"content": [
{
"type": "tool_call",
"tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
"name": "weather_GetCurrentWeather",
"arguments": {"location": "Cairo", "format": "j1"},
}
],
},
{
"run_id": "",
"tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab",
"role": "tool",
"content": [{"type": "tool_result", "tool_result": ""}],
},
{
"role": "assistant",
"content": [
{
"annotations": [],
"text": (
"**Current weather in Cairo:**\n\n- **Temperature:** 26\u00b0C (feels like 25\u00b0C)\n"
"- **Condition:** Sand (likely some dusty or sandy winds)\n"
"- **Humidity:** 28%\n"
"- **Cloud Cover:** 0% (clear skies)\n"
"- **Wind:** SW at 23 km/h\n"
"- **Visibility:** Moderate (4 km)\n"
"- **No precipitation**\n"
"- **UV Index:** 2\n\n"
"**Summary:** Cairo is experiencing warm, dry, and sunny weather, but there is "
"sand or dust in the air which may reduce visibility. Skies are clear and "
"it\u2019s breezy. Make sure to protect yourself from the dust if you\u2019re "
"heading outside!"
),
"type": "output_text",
"logprobs": [],
}
],
},
]


# =============================================================================
# Expected flow inputs shared across multiple evaluators
Expand Down Expand Up @@ -2734,9 +2815,6 @@
"arguments": {
"sign": "Aquarius",
},
"tool_result": {
"horoscope": "Aquarius: Next Tuesday you will befriend a baby otter.",
},
},
],
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ class TestCoherenceEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluati

test_openapi_expected_flow_inputs = {
"query": data.OPENAPI_QUERY,
"response": data.OPENAPI_RESPONSE,
"response": data.OPENAPI_NORMALIZED_RESPONSE,
}

test_web_search_expected_flow_inputs = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,63 +20,63 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation

# region Expected flow inputs for each test
test_function_tool_local_calls_expected_flow_inputs = {
"response": data.LOCAL_CALLS_COHERENCE_EXPECTED_FLOW_RESPONSE,
"response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE,
}

test_code_interpreter_expected_flow_inputs = {
"response": data.CODE_INTERPRETER_RESPONSE,
"response": data.CODE_INTERPRETER_IR_EXPECTED_FLOW_RESPONSE,
}

test_bing_grounding_expected_flow_inputs = {
"response": data.BING_GROUNDING_RESPONSE,
"response": data.BING_GROUNDING_IR_EXPECTED_FLOW_RESPONSE,
}

test_bing_custom_search_expected_flow_inputs = {
"response": data.BING_CUSTOM_SEARCH_RESPONSE,
"response": data.BING_CUSTOM_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
}

test_file_search_expected_flow_inputs = {
"response": data.FILE_SEARCH_RESPONSE,
"response": data.FILE_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
}

test_azure_ai_search_expected_flow_inputs = {
"response": data.AZURE_AI_SEARCH_RESPONSE,
"response": data.AZURE_AI_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
}

test_sharepoint_grounding_expected_flow_inputs = {
"response": data.SHAREPOINT_RESPONSE,
"response": data.SHAREPOINT_IR_EXPECTED_FLOW_RESPONSE,
}

test_fabric_data_agent_expected_flow_inputs = {
"response": data.FABRIC_RESPONSE,
"response": data.FABRIC_IR_EXPECTED_FLOW_RESPONSE,
}

test_openapi_expected_flow_inputs = {
"response": data.OPENAPI_RESPONSE,
"response": data.OPENAPI_IR_EXPECTED_FLOW_RESPONSE,
}

test_web_search_expected_flow_inputs = {
"response": data.WEB_SEARCH_RESPONSE,
"response": data.WEB_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
}

test_browser_automation_expected_flow_inputs = {
"response": data.BROWSER_AUTOMATION_RESPONSE,
"response": data.BROWSER_AUTOMATION_IR_EXPECTED_FLOW_RESPONSE,
}

test_image_generation_expected_flow_inputs = {
"response": data.IMAGE_GEN_RESPONSE,
"response": data.IMAGE_GEN_IR_EXPECTED_FLOW_RESPONSE,
}

test_memory_search_expected_flow_inputs = {
"response": data.MEMORY_SEARCH_RESPONSE,
"response": data.MEMORY_SEARCH_IR_EXPECTED_FLOW_RESPONSE,
}

test_kb_mcp_expected_flow_inputs = {
"response": data.KB_MCP_TCS_EXPECTED_FLOW_RESPONSE,
"response": data.KB_MCP_IR_EXPECTED_FLOW_RESPONSE,
}

test_mcp_expected_flow_inputs = {
"response": data.MCP_TCS_EXPECTED_FLOW_RESPONSE,
"response": data.MCP_IR_EXPECTED_FLOW_RESPONSE,
}
# endregion

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,11 @@ class TestToolSelectionEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, Base
"tool_definitions": data.FABRIC_TOOL_DEFINITIONS,
}

# OpenAPI: ToolSelection flow is not called (no extractable tool calls)
# Expected flow inputs not used since the test will not reach flow assertion
test_openapi_expected_flow_inputs = {
"query": data.OPENAPI_EXPECTED_FLOW_QUERY,
"tool_calls": ["weather_GetCurrentWeather"],
"tool_definitions": data.OPENAPI_TOOL_DEFINITIONS,
}

test_web_search_expected_flow_inputs = {
"query": data.WEB_SEARCH_EXPECTED_FLOW_QUERY,
Expand Down
Loading