Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -787,9 +787,10 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):

.. note::

To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
The output field "details" has been renamed to "tool_call_accuracy_properties" for clarity.

The `gpt_` prefix is deprecated. Use `_score` suffix instead.

"""

_PROMPTY_FILE = "tool_call_accuracy.prompty"
Expand All @@ -804,7 +805,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
_TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
_INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."

_LLM_SCORE_KEY = "tool_calls_success_level"
_LLM_SCORE_KEY = "score"

_validator: ValidatorInterface

Expand Down Expand Up @@ -973,6 +974,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

if isinstance(llm_output, dict):
# Handle skipped status from LLM
llm_status = llm_output.get("status", "completed")
if llm_status == "skipped":
reason = llm_output.get("reason", "")
return self._not_applicable_result(reason, self.threshold)

score = llm_output.get(self._LLM_SCORE_KEY, None)
if not score or not check_score_is_valid(
score,
Expand All @@ -989,22 +996,30 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
)

# Format the output
reason = llm_output.get("chain_of_thought", "")
reason = llm_output.get("reason", "")
score = float(score)
score_result = "pass" if score >= self.threshold else "fail"
llm_properties = llm_output.get("properties", {}) or {}
llm_properties.update(
{
"prompt_tokens": prompty_output_dict.get("input_token_count", 0),
"completion_tokens": prompty_output_dict.get("output_token_count", 0),
"total_tokens": prompty_output_dict.get("total_token_count", 0),
"finish_reason": prompty_output_dict.get("finish_reason", ""),
"model": prompty_output_dict.get("model_id", ""),
"sample_input": prompty_output_dict.get("sample_input", ""),
"sample_output": prompty_output_dict.get("sample_output", ""),
}
)
response_dict = {
self._result_key: score,
f"{self._result_key}_score": score,
f"{self._result_key}_result": score_result,
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_passed": score_result == "pass",
f"{self._result_key}_reason": reason,
f"{self._result_key}_details": llm_output.get("details", {}),
f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
f"{self._result_key}_status": "completed",
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": llm_properties,
}
return response_dict

Expand Down Expand Up @@ -1049,29 +1064,25 @@ async def _real_call(self, **kwargs):

def _not_applicable_result(
self, error_message: str, threshold: Union[int, float]
) -> Dict[str, Union[str, float, Dict]]:
) -> Dict[str, Union[str, float, Dict, None]]:
"""Return a result indicating that the tool call is not applicable for evaluation.

:param error_message: The error message indicating why the evaluation is not applicable.
:type error_message: str
:param threshold: The threshold value for the evaluation.
:type threshold: Union[int, float]
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float]]
:rtype: Dict[str, Union[str, float, None]]
"""
return {
self._result_key: threshold,
f"{self._result_key}": None,
f"{self._result_key}_score": None,
f"{self._result_key}_result": "pass",
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_passed": None,
f"{self._result_key}_reason": f"Not applicable: {error_message}",
f"{self._result_key}_details": {},
f"{self._result_key}_prompt_tokens": 0,
f"{self._result_key}_completion_tokens": 0,
f"{self._result_key}_total_tokens": 0,
f"{self._result_key}_finish_reason": "",
f"{self._result_key}_model": "",
f"{self._result_key}_sample_input": "",
f"{self._result_key}_sample_output": "",
f"{self._result_key}_status": "skipped",
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_properties": None,
}

def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,16 @@ Evaluate based on these factors:

**Tool Assessment**: Focus solely on appropriate use of available tools, not on capabilities beyond what tools can provide.

## Status: Skipped
Before performing any evaluation, check for the following conditions. If ANY are true, return `status: "skipped"` immediately without scoring:
1. **No tool calls to evaluate**: The TOOL CALLS TO BE EVALUATED section is empty (tool calls appearing only in the CONVERSATION section do not count).
2. **Missing tool definitions**: Any tool call in TOOL CALLS TO BE EVALUATED references a tool that is not present in the TOOL DEFINITIONS.

When skipped, return:
```json
{"reason": "<explain why evaluation was skipped>", "score": null, "status": "skipped", "properties": null}
```


# Ratings
## [Tool Call Accuracy: 1] (Irrelevant)
Expand Down Expand Up @@ -139,10 +149,13 @@ TOOL DEFINITIONS: {{tool_definitions}}

# Tasks
## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
- chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'.
- tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
- details: a dictionary that contains the following keys:
Your output should consist only of a JSON object that has the following keys:
- reason: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped.
- score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. Set to null when status is "skipped".
- status: a string indicating the evaluation status. Must be one of:
- "completed": tool calls were present, tool definitions were available, and evaluation was performed.
- "skipped": evaluation was not performed because there were no tool calls to evaluate, or tool definitions were missing for the tool calls. When skipped, set score to null and properties to null.
- properties: a dictionary that contains the following keys:
- tool_calls_made_by_agent: total number of tool calls made by the agent
- correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
- per_tool_call_details: a list of dictionaries, each containing:
Expand Down
6 changes: 3 additions & 3 deletions assets/evaluators/builtin/tool_call_accuracy/spec.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
type: "evaluator"
name: "builtin.tool_call_accuracy"
version: 8
version: 9
displayName: "Tool-Call-Accuracy-Evaluator"
description: "Measures whether the agent selects the correct tool calls, applies the correct parameters, and tracks inefficient or missing tool calls, in order to resolve a user's request. This is an umbrella evaluator that assesses overall tool call quality. Use this metric in agent-based systems and AI assistants that rely on tool integration."
evaluatorType: "builtin"
Expand Down Expand Up @@ -49,9 +49,9 @@ dataMappingSchema:
type: "object"
required: ["query", "tool_definitions"]
outputSchema:
tool_call_accuracy:
tool_call_accuracy_score:
type: "ordinal"
desirable_direction: "increase"
min_value: 1
max_value: 5
path: ./evaluator
path: ./evaluator
15 changes: 13 additions & 2 deletions assets/evaluators/tests/common/base_evaluator_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,8 @@ def _extract_and_print_result(self, results: Dict[str, Any], test_label: str) ->
Dictionary with standardized result fields.
"""
score = results.get(self.result_key)
if score is None:
score = results.get(f"{self.result_key}_score")

if f"{self.result_key}_error_message" not in results and score != "not applicable":
for field in self.expected_result_fields:
Expand All @@ -175,6 +177,7 @@ def _extract_and_print_result(self, results: Dict[str, Any], test_label: str) ->

# Optional fields
reason = results.get(f"{self.result_key}_reason")
status = results.get(f"{self.result_key}_status")
threshold = results.get(f"{self._result_prefix}_threshold")
precision = results.get(f"{self._result_prefix}_precision")
recall = results.get(f"{self._result_prefix}_recall")
Expand All @@ -195,6 +198,9 @@ def _extract_and_print_result(self, results: Dict[str, Any], test_label: str) ->
if threshold is not None:
print(f" Threshold: {threshold}")
result["threshold"] = threshold
if status is not None:
print(f" Status: {status}")
result["status"] = status
if precision is not None:
print(f" Precision: {precision}")
result["precision"] = precision
Expand Down Expand Up @@ -246,15 +252,20 @@ def assert_pass(self, result_data: Dict[str, Any]):
self._assert_pass_result(result_data)

def assert_not_applicable(self, result_data: Dict[str, Any]):
"""Assert a not-applicable result (intermediate response).
"""Assert a not-applicable result (intermediate response or skipped evaluation).

Args:
result_data: Dictionary containing evaluation result data.

Raises:
AssertionError: If the result is not a valid not-applicable result.
"""
self._assert_pass_result(result_data)
label_key = "label"
score_key = "score"
assert result_data[label_key] == "pass", \
f"Expected 'pass' but got '{result_data[label_key]}'"
assert result_data[score_key] is None, \
f"Expected score to be None for not-applicable result but got '{result_data[score_key]}'"
assert "Not applicable" in result_data.get("reason", ""), \
f"Expected reason to contain 'Not applicable' but got '{result_data.get('reason')}'"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import json
import copy
from ..common.base_prompty_evaluator_runner import BasePromptyEvaluatorRunner
from . import common_tool_test_data as _tool_data


class BaseEvaluatorBehaviorTest(BasePromptyEvaluatorRunner):
Expand Down Expand Up @@ -862,7 +863,7 @@ def test_function_call_response(self):
query=self.VALID_QUERY,
response=self.FUNCTION_CALL_ONLY_RESPONSE,
tool_calls=self.VALID_TOOL_CALLS,
tool_definitions=self.VALID_TOOL_DEFINITIONS,
tool_definitions=_tool_data.FUNCTION_CALL_RESPONSE_TOOL_DEFINITIONS,
)
result_data = self._extract_and_print_result(results, "Function Call Only - Not Applicable")
self.assert_not_applicable(result_data)
Expand All @@ -872,7 +873,7 @@ def test_function_call_response(self):
query=self.VALID_QUERY,
response=self.FUNCTION_CALL_FULL_RESPONSE,
tool_calls=self.VALID_TOOL_CALLS,
tool_definitions=self.VALID_TOOL_DEFINITIONS,
tool_definitions=_tool_data.FUNCTION_CALL_RESPONSE_TOOL_DEFINITIONS,
)
result_data = self._extract_and_print_result(results, "Function Call Full - Preprocessed")
self.assert_pass(result_data)
Expand All @@ -884,7 +885,7 @@ def test_mcp_approval_response(self):
query=self.VALID_QUERY,
response=self.MCP_APPROVAL_ONLY_RESPONSE,
tool_calls=self.VALID_TOOL_CALLS,
tool_definitions=self.VALID_TOOL_DEFINITIONS,
tool_definitions=_tool_data.MCP_APPROVAL_RESPONSE_TOOL_DEFINITIONS,
)
result_data = self._extract_and_print_result(results, "MCP Approval Only - Not Applicable")
self.assert_not_applicable(result_data)
Expand All @@ -894,7 +895,7 @@ def test_mcp_approval_response(self):
query=self.VALID_QUERY,
response=self.MCP_APPROVAL_FULL_RESPONSE,
tool_calls=self.VALID_TOOL_CALLS,
tool_definitions=self.VALID_TOOL_DEFINITIONS,
tool_definitions=_tool_data.MCP_APPROVAL_RESPONSE_TOOL_DEFINITIONS,
)
result_data = self._extract_and_print_result(results, "MCP Approval Full - Preprocessed")
self.assert_pass(result_data)
Original file line number Diff line number Diff line change
Expand Up @@ -3211,3 +3211,56 @@
],
},
]

# =============================================================================
# Tool definitions for FUNCTION_CALL_*_RESPONSE / MCP_APPROVAL_*_RESPONSE
# test data defined in base_evaluator_behavior_test.py.
# =============================================================================

# Tool definitions matching FUNCTION_CALL_ONLY_RESPONSE / FUNCTION_CALL_FULL_RESPONSE
# (uses the `get_horoscope` function tool).
# JSON-schema style description of the single `sign` argument accepted by
# the `get_horoscope` function tool.
_GET_HOROSCOPE_PARAMETERS = {
    "type": "object",
    "properties": {
        "sign": {
            "type": "string",
            "description": "An astrological sign like Taurus or Aquarius",
        }
    },
    "required": ["sign"],
    "additionalProperties": False,
}

# One-element list holding the `get_horoscope` function tool definition.
FUNCTION_CALL_RESPONSE_TOOL_DEFINITIONS = [
    {
        "name": "get_horoscope",
        "type": "function",
        "description": "Get today's horoscope for an astrological sign.",
        "parameters": _GET_HOROSCOPE_PARAMETERS,
    },
]

# Tool definitions matching MCP_APPROVAL_ONLY_RESPONSE / MCP_APPROVAL_FULL_RESPONSE
# (uses the `microsoft_docs_search` tool surfaced via MCP).
# JSON-schema style description of the single `query` argument accepted by
# the `microsoft_docs_search` tool.
_MICROSOFT_DOCS_SEARCH_PARAMETERS = {
    "type": "object",
    "properties": {
        "query": {
            "type": "string",
            "description": (
                "a query or topic about Microsoft/Azure products, "
                "services, platforms, developer tools, "
                "frameworks, or APIs"
            ),
        },
    },
    "required": ["query"],
    "additionalProperties": False,
}

# One-element list holding the `microsoft_docs_search` tool definition.
MCP_APPROVAL_RESPONSE_TOOL_DEFINITIONS = [
    {
        "name": "microsoft_docs_search",
        "type": "function",
        "description": (
            "Search official Microsoft/Azure documentation "
            "to find the most relevant and trustworthy content "
            "for a user's query."
        ),
        "parameters": _MICROSOFT_DOCS_SEARCH_PARAMETERS,
    },
]
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""

import pytest
from typing import List
from .base_tool_calls_evaluator_behavior_test import BaseToolCallEvaluatorBehaviorTest
from .base_tool_evaluation_test import BaseToolEvaluationTest
from . import common_tool_test_data as data
Expand Down Expand Up @@ -69,3 +70,17 @@ class TestToolCallAccuracyEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, B
is_tool_definition_required = True

MINIMAL_RESPONSE = BaseToolCallEvaluatorBehaviorTest.email_tool_call_and_assistant_response

@property
def expected_result_fields(self) -> List[str]:
    """Return the result keys expected in this evaluator's output.

    The list is the bare ``result_key`` followed by that key combined with
    each of the ``_score``, ``_result``, ``_passed``, ``_reason``,
    ``_status``, ``_threshold`` and ``_properties`` suffixes, in that order.
    """
    # The empty suffix yields the bare result key itself.
    suffixes = (
        "",
        "_score",
        "_result",
        "_passed",
        "_reason",
        "_status",
        "_threshold",
        "_properties",
    )
    return [f"{self.result_key}{suffix}" for suffix in suffixes]
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""Quality tests for Tool Call Accuracy Evaluator with real flow execution."""

import pytest
from typing import List
from ..common.base_quality_evaluator_runner import BaseQualityEvaluatorRunner, ExpectedResult
from ...builtin.tool_call_accuracy.evaluator._tool_call_accuracy import ToolCallAccuracyEvaluator

Expand All @@ -16,6 +17,20 @@ class TestToolCallAccuracyEvaluatorQuality(BaseQualityEvaluatorRunner):
Tests actual LLM evaluation with real flow execution (no mocking).
"""

@property
def expected_result_fields(self) -> List[str]:
    """Return the result keys expected in this evaluator's output.

    Produces the bare ``result_key`` first, then one entry per output
    suffix (score, result, passed, reason, status, threshold, properties).
    """
    fields = [f"{self.result_key}"]
    for name in ("score", "result", "passed", "reason", "status", "threshold", "properties"):
        fields.append(f"{self.result_key}_{name}")
    return fields

evaluator_type = ToolCallAccuracyEvaluator

def test_pass_single_call(self) -> None:
Expand Down
Loading