Skip to content

Commit a31a7f3

Browse files
m7md7sienCopilotCopilot
authored
Update Tool Call Accuracy to output unified format (#4930)
* Unify the output of Tool Call Accuracy * Add status to prompty * Update Tool Call Accuracy Output Format * Update documentation to state deprecate 'gpt_' prefix Co-authored-by: Copilot <copilot@github.com> * Rename not_applicable to pass in tool_call_accuracy result key and update tests (#4964) Agent-Logs-Url: https://github.com/Azure/azureml-assets/sessions/ba5b2838-661b-419e-9645-b960cc227d25 Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> * Use response-specific tool definitions in function_call/mcp_approval tests (#4971) Agent-Logs-Url: https://github.com/Azure/azureml-assets/sessions/0d2db933-6e9b-4b8d-b1a9-789026ec14c8 Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> * Bump tool_call_accuracy evaluator version to 9 --------- Co-authored-by: Copilot <copilot@github.com> Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>
1 parent 5020434 commit a31a7f3

8 files changed

Lines changed: 158 additions & 39 deletions

File tree

assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py

Lines changed: 37 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -787,9 +787,10 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
787787
788788
.. note::
789789
790-
To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
791-
To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
792-
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
790+
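The output field "tool_call_accuracy_details" has been renamed to "tool_call_accuracy_properties" for clarity; token usage, finish reason, model, and sample input/output are now reported inside it.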
791+
792+
The `gpt_` prefix is deprecated. Use the `_score` suffix instead.
793+
793794
"""
794795

795796
_PROMPTY_FILE = "tool_call_accuracy.prompty"
@@ -804,7 +805,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
804805
_TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
805806
_INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."
806807

807-
_LLM_SCORE_KEY = "tool_calls_success_level"
808+
_LLM_SCORE_KEY = "score"
808809

809810
_validator: ValidatorInterface
810811

@@ -973,6 +974,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
973974
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
974975

975976
if isinstance(llm_output, dict):
977+
# Handle skipped status from LLM
978+
llm_status = llm_output.get("status", "completed")
979+
if llm_status == "skipped":
980+
reason = llm_output.get("reason", "")
981+
return self._not_applicable_result(reason, self.threshold)
982+
976983
score = llm_output.get(self._LLM_SCORE_KEY, None)
977984
if not score or not check_score_is_valid(
978985
score,
@@ -989,22 +996,30 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
989996
)
990997

991998
# Format the output
992-
reason = llm_output.get("chain_of_thought", "")
999+
reason = llm_output.get("reason", "")
9931000
score = float(score)
9941001
score_result = "pass" if score >= self.threshold else "fail"
1002+
llm_properties = llm_output.get("properties", {}) or {}
1003+
llm_properties.update(
1004+
{
1005+
"prompt_tokens": prompty_output_dict.get("input_token_count", 0),
1006+
"completion_tokens": prompty_output_dict.get("output_token_count", 0),
1007+
"total_tokens": prompty_output_dict.get("total_token_count", 0),
1008+
"finish_reason": prompty_output_dict.get("finish_reason", ""),
1009+
"model": prompty_output_dict.get("model_id", ""),
1010+
"sample_input": prompty_output_dict.get("sample_input", ""),
1011+
"sample_output": prompty_output_dict.get("sample_output", ""),
1012+
}
1013+
)
9951014
response_dict = {
9961015
self._result_key: score,
1016+
f"{self._result_key}_score": score,
9971017
f"{self._result_key}_result": score_result,
998-
f"{self._result_key}_threshold": self._threshold,
1018+
f"{self._result_key}_passed": score_result == "pass",
9991019
f"{self._result_key}_reason": reason,
1000-
f"{self._result_key}_details": llm_output.get("details", {}),
1001-
f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
1002-
f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
1003-
f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
1004-
f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
1005-
f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
1006-
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
1007-
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
1020+
f"{self._result_key}_status": "completed",
1021+
f"{self._result_key}_threshold": self._threshold,
1022+
f"{self._result_key}_properties": llm_properties,
10081023
}
10091024
return response_dict
10101025

@@ -1049,29 +1064,25 @@ async def _real_call(self, **kwargs):
10491064

10501065
def _not_applicable_result(
10511066
self, error_message: str, threshold: Union[int, float]
1052-
) -> Dict[str, Union[str, float, Dict]]:
1067+
) -> Dict[str, Union[str, float, Dict, None]]:
10531068
"""Return a result indicating that the tool call is not applicable for evaluation.
10541069
10551070
:param error_message: The error message indicating why the evaluation is not applicable.
10561071
:type error_message: str
10571072
:param threshold: The threshold value for the evaluation.
10581073
:type threshold: Union[int, float]
10591074
:return: A dictionary containing the result of the evaluation.
1060-
:rtype: Dict[str, Union[str, float]]
1075+
:rtype: Dict[str, Union[str, float, None]]
10611076
"""
10621077
return {
1063-
self._result_key: threshold,
1078+
f"{self._result_key}": None,
1079+
f"{self._result_key}_score": None,
10641080
f"{self._result_key}_result": "pass",
1065-
f"{self._result_key}_threshold": threshold,
1081+
f"{self._result_key}_passed": None,
10661082
f"{self._result_key}_reason": f"Not applicable: {error_message}",
1067-
f"{self._result_key}_details": {},
1068-
f"{self._result_key}_prompt_tokens": 0,
1069-
f"{self._result_key}_completion_tokens": 0,
1070-
f"{self._result_key}_total_tokens": 0,
1071-
f"{self._result_key}_finish_reason": "",
1072-
f"{self._result_key}_model": "",
1073-
f"{self._result_key}_sample_input": "",
1074-
f"{self._result_key}_sample_output": "",
1083+
f"{self._result_key}_status": "skipped",
1084+
f"{self._result_key}_threshold": threshold,
1085+
f"{self._result_key}_properties": None,
10751086
}
10761087

10771088
def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):

assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,16 @@ Evaluate based on these factors:
5454

5555
**Tool Assessment**: Focus solely on appropriate use of available tools, not on capabilities beyond what tools can provide.
5656

57+
## Status: Skipped
58+
Before performing any evaluation, check for the following conditions. If ANY are true, return `status: "skipped"` immediately without scoring:
59+
1. **No tool calls to evaluate**: The TOOL CALLS TO BE EVALUATED section is empty (tool calls appearing only in the CONVERSATION section do not count).
60+
2. **Missing tool definitions**: Any tool call in TOOL CALLS TO BE EVALUATED references a tool that is not present in the TOOL DEFINITIONS.
61+
62+
When skipped, return:
63+
```json
64+
{"reason": "<explain why evaluation was skipped>", "score": null, "status": "skipped", "properties": null}
65+
```
66+
5767

5868
# Ratings
5969
## [Tool Call Accuracy: 1] (Irrelevant)
@@ -139,10 +149,13 @@ TOOL DEFINITIONS: {{tool_definitions}}
139149

140150
# Tasks
141151
## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
142-
Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
143-
- chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'.
144-
- tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
145-
- details: a dictionary that contains the following keys:
152+
Your output should consist only of a JSON object that has the following keys:
153+
- reason: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped.
154+
- score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. Set to null when status is "skipped".
155+
- status: a string indicating the evaluation status. Must be one of:
156+
- "completed": tool calls were present, tool definitions were available, and evaluation was performed.
157+
- "skipped": evaluation was not performed because there were no tool calls to evaluate, or tool definitions were missing for the tool calls. When skipped, set score to null and properties to null.
158+
- properties: a dictionary that contains the following keys:
146159
- tool_calls_made_by_agent: total number of tool calls made by the agent
147160
- correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
148161
- per_tool_call_details: a list of dictionaries, each containing:

assets/evaluators/builtin/tool_call_accuracy/spec.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.tool_call_accuracy"
3-
version: 8
3+
version: 9
44
displayName: "Tool-Call-Accuracy-Evaluator"
55
description: "Measures whether the agent selects the correct tool calls, applies the correct parameters, and tracks inefficient or missing tool calls, in order to resolve a user's request. This is an umbrella evaluator that assesses overall tool call quality. Use this metric in agent-based systems and AI assistants that rely on tool integration."
66
evaluatorType: "builtin"
@@ -49,9 +49,9 @@ dataMappingSchema:
4949
type: "object"
5050
required: ["query", "tool_definitions"]
5151
outputSchema:
52-
tool_call_accuracy:
52+
tool_call_accuracy_score:
5353
type: "ordinal"
5454
desirable_direction: "increase"
5555
min_value: 1
5656
max_value: 5
57-
path: ./evaluator
57+
path: ./evaluator

assets/evaluators/tests/common/base_evaluator_runner.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,8 @@ def _extract_and_print_result(self, results: Dict[str, Any], test_label: str) ->
162162
Dictionary with standardized result fields.
163163
"""
164164
score = results.get(self.result_key)
165+
if score is None:
166+
score = results.get(f"{self.result_key}_score")
165167

166168
if f"{self.result_key}_error_message" not in results and score != "not applicable":
167169
for field in self.expected_result_fields:
@@ -175,6 +177,7 @@ def _extract_and_print_result(self, results: Dict[str, Any], test_label: str) ->
175177

176178
# Optional fields
177179
reason = results.get(f"{self.result_key}_reason")
180+
status = results.get(f"{self.result_key}_status")
178181
threshold = results.get(f"{self._result_prefix}_threshold")
179182
precision = results.get(f"{self._result_prefix}_precision")
180183
recall = results.get(f"{self._result_prefix}_recall")
@@ -195,6 +198,9 @@ def _extract_and_print_result(self, results: Dict[str, Any], test_label: str) ->
195198
if threshold is not None:
196199
print(f" Threshold: {threshold}")
197200
result["threshold"] = threshold
201+
if status is not None:
202+
print(f" Status: {status}")
203+
result["status"] = status
198204
if precision is not None:
199205
print(f" Precision: {precision}")
200206
result["precision"] = precision
@@ -246,15 +252,20 @@ def assert_pass(self, result_data: Dict[str, Any]):
246252
self._assert_pass_result(result_data)
247253

248254
def assert_not_applicable(self, result_data: Dict[str, Any]):
249-
"""Assert a not-applicable result (intermediate response).
255+
"""Assert a not-applicable result (intermediate response or skipped evaluation).
250256
251257
Args:
252258
result_data: Dictionary containing evaluation result data.
253259
254260
Raises:
255261
AssertionError: If the result is not a valid not-applicable result.
256262
"""
257-
self._assert_pass_result(result_data)
263+
label_key = "label"
264+
score_key = "score"
265+
assert result_data[label_key] == "pass", \
266+
f"Expected 'pass' but got '{result_data[label_key]}'"
267+
assert result_data[score_key] is None, \
268+
f"Expected score to be None for not-applicable result but got '{result_data[score_key]}'"
258269
assert "Not applicable" in result_data.get("reason", ""), \
259270
f"Expected reason to contain 'Not applicable' but got '{result_data.get('reason')}'"
260271

assets/evaluators/tests/test_evaluators_behavior/base_evaluator_behavior_test.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import json
1212
import copy
1313
from ..common.base_prompty_evaluator_runner import BasePromptyEvaluatorRunner
14+
from . import common_tool_test_data as _tool_data
1415

1516

1617
class BaseEvaluatorBehaviorTest(BasePromptyEvaluatorRunner):
@@ -862,7 +863,7 @@ def test_function_call_response(self):
862863
query=self.VALID_QUERY,
863864
response=self.FUNCTION_CALL_ONLY_RESPONSE,
864865
tool_calls=self.VALID_TOOL_CALLS,
865-
tool_definitions=self.VALID_TOOL_DEFINITIONS,
866+
tool_definitions=_tool_data.FUNCTION_CALL_RESPONSE_TOOL_DEFINITIONS,
866867
)
867868
result_data = self._extract_and_print_result(results, "Function Call Only - Not Applicable")
868869
self.assert_not_applicable(result_data)
@@ -872,7 +873,7 @@ def test_function_call_response(self):
872873
query=self.VALID_QUERY,
873874
response=self.FUNCTION_CALL_FULL_RESPONSE,
874875
tool_calls=self.VALID_TOOL_CALLS,
875-
tool_definitions=self.VALID_TOOL_DEFINITIONS,
876+
tool_definitions=_tool_data.FUNCTION_CALL_RESPONSE_TOOL_DEFINITIONS,
876877
)
877878
result_data = self._extract_and_print_result(results, "Function Call Full - Preprocessed")
878879
self.assert_pass(result_data)
@@ -884,7 +885,7 @@ def test_mcp_approval_response(self):
884885
query=self.VALID_QUERY,
885886
response=self.MCP_APPROVAL_ONLY_RESPONSE,
886887
tool_calls=self.VALID_TOOL_CALLS,
887-
tool_definitions=self.VALID_TOOL_DEFINITIONS,
888+
tool_definitions=_tool_data.MCP_APPROVAL_RESPONSE_TOOL_DEFINITIONS,
888889
)
889890
result_data = self._extract_and_print_result(results, "MCP Approval Only - Not Applicable")
890891
self.assert_not_applicable(result_data)
@@ -894,7 +895,7 @@ def test_mcp_approval_response(self):
894895
query=self.VALID_QUERY,
895896
response=self.MCP_APPROVAL_FULL_RESPONSE,
896897
tool_calls=self.VALID_TOOL_CALLS,
897-
tool_definitions=self.VALID_TOOL_DEFINITIONS,
898+
tool_definitions=_tool_data.MCP_APPROVAL_RESPONSE_TOOL_DEFINITIONS,
898899
)
899900
result_data = self._extract_and_print_result(results, "MCP Approval Full - Preprocessed")
900901
self.assert_pass(result_data)

assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3211,3 +3211,56 @@
32113211
],
32123212
},
32133213
]
3214+
3215+
# =============================================================================
3216+
# Tool definitions for FUNCTION_CALL_*_RESPONSE / MCP_APPROVAL_*_RESPONSE
3217+
# test data defined in base_evaluator_behavior_test.py.
3218+
# =============================================================================
3219+
3220+
# Tool definitions matching FUNCTION_CALL_ONLY_RESPONSE / FUNCTION_CALL_FULL_RESPONSE
3221+
# (uses the `get_horoscope` function tool).
3222+
FUNCTION_CALL_RESPONSE_TOOL_DEFINITIONS = [
3223+
{
3224+
"name": "get_horoscope",
3225+
"type": "function",
3226+
"description": "Get today's horoscope for an astrological sign.",
3227+
"parameters": {
3228+
"type": "object",
3229+
"properties": {
3230+
"sign": {
3231+
"type": "string",
3232+
"description": "An astrological sign like Taurus or Aquarius",
3233+
}
3234+
},
3235+
"required": ["sign"],
3236+
"additionalProperties": False,
3237+
},
3238+
},
3239+
]
3240+
3241+
# Tool definitions matching MCP_APPROVAL_ONLY_RESPONSE / MCP_APPROVAL_FULL_RESPONSE
3242+
# (uses the `microsoft_docs_search` tool surfaced via MCP).
3243+
MCP_APPROVAL_RESPONSE_TOOL_DEFINITIONS = [
3244+
{
3245+
"name": "microsoft_docs_search",
3246+
"type": "function",
3247+
"description": (
3248+
"Search official Microsoft/Azure documentation to find the most relevant "
3249+
"and trustworthy content for a user's query."
3250+
),
3251+
"parameters": {
3252+
"type": "object",
3253+
"properties": {
3254+
"query": {
3255+
"type": "string",
3256+
"description": (
3257+
"a query or topic about Microsoft/Azure products, services, "
3258+
"platforms, developer tools, frameworks, or APIs"
3259+
),
3260+
},
3261+
},
3262+
"required": ["query"],
3263+
"additionalProperties": False,
3264+
},
3265+
},
3266+
]

assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
"""
99

1010
import pytest
11+
from typing import List
1112
from .base_tool_calls_evaluator_behavior_test import BaseToolCallEvaluatorBehaviorTest
1213
from .base_tool_evaluation_test import BaseToolEvaluationTest
1314
from . import common_tool_test_data as data
@@ -69,3 +70,17 @@ class TestToolCallAccuracyEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, B
6970
is_tool_definition_required = True
7071

7172
MINIMAL_RESPONSE = BaseToolCallEvaluatorBehaviorTest.email_tool_call_and_assistant_response
73+
74+
@property
75+
def expected_result_fields(self) -> List[str]:
76+
"""Get the expected result fields for tools evaluators."""
77+
return [
78+
f"{self.result_key}",
79+
f"{self.result_key}_score",
80+
f"{self.result_key}_result",
81+
f"{self.result_key}_passed",
82+
f"{self.result_key}_reason",
83+
f"{self.result_key}_status",
84+
f"{self.result_key}_threshold",
85+
f"{self.result_key}_properties",
86+
]

assets/evaluators/tests/test_evaluators_quality/test_tool_call_accuracy_evaluator_quality.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""Quality tests for Tool Call Accuracy Evaluator with real flow execution."""
55

66
import pytest
7+
from typing import List
78
from ..common.base_quality_evaluator_runner import BaseQualityEvaluatorRunner, ExpectedResult
89
from ...builtin.tool_call_accuracy.evaluator._tool_call_accuracy import ToolCallAccuracyEvaluator
910

@@ -16,6 +17,20 @@ class TestToolCallAccuracyEvaluatorQuality(BaseQualityEvaluatorRunner):
1617
Tests actual LLM evaluation with real flow execution (no mocking).
1718
"""
1819

20+
@property
21+
def expected_result_fields(self) -> List[str]:
22+
"""Get the expected result fields for tools evaluators."""
23+
return [
24+
f"{self.result_key}",
25+
f"{self.result_key}_score",
26+
f"{self.result_key}_result",
27+
f"{self.result_key}_passed",
28+
f"{self.result_key}_reason",
29+
f"{self.result_key}_status",
30+
f"{self.result_key}_threshold",
31+
f"{self.result_key}_properties",
32+
]
33+
1934
evaluator_type = ToolCallAccuracyEvaluator
2035

2136
def test_pass_single_call(self) -> None:

0 commit comments

Comments
 (0)