From b90b3e64044fd84531e8811bad09210d519e9ed6 Mon Sep 17 00:00:00 2001 From: mohessie Date: Thu, 9 Apr 2026 21:54:25 +0200 Subject: [PATCH 1/7] Unify the output of Tool Call Accuracy --- .../evaluator/_tool_call_accuracy.py | 50 +++++++++---------- .../evaluator/tool_call_accuracy.prompty | 8 +-- .../builtin/tool_call_accuracy/spec.yaml | 2 +- .../tests/common/base_evaluator_runner.py | 6 +++ ...t_tool_call_accuracy_evaluator_behavior.py | 13 +++++ 5 files changed, 48 insertions(+), 31 deletions(-) diff --git a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py index 130b595564..3151a4d745 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py +++ b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py @@ -805,7 +805,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided." _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5." - _LLM_SCORE_KEY = "tool_calls_success_level" + _LLM_SCORE_KEY = "score" _validator: ValidatorInterface @@ -990,22 +990,26 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t ) # Format the output - reason = llm_output.get("chain_of_thought", "") + reason = llm_output.get("reasoning", "") score = float(score) score_result = "pass" if score >= self.threshold else "fail" + llm_properties = llm_output.get("properties", {}) + llm_properties.update({ + "prompt_tokens": prompty_output_dict.get("input_token_count", 0), + "completion_tokens": prompty_output_dict.get("output_token_count", 0), + "total_tokens": prompty_output_dict.get("total_token_count", 0), + "finish_reason": prompty_output_dict.get("finish_reason", ""), + "model": prompty_output_dict.get("model_id", ""), + "sample_input": prompty_output_dict.get("sample_input", ""), + "sample_output": prompty_output_dict.get("sample_output", ""), + }) response_dict = { - self._result_key: score, - f"{self._result_key}_result": score_result, - f"{self._result_key}_threshold": self._threshold, - f"{self._result_key}_reason": reason, - f"{self._result_key}_details": llm_output.get("details", {}), - f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0), - f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0), - f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0), - f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""), - f"{self._result_key}_model": prompty_output_dict.get("model_id", ""), - f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""), - f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""), + f"{self.result_key}_score": score, + f"{self.result_key}_result": score_result, + f"{self.result_key}_reasoning": reason, + f"{self.result_key}_status": "completed", + f"{self.result_key}_threshold": self._threshold, + f"{self.result_key}_properties": llm_properties, } return response_dict @@ -1061,18 +1065,12 @@ def _not_applicable_result( :rtype: Dict[str, Union[str, float]] """ return { - self._result_key: threshold, - f"{self._result_key}_result": "pass", - f"{self._result_key}_threshold": threshold, - f"{self._result_key}_reason": f"Not applicable: {error_message}", - f"{self._result_key}_details": {}, - f"{self._result_key}_prompt_tokens": 0, - f"{self._result_key}_completion_tokens": 0, - f"{self._result_key}_total_tokens": 0, - f"{self._result_key}_finish_reason": "", - f"{self._result_key}_model": "", - f"{self._result_key}_sample_input": "", - f"{self._result_key}_sample_output": "", + f"{self.result_key}_score": None, + f"{self.result_key}_result": "not_applicable", + f"{self.result_key}_reasoning": f"Not applicable: {error_message}", + f"{self.result_key}_status": "skipped", + f"{self.result_key}_threshold": threshold, + f"{self.result_key}_properties": {}, } def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): diff --git a/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty b/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty index 4713b65f4e..f8ceffbcad 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty +++ b/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty @@ -139,10 +139,10 @@ TOOL DEFINITIONS: {{tool_definitions}} # Tasks ## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above. -Your output should consist only of a JSON object, as provided in the examples, that has the following keys: - - chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. - - tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. - - details: a dictionary that contains the following keys: +Your output should consist only of a JSON object that has the following keys: + - score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. + - reasoning: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. + - properties: a dictionary that contains the following keys: - tool_calls_made_by_agent: total number of tool calls made by the agent - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent - per_tool_call_details: a list of dictionaries, each containing: diff --git a/assets/evaluators/builtin/tool_call_accuracy/spec.yaml b/assets/evaluators/builtin/tool_call_accuracy/spec.yaml index a0f1037317..84b942eb7e 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/spec.yaml +++ b/assets/evaluators/builtin/tool_call_accuracy/spec.yaml @@ -49,7 +49,7 @@ dataMappingSchema: type: "object" required: ["query", "tool_definitions"] outputSchema: - tool_call_accuracy: + tool_call_accuracy_score: type: "ordinal" desirable_direction: "increase" min_value: 1 diff --git a/assets/evaluators/tests/common/base_evaluator_runner.py b/assets/evaluators/tests/common/base_evaluator_runner.py index d084afbdae..147054aa3c 100644 --- a/assets/evaluators/tests/common/base_evaluator_runner.py +++ b/assets/evaluators/tests/common/base_evaluator_runner.py @@ -169,6 +169,9 @@ def _extract_and_print_result(self, results: Dict[str, Any], test_label: str) -> # Optional fields reason = results.get(f"{self.result_key}_reason") + if reason is None: + reason = results.get(f"{self._result_prefix}_reasoning") + status = results.get(f"{self.result_key}_status") threshold = results.get(f"{self._result_prefix}_threshold") precision = results.get(f"{self._result_prefix}_precision") recall = results.get(f"{self._result_prefix}_recall") @@ -189,6 +192,9 @@ def _extract_and_print_result(self, results: Dict[str, Any], test_label: str) -> if threshold is not None: print(f" Threshold: {threshold}") result["threshold"] = threshold + if status is not None: + print(f" Status: {status}") + result["status"] = status if precision is not None: print(f" Precision: {precision}") result["precision"] = precision diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py index 64308ef563..86a80ae023 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py @@ -8,6 +8,7 @@ """ import pytest +from typing import List from .base_tool_calls_evaluator_behavior_test import BaseToolCallEvaluatorBehaviorTest from .base_tool_evaluation_test import BaseToolEvaluationTest from . import common_tool_test_data as data @@ -69,3 +70,15 @@ class TestToolCallAccuracyEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, B is_tool_definition_required = True MINIMAL_RESPONSE = BaseToolCallEvaluatorBehaviorTest.email_tool_call_and_assistant_response + + @property + def expected_result_fields(self) -> List[str]: + """Get the expected result fields for tools evaluators.""" + return [ + f"{self.result_key}_score", + f"{self.result_key}_reasoning", + f"{self.result_key}_status", + f"{self.result_key}_threshold", + f"{self.result_key}_result", + f"{self.result_key}_properties", + ] From ea272f06d563d8bbdfe5bcf0f9536b15e73cc3f5 Mon Sep 17 00:00:00 2001 From: mohessie Date: Wed, 15 Apr 2026 00:10:26 +0200 Subject: [PATCH 2/7] Add status to prompty --- .../evaluator/_tool_call_accuracy.py | 36 +++++++++++-------- .../evaluator/tool_call_accuracy.prompty | 17 +++++++-- .../tests/common/base_evaluator_runner.py | 14 ++++++-- ...st_tool_call_accuracy_evaluator_quality.py | 13 +++++++ 4 files changed, 62 insertions(+), 18 deletions(-) diff --git a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py index 3151a4d745..c4f740a57d 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py +++ b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py @@ -974,6 +974,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t llm_output = prompty_output_dict.get("llm_output", prompty_output_dict) if isinstance(llm_output, dict): + # Handle skipped status from LLM + llm_status = llm_output.get("status", "completed") + if llm_status == "skipped": + reason = llm_output.get("reasoning", "") + return self._not_applicable_result(reason, self.threshold) + score = llm_output.get(self._LLM_SCORE_KEY, None) if not score or not check_score_is_valid( score, @@ -1004,12 +1010,13 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t "sample_output": prompty_output_dict.get("sample_output", ""), }) response_dict = { - f"{self.result_key}_score": score, - f"{self.result_key}_result": score_result, - f"{self.result_key}_reasoning": reason, - f"{self.result_key}_status": "completed", - f"{self.result_key}_threshold": self._threshold, - f"{self.result_key}_properties": llm_properties, + f"{self._result_key}_score": score, + f"{self._result_key}_result": score_result, + f"{self._result_key}_passed": score_result == "pass", + f"{self._result_key}_reasoning": reason, + f"{self._result_key}_status": "completed", + f"{self._result_key}_threshold": self._threshold, + f"{self._result_key}_properties": llm_properties, } return response_dict @@ -1054,7 +1061,7 @@ async def _real_call(self, **kwargs): def _not_applicable_result( self, error_message: str, threshold: Union[int, float] - ) -> Dict[str, Union[str, float, Dict]]: + ) -> Dict[str, Union[str, float, Dict, None]]: """Return a result indicating that the tool call is not applicable for evaluation. :param error_message: The error message indicating why the evaluation is not applicable. @@ -1062,15 +1069,16 @@ def _not_applicable_result( :param threshold: The threshold value for the evaluation. :type threshold: Union[int, float] :return: A dictionary containing the result of the evaluation. - :rtype: Dict[str, Union[str, float]] + :rtype: Dict[str, Union[str, float, None]] """ return { - f"{self.result_key}_score": None, - f"{self.result_key}_result": "not_applicable", - f"{self.result_key}_reasoning": f"Not applicable: {error_message}", - f"{self.result_key}_status": "skipped", - f"{self.result_key}_threshold": threshold, - f"{self.result_key}_properties": {}, + f"{self._result_key}_score": None, + f"{self._result_key}_result": "not_applicable", + f"{self._result_key}_passed": None, + f"{self._result_key}_reasoning": f"Not applicable: {error_message}", + f"{self._result_key}_status": "skipped", + f"{self._result_key}_threshold": threshold, + f"{self._result_key}_properties": {}, } def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): diff --git a/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty b/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty index f8ceffbcad..f327862565 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty +++ b/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty @@ -54,6 +54,16 @@ Evaluate based on these factors: **Tool Assessment**: Focus solely on appropriate use of available tools, not on capabilities beyond what tools can provide. +## Status: Skipped +Before performing any evaluation, check for the following conditions. If ANY are true, return `status: "skipped"` immediately without scoring: +1. **No tool calls to evaluate**: The TOOL CALLS TO BE EVALUATED section is empty (tool calls appearing only in the CONVERSATION section do not count). +2. **Missing tool definitions**: Any tool call in TOOL CALLS TO BE EVALUATED references a tool that is not present in the TOOL DEFINITIONS. + +When skipped, return: +```json +{"score": null, "reasoning": "", "status": "skipped", "properties": null} +``` + # Ratings ## [Tool Call Accuracy: 1] (Irrelevant) @@ -140,8 +150,11 @@ TOOL DEFINITIONS: {{tool_definitions}} # Tasks ## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above. Your output should consist only of a JSON object that has the following keys: - - score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. - - reasoning: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. + - score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. Set to null when status is "skipped". + - status: a string indicating the evaluation status. Must be one of: + - "completed": tool calls were present, tool definitions were available, and evaluation was performed. + - "skipped": evaluation was not performed because there were no tool calls to evaluate, or tool definitions were missing for the tool calls. When skipped, set score to null and properties to null. + - reasoning: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped. - properties: a dictionary that contains the following keys: - tool_calls_made_by_agent: total number of tool calls made by the agent - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent diff --git a/assets/evaluators/tests/common/base_evaluator_runner.py b/assets/evaluators/tests/common/base_evaluator_runner.py index 147054aa3c..2df9ff7d89 100644 --- a/assets/evaluators/tests/common/base_evaluator_runner.py +++ b/assets/evaluators/tests/common/base_evaluator_runner.py @@ -156,6 +156,8 @@ def _extract_and_print_result(self, results: Dict[str, Any], test_label: str) -> Dictionary with standardized result fields. """ score = results.get(self.result_key) + if score is None: + score = results.get(f"{self.result_key}_score") if f"{self.result_key}_error_message" not in results and score != "not applicable": for field in self.expected_result_fields: @@ -246,7 +248,7 @@ def assert_pass(self, result_data: Dict[str, Any]): self._assert_pass_result(result_data) def assert_not_applicable(self, result_data: Dict[str, Any]): - """Assert a not-applicable result (intermediate response). + """Assert a not-applicable result (intermediate response or skipped evaluation). Args: result_data: Dictionary containing evaluation result data. @@ -254,7 +256,15 @@ def assert_not_applicable(self, result_data: Dict[str, Any]): Raises: AssertionError: If the result is not a valid not-applicable result. """ - self._assert_pass_result(result_data) + label_key = "label" + score_key = "score" + if result_data[label_key] == "not_applicable": + assert result_data[label_key] == "not_applicable", \ + f"Expected 'not_applicable' but got '{result_data[label_key]}'" + assert result_data[score_key] is None, \ + f"Expected score to be None for not-applicable result but got '{result_data[score_key]}'" + else: + self._assert_pass_result(result_data) assert "Not applicable" in result_data.get("reason", ""), \ f"Expected reason to contain 'Not applicable' but got '{result_data.get('reason')}'" diff --git a/assets/evaluators/tests/test_evaluators_quality/test_tool_call_accuracy_evaluator_quality.py b/assets/evaluators/tests/test_evaluators_quality/test_tool_call_accuracy_evaluator_quality.py index cc37c6a4f3..3d12932d58 100644 --- a/assets/evaluators/tests/test_evaluators_quality/test_tool_call_accuracy_evaluator_quality.py +++ b/assets/evaluators/tests/test_evaluators_quality/test_tool_call_accuracy_evaluator_quality.py @@ -4,6 +4,7 @@ """Quality tests for Tool Call Accuracy Evaluator with real flow execution.""" import pytest +from typing import List from ..common.base_quality_evaluator_runner import BaseQualityEvaluatorRunner, ExpectedResult from ...builtin.tool_call_accuracy.evaluator._tool_call_accuracy import ToolCallAccuracyEvaluator @@ -16,6 +17,18 @@ class TestToolCallAccuracyEvaluatorQuality(BaseQualityEvaluatorRunner): Tests actual LLM evaluation with real flow execution (no mocking). """ + @property + def expected_result_fields(self) -> List[str]: + """Get the expected result fields for tools evaluators.""" + return [ + f"{self.result_key}_score", + f"{self.result_key}_reasoning", + f"{self.result_key}_status", + f"{self.result_key}_threshold", + f"{self.result_key}_result", + f"{self.result_key}_properties", + ] + evaluator_type = ToolCallAccuracyEvaluator def test_pass_single_call(self) -> None: From d026760892dd3636b645b0de9255226e88fe1291 Mon Sep 17 00:00:00 2001 From: mohessie Date: Thu, 23 Apr 2026 20:03:59 +0200 Subject: [PATCH 3/7] Update Tool Call Accuracy Output Format --- .../evaluator/_tool_call_accuracy.py | 34 +++++++++++-------- .../evaluator/tool_call_accuracy.prompty | 4 +-- .../tests/common/base_evaluator_runner.py | 2 -- ...t_tool_call_accuracy_evaluator_behavior.py | 6 ++-- ...st_tool_call_accuracy_evaluator_quality.py | 6 ++-- 5 files changed, 29 insertions(+), 23 deletions(-) diff --git a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py index 0b59cce6cc..0f9d9fb1f6 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py +++ b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py @@ -976,7 +976,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t # Handle skipped status from LLM llm_status = llm_output.get("status", "completed") if llm_status == "skipped": - reason = llm_output.get("reasoning", "") + reason = llm_output.get("reason", "") return self._not_applicable_result(reason, self.threshold) score = llm_output.get(self._LLM_SCORE_KEY, None) @@ -995,24 +995,27 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t ) # Format the output - reason = llm_output.get("reasoning", "") + reason = llm_output.get("reason", "") score = float(score) score_result = "pass" if score >= self.threshold else "fail" - llm_properties = llm_output.get("properties", {}) - llm_properties.update({ - "prompt_tokens": prompty_output_dict.get("input_token_count", 0), - "completion_tokens": prompty_output_dict.get("output_token_count", 0), - "total_tokens": prompty_output_dict.get("total_token_count", 0), - "finish_reason": prompty_output_dict.get("finish_reason", ""), - "model": prompty_output_dict.get("model_id", ""), - "sample_input": prompty_output_dict.get("sample_input", ""), - "sample_output": prompty_output_dict.get("sample_output", ""), - }) + llm_properties = llm_output.get("properties", {}) or {} + llm_properties.update( + { + "prompt_tokens": prompty_output_dict.get("input_token_count", 0), + "completion_tokens": prompty_output_dict.get("output_token_count", 0), + "total_tokens": prompty_output_dict.get("total_token_count", 0), + "finish_reason": prompty_output_dict.get("finish_reason", ""), + "model": prompty_output_dict.get("model_id", ""), + "sample_input": prompty_output_dict.get("sample_input", ""), + "sample_output": prompty_output_dict.get("sample_output", ""), + } + ) response_dict = { + self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_result": score_result, f"{self._result_key}_passed": score_result == "pass", - f"{self._result_key}_reasoning": reason, + f"{self._result_key}_reason": reason, f"{self._result_key}_status": "completed", f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, @@ -1071,13 +1074,14 @@ def _not_applicable_result( :rtype: Dict[str, Union[str, float, None]] """ return { + f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_result": "not_applicable", f"{self._result_key}_passed": None, - f"{self._result_key}_reasoning": f"Not applicable: {error_message}", + f"{self._result_key}_reason": f"Not applicable: {error_message}", f"{self._result_key}_status": "skipped", f"{self._result_key}_threshold": threshold, - f"{self._result_key}_properties": {}, + f"{self._result_key}_properties": None, } def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): diff --git a/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty b/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty index f327862565..c6722848fe 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty +++ b/assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty @@ -61,7 +61,7 @@ Before performing any evaluation, check for the following conditions. If ANY are When skipped, return: ```json -{"score": null, "reasoning": "", "status": "skipped", "properties": null} +{"reason": "", "score": null, "status": "skipped", "properties": null} ``` @@ -150,11 +150,11 @@ TOOL DEFINITIONS: {{tool_definitions}} # Tasks ## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above. Your output should consist only of a JSON object that has the following keys: + - reason: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped. - score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. Set to null when status is "skipped". - status: a string indicating the evaluation status. Must be one of: - "completed": tool calls were present, tool definitions were available, and evaluation was performed. - "skipped": evaluation was not performed because there were no tool calls to evaluate, or tool definitions were missing for the tool calls. When skipped, set score to null and properties to null. - - reasoning: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped. - properties: a dictionary that contains the following keys: - tool_calls_made_by_agent: total number of tool calls made by the agent - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent diff --git a/assets/evaluators/tests/common/base_evaluator_runner.py b/assets/evaluators/tests/common/base_evaluator_runner.py index 2df9ff7d89..f1fd8d1d88 100644 --- a/assets/evaluators/tests/common/base_evaluator_runner.py +++ b/assets/evaluators/tests/common/base_evaluator_runner.py @@ -171,8 +171,6 @@ def _extract_and_print_result(self, results: Dict[str, Any], test_label: str) -> # Optional fields reason = results.get(f"{self.result_key}_reason") - if reason is None: - reason = results.get(f"{self._result_prefix}_reasoning") status = results.get(f"{self.result_key}_status") threshold = results.get(f"{self._result_prefix}_threshold") precision = results.get(f"{self._result_prefix}_precision") diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py index 86a80ae023..6e1094154b 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_tool_call_accuracy_evaluator_behavior.py @@ -75,10 +75,12 @@ class TestToolCallAccuracyEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, B def expected_result_fields(self) -> List[str]: """Get the expected result fields for tools evaluators.""" return [ + f"{self.result_key}", f"{self.result_key}_score", - f"{self.result_key}_reasoning", + f"{self.result_key}_result", + f"{self.result_key}_passed", + f"{self.result_key}_reason", f"{self.result_key}_status", f"{self.result_key}_threshold", - f"{self.result_key}_result", f"{self.result_key}_properties", ] diff --git a/assets/evaluators/tests/test_evaluators_quality/test_tool_call_accuracy_evaluator_quality.py b/assets/evaluators/tests/test_evaluators_quality/test_tool_call_accuracy_evaluator_quality.py index 3d12932d58..1c2b9d07af 100644 --- a/assets/evaluators/tests/test_evaluators_quality/test_tool_call_accuracy_evaluator_quality.py +++ b/assets/evaluators/tests/test_evaluators_quality/test_tool_call_accuracy_evaluator_quality.py @@ -21,11 +21,13 @@ class TestToolCallAccuracyEvaluatorQuality(BaseQualityEvaluatorRunner): def expected_result_fields(self) -> List[str]: """Get the expected result fields for tools evaluators.""" return [ + f"{self.result_key}", f"{self.result_key}_score", - f"{self.result_key}_reasoning", + f"{self.result_key}_result", + f"{self.result_key}_passed", + f"{self.result_key}_reason", f"{self.result_key}_status", f"{self.result_key}_threshold", - f"{self.result_key}_result", f"{self.result_key}_properties", ] From 505d0dc17c2eb9b5fcaf9198b2c6354e4c32ca5a Mon Sep 17 00:00:00 2001 From: mohessie Date: Thu, 23 Apr 2026 22:03:39 +0200 Subject: [PATCH 4/7] Update documentation to state deprecate 'gpt_' prefix Co-authored-by: Copilot --- .../tool_call_accuracy/evaluator/_tool_call_accuracy.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py index 0f9d9fb1f6..59a2353d8d 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py +++ b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py @@ -787,9 +787,10 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): .. note:: - To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added. - To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output; - however, it is recommended to use the new key moving forward as the old key will be deprecated in the future. + The output field "details" has been renamed to "tool_call_accuracy_properties" for clarity. + + The `gpt_` prefix is deprecated. Use `_score` suffix instead. + """ _PROMPTY_FILE = "tool_call_accuracy.prompty" From 7a8632adfd090405a1eceffac7e903ce86336786 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 22:18:53 +0200 Subject: [PATCH 5/7] Rename not_applicable to pass in tool_call_accuracy result key and update tests (#4964) Agent-Logs-Url: https://github.com/Azure/azureml-assets/sessions/ba5b2838-661b-419e-9645-b960cc227d25 Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> --- .../evaluator/_tool_call_accuracy.py | 2 +- .../evaluators/tests/common/base_evaluator_runner.py | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py index 59a2353d8d..e7e417cb26 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py +++ b/assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py @@ -1077,7 +1077,7 @@ def _not_applicable_result( return { f"{self._result_key}": None, f"{self._result_key}_score": None, - f"{self._result_key}_result": "not_applicable", + f"{self._result_key}_result": "pass", f"{self._result_key}_passed": None, f"{self._result_key}_reason": f"Not applicable: {error_message}", f"{self._result_key}_status": "skipped", diff --git a/assets/evaluators/tests/common/base_evaluator_runner.py b/assets/evaluators/tests/common/base_evaluator_runner.py index f1fd8d1d88..c48a514632 100644 --- a/assets/evaluators/tests/common/base_evaluator_runner.py +++ b/assets/evaluators/tests/common/base_evaluator_runner.py @@ -256,13 +256,10 @@ def assert_not_applicable(self, result_data: Dict[str, Any]): """ label_key = "label" score_key = "score" - if result_data[label_key] == "not_applicable": - assert result_data[label_key] == "not_applicable", \ - f"Expected 'not_applicable' but got '{result_data[label_key]}'" - assert result_data[score_key] is None, \ - f"Expected score to be None for not-applicable result but got '{result_data[score_key]}'" - else: - self._assert_pass_result(result_data) + assert result_data[label_key] == "pass", \ + f"Expected 'pass' but got '{result_data[label_key]}'" + assert result_data[score_key] is None, \ + f"Expected score to be None for not-applicable result but got '{result_data[score_key]}'" assert "Not applicable" in result_data.get("reason", ""), \ f"Expected reason to contain 'Not applicable' but got '{result_data.get('reason')}'" From 5dd543e6399a3b806ccacbb07e75420f9797b817 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Sun, 26 Apr 2026 20:54:22 +0300 Subject: [PATCH 6/7] Use response-specific tool definitions in function_call/mcp_approval tests (#4971) Agent-Logs-Url: https://github.com/Azure/azureml-assets/sessions/0d2db933-6e9b-4b8d-b1a9-789026ec14c8 Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> --- .../base_evaluator_behavior_test.py | 9 ++-- .../common_tool_test_data.py | 53 +++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/assets/evaluators/tests/test_evaluators_behavior/base_evaluator_behavior_test.py b/assets/evaluators/tests/test_evaluators_behavior/base_evaluator_behavior_test.py index 5bde7c77b3..c8d444cbb1 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/base_evaluator_behavior_test.py +++ b/assets/evaluators/tests/test_evaluators_behavior/base_evaluator_behavior_test.py @@ -11,6 +11,7 @@ import json import copy from ..common.base_prompty_evaluator_runner import BasePromptyEvaluatorRunner +from . import common_tool_test_data as _tool_data class BaseEvaluatorBehaviorTest(BasePromptyEvaluatorRunner): @@ -862,7 +863,7 @@ def test_function_call_response(self): query=self.VALID_QUERY, response=self.FUNCTION_CALL_ONLY_RESPONSE, tool_calls=self.VALID_TOOL_CALLS, - tool_definitions=self.VALID_TOOL_DEFINITIONS, + tool_definitions=_tool_data.FUNCTION_CALL_RESPONSE_TOOL_DEFINITIONS, ) result_data = self._extract_and_print_result(results, "Function Call Only - Not Applicable") self.assert_not_applicable(result_data) @@ -872,7 +873,7 @@ def test_function_call_response(self): query=self.VALID_QUERY, response=self.FUNCTION_CALL_FULL_RESPONSE, tool_calls=self.VALID_TOOL_CALLS, - tool_definitions=self.VALID_TOOL_DEFINITIONS, + tool_definitions=_tool_data.FUNCTION_CALL_RESPONSE_TOOL_DEFINITIONS, ) result_data = self._extract_and_print_result(results, "Function Call Full - Preprocessed") self.assert_pass(result_data) @@ -884,7 +885,7 @@ def test_mcp_approval_response(self): query=self.VALID_QUERY, response=self.MCP_APPROVAL_ONLY_RESPONSE, tool_calls=self.VALID_TOOL_CALLS, - tool_definitions=self.VALID_TOOL_DEFINITIONS, + tool_definitions=_tool_data.MCP_APPROVAL_RESPONSE_TOOL_DEFINITIONS, ) result_data = self._extract_and_print_result(results, "MCP Approval Only - Not Applicable") self.assert_not_applicable(result_data) @@ -894,7 +895,7 @@ def test_mcp_approval_response(self): query=self.VALID_QUERY, response=self.MCP_APPROVAL_FULL_RESPONSE, tool_calls=self.VALID_TOOL_CALLS, - tool_definitions=self.VALID_TOOL_DEFINITIONS, + tool_definitions=_tool_data.MCP_APPROVAL_RESPONSE_TOOL_DEFINITIONS, ) result_data = self._extract_and_print_result(results, "MCP Approval Full - Preprocessed") self.assert_pass(result_data) diff --git a/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py b/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py index 7a4c635177..9ddd84f291 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py +++ b/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py @@ -3211,3 +3211,56 @@ ], }, ] + +# ============================================================================= +# Tool definitions for FUNCTION_CALL_*_RESPONSE / MCP_APPROVAL_*_RESPONSE +# test data defined in base_evaluator_behavior_test.py. +# ============================================================================= + +# Tool definitions matching FUNCTION_CALL_ONLY_RESPONSE / FUNCTION_CALL_FULL_RESPONSE +# (uses the `get_horoscope` function tool). +FUNCTION_CALL_RESPONSE_TOOL_DEFINITIONS = [ + { + "name": "get_horoscope", + "type": "function", + "description": "Get today's horoscope for an astrological sign.", + "parameters": { + "type": "object", + "properties": { + "sign": { + "type": "string", + "description": "An astrological sign like Taurus or Aquarius", + } + }, + "required": ["sign"], + "additionalProperties": False, + }, + }, +] + +# Tool definitions matching MCP_APPROVAL_ONLY_RESPONSE / MCP_APPROVAL_FULL_RESPONSE +# (uses the `microsoft_docs_search` tool surfaced via MCP). +MCP_APPROVAL_RESPONSE_TOOL_DEFINITIONS = [ + { + "name": "microsoft_docs_search", + "type": "function", + "description": ( + "Search official Microsoft/Azure documentation to find the most relevant " + "and trustworthy content for a user's query." + ), + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": ( + "a query or topic about Microsoft/Azure products, services, " + "platforms, developer tools, frameworks, or APIs" + ), + }, + }, + "required": ["query"], + "additionalProperties": False, + }, + }, +] From 10a1ff261ae9955f56b9f8cf61e1c8e6a2f33744 Mon Sep 17 00:00:00 2001 From: Mohamed Hessien Date: Sun, 26 Apr 2026 20:55:34 +0300 Subject: [PATCH 7/7] Bump tool_call_accuracy evaluator version to 9 --- assets/evaluators/builtin/tool_call_accuracy/spec.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/evaluators/builtin/tool_call_accuracy/spec.yaml b/assets/evaluators/builtin/tool_call_accuracy/spec.yaml index cb5035074a..0207692c1e 100644 --- a/assets/evaluators/builtin/tool_call_accuracy/spec.yaml +++ b/assets/evaluators/builtin/tool_call_accuracy/spec.yaml @@ -1,6 +1,6 @@ type: "evaluator" name: "builtin.tool_call_accuracy" -version: 8 +version: 9 displayName: "Tool-Call-Accuracy-Evaluator" description: "Measures whether the agent selects the correct tool calls, applies the correct parameters, and tracks inefficient or missing too calls, in order to resolve a user's request. This is an umbrella evaluators that assessing overall tool call quality. Use this metric in agent-based systems, and AI assistants that rely on tool integration." evaluatorType: "builtin" @@ -54,4 +54,4 @@ outputSchema: desirable_direction: "increase" min_value: 1 max_value: 5 -path: ./evaluator \ No newline at end of file +path: ./evaluator