
Commit 3eb40a8

Update Tool Call Accuracy to output unified format
1 parent bec9bd8 commit 3eb40a8

3 files changed

Lines changed: 63 additions & 17 deletions


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -438,3 +438,26 @@ def _not_applicable_result(
         result[f"{self._result_key}_details"] = {}
 
         return result
+
+    # TODO: After all evaluator outputs are updated, we can remove the _not_applicable_result method and replace calls to it with _return_not_applicable_result, which returns a "skipped" status instead of "pass" to avoid confusion.
+    def _return_not_applicable_result(
+        self, error_message: str, threshold: Union[int, float]
+    ) -> Dict[str, Union[str, float, Dict, None]]:
+        """Return a result indicating that the tool call is not applicable for evaluation.
+
+        :param error_message: The error message indicating why the evaluation is not applicable.
+        :type error_message: str
+        :param threshold: The threshold value for the evaluation.
+        :type threshold: Union[int, float]
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float, Dict, None]]
+        """
+        return {
+            f"{self._result_key}_score": None,
+            f"{self._result_key}_result": "not_applicable",
+            f"{self._result_key}_passed": None,
+            f"{self._result_key}_reasoning": f"Not applicable: {error_message}",
+            f"{self._result_key}_status": "skipped",
+            f"{self._result_key}_threshold": threshold,
+            f"{self._result_key}_properties": {},
+        }
```
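
For orientation, here is a minimal standalone sketch of what this helper emits. The `tool_call_accuracy` result key and the threshold of 3 are hypothetical values for illustration, not taken from this commit:

```python
from typing import Dict, Union

def return_not_applicable_result(
    result_key: str, error_message: str, threshold: Union[int, float]
) -> Dict[str, Union[str, float, Dict, None]]:
    # Mirrors the new helper: null score, "skipped" status, empty properties.
    return {
        f"{result_key}_score": None,
        f"{result_key}_result": "not_applicable",
        f"{result_key}_passed": None,
        f"{result_key}_reasoning": f"Not applicable: {error_message}",
        f"{result_key}_status": "skipped",
        f"{result_key}_threshold": threshold,
        f"{result_key}_properties": {},
    }

# Hypothetical usage; the message reuses _TOOL_DEFINITIONS_MISSING_MESSAGE
# from the evaluator class in the next file's diff.
print(return_not_applicable_result(
    "tool_call_accuracy",
    "Tool definitions for all tool calls must be provided.",
    threshold=3,
))
```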

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 23 additions & 13 deletions
```diff
@@ -86,7 +86,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
     _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."
 
-    _LLM_SCORE_KEY = "tool_calls_success_level"
+    _LLM_SCORE_KEY = "score"
 
     _validator: ValidatorInterface
 
@@ -256,6 +256,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
         if isinstance(llm_output, dict):
+            # Handle skipped status from LLM
+            llm_status = llm_output.get("status", "completed")
+            if llm_status == "skipped":
+                reason = llm_output.get("reasoning", "")
+                return self._return_not_applicable_result(reason, self.threshold)
+
             score = llm_output.get(self._LLM_SCORE_KEY, None)
             if not score or not check_score_is_valid(
                 score,
@@ -271,23 +277,27 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 )
 
             # Format the output
-            reason = llm_output.get("chain_of_thought", "")
+            reason = llm_output.get("reasoning", "")
             score = float(score)
             score_result = "pass" if score >= self.threshold else "fail"
+            llm_properties = llm_output.get("properties", {})
+            llm_properties.update({
+                "prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                "completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                "total_tokens": prompty_output_dict.get("total_token_count", 0),
+                "finish_reason": prompty_output_dict.get("finish_reason", ""),
+                "model": prompty_output_dict.get("model_id", ""),
+                "sample_input": prompty_output_dict.get("sample_input", ""),
+                "sample_output": prompty_output_dict.get("sample_output", ""),
+            })
             response_dict = {
-                self._result_key: score,
-                f"gpt_{self._result_key}": score,
+                f"{self._result_key}_score": score,
                 f"{self._result_key}_result": score_result,
+                f"{self._result_key}_passed": score_result == "pass",
+                f"{self._result_key}_reasoning": reason,
+                f"{self._result_key}_status": "completed",
                 f"{self._result_key}_threshold": self._threshold,
-                f"{self._result_key}_reason": reason,
-                f"{self._result_key}_details": llm_output.get("details", {}),
-                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
-                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
-                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
-                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
-                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
-                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
-                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+                f"{self._result_key}_properties": llm_properties,
             }
             return response_dict
 
```
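
To make the key changes concrete, the sketch below shows the unified "completed" payload this method now assembles. All values are fabricated; the `tool_call_accuracy` result key and threshold of 3 are assumptions for illustration:

```python
# Illustrative only: metadata that used to live in flat top-level keys such as
# "<key>_prompt_tokens" and "<key>_model" now nests under "<key>_properties".
completed_result = {
    "tool_call_accuracy_score": 4.0,
    "tool_call_accuracy_result": "pass",
    "tool_call_accuracy_passed": True,
    "tool_call_accuracy_reasoning": "Let's think step by step: ...",
    "tool_call_accuracy_status": "completed",
    "tool_call_accuracy_threshold": 3,
    "tool_call_accuracy_properties": {
        "tool_calls_made_by_agent": 2,          # copied from the LLM's "properties"
        "correct_tool_calls_made_by_agent": 2,
        "per_tool_call_details": [],
        "prompt_tokens": 512,                   # merged in from prompty_output_dict
        "completion_tokens": 64,
        "total_tokens": 576,
        "finish_reason": "stop",
        "model": "example-model-id",            # hypothetical model id
        "sample_input": "",
        "sample_output": "",
    },
}

# The pass/fail fields stay consistent with the score and threshold.
assert completed_result["tool_call_accuracy_passed"] == (
    completed_result["tool_call_accuracy_score"]
    >= completed_result["tool_call_accuracy_threshold"]
)
```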

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty

Lines changed: 17 additions & 4 deletions
````diff
@@ -54,6 +54,16 @@ Evaluate based on these factors:
 
 **Tool Assessment**: Focus solely on appropriate use of available tools, not on capabilities beyond what tools can provide.
 
+## Status: Skipped
+Before performing any evaluation, check for the following conditions. If ANY are true, return `status: "skipped"` immediately without scoring:
+1. **No tool calls to evaluate**: The TOOL CALLS TO BE EVALUATED section is empty (tool calls appearing only in the CONVERSATION section do not count).
+2. **Missing tool definitions**: Any tool call in TOOL CALLS TO BE EVALUATED references a tool that is not present in the TOOL DEFINITIONS.
+
+When skipped, return:
+```json
+{"score": null, "reasoning": "<explain why evaluation was skipped>", "status": "skipped", "properties": null}
+```
+
 
 # Ratings
 ## [Tool Call Accuracy: 1] (Irrelevant)
@@ -139,10 +149,13 @@ TOOL DEFINITIONS: {{tool_definitions}}
 
 # Tasks
 ## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
-Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
-- chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'.
-- tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
-- details: a dictionary that contains the following keys:
+Your output should consist only of a JSON object that has the following keys:
+- score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. Set to null when status is "skipped".
+- status: a string indicating the evaluation status. Must be one of:
+  - "completed": tool calls were present, tool definitions were available, and evaluation was performed.
+  - "skipped": evaluation was not performed because there were no tool calls to evaluate, or tool definitions were missing for the tool calls. When skipped, set score to null and properties to null.
+- reasoning: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped.
+- properties: a dictionary that contains the following keys:
   - tool_calls_made_by_agent: total number of tool calls made by the agent
   - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
   - per_tool_call_details: a list of dictionaries, each containing:
````
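
As a sanity check on this contract, here is a small sketch that parses a model reply and branches on `status` the way the updated `_do_eval` does; the payload is fabricated for illustration:

```python
import json

# Fabricated reply following the "skipped" contract defined in the prompty.
raw = (
    '{"score": null, '
    '"reasoning": "Let\'s think step by step: there are no tool calls to evaluate.", '
    '"status": "skipped", "properties": null}'
)

llm_output = json.loads(raw)
if llm_output.get("status", "completed") == "skipped":
    # Short-circuit, as _do_eval does via _return_not_applicable_result.
    print("skipped:", llm_output.get("reasoning", ""))
else:
    score = int(llm_output["score"])  # an integer from 1 to 5 per the rating scale
    print("score:", score)
```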
