Rename tool_call_accuracy reasoning output to reason and update skipped properties handling (#46355)

Copilot · m7md7sien · web-flow · commit aa848fef1a4b · 2026-04-16T22:10:13.000+02:00
Agent-Logs-Url: https://github.com/Azure/azure-sdk-for-python/sessions/89b3b528-f2ac-4284-88fb-c484d4c0cce1 Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -456,8 +456,8 @@ def _return_not_applicable_result(
             f"{self._result_key}_score": None,
             f"{self._result_key}_result": "not_applicable",
             f"{self._result_key}_passed": None,
-            f"{self._result_key}_reasoning": f"Not applicable: {error_message}",
+            f"{self._result_key}_reason": f"Not applicable: {error_message}",
             f"{self._result_key}_status": "skipped",
             f"{self._result_key}_threshold": threshold,
-            f"{self._result_key}_properties": {},
+            f"{self._result_key}_properties": None,
         }
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -258,7 +258,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
             # Handle skipped status from LLM
             llm_status = llm_output.get("status", "completed")
             if llm_status == "skipped":
-                reason = llm_output.get("reasoning", "")
+                reason = llm_output.get("reason", "")
                 return self._return_not_applicable_result(reason, self.threshold)
 
             score = llm_output.get(self._LLM_SCORE_KEY, None)
@@ -276,10 +276,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 )
 
             # Format the output
-            reason = llm_output.get("reasoning", "")
+            reason = llm_output.get("reason", "")
             score = float(score)
             score_result = "pass" if score >= self.threshold else "fail"
-            llm_properties = llm_output.get("properties", {})
+            llm_properties = llm_output.get("properties", {}) or {}
             llm_properties.update(
                 {
                     "prompt_tokens": prompty_output_dict.get("input_token_count", 0),
@@ -295,7 +295,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 f"{self._result_key}_score": score,
                 f"{self._result_key}_result": score_result,
                 f"{self._result_key}_passed": score_result == "pass",
-                f"{self._result_key}_reasoning": reason,
+                f"{self._result_key}_reason": reason,
                 f"{self._result_key}_status": "completed",
                 f"{self._result_key}_threshold": self._threshold,
                 f"{self._result_key}_properties": llm_properties,
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty
@@ -61,7 +61,7 @@ Before performing any evaluation, check for the following conditions. If ANY are
 
 When skipped, return:
 ```json
-{"score": null, "reasoning": "<explain why evaluation was skipped>", "status": "skipped", "properties": null}
+{"reason": "<explain why evaluation was skipped>", "score": null, "status": "skipped", "properties": null}
 ```
 
 
@@ -150,11 +150,11 @@ TOOL DEFINITIONS: {{tool_definitions}}
 # Tasks
 ## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
 Your output should consist only of a JSON object that has the following keys:
+  - reason: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped.
   - score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. Set to null when status is "skipped".
   - status: a string indicating the evaluation status. Must be one of:
       - "completed": tool calls were present, tool definitions were available, and evaluation was performed.
       - "skipped": evaluation was not performed because there were no tool calls to evaluate, or tool definitions were missing for the tool calls. When skipped, set score to null and properties to null.
-  - reasoning: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped.
   - properties: a dictionary that contains the following keys:
         - tool_calls_made_by_agent: total number of tool calls made by the agent
         - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
@@ -176,4 +176,4 @@ Your output should consist only of a JSON object that has the following keys:
               - tool_name: name of the tool
               - missing_count: number of missing calls for this query
 
-# Output
+# Output
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py
@@ -70,7 +70,7 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
         assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_result"] == "not_applicable"
         assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_status"] == "skipped"
         assert (
-            "not applicable" in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reasoning"].lower()
+            "not applicable" in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"].lower()
             and ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
-            in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reasoning"]
+            in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]
         )
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py
@@ -48,7 +48,7 @@ async def flow_side_effect(timeout, **kwargs):
         # Return a non-numeric score to trigger an exception in the evaluator's check_score_is_valid
         return {
             "llm_output": {
-                "reasoning": "The tool calls were very correct that I returned a huge number!",
+                "reason": "The tool calls were very correct that I returned a huge number!",
                 "score": 25,
                 "properties": {},
             }
@@ -63,7 +63,7 @@ async def flow_side_effect(timeout, **kwargs):
 
     return {
         "llm_output": {
-            "reasoning": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
+            "reason": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
             "score": score,
             "properties": {
                 "tool_calls_made_by_agent": total_calls,
@@ -134,8 +134,8 @@ def test_evaluate_tools_valid1(self, mock_model_config):
         assert result[f"{key}_score"] == 3.0  # Mixed good/bad gets score 3
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        assert f"{key}_reasoning" in result
-        assert result[f"{key}_reasoning"] == "Evaluated 2 tool calls with 1 correct calls."
+        assert f"{key}_reason" in result
+        assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
         assert f"{key}_properties" in result
 
     def test_evaluate_tools_valid2(self, mock_model_config):
@@ -196,8 +196,8 @@ def test_evaluate_tools_valid2(self, mock_model_config):
         assert result[f"{key}_score"] == 1.0  # All bad gets score 1
         assert result[f"{key}_result"] == "fail"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        assert f"{key}_reasoning" in result
-        assert result[f"{key}_reasoning"] == "Evaluated 2 tool calls with 0 correct calls."
+        assert f"{key}_reason" in result
+        assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
         assert f"{key}_properties" in result
 
     def test_evaluate_tools_valid3(self, mock_model_config):
@@ -258,8 +258,8 @@ def test_evaluate_tools_valid3(self, mock_model_config):
         assert result[f"{key}_score"] == 5.0  # All good gets score 5
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        assert f"{key}_reasoning" in result
-        assert result[f"{key}_reasoning"] == "Evaluated 2 tool calls with 2 correct calls."
+        assert f"{key}_reason" in result
+        assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
         assert f"{key}_properties" in result
 
     def test_evaluate_tools_one_eval_fails(self, mock_model_config):
@@ -383,8 +383,8 @@ def test_evaluate_tools_built_in_tool_definition(self, mock_model_config):
         assert result[f"{key}_score"] == 5.0  # All good gets score 5
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        assert f"{key}_reasoning" in result
-        assert result[f"{key}_reasoning"] == "Evaluated 1 tool calls with 1 correct calls."
+        assert f"{key}_reason" in result
+        assert result[f"{key}_reason"] == "Evaluated 1 tool calls with 1 correct calls."
         assert f"{key}_properties" in result
 
     def test_evaluate_tools_no_tools(self, mock_model_config):