Fix test assertions for new unified output format and apply black formatting (#46336)

Copilot · m7md7sien · web-flow · commit f454ee31e224 · 2026-04-16T00:11:16.000+02:00
Agent-Logs-Url: https://github.com/Azure/azure-sdk-for-python/sessions/23f40ca5-7114-46ec-89be-a369e38ac971
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -232,7 +232,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         if _is_intermediate_response(eval_input.get("response")):
             return self._return_not_applicable_result(
                 "Intermediate response. Please provide the agent's final response for evaluation.",
-                self.threshold
+                self.threshold,
             )
 
         # Preprocess messages if they are lists
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py
@@ -66,11 +66,11 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
                 }
             ],
         )
+        assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_score"] is None
+        assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_result"] == "not_applicable"
+        assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_status"] == "skipped"
         assert (
-            result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        )
-        assert (
-            "not applicable" in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"].lower()
+            "not applicable" in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reasoning"].lower()
             and ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
-            in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]
+            in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reasoning"]
         )

Original file line number	Diff line number	Diff line change
`@@ -232,7 +232,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t`
`232`	`232`	`if _is_intermediate_response(eval_input.get("response")):`
`233`	`233`	`return self._return_not_applicable_result(`
`234`	`234`	`"Intermediate response. Please provide the agent's final response for evaluation.",`
`235`		`- self.threshold`
	`235`	`+ self.threshold,`
`236`	`236`	`)`
`237`	`237`
`238`	`238`	`# Preprocess messages if they are lists`
Original file line number	Diff line number	Diff line change
`@@ -66,11 +66,11 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):`
`66`	`66`	`}`
`67`	`67`	`],`
`68`	`68`	`)`
	`69`	`+ assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_score"] is None`
	`70`	`+ assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_result"] == "not_applicable"`
	`71`	`+ assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_status"] == "skipped"`
`69`	`72`	`assert (`
`70`		`- result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE`
`71`		`- )`
`72`		`- assert (`
`73`		`- "not applicable" in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"].lower()`
	`73`	`+ "not applicable" in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reasoning"].lower()`
`74`	`74`	`and ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE`
`75`		`- in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]`
	`75`	`+ in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reasoning"]`
`76`	`76`	`)`