Normalize parameter types in TaskNavigationEfficiency comparison (#46227)

Copilot · m7md7sien · web-flow · commit f50234508e59 · 2026-05-15T06:03:32.000+03:00
* Normalize parameter types in TaskNavigationEfficiency comparison Port fix from Azure/azureml-assets#4901. Adds _normalize_param_value static method for consistent string comparison of parameter values (int, float, bool, dict, list) between agent and ground truth. Updates _extract_tool_names_and_params_from_response to preserve original value types instead of premature str() conversion. Agent-Logs-Url: https://github.com/Azure/azure-sdk-for-python/sessions/4888d4d1-bd21-46b6-a733-231b3ffefddd Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> * Fix black formatting: collapse expressions that fit within 120 char line-length (#46232) Agent-Logs-Url: https://github.com/Azure/azure-sdk-for-python/sessions/1c88d810-2e80-47a9-ad09-40adb6529219 Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> * Fix black issue --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> Co-authored-by: mohessie <mohessie@microsoft.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -539,13 +539,13 @@ def _parse_tools_from_response(self, response):
 
         return tool_calls
 
-    def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
+    def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, Any]]]:
         """Extract tool names and parameters from the response.
 
         :param response: The response to parse.
         :type response: Union[str, List[dict]]
         :return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
-        :rtype: List[Tuple[str, Dict[str, str]]]
+        :rtype: List[Tuple[str, Dict[str, Any]]]
         """
         tool_calls = self._parse_tools_from_response(response)
         tool_name_param_pairs = []
@@ -580,14 +580,13 @@ def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[s
             if "arguments" in tool_call:
                 args = tool_call["arguments"]
                 if isinstance(args, dict):
-                    # Convert all values to strings for consistent comparison
-                    parameters = {str(k): str(v) for k, v in args.items()}
+                    parameters = {str(k): v for k, v in args.items()}
                 elif isinstance(args, str):
                     # If arguments is a string, try to parse it as JSON
                     try:
                         parsed_args = json.loads(args)
                         if isinstance(parsed_args, dict):
-                            parameters = {str(k): str(v) for k, v in parsed_args.items()}
+                            parameters = {str(k): v for k, v in parsed_args.items()}
                     except json.JSONDecodeError:
                         raise EvaluationException(
                             "Failed to parse tool call arguments as JSON.",
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
@@ -151,6 +151,23 @@ async def _real_call(self, **kwargs):
         self._validator.validate_eval_input(kwargs)
         return await super()._real_call(**kwargs)
 
+    @staticmethod
+    def _normalize_param_value(value: Any) -> str:
+        """Normalize a parameter value to a string for consistent comparison.
+
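+        Strings are returned unchanged; dicts and lists go through json.dumps(sort_keys=True) (str() if that fails);
+        every other type uses str(), so a top-level bool yields 'True' rather than JSON 'true'. Applying this to both
+        agent and ground truth values ensures they are compared in the same string format.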
+        """
+        if isinstance(value, str):
+            return value
+        if isinstance(value, (dict, list)):
+            try:
+                return json.dumps(value, sort_keys=True)
+            except (TypeError, ValueError):
+                return str(value)
+        return str(value)
+
     def _prepare_steps_for_comparison(
         self,
         agent_tool_pairs: List[Tuple[str, Dict[str, Any]]],
@@ -165,10 +182,22 @@ def _prepare_steps_for_comparison(
         agent_steps: List[Union[str, Tuple[str, Tuple]]] = []
         ground_truth_steps: List[Union[str, Tuple[str, Tuple]]] = []
         if use_parameter_matching:
-            # When parameter matching is enabled, we need to match both tool name and parameters
-            agent_steps = [(pair[0], tuple(sorted(pair[1].items()))) for pair in agent_tool_pairs]
+            # When parameter matching is enabled, we need to match both tool name and parameters.
+            # Normalize all parameter values to strings on both sides for consistent comparison.
+            agent_steps = [
+                (pair[0], tuple(sorted((k, self._normalize_param_value(v)) for k, v in pair[1].items())))
+                for pair in agent_tool_pairs
+            ]
             ground_truth_steps = [
-                (name, tuple(sorted(ground_truth_params.get(name, {}).items()))) for name in ground_truth
+                (
+                    name,
+                    tuple(
+                        sorted(
+                            (k, self._normalize_param_value(v)) for k, v in ground_truth_params.get(name, {}).items()
+                        )
+                    ),
+                )
+                for name in ground_truth
             ]
         else:
             # When parameter matching is disabled, only compare tool names
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py
@@ -167,6 +167,194 @@ def test_invalid_ground_truth(self):
         with pytest.raises(EvaluationException):
             evaluator(response=[], ground_truth=[])
 
+    # ==================== PARAMETER TYPE NORMALIZATION TESTS ====================
+
+    @staticmethod
+    def _make_action(name: str, arguments) -> dict:
+        """Create an assistant action with a tool call."""
+        return {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "tool_call",
+                    "tool_call_id": f"call_{name}",
+                    "name": name,
+                    "arguments": arguments,
+                }
+            ],
+        }
+
+    def test_param_int_agent_vs_int_ground_truth(self):
+        """Test that int param values match when both sides are int."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[self._make_action("search", {"count": 1, "query": "weather"})],
+            ground_truth=(
+                ["search"],
+                {"search": {"count": 1, "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_int_agent_vs_str_ground_truth(self):
+        """Test that int agent param matches str ground truth ('1' == '1')."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[self._make_action("search", {"count": 1, "query": "weather"})],
+            ground_truth=(
+                ["search"],
+                {"search": {"count": "1", "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_str_agent_vs_int_ground_truth(self):
+        """Test that str agent param matches int ground truth ('1' == '1')."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[self._make_action("search", {"count": "1", "query": "weather"})],
+            ground_truth=(
+                ["search"],
+                {"search": {"count": 1, "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_bool_agent_vs_bool_ground_truth(self):
+        """Test that bool param values match when both sides are bool."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[self._make_action("search", {"verbose": True, "query": "weather"})],
+            ground_truth=(
+                ["search"],
+                {"search": {"verbose": True, "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_bool_agent_vs_str_ground_truth(self):
+        """Test that bool agent param matches str 'True' ground truth."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[self._make_action("search", {"verbose": True, "query": "weather"})],
+            ground_truth=(
+                ["search"],
+                {"search": {"verbose": "True", "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_dict_agent_vs_dict_ground_truth(self):
+        """Test that dict param values match when both sides are dict."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[self._make_action("search", {"filters": {"category": "news", "lang": "en"}, "query": "weather"})],
+            ground_truth=(
+                ["search"],
+                {"search": {"filters": {"category": "news", "lang": "en"}, "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_dict_agent_vs_json_str_ground_truth(self):
+        """Test that dict agent param matches JSON-stringified ground truth."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[self._make_action("search", {"filters": {"category": "news", "lang": "en"}, "query": "weather"})],
+            ground_truth=(
+                ["search"],
+                {"search": {"filters": '{"category": "news", "lang": "en"}', "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_json_str_agent_vs_dict_ground_truth(self):
+        """Test that JSON-stringified agent param matches dict ground truth."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[
+                self._make_action("search", {"filters": '{"category": "news", "lang": "en"}', "query": "weather"})
+            ],
+            ground_truth=(
+                ["search"],
+                {"search": {"filters": {"category": "news", "lang": "en"}, "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_list_agent_vs_list_ground_truth(self):
+        """Test that list param values match when both sides are list."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[self._make_action("search", {"tags": ["a", "b", "c"], "query": "weather"})],
+            ground_truth=(
+                ["search"],
+                {"search": {"tags": ["a", "b", "c"], "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_list_agent_vs_json_str_ground_truth(self):
+        """Test that list agent param matches JSON-stringified list ground truth."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[self._make_action("search", {"tags": ["a", "b", "c"], "query": "weather"})],
+            ground_truth=(
+                ["search"],
+                {"search": {"tags": '["a", "b", "c"]', "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_stringified_args_vs_dict_ground_truth(self):
+        """Test that stringified JSON arguments match dict ground truth values."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        response = [
+            {
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "tool_call",
+                        "tool_call_id": "call_1",
+                        "name": "search",
+                        "arguments": '{"count": 1, "query": "weather"}',
+                    }
+                ],
+            }
+        ]
+        result = evaluator(
+            response=response,
+            ground_truth=(
+                ["search"],
+                {"search": {"count": 1, "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_float_agent_vs_float_ground_truth(self):
+        """Test that float param values match when both sides are float."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[self._make_action("search", {"threshold": 0.5, "query": "weather"})],
+            ground_truth=(
+                ["search"],
+                {"search": {"threshold": 0.5, "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    def test_param_float_agent_vs_str_ground_truth(self):
+        """Test that float agent param matches str ground truth ('0.5' == '0.5')."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+        result = evaluator(
+            response=[self._make_action("search", {"threshold": 0.5, "query": "weather"})],
+            ground_truth=(
+                ["search"],
+                {"search": {"threshold": "0.5", "query": "weather"}},
+            ),
+        )
+        assert result["task_navigation_efficiency_result"] == "pass"
+
     def test_tuple_format_with_parameters(self):
         """Test tuple format with exact parameter matching."""
         evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)