Skip to content

Commit f502345

Browse files
Copilot and m7md7sien authored
Normalize parameter types in TaskNavigationEfficiency comparison (#46227)
* Normalize parameter types in TaskNavigationEfficiency comparison

  Port fix from Azure/azureml-assets#4901. Adds _normalize_param_value static method for consistent string comparison of parameter values (int, float, bool, dict, list) between agent and ground truth. Updates _extract_tool_names_and_params_from_response to preserve original value types instead of premature str() conversion.

  Agent-Logs-Url: https://github.com/Azure/azure-sdk-for-python/sessions/4888d4d1-bd21-46b6-a733-231b3ffefddd
  Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>

* Fix black formatting: collapse expressions that fit within 120 char line-length (#46232)

  Agent-Logs-Url: https://github.com/Azure/azure-sdk-for-python/sessions/1c88d810-2e80-47a9-ad09-40adb6529219
  Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
  Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>

* Fix black issue

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>
Co-authored-by: mohessie <mohessie@microsoft.com>
1 parent b06269f commit f502345

3 files changed

Lines changed: 224 additions & 8 deletions

File tree

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -539,13 +539,13 @@ def _parse_tools_from_response(self, response):
539539

540540
return tool_calls
541541

542-
def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
542+
def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, Any]]]:
543543
"""Extract tool names and parameters from the response.
544544
545545
:param response: The response to parse.
546546
:type response: Union[str, List[dict]]
547547
:return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
548-
:rtype: List[Tuple[str, Dict[str, str]]]
548+
:rtype: List[Tuple[str, Dict[str, Any]]]
549549
"""
550550
tool_calls = self._parse_tools_from_response(response)
551551
tool_name_param_pairs = []
@@ -580,14 +580,13 @@ def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[s
580580
if "arguments" in tool_call:
581581
args = tool_call["arguments"]
582582
if isinstance(args, dict):
583-
# Convert all values to strings for consistent comparison
584-
parameters = {str(k): str(v) for k, v in args.items()}
583+
parameters = {str(k): v for k, v in args.items()}
585584
elif isinstance(args, str):
586585
# If arguments is a string, try to parse it as JSON
587586
try:
588587
parsed_args = json.loads(args)
589588
if isinstance(parsed_args, dict):
590-
parameters = {str(k): str(v) for k, v in parsed_args.items()}
589+
parameters = {str(k): v for k, v in parsed_args.items()}
591590
except json.JSONDecodeError:
592591
raise EvaluationException(
593592
"Failed to parse tool call arguments as JSON.",

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,23 @@ async def _real_call(self, **kwargs):
151151
self._validator.validate_eval_input(kwargs)
152152
return await super()._real_call(**kwargs)
153153

154+
@staticmethod
155+
def _normalize_param_value(value: Any) -> str:
156+
"""Normalize a parameter value to a string for consistent comparison.
157+
158+
Uses json.dumps for dicts and lists to produce canonical JSON strings,
159+
and str() for other types. This ensures both agent and ground truth
160+
parameter values are compared in the same string format.
161+
"""
162+
if isinstance(value, str):
163+
return value
164+
if isinstance(value, (dict, list)):
165+
try:
166+
return json.dumps(value, sort_keys=True)
167+
except (TypeError, ValueError):
168+
return str(value)
169+
return str(value)
170+
154171
def _prepare_steps_for_comparison(
155172
self,
156173
agent_tool_pairs: List[Tuple[str, Dict[str, Any]]],
@@ -165,10 +182,22 @@ def _prepare_steps_for_comparison(
165182
agent_steps: List[Union[str, Tuple[str, Tuple]]] = []
166183
ground_truth_steps: List[Union[str, Tuple[str, Tuple]]] = []
167184
if use_parameter_matching:
168-
# When parameter matching is enabled, we need to match both tool name and parameters
169-
agent_steps = [(pair[0], tuple(sorted(pair[1].items()))) for pair in agent_tool_pairs]
185+
# When parameter matching is enabled, we need to match both tool name and parameters.
186+
# Normalize all parameter values to strings on both sides for consistent comparison.
187+
agent_steps = [
188+
(pair[0], tuple(sorted((k, self._normalize_param_value(v)) for k, v in pair[1].items())))
189+
for pair in agent_tool_pairs
190+
]
170191
ground_truth_steps = [
171-
(name, tuple(sorted(ground_truth_params.get(name, {}).items()))) for name in ground_truth
192+
(
193+
name,
194+
tuple(
195+
sorted(
196+
(k, self._normalize_param_value(v)) for k, v in ground_truth_params.get(name, {}).items()
197+
)
198+
),
199+
)
200+
for name in ground_truth
172201
]
173202
else:
174203
# When parameter matching is disabled, only compare tool names

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py

Lines changed: 188 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -167,6 +167,194 @@ def test_invalid_ground_truth(self):
167167
with pytest.raises(EvaluationException):
168168
evaluator(response=[], ground_truth=[])
169169

170+
# ==================== PARAMETER TYPE NORMALIZATION TESTS ====================
171+
172+
@staticmethod
173+
def _make_action(name: str, arguments) -> dict:
174+
"""Create an assistant action with a tool call."""
175+
return {
176+
"role": "assistant",
177+
"content": [
178+
{
179+
"type": "tool_call",
180+
"tool_call_id": f"call_{name}",
181+
"name": name,
182+
"arguments": arguments,
183+
}
184+
],
185+
}
186+
187+
def test_param_int_agent_vs_int_ground_truth(self):
188+
"""Test that int param values match when both sides are int."""
189+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
190+
result = evaluator(
191+
response=[self._make_action("search", {"count": 1, "query": "weather"})],
192+
ground_truth=(
193+
["search"],
194+
{"search": {"count": 1, "query": "weather"}},
195+
),
196+
)
197+
assert result["task_navigation_efficiency_result"] == "pass"
198+
199+
def test_param_int_agent_vs_str_ground_truth(self):
200+
"""Test that int agent param matches str ground truth ('1' == '1')."""
201+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
202+
result = evaluator(
203+
response=[self._make_action("search", {"count": 1, "query": "weather"})],
204+
ground_truth=(
205+
["search"],
206+
{"search": {"count": "1", "query": "weather"}},
207+
),
208+
)
209+
assert result["task_navigation_efficiency_result"] == "pass"
210+
211+
def test_param_str_agent_vs_int_ground_truth(self):
212+
"""Test that str agent param matches int ground truth ('1' == '1')."""
213+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
214+
result = evaluator(
215+
response=[self._make_action("search", {"count": "1", "query": "weather"})],
216+
ground_truth=(
217+
["search"],
218+
{"search": {"count": 1, "query": "weather"}},
219+
),
220+
)
221+
assert result["task_navigation_efficiency_result"] == "pass"
222+
223+
def test_param_bool_agent_vs_bool_ground_truth(self):
224+
"""Test that bool param values match when both sides are bool."""
225+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
226+
result = evaluator(
227+
response=[self._make_action("search", {"verbose": True, "query": "weather"})],
228+
ground_truth=(
229+
["search"],
230+
{"search": {"verbose": True, "query": "weather"}},
231+
),
232+
)
233+
assert result["task_navigation_efficiency_result"] == "pass"
234+
235+
def test_param_bool_agent_vs_str_ground_truth(self):
236+
"""Test that bool agent param matches str 'True' ground truth."""
237+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
238+
result = evaluator(
239+
response=[self._make_action("search", {"verbose": True, "query": "weather"})],
240+
ground_truth=(
241+
["search"],
242+
{"search": {"verbose": "True", "query": "weather"}},
243+
),
244+
)
245+
assert result["task_navigation_efficiency_result"] == "pass"
246+
247+
def test_param_dict_agent_vs_dict_ground_truth(self):
248+
"""Test that dict param values match when both sides are dict."""
249+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
250+
result = evaluator(
251+
response=[self._make_action("search", {"filters": {"category": "news", "lang": "en"}, "query": "weather"})],
252+
ground_truth=(
253+
["search"],
254+
{"search": {"filters": {"category": "news", "lang": "en"}, "query": "weather"}},
255+
),
256+
)
257+
assert result["task_navigation_efficiency_result"] == "pass"
258+
259+
def test_param_dict_agent_vs_json_str_ground_truth(self):
260+
"""Test that dict agent param matches JSON-stringified ground truth."""
261+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
262+
result = evaluator(
263+
response=[self._make_action("search", {"filters": {"category": "news", "lang": "en"}, "query": "weather"})],
264+
ground_truth=(
265+
["search"],
266+
{"search": {"filters": '{"category": "news", "lang": "en"}', "query": "weather"}},
267+
),
268+
)
269+
assert result["task_navigation_efficiency_result"] == "pass"
270+
271+
def test_param_json_str_agent_vs_dict_ground_truth(self):
272+
"""Test that JSON-stringified agent param matches dict ground truth."""
273+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
274+
result = evaluator(
275+
response=[
276+
self._make_action("search", {"filters": '{"category": "news", "lang": "en"}', "query": "weather"})
277+
],
278+
ground_truth=(
279+
["search"],
280+
{"search": {"filters": {"category": "news", "lang": "en"}, "query": "weather"}},
281+
),
282+
)
283+
assert result["task_navigation_efficiency_result"] == "pass"
284+
285+
def test_param_list_agent_vs_list_ground_truth(self):
286+
"""Test that list param values match when both sides are list."""
287+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
288+
result = evaluator(
289+
response=[self._make_action("search", {"tags": ["a", "b", "c"], "query": "weather"})],
290+
ground_truth=(
291+
["search"],
292+
{"search": {"tags": ["a", "b", "c"], "query": "weather"}},
293+
),
294+
)
295+
assert result["task_navigation_efficiency_result"] == "pass"
296+
297+
def test_param_list_agent_vs_json_str_ground_truth(self):
298+
"""Test that list agent param matches JSON-stringified list ground truth."""
299+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
300+
result = evaluator(
301+
response=[self._make_action("search", {"tags": ["a", "b", "c"], "query": "weather"})],
302+
ground_truth=(
303+
["search"],
304+
{"search": {"tags": '["a", "b", "c"]', "query": "weather"}},
305+
),
306+
)
307+
assert result["task_navigation_efficiency_result"] == "pass"
308+
309+
def test_param_stringified_args_vs_dict_ground_truth(self):
310+
"""Test that stringified JSON arguments match dict ground truth values."""
311+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
312+
response = [
313+
{
314+
"role": "assistant",
315+
"content": [
316+
{
317+
"type": "tool_call",
318+
"tool_call_id": "call_1",
319+
"name": "search",
320+
"arguments": '{"count": 1, "query": "weather"}',
321+
}
322+
],
323+
}
324+
]
325+
result = evaluator(
326+
response=response,
327+
ground_truth=(
328+
["search"],
329+
{"search": {"count": 1, "query": "weather"}},
330+
),
331+
)
332+
assert result["task_navigation_efficiency_result"] == "pass"
333+
334+
def test_param_float_agent_vs_float_ground_truth(self):
335+
"""Test that float param values match when both sides are float."""
336+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
337+
result = evaluator(
338+
response=[self._make_action("search", {"threshold": 0.5, "query": "weather"})],
339+
ground_truth=(
340+
["search"],
341+
{"search": {"threshold": 0.5, "query": "weather"}},
342+
),
343+
)
344+
assert result["task_navigation_efficiency_result"] == "pass"
345+
346+
def test_param_float_agent_vs_str_ground_truth(self):
347+
"""Test that float agent param matches str ground truth ('0.5' == '0.5')."""
348+
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
349+
result = evaluator(
350+
response=[self._make_action("search", {"threshold": 0.5, "query": "weather"})],
351+
ground_truth=(
352+
["search"],
353+
{"search": {"threshold": "0.5", "query": "weather"}},
354+
),
355+
)
356+
assert result["task_navigation_efficiency_result"] == "pass"
357+
170358
def test_tuple_format_with_parameters(self):
171359
"""Test tuple format with exact parameter matching."""
172360
evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)

0 commit comments

Comments
 (0)