@@ -167,6 +167,194 @@ def test_invalid_ground_truth(self):
167167 with pytest .raises (EvaluationException ):
168168 evaluator (response = [], ground_truth = [])
169169
170+ # ==================== PARAMETER TYPE NORMALIZATION TESTS ====================
171+
@staticmethod
def _make_action(name: str, arguments) -> dict:
    """Build an assistant message that holds exactly one tool call.

    :param name: Tool name; also used to derive the ``tool_call_id``.
    :param arguments: Payload stored under ``"arguments"`` unchanged
        (the tests pass either a dict or a JSON string here).
    :return: A message dict with role ``"assistant"`` and a single
        ``tool_call`` content item.
    """
    tool_call = {
        "type": "tool_call",
        "tool_call_id": f"call_{name}",
        "name": name,
        "arguments": arguments,
    }
    return {"role": "assistant", "content": [tool_call]}
186+
def test_param_int_agent_vs_int_ground_truth(self):
    """Expect a pass when ``count`` is an int on both the agent and ground-truth side."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # The agent makes one "search" call with an integer ``count``.
    agent_steps = [self._make_action("search", {"count": 1, "query": "weather"})]
    # Ground truth: expected tool sequence plus expected parameters keyed by tool name.
    expected = (["search"], {"search": {"count": 1, "query": "weather"}})

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
198+
def test_param_int_agent_vs_str_ground_truth(self):
    """Expect a pass when the agent sends int ``1`` and the ground truth holds the string ``"1"``."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Agent side: integer value.
    agent_steps = [self._make_action("search", {"count": 1, "query": "weather"})]
    # Ground-truth side: the same value written as a string.
    expected = (["search"], {"search": {"count": "1", "query": "weather"}})

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
210+
def test_param_str_agent_vs_int_ground_truth(self):
    """Expect a pass when the agent sends the string ``"1"`` and the ground truth holds int ``1``."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Agent side: the value arrives as a string.
    agent_steps = [self._make_action("search", {"count": "1", "query": "weather"})]
    # Ground-truth side: the same value as an integer.
    expected = (["search"], {"search": {"count": 1, "query": "weather"}})

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
222+
def test_param_bool_agent_vs_bool_ground_truth(self):
    """Expect a pass when ``verbose`` is the bool ``True`` on both sides."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Both sides carry a real boolean for ``verbose``.
    agent_steps = [self._make_action("search", {"verbose": True, "query": "weather"})]
    expected = (["search"], {"search": {"verbose": True, "query": "weather"}})

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
234+
def test_param_bool_agent_vs_str_ground_truth(self):
    """Expect a pass when the agent sends bool ``True`` and the ground truth holds the string ``"True"``."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Agent side: a real boolean.
    agent_steps = [self._make_action("search", {"verbose": True, "query": "weather"})]
    # Ground-truth side: the capitalised string form "True".
    expected = (["search"], {"search": {"verbose": "True", "query": "weather"}})

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
246+
def test_param_dict_agent_vs_dict_ground_truth(self):
    """Expect a pass when the nested ``filters`` parameter is a dict on both sides."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Separate literals are used for each side so the two dicts are distinct objects.
    agent_args = {"filters": {"category": "news", "lang": "en"}, "query": "weather"}
    agent_steps = [self._make_action("search", agent_args)]
    expected = (
        ["search"],
        {"search": {"filters": {"category": "news", "lang": "en"}, "query": "weather"}},
    )

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
258+
def test_param_dict_agent_vs_json_str_ground_truth(self):
    """Expect a pass when the agent sends a dict and the ground truth holds its JSON string form."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Agent side: ``filters`` is a real dict.
    agent_args = {"filters": {"category": "news", "lang": "en"}, "query": "weather"}
    agent_steps = [self._make_action("search", agent_args)]
    # Ground-truth side: ``filters`` is the JSON text of that dict.
    expected = (
        ["search"],
        {"search": {"filters": '{"category": "news", "lang": "en"}', "query": "weather"}},
    )

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
270+
def test_param_json_str_agent_vs_dict_ground_truth(self):
    """Expect a pass when the agent sends a JSON string and the ground truth holds the equivalent dict."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Agent side: ``filters`` arrives as JSON text.
    agent_args = {"filters": '{"category": "news", "lang": "en"}', "query": "weather"}
    agent_steps = [self._make_action("search", agent_args)]
    # Ground-truth side: ``filters`` is a real dict.
    expected = (
        ["search"],
        {"search": {"filters": {"category": "news", "lang": "en"}, "query": "weather"}},
    )

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
284+
def test_param_list_agent_vs_list_ground_truth(self):
    """Expect a pass when ``tags`` is the same list on both sides."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Both sides carry a real list, in the same element order.
    agent_steps = [self._make_action("search", {"tags": ["a", "b", "c"], "query": "weather"})]
    expected = (["search"], {"search": {"tags": ["a", "b", "c"], "query": "weather"}})

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
296+
def test_param_list_agent_vs_json_str_ground_truth(self):
    """Expect a pass when the agent sends a list and the ground truth holds its JSON string form."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Agent side: ``tags`` is a real list.
    agent_steps = [self._make_action("search", {"tags": ["a", "b", "c"], "query": "weather"})]
    # Ground-truth side: ``tags`` is the JSON text of that list.
    expected = (["search"], {"search": {"tags": '["a", "b", "c"]', "query": "weather"}})

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
308+
def test_param_stringified_args_vs_dict_ground_truth(self):
    """Expect a pass when the whole ``arguments`` payload is a JSON string and the ground truth is a dict."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Here the entire ``arguments`` field is JSON text, not just one parameter value.
    tool_call = {
        "type": "tool_call",
        "tool_call_id": "call_1",
        "name": "search",
        "arguments": '{"count": 1, "query": "weather"}',
    }
    agent_steps = [{"role": "assistant", "content": [tool_call]}]
    expected = (["search"], {"search": {"count": 1, "query": "weather"}})

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
333+
def test_param_float_agent_vs_float_ground_truth(self):
    """Expect a pass when ``threshold`` is the float ``0.5`` on both sides."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Both sides carry the same float literal.
    agent_steps = [self._make_action("search", {"threshold": 0.5, "query": "weather"})]
    expected = (["search"], {"search": {"threshold": 0.5, "query": "weather"}})

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
345+
def test_param_float_agent_vs_str_ground_truth(self):
    """Expect a pass when the agent sends float ``0.5`` and the ground truth holds the string ``"0.5"``."""
    exact_mode = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
    evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=exact_mode)

    # Agent side: a real float.
    agent_steps = [self._make_action("search", {"threshold": 0.5, "query": "weather"})]
    # Ground-truth side: the same value written as a string.
    expected = (["search"], {"search": {"threshold": "0.5", "query": "weather"}})

    outcome = evaluator(response=agent_steps, ground_truth=expected)
    assert outcome["task_navigation_efficiency_result"] == "pass"
357+
170358 def test_tuple_format_with_parameters (self ):
171359 """Test tuple format with exact parameter matching."""
172360 evaluator = _TaskNavigationEfficiencyEvaluator (matching_mode = _TaskNavigationEfficiencyMatchingMode .EXACT_MATCH )
0 commit comments