@@ -29,11 +29,11 @@ def test_exact_match_scenario(self):
2929 ground_truth = ["search" , "analyze" , "report" ]
3030
3131 result = evaluator (response = response , ground_truth = ground_truth )
32- assert result ["task_navigation_efficiency_result " ] == "pass"
33- assert "task_navigation_efficiency_details " in result
34- assert result ["task_navigation_efficiency_details " ]["precision_score" ] == 1.0
35- assert result ["task_navigation_efficiency_details " ]["recall_score" ] == 1.0
36- assert result ["task_navigation_efficiency_details " ]["f1_score" ] == 1.0
32+ assert result ["task_navigation_efficiency_passed " ] is True
33+ assert "task_navigation_efficiency_properties " in result
34+ assert result ["task_navigation_efficiency_properties " ]["precision_score" ] == 1.0
35+ assert result ["task_navigation_efficiency_properties " ]["recall_score" ] == 1.0
36+ assert result ["task_navigation_efficiency_properties " ]["f1_score" ] == 1.0
3737
3838 def test_in_order_match_with_extra_steps (self ):
3939 """Test when agent has extra steps but maintains order."""
@@ -62,10 +62,10 @@ def test_in_order_match_with_extra_steps(self):
6262 ground_truth = ["search" , "analyze" , "report" ]
6363
6464 result = evaluator (response = response , ground_truth = ground_truth )
65- assert result ["task_navigation_efficiency_result " ] == "pass"
66- assert result ["task_navigation_efficiency_details " ]["precision_score" ] == 0.75 # 3/4
67- assert result ["task_navigation_efficiency_details " ]["recall_score" ] == 1.0 # 3/3
68- assert result ["task_navigation_efficiency_details " ]["f1_score" ] == pytest .approx (0.857 , rel = 1e-2 )
65+ assert result ["task_navigation_efficiency_passed " ] is True
66+ assert result ["task_navigation_efficiency_properties " ]["precision_score" ] == 0.75 # 3/4
67+ assert result ["task_navigation_efficiency_properties " ]["recall_score" ] == 1.0 # 3/3
68+ assert result ["task_navigation_efficiency_properties " ]["f1_score" ] == pytest .approx (0.857 , rel = 1e-2 )
6969
7070 def test_any_order_match (self ):
7171 """Test when agent has all steps but in wrong order."""
@@ -90,10 +90,10 @@ def test_any_order_match(self):
9090 ground_truth = ["search" , "analyze" , "report" ]
9191
9292 result = evaluator (response = response , ground_truth = ground_truth )
93- assert result ["task_navigation_efficiency_result " ] == "pass"
94- assert result ["task_navigation_efficiency_details " ]["precision_score" ] == 1.0
95- assert result ["task_navigation_efficiency_details " ]["recall_score" ] == 1.0
96- assert result ["task_navigation_efficiency_details " ]["f1_score" ] == 1.0
93+ assert result ["task_navigation_efficiency_passed " ] is True
94+ assert result ["task_navigation_efficiency_properties " ]["precision_score" ] == 1.0
95+ assert result ["task_navigation_efficiency_properties " ]["recall_score" ] == 1.0
96+ assert result ["task_navigation_efficiency_properties " ]["f1_score" ] == 1.0
9797
9898 def test_exact_match_failure (self ):
9999 """Test when exact match fails but other matches succeed."""
@@ -121,10 +121,10 @@ def test_exact_match_failure(self):
121121 ground_truth = ["search" , "analyze" ]
122122
123123 exact_result = exact_evaluator (response = response , ground_truth = ground_truth )
124- assert exact_result ["task_navigation_efficiency_result " ] == "fail"
124+ assert exact_result ["task_navigation_efficiency_passed " ] is False
125125
126126 in_order_result = in_order_evaluator (response = response , ground_truth = ground_truth )
127- assert in_order_result ["task_navigation_efficiency_result " ] == "pass"
127+ assert in_order_result ["task_navigation_efficiency_passed " ] is True
128128
129129 def test_invalid_ground_truth (self ):
130130 """Test with invalid ground truth steps."""
@@ -161,10 +161,10 @@ def test_tuple_format_with_parameters(self):
161161 )
162162
163163 result = evaluator (response = response , ground_truth = ground_truth )
164- assert result ["task_navigation_efficiency_result " ] == "pass"
165- assert result ["task_navigation_efficiency_details " ]["precision_score" ] == 1.0
166- assert result ["task_navigation_efficiency_details " ]["recall_score" ] == 1.0
167- assert result ["task_navigation_efficiency_details " ]["f1_score" ] == 1.0
164+ assert result ["task_navigation_efficiency_passed " ] is True
165+ assert result ["task_navigation_efficiency_properties " ]["precision_score" ] == 1.0
166+ assert result ["task_navigation_efficiency_properties " ]["recall_score" ] == 1.0
167+ assert result ["task_navigation_efficiency_properties " ]["f1_score" ] == 1.0
168168
169169 def test_matching_mode_validation (self ):
170170 """Test validation of matching_mode parameter."""
0 commit comments