Azure · m7md7sien · Apr 22, 2026 · Apr 22, 2026
diff --git a/...luation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/...luation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py
@@ -29,11 +29,11 @@ def test_exact_match_scenario(self):
         ground_truth = ["search", "analyze", "report"]
 
         result = evaluator(response=response, ground_truth=ground_truth)
-        assert result["task_navigation_efficiency_result"] == "pass"
-        assert "task_navigation_efficiency_details" in result
-        assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
+        assert result["task_navigation_efficiency_passed"] is True
+        assert "task_navigation_efficiency_properties" in result
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
 
     def test_in_order_match_with_extra_steps(self):
         """Test when agent has extra steps but maintains order."""
@@ -62,10 +62,10 @@ def test_in_order_match_with_extra_steps(self):
         ground_truth = ["search", "analyze", "report"]
 
         result = evaluator(response=response, ground_truth=ground_truth)
-        assert result["task_navigation_efficiency_result"] == "pass"
-        assert result["task_navigation_efficiency_details"]["precision_score"] == 0.75  # 3/4
-        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0  # 3/3
-        assert result["task_navigation_efficiency_details"]["f1_score"] == pytest.approx(0.857, rel=1e-2)
+        assert result["task_navigation_efficiency_passed"] is True
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 0.75  # 3/4
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0  # 3/3
+        assert result["task_navigation_efficiency_properties"]["f1_score"] == pytest.approx(0.857, rel=1e-2)
 
     def test_any_order_match(self):
         """Test when agent has all steps but in wrong order."""
@@ -90,10 +90,10 @@ def test_any_order_match(self):
         ground_truth = ["search", "analyze", "report"]
 
         result = evaluator(response=response, ground_truth=ground_truth)
-        assert result["task_navigation_efficiency_result"] == "pass"
-        assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
+        assert result["task_navigation_efficiency_passed"] is True
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
 
     def test_exact_match_failure(self):
         """Test when exact match fails but other matches succeed."""
@@ -121,10 +121,10 @@ def test_exact_match_failure(self):
         ground_truth = ["search", "analyze"]
 
         exact_result = exact_evaluator(response=response, ground_truth=ground_truth)
-        assert exact_result["task_navigation_efficiency_result"] == "fail"
+        assert exact_result["task_navigation_efficiency_passed"] is False
 
         in_order_result = in_order_evaluator(response=response, ground_truth=ground_truth)
-        assert in_order_result["task_navigation_efficiency_result"] == "pass"
+        assert in_order_result["task_navigation_efficiency_passed"] is True
 
     def test_invalid_ground_truth(self):
         """Test with invalid ground truth steps."""
@@ -161,10 +161,10 @@ def test_tuple_format_with_parameters(self):
         )
 
         result = evaluator(response=response, ground_truth=ground_truth)
-        assert result["task_navigation_efficiency_result"] == "pass"
-        assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
+        assert result["task_navigation_efficiency_passed"] is True
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
 
     def test_matching_mode_validation(self):
         """Test validation of matching_mode parameter."""