Update task_navigation_efficiency unit tests to new output schema (#46475)

Copilot · m7md7sien · web-flow · commit 92cb5ab82df4 · 2026-04-23T00:09:51.000+02:00
Agent-Logs-Url: https://github.com/Azure/azure-sdk-for-python/sessions/af7ce3e2-8f3b-425a-85fd-d9dcfd79552d
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py
@@ -29,11 +29,11 @@ def test_exact_match_scenario(self):
         ground_truth = ["search", "analyze", "report"]
 
         result = evaluator(response=response, ground_truth=ground_truth)
-        assert result["task_navigation_efficiency_result"] == "pass"
-        assert "task_navigation_efficiency_details" in result
-        assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
+        assert result["task_navigation_efficiency_passed"] is True
+        assert "task_navigation_efficiency_properties" in result
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
 
     def test_in_order_match_with_extra_steps(self):
         """Test when agent has extra steps but maintains order."""
@@ -62,10 +62,10 @@ def test_in_order_match_with_extra_steps(self):
         ground_truth = ["search", "analyze", "report"]
 
         result = evaluator(response=response, ground_truth=ground_truth)
-        assert result["task_navigation_efficiency_result"] == "pass"
-        assert result["task_navigation_efficiency_details"]["precision_score"] == 0.75  # 3/4
-        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0  # 3/3
-        assert result["task_navigation_efficiency_details"]["f1_score"] == pytest.approx(0.857, rel=1e-2)
+        assert result["task_navigation_efficiency_passed"] is True
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 0.75  # 3/4
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0  # 3/3
+        assert result["task_navigation_efficiency_properties"]["f1_score"] == pytest.approx(0.857, rel=1e-2)
 
     def test_any_order_match(self):
         """Test when agent has all steps but in wrong order."""
@@ -90,10 +90,10 @@ def test_any_order_match(self):
         ground_truth = ["search", "analyze", "report"]
 
         result = evaluator(response=response, ground_truth=ground_truth)
-        assert result["task_navigation_efficiency_result"] == "pass"
-        assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
+        assert result["task_navigation_efficiency_passed"] is True
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
 
     def test_exact_match_failure(self):
         """Test when exact match fails but other matches succeed."""
@@ -121,10 +121,10 @@ def test_exact_match_failure(self):
         ground_truth = ["search", "analyze"]
 
         exact_result = exact_evaluator(response=response, ground_truth=ground_truth)
-        assert exact_result["task_navigation_efficiency_result"] == "fail"
+        assert exact_result["task_navigation_efficiency_passed"] is False
 
         in_order_result = in_order_evaluator(response=response, ground_truth=ground_truth)
-        assert in_order_result["task_navigation_efficiency_result"] == "pass"
+        assert in_order_result["task_navigation_efficiency_passed"] is True
 
     def test_invalid_ground_truth(self):
         """Test with invalid ground truth steps."""
@@ -161,10 +161,10 @@ def test_tuple_format_with_parameters(self):
         )
 
         result = evaluator(response=response, ground_truth=ground_truth)
-        assert result["task_navigation_efficiency_result"] == "pass"
-        assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
-        assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
+        assert result["task_navigation_efficiency_passed"] is True
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
 
     def test_matching_mode_validation(self):
         """Test validation of matching_mode parameter."""