Skip to content

Commit 92cb5ab

Browse files
Copilot and m7md7sien authored
Update task_navigation_efficiency unit tests to new output schema (#46475)
Agent-Logs-Url: https://github.com/Azure/azure-sdk-for-python/sessions/af7ce3e2-8f3b-425a-85fd-d9dcfd79552d
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>
1 parent f5677df commit 92cb5ab

1 file changed

Lines changed: 19 additions & 19 deletions

File tree

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -29,11 +29,11 @@ def test_exact_match_scenario(self):
2929
ground_truth = ["search", "analyze", "report"]
3030

3131
result = evaluator(response=response, ground_truth=ground_truth)
32-
assert result["task_navigation_efficiency_result"] == "pass"
33-
assert "task_navigation_efficiency_details" in result
34-
assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
35-
assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
36-
assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
32+
assert result["task_navigation_efficiency_passed"] is True
33+
assert "task_navigation_efficiency_properties" in result
34+
assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
35+
assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
36+
assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
3737

3838
def test_in_order_match_with_extra_steps(self):
3939
"""Test when agent has extra steps but maintains order."""
@@ -62,10 +62,10 @@ def test_in_order_match_with_extra_steps(self):
6262
ground_truth = ["search", "analyze", "report"]
6363

6464
result = evaluator(response=response, ground_truth=ground_truth)
65-
assert result["task_navigation_efficiency_result"] == "pass"
66-
assert result["task_navigation_efficiency_details"]["precision_score"] == 0.75 # 3/4
67-
assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0 # 3/3
68-
assert result["task_navigation_efficiency_details"]["f1_score"] == pytest.approx(0.857, rel=1e-2)
65+
assert result["task_navigation_efficiency_passed"] is True
66+
assert result["task_navigation_efficiency_properties"]["precision_score"] == 0.75 # 3/4
67+
assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 # 3/3
68+
assert result["task_navigation_efficiency_properties"]["f1_score"] == pytest.approx(0.857, rel=1e-2)
6969

7070
def test_any_order_match(self):
7171
"""Test when agent has all steps but in wrong order."""
@@ -90,10 +90,10 @@ def test_any_order_match(self):
9090
ground_truth = ["search", "analyze", "report"]
9191

9292
result = evaluator(response=response, ground_truth=ground_truth)
93-
assert result["task_navigation_efficiency_result"] == "pass"
94-
assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
95-
assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
96-
assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
93+
assert result["task_navigation_efficiency_passed"] is True
94+
assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
95+
assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
96+
assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
9797

9898
def test_exact_match_failure(self):
9999
"""Test when exact match fails but other matches succeed."""
@@ -121,10 +121,10 @@ def test_exact_match_failure(self):
121121
ground_truth = ["search", "analyze"]
122122

123123
exact_result = exact_evaluator(response=response, ground_truth=ground_truth)
124-
assert exact_result["task_navigation_efficiency_result"] == "fail"
124+
assert exact_result["task_navigation_efficiency_passed"] is False
125125

126126
in_order_result = in_order_evaluator(response=response, ground_truth=ground_truth)
127-
assert in_order_result["task_navigation_efficiency_result"] == "pass"
127+
assert in_order_result["task_navigation_efficiency_passed"] is True
128128

129129
def test_invalid_ground_truth(self):
130130
"""Test with invalid ground truth steps."""
@@ -161,10 +161,10 @@ def test_tuple_format_with_parameters(self):
161161
)
162162

163163
result = evaluator(response=response, ground_truth=ground_truth)
164-
assert result["task_navigation_efficiency_result"] == "pass"
165-
assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
166-
assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
167-
assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
164+
assert result["task_navigation_efficiency_passed"] is True
165+
assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
166+
assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
167+
assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
168168

169169
def test_matching_mode_validation(self):
170170
"""Test validation of matching_mode parameter."""

0 commit comments

Comments
 (0)