Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ def test_exact_match_scenario(self):
ground_truth = ["search", "analyze", "report"]

result = evaluator(response=response, ground_truth=ground_truth)
- assert result["task_navigation_efficiency_result"] == "pass"
- assert "task_navigation_efficiency_details" in result
- assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
- assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
- assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
+ assert result["task_navigation_efficiency_passed"] is True
+ assert "task_navigation_efficiency_properties" in result
+ assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+ assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+ assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0

def test_in_order_match_with_extra_steps(self):
"""Test when agent has extra steps but maintains order."""
Expand Down Expand Up @@ -62,10 +62,10 @@ def test_in_order_match_with_extra_steps(self):
ground_truth = ["search", "analyze", "report"]

result = evaluator(response=response, ground_truth=ground_truth)
- assert result["task_navigation_efficiency_result"] == "pass"
- assert result["task_navigation_efficiency_details"]["precision_score"] == 0.75 # 3/4
- assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0 # 3/3
- assert result["task_navigation_efficiency_details"]["f1_score"] == pytest.approx(0.857, rel=1e-2)
+ assert result["task_navigation_efficiency_passed"] is True
+ assert result["task_navigation_efficiency_properties"]["precision_score"] == 0.75 # 3/4
+ assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 # 3/3
+ assert result["task_navigation_efficiency_properties"]["f1_score"] == pytest.approx(0.857, rel=1e-2)

def test_any_order_match(self):
"""Test when agent has all steps but in wrong order."""
Expand All @@ -90,10 +90,10 @@ def test_any_order_match(self):
ground_truth = ["search", "analyze", "report"]

result = evaluator(response=response, ground_truth=ground_truth)
- assert result["task_navigation_efficiency_result"] == "pass"
- assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
- assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
- assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
+ assert result["task_navigation_efficiency_passed"] is True
+ assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+ assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+ assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0

def test_exact_match_failure(self):
"""Test when exact match fails but other matches succeed."""
Expand Down Expand Up @@ -121,10 +121,10 @@ def test_exact_match_failure(self):
ground_truth = ["search", "analyze"]

exact_result = exact_evaluator(response=response, ground_truth=ground_truth)
- assert exact_result["task_navigation_efficiency_result"] == "fail"
+ assert exact_result["task_navigation_efficiency_passed"] is False

in_order_result = in_order_evaluator(response=response, ground_truth=ground_truth)
- assert in_order_result["task_navigation_efficiency_result"] == "pass"
+ assert in_order_result["task_navigation_efficiency_passed"] is True

def test_invalid_ground_truth(self):
"""Test with invalid ground truth steps."""
Expand Down Expand Up @@ -161,10 +161,10 @@ def test_tuple_format_with_parameters(self):
)

result = evaluator(response=response, ground_truth=ground_truth)
- assert result["task_navigation_efficiency_result"] == "pass"
- assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
- assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
- assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
+ assert result["task_navigation_efficiency_passed"] is True
+ assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+ assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+ assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0

def test_matching_mode_validation(self):
"""Test validation of matching_mode parameter."""
Expand Down
Loading