MACS and tau2 now use TaskExecutionStatus

cemde · cemde · commit e522a9a1ad45 · 2025-12-31T15:33:26.000+01:00
diff --git a/maseval/benchmark/macs/macs.py b/maseval/benchmark/macs/macs.py
@@ -53,6 +53,7 @@ def get_model_adapter(self, model_id, **kwargs):
     MessageHistory,
     ModelAdapter,
     Task,
+    TaskExecutionStatus,
     ToolInvocationHistory,
     ToolLLMSimulator,
     User,
@@ -64,6 +65,17 @@ def get_model_adapter(self, model_id, **kwargs):
 from maseval.core.tracing import TraceableMixin
 
 
+# Statuses where agent is accountable (included in scoring)
+# Note: task_timeout is included - timeouts count as failures in MACS
+SCOREABLE_STATUSES = frozenset(
+    {
+        TaskExecutionStatus.SUCCESS.value,
+        TaskExecutionStatus.AGENT_ERROR.value,
+        TaskExecutionStatus.TASK_TIMEOUT.value,
+    }
+)
+
+
 # =============================================================================
 # Tool
 # =============================================================================
@@ -987,29 +999,14 @@ def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
             - excluded: Dict with counts of excluded tasks by category
             - status_counts: Dict with counts of each status type
     """
-    # Status values that indicate infrastructure failures (not agent's fault)
-    INFRASTRUCTURE_STATUSES = {
-        "environment_error",
-        "user_error",
-        "unknown_execution_error",
-        "evaluation_failed",
-        "setup_failed",
-    }
-
     if not results:
         return {
             "total_tasks": 0,
             "scored_tasks": 0,
             "successful_tasks": 0,
             "success_rate": 0.0,
             "mean_metrics": {},
-            "excluded": {
-                "environment_error": 0,
-                "user_error": 0,
-                "unknown_execution_error": 0,
-                "evaluation_failed": 0,
-                "setup_failed": 0,
-            },
+            "excluded": {},
             "status_counts": {},
         }
 
@@ -1019,14 +1016,14 @@ def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
     successful_tasks = 0
     scored_tasks = 0
     status_counts: Dict[str, int] = {}
-    excluded_counts: Dict[str, int] = {s: 0 for s in INFRASTRUCTURE_STATUSES}
+    excluded_counts: Dict[str, int] = {}
 
     for res in results:
         status = res.get("status", "unknown")
         status_counts[status] = status_counts.get(status, 0) + 1
 
-        # Skip infrastructure failures from scoring
-        if status in INFRASTRUCTURE_STATUSES:
+        # Skip every status outside SCOREABLE_STATUSES (infrastructure failures, unknown/missing status)
+        if status not in SCOREABLE_STATUSES:
             excluded_counts[status] = excluded_counts.get(status, 0) + 1
             continue
 
diff --git a/maseval/benchmark/tau2/evaluator.py b/maseval/benchmark/tau2/evaluator.py
@@ -23,12 +23,23 @@
 from enum import Enum
 from typing import Any, Dict, List, Optional
 
-from maseval import Evaluator, Task
+from maseval import Evaluator, Task, TaskExecutionStatus
 
 from maseval.benchmark.tau2.environment import Tau2Environment, get_environment_constructor
 from maseval.benchmark.tau2.utils import compare_tool_calls
 
 
+# Statuses where agent is accountable (included in scoring)
+# Note: task_timeout is included - timeouts count as failures in tau2
+SCOREABLE_STATUSES = frozenset(
+    {
+        TaskExecutionStatus.SUCCESS.value,
+        TaskExecutionStatus.AGENT_ERROR.value,
+        TaskExecutionStatus.TASK_TIMEOUT.value,
+    }
+)
+
+
 class RewardType(str, Enum):
     """Types of rewards that can be computed.
 
@@ -447,21 +458,14 @@ def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
     """Compute summary metrics across all benchmark results.
 
     Infrastructure errors are excluded from scoring metrics.
+    Uses SCOREABLE_STATUSES to determine which results count toward agent score.
 
     Args:
         results: List of result dicts from benchmark.run()
 
     Returns:
         Dict with success_rate, mean_reward, pass_at_k, status_counts
     """
-    INFRASTRUCTURE_STATUSES = {
-        "environment_error",
-        "user_error",
-        "unknown_execution_error",
-        "evaluation_failed",
-        "setup_failed",
-    }
-
     if not results:
         return {
             "total_tasks": 0,
@@ -482,8 +486,8 @@ def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
         status = res.get("status", "unknown")
         status_counts[status] = status_counts.get(status, 0) + 1
 
-        if status in INFRASTRUCTURE_STATUSES:
-            continue
+        if status not in SCOREABLE_STATUSES:
+            continue  # Skip non-scoreable statuses (e.g. infrastructure errors)
 
         scored_tasks += 1
         evals = res.get("eval") or []
@@ -529,7 +533,7 @@ def compute_pass_at_k(
     task_results: Dict[str, List[bool]] = {}
     for res in results:
         task_id = res.get("task_id", "")
-        if res.get("status") not in {"success", "agent_error"}:
+        if res.get("status") not in SCOREABLE_STATUSES:
             continue  # Skip infrastructure errors
 
         evals = res.get("eval") or []
@@ -615,7 +619,7 @@ def compute_pass_hat_k(
     task_results: Dict[str, List[bool]] = {}
     for res in results:
         task_id = res.get("task_id", "")
-        if res.get("status") not in {"success", "agent_error"}:
+        if res.get("status") not in SCOREABLE_STATUSES:
             continue  # Skip infrastructure errors
 
         evals = res.get("eval") or []
diff --git a/tests/test_benchmarks/test_macs/test_macs_benchmark.py b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
@@ -365,18 +365,12 @@ def test_empty_results(self):
         assert result["successful_tasks"] == 0
         assert result["success_rate"] == 0.0
         assert result["mean_metrics"] == {}
-        assert result["excluded"] == {
-            "environment_error": 0,
-            "user_error": 0,
-            "unknown_execution_error": 0,
-            "evaluation_failed": 0,
-            "setup_failed": 0,
-        }
+        assert result["excluded"] == {}
         assert result["status_counts"] == {}
 
     def test_single_successful_result(self):
         """Single successful result counted."""
-        results = [{"status": "completed", "eval": [{"overall_gsr": 1.0, "user_gsr": 1.0, "system_gsr": 1.0}]}]
+        results = [{"status": "success", "eval": [{"overall_gsr": 1.0, "user_gsr": 1.0, "system_gsr": 1.0}]}]
 
         metrics = compute_benchmark_metrics(results)
 
@@ -387,7 +381,7 @@ def test_single_successful_result(self):
 
     def test_single_failed_result(self):
         """Single failed result counted."""
-        results = [{"status": "completed", "eval": [{"overall_gsr": 0.0, "user_gsr": 0.0, "system_gsr": 0.0}]}]
+        results = [{"status": "success", "eval": [{"overall_gsr": 0.0, "user_gsr": 0.0, "system_gsr": 0.0}]}]
 
         metrics = compute_benchmark_metrics(results)
 
@@ -399,9 +393,9 @@ def test_single_failed_result(self):
     def test_multiple_results(self):
         """Multiple results aggregated correctly."""
         results = [
-            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},  # Success
-            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},  # Fail
-            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},  # Success
+            {"status": "success", "eval": [{"overall_gsr": 1.0}]},  # Success
+            {"status": "success", "eval": [{"overall_gsr": 0.0}]},  # Fail
+            {"status": "success", "eval": [{"overall_gsr": 1.0}]},  # Success
         ]
 
         metrics = compute_benchmark_metrics(results)
@@ -414,10 +408,10 @@ def test_multiple_results(self):
     def test_success_rate_calculation(self):
         """success_rate = successful/scored (not total)."""
         results = [
-            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
-            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
-            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},
-            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 0.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 0.0}]},
         ]
 
         metrics = compute_benchmark_metrics(results)
@@ -427,8 +421,8 @@ def test_success_rate_calculation(self):
     def test_mean_metrics_calculation(self):
         """Mean of numeric metrics computed."""
         results = [
-            {"status": "completed", "eval": [{"overall_gsr": 1.0, "partial_gsr": 0.8}]},
-            {"status": "completed", "eval": [{"overall_gsr": 0.0, "partial_gsr": 0.4}]},
+            {"status": "success", "eval": [{"overall_gsr": 1.0, "partial_gsr": 0.8}]},
+            {"status": "success", "eval": [{"overall_gsr": 0.0, "partial_gsr": 0.4}]},
         ]
 
         metrics = compute_benchmark_metrics(results)
@@ -439,9 +433,9 @@ def test_mean_metrics_calculation(self):
     def test_handles_missing_eval(self):
         """Handles results with no eval key."""
         results = [
-            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
-            {"status": "completed", "no_eval_key": True},  # Missing eval
-            {"status": "completed", "eval": None},  # None eval
+            {"status": "success", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "success", "no_eval_key": True},  # Missing eval
+            {"status": "success", "eval": None},  # None eval
         ]
 
         metrics = compute_benchmark_metrics(results)
@@ -454,7 +448,7 @@ def test_handles_non_numeric_values(self):
         """Non-numeric values in eval are ignored for mean."""
         results = [
             {
-                "status": "completed",
+                "status": "success",
                 "eval": [
                     {
                         "overall_gsr": 1.0,
@@ -475,23 +469,23 @@ def test_handles_non_numeric_values(self):
     def test_excludes_environment_errors_from_scoring(self):
         """Environment errors are excluded from scoring."""
         results = [
-            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 1.0}]},
             {"status": "environment_error", "eval": None},  # Should be excluded
-            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 0.0}]},
         ]
 
         metrics = compute_benchmark_metrics(results)
 
         assert metrics["total_tasks"] == 3
-        assert metrics["scored_tasks"] == 2  # Only completed tasks
+        assert metrics["scored_tasks"] == 2  # Only success tasks
         assert metrics["successful_tasks"] == 1
         assert metrics["success_rate"] == 0.5  # 1/2, not 1/3
         assert metrics["excluded"]["environment_error"] == 1
 
     def test_excludes_user_errors_from_scoring(self):
         """User simulator errors are excluded from scoring."""
         results = [
-            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 1.0}]},
             {"status": "user_error", "eval": None},
         ]
 
@@ -500,14 +494,14 @@ def test_excludes_user_errors_from_scoring(self):
         assert metrics["total_tasks"] == 2
         assert metrics["scored_tasks"] == 1
         assert metrics["successful_tasks"] == 1
-        assert metrics["success_rate"] == 1.0  # Only the completed one
+        assert metrics["success_rate"] == 1.0  # Only the success one
         assert metrics["excluded"]["user_error"] == 1
 
     def test_excludes_unknown_errors_from_scoring(self):
         """Unknown execution errors are excluded from scoring."""
         results = [
             {"status": "unknown_execution_error", "eval": None},
-            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 0.0}]},
         ]
 
         metrics = compute_benchmark_metrics(results)
@@ -521,7 +515,7 @@ def test_excludes_setup_failed_from_scoring(self):
         """Setup failures are excluded from scoring."""
         results = [
             {"status": "setup_failed", "eval": None},
-            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 1.0}]},
         ]
 
         metrics = compute_benchmark_metrics(results)
@@ -534,21 +528,21 @@ def test_excludes_evaluation_failed_from_scoring(self):
         """Evaluation failures are excluded from scoring."""
         results = [
             {"status": "evaluation_failed", "eval": None},
-            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 1.0}]},
         ]
 
         metrics = compute_benchmark_metrics(results)
 
         assert metrics["total_tasks"] == 2
         assert metrics["scored_tasks"] == 1
-        assert metrics["success_rate"] == 1.0  # Only the completed one
+        assert metrics["success_rate"] == 1.0  # Only the success one
         assert metrics["excluded"]["evaluation_failed"] == 1
 
     def test_includes_agent_errors_in_scoring(self):
         """Agent errors ARE included in scoring (agent's fault)."""
         results = [
             {"status": "agent_error", "eval": [{"overall_gsr": 0.0}]},
-            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 1.0}]},
         ]
 
         metrics = compute_benchmark_metrics(results)
@@ -561,23 +555,23 @@ def test_includes_agent_errors_in_scoring(self):
     def test_status_counts_tracked(self):
         """Status counts are tracked for all tasks."""
         results = [
-            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
-            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "success", "eval": [{"overall_gsr": 0.0}]},
             {"status": "agent_error", "eval": None},
             {"status": "environment_error", "eval": None},
         ]
 
         metrics = compute_benchmark_metrics(results)
 
-        assert metrics["status_counts"]["completed"] == 2
+        assert metrics["status_counts"]["success"] == 2
         assert metrics["status_counts"]["agent_error"] == 1
         assert metrics["status_counts"]["environment_error"] == 1
 
     def test_mixed_errors_comprehensive(self):
         """Comprehensive test with various error types."""
         results = [
-            {"status": "completed", "eval": [{"overall_gsr": 1.0, "accuracy": 0.9}]},
-            {"status": "completed", "eval": [{"overall_gsr": 0.0, "accuracy": 0.3}]},
+            {"status": "success", "eval": [{"overall_gsr": 1.0, "accuracy": 0.9}]},
+            {"status": "success", "eval": [{"overall_gsr": 0.0, "accuracy": 0.3}]},
             {"status": "agent_error", "eval": [{"overall_gsr": 0.0, "accuracy": 0.0}]},
             {"status": "environment_error", "eval": None},  # Excluded
             {"status": "user_error", "eval": None},  # Excluded
@@ -588,7 +582,7 @@ def test_mixed_errors_comprehensive(self):
         metrics = compute_benchmark_metrics(results)
 
         assert metrics["total_tasks"] == 7
-        assert metrics["scored_tasks"] == 3  # completed(2) + agent_error(1)
+        assert metrics["scored_tasks"] == 3  # success(2) + agent_error(1)
         assert metrics["successful_tasks"] == 1
         assert metrics["success_rate"] == pytest.approx(1 / 3)
         assert metrics["mean_metrics"]["accuracy"] == pytest.approx((0.9 + 0.3 + 0.0) / 3)
diff --git a/tests/test_benchmarks/test_tau2/test_evaluator.py b/tests/test_benchmarks/test_tau2/test_evaluator.py