fix: treat agent timeouts as scored results with timed_out metadata flag

sjarmak · claude · sjarmak · commit f1cb4555850e · 2026-02-23T16:44:48.000Z
Tasks that end in AgentTimeoutError already have their partial work
scored by the verifier, but they were classified as "errored", which hid
their reward from analysis. Timeouts that carry a verifier reward are now
classified as completed_pass/completed_fail with a timed_out flag, so
that only real infrastructure errors remain "errored" (in
aggregate_status.py, a timeout with no reward is still "errored").

- aggregate_status.py: classify_task() checks exception_type before
  marking errored; adds timed_out field and separate timed_out count
- extractors.py: AgentTimeoutError tasks are classified by reward, not as errors
- models.py: add timed_out field to TaskMetrics dataclass
- compare_configs.py: pass through timed_out flag in comparison output

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/scripts/aggregate_status.py b/scripts/aggregate_status.py
@@ -234,6 +234,7 @@ def classify_task(task_dir: Path, timeout_hours: float) -> dict:
         record["error_fingerprint"] = None
         record["metrics"] = {}
         record["wall_clock_seconds"] = None
+        record["timed_out"] = False
         return record
 
     # Parse result.json
@@ -251,6 +252,7 @@ def classify_task(task_dir: Path, timeout_hours: float) -> dict:
         }
         record["metrics"] = {}
         record["wall_clock_seconds"] = None
+        record["timed_out"] = False
         return record
 
     # Check for exception
@@ -276,14 +278,35 @@ def classify_task(task_dir: Path, timeout_hours: float) -> dict:
     record["started_at"] = data.get("started_at", "")
     record["finished_at"] = data.get("finished_at", "")
 
-    if exception_info is not None:
+    # Determine exception type for timeout-vs-error classification
+    exception_type = ""
+    if isinstance(exception_info, dict):
+        exception_type = exception_info.get(
+            "exception_type", exception_info.get("type", "")
+        )
+
+    if (
+        exception_info is not None
+        and exception_type == "AgentTimeoutError"
+        and reward is not None
+    ):
+        # Agent timed out but verifier scored partial work — treat as scored result
+        record["timed_out"] = True
+        record["error_fingerprint"] = fingerprint_error(exception_info)
+        if reward > 0:
+            record["status"] = "completed_pass"
+        else:
+            record["status"] = "completed_fail"
+    elif exception_info is not None:
         record["status"] = "errored"
         record["error_fingerprint"] = fingerprint_error(exception_info)
-    elif reward is not None and reward > 0:
-        record["status"] = "completed_pass"
-        record["error_fingerprint"] = None
+        record["timed_out"] = False
     else:
-        record["status"] = "completed_fail"
+        record["timed_out"] = False
+        if reward is not None and reward > 0:
+            record["status"] = "completed_pass"
+        else:
+            record["status"] = "completed_fail"
         record["error_fingerprint"] = None
 
     return record
@@ -355,6 +378,8 @@ def scan_all_tasks(
 
                 tasks.append(record)
                 totals[record["status"]] += 1
+                if record.get("timed_out"):
+                    totals["timed_out"] += 1
                 by_suite[suite][config][record["status"]] += 1
 
                 # Accumulate error summary
@@ -592,12 +617,16 @@ def format_table(output: dict) -> str:
 
     # Totals
     totals = output["totals"]
+    timed_out_count = totals.pop("timed_out", 0)
     total_all = sum(totals.values())
     lines.append(f"TOTALS: {total_all} tasks")
     for status in ("running", "completed_pass", "completed_fail", "errored", "timeout"):
         count = totals.get(status, 0)
         if count:
             lines.append(f"  {status:20s} {count:>5d}")
+    if timed_out_count:
+        lines.append(f"  {'timed_out (scored)':20s} {timed_out_count:>5d}")
+    totals["timed_out"] = timed_out_count  # restore for JSON output
     lines.append("")
 
     # By suite/config breakdown
@@ -658,14 +687,18 @@ def format_table(output: dict) -> str:
 
     # Task details (only non-pass or if few tasks)
     non_pass = [t for t in output["tasks"] if t["status"] != "completed_pass"]
-    if non_pass:
-        lines.append(f"NON-PASSING TASKS ({len(non_pass)}):")
-        for t in non_pass:
+    timed_out_pass = [t for t in output["tasks"]
+                      if t["status"] == "completed_pass" and t.get("timed_out")]
+    notable = non_pass + timed_out_pass
+    if notable:
+        lines.append(f"NON-PASSING / TIMED-OUT TASKS ({len(notable)}):")
+        for t in notable:
             fp_str = ""
             if t.get("error_fingerprint"):
                 fp_str = f" [{t['error_fingerprint']['fingerprint_id']}]"
             reward_str = f" reward={t['reward']:.2f}" if t["reward"] is not None else ""
-            lines.append(f"  {t['status']:16s}  {t.get('suite',''):20s}  {t.get('config',''):18s}  {t['task_name']}{reward_str}{fp_str}")
+            timeout_str = " [timed_out]" if t.get("timed_out") else ""
+            lines.append(f"  {t['status']:16s}  {t.get('suite',''):20s}  {t.get('config',''):18s}  {t['task_name']}{reward_str}{fp_str}{timeout_str}")
 
     return "\n".join(lines)
 
diff --git a/scripts/ccb_metrics/extractors.py b/scripts/ccb_metrics/extractors.py
@@ -97,8 +97,11 @@ def extract_task_from_result_json(
                 continue
             break
 
-    # Status
-    if data.get("exception_info"):
+    # Status — agent timeouts are scored normally (verifier runs on partial work)
+    exc = data.get("exception_info") or {}
+    exc_type = exc.get("exception_type", exc.get("type", "")) if isinstance(exc, dict) else ""
+    timed_out = bool(exc and exc_type == "AgentTimeoutError")
+    if exc and not timed_out:
         status = "error"
     elif reward is not None:
         status = "passed" if reward > 0 else "failed"
@@ -160,6 +163,7 @@ def extract_task_from_result_json(
         config_name=config_name,
         reward=reward,
         status=status,
+        timed_out=timed_out,
         wall_clock_seconds=wall_clock,
         agent_execution_seconds=agent_execution_seconds,
         environment_setup_seconds=environment_setup_seconds,
diff --git a/scripts/ccb_metrics/models.py b/scripts/ccb_metrics/models.py
@@ -83,6 +83,9 @@ class TaskMetrics:
     input_output_ratio: Optional[float] = None
     cache_hit_rate: Optional[float] = None
 
+    # Agent timeout flag (verifier still scored partial work)
+    timed_out: bool = False
+
     # Tier 1: error & environment
     error_fingerprint: Optional[dict] = None
     verifier_test_summary: Optional[dict] = None
diff --git a/scripts/compare_configs.py b/scripts/compare_configs.py
@@ -122,6 +122,7 @@ def _build_comparison(task_matrix, suite_filter) -> dict:
                 "reward": reward,
                 "wall_clock_seconds": rec.get("wall_clock_seconds"),
                 "error_fingerprint": rec.get("error_fingerprint"),
+                "timed_out": rec.get("timed_out", False),
             }
 
             is_pass = status == "completed_pass"

Original file line number	Diff line number	Diff line change
`@@ -122,6 +122,7 @@ def _build_comparison(task_matrix, suite_filter) -> dict:`
`122`	`122`	`"reward": reward,`
`123`	`123`	`"wall_clock_seconds": rec.get("wall_clock_seconds"),`
`124`	`124`	`"error_fingerprint": rec.get("error_fingerprint"),`
	`125`	`+ "timed_out": rec.get("timed_out", False),`
`125`	`126`	`}`
`126`	`127`
`127`	`128`	`is_pass = status == "completed_pass"`