Skip to content

Commit f1cb455

Browse files
sjarmak and claude committed
fix: treat agent timeouts as scored results with timed_out metadata flag
AgentTimeoutError tasks already have their partial work scored by the verifier, but they were classified as "errored", which hid their reward from analysis. Now timeouts with a verifier reward are classified as completed_pass/completed_fail with a timed_out flag, keeping only real infrastructure errors as "errored".

- aggregate_status.py: classify_task() checks exception_type before marking errored; adds timed_out field and separate timed_out count
- extractors.py: AgentTimeoutError tasks classified by reward not error
- models.py: add timed_out field to TaskMetrics dataclass
- compare_configs.py: pass through timed_out flag in comparison output

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9aa95a0 commit f1cb455

File tree

4 files changed

+52
-11
lines changed

4 files changed

+52
-11
lines changed

scripts/aggregate_status.py

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ def classify_task(task_dir: Path, timeout_hours: float) -> dict:
234234
record["error_fingerprint"] = None
235235
record["metrics"] = {}
236236
record["wall_clock_seconds"] = None
237+
record["timed_out"] = False
237238
return record
238239

239240
# Parse result.json
@@ -251,6 +252,7 @@ def classify_task(task_dir: Path, timeout_hours: float) -> dict:
251252
}
252253
record["metrics"] = {}
253254
record["wall_clock_seconds"] = None
255+
record["timed_out"] = False
254256
return record
255257

256258
# Check for exception
@@ -276,14 +278,35 @@ def classify_task(task_dir: Path, timeout_hours: float) -> dict:
276278
record["started_at"] = data.get("started_at", "")
277279
record["finished_at"] = data.get("finished_at", "")
278280

279-
if exception_info is not None:
281+
# Determine exception type for timeout-vs-error classification
282+
exception_type = ""
283+
if isinstance(exception_info, dict):
284+
exception_type = exception_info.get(
285+
"exception_type", exception_info.get("type", "")
286+
)
287+
288+
if (
289+
exception_info is not None
290+
and exception_type == "AgentTimeoutError"
291+
and reward is not None
292+
):
293+
# Agent timed out but verifier scored partial work — treat as scored result
294+
record["timed_out"] = True
295+
record["error_fingerprint"] = fingerprint_error(exception_info)
296+
if reward > 0:
297+
record["status"] = "completed_pass"
298+
else:
299+
record["status"] = "completed_fail"
300+
elif exception_info is not None:
280301
record["status"] = "errored"
281302
record["error_fingerprint"] = fingerprint_error(exception_info)
282-
elif reward is not None and reward > 0:
283-
record["status"] = "completed_pass"
284-
record["error_fingerprint"] = None
303+
record["timed_out"] = False
285304
else:
286-
record["status"] = "completed_fail"
305+
record["timed_out"] = False
306+
if reward is not None and reward > 0:
307+
record["status"] = "completed_pass"
308+
else:
309+
record["status"] = "completed_fail"
287310
record["error_fingerprint"] = None
288311

289312
return record
@@ -355,6 +378,8 @@ def scan_all_tasks(
355378

356379
tasks.append(record)
357380
totals[record["status"]] += 1
381+
if record.get("timed_out"):
382+
totals["timed_out"] += 1
358383
by_suite[suite][config][record["status"]] += 1
359384

360385
# Accumulate error summary
@@ -592,12 +617,16 @@ def format_table(output: dict) -> str:
592617

593618
# Totals
594619
totals = output["totals"]
620+
timed_out_count = totals.pop("timed_out", 0)
595621
total_all = sum(totals.values())
596622
lines.append(f"TOTALS: {total_all} tasks")
597623
for status in ("running", "completed_pass", "completed_fail", "errored", "timeout"):
598624
count = totals.get(status, 0)
599625
if count:
600626
lines.append(f" {status:20s} {count:>5d}")
627+
if timed_out_count:
628+
lines.append(f" {'timed_out (scored)':20s} {timed_out_count:>5d}")
629+
totals["timed_out"] = timed_out_count # restore for JSON output
601630
lines.append("")
602631

603632
# By suite/config breakdown
@@ -658,14 +687,18 @@ def format_table(output: dict) -> str:
658687

659688
# Task details (only non-pass or if few tasks)
660689
non_pass = [t for t in output["tasks"] if t["status"] != "completed_pass"]
661-
if non_pass:
662-
lines.append(f"NON-PASSING TASKS ({len(non_pass)}):")
663-
for t in non_pass:
690+
timed_out_pass = [t for t in output["tasks"]
691+
if t["status"] == "completed_pass" and t.get("timed_out")]
692+
notable = non_pass + timed_out_pass
693+
if notable:
694+
lines.append(f"NON-PASSING / TIMED-OUT TASKS ({len(notable)}):")
695+
for t in notable:
664696
fp_str = ""
665697
if t.get("error_fingerprint"):
666698
fp_str = f" [{t['error_fingerprint']['fingerprint_id']}]"
667699
reward_str = f" reward={t['reward']:.2f}" if t["reward"] is not None else ""
668-
lines.append(f" {t['status']:16s} {t.get('suite',''):20s} {t.get('config',''):18s} {t['task_name']}{reward_str}{fp_str}")
700+
timeout_str = " [timed_out]" if t.get("timed_out") else ""
701+
lines.append(f" {t['status']:16s} {t.get('suite',''):20s} {t.get('config',''):18s} {t['task_name']}{reward_str}{fp_str}{timeout_str}")
669702

670703
return "\n".join(lines)
671704

scripts/ccb_metrics/extractors.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,11 @@ def extract_task_from_result_json(
9797
continue
9898
break
9999

100-
# Status
101-
if data.get("exception_info"):
100+
# Status — agent timeouts are scored normally (verifier runs on partial work)
101+
exc = data.get("exception_info") or {}
102+
exc_type = exc.get("exception_type", exc.get("type", "")) if isinstance(exc, dict) else ""
103+
timed_out = bool(exc and exc_type == "AgentTimeoutError")
104+
if exc and not timed_out:
102105
status = "error"
103106
elif reward is not None:
104107
status = "passed" if reward > 0 else "failed"
@@ -160,6 +163,7 @@ def extract_task_from_result_json(
160163
config_name=config_name,
161164
reward=reward,
162165
status=status,
166+
timed_out=timed_out,
163167
wall_clock_seconds=wall_clock,
164168
agent_execution_seconds=agent_execution_seconds,
165169
environment_setup_seconds=environment_setup_seconds,

scripts/ccb_metrics/models.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ class TaskMetrics:
8383
input_output_ratio: Optional[float] = None
8484
cache_hit_rate: Optional[float] = None
8585

86+
# Agent timeout flag (verifier still scored partial work)
87+
timed_out: bool = False
88+
8689
# Tier 1: error & environment
8790
error_fingerprint: Optional[dict] = None
8891
verifier_test_summary: Optional[dict] = None

scripts/compare_configs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ def _build_comparison(task_matrix, suite_filter) -> dict:
122122
"reward": reward,
123123
"wall_clock_seconds": rec.get("wall_clock_seconds"),
124124
"error_fingerprint": rec.get("error_fingerprint"),
125+
"timed_out": rec.get("timed_out", False),
125126
}
126127

127128
is_pass = status == "completed_pass"

0 commit comments

Comments
 (0)