Skip to content

Commit fe8873b

Browse files
susiejojo and claude authored
fix(viz): accurate cost tracking with timestamp-based iteration assignment (#285)
The cost chart previously assumed strict positional pairing of llm_metrics.jsonl entries (entry[i*2]=design, entry[i*2+1]=execute). This broke when retries appended extra entries. Entries are now assigned to iterations by comparing their timestamps against sorted ledger boundaries.

Key fixes:
- Include FAILED iterations in boundaries so their costs aren't misattributed to adjacent successful iterations
- Normalize timezone-naive timestamps to UTC-aware before comparison
- Sort boundaries explicitly rather than assuming ledger order
- Skip corrupt JSONL lines instead of crashing the entire function
- Guard against null/non-int iteration values in ledger

Adds 15 tests covering normal, retry, FAILED iteration, timezone, and edge-case scenarios.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d4c01aa commit fe8873b

2 files changed

Lines changed: 346 additions & 27 deletions

File tree

scripts/visualize_campaign.py

Lines changed: 81 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
import re
2323
import sys
2424
import webbrowser
25+
from datetime import datetime, timezone
2526
from pathlib import Path
2627

2728
HTML_TEMPLATE = """<!DOCTYPE html>
@@ -1958,8 +1959,10 @@ def load_campaign(campaign_path: Path):
19581959
def load_llm_metrics(campaign_path: Path, ledger: dict) -> dict:
19591960
"""Load LLM cost metrics from llm_metrics.jsonl and group by iteration.
19601961
1961-
Returns a dict keyed by iteration ID (e.g., "iter-0") with cost breakdowns.
1962+
Returns a dict keyed by iteration ID (e.g., "iter-1") with cost breakdowns.
19621963
Each iteration has two phases: design (planner) and execute-analyze (executor).
1964+
Handles retries correctly by assigning entries to iterations based on
1965+
timestamps rather than assuming strict design/execute pairs.
19631966
"""
19641967
metrics_path = campaign_path / "llm_metrics.jsonl"
19651968
if not metrics_path.exists():
@@ -1969,49 +1972,100 @@ def load_llm_metrics(campaign_path: Path, ledger: dict) -> dict:
19691972
with open(metrics_path) as f:
19701973
for line in f:
19711974
line = line.strip()
1972-
if line:
1975+
if not line:
1976+
continue
1977+
try:
19731978
entries.append(json.loads(line))
1979+
except json.JSONDecodeError:
1980+
continue
19741981

19751982
if not entries:
19761983
return {}
19771984

19781985
iterations = ledger.get("iterations", [])
19791986
result = {}
19801987

1981-
# Entries come in pairs: design (planner) + execute-analyze (executor).
1982-
# The baseline iteration (iter-0, outcome=None) has no metrics —
1983-
# pairs map to non-baseline iterations only (iter-1, iter-2, ...).
1984-
non_baseline = [it for it in iterations if it.get("h_main_result") is not None]
1985-
for i, it in enumerate(non_baseline):
1986-
iter_id = f"iter-{it['iteration']}"
1987-
design_idx = i * 2
1988-
execute_idx = i * 2 + 1
1988+
# Include all iterations with a timestamp (excludes only baseline iter-0).
1989+
# FAILED iterations are included so their costs aren't misattributed to neighbors.
1990+
non_baseline = [
1991+
it for it in iterations
1992+
if isinstance(it.get("iteration"), int) and it["iteration"] > 0 and it.get("timestamp")
1993+
]
1994+
if not non_baseline:
1995+
return {}
1996+
1997+
def parse_ts(ts_str):
    """Parse an ISO-8601 timestamp string into a timezone-aware datetime.

    Returns None when ts_str is empty, is not a string, or cannot be
    parsed. Timezone-naive values are treated as UTC so that every
    returned datetime can be compared with any other without raising
    TypeError.
    """
    if not ts_str or not isinstance(ts_str, str):
        return None
    # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11
    # onward. Rewrite it as an explicit UTC offset so older interpreters do
    # not silently discard such timestamps (and the costs attached to them).
    if ts_str.endswith(("Z", "z")):
        ts_str = ts_str[:-1] + "+00:00"
    try:
        dt = datetime.fromisoformat(ts_str)
    except ValueError:
        return None
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt
2007+
2008+
# Build sorted iteration end-timestamps for bucketing metrics entries.
2009+
iter_boundaries = []
2010+
for it in non_baseline:
2011+
end_ts = parse_ts(it.get("timestamp"))
2012+
if end_ts is None:
2013+
continue
2014+
iter_boundaries.append({
2015+
"iter_id": f"iter-{it['iteration']}",
2016+
"end_ts": end_ts,
2017+
})
2018+
iter_boundaries.sort(key=lambda b: b["end_ts"])
2019+
2020+
if not iter_boundaries:
2021+
return {}
2022+
2023+
# Assign each metrics entry to the first iteration whose end_ts >= entry_ts.
2024+
iter_entries = {b["iter_id"]: [] for b in iter_boundaries}
2025+
for entry in entries:
2026+
entry_ts = parse_ts(entry.get("timestamp"))
2027+
if entry_ts is None:
2028+
continue
2029+
assigned = None
2030+
for b in iter_boundaries:
2031+
if entry_ts <= b["end_ts"]:
2032+
assigned = b["iter_id"]
2033+
break
2034+
if assigned is None:
2035+
# Entry after last completed iteration (e.g., failed/in-progress subsequent iter)
2036+
assigned = iter_boundaries[-1]["iter_id"]
2037+
iter_entries[assigned].append(entry)
2038+
2039+
# Aggregate per iteration: sum all design entries and all execute entries.
2040+
for b in iter_boundaries:
2041+
iter_id = b["iter_id"]
2042+
entries_for_iter = iter_entries[iter_id]
2043+
2044+
design_entries = [e for e in entries_for_iter if e.get("role") == "planner"]
2045+
execute_entries = [e for e in entries_for_iter if e.get("role") == "executor"]
19892046

19902047
iter_metrics = {"design": None, "execute": None, "total_cost": 0, "total_duration_ms": 0, "total_turns": 0}
19912048

1992-
if design_idx < len(entries):
1993-
d = entries[design_idx]
2049+
if design_entries:
19942050
iter_metrics["design"] = {
1995-
"model": d.get("model", "unknown"),
1996-
"cost_usd": d.get("cost_usd") or 0,
1997-
"duration_ms": d.get("duration_ms") or 0,
1998-
"num_turns": d.get("num_turns") or 0,
1999-
"input_tokens": d.get("input_tokens") or 0,
2000-
"output_tokens": d.get("output_tokens") or 0,
2051+
"model": design_entries[-1].get("model", "unknown"),
2052+
"cost_usd": sum(e.get("cost_usd") or 0 for e in design_entries),
2053+
"duration_ms": sum(e.get("duration_ms") or 0 for e in design_entries),
2054+
"num_turns": sum(e.get("num_turns") or 0 for e in design_entries),
2055+
"input_tokens": sum(e.get("input_tokens") or 0 for e in design_entries),
2056+
"output_tokens": sum(e.get("output_tokens") or 0 for e in design_entries),
20012057
}
20022058

2003-
if execute_idx < len(entries):
2004-
e = entries[execute_idx]
2059+
if execute_entries:
20052060
iter_metrics["execute"] = {
2006-
"model": e.get("model", "unknown"),
2007-
"cost_usd": e.get("cost_usd") or 0,
2008-
"duration_ms": e.get("duration_ms") or 0,
2009-
"num_turns": e.get("num_turns") or 0,
2010-
"input_tokens": e.get("input_tokens") or 0,
2011-
"output_tokens": e.get("output_tokens") or 0,
2061+
"model": execute_entries[-1].get("model", "unknown"),
2062+
"cost_usd": sum(e.get("cost_usd") or 0 for e in execute_entries),
2063+
"duration_ms": sum(e.get("duration_ms") or 0 for e in execute_entries),
2064+
"num_turns": sum(e.get("num_turns") or 0 for e in execute_entries),
2065+
"input_tokens": sum(e.get("input_tokens") or 0 for e in execute_entries),
2066+
"output_tokens": sum(e.get("output_tokens") or 0 for e in execute_entries),
20122067
}
20132068

2014-
# Compute totals
20152069
for phase in ["design", "execute"]:
20162070
if iter_metrics[phase]:
20172071
iter_metrics["total_cost"] += iter_metrics[phase]["cost_usd"]

0 commit comments

Comments
 (0)