2222import re
2323import sys
2424import webbrowser
25+ from datetime import datetime , timezone
2526from pathlib import Path
2627
2728HTML_TEMPLATE = """<!DOCTYPE html>
@@ -1958,8 +1959,10 @@ def load_campaign(campaign_path: Path):
19581959def load_llm_metrics (campaign_path : Path , ledger : dict ) -> dict :
19591960 """Load LLM cost metrics from llm_metrics.jsonl and group by iteration.
19601961
1961- Returns a dict keyed by iteration ID (e.g., "iter-0 ") with cost breakdowns.
1962+ Returns a dict keyed by iteration ID (e.g., "iter-1 ") with cost breakdowns.
19621963 Each iteration has two phases: design (planner) and execute-analyze (executor).
1964+ Handles retries correctly by assigning entries to iterations based on
1965+ timestamps rather than assuming strict design/execute pairs.
19631966 """
19641967 metrics_path = campaign_path / "llm_metrics.jsonl"
19651968 if not metrics_path .exists ():
@@ -1969,49 +1972,100 @@ def load_llm_metrics(campaign_path: Path, ledger: dict) -> dict:
19691972 with open (metrics_path ) as f :
19701973 for line in f :
19711974 line = line .strip ()
1972- if line :
1975+ if not line :
1976+ continue
1977+ try :
19731978 entries .append (json .loads (line ))
1979+ except json .JSONDecodeError :
1980+ continue
19741981
19751982 if not entries :
19761983 return {}
19771984
19781985 iterations = ledger .get ("iterations" , [])
19791986 result = {}
19801987
1981- # Entries come in pairs: design (planner) + execute-analyze (executor).
1982- # The baseline iteration (iter-0, outcome=None) has no metrics —
1983- # pairs map to non-baseline iterations only (iter-1, iter-2, ...).
1984- non_baseline = [it for it in iterations if it .get ("h_main_result" ) is not None ]
1985- for i , it in enumerate (non_baseline ):
1986- iter_id = f"iter-{ it ['iteration' ]} "
1987- design_idx = i * 2
1988- execute_idx = i * 2 + 1
1988+ # Include all iterations with a timestamp (excludes only baseline iter-0).
1989+ # FAILED iterations are included so their costs aren't misattributed to neighbors.
1990+ non_baseline = [
1991+ it for it in iterations
1992+ if isinstance (it .get ("iteration" ), int ) and it ["iteration" ] > 0 and it .get ("timestamp" )
1993+ ]
1994+ if not non_baseline :
1995+ return {}
1996+
1997+ def parse_ts (ts_str ):
1998+ if not ts_str :
1999+ return None
2000+ try :
2001+ dt = datetime .fromisoformat (ts_str )
2002+ if dt .tzinfo is None :
2003+ dt = dt .replace (tzinfo = timezone .utc )
2004+ return dt
2005+ except (ValueError , TypeError ):
2006+ return None
2007+
2008+ # Build sorted iteration end-timestamps for bucketing metrics entries.
2009+ iter_boundaries = []
2010+ for it in non_baseline :
2011+ end_ts = parse_ts (it .get ("timestamp" ))
2012+ if end_ts is None :
2013+ continue
2014+ iter_boundaries .append ({
2015+ "iter_id" : f"iter-{ it ['iteration' ]} " ,
2016+ "end_ts" : end_ts ,
2017+ })
2018+ iter_boundaries .sort (key = lambda b : b ["end_ts" ])
2019+
2020+ if not iter_boundaries :
2021+ return {}
2022+
2023+ # Assign each metrics entry to the first iteration whose end_ts >= entry_ts.
2024+ iter_entries = {b ["iter_id" ]: [] for b in iter_boundaries }
2025+ for entry in entries :
2026+ entry_ts = parse_ts (entry .get ("timestamp" ))
2027+ if entry_ts is None :
2028+ continue
2029+ assigned = None
2030+ for b in iter_boundaries :
2031+ if entry_ts <= b ["end_ts" ]:
2032+ assigned = b ["iter_id" ]
2033+ break
2034+ if assigned is None :
2035+ # Entry after last completed iteration (e.g., failed/in-progress subsequent iter)
2036+ assigned = iter_boundaries [- 1 ]["iter_id" ]
2037+ iter_entries [assigned ].append (entry )
2038+
2039+ # Aggregate per iteration: sum all design entries and all execute entries.
2040+ for b in iter_boundaries :
2041+ iter_id = b ["iter_id" ]
2042+ entries_for_iter = iter_entries [iter_id ]
2043+
2044+ design_entries = [e for e in entries_for_iter if e .get ("role" ) == "planner" ]
2045+ execute_entries = [e for e in entries_for_iter if e .get ("role" ) == "executor" ]
19892046
19902047 iter_metrics = {"design" : None , "execute" : None , "total_cost" : 0 , "total_duration_ms" : 0 , "total_turns" : 0 }
19912048
1992- if design_idx < len (entries ):
1993- d = entries [design_idx ]
2049+ if design_entries :
19942050 iter_metrics ["design" ] = {
1995- "model" : d .get ("model" , "unknown" ),
1996- "cost_usd" : d .get ("cost_usd" ) or 0 ,
1997- "duration_ms" : d .get ("duration_ms" ) or 0 ,
1998- "num_turns" : d .get ("num_turns" ) or 0 ,
1999- "input_tokens" : d .get ("input_tokens" ) or 0 ,
2000- "output_tokens" : d .get ("output_tokens" ) or 0 ,
2051+ "model" : design_entries [ - 1 ] .get ("model" , "unknown" ),
2052+ "cost_usd" : sum ( e .get ("cost_usd" ) or 0 for e in design_entries ) ,
2053+ "duration_ms" : sum ( e .get ("duration_ms" ) or 0 for e in design_entries ) ,
2054+ "num_turns" : sum ( e .get ("num_turns" ) or 0 for e in design_entries ) ,
2055+ "input_tokens" : sum ( e .get ("input_tokens" ) or 0 for e in design_entries ) ,
2056+ "output_tokens" : sum ( e .get ("output_tokens" ) or 0 for e in design_entries ) ,
20012057 }
20022058
2003- if execute_idx < len (entries ):
2004- e = entries [execute_idx ]
2059+ if execute_entries :
20052060 iter_metrics ["execute" ] = {
2006- "model" : e .get ("model" , "unknown" ),
2007- "cost_usd" : e .get ("cost_usd" ) or 0 ,
2008- "duration_ms" : e .get ("duration_ms" ) or 0 ,
2009- "num_turns" : e .get ("num_turns" ) or 0 ,
2010- "input_tokens" : e .get ("input_tokens" ) or 0 ,
2011- "output_tokens" : e .get ("output_tokens" ) or 0 ,
2061+ "model" : execute_entries [ - 1 ] .get ("model" , "unknown" ),
2062+ "cost_usd" : sum ( e .get ("cost_usd" ) or 0 for e in execute_entries ) ,
2063+ "duration_ms" : sum ( e .get ("duration_ms" ) or 0 for e in execute_entries ) ,
2064+ "num_turns" : sum ( e .get ("num_turns" ) or 0 for e in execute_entries ) ,
2065+ "input_tokens" : sum ( e .get ("input_tokens" ) or 0 for e in execute_entries ) ,
2066+ "output_tokens" : sum ( e .get ("output_tokens" ) or 0 for e in execute_entries ) ,
20122067 }
20132068
2014- # Compute totals
20152069 for phase in ["design" , "execute" ]:
20162070 if iter_metrics [phase ]:
20172071 iter_metrics ["total_cost" ] += iter_metrics [phase ]["cost_usd" ]
0 commit comments