2424from utils import assert_all , get_nested_value , load_json , soft_assert # noqa: E402
2525
2626NO_TOOLS_METRICS = {
27- "aime25" : ("pass@1[avg-of-4]" , "symbolic_correct" , (84 .0 , 94.0 )),
27+ "aime25" : ("pass@1[avg-of-4]" , "symbolic_correct" , (88 .0 , 94.0 )),
2828 "gpqa" : ("pass@1[avg-of-4]" , "symbolic_correct" , (69.0 , 76.0 )),
2929 "mmlu-pro" : ("pass@1" , "symbolic_correct" , (74.0 , 82.0 )),
3030 "ifbench" : ("pass@1[avg-of-5]" , "average_score" , (66.0 , 77.0 )),
4040}
4141
4242WITH_TOOLS_METRICS = {
43- "aime25" : ("pass@1[avg-of-4]" , "symbolic_correct" , (88 .0 , 100.0 )),
43+ "aime25" : ("pass@1[avg-of-4]" , "symbolic_correct" , (95 .0 , 100.0 )),
4444 "gpqa" : ("pass@1[avg-of-4]" , "symbolic_correct" , (72.0 , 78.0 )),
4545 "hle" : ("pass@1" , "judge_correct" , (13.0 , 19.0 )),
4646}
@@ -95,13 +95,21 @@ def check_metric_group(
9595 for benchmark , (agg_key , field , (lo , hi )) in metric_config .items ():
9696 metrics_path , metrics , benchmark_label = resolve_metrics_entry (eval_dir , benchmark )
9797 soft_assert (agg_key in metrics , f"Missing aggregation key { agg_key } in { metrics_path } " )
98+ if agg_key not in metrics :
99+ continue
100+ agg_metrics = metrics [agg_key ]
98101 if isinstance (field , tuple ):
99- value = get_nested_value (metrics [ agg_key ] , field )
102+ value = get_nested_value (agg_metrics , field )
100103 field_label = "/" .join (field )
101104 else :
102- value = metrics [agg_key ].get (field )
105+ soft_assert (field in agg_metrics , f"Missing field { field } in { metrics_path } " )
106+ if field not in agg_metrics :
107+ continue
108+ value = agg_metrics [field ]
103109 field_label = field
104110 soft_assert (value is not None , f"Missing field { field_label } in { metrics_path } " )
111+ if value is None :
112+ continue
105113 value = normalize_percent (float (value ))
106114 print (f"{ eval_dir .name } /{ benchmark_label } /{ agg_key } /{ field_label } : { value } " )
107115 soft_assert (lo <= value <= hi , f"{ benchmark } : { field_label } ={ value } out of range [{ lo } , { hi } ]" )
@@ -128,9 +136,23 @@ def check_tool_usage(eval_dir: Path):
128136 bench_dir = eval_dir / "eval-results" / benchmark
129137 for _ , row in iter_output_rows (bench_dir ):
130138 total_samples += 1
131- if row .get ("num_tool_calls" , 0 ) > 0 :
139+ soft_assert ("num_tool_calls" in row , f"Missing num_tool_calls in { benchmark } output row" )
140+ soft_assert ("conversation" in row , f"Missing conversation in { benchmark } output row" )
141+ if "num_tool_calls" not in row or "conversation" not in row :
142+ continue
143+ if row ["num_tool_calls" ] > 0 :
132144 samples_with_tools += 1
133- if any (msg .get ("role" ) == "tool" for msg in row .get ("conversation" , [])):
145+ has_tool_message = False
146+ for msg in row ["conversation" ]:
147+ soft_assert (isinstance (msg , dict ), f"Conversation entry is not a dict in { benchmark } output row" )
148+ if not isinstance (msg , dict ):
149+ continue
150+ soft_assert ("role" in msg , f"Missing role in { benchmark } conversation entry" )
151+ if "role" not in msg :
152+ continue
153+ if msg ["role" ] == "tool" :
154+ has_tool_message = True
155+ if has_tool_message :
134156 samples_with_tool_messages += 1
135157
136158 soft_assert (total_samples > 0 , "No samples found in with_tools outputs" )
@@ -160,11 +182,25 @@ def check_timeouts(eval_dir: Path):
160182 if not line .strip ():
161183 continue
162184 row = json .loads (line )
163- for msg in row .get ("conversation" , []):
164- if msg .get ("role" ) == "tool" :
165- content = str (msg .get ("content" , "" ))
166- if timeout_pattern .search (content ):
167- file_timeouts += 1
185+ soft_assert ("conversation" in row , f"Missing conversation in { benchmark } /{ output_path .name } " )
186+ if "conversation" not in row :
187+ continue
188+ for msg in row ["conversation" ]:
189+ soft_assert (
190+ isinstance (msg , dict ),
191+ f"Conversation entry is not a dict in { benchmark } /{ output_path .name } " ,
192+ )
193+ if not isinstance (msg , dict ):
194+ continue
195+ soft_assert ("role" in msg , f"Missing role in { benchmark } /{ output_path .name } " )
196+ if "role" not in msg or msg ["role" ] != "tool" :
197+ continue
198+ soft_assert ("content" in msg , f"Missing content in { benchmark } /{ output_path .name } " )
199+ if "content" not in msg :
200+ continue
201+ content = str (msg ["content" ])
202+ if timeout_pattern .search (content ):
203+ file_timeouts += 1
168204 bench_timeouts += file_timeouts
169205 if file_timeouts > 0 :
170206 print (f"{ benchmark } /{ output_path .name } : num_code_timeouts={ file_timeouts } " )
0 commit comments