Skip to content

Commit 6bdefa5

Browse files
committed
update range
Signed-off-by: Wei Du <wedu@nvidia.com>
1 parent df54e02 commit 6bdefa5

2 files changed

Lines changed: 65 additions & 28 deletions

File tree

tests/slurm-tests/nano_30b_eval/check_results.py

Lines changed: 47 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424
from utils import assert_all, get_nested_value, load_json, soft_assert # noqa: E402
2525

2626
NO_TOOLS_METRICS = {
27-
"aime25": ("pass@1[avg-of-4]", "symbolic_correct", (84.0, 94.0)),
27+
"aime25": ("pass@1[avg-of-4]", "symbolic_correct", (88.0, 94.0)),
2828
"gpqa": ("pass@1[avg-of-4]", "symbolic_correct", (69.0, 76.0)),
2929
"mmlu-pro": ("pass@1", "symbolic_correct", (74.0, 82.0)),
3030
"ifbench": ("pass@1[avg-of-5]", "average_score", (66.0, 77.0)),
@@ -40,7 +40,7 @@
4040
}
4141

4242
WITH_TOOLS_METRICS = {
43-
"aime25": ("pass@1[avg-of-4]", "symbolic_correct", (88.0, 100.0)),
43+
"aime25": ("pass@1[avg-of-4]", "symbolic_correct", (95.0, 100.0)),
4444
"gpqa": ("pass@1[avg-of-4]", "symbolic_correct", (72.0, 78.0)),
4545
"hle": ("pass@1", "judge_correct", (13.0, 19.0)),
4646
}
@@ -95,13 +95,21 @@ def check_metric_group(
9595
for benchmark, (agg_key, field, (lo, hi)) in metric_config.items():
9696
metrics_path, metrics, benchmark_label = resolve_metrics_entry(eval_dir, benchmark)
9797
soft_assert(agg_key in metrics, f"Missing aggregation key {agg_key} in {metrics_path}")
98+
if agg_key not in metrics:
99+
continue
100+
agg_metrics = metrics[agg_key]
98101
if isinstance(field, tuple):
99-
value = get_nested_value(metrics[agg_key], field)
102+
value = get_nested_value(agg_metrics, field)
100103
field_label = "/".join(field)
101104
else:
102-
value = metrics[agg_key].get(field)
105+
soft_assert(field in agg_metrics, f"Missing field {field} in {metrics_path}")
106+
if field not in agg_metrics:
107+
continue
108+
value = agg_metrics[field]
103109
field_label = field
104110
soft_assert(value is not None, f"Missing field {field_label} in {metrics_path}")
111+
if value is None:
112+
continue
105113
value = normalize_percent(float(value))
106114
print(f"{eval_dir.name}/{benchmark_label}/{agg_key}/{field_label}: {value}")
107115
soft_assert(lo <= value <= hi, f"{benchmark}: {field_label}={value} out of range [{lo}, {hi}]")
@@ -128,9 +136,23 @@ def check_tool_usage(eval_dir: Path):
128136
bench_dir = eval_dir / "eval-results" / benchmark
129137
for _, row in iter_output_rows(bench_dir):
130138
total_samples += 1
131-
if row.get("num_tool_calls", 0) > 0:
139+
soft_assert("num_tool_calls" in row, f"Missing num_tool_calls in {benchmark} output row")
140+
soft_assert("conversation" in row, f"Missing conversation in {benchmark} output row")
141+
if "num_tool_calls" not in row or "conversation" not in row:
142+
continue
143+
if row["num_tool_calls"] > 0:
132144
samples_with_tools += 1
133-
if any(msg.get("role") == "tool" for msg in row.get("conversation", [])):
145+
has_tool_message = False
146+
for msg in row["conversation"]:
147+
soft_assert(isinstance(msg, dict), f"Conversation entry is not a dict in {benchmark} output row")
148+
if not isinstance(msg, dict):
149+
continue
150+
soft_assert("role" in msg, f"Missing role in {benchmark} conversation entry")
151+
if "role" not in msg:
152+
continue
153+
if msg["role"] == "tool":
154+
has_tool_message = True
155+
if has_tool_message:
134156
samples_with_tool_messages += 1
135157

136158
soft_assert(total_samples > 0, "No samples found in with_tools outputs")
@@ -160,11 +182,25 @@ def check_timeouts(eval_dir: Path):
160182
if not line.strip():
161183
continue
162184
row = json.loads(line)
163-
for msg in row.get("conversation", []):
164-
if msg.get("role") == "tool":
165-
content = str(msg.get("content", ""))
166-
if timeout_pattern.search(content):
167-
file_timeouts += 1
185+
soft_assert("conversation" in row, f"Missing conversation in {benchmark}/{output_path.name}")
186+
if "conversation" not in row:
187+
continue
188+
for msg in row["conversation"]:
189+
soft_assert(
190+
isinstance(msg, dict),
191+
f"Conversation entry is not a dict in {benchmark}/{output_path.name}",
192+
)
193+
if not isinstance(msg, dict):
194+
continue
195+
soft_assert("role" in msg, f"Missing role in {benchmark}/{output_path.name}")
196+
if "role" not in msg or msg["role"] != "tool":
197+
continue
198+
soft_assert("content" in msg, f"Missing content in {benchmark}/{output_path.name}")
199+
if "content" not in msg:
200+
continue
201+
content = str(msg["content"])
202+
if timeout_pattern.search(content):
203+
file_timeouts += 1
168204
bench_timeouts += file_timeouts
169205
if file_timeouts > 0:
170206
print(f"{benchmark}/{output_path.name}: num_code_timeouts={file_timeouts}")

tests/slurm-tests/nano_30b_eval/run_test.py

Lines changed: 18 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,8 @@
1616

1717
import argparse
1818

19-
from nemo_skills.pipeline.cli import eval, prepare_data, run_cmd, wrap_arguments
19+
from nemo_skills.pipeline.cli import eval as run_eval
20+
from nemo_skills.pipeline.cli import prepare_data, run_cmd, wrap_arguments
2021

2122
MODEL = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
2223
MODEL_DIRNAME = MODEL.split("/")[-1]
@@ -115,7 +116,7 @@ def eval_no_tools(
115116
expnames = []
116117

117118
expname = f"{expname_prefix}-no-tools-aime25"
118-
eval(
119+
run_eval(
119120
ctx=wrap_arguments(NO_TOOLS_PARAMS),
120121
cluster=cluster,
121122
model=get_local_model_path(workspace),
@@ -135,7 +136,7 @@ def eval_no_tools(
135136
expnames.append(expname)
136137

137138
expname = f"{expname_prefix}-no-tools-gpqa"
138-
eval(
139+
run_eval(
139140
ctx=wrap_arguments(NO_TOOLS_PARAMS + "++prompt_config=eval/aai/mcq-4choices-boxed "),
140141
cluster=cluster,
141142
model=get_local_model_path(workspace),
@@ -155,7 +156,7 @@ def eval_no_tools(
155156
expnames.append(expname)
156157

157158
expname = f"{expname_prefix}-no-tools-mmlu-pro"
158-
eval(
159+
run_eval(
159160
ctx=wrap_arguments(NO_TOOLS_PARAMS + "++prompt_config=eval/aai/mcq-10choices-boxed "),
160161
cluster=cluster,
161162
model=get_local_model_path(workspace),
@@ -175,7 +176,7 @@ def eval_no_tools(
175176
expnames.append(expname)
176177

177178
expname = f"{expname_prefix}-no-tools-ifbench"
178-
eval(
179+
run_eval(
179180
ctx=wrap_arguments(
180181
NO_TOOLS_PARAMS + "++generation_key=response ++prompt_config=generic/default ++eval_type=ifbench "
181182
),
@@ -197,7 +198,7 @@ def eval_no_tools(
197198
expnames.append(expname)
198199

199200
expname = f"{expname_prefix}-no-tools-livecodebench"
200-
eval(
201+
run_eval(
201202
ctx=wrap_arguments(NO_TOOLS_PARAMS),
202203
cluster=cluster,
203204
model=get_local_model_path(workspace),
@@ -218,7 +219,7 @@ def eval_no_tools(
218219
expnames.append(expname)
219220

220221
expname = f"{expname_prefix}-no-tools-arena-hard-v2"
221-
eval(
222+
run_eval(
222223
ctx=wrap_arguments(NO_TOOLS_PARAMS),
223224
cluster=cluster,
224225
model=get_local_model_path(workspace),
@@ -242,7 +243,7 @@ def eval_no_tools(
242243
expnames.append(expname)
243244

244245
expname = f"{expname_prefix}-no-tools-scicode"
245-
eval(
246+
run_eval(
246247
ctx=wrap_arguments(NO_TOOLS_PARAMS + "++prompt_config=eval/scicode/background ++eval_type=scicode "),
247248
cluster=cluster,
248249
model=get_local_model_path(workspace),
@@ -262,7 +263,7 @@ def eval_no_tools(
262263
expnames.append(expname)
263264

264265
expname = f"{expname_prefix}-no-tools-hle"
265-
eval(
266+
run_eval(
266267
ctx=wrap_arguments(NO_TOOLS_PARAMS),
267268
cluster=cluster,
268269
model=get_local_model_path(workspace),
@@ -287,7 +288,7 @@ def eval_no_tools(
287288
expnames.append(expname)
288289

289290
expname = f"{expname_prefix}-no-tools-aalcr"
290-
eval(
291+
run_eval(
291292
ctx=wrap_arguments(NO_TOOLS_PARAMS),
292293
cluster=cluster,
293294
model=get_local_model_path(workspace),
@@ -312,7 +313,7 @@ def eval_no_tools(
312313
expnames.append(expname)
313314

314315
expname = f"{expname_prefix}-no-tools-mmlu-prox"
315-
eval(
316+
run_eval(
316317
ctx=wrap_arguments(NO_TOOLS_PARAMS + "++prompt_config=generic/default ++eval_type=multichoice "),
317318
cluster=cluster,
318319
model=get_local_model_path(workspace),
@@ -332,7 +333,7 @@ def eval_no_tools(
332333
expnames.append(expname)
333334

334335
expname = f"{expname_prefix}-no-tools-wmt24pp"
335-
eval(
336+
run_eval(
336337
ctx=wrap_arguments(NO_TOOLS_PARAMS + "++prompt_config=multilingual/segment-translation "),
337338
cluster=cluster,
338339
model=get_local_model_path(workspace),
@@ -373,7 +374,7 @@ def eval_with_tools(
373374
expnames = []
374375

375376
expname = f"{expname_prefix}-with-tools-aime25"
376-
eval(
377+
run_eval(
377378
ctx=wrap_arguments(WITH_TOOLS_COMMON_PARAMS + "++prompt_config=qwen/math-tir "),
378379
cluster=cluster,
379380
model=get_local_model_path(workspace),
@@ -394,7 +395,7 @@ def eval_with_tools(
394395
expnames.append(expname)
395396

396397
expname = f"{expname_prefix}-with-tools-gpqa"
397-
eval(
398+
run_eval(
398399
ctx=wrap_arguments(WITH_TOOLS_COMMON_PARAMS + "++prompt_config=eval/aai/mcq-4choices-boxed "),
399400
cluster=cluster,
400401
model=get_local_model_path(workspace),
@@ -415,7 +416,7 @@ def eval_with_tools(
415416
expnames.append(expname)
416417

417418
expname = f"{expname_prefix}-with-tools-hle"
418-
eval(
419+
run_eval(
419420
ctx=wrap_arguments(WITH_TOOLS_COMMON_PARAMS + "++prompt_config=generic/hle "),
420421
cluster=cluster,
421422
model=get_local_model_path(workspace),
@@ -458,7 +459,7 @@ def eval_formal_math(
458459
enable_tools=False,
459460
)
460461
expname = f"{expname_prefix}-formal-math-pass32"
461-
eval(
462+
run_eval(
462463
ctx=wrap_arguments(FORMAL_MATH_PARAMS),
463464
cluster=cluster,
464465
model=get_local_model_path(workspace),
@@ -497,7 +498,7 @@ def eval_agentic(
497498
+ " --async-scheduling --enforce-eager"
498499
)
499500
expname = f"{expname_prefix}-agentic-openhands"
500-
eval(
501+
run_eval(
501502
ctx=wrap_arguments(AGENTIC_PARAMS),
502503
cluster=cluster,
503504
model=get_local_model_path(workspace),

0 commit comments

Comments
 (0)