Skip to content

Commit 9bbe752

Browse files
committed
feat(accuracy): align AIME25 with lighteval reference (AIP-876 follow-up)
Same lighteval alignment as AIME24, but with the yentinglin/aime_2025 dataset. Bare problem text as user message, generation_size=32768, default_grader=lighteval_expr. Tests pin the same invariants (prompt is bare problem text, n_shots/enable_cot ignored). Reference: trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:142 Signed-off-by: Elias Bermudez <dbermudez@nvidia.com>
1 parent cd239a5 commit 9bbe752

4 files changed

Lines changed: 282 additions & 9 deletions

File tree

docs/accuracy/accuracy-benchmarking.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ system message).
7373
| `mmlu` | `multiple_choice` | 5 | `lighteval/mmlu` (57 subjects) |
7474
| `aime` | `math` | 8 | `Maxwell-Jia/AIME_2024` (trt-llm reference, 8-shot CoT) |
7575
| `aime24` | `lighteval_expr` | 0 | `HuggingFaceH4/aime_2024` (trt-llm/lighteval reference) |
76+
| `aime25` | `lighteval_expr` | 0 | `yentinglin/aime_2025` (trt-llm/lighteval reference) |
7677

7778
## CLI Flags
7879

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,82 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4-
from aiperf.accuracy.models import BenchmarkProblem
4+
"""AIME 2025 benchmark loader, aligned with the trt-llm lighteval reference.
5+
6+
Mirrors ``acc_bench_lighteval.py:aime25``: same ``aime_prompt_fn``,
7+
same zero-shot config, ``generation_size=32768``,
8+
``hf_repo="yentinglin/aime_2025"``. See the AIME24 module for a fuller
9+
explanation of the design.
10+
11+
Reference:
12+
trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:142
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import asyncio
18+
from typing import Any
19+
20+
from datasets import Dataset, load_dataset
21+
22+
from aiperf.accuracy.models import AccuracyChatMessage, BenchmarkProblem
523
from aiperf.common.config import UserConfig
624
from aiperf.common.mixins import AIPerfLoggerMixin
725

26+
# Hugging Face repository that holds the AIME 2025 problems, and the task
# label stamped onto every emitted problem.
DATASET_NAME = "yentinglin/aime_2025"
TASK_NAME = "aime25"

# Generation budget copied from lighteval's aime25 task config
# (``generation_size=32768``).
DEFAULT_GENERATION_SIZE = 32768

# Column names read from each yentinglin/aime_2025 row (the same lowercase
# layout as the HuggingFaceH4 mirror used for AIME24).
PROBLEM_FIELD = "problem"
ANSWER_FIELD = "answer"
36+
837

938
class AIME25Benchmark(AIPerfLoggerMixin):
    """Zero-shot AIME 2025 loader mirroring the lighteval reference task.

    Reads the ``train`` split of ``yentinglin/aime_2025`` and turns every
    row into a ``BenchmarkProblem`` whose only chat message is a user turn
    carrying the unmodified problem statement — the same rendering as
    lighteval's zero-shot ``aime_prompt_fn``. Meant to be graded with
    ``LightevalExprGrader`` for parity with the recipe.
    """

    def __init__(self, user_config: UserConfig, **kwargs: Any) -> None:
        """Store ``user_config`` and forward ``kwargs`` to the mixin."""
        super().__init__(**kwargs)
        self.user_config = user_config

    async def load_problems(
        self, tasks: list[str] | None, n_shots: int, enable_cot: bool
    ) -> list[BenchmarkProblem]:
        """Fetch the dataset and convert it into benchmark problems.

        Both the dataset load and the row conversion are pushed onto a
        worker thread via ``asyncio.to_thread``.

        Args:
            tasks: Unused — AIME25 has no subtasks.
            n_shots: Unused — the lighteval reference is zero-shot.
            enable_cot: Unused — lighteval's ``aime_prompt_fn`` adds no
                CoT trigger.

        Returns:
            One ``BenchmarkProblem`` per dataset row, in dataset order.
        """
        dataset: Dataset = await asyncio.to_thread(
            load_dataset, DATASET_NAME, split="train"
        )
        converted = await asyncio.to_thread(self._build_problems, dataset)
        return converted

    def _build_problems(self, ds: Dataset) -> list[BenchmarkProblem]:
        """Map each row of ``ds`` to a ``BenchmarkProblem``, keeping order."""

        def _as_problem(row: Any) -> BenchmarkProblem:
            # The problem text is used verbatim as both the flat prompt and
            # the single user chat message.
            text = row[PROBLEM_FIELD]
            chat: list[AccuracyChatMessage] = [{"role": "user", "content": text}]
            return BenchmarkProblem(
                prompt=text,
                # The answer is stored as its string form.
                ground_truth=str(row[ANSWER_FIELD]),
                task=TASK_NAME,
                metadata={"generation_size": DEFAULT_GENERATION_SIZE},
                raw_messages=chat,
            )

        return [_as_problem(row) for row in ds]

src/aiperf/plugin/plugins.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1152,11 +1152,12 @@ accuracy_benchmark:
11521152
aime25:
11531153
class: aiperf.accuracy.benchmarks.aime25:AIME25Benchmark
11541154
description: |
1155-
AIME 2025 benchmark with problems from the 2025 competition year.
1155+
AIME 2025 benchmark, aligned with the trt-llm benchmark recipe's
1156+
lighteval-backed configuration (yentinglin/aime_2025 + lighteval
1157+
``expr_gold_metric``).
11561158
metadata:
1157-
default_grader: math
1159+
default_grader: lighteval_expr
11581160
default_n_shots: 0
1159-
is_implemented: false
11601161

11611162
math_500:
11621163
class: aiperf.accuracy.benchmarks.math_500:Math500Benchmark
Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Unit tests for ``AIME25Benchmark`` after lighteval alignment.
5+
6+
Same shape as ``test_aime24_benchmark.py`` — the lighteval reference
7+
config is identical except for the dataset URL.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
from typing import Any
13+
from unittest.mock import MagicMock, patch
14+
15+
import pytest
16+
17+
from aiperf.accuracy.benchmarks.aime25 import (
18+
DEFAULT_GENERATION_SIZE,
19+
TASK_NAME,
20+
AIME25Benchmark,
21+
)
22+
from aiperf.accuracy.models import BenchmarkProblem
23+
from aiperf.common.config import EndpointConfig, UserConfig
24+
from aiperf.common.config.accuracy_config import AccuracyConfig
25+
from aiperf.plugin.enums import AccuracyBenchmarkType, EndpointType
26+
27+
28+
def _make_user_config() -> UserConfig:
    """Build a minimal ``UserConfig`` that selects the AIME25 benchmark."""
    endpoint = EndpointConfig(
        model_names=["test-model"],
        type=EndpointType.COMPLETIONS,
        streaming=False,
    )
    accuracy = AccuracyConfig(benchmark=AccuracyBenchmarkType.AIME25)
    return UserConfig(endpoint=endpoint, accuracy=accuracy)
37+
38+
39+
def _make_row(problem: str = "What is 1+1?", answer: int = 2) -> dict[str, Any]:
40+
return {"problem": problem, "answer": answer}
41+
42+
43+
def _make_fake_dataset(rows: list[dict[str, Any]]) -> MagicMock:
44+
ds = MagicMock()
45+
ds.__iter__ = MagicMock(side_effect=lambda: iter(rows))
46+
ds.__len__ = MagicMock(return_value=len(rows))
47+
ds.__getitem__ = MagicMock(side_effect=lambda i: rows[i])
48+
return ds
49+
50+
51+
class TestPromptIsBareProblemText:
    """The flat prompt and the chat payload are the raw problem statement."""

    @staticmethod
    async def _load(rows: list[dict[str, Any]]) -> list[BenchmarkProblem]:
        """Run ``load_problems`` with ``load_dataset`` patched to yield ``rows``."""
        with patch(
            "aiperf.accuracy.benchmarks.aime25.load_dataset",
            return_value=_make_fake_dataset(rows),
        ):
            loader = AIME25Benchmark(user_config=_make_user_config())
            return await loader.load_problems(
                tasks=None, n_shots=0, enable_cot=False
            )

    @pytest.mark.asyncio
    async def test_flat_prompt_is_problem_text(self) -> None:
        loaded = await self._load([_make_row("Compute the answer.", 42)])
        assert loaded[0].prompt == "Compute the answer."

    @pytest.mark.asyncio
    async def test_no_instruction_prefix(self) -> None:
        loaded = await self._load([_make_row("Q?", 1)])
        text = loaded[0].prompt
        # None of these instruction/CoT markers may be injected into the prompt.
        for marker in ("**Problem**", "competition math", "Let's think", "boxed"):
            assert marker not in text

    @pytest.mark.asyncio
    async def test_chat_message_is_single_user_message(self) -> None:
        loaded = await self._load([_make_row("Q?", 1)])
        chat = loaded[0].raw_messages
        assert chat is not None
        assert len(chat) == 1
        only_message = chat[0]
        assert only_message["role"] == "user"
        assert only_message["content"] == "Q?"
98+
99+
100+
class TestNShotsAndCoTAreIgnored:
    """``n_shots`` and ``enable_cot`` must leave the emitted prompts untouched."""

    @pytest.mark.asyncio
    async def test_n_shots_argument_does_not_affect_prompt(self) -> None:
        fake = _make_fake_dataset([_make_row(f"q{i}", i) for i in range(3)])
        with patch(
            "aiperf.accuracy.benchmarks.aime25.load_dataset", return_value=fake
        ):
            loader = AIME25Benchmark(user_config=_make_user_config())
            # Same loader, same patched dataset; only n_shots differs.
            baseline = await loader.load_problems(
                tasks=None, n_shots=0, enable_cot=False
            )
            shifted = await loader.load_problems(
                tasks=None, n_shots=5, enable_cot=False
            )
        baseline_prompts = [p.prompt for p in baseline]
        shifted_prompts = [p.prompt for p in shifted]
        assert baseline_prompts == shifted_prompts

    @pytest.mark.asyncio
    async def test_enable_cot_does_not_affect_prompt(self) -> None:
        fake = _make_fake_dataset([_make_row("Q?", 1)])
        with patch(
            "aiperf.accuracy.benchmarks.aime25.load_dataset", return_value=fake
        ):
            loader = AIME25Benchmark(user_config=_make_user_config())
            # Same loader, same patched dataset; only enable_cot differs.
            plain = await loader.load_problems(
                tasks=None, n_shots=0, enable_cot=False
            )
            cot = await loader.load_problems(
                tasks=None, n_shots=0, enable_cot=True
            )
        assert plain[0].prompt == cot[0].prompt
128+
129+
130+
class TestLoadProblemsCore:
    """Field-level checks on the ``BenchmarkProblem`` objects that are emitted."""

    @staticmethod
    async def _load(rows: list[dict[str, Any]]) -> list[BenchmarkProblem]:
        """Run ``load_problems`` with ``load_dataset`` patched to yield ``rows``."""
        with patch(
            "aiperf.accuracy.benchmarks.aime25.load_dataset",
            return_value=_make_fake_dataset(rows),
        ):
            loader = AIME25Benchmark(user_config=_make_user_config())
            return await loader.load_problems(
                tasks=None, n_shots=0, enable_cot=False
            )

    @pytest.mark.asyncio
    async def test_returns_one_problem_per_row(self) -> None:
        loaded = await self._load([_make_row(f"q{i}", i) for i in range(5)])
        assert len(loaded) == 5
        assert all(isinstance(item, BenchmarkProblem) for item in loaded)

    @pytest.mark.asyncio
    async def test_ground_truth_is_string_form_of_answer(self) -> None:
        loaded = await self._load([_make_row("q", 42)])
        # The integer answer column is stored as its string form.
        assert loaded[0].ground_truth == "42"

    @pytest.mark.asyncio
    async def test_task_name_is_aime25(self) -> None:
        loaded = await self._load([_make_row("q", 1)])
        assert loaded[0].task == TASK_NAME

    @pytest.mark.asyncio
    async def test_generation_size_is_32k(self) -> None:
        loaded = await self._load([_make_row("q", 1)])
        budget = loaded[0].metadata["generation_size"]
        assert budget == DEFAULT_GENERATION_SIZE
        # Pin the literal too, so a silent change to the constant is caught.
        assert DEFAULT_GENERATION_SIZE == 32768
184+
185+
186+
class TestPathologicalDatasetRows:
    """Edge-case datasets: no rows at all, and non-ASCII problem text."""

    @staticmethod
    async def _load(rows: list[dict[str, Any]]) -> list[BenchmarkProblem]:
        """Run ``load_problems`` with ``load_dataset`` patched to yield ``rows``."""
        with patch(
            "aiperf.accuracy.benchmarks.aime25.load_dataset",
            return_value=_make_fake_dataset(rows),
        ):
            loader = AIME25Benchmark(user_config=_make_user_config())
            return await loader.load_problems(
                tasks=None, n_shots=0, enable_cot=False
            )

    @pytest.mark.asyncio
    async def test_empty_dataset_returns_empty_list(self) -> None:
        loaded = await self._load([])
        assert loaded == []

    @pytest.mark.asyncio
    async def test_unicode_problem_text_preserved(self) -> None:
        loaded = await self._load([_make_row("Solve ∑₁ⁿ k² for n=10. ✓", 385)])
        assert "∑₁ⁿ" in loaded[0].prompt

0 commit comments

Comments
 (0)