Skip to content

Commit ed0edf6

Browse files
committed
feat(accuracy): align AIME24 with lighteval reference (AIP-875 follow-up)
Switches AIME24 to lighteval-aligned grading per the trt-llm benchmark recipe's `acc_bench_lighteval.py:aime24`. Stacks on the lighteval foundation landed on branch 874.

Loader:
- Prompt is now the bare `line["problem"]` text — what lighteval's `aime_prompt_fn` + `PromptManager` produce when `few_shots_split=None`. No instruction prefix, no `**Problem**:` wrapping, no CoT trigger.
- `n_shots` and `enable_cot` parameters are accepted for protocol uniformity but ignored (the reference is zero-shot, no CoT trigger). Tests pin both behaviors.
- `generation_size=32768` to match lighteval's `aime24` task config.
- Removed dependency on `aime.py`'s `INSTRUCTION_PREFIX` / `DEFAULT_GENERATION_SIZE` (those are AIME-specific now; `aime24` follows lighteval, not the AIMETemplate path).

plugins.yaml:
- `aime24.metadata.default_grader`: `math` → `lighteval_expr` (the recipe's `expr_gold_metric` configuration of `MultilingualExtractiveMatchMetric`).

Tests (rewritten):
- 11 tests covering: prompt is bare problem text, no instruction prefix in any form, single user message, n_shots ignored, enable_cot ignored, ground_truth stringification, task name, generation_size, empty/unicode dataset rows.
- Old assertions (recipe's `**Problem**: ... **Solution**: ...` format from the previous AIME implementation) are intentionally dropped since AIME24 follows lighteval, not AIMETemplate.

Documentation:
- docs/accuracy/accuracy-benchmarking.md availability table updated to show AIME24's `default_grader=lighteval_expr` and `n_shots=0`.

Signed-off-by: Elias Bermudez <dbermudez@nvidia.com>
1 parent 07ab7c7 commit ed0edf6

3 files changed

Lines changed: 139 additions & 342 deletions

File tree

Lines changed: 56 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,31 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4-
"""AIME 2024 benchmark loader, ported from lighteval's aime24 task config.
5-
6-
Loads the ``HuggingFaceH4/aime_2024`` dataset (LightEval's canonical
7-
AIME 2024 mirror, with lowercase ``problem``/``answer`` field names) and
8-
formats each problem the same way as :mod:`aiperf.accuracy.benchmarks.aime`
9-
so the prompt + chat construction stays consistent across the AIME family.
10-
The split between ``aime`` and ``aime24`` is deliberate: ``aime`` is the
11-
year-agnostic identifier (DeepEval/Maxwell-Jia capitalized schema), while
12-
``aime24`` pins to lighteval's canonical mirror so users running
13-
side-by-side comparisons against lighteval get matching prompts.
14-
15-
lighteval reference: lighteval/src/lighteval/tasks/extended/aime/main.py
4+
"""AIME 2024 benchmark loader, aligned with the trt-llm lighteval reference.
5+
6+
Mirrors the recipe's ``acc_bench_lighteval.py`` configuration:
7+
8+
aime24 = LightevalTaskConfig(
9+
name="aime24",
10+
prompt_function=aime_prompt_fn,
11+
hf_repo="HuggingFaceH4/aime_2024",
12+
hf_subset="default",
13+
evaluation_splits=["train"],
14+
few_shots_split=None,
15+
few_shots_select=None,
16+
generation_size=32768,
17+
metric=[expr_gold_metric],
18+
)
19+
20+
The recipe's ``aime_prompt_fn`` produces a ``Doc`` whose ``query`` is
21+
the bare problem text — lighteval's prompt manager wraps it as a
22+
single user message with no instruction prefix and no few-shot
23+
priming (``few_shots_split=None``). We emit prompts the same way.
24+
Pair with ``LightevalExprGrader`` for the recipe's ``expr_gold_metric``
25+
extraction.
26+
27+
Reference:
28+
trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:128
1629
"""
1730

1831
from __future__ import annotations
@@ -22,33 +35,31 @@
2235

2336
from datasets import Dataset, load_dataset
2437

25-
from aiperf.accuracy.benchmarks.aime import (
26-
DEFAULT_GENERATION_SIZE,
27-
INSTRUCTION_PREFIX,
28-
)
2938
from aiperf.accuracy.models import AccuracyChatMessage, BenchmarkProblem
3039
from aiperf.common.config import UserConfig
3140
from aiperf.common.mixins import AIPerfLoggerMixin
3241

3342
DATASET_NAME = "HuggingFaceH4/aime_2024"
3443
TASK_NAME = "aime24"
3544

36-
# Field names in the HuggingFaceH4/aime_2024 schema (lowercase, distinct
37-
# from the Maxwell-Jia mirror used by AIMEBenchmark).
45+
# lighteval's aime24 task config: ``generation_size=32768`` to give
46+
# reasoning models room to think before emitting the boxed answer.
47+
DEFAULT_GENERATION_SIZE = 32768
48+
49+
# Schema field names in HuggingFaceH4/aime_2024 (lowercase, lighteval
50+
# canonical — distinct from the Maxwell-Jia mirror used by ``aime``).
3851
PROBLEM_FIELD = "problem"
3952
ANSWER_FIELD = "answer"
4053

4154

4255
class AIME24Benchmark(AIPerfLoggerMixin):
43-
"""AIME 2024 benchmark loader (lighteval canonical schema).
44-
45-
Loads competition problems from ``HuggingFaceH4/aime_2024`` (train
46-
split) and produces ``BenchmarkProblem`` objects ready for both the
47-
completions endpoint (flat ``prompt``) and the chat endpoint
48-
(``raw_messages``). Pairs with ``MathGrader`` for numerical
49-
equivalence; instruction prefix and generation size are reused from
50-
:mod:`aiperf.accuracy.benchmarks.aime` so the prompt format stays in
51-
lockstep across the AIME family.
56+
"""AIME 2024 lighteval-aligned benchmark loader.
57+
58+
Loads ``HuggingFaceH4/aime_2024`` (train split) and emits one user
59+
message per problem containing the bare problem text — the format
60+
lighteval's ``aime_prompt_fn`` + ``PromptManager`` produce when
61+
``few_shots_split=None``. Pair with ``LightevalExprGrader`` for
62+
grading parity with the recipe.
5263
"""
5364

5465
def __init__(self, user_config: UserConfig, **kwargs: Any) -> None:
@@ -58,114 +69,38 @@ def __init__(self, user_config: UserConfig, **kwargs: Any) -> None:
5869
async def load_problems(
5970
self, tasks: list[str] | None, n_shots: int, enable_cot: bool
6071
) -> list[BenchmarkProblem]:
61-
"""Load every AIME 2024 problem and format it for the LLM.
72+
"""Load AIME24 problems and format them lighteval-style.
6273
6374
Args:
64-
tasks: Ignored — AIME 2024 has no subtasks. Accepted for
65-
protocol parity with benchmarks that do filter.
66-
n_shots: Number of few-shot examples to prepend (drawn from
67-
the start of the dataset). 0 disables few-shot prompting.
68-
enable_cot: When True, append ``Let's think step by step.`` to
69-
each query.
75+
tasks: Ignored — AIME24 has no subtasks.
76+
n_shots: Ignored — the lighteval reference is zero-shot
77+
(``few_shots_split=None``); accepting the parameter
78+
keeps the protocol uniform but emitting few-shots
79+
here would diverge from the reference.
80+
enable_cot: Ignored — lighteval's ``aime_prompt_fn`` does
81+
not add a CoT trigger; the model decides whether to
82+
reason based on the system prompt the user provides
83+
via ``--accuracy-system-prompt``.
7084
7185
Returns:
72-
One ``BenchmarkProblem`` per dataset row, in dataset order.
86+
One ``BenchmarkProblem`` per dataset row, in dataset
87+
order.
7388
"""
7489
ds: Dataset = await asyncio.to_thread(load_dataset, DATASET_NAME, split="train")
75-
return await asyncio.to_thread(self._build_problems, ds, n_shots, enable_cot)
90+
return await asyncio.to_thread(self._build_problems, ds)
7691

77-
def _build_problems(
78-
self, ds: Dataset, n_shots: int, enable_cot: bool
79-
) -> list[BenchmarkProblem]:
80-
few_shots = self._build_few_shots(ds, n_shots)
92+
def _build_problems(self, ds: Dataset) -> list[BenchmarkProblem]:
8193
problems: list[BenchmarkProblem] = []
8294
for row in ds:
83-
prompt = self._format_prompt(row, few_shots, enable_cot)
84-
raw_messages = self._build_chat_messages(row, few_shots, enable_cot)
95+
problem = row[PROBLEM_FIELD]
96+
messages: list[AccuracyChatMessage] = [{"role": "user", "content": problem}]
8597
problems.append(
8698
BenchmarkProblem(
87-
prompt=prompt,
99+
prompt=problem,
88100
ground_truth=str(row[ANSWER_FIELD]),
89101
task=TASK_NAME,
90102
metadata={"generation_size": DEFAULT_GENERATION_SIZE},
91-
raw_messages=raw_messages,
103+
raw_messages=messages,
92104
)
93105
)
94106
return problems
95-
96-
def _build_few_shots(self, ds: Dataset, n_shots: int) -> list[dict[str, str]]:
97-
"""Few-shot examples drawn sequentially from the start of the split.
98-
99-
The HuggingFaceH4 mirror has no separate dev/validation split, so
100-
early problems can appear in their own prompts; lighteval makes
101-
the same trade-off when no held-out pool is available.
102-
"""
103-
if n_shots <= 0:
104-
return []
105-
size = min(n_shots, len(ds))
106-
return [self._format_example(ds[i]) for i in range(size)]
107-
108-
def _format_example(self, row: dict[str, Any]) -> dict[str, str]:
109-
"""Format a dataset row as a few-shot example with ``\\boxed{}``."""
110-
answer = str(row[ANSWER_FIELD])
111-
problem = row[PROBLEM_FIELD]
112-
return {
113-
"problem": problem,
114-
"answer": answer,
115-
"formatted": f"Problem: {problem}\nAnswer: \\boxed{{{answer}}}",
116-
}
117-
118-
def _format_prompt(
119-
self,
120-
row: dict[str, Any],
121-
few_shots: list[dict[str, str]],
122-
enable_cot: bool,
123-
) -> str:
124-
"""Build the flat completions prompt: instruction + shots + query."""
125-
few_shot_text = "\n\n".join(ex["formatted"] for ex in few_shots)
126-
if few_shot_text:
127-
few_shot_text += "\n\n"
128-
129-
problem = row[PROBLEM_FIELD]
130-
if enable_cot:
131-
query = f"Problem: {problem}\nLet's think step by step.\nAnswer:"
132-
else:
133-
query = f"Problem: {problem}\nAnswer:"
134-
135-
return INSTRUCTION_PREFIX + few_shot_text + query
136-
137-
def _build_chat_messages(
138-
self,
139-
row: dict[str, Any],
140-
few_shots: list[dict[str, str]],
141-
enable_cot: bool,
142-
) -> list[AccuracyChatMessage]:
143-
"""Build multi-turn chat messages following lighteval's PromptManager.
144-
145-
Identical structure to :class:`aiperf.accuracy.benchmarks.aime.AIMEBenchmark`:
146-
instruction lives on the first user message, assistant primers
147-
contain ``\\boxed{answer}``, and the trailing user message has no
148-
re-instruction unless there were zero few-shots.
149-
"""
150-
messages: list[AccuracyChatMessage] = []
151-
152-
for ix, ex in enumerate(few_shots):
153-
q = f"Problem: {ex['problem']}\nAnswer:"
154-
if ix == 0:
155-
q = INSTRUCTION_PREFIX + q
156-
messages.append({"role": "user", "content": q})
157-
messages.append(
158-
{"role": "assistant", "content": f"\\boxed{{{ex['answer']}}}"}
159-
)
160-
161-
problem = row[PROBLEM_FIELD]
162-
if enable_cot:
163-
main_q = f"Problem: {problem}\nLet's think step by step.\nAnswer:"
164-
else:
165-
main_q = f"Problem: {problem}\nAnswer:"
166-
167-
if not few_shots:
168-
main_q = INSTRUCTION_PREFIX + main_q
169-
170-
messages.append({"role": "user", "content": main_q})
171-
return messages

src/aiperf/plugin/plugins.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,11 +1142,12 @@ accuracy_benchmark:
11421142
aime24:
11431143
class: aiperf.accuracy.benchmarks.aime24:AIME24Benchmark
11441144
description: |
1145-
AIME 2024 benchmark with problems from the 2024 competition year.
1145+
AIME 2024 benchmark, aligned with the trt-llm benchmark recipe's
1146+
lighteval-backed configuration (HuggingFaceH4/aime_2024 + lighteval
1147+
``expr_gold_metric``).
11461148
metadata:
1147-
default_grader: math
1149+
default_grader: lighteval_expr
11481150
default_n_shots: 0
1149-
is_implemented: false
11501151

11511152
aime25:
11521153
class: aiperf.accuracy.benchmarks.aime25:AIME25Benchmark

0 commit comments

Comments
 (0)