11# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22# SPDX-License-Identifier: Apache-2.0
33
4- """AIME 2024 benchmark loader, ported from lighteval's aime24 task config.
5-
6- Loads the ``HuggingFaceH4/aime_2024`` dataset (LightEval's canonical
7- AIME 2024 mirror, with lowercase ``problem``/``answer`` field names) and
8- formats each problem the same way as :mod:`aiperf.accuracy.benchmarks.aime`
9- so the prompt + chat construction stays consistent across the AIME family.
10- The split between ``aime`` and ``aime24`` is deliberate: ``aime`` is the
11- year-agnostic identifier (DeepEval/Maxwell-Jia capitalized schema), while
12- ``aime24`` pins to lighteval's canonical mirror so users running
13- side-by-side comparisons against lighteval get matching prompts.
14-
15- lighteval reference: lighteval/src/lighteval/tasks/extended/aime/main.py
4+ """AIME 2024 benchmark loader, aligned with the trt-llm lighteval reference.
5+
6+ Mirrors the recipe's ``acc_bench_lighteval.py`` configuration:
7+
8+     aime24 = LightevalTaskConfig(
9+         name="aime24",
10+         prompt_function=aime_prompt_fn,
11+         hf_repo="HuggingFaceH4/aime_2024",
12+         hf_subset="default",
13+         evaluation_splits=["train"],
14+         few_shots_split=None,
15+         few_shots_select=None,
16+         generation_size=32768,
17+         metric=[expr_gold_metric],
18+     )
19+
20+ The recipe's ``aime_prompt_fn`` produces a ``Doc`` whose ``query`` is
21+ the bare problem text — lighteval's prompt manager wraps it as a
22+ single user message with no instruction prefix and no few-shot
23+ priming (``few_shots_split=None``). We emit prompts the same way.
24+ Pair with ``LightevalExprGrader`` for the recipe's ``expr_gold_metric``
25+ extraction.
26+
27+ Reference:
28+ trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:128
1629"""
1730
1831from __future__ import annotations
2235
2336from datasets import Dataset , load_dataset
2437
25- from aiperf .accuracy .benchmarks .aime import (
26- DEFAULT_GENERATION_SIZE ,
27- INSTRUCTION_PREFIX ,
28- )
2938from aiperf .accuracy .models import AccuracyChatMessage , BenchmarkProblem
3039from aiperf .common .config import UserConfig
3140from aiperf .common .mixins import AIPerfLoggerMixin
3241
3342DATASET_NAME = "HuggingFaceH4/aime_2024"
3443TASK_NAME = "aime24"
3544
36- # Field names in the HuggingFaceH4/aime_2024 schema (lowercase, distinct
37- # from the Maxwell-Jia mirror used by AIMEBenchmark).
45+ # lighteval's aime24 task config sets ``generation_size=32768`` so that
46+ # reasoning models have room to think before emitting the boxed answer.
47+ DEFAULT_GENERATION_SIZE = 32768
48+
49+ # Schema field names in HuggingFaceH4/aime_2024 (lowercase, lighteval
50+ # canonical — distinct from the Maxwell-Jia mirror used by ``aime``).
3851PROBLEM_FIELD = "problem"
3952ANSWER_FIELD = "answer"
4053
4154
4255class AIME24Benchmark (AIPerfLoggerMixin ):
43- """AIME 2024 benchmark loader (lighteval canonical schema).
44-
45- Loads competition problems from ``HuggingFaceH4/aime_2024`` (train
46- split) and produces ``BenchmarkProblem`` objects ready for both the
47- completions endpoint (flat ``prompt``) and the chat endpoint
48- (``raw_messages``). Pairs with ``MathGrader`` for numerical
49- equivalence; instruction prefix and generation size are reused from
50- :mod:`aiperf.accuracy.benchmarks.aime` so the prompt format stays in
51- lockstep across the AIME family.
56+ """AIME 2024 lighteval-aligned benchmark loader.
57+
58+ Loads ``HuggingFaceH4/aime_2024`` (train split) and emits one user
59+ message per problem containing the bare problem text — the format
60+ lighteval's ``aime_prompt_fn`` + ``PromptManager`` produce when
61+ ``few_shots_split=None``. Pair with ``LightevalExprGrader`` for
62+ grading parity with the recipe.
5263 """
5364
5465 def __init__ (self , user_config : UserConfig , ** kwargs : Any ) -> None :
@@ -58,114 +69,38 @@ def __init__(self, user_config: UserConfig, **kwargs: Any) -> None:
5869 async def load_problems (
5970 self , tasks : list [str ] | None , n_shots : int , enable_cot : bool
6071 ) -> list [BenchmarkProblem ]:
61- """Load every AIME 2024 problem and format it for the LLM.
72+ """Load AIME24 problems and format them lighteval-style.
6273
6374 Args:
64- tasks: Ignored — AIME 2024 has no subtasks. Accepted for
65- protocol parity with benchmarks that do filter.
66- n_shots: Number of few-shot examples to prepend (drawn from
67- the start of the dataset). 0 disables few-shot prompting.
68- enable_cot: When True, append ``Let's think step by step.`` to
69- each query.
75+ tasks: Ignored — AIME24 has no subtasks.
76+ n_shots: Ignored — the lighteval reference is zero-shot
77+ (``few_shots_split=None``); accepting the parameter
78+ keeps the protocol uniform but emitting few-shots
79+ here would diverge from the reference.
80+ enable_cot: Ignored — lighteval's ``aime_prompt_fn`` does
81+ not add a CoT trigger; the model decides whether to
82+ reason based on the system prompt the user provides
83+ via ``--accuracy-system-prompt``.
7084
7185 Returns:
72- One ``BenchmarkProblem`` per dataset row, in dataset order.
86+ One ``BenchmarkProblem`` per dataset row, in dataset
87+ order.
7388 """
7489 ds : Dataset = await asyncio .to_thread (load_dataset , DATASET_NAME , split = "train" )
75- return await asyncio .to_thread (self ._build_problems , ds , n_shots , enable_cot )
90+ return await asyncio .to_thread (self ._build_problems , ds )
7691
77- def _build_problems (
78- self , ds : Dataset , n_shots : int , enable_cot : bool
79- ) -> list [BenchmarkProblem ]:
80- few_shots = self ._build_few_shots (ds , n_shots )
92+ def _build_problems (self , ds : Dataset ) -> list [BenchmarkProblem ]:
8193 problems : list [BenchmarkProblem ] = []
8294 for row in ds :
83- prompt = self . _format_prompt ( row , few_shots , enable_cot )
84- raw_messages = self . _build_chat_messages ( row , few_shots , enable_cot )
95+ problem = row [ PROBLEM_FIELD ]
96+ messages : list [ AccuracyChatMessage ] = [{ "role" : "user" , "content" : problem }]
8597 problems .append (
8698 BenchmarkProblem (
87- prompt = prompt ,
99+ prompt = problem ,
88100 ground_truth = str (row [ANSWER_FIELD ]),
89101 task = TASK_NAME ,
90102 metadata = {"generation_size" : DEFAULT_GENERATION_SIZE },
91- raw_messages = raw_messages ,
103+ raw_messages = messages ,
92104 )
93105 )
94106 return problems
95-
96- def _build_few_shots (self , ds : Dataset , n_shots : int ) -> list [dict [str , str ]]:
97- """Few-shot examples drawn sequentially from the start of the split.
98-
99- The HuggingFaceH4 mirror has no separate dev/validation split, so
100- early problems can appear in their own prompts; lighteval makes
101- the same trade-off when no held-out pool is available.
102- """
103- if n_shots <= 0 :
104- return []
105- size = min (n_shots , len (ds ))
106- return [self ._format_example (ds [i ]) for i in range (size )]
107-
108- def _format_example (self , row : dict [str , Any ]) -> dict [str , str ]:
109- """Format a dataset row as a few-shot example with ``\\boxed{}``."""
110- answer = str (row [ANSWER_FIELD ])
111- problem = row [PROBLEM_FIELD ]
112- return {
113- "problem" : problem ,
114- "answer" : answer ,
115- "formatted" : f"Problem: { problem } \n Answer: \\ boxed{{{ answer } }}" ,
116- }
117-
118- def _format_prompt (
119- self ,
120- row : dict [str , Any ],
121- few_shots : list [dict [str , str ]],
122- enable_cot : bool ,
123- ) -> str :
124- """Build the flat completions prompt: instruction + shots + query."""
125- few_shot_text = "\n \n " .join (ex ["formatted" ] for ex in few_shots )
126- if few_shot_text :
127- few_shot_text += "\n \n "
128-
129- problem = row [PROBLEM_FIELD ]
130- if enable_cot :
131- query = f"Problem: { problem } \n Let's think step by step.\n Answer:"
132- else :
133- query = f"Problem: { problem } \n Answer:"
134-
135- return INSTRUCTION_PREFIX + few_shot_text + query
136-
137- def _build_chat_messages (
138- self ,
139- row : dict [str , Any ],
140- few_shots : list [dict [str , str ]],
141- enable_cot : bool ,
142- ) -> list [AccuracyChatMessage ]:
143- """Build multi-turn chat messages following lighteval's PromptManager.
144-
145- Identical structure to :class:`aiperf.accuracy.benchmarks.aime.AIMEBenchmark`:
146- instruction lives on the first user message, assistant primers
147- contain ``\\boxed{answer}``, and the trailing user message has no
148- re-instruction unless there were zero few-shots.
149- """
150- messages : list [AccuracyChatMessage ] = []
151-
152- for ix , ex in enumerate (few_shots ):
153- q = f"Problem: { ex ['problem' ]} \n Answer:"
154- if ix == 0 :
155- q = INSTRUCTION_PREFIX + q
156- messages .append ({"role" : "user" , "content" : q })
157- messages .append (
158- {"role" : "assistant" , "content" : f"\\ boxed{{{ ex ['answer' ]} }}" }
159- )
160-
161- problem = row [PROBLEM_FIELD ]
162- if enable_cot :
163- main_q = f"Problem: { problem } \n Let's think step by step.\n Answer:"
164- else :
165- main_q = f"Problem: { problem } \n Answer:"
166-
167- if not few_shots :
168- main_q = INSTRUCTION_PREFIX + main_q
169-
170- messages .append ({"role" : "user" , "content" : main_q })
171- return messages
0 commit comments