feat(accuracy): align AIME24 with lighteval reference (AIP-875 follow-up)

debermudez · debermudez · commit 05ece4764ff9 · 2026-05-12T12:09:46.000-07:00
Switches AIME24 to lighteval-aligned grading per the trt-llm benchmark
recipe's `acc_bench_lighteval.py:aime24`. Stacks on the lighteval
foundation landed on branch 874.

Loader:
- Prompt is now the bare `line["problem"]` text — what lighteval's
  `aime_prompt_fn` + `PromptManager` produce when
  `few_shots_split=None`. No instruction prefix, no `**Problem**:`
  wrapping, no CoT trigger.
- `n_shots` and `enable_cot` parameters are accepted for protocol
  uniformity but ignored (the reference is zero-shot, no CoT
  trigger). Tests pin both behaviors.
- `generation_size=32768` to match lighteval's `aime24` task config.
- Removed dependency on `aime.py`'s `INSTRUCTION_PREFIX` /
  `DEFAULT_GENERATION_SIZE` (those now belong only to the `aime`
  benchmark; `aime24` defines its own `DEFAULT_GENERATION_SIZE` and
  follows lighteval, not the AIMETemplate path).

plugins.yaml:
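- `aime24.metadata.default_grader`: `math` → `lighteval_expr`
  (the recipe's `expr_gold_metric` configuration of
  `MultilingualExtractiveMatchMetric`).
- Dropped `aime24.metadata.is_implemented: false`, and removed
  `aime24` from `STUB_BENCHMARKS` in
  `tests/unit/accuracy/test_accuracy_config.py`, since the benchmark
  is no longer a stub.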

Tests (rewritten):
- 11 tests covering: prompt is bare problem text, no instruction
  prefix in any form, single user message, n_shots ignored,
  enable_cot ignored, ground_truth stringification, task name,
  generation_size, empty/unicode dataset rows.
- Old assertions (recipe's `**Problem**: ... **Solution**: ...`
  format from the previous AIME implementation) are intentionally
  dropped since AIME24 follows lighteval, not AIMETemplate.

Documentation:
- docs/accuracy/accuracy-benchmarking.md availability table updated
  to show AIME24's `default_grader=lighteval_expr` and `n_shots=0`.

Signed-off-by: Elias Bermudez <dbermudez@nvidia.com>
diff --git a/src/aiperf/accuracy/benchmarks/aime24.py b/src/aiperf/accuracy/benchmarks/aime24.py
@@ -1,18 +1,31 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""AIME 2024 benchmark loader, ported from lighteval's aime24 task config.
-
-Loads the ``HuggingFaceH4/aime_2024`` dataset (LightEval's canonical
-AIME 2024 mirror, with lowercase ``problem``/``answer`` field names) and
-formats each problem the same way as :mod:`aiperf.accuracy.benchmarks.aime`
-so the prompt + chat construction stays consistent across the AIME family.
-The split between ``aime`` and ``aime24`` is deliberate: ``aime`` is the
-year-agnostic identifier (DeepEval/Maxwell-Jia capitalized schema), while
-``aime24`` pins to lighteval's canonical mirror so users running
-side-by-side comparisons against lighteval get matching prompts.
-
-lighteval reference: lighteval/src/lighteval/tasks/extended/aime/main.py
+"""AIME 2024 benchmark loader, aligned with the trt-llm lighteval reference.
+
+Mirrors the recipe's ``acc_bench_lighteval.py`` configuration:
+
+    aime24 = LightevalTaskConfig(
+        name="aime24",
+        prompt_function=aime_prompt_fn,
+        hf_repo="HuggingFaceH4/aime_2024",
+        hf_subset="default",
+        evaluation_splits=["train"],
+        few_shots_split=None,
+        few_shots_select=None,
+        generation_size=32768,
+        metric=[expr_gold_metric],
+    )
+
+The recipe's ``aime_prompt_fn`` produces a ``Doc`` whose ``query`` is
+the bare problem text — lighteval's prompt manager wraps it as a
+single user message with no instruction prefix and no few-shot
+priming (``few_shots_split=None``). We emit prompts the same way.
+Pair with ``LightevalExprGrader`` for the recipe's ``expr_gold_metric``
+extraction.
+
+Reference:
+    trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:128
 """
 
 from __future__ import annotations
@@ -22,33 +35,31 @@
 
 from datasets import Dataset, load_dataset
 
-from aiperf.accuracy.benchmarks.aime import (
-    DEFAULT_GENERATION_SIZE,
-    INSTRUCTION_PREFIX,
-)
 from aiperf.accuracy.models import AccuracyChatMessage, BenchmarkProblem
 from aiperf.common.config import UserConfig
 from aiperf.common.mixins import AIPerfLoggerMixin
 
 DATASET_NAME = "HuggingFaceH4/aime_2024"
 TASK_NAME = "aime24"
 
-# Field names in the HuggingFaceH4/aime_2024 schema (lowercase, distinct
-# from the Maxwell-Jia mirror used by AIMEBenchmark).
+# lighteval's aime24 task config: ``generation_size=32768`` to give
+# reasoning models room to think before emitting the boxed answer.
+DEFAULT_GENERATION_SIZE = 32768
+
+# Schema field names in HuggingFaceH4/aime_2024 (lowercase, lighteval
+# canonical — distinct from the Maxwell-Jia mirror used by ``aime``).
 PROBLEM_FIELD = "problem"
 ANSWER_FIELD = "answer"
 
 
 class AIME24Benchmark(AIPerfLoggerMixin):
-    """AIME 2024 benchmark loader (lighteval canonical schema).
-
-    Loads competition problems from ``HuggingFaceH4/aime_2024`` (train
-    split) and produces ``BenchmarkProblem`` objects ready for both the
-    completions endpoint (flat ``prompt``) and the chat endpoint
-    (``raw_messages``). Pairs with ``MathGrader`` for numerical
-    equivalence; instruction prefix and generation size are reused from
-    :mod:`aiperf.accuracy.benchmarks.aime` so the prompt format stays in
-    lockstep across the AIME family.
+    """AIME 2024 lighteval-aligned benchmark loader.
+
+    Loads ``HuggingFaceH4/aime_2024`` (train split) and emits one user
+    message per problem containing the bare problem text — the format
+    lighteval's ``aime_prompt_fn`` + ``PromptManager`` produce when
+    ``few_shots_split=None``. Pair with ``LightevalExprGrader`` for
+    grading parity with the recipe.
     """
 
     def __init__(self, user_config: UserConfig, **kwargs: Any) -> None:
@@ -58,114 +69,38 @@ def __init__(self, user_config: UserConfig, **kwargs: Any) -> None:
     async def load_problems(
         self, tasks: list[str] | None, n_shots: int, enable_cot: bool
     ) -> list[BenchmarkProblem]:
-        """Load every AIME 2024 problem and format it for the LLM.
+        """Load AIME24 problems and format them lighteval-style.
 
         Args:
-            tasks: Ignored — AIME 2024 has no subtasks. Accepted for
-                protocol parity with benchmarks that do filter.
-            n_shots: Number of few-shot examples to prepend (drawn from
-                the start of the dataset). 0 disables few-shot prompting.
-            enable_cot: When True, append ``Let's think step by step.`` to
-                each query.
+            tasks: Ignored — AIME24 has no subtasks.
+            n_shots: Ignored — the lighteval reference is zero-shot
+                (``few_shots_split=None``); accepting the parameter
+                keeps the protocol uniform but emitting few-shots
+                here would diverge from the reference.
+            enable_cot: Ignored — lighteval's ``aime_prompt_fn`` does
+                not add a CoT trigger; the model decides whether to
+                reason based on the system prompt the user provides
+                via ``--accuracy-system-prompt``.
 
         Returns:
-            One ``BenchmarkProblem`` per dataset row, in dataset order.
+            One ``BenchmarkProblem`` per dataset row, in dataset
+            order.
         """
         ds: Dataset = await asyncio.to_thread(load_dataset, DATASET_NAME, split="train")
-        return await asyncio.to_thread(self._build_problems, ds, n_shots, enable_cot)
+        return await asyncio.to_thread(self._build_problems, ds)
 
-    def _build_problems(
-        self, ds: Dataset, n_shots: int, enable_cot: bool
-    ) -> list[BenchmarkProblem]:
-        few_shots = self._build_few_shots(ds, n_shots)
+    def _build_problems(self, ds: Dataset) -> list[BenchmarkProblem]:
         problems: list[BenchmarkProblem] = []
         for row in ds:
-            prompt = self._format_prompt(row, few_shots, enable_cot)
-            raw_messages = self._build_chat_messages(row, few_shots, enable_cot)
+            problem = row[PROBLEM_FIELD]
+            messages: list[AccuracyChatMessage] = [{"role": "user", "content": problem}]
             problems.append(
                 BenchmarkProblem(
-                    prompt=prompt,
+                    prompt=problem,
                     ground_truth=str(row[ANSWER_FIELD]),
                     task=TASK_NAME,
                     metadata={"generation_size": DEFAULT_GENERATION_SIZE},
-                    raw_messages=raw_messages,
+                    raw_messages=messages,
                 )
             )
         return problems
-
-    def _build_few_shots(self, ds: Dataset, n_shots: int) -> list[dict[str, str]]:
-        """Few-shot examples drawn sequentially from the start of the split.
-
-        The HuggingFaceH4 mirror has no separate dev/validation split, so
-        early problems can appear in their own prompts; lighteval makes
-        the same trade-off when no held-out pool is available.
-        """
-        if n_shots <= 0:
-            return []
-        size = min(n_shots, len(ds))
-        return [self._format_example(ds[i]) for i in range(size)]
-
-    def _format_example(self, row: dict[str, Any]) -> dict[str, str]:
-        """Format a dataset row as a few-shot example with ``\\boxed{}``."""
-        answer = str(row[ANSWER_FIELD])
-        problem = row[PROBLEM_FIELD]
-        return {
-            "problem": problem,
-            "answer": answer,
-            "formatted": f"Problem: {problem}\nAnswer: \\boxed{{{answer}}}",
-        }
-
-    def _format_prompt(
-        self,
-        row: dict[str, Any],
-        few_shots: list[dict[str, str]],
-        enable_cot: bool,
-    ) -> str:
-        """Build the flat completions prompt: instruction + shots + query."""
-        few_shot_text = "\n\n".join(ex["formatted"] for ex in few_shots)
-        if few_shot_text:
-            few_shot_text += "\n\n"
-
-        problem = row[PROBLEM_FIELD]
-        if enable_cot:
-            query = f"Problem: {problem}\nLet's think step by step.\nAnswer:"
-        else:
-            query = f"Problem: {problem}\nAnswer:"
-
-        return INSTRUCTION_PREFIX + few_shot_text + query
-
-    def _build_chat_messages(
-        self,
-        row: dict[str, Any],
-        few_shots: list[dict[str, str]],
-        enable_cot: bool,
-    ) -> list[AccuracyChatMessage]:
-        """Build multi-turn chat messages following lighteval's PromptManager.
-
-        Identical structure to :class:`aiperf.accuracy.benchmarks.aime.AIMEBenchmark`:
-        instruction lives on the first user message, assistant primers
-        contain ``\\boxed{answer}``, and the trailing user message has no
-        re-instruction unless there were zero few-shots.
-        """
-        messages: list[AccuracyChatMessage] = []
-
-        for ix, ex in enumerate(few_shots):
-            q = f"Problem: {ex['problem']}\nAnswer:"
-            if ix == 0:
-                q = INSTRUCTION_PREFIX + q
-            messages.append({"role": "user", "content": q})
-            messages.append(
-                {"role": "assistant", "content": f"\\boxed{{{ex['answer']}}}"}
-            )
-
-        problem = row[PROBLEM_FIELD]
-        if enable_cot:
-            main_q = f"Problem: {problem}\nLet's think step by step.\nAnswer:"
-        else:
-            main_q = f"Problem: {problem}\nAnswer:"
-
-        if not few_shots:
-            main_q = INSTRUCTION_PREFIX + main_q
-
-        messages.append({"role": "user", "content": main_q})
-        return messages
diff --git a/src/aiperf/plugin/plugins.yaml b/src/aiperf/plugin/plugins.yaml
@@ -1168,11 +1168,12 @@ accuracy_benchmark:
   aime24:
     class: aiperf.accuracy.benchmarks.aime24:AIME24Benchmark
     description: |
-      AIME 2024 benchmark with problems from the 2024 competition year.
+      AIME 2024 benchmark, aligned with the trt-llm benchmark recipe's
+      lighteval-backed configuration (HuggingFaceH4/aime_2024 + lighteval
+      ``expr_gold_metric``).
     metadata:
-      default_grader: math
+      default_grader: lighteval_expr
       default_n_shots: 0
-      is_implemented: false
 
   aime25:
     class: aiperf.accuracy.benchmarks.aime25:AIME25Benchmark
diff --git a/tests/unit/accuracy/test_accuracy_config.py b/tests/unit/accuracy/test_accuracy_config.py
@@ -25,7 +25,6 @@
 STUB_BENCHMARKS = (
     "hellaswag",
     "bigbench",
-    "aime24",
     "aime25",
     "math_500",
     "gpqa_diamond",
diff --git a/tests/unit/accuracy/test_aime24_benchmark.py b/tests/unit/accuracy/test_aime24_benchmark.py