feat(accuracy): align AIME25 with lighteval reference (AIP-876 follow-up)

debermudez · debermudez · commit 9bbe75238965 · 2026-05-12T16:24:29.000-07:00
Same lighteval alignment as AIME24, but with the yentinglin/aime_2025
dataset. Bare problem text as user message, generation_size=32768,
default_grader=lighteval_expr. Tests pin the same invariants
(prompt is bare problem text, n_shots/enable_cot ignored).

Reference: trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:142

Signed-off-by: Elias Bermudez <dbermudez@nvidia.com>
diff --git a/docs/accuracy/accuracy-benchmarking.md b/docs/accuracy/accuracy-benchmarking.md
@@ -73,6 +73,7 @@ system message).
 | `mmlu` | `multiple_choice` | 5 | `lighteval/mmlu` (57 subjects) |
 | `aime` | `math` | 8 | `Maxwell-Jia/AIME_2024` (trt-llm reference, 8-shot CoT) |
 | `aime24` | `lighteval_expr` | 0 | `HuggingFaceH4/aime_2024` (trt-llm/lighteval reference) |
+| `aime25` | `lighteval_expr` | 0 | `yentinglin/aime_2025` (trt-llm/lighteval reference) |
 
 ## CLI Flags
 
diff --git a/src/aiperf/accuracy/benchmarks/aime25.py b/src/aiperf/accuracy/benchmarks/aime25.py
@@ -1,21 +1,82 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from aiperf.accuracy.models import BenchmarkProblem
+"""AIME 2025 benchmark loader, aligned with the trt-llm lighteval reference.
+
+Mirrors ``acc_bench_lighteval.py:aime25``: same ``aime_prompt_fn``,
+same zero-shot config, ``generation_size=32768``,
+``hf_repo="yentinglin/aime_2025"``. See the AIME24 module for a fuller
+explanation of the design.
+
+Reference:
+    trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:142
+"""
+
+from __future__ import annotations
+
+import asyncio
+from typing import Any
+
+from datasets import Dataset, load_dataset
+
+from aiperf.accuracy.models import AccuracyChatMessage, BenchmarkProblem
 from aiperf.common.config import UserConfig
 from aiperf.common.mixins import AIPerfLoggerMixin
 
+DATASET_NAME = "yentinglin/aime_2025"
+TASK_NAME = "aime25"
+
+# lighteval's aime25 task config: ``generation_size=32768``.
+DEFAULT_GENERATION_SIZE = 32768
+
+# Schema field names in yentinglin/aime_2025 (same lowercase shape as
+# AIME24's HuggingFaceH4 mirror).
+PROBLEM_FIELD = "problem"
+ANSWER_FIELD = "answer"
+
 
 class AIME25Benchmark(AIPerfLoggerMixin):
-    """AIME 2025 benchmark loader."""
+    """AIME 2025 lighteval-aligned benchmark loader.
+
+    Loads ``yentinglin/aime_2025`` (train split) and emits one user
+    message per problem containing the bare problem text — matching
+    lighteval's zero-shot ``aime_prompt_fn`` rendering. Pair with
+    ``LightevalExprGrader`` for grading parity with the recipe.
+    """
 
-    def __init__(self, user_config: UserConfig, **kwargs) -> None:
+    def __init__(self, user_config: UserConfig, **kwargs: Any) -> None:
         super().__init__(**kwargs)
         self.user_config = user_config
 
     async def load_problems(
         self, tasks: list[str] | None, n_shots: int, enable_cot: bool
     ) -> list[BenchmarkProblem]:
-        raise NotImplementedError(
-            "aime25 benchmark is not yet implemented; only 'mmlu' is available in this release."
-        )
+        """Load AIME25 problems and format them lighteval-style.
+
+        Args:
+            tasks: Ignored — AIME25 has no subtasks.
+            n_shots: Ignored — the lighteval reference is zero-shot.
+            enable_cot: Ignored — lighteval's ``aime_prompt_fn`` does
+                not add a CoT trigger.
+
+        Returns:
+            One ``BenchmarkProblem`` per dataset row, in dataset order.
+        """
+        ds: Dataset = await asyncio.to_thread(load_dataset, DATASET_NAME, split="train")
+        return await asyncio.to_thread(self._build_problems, ds)
+
+    def _build_problems(self, ds: Dataset) -> list[BenchmarkProblem]:
+        """Convert dataset rows into ``BenchmarkProblem`` objects, in row order."""
+        return [
+            BenchmarkProblem(
+                prompt=text,
+                ground_truth=str(row[ANSWER_FIELD]),
+                task=TASK_NAME,
+                metadata={"generation_size": DEFAULT_GENERATION_SIZE},
+                raw_messages=[{"role": "user", "content": text}],
+            )
+            for row in ds
+            # Bind the problem text once; it serves as both the flat prompt
+            # and the content of the single user chat message.
+            for text in (row[PROBLEM_FIELD],)
+        ]
diff --git a/src/aiperf/plugin/plugins.yaml b/src/aiperf/plugin/plugins.yaml
@@ -1152,11 +1152,12 @@ accuracy_benchmark:
   aime25:
     class: aiperf.accuracy.benchmarks.aime25:AIME25Benchmark
     description: |
-      AIME 2025 benchmark with problems from the 2025 competition year.
+      AIME 2025 benchmark, aligned with the trt-llm benchmark recipe's
+      lighteval-backed configuration (yentinglin/aime_2025 + lighteval
+      ``expr_gold_metric``).
     metadata:
-      default_grader: math
+      default_grader: lighteval_expr
       default_n_shots: 0
-      is_implemented: false
 
   math_500:
     class: aiperf.accuracy.benchmarks.math_500:Math500Benchmark
diff --git a/tests/unit/accuracy/test_aime25_benchmark.py b/tests/unit/accuracy/test_aime25_benchmark.py
@@ -0,0 +1,210 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for ``AIME25Benchmark`` after lighteval alignment.
+
+Same shape as ``test_aime24_benchmark.py`` — the lighteval reference
+config is identical except for the dataset repository.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from aiperf.accuracy.benchmarks.aime25 import (
+    DEFAULT_GENERATION_SIZE,
+    TASK_NAME,
+    AIME25Benchmark,
+)
+from aiperf.accuracy.models import BenchmarkProblem
+from aiperf.common.config import EndpointConfig, UserConfig
+from aiperf.common.config.accuracy_config import AccuracyConfig
+from aiperf.plugin.enums import AccuracyBenchmarkType, EndpointType
+
+
+def _make_user_config() -> UserConfig:
+    """Build a minimal non-streaming completions config selecting AIME25."""
+    endpoint = EndpointConfig(
+        model_names=["test-model"],
+        type=EndpointType.COMPLETIONS,
+        streaming=False,
+    )
+    accuracy = AccuracyConfig(benchmark=AccuracyBenchmarkType.AIME25)
+    return UserConfig(endpoint=endpoint, accuracy=accuracy)
+
+
+def _make_row(problem: str = "What is 1+1?", answer: int = 2) -> dict[str, Any]:
+    return dict(problem=problem, answer=answer)
+
+
+def _make_fake_dataset(rows: list[dict[str, Any]]) -> MagicMock:
+    fake = MagicMock()
+    fake.__len__ = MagicMock(return_value=len(rows))
+    fake.__iter__ = MagicMock(side_effect=rows.__iter__)
+    fake.__getitem__ = MagicMock(side_effect=rows.__getitem__)
+    return fake
+
+
+class TestPromptIsBareProblemText:
+    @pytest.mark.asyncio
+    async def test_flat_prompt_is_problem_text(self) -> None:
+        """The flat prompt equals the dataset's problem text verbatim."""
+        fake_ds = _make_fake_dataset([_make_row("Compute the answer.", 42)])
+        with patch(
+            "aiperf.accuracy.benchmarks.aime25.load_dataset", return_value=fake_ds
+        ):
+            loader = AIME25Benchmark(user_config=_make_user_config())
+            loaded = await loader.load_problems(
+                tasks=None, n_shots=0, enable_cot=False
+            )
+        assert loaded[0].prompt == "Compute the answer."
+
+    @pytest.mark.asyncio
+    async def test_no_instruction_prefix(self) -> None:
+        """No instruction-style text is added around the problem."""
+        fake_ds = _make_fake_dataset([_make_row("Q?", 1)])
+        with patch(
+            "aiperf.accuracy.benchmarks.aime25.load_dataset", return_value=fake_ds
+        ):
+            loader = AIME25Benchmark(user_config=_make_user_config())
+            loaded = await loader.load_problems(
+                tasks=None, n_shots=0, enable_cot=False
+            )
+        rendered = loaded[0].prompt
+        # None of these instruction-style fragments may appear in the prompt.
+        forbidden = ("**Problem**", "competition math", "Let's think", "boxed")
+        for fragment in forbidden:
+            assert fragment not in rendered
+
+    @pytest.mark.asyncio
+    async def test_chat_message_is_single_user_message(self) -> None:
+        """``raw_messages`` holds exactly one user message with the problem."""
+        fake_ds = _make_fake_dataset([_make_row("Q?", 1)])
+        with patch(
+            "aiperf.accuracy.benchmarks.aime25.load_dataset", return_value=fake_ds
+        ):
+            loader = AIME25Benchmark(user_config=_make_user_config())
+            loaded = await loader.load_problems(
+                tasks=None, n_shots=0, enable_cot=False
+            )
+        chat = loaded[0].raw_messages
+        assert chat is not None
+        assert len(chat) == 1
+        (only,) = chat
+        assert (only["role"], only["content"]) == ("user", "Q?")
+
+
+class TestNShotsAndCoTAreIgnored:
+    @pytest.mark.asyncio
+    async def test_n_shots_argument_does_not_affect_prompt(self) -> None:
+        """Prompts are the same whether 0 or 5 shots are requested."""
+        fake_ds = _make_fake_dataset([_make_row(f"q{i}", i) for i in range(3)])
+        with patch(
+            "aiperf.accuracy.benchmarks.aime25.load_dataset", return_value=fake_ds
+        ):
+            loader = AIME25Benchmark(user_config=_make_user_config())
+            baseline = await loader.load_problems(
+                tasks=None, n_shots=0, enable_cot=False
+            )
+            shotted = await loader.load_problems(
+                tasks=None, n_shots=5, enable_cot=False
+            )
+        assert [p.prompt for p in baseline] == [p.prompt for p in shotted]
+
+    @pytest.mark.asyncio
+    async def test_enable_cot_does_not_affect_prompt(self) -> None:
+        """Toggling ``enable_cot`` leaves the prompt unchanged."""
+        fake_ds = _make_fake_dataset([_make_row("Q?", 1)])
+        with patch(
+            "aiperf.accuracy.benchmarks.aime25.load_dataset", return_value=fake_ds
+        ):
+            loader = AIME25Benchmark(user_config=_make_user_config())
+            plain = await loader.load_problems(tasks=None, n_shots=0, enable_cot=False)
+            cot = await loader.load_problems(tasks=None, n_shots=0, enable_cot=True)
+        assert plain[0].prompt == cot[0].prompt
+
+
+class TestLoadProblemsCore:
+    @pytest.mark.asyncio
+    async def test_returns_one_problem_per_row(self) -> None:
+        """Five rows in, five ``BenchmarkProblem`` instances out."""
+        fake_ds = _make_fake_dataset([_make_row(f"q{i}", i) for i in range(5)])
+        with patch(
+            "aiperf.accuracy.benchmarks.aime25.load_dataset", return_value=fake_ds
+        ):
+            loader = AIME25Benchmark(user_config=_make_user_config())
+            loaded = await loader.load_problems(
+                tasks=None, n_shots=0, enable_cot=False
+            )
+        assert len(loaded) == 5
+        assert all(isinstance(item, BenchmarkProblem) for item in loaded)
+
+    @pytest.mark.asyncio
+    async def test_ground_truth_is_string_form_of_answer(self) -> None:
+        """An integer answer is stored as its ``str`` form."""
+        fake_ds = _make_fake_dataset([_make_row("q", 42)])
+        with patch(
+            "aiperf.accuracy.benchmarks.aime25.load_dataset", return_value=fake_ds
+        ):
+            loader = AIME25Benchmark(user_config=_make_user_config())
+            loaded = await loader.load_problems(
+                tasks=None, n_shots=0, enable_cot=False
+            )
+        assert loaded[0].ground_truth == "42"
+
+    @pytest.mark.asyncio
+    async def test_task_name_is_aime25(self) -> None:
+        """Every problem is tagged with the module's ``TASK_NAME``."""
+        fake_ds = _make_fake_dataset([_make_row("q", 1)])
+        with patch(
+            "aiperf.accuracy.benchmarks.aime25.load_dataset", return_value=fake_ds
+        ):
+            loader = AIME25Benchmark(user_config=_make_user_config())
+            loaded = await loader.load_problems(
+                tasks=None, n_shots=0, enable_cot=False
+            )
+        assert loaded[0].task == TASK_NAME
+
+    @pytest.mark.asyncio
+    async def test_generation_size_is_32k(self) -> None:
+        """Problem metadata carries the module's 32768 generation size."""
+        fake_ds = _make_fake_dataset([_make_row("q", 1)])
+        with patch(
+            "aiperf.accuracy.benchmarks.aime25.load_dataset", return_value=fake_ds
+        ):
+            loader = AIME25Benchmark(user_config=_make_user_config())
+            loaded = await loader.load_problems(
+                tasks=None, n_shots=0, enable_cot=False
+            )
+        assert DEFAULT_GENERATION_SIZE == 32768
+        assert loaded[0].metadata["generation_size"] == DEFAULT_GENERATION_SIZE
+
+
+class TestPathologicalDatasetRows:
+    @pytest.mark.asyncio
+    async def test_empty_dataset_returns_empty_list(self) -> None:
+        """Zero rows yield an empty problem list."""
+        no_rows = _make_fake_dataset([])
+        target = "aiperf.accuracy.benchmarks.aime25.load_dataset"
+        with patch(target, return_value=no_rows):
+            loader = AIME25Benchmark(user_config=_make_user_config())
+            loaded = await loader.load_problems(
+                tasks=None, n_shots=0, enable_cot=False
+            )
+        assert loaded == []
+
+    @pytest.mark.asyncio
+    async def test_unicode_problem_text_preserved(self) -> None:
+        """Non-ASCII math symbols survive the trip into the prompt."""
+        unicode_rows = [_make_row("Solve ∑₁ⁿ k² for n=10. ✓", 385)]
+        fake_ds = _make_fake_dataset(unicode_rows)
+        target = "aiperf.accuracy.benchmarks.aime25.load_dataset"
+        with patch(target, return_value=fake_ds):
+            loader = AIME25Benchmark(user_config=_make_user_config())
+            loaded = await loader.load_problems(
+                tasks=None, n_shots=0, enable_cot=False
+            )
+        assert "∑₁ⁿ" in loaded[0].prompt