From 3e85d67ae69a5294c4db72891ab72f09f9f6a2fe Mon Sep 17 00:00:00 2001 From: Elias Bermudez Date: Tue, 26 May 2026 10:04:08 -0700 Subject: [PATCH] feat(accuracy): AIME 2025 lighteval-aligned benchmark loader (AIP-876) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement ``AIME25Benchmark``, mirroring the trt-llm benchmark recipe's ``acc_bench_lighteval.py:aime25`` configuration: same ``aime_prompt_fn`` zero-shot rendering, ``generation_size=32768``, ``hf_repo="yentinglin/aime_2025"``. Same shape as ``AIME24Benchmark``, just pointed at the 2025 mirror. The loader emits one ``BenchmarkProblem`` per dataset row with the bare problem text as ``prompt``, ``str(answer)`` as ``ground_truth``, and ``metadata.generation_size`` = 32768. ``tasks`` / ``n_shots`` / ``enable_cot`` are accepted for protocol uniformity but ignored. Pair with ``LightevalExprGrader`` for the recipe's ``expr_gold_metric`` extraction. Built on top of AIP-875 (lighteval sub-stack ordering: 875 → 876). No heavy optional dependency — ``datasets`` is core — so CI gets 100% line + branch coverage out of the box. Updates the stub registry: drop ``aime25`` from ``test_accuracy_config.STUB_BENCHMARKS``, drop ``is_implemented: false`` from the ``aime25`` plugins.yaml entry, switch ``default_grader`` to ``lighteval_expr``, add the ``aime25`` row to ``docs/accuracy/accuracy-benchmarking.md``, and move it from "Still Stubbed" to "Implemented" in ``accuracy_stubs.md`` (refreshing the Status Summary, Method Count Summary, and Suggested Implementation Order accordingly). 
Signed-off-by: Elias Bermudez --- docs/accuracy/accuracy-benchmarking.md | 1 + docs/accuracy/accuracy_stubs.md | 18 +- src/aiperf/accuracy/benchmarks/aime25.py | 71 ++++++- src/aiperf/plugin/plugins.yaml | 7 +- tests/unit/accuracy/test_accuracy_config.py | 1 - tests/unit/accuracy/test_aime25_benchmark.py | 207 +++++++++++++++++++ 6 files changed, 283 insertions(+), 22 deletions(-) create mode 100644 tests/unit/accuracy/test_aime25_benchmark.py diff --git a/docs/accuracy/accuracy-benchmarking.md b/docs/accuracy/accuracy-benchmarking.md index 36612868c..18e8dabab 100644 --- a/docs/accuracy/accuracy-benchmarking.md +++ b/docs/accuracy/accuracy-benchmarking.md @@ -75,6 +75,7 @@ system message). | `hellaswag` | `exact_match` | 10 | `Rowan/hellaswag` (trt-llm/DeepEval reference; one few-shot per unique activity_label) | | `bigbench` | `exact_match` | 3 | `lukaemon/bbh` (trt-llm/DeepEval reference; 27 subtasks, canonical CoT/non-CoT prompt files) | | `aime24` | `lighteval_expr` | 0 | `HuggingFaceH4/aime_2024` (trt-llm/lighteval reference, bare problem text, `expr_gold_metric`) | +| `aime25` | `lighteval_expr` | 0 | `yentinglin/aime_2025` (trt-llm/lighteval reference, bare problem text, `expr_gold_metric`) | ## CLI Flags diff --git a/docs/accuracy/accuracy_stubs.md b/docs/accuracy/accuracy_stubs.md index 9c95af783..e02953cf6 100644 --- a/docs/accuracy/accuracy_stubs.md +++ b/docs/accuracy/accuracy_stubs.md @@ -7,7 +7,7 @@ This document catalogs every stubbed method in the accuracy benchmarking scaffolding. The scaffolding is fully integrated into the plugin system, CLI, and config pipeline — the performance benchmarking path is unaffected. 
-**Status summary:** With the AIME24 loader landing on top of the BigBench / HellaSwag stack, `MultipleChoiceGrader`, `MathGrader`, `CodeExecutionGrader`, `LightevalExprGrader`, `LightevalLatexGrader`, `LightevalGPQAGrader`, `ExactMatchGrader`, `MMLUBenchmark`, `AIMEBenchmark`, `HellaSwagBenchmark`, `BigBenchBenchmark`, and `AIME24Benchmark` are fully implemented; the remaining benchmarks (`aime25`, `math_500`, `gpqa_diamond`, `lcb_codegeneration`) are still stubs and ship behind `NotImplementedError` until each follow-up branch lands. Use the implemented classes as canonical references when filling in the remaining stubs. +**Status summary:** With the AIME25 loader landing on top of the AIME24 / BigBench / HellaSwag stack, `MultipleChoiceGrader`, `MathGrader`, `CodeExecutionGrader`, `LightevalExprGrader`, `LightevalLatexGrader`, `LightevalGPQAGrader`, `ExactMatchGrader`, `MMLUBenchmark`, `AIMEBenchmark`, `HellaSwagBenchmark`, `BigBenchBenchmark`, `AIME24Benchmark`, and `AIME25Benchmark` are fully implemented; the remaining benchmarks (`math_500`, `gpqa_diamond`, `lcb_codegeneration`) are still stubs and ship behind `NotImplementedError` until each follow-up branch lands. Use the implemented classes as canonical references when filling in the remaining stubs. ## Table of Contents @@ -174,15 +174,15 @@ All benchmarks use `AIPerfLoggerMixin` and must implement 1 method. | 3 | `HellaSwagBenchmark` | `benchmarks/hellaswag.py` | `hellaswag` | `exact_match` | 10 | **IMPLEMENTED.** Loads `Rowan/hellaswag` (validation split filtered per task by `activity_label`; train split feeds the "one few-shot per unique activity_label" rule). Prompt rendering delegates to `deepeval.benchmarks.HellaSwag`'s `HellaSwagTemplate.generate_output`, so output is byte-equal to the trt-llm recipe's DeepEval-backed path. Pairs with `exact_match` for strict `Scorer.exact_match_score` semantics. Requires the `[accuracy]` extras (deepeval). 
| | 4 | `BigBenchBenchmark` | `benchmarks/bigbench.py` | `bigbench` | `exact_match` | 3 | **IMPLEMENTED.** Loads `lukaemon/bbh` (27 BBH subtasks). Prompt rendering delegates to `deepeval.benchmarks.BigBenchHard`'s `BigBenchHardTemplate.generate_output`, which reads the 27 canonical CoT/shot prompt files DeepEval ships as package data. Pairs with `exact_match` for the recipe's strict `Scorer.exact_match_score` semantics. `default_n_shots=3`, `default_enable_cot=true`. Requires the `[accuracy]` extras (deepeval). | | 5 | `AIME24Benchmark` | `benchmarks/aime24.py` | `aime24` | `lighteval_expr` | 0 | **IMPLEMENTED.** Loads `HuggingFaceH4/aime_2024` (train split) and emits the bare problem text as a single user message — no instruction prefix, no few-shot priming. Mirrors the trt-llm benchmark recipe's `acc_bench_lighteval.py` configuration (`few_shots_split=None`, `generation_size=32768`). Pairs with `lighteval_expr` for the recipe's `expr_gold_metric` extraction. | +| 6 | `AIME25Benchmark` | `benchmarks/aime25.py` | `aime25` | `lighteval_expr` | 0 | **IMPLEMENTED.** Same lighteval-aligned shape as `AIME24Benchmark` but pointed at `yentinglin/aime_2025` (the recipe's `aime25` task config). Identical prompt rendering, generation size, and grader pairing. 
| ### Still Stubbed | # | Class | File | Plugin Key | Default Grader | Default N-Shots | |---|-------|------|------------|----------------|-----------------| -| 1 | `AIME25Benchmark` | `benchmarks/aime25.py` | `aime25` | `math` | 0 | -| 2 | `Math500Benchmark` | `benchmarks/math_500.py` | `math_500` | `math` | 0 | -| 3 | `GPQADiamondBenchmark` | `benchmarks/gpqa_diamond.py` | `gpqa_diamond` | `multiple_choice` | 0 | -| 4 | `LCBCodeGenerationBenchmark` | `benchmarks/lcb_codegeneration.py` | `lcb_codegeneration` | `code_execution` | 0 | +| 1 | `Math500Benchmark` | `benchmarks/math_500.py` | `math_500` | `math` | 0 | +| 2 | `GPQADiamondBenchmark` | `benchmarks/gpqa_diamond.py` | `gpqa_diamond` | `multiple_choice` | 0 | +| 3 | `LCBCodeGenerationBenchmark` | `benchmarks/lcb_codegeneration.py` | `lcb_codegeneration` | `code_execution` | 0 | **Each benchmark has 1 method to implement:** @@ -309,13 +309,13 @@ All stubs are registered in `src/aiperf/plugin/plugins.yaml` and `src/aiperf/plu | Component | Implemented | Still Stubbed | Methods per Stub | Remaining Methods | |-----------|-------------|---------------|------------------|-------------------| | Graders | 7 (all) | 0 | — | 0 | -| Benchmarks | 5 (`MMLUBenchmark`, `AIMEBenchmark`, `HellaSwagBenchmark`, `BigBenchBenchmark`, `AIME24Benchmark`) | 4 | 1 (`load_problems`) | 4 | +| Benchmarks | 6 (`MMLUBenchmark`, `AIMEBenchmark`, `HellaSwagBenchmark`, `BigBenchBenchmark`, `AIME24Benchmark`, `AIME25Benchmark`) | 3 | 1 (`load_problems`) | 3 | | Record Processor | 1 (`AccuracyRecordProcessor`) | 0 | — | 0 | | Results Processor | 1 (`AccuracyResultsProcessor`) | 0 | — | 0 | | Console Exporter | 1 (`AccuracyConsoleExporter`) | 0 | — | 0 | | Data Exporter | 1 (`AccuracyDataExporter`) | 0 | — | 0 | | Stub-plugin Validator | 0 | 1 | 1 (`AccuracyConfig._reject_stub_plugins`) | 1 | -| **Total** | **16** | **5** | | **5** | +| **Total** | **17** | **4** | | **4** | ### Self-Disabling Pattern @@ -323,9 +323,9 @@ Processors and 
exporters raise their `Disabled` exception **in `__init__`** when ### Suggested Implementation Order -The processors, exporters, all graders, and five benchmarks (`MMLUBenchmark`, `AIMEBenchmark`, `HellaSwagBenchmark`, `BigBenchBenchmark`, `AIME24Benchmark`) are already wired end-to-end. The remaining work is the four stub benchmarks; mirror the existing loader whose grader matches: +The processors, exporters, all graders, and six benchmarks (`MMLUBenchmark`, `AIMEBenchmark`, `HellaSwagBenchmark`, `BigBenchBenchmark`, `AIME24Benchmark`, `AIME25Benchmark`) are already wired end-to-end. The remaining work is the three stub benchmarks; mirror the existing loader whose grader matches: -1. **`aime25`, `math_500`** — mirror `AIME24Benchmark` (`benchmarks/aime24.py`) for the lighteval-aligned shape; pair with `lighteval_expr` (aime25) or `lighteval_latex` (math_500). +1. **`math_500`** — mirror `AIME24Benchmark` (`benchmarks/aime24.py`) for the lighteval-aligned shape; pair with `lighteval_latex`. 2. **`gpqa_diamond`** — mirror `MMLUBenchmark` (`benchmarks/mmlu.py`); pair with the `lighteval_gpqa` grader. 3. **`lcb_codegeneration`** — mirror `MMLUBenchmark`'s scaffolding; pair with the `code_execution` grader. 4. **Stub-plugin validator** — update `AccuracyConfig._reject_stub_plugins()` whenever a benchmark moves from stubbed to supported. diff --git a/src/aiperf/accuracy/benchmarks/aime25.py b/src/aiperf/accuracy/benchmarks/aime25.py index 90757b5d0..b0b19c019 100644 --- a/src/aiperf/accuracy/benchmarks/aime25.py +++ b/src/aiperf/accuracy/benchmarks/aime25.py @@ -1,31 +1,84 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +"""AIME 2025 benchmark loader, aligned with the trt-llm lighteval reference. + +Mirrors ``acc_bench_lighteval.py:aime25``: same ``aime_prompt_fn``, +same zero-shot config, ``generation_size=32768``, +``hf_repo="yentinglin/aime_2025"``. 
See the AIME24 module for a fuller +explanation of the design. + +Reference: + trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:142 +""" + from __future__ import annotations -from typing import TYPE_CHECKING +import asyncio +from typing import TYPE_CHECKING, Any -from aiperf.accuracy.models import BenchmarkProblem +from datasets import Dataset, load_dataset + +from aiperf.accuracy.models import AccuracyChatMessage, BenchmarkProblem from aiperf.common.mixins import AIPerfLoggerMixin if TYPE_CHECKING: from aiperf.config.resolution.plan import BenchmarkRun +DATASET_NAME = "yentinglin/aime_2025" +TASK_NAME = "aime25" + +# lighteval's aime25 task config: ``generation_size=32768``. +DEFAULT_GENERATION_SIZE = 32768 + +# Schema field names in yentinglin/aime_2025 (same lowercase shape as +# AIME24's HuggingFaceH4 mirror). +PROBLEM_FIELD = "problem" +ANSWER_FIELD = "answer" + class AIME25Benchmark(AIPerfLoggerMixin): - """Registered placeholder for a future AIME 2025 loader. + """AIME 2025 lighteval-aligned benchmark loader. - `load_problems()` intentionally raises NotImplementedError in this release; - use the MMLU benchmark when a working accuracy loader is required. + Loads ``yentinglin/aime_2025`` (train split) and emits one user + message per problem containing the bare problem text — matching + lighteval's zero-shot ``aime_prompt_fn`` rendering. Pair with + ``LightevalExprGrader`` for grading parity with the recipe. """ - def __init__(self, run: BenchmarkRun, **kwargs) -> None: + def __init__(self, run: BenchmarkRun, **kwargs: Any) -> None: super().__init__(**kwargs) self.run = run async def load_problems( self, tasks: list[str] | None, n_shots: int, enable_cot: bool ) -> list[BenchmarkProblem]: - raise NotImplementedError( - "aime25 benchmark is not yet implemented; only 'mmlu' is available in this release." - ) + """Load AIME25 problems and format them lighteval-style. + + Args: + tasks: Ignored — AIME25 has no subtasks. 
+ n_shots: Ignored — the lighteval reference is zero-shot. + enable_cot: Ignored — lighteval's ``aime_prompt_fn`` does + not add a CoT trigger. + + Returns: + One ``BenchmarkProblem`` per dataset row, in dataset order. + """ + ds: Dataset = await asyncio.to_thread(load_dataset, DATASET_NAME, split="train") + return await asyncio.to_thread(self._build_problems, ds) + + def _build_problems(self, ds: Dataset) -> list[BenchmarkProblem]: + problems: list[BenchmarkProblem] = [] + for row in ds: + problem = row[PROBLEM_FIELD] + messages: list[AccuracyChatMessage] = [{"role": "user", "content": problem}] + problems.append( + BenchmarkProblem( + prompt=problem, + ground_truth=str(row[ANSWER_FIELD]), + task=TASK_NAME, + metadata={"generation_size": DEFAULT_GENERATION_SIZE}, + raw_messages=messages, + ) + ) + return problems diff --git a/src/aiperf/plugin/plugins.yaml b/src/aiperf/plugin/plugins.yaml index db3a3eb51..c70f0c776 100644 --- a/src/aiperf/plugin/plugins.yaml +++ b/src/aiperf/plugin/plugins.yaml @@ -1265,11 +1265,12 @@ accuracy_benchmark: aime25: class: aiperf.accuracy.benchmarks.aime25:AIME25Benchmark description: | - AIME 2025 benchmark with problems from the 2025 competition year. + AIME 2025 benchmark, aligned with the trt-llm benchmark recipe's + lighteval-backed configuration (yentinglin/aime_2025 + lighteval + ``expr_gold_metric``). metadata: - default_grader: math + default_grader: lighteval_expr default_n_shots: 0 - is_implemented: false math_500: class: aiperf.accuracy.benchmarks.math_500:Math500Benchmark diff --git a/tests/unit/accuracy/test_accuracy_config.py b/tests/unit/accuracy/test_accuracy_config.py index 8618451db..2beb4dc87 100644 --- a/tests/unit/accuracy/test_accuracy_config.py +++ b/tests/unit/accuracy/test_accuracy_config.py @@ -23,7 +23,6 @@ # This branch (AIP-874) implements ``aime``, ``math``, and ``code_execution``, # so those names are absent from the stub lists. 
STUB_BENCHMARKS = ( - "aime25", "math_500", "gpqa_diamond", "lcb_codegeneration", diff --git a/tests/unit/accuracy/test_aime25_benchmark.py b/tests/unit/accuracy/test_aime25_benchmark.py new file mode 100644 index 000000000..954d1c9c6 --- /dev/null +++ b/tests/unit/accuracy/test_aime25_benchmark.py @@ -0,0 +1,207 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Unit tests for ``AIME25Benchmark`` after lighteval alignment. + +Same shape as ``test_aime24_benchmark.py`` — the lighteval reference +config is identical except for the dataset URL. +""" + +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +from aiperf.accuracy.benchmarks.aime25 import ( + DEFAULT_GENERATION_SIZE, + TASK_NAME, + AIME25Benchmark, +) +from aiperf.accuracy.models import BenchmarkProblem +from aiperf.plugin.enums import AccuracyBenchmarkType, EndpointType +from tests.unit.conftest import make_benchmark_run + + +def _make_run(): + return make_benchmark_run( + model_names=["test-model"], + endpoint_type=EndpointType.COMPLETIONS, + streaming=False, + accuracy={"benchmark": AccuracyBenchmarkType.AIME25}, + ) + + +def _make_row(problem: str = "What is 1+1?", answer: int = 2) -> dict[str, Any]: + return {"problem": problem, "answer": answer} + + +def _make_fake_dataset(rows: list[dict[str, Any]]) -> MagicMock: + ds = MagicMock() + ds.__iter__ = MagicMock(side_effect=lambda: iter(rows)) + ds.__len__ = MagicMock(return_value=len(rows)) + ds.__getitem__ = MagicMock(side_effect=lambda i: rows[i]) + return ds + + +class TestPromptIsBareProblemText: + @pytest.mark.asyncio + async def test_flat_prompt_is_problem_text(self) -> None: + rows = [_make_row("Compute the answer.", 42)] + with patch( + "aiperf.accuracy.benchmarks.aime25.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = AIME25Benchmark(run=_make_run()) + 
problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert problems[0].prompt == "Compute the answer." + + @pytest.mark.asyncio + async def test_no_instruction_prefix(self) -> None: + rows = [_make_row("Q?", 1)] + with patch( + "aiperf.accuracy.benchmarks.aime25.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = AIME25Benchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + prompt = problems[0].prompt + assert "**Problem**" not in prompt + assert "competition math" not in prompt + assert "Let's think" not in prompt + assert "boxed" not in prompt + + @pytest.mark.asyncio + async def test_chat_message_is_single_user_message(self) -> None: + rows = [_make_row("Q?", 1)] + with patch( + "aiperf.accuracy.benchmarks.aime25.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = AIME25Benchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + msgs = problems[0].raw_messages + assert msgs is not None + assert len(msgs) == 1 + assert msgs[0]["role"] == "user" + assert msgs[0]["content"] == "Q?" 
+ + +class TestNShotsAndCoTAreIgnored: + @pytest.mark.asyncio + async def test_n_shots_argument_does_not_affect_prompt(self) -> None: + rows = [_make_row(f"q{i}", i) for i in range(3)] + with patch( + "aiperf.accuracy.benchmarks.aime25.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = AIME25Benchmark(run=_make_run()) + zero_shot = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + five_shot = await bench.load_problems( + tasks=None, n_shots=5, enable_cot=False + ) + assert [p.prompt for p in zero_shot] == [p.prompt for p in five_shot] + + @pytest.mark.asyncio + async def test_enable_cot_does_not_affect_prompt(self) -> None: + rows = [_make_row("Q?", 1)] + with patch( + "aiperf.accuracy.benchmarks.aime25.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = AIME25Benchmark(run=_make_run()) + no_cot = await bench.load_problems(tasks=None, n_shots=0, enable_cot=False) + with_cot = await bench.load_problems(tasks=None, n_shots=0, enable_cot=True) + assert no_cot[0].prompt == with_cot[0].prompt + + +class TestLoadProblemsCore: + @pytest.mark.asyncio + async def test_returns_one_problem_per_row(self) -> None: + rows = [_make_row(f"q{i}", i) for i in range(5)] + with patch( + "aiperf.accuracy.benchmarks.aime25.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = AIME25Benchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert len(problems) == 5 + assert all(isinstance(p, BenchmarkProblem) for p in problems) + + @pytest.mark.asyncio + async def test_ground_truth_is_string_form_of_answer(self) -> None: + rows = [_make_row("q", 42)] + with patch( + "aiperf.accuracy.benchmarks.aime25.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = AIME25Benchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert problems[0].ground_truth == "42" + + @pytest.mark.asyncio 
+ async def test_task_name_is_aime25(self) -> None: + rows = [_make_row("q", 1)] + with patch( + "aiperf.accuracy.benchmarks.aime25.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = AIME25Benchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert problems[0].task == TASK_NAME + + @pytest.mark.asyncio + async def test_generation_size_is_32k(self) -> None: + rows = [_make_row("q", 1)] + with patch( + "aiperf.accuracy.benchmarks.aime25.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = AIME25Benchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert problems[0].metadata["generation_size"] == DEFAULT_GENERATION_SIZE + assert DEFAULT_GENERATION_SIZE == 32768 + + +class TestPathologicalDatasetRows: + @pytest.mark.asyncio + async def test_empty_dataset_returns_empty_list(self) -> None: + with patch( + "aiperf.accuracy.benchmarks.aime25.load_dataset", + return_value=_make_fake_dataset([]), + ): + bench = AIME25Benchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert problems == [] + + @pytest.mark.asyncio + async def test_unicode_problem_text_preserved(self) -> None: + rows = [_make_row("Solve ∑₁ⁿ k² for n=10. ✓", 385)] + with patch( + "aiperf.accuracy.benchmarks.aime25.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = AIME25Benchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert "∑₁ⁿ" in problems[0].prompt