feat(accuracy): align MATH-500 with lighteval reference (AIP-879 follow-up)

debermudez · debermudez · commit e4b5d58fddc2 · 2026-05-12T00:59:36.000-07:00
Switches MATH-500 to lighteval-aligned grading per the trt-llm
benchmark recipe's acc_bench_lighteval.py:math_500. Loader emits the
bare problem text as the user message; ground_truth is the full
solution (containing the boxed answer); LightevalLatexGrader extracts
the boxed expression at grade time. Per-row `subject` becomes the
task name so the accuracy CSV breaks down by MATH subject.

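plugins.yaml: `math_500.default_grader` changes from `math` to
`lighteval_latex`, and `is_implemented: false` is removed. The loader
records `generation_size: 32768` in each problem's metadata.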

Reference: trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:156

Signed-off-by: Elias Bermudez <dbermudez@nvidia.com>
diff --git a/docs/accuracy/accuracy-benchmarking.md b/docs/accuracy/accuracy-benchmarking.md
@@ -74,6 +74,7 @@ system message).
 | `aime` | `math` | 8 | `Maxwell-Jia/AIME_2024` (trt-llm reference, 8-shot CoT) |
 | `aime24` | `lighteval_expr` | 0 | `HuggingFaceH4/aime_2024` (trt-llm/lighteval reference) |
 | `aime25` | `lighteval_expr` | 0 | `yentinglin/aime_2025` (trt-llm/lighteval reference) |
+| `math_500` | `lighteval_latex` | 0 | `HuggingFaceH4/MATH-500` (trt-llm/lighteval reference) |
 
 ## CLI Flags
 
diff --git a/src/aiperf/accuracy/benchmarks/math_500.py b/src/aiperf/accuracy/benchmarks/math_500.py
@@ -1,21 +1,115 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from aiperf.accuracy.models import BenchmarkProblem
+"""MATH-500 benchmark loader, aligned with the trt-llm lighteval reference.
+
+Mirrors ``acc_bench_lighteval.py:math_500``:
+
+    math_500 = LightevalTaskConfig(
+        name="math_500",
+        prompt_function=prompt_fn,        # query=line["problem"], choices=[line["solution"]]
+        hf_repo="HuggingFaceH4/MATH-500",
+        evaluation_splits=["test"],
+        few_shots_split=None,
+        generation_size=32768,
+        metric=[latex_gold_metric],
+    )
+
+Two notable differences from the AIME24/AIME25 loaders:
+
+1. ``ground_truth`` is the full ``solution`` text (which contains a
+   ``\\boxed{answer}``), not a bare answer. ``LightevalLatexGrader``'s
+   ``LatexExtractionConfig`` extracts the boxed answer from the
+   solution at grade time. This matches the recipe's
+   ``latex_gold_metric.gold_extraction_target=(LatexExtractionConfig(),)``.
+2. Pair with ``LightevalLatexGrader`` (default), not
+   ``LightevalExprGrader`` — gold answers are LaTeX expressions
+   (fractions, square roots, etc.).
+
+Reference:
+    trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:156
+"""
+
+from __future__ import annotations
+
+import asyncio
+from typing import Any
+
+from datasets import Dataset, load_dataset
+
+from aiperf.accuracy.models import AccuracyChatMessage, BenchmarkProblem
 from aiperf.common.config import UserConfig
 from aiperf.common.mixins import AIPerfLoggerMixin
 
+DATASET_NAME = "HuggingFaceH4/MATH-500"
+TASK_NAME = "math_500"
+
+# lighteval's math_500 task config: ``generation_size=32768``.
+DEFAULT_GENERATION_SIZE = 32768
+
+# Schema field names in HuggingFaceH4/MATH-500.
+PROBLEM_FIELD = "problem"
+SOLUTION_FIELD = "solution"
+SUBJECT_FIELD = "subject"
+LEVEL_FIELD = "level"
+
 
 class Math500Benchmark(AIPerfLoggerMixin):
-    """MATH-500 benchmark loader for mathematical reasoning evaluation."""
+    """MATH-500 lighteval-aligned benchmark loader.
 
-    def __init__(self, user_config: UserConfig, **kwargs) -> None:
+    Loads ``HuggingFaceH4/MATH-500`` (test split) and emits one user
+    message per problem containing the bare problem text — matching
+    lighteval's ``prompt_fn``. Gold is the full ``solution`` text;
+    ``LightevalLatexGrader`` extracts the boxed answer at grade time.
+    """
+
+    def __init__(self, user_config: UserConfig, **kwargs: Any) -> None:
         super().__init__(**kwargs)
         self.user_config = user_config
 
     async def load_problems(
         self, tasks: list[str] | None, n_shots: int, enable_cot: bool
     ) -> list[BenchmarkProblem]:
-        raise NotImplementedError(
-            "math_500 benchmark is not yet implemented; only 'mmlu' is available in this release."
-        )
+        """Load MATH-500 problems lighteval-style.
+
+        Args:
+            tasks: Ignored — lighteval's MATH-500 task has no subtask
+                filtering (subjects are kept in metadata for reporting,
+                but lighteval evaluates the full split). Use the
+                aggregated CSV per-subject row to break results down
+                after the run.
+            n_shots: Ignored — the lighteval reference is zero-shot
+                (``few_shots_split=None``).
+            enable_cot: Ignored — lighteval's ``prompt_fn`` does not
+                add a CoT trigger.
+
+        Returns:
+            One ``BenchmarkProblem`` per dataset row, in dataset order.
+        """
+        ds: Dataset = await asyncio.to_thread(load_dataset, DATASET_NAME, split="test")
+        return await asyncio.to_thread(self._build_problems, ds)
+
+    def _build_problems(self, ds: Dataset) -> list[BenchmarkProblem]:
+        problems: list[BenchmarkProblem] = []
+        for row in ds:
+            problem = row[PROBLEM_FIELD]
+            solution = str(row.get(SOLUTION_FIELD, ""))
+            messages: list[AccuracyChatMessage] = [{"role": "user", "content": problem}]
+            problems.append(
+                BenchmarkProblem(
+                    prompt=problem,
+                    # Gold is the full solution containing \boxed{answer};
+                    # LightevalLatexGrader extracts the boxed expression.
+                    ground_truth=solution,
+                    # Use ``subject`` as the per-row task so the
+                    # accuracy CSV breaks down by MATH subject.
+                    task=row.get(SUBJECT_FIELD) or TASK_NAME,
+                    metadata={
+                        "subject": row.get(SUBJECT_FIELD, ""),
+                        "level": row.get(LEVEL_FIELD),
+                        "generation_size": DEFAULT_GENERATION_SIZE,
+                    },
+                    raw_messages=messages,
+                )
+            )
+        return problems
diff --git a/src/aiperf/plugin/plugins.yaml b/src/aiperf/plugin/plugins.yaml
@@ -1162,12 +1162,13 @@ accuracy_benchmark:
   math_500:
     class: aiperf.accuracy.benchmarks.math_500:Math500Benchmark
     description: |
-      MATH-500 benchmark with 500 curated mathematical reasoning problems
-      spanning algebra, geometry, number theory, and combinatorics.
+      MATH-500 benchmark, aligned with the trt-llm benchmark recipe's
+      lighteval-backed configuration (HuggingFaceH4/MATH-500 + lighteval
+      ``latex_gold_metric``). Per-row ``subject`` is preserved as the
+      task name so accuracy results break down by MATH subject.
     metadata:
-      default_grader: math
+      default_grader: lighteval_latex
       default_n_shots: 0
-      is_implemented: false
 
   gpqa_diamond:
     class: aiperf.accuracy.benchmarks.gpqa_diamond:GPQADiamondBenchmark
diff --git a/tests/unit/accuracy/test_math_500_benchmark.py b/tests/unit/accuracy/test_math_500_benchmark.py