|
1 | 1 | # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
2 | 2 | # SPDX-License-Identifier: Apache-2.0 |
3 | 3 |
|
4 | | -from aiperf.accuracy.models import BenchmarkProblem |
| 4 | +"""MATH-500 benchmark loader, aligned with the trt-llm lighteval reference. |
| 5 | +
|
| 6 | +Mirrors ``acc_bench_lighteval.py:math_500``: |
| 7 | +
|
| 8 | + math_500 = LightevalTaskConfig( |
| 9 | + name="math_500", |
| 10 | + prompt_function=prompt_fn, # query=line["problem"], choices=[line["solution"]] |
| 11 | + hf_repo="HuggingFaceH4/MATH-500", |
| 12 | + evaluation_splits=["test"], |
| 13 | + few_shots_split=None, |
| 14 | + generation_size=32768, |
| 15 | + metric=[latex_gold_metric], |
| 16 | + ) |
| 17 | +
|
| 18 | +Two notable differences from the AIME24/AIME25 loaders: |
| 19 | +
|
| 20 | +1. ``ground_truth`` is the full ``solution`` text (which contains a |
| 21 | + ``\\boxed{answer}``), not a bare answer. ``LightevalLatexGrader``'s |
| 22 | + ``LatexExtractionConfig`` extracts the boxed answer from the |
| 23 | + solution at grade time. This matches the recipe's |
| 24 | + ``latex_gold_metric.gold_extraction_target=(LatexExtractionConfig(),)``. |
| 25 | +2. Pair with ``LightevalLatexGrader`` (default), not |
| 26 | + ``LightevalExprGrader`` — gold answers are LaTeX expressions |
| 27 | + (fractions, square roots, etc.). |
| 28 | +
|
| 29 | +Reference: |
| 30 | + trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:156 |
| 31 | +""" |
| 32 | + |
| 33 | +from __future__ import annotations |
| 34 | + |
| 35 | +import asyncio |
| 36 | +from typing import Any |
| 37 | + |
| 38 | +from datasets import Dataset, load_dataset |
| 39 | + |
| 40 | +from aiperf.accuracy.models import AccuracyChatMessage, BenchmarkProblem |
5 | 41 | from aiperf.common.config import UserConfig |
6 | 42 | from aiperf.common.mixins import AIPerfLoggerMixin |
7 | 43 |
|
# Hugging Face dataset repository and the task label used when a row
# carries no subject of its own.
DATASET_NAME = "HuggingFaceH4/MATH-500"
TASK_NAME = "math_500"

# Generation budget recorded per problem; mirrors ``generation_size=32768``
# in lighteval's math_500 task config.
DEFAULT_GENERATION_SIZE = 32768

# Column names read from each HuggingFaceH4/MATH-500 row.
PROBLEM_FIELD = "problem"
SOLUTION_FIELD = "solution"
SUBJECT_FIELD = "subject"
LEVEL_FIELD = "level"
| 55 | + |
8 | 56 |
|
class Math500Benchmark(AIPerfLoggerMixin):
    """MATH-500 lighteval-aligned benchmark loader.

    Loads ``HuggingFaceH4/MATH-500`` (test split) and emits one user
    message per problem containing the bare problem text — matching
    lighteval's ``prompt_fn``. Gold is the full ``solution`` text;
    ``LightevalLatexGrader`` extracts the boxed answer at grade time.
    """

    def __init__(self, user_config: UserConfig, **kwargs: Any) -> None:
        """Keep a reference to ``user_config``; forward ``kwargs`` to the base class."""
        super().__init__(**kwargs)
        self.user_config = user_config

    async def load_problems(
        self, tasks: list[str] | None, n_shots: int, enable_cot: bool
    ) -> list[BenchmarkProblem]:
        """Load MATH-500 problems lighteval-style.

        Args:
            tasks: Ignored — lighteval's MATH-500 task has no subtask
                filtering (subjects are kept in metadata for reporting,
                but lighteval evaluates the full split). Use the
                aggregated CSV per-subject row to break results down
                after the run.
            n_shots: Ignored — the lighteval reference is zero-shot
                (``few_shots_split=None``).
            enable_cot: Ignored — lighteval's ``prompt_fn`` does not
                add a CoT trigger.

        Returns:
            One ``BenchmarkProblem`` per dataset row, in dataset order.
        """
        # Both the dataset load and the row conversion are synchronous, so
        # they are pushed to a worker thread instead of running on the
        # event loop.
        ds: Dataset = await asyncio.to_thread(load_dataset, DATASET_NAME, split="test")
        return await asyncio.to_thread(self._build_problems, ds)

    def _build_problems(self, ds: Dataset) -> list[BenchmarkProblem]:
        """Convert dataset rows into ``BenchmarkProblem`` objects.

        Args:
            ds: Iterable of row mappings; each row must provide the
                problem field, while solution, subject and level are
                optional.

        Returns:
            One ``BenchmarkProblem`` per row, preserving iteration order.

        Raises:
            KeyError: If a row has no problem field.
        """
        problems: list[BenchmarkProblem] = []
        for row in ds:
            problem = row[PROBLEM_FIELD]

            # ``dict.get(key, "")`` only covers a *missing* key: a key that is
            # present with a null value would otherwise become the literal
            # gold string "None" after ``str()``.
            raw_solution = row.get(SOLUTION_FIELD)
            solution = "" if raw_solution is None else str(raw_solution)

            # Normalise a missing or null subject once so the task label and
            # the metadata entry cannot disagree.
            subject = row.get(SUBJECT_FIELD) or ""

            messages: list[AccuracyChatMessage] = [{"role": "user", "content": problem}]
            problems.append(
                BenchmarkProblem(
                    prompt=problem,
                    # Gold is the full solution containing \boxed{answer};
                    # LightevalLatexGrader extracts the boxed expression.
                    ground_truth=solution,
                    # Use ``subject`` as the per-row task so the
                    # accuracy CSV breaks down by MATH subject.
                    task=subject or TASK_NAME,
                    metadata={
                        "subject": subject,
                        "level": row.get(LEVEL_FIELD),
                        "generation_size": DEFAULT_GENERATION_SIZE,
                    },
                    raw_messages=messages,
                )
            )
        return problems
0 commit comments