Skip to content

Commit d5dc2e0

Browse files
authored
llama-eval : add AIME 2026 dataset support (ggml-org#23058)
Add an Aime2026Dataset class that loads MathArena/aime_2026 from Hugging Face: 30 problems (two sets of 15) in a single config/split. Usage: --dataset aime2026. Assisted-by: llama.cpp:local pi
1 parent ac33f03 commit d5dc2e0

1 file changed

Lines changed: 56 additions & 1 deletion

File tree

examples/llama-eval/llama-eval.py

Lines changed: 56 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f
4444
# Answer-extraction regexes keyed by dataset type.
#
# For the AIME variants, group 1 captures an integer wrapped in a literal
# ``\boxed{...}`` and group 2 captures any standalone integer. The backslash
# and braces are escaped on purpose: an unescaped ``\b`` inside a regex is a
# word-boundary assertion, so ``\boxed{`` would be read as "boundary + oxed{"
# and could never match the LaTeX ``\boxed{42}`` form.
# NOTE(review): consumers are outside this view; confirm they accept a match
# in either capture group.
GRADER_PATTERNS = {
    "aime": r'\\boxed\{(\d+)\}|\b(\d+)\b',
    "aime2025": r'\\boxed\{(\d+)\}|\b(\d+)\b',
    "aime2026": r'\\boxed\{(\d+)\}|\b(\d+)\b',
    "gsm8k": r'\b(\d+)\b',
}
4950

@@ -58,6 +59,11 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f
5859
"-123",
5960
"999"
6061
],
62+
"aime2026": [
63+
"42",
64+
"-123",
65+
"999"
66+
],
6167
"gsm8k": [
6268
"42",
6369
"-123",
@@ -81,6 +87,12 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f
8187
8288
{question}
8389
90+
Remember to put your answer inside \\boxed{{}}.
91+
""",
92+
"aime2026": """Solve the following math problem step by step. Put your answer inside \\boxed{{}}.
93+
94+
{question}
95+
8496
Remember to put your answer inside \\boxed{{}}.
8597
""",
8698
"gsm8k": """{question}
@@ -166,6 +178,8 @@ def load_dataset(self, seed: int = 1234):
166178
self.dataset = AimeDataset()
167179
elif self.dataset_type == "aime2025":
168180
self.dataset = Aime2025Dataset()
181+
elif self.dataset_type == "aime2026":
182+
self.dataset = Aime2026Dataset()
169183
elif self.dataset_type == "gsm8k":
170184
self.dataset = Gsm8kDataset()
171185
elif self.dataset_type == "gpqa":
@@ -679,6 +693,47 @@ def get_prompt(self, question: Dict) -> str:
679693
question=self.get_question_text(question),
680694
)
681695

696+
class Aime2026Dataset(BaseDataset):
    """AIME 2026 problems read from the ``MathArena/aime_2026`` dataset.

    Every entry in ``self.questions`` is a plain ``dict`` copy of one dataset
    row with an extra ``"dataset_type"`` key set to ``"aime2026"``.
    """

    def __init__(self):
        self.questions = []
        self._load_dataset()

    def _load_dataset(self):
        """Load the ``train`` split and fill ``self.questions``."""
        print("Loading AIME2026 dataset...")
        from datasets import load_dataset

        cache_path = cache_dir / "MathArena___aime_2026" / "default" / "0.0.0"

        # Only pass ``cache_dir`` when the expected cache directory exists;
        # otherwise the library's default cache location is used.
        extra_args = {}
        if cache_path.exists():
            print(f"Using cached dataset from {cache_path}")
            extra_args["cache_dir"] = str(cache_path)

        rows = load_dataset("MathArena/aime_2026", "default", split="train", **extra_args)

        # Copy each row and tag it with the dataset it came from.
        self.questions = [dict(row, dataset_type="aime2026") for row in rows]

        print(f"AIME2026 dataset loaded: {len(self.questions)} questions")

    def get_question(self, index: int) -> Dict:
        """Return the question dict stored at position ``index``."""
        return self.questions[index]

    def get_question_text(self, question: Dict) -> str:
        """Return the problem statement held under the ``"problem"`` key."""
        return question["problem"]

    def get_answer(self, question: Dict) -> str:
        """Return the ``"answer"`` field converted to a string."""
        answer = question["answer"]
        return str(answer)

    def get_prompt(self, question: Dict) -> str:
        """Render the ``"aime2026"`` prompt template for ``question``."""
        template = TEMPLATE_REGISTRY["aime2026"]
        problem_text = self.get_question_text(question)
        return template.format(question=problem_text)

682737
class Gsm8kDataset(BaseDataset):
683738
def __init__(self, split: str = "test"):
684739
self.split = split
@@ -1188,7 +1243,7 @@ def main():
11881243
"--dataset",
11891244
type=str,
11901245
default="aime",
1191-
choices=["aime", "aime2025", "gsm8k", "gpqa"],
1246+
choices=["aime", "aime2025", "aime2026", "gsm8k", "gpqa"],
11921247
help="Dataset type (default: aime)"
11931248
)
11941249
parser.add_argument(

0 commit comments

Comments
 (0)