llama-eval : add AIME 2026 dataset support (ggml-org#23058)

ggerganov · web-flow · commit d5dc2e0a0275 · 2026-05-15T13:58:30.000+03:00
Add an Aime2026Dataset class that loads the MathArena/aime_2026 dataset from
Hugging Face. The dataset has 30 problems (two sets of 15) in a single
config/split ("default"/"train").

Usage: --dataset aime2026

Assisted-by: llama.cpp:local pi
diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py
@@ -44,6 +44,7 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f
 GRADER_PATTERNS = {
     "aime": r'\boxed{(\d+)}|\b(\d+)\b',
     "aime2025": r'\boxed{(\d+)}|\b(\d+)\b',
+    "aime2026": r'\\boxed{(\d+)}|\b(\d+)\b',  # "\\b": a bare \b is a regex word boundary and never matches a literal "\boxed"
     "gsm8k": r'\b(\d+)\b',
 }
 
@@ -58,6 +59,11 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f
         "-123",
         "999"
     ],
+    "aime2026": [
+        "42",
+        "-123",
+        "999"
+    ],
     "gsm8k": [
         "42",
         "-123",
@@ -81,6 +87,12 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f
 
 {question}
 
+Remember to put your answer inside \\boxed{{}}.
+""",
+    "aime2026": """Solve the following math problem step by step. Put your answer inside \\boxed{{}}.
+
+{question}
+
 Remember to put your answer inside \\boxed{{}}.
 """,
     "gsm8k": """{question}
@@ -166,6 +178,8 @@ def load_dataset(self, seed: int = 1234):
             self.dataset = AimeDataset()
         elif self.dataset_type == "aime2025":
             self.dataset = Aime2025Dataset()
+        elif self.dataset_type == "aime2026":
+            self.dataset = Aime2026Dataset()
         elif self.dataset_type == "gsm8k":
             self.dataset = Gsm8kDataset()
         elif self.dataset_type == "gpqa":
@@ -679,6 +693,47 @@ def get_prompt(self, question: Dict) -> str:
             question=self.get_question_text(question),
         )
 
+class Aime2026Dataset(BaseDataset):
+    """AIME 2026 problems loaded from the MathArena/aime_2026 Hugging Face dataset."""
+
+    def __init__(self):
+        self.questions = []
+        self._load_dataset()
+
+    def _load_dataset(self):
+        """Fill self.questions with one dict per dataset row, tagged with dataset_type."""
+        print("Loading AIME2026 dataset...")
+        from datasets import load_dataset
+
+        # NOTE(review): cache_path is a dataset-specific subdirectory, yet it is passed as
+        # load_dataset's cache_dir -- confirm this really reuses the cached copy.
+        cache_path = cache_dir / "MathArena___aime_2026" / "default" / "0.0.0"
+        if cache_path.exists():
+            print(f"Using cached dataset from {cache_path}")
+            ds = load_dataset("MathArena/aime_2026", "default", split="train", cache_dir=str(cache_path))
+        else:
+            ds = load_dataset("MathArena/aime_2026", "default", split="train")
+
+        # Copy each row so the tag is added to our own dict, not to the dataset's row object.
+        self.questions = [dict(row, dataset_type="aime2026") for row in ds]
+        print(f"AIME2026 dataset loaded: {len(self.questions)} questions")
+
+    def get_question(self, index: int) -> Dict:
+        """Return the question dict stored at position index."""
+        return self.questions[index]
+
+    def get_question_text(self, question: Dict) -> str:
+        """Return the problem statement from the question's "problem" field."""
+        return question["problem"]
+
+    def get_answer(self, question: Dict) -> str:
+        """Return the question's "answer" field converted to a string."""
+        return str(question["answer"])
+
+    def get_prompt(self, question: Dict) -> str:
+        """Format the "aime2026" template from TEMPLATE_REGISTRY with the problem text."""
+        return TEMPLATE_REGISTRY["aime2026"].format(question=self.get_question_text(question))
+
 class Gsm8kDataset(BaseDataset):
     def __init__(self, split: str = "test"):
         self.split = split
@@ -1188,7 +1243,7 @@ def main():
         "--dataset",
         type=str,
         default="aime",
-        choices=["aime", "aime2025", "gsm8k", "gpqa"],
+        choices=["aime", "aime2025", "aime2026", "gsm8k", "gpqa"],
         help="Dataset type (default: aime)"
     )
     parser.add_argument(