@@ -44,6 +44,7 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f
# Answer-extraction regexes, keyed by dataset type.
#
# AIME-style pattern: group 1 captures the integer inside a LaTeX
# ``\boxed{...}``; group 2 is the fallback and captures any bare integer.
# The backslash and braces are escaped on purpose: an unescaped ``\b`` in a
# regex is a word-boundary assertion, so ``\boxed`` would look for the text
# "oxed" and never match a real ``\boxed{42}``.
_AIME_ANSWER_PATTERN = r'\\boxed\{(\d+)\}|\b(\d+)\b'

GRADER_PATTERNS = {
    "aime": _AIME_ANSWER_PATTERN,
    "aime2025": _AIME_ANSWER_PATTERN,
    "aime2026": _AIME_ANSWER_PATTERN,
    # GSM8K: single group capturing any bare integer.
    "gsm8k": r'\b(\d+)\b',
}
4950
@@ -58,6 +59,11 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f
5859 "-123" ,
5960 "999"
6061 ],
62+ "aime2026" : [
63+ "42" ,
64+ "-123" ,
65+ "999"
66+ ],
6167 "gsm8k" : [
6268 "42" ,
6369 "-123" ,
@@ -81,6 +87,12 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f
8187
8288{question}
8389
90+ Remember to put your answer inside \\ boxed{{}}.
91+ """ ,
92+ "aime2026" : """Solve the following math problem step by step. Put your answer inside \\ boxed{{}}.
93+
94+ {question}
95+
8496Remember to put your answer inside \\ boxed{{}}.
8597""" ,
8698 "gsm8k" : """{question}
@@ -166,6 +178,8 @@ def load_dataset(self, seed: int = 1234):
166178 self .dataset = AimeDataset ()
167179 elif self .dataset_type == "aime2025" :
168180 self .dataset = Aime2025Dataset ()
181+ elif self .dataset_type == "aime2026" :
182+ self .dataset = Aime2026Dataset ()
169183 elif self .dataset_type == "gsm8k" :
170184 self .dataset = Gsm8kDataset ()
171185 elif self .dataset_type == "gpqa" :
@@ -679,6 +693,47 @@ def get_prompt(self, question: Dict) -> str:
679693 question = self .get_question_text (question ),
680694 )
681695
class Aime2026Dataset(BaseDataset):
    """Question source backed by the ``MathArena/aime_2026`` dataset.

    The ``train`` split is loaded eagerly on construction. Each row is
    copied into a plain dict and tagged with ``dataset_type == "aime2026"``.
    """

    def __init__(self):
        self.questions = []
        self._load_dataset()

    def _load_dataset(self):
        """Fetch the ``train`` split and populate ``self.questions``."""
        print("Loading AIME2026 dataset...")
        from datasets import load_dataset

        load_kwargs = {"split": "train"}
        cache_path = cache_dir / "MathArena___aime_2026" / "default" / "0.0.0"
        if cache_path.exists():
            print(f"Using cached dataset from {cache_path} ")
            # NOTE(review): this hands the dataset-specific sub-directory to
            # ``cache_dir`` -- confirm that is what load_dataset expects here.
            load_kwargs["cache_dir"] = str(cache_path)

        rows = load_dataset("MathArena/aime_2026", "default", **load_kwargs)

        # Copy every row so the tag is stored on our own dict.
        self.questions = [dict(row, dataset_type="aime2026") for row in rows]

        print(f"AIME2026 dataset loaded: {len(self.questions)} questions")

    def get_question(self, index: int) -> Dict:
        """Return the question dict stored at position ``index``."""
        return self.questions[index]

    def get_question_text(self, question: Dict) -> str:
        """Return the ``"problem"`` field of ``question``."""
        return question["problem"]

    def get_answer(self, question: Dict) -> str:
        """Return the ``"answer"`` field of ``question`` converted to ``str``."""
        return str(question["answer"])

    def get_prompt(self, question: Dict) -> str:
        """Fill the ``"aime2026"`` template with the question text."""
        template = TEMPLATE_REGISTRY["aime2026"]
        return template.format(question=self.get_question_text(question))
736+
682737class Gsm8kDataset (BaseDataset ):
683738 def __init__ (self , split : str = "test" ):
684739 self .split = split
@@ -1188,7 +1243,7 @@ def main():
11881243 "--dataset" ,
11891244 type = str ,
11901245 default = "aime" ,
1191- choices = ["aime" , "aime2025" , "gsm8k" , "gpqa" ],
1246+ choices = ["aime" , "aime2025" , "aime2026" , " gsm8k" , "gpqa" ],
11921247 help = "Dataset type (default: aime)"
11931248 )
11941249 parser .add_argument (
0 commit comments