Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions src/eval/tasks/aime2025/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from inspect_ai import eval as inspect_eval # type: ignore # noqa: E402
from inspect_ai.util._display import init_display_type # noqa: E402

import inspect_evals.aime2025 # noqa: F401, E402 (registers task definitions)
from task import aime2025


def parse_args() -> argparse.Namespace:
Expand Down Expand Up @@ -67,14 +67,13 @@ def main() -> None:
if (args.limit is not None) and (args.limit != -1):
other_kwargs["limit"] = args.limit

task = "inspect_evals/aime2025"
model_args = {
'gpu_memory_utilization': args.gpu_memory_utilization,
}
model_args.update(template_kwargs(args))

eval_out = inspect_eval(
task,
aime2025(),
model=f"vllm/{args.model_path}",
model_args=model_args,
score_display=False,
Expand Down
75 changes: 75 additions & 0 deletions src/eval/tasks/aime2025/score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""AIME numeric answer extraction and equality grading.

inspect_ai.scorer.match(numeric=True) compares the extracted answer to the
target with str.endswith, which marks a wrong answer correct whenever the
prediction merely ends with the target's digits (e.g. prediction 711 vs. target 11).
"""

from __future__ import annotations

import re

from inspect_ai._util.text import strip_numeric_punctuation
from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer, stderr
from inspect_ai.scorer._common import first_number_normalized, normalize_number
from inspect_ai.scorer._metric import CORRECT, INCORRECT
from inspect_ai.solver import TaskState

# Matches a line of the form "ANSWER: <text>" (case-insensitive, with ^/$
# anchored per line) and captures <text> without surrounding whitespace.
ANSWER_LINE = re.compile(r"(?im)^\s*ANSWER:\s*(.+?)\s*$")
# Matches a LaTeX \boxed{...} whose contents contain no braces and captures
# those contents.
BOXED = re.compile(r"\\boxed\{([^{}]*)\}")


def strip_boxed(text: str) -> str:
    """Replace every ``\\boxed{...}`` in *text* with its contents.

    The substitution is repeated until the string stops changing, so a box
    that only becomes brace-free after an inner box was unwrapped is
    unwrapped on a later pass.
    """
    before, after = None, text
    while after != before:
        before, after = after, BOXED.sub(r"\1", after)
    return after


def extract(completion: str) -> str:
    """Pull the model's final answer out of *completion*.

    ``\\boxed{}`` wrappers are removed first. If the text contains one or more
    ``ANSWER:`` lines, the content of the last one is returned. Otherwise the
    whitespace-separated tokens are handed, last token first, to
    ``first_number_normalized`` (presumably yielding the last number in the
    text -- confirm against inspect_ai).
    """
    unboxed = strip_boxed(completion.strip())

    answer_lines = ANSWER_LINE.findall(unboxed)
    if not answer_lines:
        # No explicit answer line: fall back to scanning tokens from the end.
        tokens = re.split(r"\s+", strip_numeric_punctuation(unboxed.casefold()))
        return first_number_normalized(tokens[::-1])

    return answer_lines[-1].strip()


def grade(completion: str, target: str) -> tuple[str, bool]:
    """Grade *completion* against *target* by exact equality.

    Returns:
        ``(answer, matched)`` where ``answer`` is the text extracted from the
        completion and ``matched`` says whether it equals the target once both
        have been case-folded, stripped of numeric punctuation and passed
        through ``normalize_number``.

    Raises:
        ValueError: if the whitespace-stripped target is not numeric.
    """
    gold_text = target.strip()
    if not gold_text.isnumeric():
        raise ValueError(f"AIME targets must be numeric strings, got {target!r}")

    def canonical(value: str) -> str:
        # One shared normalisation path for prediction and gold answer.
        return normalize_number(strip_numeric_punctuation(value.casefold()))

    answer = extract(completion)
    return answer, canonical(answer) == canonical(gold_text)


@scorer(metrics=[accuracy(), stderr()])
def aime_scorer() -> Scorer:
    """Build a scorer that marks a sample correct on exact numeric equality.

    The completion is graded against each target in turn; the first match
    yields CORRECT, and INCORRECT is returned when no target matches. Both
    outcomes carry the raw completion and the extracted answer.
    """

    async def score(state: TaskState, target: Target) -> Score:
        completion = state.output.completion

        def verdict(value: str, answer: str | None) -> Score:
            # Both outcomes share the same payload apart from the value.
            return Score(
                value=value,
                answer=answer,
                explanation=completion,
                metadata={
                    "unprocessed_answer": completion,
                    "extracted_answer": answer,
                },
            )

        # Stays None only when there are no targets to grade against.
        last_answer: str | None = None
        for gold in target:
            last_answer, is_match = grade(completion, gold)
            if is_match:
                return verdict(CORRECT, last_answer)
        return verdict(INCORRECT, last_answer)

    return score
35 changes: 35 additions & 0 deletions src/eval/tasks/aime2025/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""AIME 2025 inspect task with equality-based numeric grading."""

from __future__ import annotations

from typing import Any

from inspect_ai import Task, task
from inspect_ai.dataset import Sample, hf_dataset
from inspect_evals.aime2024.aime2024 import aime2024_solver

from score import aime_scorer

DATASET_PATH = "math-ai/aime25"


@task
def aime2025() -> Task:
    """Assemble the AIME 2025 task.

    Loads the ``test`` split of ``DATASET_PATH``, maps each record through
    ``record_to_sample``, and pairs the AIME 2024 solver with the
    equality-based ``aime_scorer``.
    """
    return Task(
        dataset=hf_dataset(
            path=DATASET_PATH,
            split="test",
            sample_fields=record_to_sample,
        ),
        solver=aime2024_solver(),
        scorer=[aime_scorer()],
    )


def record_to_sample(record: dict[str, Any]) -> Sample:
    """Convert one dataset record into a ``Sample``.

    Reads the ``id``, ``problem`` and ``answer`` keys; the answer is converted
    to ``str`` before being used as the target.
    """
    sample_id = record["id"]
    problem = record["problem"]
    answer = str(record["answer"])
    return Sample(id=sample_id, input=problem, target=answer)
25 changes: 25 additions & 0 deletions src/eval/tasks/aime2025/test_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from score import grade, strip_boxed


def test_suffix_false_positive_rejected():
    """A prediction of 711 must not be accepted for target 11."""
    matched = grade("ANSWER: 711", "11")[1]
    assert not matched


def test_dataset_suffix_collision_rejected():
    """A prediction of 149 must not be accepted for target 49."""
    matched = grade("ANSWER: 149", "49")[1]
    assert not matched


def test_correct_answer_line():
    """An ANSWER line equal to the target is graded as a match."""
    matched = grade("Step by step...\nANSWER: 127", "127")[1]
    assert matched


def test_answer_line_beats_trailing_wrong_number():
    """The ANSWER line decides the grade even when another number appears in the text."""
    matched = grade("work shows 149\nANSWER: 49", "49")[1]
    assert matched


def test_strip_boxed_multiple():
    """Every \\boxed{} occurrence in a string is unwrapped."""
    boxed_text = r"\boxed{1} and \boxed{2}"
    assert strip_boxed(boxed_text) == "1 and 2"
5 changes: 5 additions & 0 deletions src/run_task.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ mkdir -p "${JOB_DIR}"
mkdir "${JOB_DIR}/task"

# The task's evaluate.py is copied unconditionally into the job's task directory.
cp "src/eval/tasks/${EVALUATION_TASK}/evaluate.py" "${JOB_DIR}/task"
# score.py and task.py are optional: each is copied only if the selected task
# ships it, so tasks without these files are unaffected.
for _aime_extra in score.py task.py; do
if [ -f "src/eval/tasks/${EVALUATION_TASK}/${_aime_extra}" ]; then
cp "src/eval/tasks/${EVALUATION_TASK}/${_aime_extra}" "${JOB_DIR}/task"
fi
done
# An evaluation_code/ directory, when present, is copied recursively as well.
if [ -d "src/eval/tasks/${EVALUATION_TASK}/evaluation_code" ]; then
cp -r "src/eval/tasks/${EVALUATION_TASK}/evaluation_code" "${JOB_DIR}/task"
fi
Expand Down