src/lighteval/main_inspect.py (3 additions, 1 deletion)
@@ -37,6 +37,7 @@
 from typer import Argument, Option
 from typing_extensions import Annotated

+from lighteval.cli_args import load_tasks_multilingual as load_tasks_multilingual_arg
 from lighteval.models.abstract_model import InspectAIModelConfig
 from lighteval.tasks.lighteval_task import LightevalTaskConfig

@@ -432,10 +433,11 @@ def eval( # noqa C901
             rich_help_panel=HELP_PANEL_NAME_4,
         ),
     ] = False,
+    load_tasks_multilingual: load_tasks_multilingual_arg.type = load_tasks_multilingual_arg.default,
 ):
     from lighteval.tasks.registry import Registry

-    registry = Registry(tasks=tasks, custom_tasks=None, load_multilingual=False)
+    registry = Registry(tasks=tasks, custom_tasks=custom_tasks, load_multilingual=load_tasks_multilingual)
     task_configs = registry.task_to_configs
     inspect_ai_tasks = []

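Reviewer note: the new option is plumbed straight into Registry. A minimal sketch of the resulting call, mirroring the diff above; the task-spec string format and the pre-existing custom_tasks option are assumptions for illustration, not verified against this branch:

# Sketch only: mirrors the new wiring in eval(); not a verified invocation.
from lighteval.tasks.registry import Registry

registry = Registry(
    tasks="lighteval|mmath500:fi|0",  # assumed task-spec format
    custom_tasks=None,                # forwarded from eval()'s custom_tasks option (context not shown in this diff)
    load_multilingual=True,           # forwarded from the new load_tasks_multilingual flag
)
task_configs = registry.task_to_configs  # multilingual tasks now resolve when the flag is set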
src/lighteval/tasks/multilingual/tasks/mmath500.py (31 additions, 4 deletions)
@@ -6,12 +6,13 @@
 LumiOpen/MATH-500_mt

 abstract:
-Multilingual translations of the MATH-500 benchmark, a subset of 500 problems
-from the MATH benchmark that OpenAI created in their Let's Verify Step by Step
-paper. Currently contains Finnish translations produced with Claude Opus 4.5.
+Multilingual MATH-500 benchmark, a subset of 500 problems from the MATH
+benchmark that OpenAI created in their Let's Verify Step by Step paper.
+Contains the original English problems and Finnish translations produced
+with Claude Opus 4.5. Supports configurable scorer model via env vars.

 languages:
-finnish
+english, finnish

 tags:
 math, reasoning, multilingual
@@ -33,6 +34,12 @@


 MATH_QUERY_TEMPLATES = {
+    "en": """
+Solve the following problem. The final line of your response MUST be of the following format:
+"ANSWER: $ANSWER" (without quotes) where $ANSWER is the final answer. Think step by step before answering.
+
+{prompt}
+""".strip(),
     "fi": """
 Ratkaise seuraava tehtävä. Vastauksesi viimeisen rivin TÄYTYY olla seuraavassa muodossa:
 "ANSWER: $ANSWER" (ilman lainausmerkkejä), jossa $ANSWER on lopullinen vastaus. Ajattele vaiheittain ennen vastaamista.
@@ -84,6 +91,25 @@ def record_to_sample(record):
     return Sample(input=query, target=target)


+mmath500_en = LightevalTaskConfig(
+    name="mmath500:en",
+    prompt_function=_mmath500_prompt_fn("en"),
+    hf_repo="HuggingFaceH4/MATH-500",
+    hf_subset="default",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32768,
+    metrics=[
+        Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}),
+    ],
+    version=1,
+    sample_fields=record_to_sample,
+    solver=[prompt_template(MATH_QUERY_TEMPLATES["en"]), generate(cache=True)],
+    scorer=model_graded_fact(model=_get_scorer_model()),
+)
+
 mmath500_fi = LightevalTaskConfig(
     name="mmath500:fi",
     prompt_function=_mmath500_prompt_fn("fi"),
@@ -104,5 +130,6 @@ def record_to_sample(record):
 )

 TASKS_TABLE = [
+    mmath500_en,
     mmath500_fi,
 ]
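Reviewer note: the new English template is the wording the Finnish template translates, so the two tasks should be directly comparable. A minimal sketch of how the {prompt} placeholder gets filled for mmath500:en; the sample problem is invented for illustration, and inspect_ai's prompt_template solver performs this substitution during the solver chain:

# Sketch only: renders the "en" template the way prompt_template would.
from lighteval.tasks.multilingual.tasks.mmath500 import MATH_QUERY_TEMPLATES

template = MATH_QUERY_TEMPLATES["en"]
rendered = template.format(prompt="What is the remainder when 2**10 is divided by 7?")
print(rendered)
# A well-behaved model should end its response with: ANSWER: 2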
src/lighteval/tasks/tasks/math_500.py (26 additions, 1 deletion)
@@ -22,7 +22,17 @@
 true
 """

+import os
+import warnings
+
+warnings.warn(
+    "math_500 is deprecated, use mmath500:en instead (supports configurable scorer model)",
+    DeprecationWarning,
+    stacklevel=2,
+)
+
 from inspect_ai.dataset import Sample
+from inspect_ai.model import GenerateConfig, get_model
 from inspect_ai.scorer import model_graded_fact
 from inspect_ai.solver import generate, prompt_template

@@ -31,6 +41,21 @@
 from lighteval.tasks.requests import Doc


+def _get_scorer_model():
+    base_url = os.environ.get("SCORER_MODEL_BASE_URL")
+    if base_url:
+        model_name = os.environ.get("SCORER_MODEL_PATH", "Qwen/Qwen3.5-9B")
+        return get_model(
+            f"openai-api/scorer/{model_name}",
+            config=GenerateConfig(
+                extra_body={"chat_template_kwargs": {"enable_thinking": False}},
+            ),
+            base_url=base_url,
+            api_key=os.environ.get("VLLM_API_KEY", "inspectai"),
+        )
+    return None
+
+
 MATH_QUERY_TEMPLATE = """
 Solve the following problem. The final line of your response MUST be of the following format:
 "ANSWER: $ANSWER" (without quotes) where $ANSWER is the final answer. Think step by step before answering.
@@ -71,7 +96,7 @@ def record_to_sample(record):
     version=2,
     sample_fields=record_to_sample,
     solver=[prompt_template(MATH_QUERY_TEMPLATE), generate(cache=True)],
-    scorer=model_graded_fact(),
+    scorer=model_graded_fact(model=_get_scorer_model()),
 )

 TASKS_TABLE = [
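Reviewer note: _get_scorer_model() only overrides the grader when SCORER_MODEL_BASE_URL is set; otherwise it returns None and model_graded_fact() keeps inspect_ai's default grading behavior. A hedged sketch of the environment the helper reads; the endpoint value is a placeholder, not something from this PR:

# Hypothetical configuration for the scorer override; values are illustrative.
import os

os.environ["SCORER_MODEL_BASE_URL"] = "http://localhost:8000/v1"  # assumed OpenAI-compatible endpoint
os.environ["SCORER_MODEL_PATH"] = "Qwen/Qwen3.5-9B"               # matches the helper's default
os.environ["VLLM_API_KEY"] = "inspectai"                          # matches the helper's fallback

# With these set, the grader resolves to openai-api/scorer/Qwen/Qwen3.5-9B,
# with thinking disabled via chat_template_kwargs in GenerateConfig.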