Skip to content

Commit f9ab622

Browse files
authored
Merge pull request #7 from LumiOpen/daniel/translate-prompts
Add mmath500:en, fix inspect backend for multilingual tasks
2 parents 352d4ce + 54cd2d2 commit f9ab622

3 files changed

Lines changed: 60 additions & 6 deletions

File tree

src/lighteval/main_inspect.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from typer import Argument, Option
3838
from typing_extensions import Annotated
3939

40+
from lighteval.cli_args import load_tasks_multilingual as load_tasks_multilingual_arg
4041
from lighteval.models.abstract_model import InspectAIModelConfig
4142
from lighteval.tasks.lighteval_task import LightevalTaskConfig
4243

@@ -432,10 +433,11 @@ def eval( # noqa C901
432433
rich_help_panel=HELP_PANEL_NAME_4,
433434
),
434435
] = False,
436+
load_tasks_multilingual: load_tasks_multilingual_arg.type = load_tasks_multilingual_arg.default,
435437
):
436438
from lighteval.tasks.registry import Registry
437439

438-
registry = Registry(tasks=tasks, custom_tasks=None, load_multilingual=False)
440+
registry = Registry(tasks=tasks, custom_tasks=custom_tasks, load_multilingual=load_tasks_multilingual)
439441
task_configs = registry.task_to_configs
440442
inspect_ai_tasks = []
441443

src/lighteval/tasks/multilingual/tasks/mmath500.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,13 @@
66
LumiOpen/MATH-500_mt
77
88
abstract:
9-
Multilingual translations of the MATH-500 benchmark, a subset of 500 problems
10-
from the MATH benchmark that OpenAI created in their Let's Verify Step by Step
11-
paper. Currently contains Finnish translations produced with Claude Opus 4.5.
9+
Multilingual MATH-500 benchmark, a subset of 500 problems from the MATH
10+
benchmark that OpenAI created in their Let's Verify Step by Step paper.
11+
Contains the original English problems and Finnish translations produced
12+
with Claude Opus 4.5. Supports a configurable scorer model via environment variables.
1213
1314
languages:
14-
finnish
15+
english, finnish
1516
1617
tags:
1718
math, reasoning, multilingual
@@ -33,6 +34,12 @@
3334

3435

3536
MATH_QUERY_TEMPLATES = {
37+
"en": """
38+
Solve the following problem. The final line of your response MUST be of the following format:
39+
"ANSWER: $ANSWER" (without quotes) where $ANSWER is the final answer. Think step by step before answering.
40+
41+
{prompt}
42+
""".strip(),
3643
"fi": """
3744
Ratkaise seuraava tehtävä. Vastauksesi viimeisen rivin TÄYTYY olla seuraavassa muodossa:
3845
"ANSWER: $ANSWER" (ilman lainausmerkkejä), jossa $ANSWER on lopullinen vastaus. Ajattele vaiheittain ennen vastaamista.
@@ -84,6 +91,25 @@ def record_to_sample(record):
8491
return Sample(input=query, target=target)
8592

8693

94+
mmath500_en = LightevalTaskConfig(
95+
name="mmath500:en",
96+
prompt_function=_mmath500_prompt_fn("en"),
97+
hf_repo="HuggingFaceH4/MATH-500",
98+
hf_subset="default",
99+
hf_avail_splits=["test"],
100+
evaluation_splits=["test"],
101+
few_shots_split=None,
102+
few_shots_select=None,
103+
generation_size=32768,
104+
metrics=[
105+
Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}),
106+
],
107+
version=1,
108+
sample_fields=record_to_sample,
109+
solver=[prompt_template(MATH_QUERY_TEMPLATES["en"]), generate(cache=True)],
110+
scorer=model_graded_fact(model=_get_scorer_model()),
111+
)
112+
87113
mmath500_fi = LightevalTaskConfig(
88114
name="mmath500:fi",
89115
prompt_function=_mmath500_prompt_fn("fi"),
@@ -104,5 +130,6 @@ def record_to_sample(record):
104130
)
105131

106132
TASKS_TABLE = [
133+
mmath500_en,
107134
mmath500_fi,
108135
]

src/lighteval/tasks/tasks/math_500.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,17 @@
2222
true
2323
"""
2424

25+
import os
26+
import warnings
27+
28+
warnings.warn(
29+
"math_500 is deprecated, use mmath500:en instead (supports configurable scorer model)",
30+
DeprecationWarning,
31+
stacklevel=2,
32+
)
33+
2534
from inspect_ai.dataset import Sample
35+
from inspect_ai.model import GenerateConfig, get_model
2636
from inspect_ai.scorer import model_graded_fact
2737
from inspect_ai.solver import generate, prompt_template
2838

@@ -31,6 +41,21 @@
3141
from lighteval.tasks.requests import Doc
3242

3343

44+
def _get_scorer_model():
    """Build the grader model for the scorer from environment variables.

    Environment variables read:
        SCORER_MODEL_BASE_URL: Base URL of the endpoint serving the scorer
            model. When unset or empty, no custom scorer is configured.
        SCORER_MODEL_PATH: Model name appended to the ``openai-api/scorer/``
            prefix. Falls back to ``Qwen/Qwen3.5-9B`` when unset or empty.
        VLLM_API_KEY: API key passed to the endpoint. Falls back to
            ``inspectai`` when unset or empty.

    Returns:
        The result of ``get_model(...)`` when a base URL is configured,
        otherwise ``None``.
        NOTE(review): presumably ``None`` makes ``model_graded_fact`` fall
        back to its own default grader -- confirm against inspect_ai docs.
    """
    base_url = os.environ.get("SCORER_MODEL_BASE_URL")
    if not base_url:
        return None

    # Use `or` instead of a `.get()` default so that a variable exported as an
    # empty string (common in launcher scripts) is treated as unset, matching
    # how SCORER_MODEL_BASE_URL is handled above. Otherwise an empty
    # SCORER_MODEL_PATH would yield the malformed id "openai-api/scorer/".
    model_name = os.environ.get("SCORER_MODEL_PATH") or "Qwen/Qwen3.5-9B"
    api_key = os.environ.get("VLLM_API_KEY") or "inspectai"

    return get_model(
        f"openai-api/scorer/{model_name}",
        config=GenerateConfig(
            # Extra request-body field; presumably consumed by the serving
            # side's chat template to turn off "thinking" output -- confirm.
            extra_body={"chat_template_kwargs": {"enable_thinking": False}},
        ),
        base_url=base_url,
        api_key=api_key,
    )
57+
58+
3459
MATH_QUERY_TEMPLATE = """
3560
Solve the following problem. The final line of your response MUST be of the following format:
3661
"ANSWER: $ANSWER" (without quotes) where $ANSWER is the final answer. Think step by step before answering.
@@ -71,7 +96,7 @@ def record_to_sample(record):
7196
version=2,
7297
sample_fields=record_to_sample,
7398
solver=[prompt_template(MATH_QUERY_TEMPLATE), generate(cache=True)],
74-
scorer=model_graded_fact(),
99+
scorer=model_graded_fact(model=_get_scorer_model()),
75100
)
76101

77102
TASKS_TABLE = [

0 commit comments

Comments
 (0)