From 802b9c5dcf4ad2cd00c3612c0eb395c7aa718179 Mon Sep 17 00:00:00 2001 From: Daniel Zautner Date: Mon, 23 Mar 2026 11:57:32 +0200 Subject: [PATCH 1/4] Add multilingual MATH-500 Finnish task (mmath500:fi) Uses LumiOpen/MATH-500_mt dataset with Qwen3.5-9B (reasoning disabled) as scorer. --- .../tasks/multilingual/tasks/mmath500.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 src/lighteval/tasks/multilingual/tasks/mmath500.py diff --git a/src/lighteval/tasks/multilingual/tasks/mmath500.py b/src/lighteval/tasks/multilingual/tasks/mmath500.py new file mode 100644 index 000000000..fcf23d483 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/mmath500.py @@ -0,0 +1,83 @@ +""" +name: +mMATH-500 + +dataset: +LumiOpen/MATH-500_mt + +abstract: +Multilingual translations of the MATH-500 benchmark, a subset of 500 problems +from the MATH benchmark that OpenAI created in their Let's Verify Step by Step +paper. Currently contains Finnish translations produced with Claude Opus 4.5. + +languages: +finnish + +tags: +math, reasoning, multilingual + +paper: +https://arxiv.org/abs/2305.20050 +""" + +from inspect_ai.dataset import Sample +from inspect_ai.model import GenerateConfig, get_model +from inspect_ai.scorer import model_graded_fact +from inspect_ai.solver import generate, prompt_template + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + + +MATH_QUERY_TEMPLATE = """ +Solve the following problem. The final line of your response MUST be of the following format: +"ANSWER: $ANSWER" (without quotes) where $ANSWER is the final answer. Think step by step before answering. + +{prompt} +""".strip() + +SCORER_MODEL = get_model( + "vllm/Qwen/Qwen3.5-9B", + config=GenerateConfig(reasoning_tokens=0), +) + + +def mmath500_prompt(line, task_name: str = None): + query = MATH_QUERY_TEMPLATE.format(prompt=line["problem"]) + return Doc( + task_name=task_name, + query=query, + choices=[f"ANSWER: {line['answer']}"], + gold_index=0, + ) + + +def record_to_sample(record): + query = record["problem"] + target = record["answer"] + return Sample(input=query, target=target) + + +mmath500_fi = LightevalTaskConfig( + name="mmath500:fi", + prompt_function=mmath500_prompt, + hf_repo="LumiOpen/MATH-500_mt", + hf_subset="default", + hf_avail_splits=["fi"], + evaluation_splits=["fi"], + few_shots_split=None, + few_shots_select=None, + generation_size=32768, + metrics=[ + Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}), + ], + version=1, + sample_fields=record_to_sample, + solver=[prompt_template(MATH_QUERY_TEMPLATE), generate(cache=True)], + scorer=model_graded_fact(model=SCORER_MODEL), +) + +TASKS_TABLE = [ + mmath500_fi, +] From 3b80bb10a06e0d435378655d3bf62e1c89554a30 Mon Sep 17 00:00:00 2001 From: Daniel Zautner Date: Mon, 23 Mar 2026 12:04:55 +0200 Subject: [PATCH 2/4] Fix scorer model to use env vars instead of hardcoded vLLM init Read SCORER_MODEL_BASE_URL/SCORER_MODEL_PATH from env to connect to an existing scorer server started by the eval harness. Falls back to using the eval model (like original math_500) when no scorer server is set up. --- .../tasks/multilingual/tasks/mmath500.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/lighteval/tasks/multilingual/tasks/mmath500.py b/src/lighteval/tasks/multilingual/tasks/mmath500.py index fcf23d483..6fc2470ec 100644 --- a/src/lighteval/tasks/multilingual/tasks/mmath500.py +++ b/src/lighteval/tasks/multilingual/tasks/mmath500.py @@ -20,8 +20,10 @@ https://arxiv.org/abs/2305.20050 """ +import os + from inspect_ai.dataset import Sample -from inspect_ai.model import GenerateConfig, get_model +from inspect_ai.model import get_model from inspect_ai.scorer import model_graded_fact from inspect_ai.solver import generate, prompt_template @@ -37,10 +39,23 @@ {prompt} """.strip() -SCORER_MODEL = get_model( - "vllm/Qwen/Qwen3.5-9B", - config=GenerateConfig(reasoning_tokens=0), -) + +def _get_scorer_model(): + """Resolve the scorer model from environment variables if available. + + When SCORER_MODEL_BASE_URL is set (e.g. by the eval harness which starts + a dedicated scorer vLLM server), connect to it via the OpenAI-compatible + API. Otherwise return None so model_graded_fact uses the eval model. + """ + base_url = os.environ.get("SCORER_MODEL_BASE_URL") + if base_url: + model_name = os.environ.get("SCORER_MODEL_PATH", "Qwen/Qwen3.5-9B") + return get_model( + f"openai-api/scorer/{model_name}", + base_url=base_url, + api_key=os.environ.get("VLLM_API_KEY", "inspectai"), + ) + return None def mmath500_prompt(line, task_name: str = None): @@ -75,7 +90,7 @@ def record_to_sample(record): version=1, sample_fields=record_to_sample, solver=[prompt_template(MATH_QUERY_TEMPLATE), generate(cache=True)], - scorer=model_graded_fact(model=SCORER_MODEL), + scorer=model_graded_fact(model=_get_scorer_model()), ) TASKS_TABLE = [ From 5385ba15006c455f1c57aafc22ef33a0e54e4a0a Mon Sep 17 00:00:00 2001 From: Daniel Zautner Date: Wed, 25 Mar 2026 10:40:06 +0200 Subject: [PATCH 3/4] Translate prompt templates to target languages for maime and mmath500 Finnish and Danish prompts reviewed by native speakers (Kai, Maria). --- .../tasks/multilingual/tasks/mmath500.py | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/lighteval/tasks/multilingual/tasks/mmath500.py b/src/lighteval/tasks/multilingual/tasks/mmath500.py index 6fc2470ec..2c0b4fc4d 100644 --- a/src/lighteval/tasks/multilingual/tasks/mmath500.py +++ b/src/lighteval/tasks/multilingual/tasks/mmath500.py @@ -32,12 +32,14 @@ from lighteval.tasks.requests import Doc -MATH_QUERY_TEMPLATE = """ -Solve the following problem. The final line of your response MUST be of the following format: -"ANSWER: $ANSWER" (without quotes) where $ANSWER is the final answer. Think step by step before answering. +MATH_QUERY_TEMPLATES = { + "fi": """ +Ratkaise seuraava tehtävä. Vastauksesi viimeisen rivin TÄYTYY olla seuraavassa muodossa: +"ANSWER: $ANSWER" (ilman lainausmerkkejä), jossa $ANSWER on lopullinen vastaus. Ajattele vaiheittain ennen vastaamista. {prompt} -""".strip() +""".strip(), +} def _get_scorer_model(): @@ -58,14 +60,19 @@ def _get_scorer_model(): return None -def mmath500_prompt(line, task_name: str = None): - query = MATH_QUERY_TEMPLATE.format(prompt=line["problem"]) - return Doc( - task_name=task_name, - query=query, - choices=[f"ANSWER: {line['answer']}"], - gold_index=0, - ) +def _mmath500_prompt_fn(lang: str): + template = MATH_QUERY_TEMPLATES[lang] + + def mmath500_prompt(line, task_name: str = None): + query = template.format(prompt=line["problem"]) + return Doc( + task_name=task_name, + query=query, + choices=[f"ANSWER: {line['answer']}"], + gold_index=0, + ) + + return mmath500_prompt def record_to_sample(record): @@ -76,7 +83,7 @@ def record_to_sample(record): mmath500_fi = LightevalTaskConfig( name="mmath500:fi", - prompt_function=mmath500_prompt, + prompt_function=_mmath500_prompt_fn("fi"), hf_repo="LumiOpen/MATH-500_mt", hf_subset="default", hf_avail_splits=["fi"], @@ -89,7 +96,7 @@ def record_to_sample(record): ], version=1, sample_fields=record_to_sample, - solver=[prompt_template(MATH_QUERY_TEMPLATE), generate(cache=True)], + solver=[prompt_template(MATH_QUERY_TEMPLATES["fi"]), generate(cache=True)], scorer=model_graded_fact(model=_get_scorer_model()), ) From 63e69a66eef5210f6f05cbd9db6663f09d7725e5 Mon Sep 17 00:00:00 2001 From: Daniel Zautner Date: Thu, 26 Mar 2026 11:44:18 +0200 Subject: [PATCH 4/4] Disable reasoning on scorer model by default Pass enable_thinking=False via extra_body to the scorer model so it doesn't waste tokens on chain-of-thought when grading answers. --- src/lighteval/tasks/multilingual/tasks/mmath500.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lighteval/tasks/multilingual/tasks/mmath500.py b/src/lighteval/tasks/multilingual/tasks/mmath500.py index 2c0b4fc4d..aae9d610a 100644 --- a/src/lighteval/tasks/multilingual/tasks/mmath500.py +++ b/src/lighteval/tasks/multilingual/tasks/mmath500.py @@ -23,7 +23,7 @@ import os from inspect_ai.dataset import Sample -from inspect_ai.model import get_model +from inspect_ai.model import GenerateConfig, get_model from inspect_ai.scorer import model_graded_fact from inspect_ai.solver import generate, prompt_template @@ -54,6 +54,9 @@ def _get_scorer_model(): model_name = os.environ.get("SCORER_MODEL_PATH", "Qwen/Qwen3.5-9B") return get_model( f"openai-api/scorer/{model_name}", + config=GenerateConfig( + extra_body={"chat_template_kwargs": {"enable_thinking": False}}, + ), base_url=base_url, api_key=os.environ.get("VLLM_API_KEY", "inspectai"), )