diff --git a/src/lighteval/main_inspect.py b/src/lighteval/main_inspect.py
index 4aa5cbf87..1b8fb8db4 100644
--- a/src/lighteval/main_inspect.py
+++ b/src/lighteval/main_inspect.py
@@ -37,6 +37,7 @@
 from typer import Argument, Option
 from typing_extensions import Annotated
 
+from lighteval.cli_args import load_tasks_multilingual as load_tasks_multilingual_arg
 from lighteval.models.abstract_model import InspectAIModelConfig
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 
@@ -432,10 +433,11 @@ def eval(  # noqa C901
             rich_help_panel=HELP_PANEL_NAME_4,
         ),
     ] = False,
+    load_tasks_multilingual: load_tasks_multilingual_arg.type = load_tasks_multilingual_arg.default,
 ):
     from lighteval.tasks.registry import Registry
 
-    registry = Registry(tasks=tasks, custom_tasks=None, load_multilingual=False)
+    registry = Registry(tasks=tasks, custom_tasks=custom_tasks, load_multilingual=load_tasks_multilingual)
     task_configs = registry.task_to_configs
 
     inspect_ai_tasks = []
diff --git a/src/lighteval/tasks/multilingual/tasks/mmath500.py b/src/lighteval/tasks/multilingual/tasks/mmath500.py
index aae9d610a..ddd9a463f 100644
--- a/src/lighteval/tasks/multilingual/tasks/mmath500.py
+++ b/src/lighteval/tasks/multilingual/tasks/mmath500.py
@@ -6,12 +6,13 @@
 LumiOpen/MATH-500_mt
 
 abstract:
-Multilingual translations of the MATH-500 benchmark, a subset of 500 problems
-from the MATH benchmark that OpenAI created in their Let's Verify Step by Step
-paper. Currently contains Finnish translations produced with Claude Opus 4.5.
+Multilingual MATH-500 benchmark, a subset of 500 problems from the MATH
+benchmark that OpenAI created in their Let's Verify Step by Step paper.
+Contains the original English problems and Finnish translations produced
+with Claude Opus 4.5. Supports configurable scorer model via env vars.
 
 languages:
-finnish
+english, finnish
 
 tags:
 math, reasoning, multilingual
@@ -33,6 +34,12 @@
 
 
 MATH_QUERY_TEMPLATES = {
+    "en": """
+Solve the following problem. The final line of your response MUST be of the following format:
+"ANSWER: $ANSWER" (without quotes) where $ANSWER is the final answer. Think step by step before answering.
+
+{prompt}
+""".strip(),
     "fi": """
 Ratkaise seuraava tehtävä. Vastauksesi viimeisen rivin TÄYTYY olla seuraavassa muodossa:
 "ANSWER: $ANSWER" (ilman lainausmerkkejä), jossa $ANSWER on lopullinen vastaus. Ajattele vaiheittain ennen vastaamista.
@@ -84,6 +91,25 @@ def record_to_sample(record):
     return Sample(input=query, target=target)
 
 
+mmath500_en = LightevalTaskConfig(
+    name="mmath500:en",
+    prompt_function=_mmath500_prompt_fn("en"),
+    hf_repo="HuggingFaceH4/MATH-500",
+    hf_subset="default",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32768,
+    metrics=[
+        Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}),
+    ],
+    version=1,
+    sample_fields=record_to_sample,
+    solver=[prompt_template(MATH_QUERY_TEMPLATES["en"]), generate(cache=True)],
+    scorer=model_graded_fact(model=_get_scorer_model()),
+)
+
 mmath500_fi = LightevalTaskConfig(
     name="mmath500:fi",
     prompt_function=_mmath500_prompt_fn("fi"),
@@ -104,5 +130,6 @@ def record_to_sample(record):
 )
 
 TASKS_TABLE = [
+    mmath500_en,
     mmath500_fi,
 ]
diff --git a/src/lighteval/tasks/tasks/math_500.py b/src/lighteval/tasks/tasks/math_500.py
index 965bcd645..b819dbd43 100644
--- a/src/lighteval/tasks/tasks/math_500.py
+++ b/src/lighteval/tasks/tasks/math_500.py
@@ -22,7 +22,17 @@
 true
 """
 
+import os
+import warnings
+
+warnings.warn(
+    "math_500 is deprecated, use mmath500:en instead (supports configurable scorer model)",
+    DeprecationWarning,
+    stacklevel=2,
+)
+
 from inspect_ai.dataset import Sample
+from inspect_ai.model import GenerateConfig, get_model
 from inspect_ai.scorer import model_graded_fact
 from inspect_ai.solver import generate, prompt_template
 
@@ -31,6 +41,21 @@
 from lighteval.tasks.requests import Doc
 
 
+def _get_scorer_model():
+    base_url = os.environ.get("SCORER_MODEL_BASE_URL")
+    if base_url:
+        model_name = os.environ.get("SCORER_MODEL_PATH", "Qwen/Qwen3.5-9B")
+        return get_model(
+            f"openai-api/scorer/{model_name}",
+            config=GenerateConfig(
+                extra_body={"chat_template_kwargs": {"enable_thinking": False}},
+            ),
+            base_url=base_url,
+            api_key=os.environ.get("VLLM_API_KEY", "inspectai"),
+        )
+    return None
+
+
 MATH_QUERY_TEMPLATE = """
 Solve the following problem. The final line of your response MUST be of the following format:
 "ANSWER: $ANSWER" (without quotes) where $ANSWER is the final answer. Think step by step before answering.
@@ -71,7 +96,7 @@
     version=2,
     sample_fields=record_to_sample,
     solver=[prompt_template(MATH_QUERY_TEMPLATE), generate(cache=True)],
-    scorer=model_graded_fact(),
+    scorer=model_graded_fact(model=_get_scorer_model()),
 )
 
 TASKS_TABLE = [