|
| 1 | +""" |
| 2 | +name: |
| 3 | +GPQA-FI |
| 4 | +
|
| 5 | +dataset: |
| 6 | +LumiOpen/GPQA-FI |
| 7 | +
|
| 8 | +abstract: |
| 9 | +Finnish translation of the GPQA Diamond benchmark (Graduate-Level Google-Proof |
| 10 | +Q&A). Contains 198 expert-written multiple-choice questions in biology, |
| 11 | +physics, and chemistry, professionally post-edited from machine translations |
| 12 | +by native Finnish speakers. |
| 13 | +
|
| 14 | +languages: |
| 15 | +finnish |
| 16 | +
|
| 17 | +tags: |
| 18 | +knowledge, multiple-choice, qa, science, multilingual |
| 19 | +
|
| 20 | +paper: |
| 21 | +https://arxiv.org/abs/2311.12022 |
| 22 | +""" |
| 23 | + |
| 24 | +import random |
| 25 | +from string import ascii_uppercase |
| 26 | + |
| 27 | +from inspect_ai.dataset import Sample |
| 28 | +from inspect_ai.scorer import choice |
| 29 | +from inspect_ai.solver import multiple_choice |
| 30 | + |
| 31 | +from lighteval.metrics.metrics import Metrics |
| 32 | +from lighteval.tasks.lighteval_task import LightevalTaskConfig |
| 33 | +from lighteval.tasks.requests import Doc |
| 34 | + |
| 35 | + |
def record_to_sample(record):
    """Convert one GPQA-FI dataset record into an inspect-ai ``Sample``.

    The correct answer is inserted at a pseudo-random position among the
    three distractors. The RNG is seeded with the question text so the
    gold position varies between records but is stable across runs,
    keeping evaluations reproducible (a bare ``random.randint`` would
    shuffle the answer key every run).

    Args:
        record: Dataset row with ``Question``, ``Correct Answer`` and the
            three ``Incorrect Answer N`` fields.

    Returns:
        Sample whose ``target`` is the letter (A-D) of the correct choice.
    """
    # Per-record deterministic RNG: same record -> same gold slot, always.
    rng = random.Random(record["Question"])
    gold_index = rng.randint(0, 3)
    choices = [
        record["Incorrect Answer 1"],
        record["Incorrect Answer 2"],
        record["Incorrect Answer 3"],
    ]
    choices.insert(gold_index, record["Correct Answer"])
    return Sample(
        input=record["Question"].strip(),
        choices=choices,
        target=ascii_uppercase[gold_index],
    )
| 45 | + |
| 46 | + |
def gpqa_fi_prompt(line, task_name: str = None):
    """Build a log-likelihood style lighteval ``Doc`` for one GPQA-FI record.

    Args:
        line: Dataset row with ``Question``, ``Correct Answer`` and the
            three ``Incorrect Answer N`` fields.
        task_name: Name of the lighteval task this doc belongs to.

    Returns:
        Doc whose ``choices`` are the letters A-D (as a list of one-char
        strings) and whose ``gold_index`` marks the correct letter.
    """
    # Per-record deterministic RNG: the gold position is varied between
    # records but reproducible across runs (plain random.randint is not).
    rng = random.Random(line["Question"])
    gold_index = rng.randint(0, 3)
    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    choices.insert(gold_index, line["Correct Answer"])

    instruction = "Vastaa seuraavaan monivalintakysymykseen. Vastauksesi viimeisen rivin tulee olla muotoa: 'Vastaus: $KIRJAIN' (ilman lainausmerkkejä), jossa KIRJAIN on A, B, C tai D. Ajattele vaihe vaiheelta ennen vastaamista.\n\n"

    query = f"Kysymys: {line['Question']}\n"
    query += "".join(f"{letter}. {option}\n" for letter, option in zip(ascii_uppercase, choices))
    query += "Vastaus: "
    return Doc(
        task_name=task_name,
        query=f"{instruction}{query}",
        # list(...) so Doc.choices is a list[str] like the instruct prompt
        # below, not a bare "ABCD" string slice.
        choices=list(ascii_uppercase[: len(choices)]),
        gold_index=gold_index,
        instruction=instruction,
    )
| 64 | + |
| 65 | + |
def gpqa_fi_instruct_prompt(line, task_name: str = None):
    """Build a generative/instruct lighteval ``Doc`` for one GPQA-FI record.

    Mirrors the English GPQA diamond instruct pattern: the English
    instruction asks for a final 'Answer: $LETTER' line while the question
    and options themselves are Finnish.

    Args:
        line: Dataset row with ``Question``, ``Correct Answer`` and the
            three ``Incorrect Answer N`` fields.
        task_name: Name of the lighteval task this doc belongs to.

    Returns:
        Doc whose ``choices`` are the letters A-D and whose ``gold_index``
        marks the correct letter.
    """
    # Per-record deterministic RNG: same record -> same gold slot across
    # runs, so results are reproducible (plain random.randint is not).
    rng = random.Random(line["Question"])
    gold_index = rng.randint(0, 3)
    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    choices.insert(gold_index, line["Correct Answer"])
    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
    query_template = "{Instruction}\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
    query = query_template.format(
        A=choices[0].strip(),
        B=choices[1].strip(),
        C=choices[2].strip(),
        D=choices[3].strip(),
        Question=line["Question"].strip(),
        Instruction=instruction,
    )

    return Doc(
        task_name=task_name,
        query=query,
        choices=list(ascii_uppercase)[: len(choices)],
        gold_index=gold_index,
        instruction=instruction,
    )
| 88 | + |
| 89 | + |
# Log-likelihood based evaluation (matches French GPQA pattern):
# the model scores single-letter continuations A-D, so generation is one
# token and accuracy comes from loglikelihood comparison.
gpqa_fi = LightevalTaskConfig(
    name="gpqa-fi",
    prompt_function=gpqa_fi_prompt,
    sample_fields=record_to_sample,  # inspect-ai Sample conversion for this task
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    hf_repo="LumiOpen/GPQA-FI",
    hf_subset="default",
    # The dataset ships only a "train" split; it doubles as the eval split.
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    generation_size=1,  # one token is enough: the answer is a single letter
    metrics=[Metrics.loglikelihood_acc],
    stop_sequence=["\n"],
    version=0,
)
| 108 | + |
# Instruct / generative evaluation (matches English GPQA diamond pattern):
# the model reasons step by step and must end with an 'Answer: $LETTER' line,
# which the gpqa_instruct metric extracts and scores.
gpqa_fi_diamond = LightevalTaskConfig(
    name="gpqa-fi:diamond",
    prompt_function=gpqa_fi_instruct_prompt,
    sample_fields=record_to_sample,  # inspect-ai Sample conversion for this task
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    hf_repo="LumiOpen/GPQA-FI",
    hf_subset="default",
    # The dataset ships only a "train" split; it doubles as the eval split.
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,  # long budget so chain-of-thought is not truncated
    metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})],
    stop_sequence=[],  # no stop sequences: let the model finish its reasoning
    version=1,
)
| 127 | + |
# Export list picked up by lighteval's custom-task loader.
TASKS_TABLE = [gpqa_fi, gpqa_fi_diamond]
0 commit comments