Skip to content

Commit d3bf098

Browse files
Daniel Zautnerclaude
andcommitted
Add Finnish GPQA Diamond task definition
Task: gpqa-fi (log-likelihood) and gpqa-fi:diamond (generative/instruct) Dataset: LumiOpen/GPQA-FI (198 post-edited Finnish translations of GPQA Diamond) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 06aee5b commit d3bf098

1 file changed

Lines changed: 128 additions & 0 deletions

File tree

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
"""
2+
name:
3+
GPQA-FI
4+
5+
dataset:
6+
LumiOpen/GPQA-FI
7+
8+
abstract:
9+
Finnish translation of the GPQA Diamond benchmark (Graduate-Level Google-Proof
10+
Q&A). Contains 198 expert-written multiple-choice questions in biology,
11+
physics, and chemistry, professionally post-edited from machine translations
12+
by native Finnish speakers.
13+
14+
languages:
15+
finnish
16+
17+
tags:
18+
knowledge, multiple-choice, qa, science, multilingual
19+
20+
paper:
21+
https://arxiv.org/abs/2311.12022
22+
"""
23+
24+
import random
25+
from string import ascii_uppercase
26+
27+
from inspect_ai.dataset import Sample
28+
from inspect_ai.scorer import choice
29+
from inspect_ai.solver import multiple_choice
30+
31+
from lighteval.metrics.metrics import Metrics
32+
from lighteval.tasks.lighteval_task import LightevalTaskConfig
33+
from lighteval.tasks.requests import Doc
34+
35+
36+
def record_to_sample(record):
    """Convert one GPQA-FI dataset record into an inspect_ai ``Sample``.

    The correct answer is inserted among the three incorrect answers at a
    position derived *deterministically* from the question text, so repeated
    runs place the gold answer at the same index. (The original used an
    unseeded ``random.randint``, which made the gold position — and therefore
    any cached/compared scores — irreproducible between runs.)
    """
    # Seed on the question string so the shuffle is stable per item and
    # agrees with gpqa_fi_prompt for the same record.
    gold_index = random.Random(record["Question"]).randint(0, 3)
    choices = [
        record["Incorrect Answer 1"],
        record["Incorrect Answer 2"],
        record["Incorrect Answer 3"],
    ]
    choices.insert(gold_index, record["Correct Answer"])
    return Sample(
        input=record["Question"].strip(),
        choices=choices,
        target=ascii_uppercase[gold_index],  # letter label, "A".."D"
    )
45+
46+
47+
def gpqa_fi_prompt(line, task_name: str = None):
    """Build a lighteval ``Doc`` for log-likelihood evaluation (Finnish prompt).

    Shuffles the gold answer into the three distractors at a position derived
    deterministically from the question text (the original unseeded
    ``random.randint`` made the gold index irreproducible across runs), then
    renders a Finnish multiple-choice prompt ending in "Vastaus: ".
    """
    # Deterministic gold placement — seeded on the question so runs are
    # reproducible and consistent with record_to_sample.
    gold_index = random.Random(line["Question"]).randint(0, 3)
    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    choices.insert(gold_index, line["Correct Answer"])

    instruction = "Vastaa seuraavaan monivalintakysymykseen. Vastauksesi viimeisen rivin tulee olla muotoa: 'Vastaus: $KIRJAIN' (ilman lainausmerkkejä), jossa KIRJAIN on A, B, C tai D. Ajattele vaihe vaiheelta ennen vastaamista.\n\n"

    query = f"Kysymys: {line['Question']}\n"
    query += "".join([f"{key}. {choice}\n" for key, choice in zip(ascii_uppercase, choices)])
    query += "Vastaus: "
    return Doc(
        task_name=task_name,
        query=f"{instruction}{query}",
        # Pass an explicit list of letter strings, consistent with
        # gpqa_fi_instruct_prompt (the original passed a bare string slice).
        choices=list(ascii_uppercase[: len(choices)]),
        gold_index=gold_index,
        instruction=instruction,
    )
64+
65+
66+
def gpqa_fi_instruct_prompt(line, task_name: str = None):
    """Build a lighteval ``Doc`` for generative/instruct evaluation.

    Uses the English "Answer: $LETTER" instruction (matching the English GPQA
    diamond pattern) so the ``gpqa_instruct`` metric's answer extraction works
    unchanged. The gold answer's position among the distractors is derived
    deterministically from the question text — the original unseeded
    ``random.randint`` made the gold index, and therefore scores,
    irreproducible between runs.
    """
    # Deterministic, per-item gold placement (same scheme as the other
    # prompt functions in this file).
    gold_index = random.Random(line["Question"]).randint(0, 3)
    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    choices.insert(gold_index, line["Correct Answer"])
    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
    query_template = "{Instruction}\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
    query = query_template.format(
        A=choices[0].strip(),
        B=choices[1].strip(),
        C=choices[2].strip(),
        D=choices[3].strip(),
        Question=line["Question"].strip(),
        Instruction=instruction,
    )

    return Doc(
        task_name=task_name,
        query=query,
        choices=list(ascii_uppercase)[: len(choices)],  # ["A", "B", "C", "D"]
        gold_index=gold_index,
        instruction=instruction,
    )
88+
89+
90+
# Log-likelihood based evaluation (matches French GPQA pattern).
# Scores by comparing the log-likelihood of each answer letter; only one
# token of generation is needed.
gpqa_fi = LightevalTaskConfig(
    name="gpqa-fi",
    prompt_function=gpqa_fi_prompt,
    sample_fields=record_to_sample,  # inspect_ai-style record -> Sample converter
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    hf_repo="LumiOpen/GPQA-FI",
    hf_subset="default",
    hf_avail_splits=["train"],  # the dataset ships a single "train" split
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    generation_size=1,  # only the answer letter is scored
    metrics=[Metrics.loglikelihood_acc],
    stop_sequence=["\n"],
    version=0,
)
108+
109+
# Instruct / generative evaluation (matches English GPQA diamond pattern).
# The model reasons step by step and must end with "Answer: $LETTER";
# scored with the gpqa_instruct pass@k metric at k=1.
gpqa_fi_diamond = LightevalTaskConfig(
    name="gpqa-fi:diamond",
    prompt_function=gpqa_fi_instruct_prompt,
    sample_fields=record_to_sample,  # inspect_ai-style record -> Sample converter
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    hf_repo="LumiOpen/GPQA-FI",
    hf_subset="default",
    hf_avail_splits=["train"],  # the dataset ships a single "train" split
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,  # generous budget for chain-of-thought reasoning
    metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})],
    stop_sequence=[],  # no stop sequence — let the model finish its reasoning
    version=1,
)
127+
128+
TASKS_TABLE = [gpqa_fi, gpqa_fi_diamond]

0 commit comments

Comments
 (0)