Skip to content

Commit 5531207

Browse files
author
Daniel Zautner
committed
Add Finnish GPQA Diamond task with Finnish prompts and seeded answer ordering
1 parent 06aee5b commit 5531207

1 file changed

Lines changed: 130 additions & 0 deletions

File tree

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
"""
2+
name:
3+
GPQA-FI
4+
5+
dataset:
6+
LumiOpen/GPQA-FI
7+
8+
abstract:
9+
Finnish translation of the GPQA Diamond benchmark (Graduate-Level Google-Proof
10+
Q&A). Contains 198 expert-written multiple-choice questions in biology,
11+
physics, and chemistry, professionally post-edited from machine translations
12+
by native Finnish speakers.
13+
14+
languages:
15+
finnish
16+
17+
tags:
18+
knowledge, multiple-choice, qa, science, multilingual
19+
20+
paper:
21+
https://arxiv.org/abs/2311.12022
22+
"""
23+
24+
import random
25+
from string import ascii_uppercase
26+
27+
from inspect_ai.dataset import Sample
28+
from inspect_ai.scorer import choice
29+
from inspect_ai.solver import multiple_choice
30+
31+
from lighteval.metrics.metrics import Metrics
32+
from lighteval.tasks.lighteval_task import LightevalTaskConfig
33+
from lighteval.tasks.requests import Doc
34+
35+
random.seed(42)
36+
37+
38+
def record_to_sample(record):
    """Convert one GPQA-FI dataset record into an inspect_ai ``Sample``.

    The three incorrect answers are listed and the correct answer is
    inserted at a position derived deterministically from the question
    text itself.  The previous implementation drew the position from the
    module-global ``random`` stream, which is shared with the lighteval
    prompt functions below — so the gold letter depended on how many
    records had been processed before this one and on which code path
    ran first.  Seeding per record makes the ordering stable and makes
    both code paths agree on the gold index for the same question.
    """
    # random.Random(str) seeds from the string's bytes — stable across
    # runs and interpreters, unaffected by PYTHONHASHSEED.
    rng = random.Random(record["Question"])
    gold_index = rng.randint(0, 3)
    choices = [record["Incorrect Answer 1"], record["Incorrect Answer 2"], record["Incorrect Answer 3"]]
    choices.insert(gold_index, record["Correct Answer"])
    return Sample(
        input=record["Question"].strip(),
        choices=choices,
        target=ascii_uppercase[gold_index],
    )
47+
48+
49+
def gpqa_fi_prompt(line, task_name: str = None):
    """Build a loglikelihood-style ``Doc`` for one GPQA-FI record.

    The gold answer position is derived deterministically from the
    question text rather than from the shared module-level random
    stream, whose output depended on how many prompts had been built
    before this one (and interleaved with ``record_to_sample`` draws).

    Args:
        line: Dataset row with ``Question``, ``Correct Answer`` and
            ``Incorrect Answer 1..3`` fields.
        task_name: Task identifier forwarded to the ``Doc``.

    Returns:
        A ``Doc`` whose choices are the letters A–D and whose
        ``gold_index`` marks the position of the correct answer.
    """
    # Per-record RNG: same question always yields the same gold slot.
    rng = random.Random(line["Question"])
    gold_index = rng.randint(0, 3)
    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    choices.insert(gold_index, line["Correct Answer"])

    # Finnish CoT instruction: the answer's last line must read
    # 'Vastaus: $KIRJAIN' with KIRJAIN in {A, B, C, D}.
    instruction = "Vastaa seuraavaan monivalintakysymykseen. Vastauksesi viimeisen rivin tulee olla muotoa: 'Vastaus: $KIRJAIN' (ilman lainausmerkkejä), jossa KIRJAIN on A, B, C tai D. Ajattele vaihe vaiheelta ennen vastaamista.\n\n"

    query = f"Kysymys: {line['Question']}\n"
    query += "".join(f"{key}. {choice}\n" for key, choice in zip(ascii_uppercase, choices))
    query += "Vastaus: "
    return Doc(
        task_name=task_name,
        query=f"{instruction}{query}",
        choices=ascii_uppercase[: len(choices)],
        gold_index=gold_index,
        instruction=instruction,
    )
66+
67+
68+
def gpqa_fi_instruct_prompt(line, task_name: str = None):
    """Build a generative (instruct) ``Doc`` for one GPQA-FI record.

    Uses the same A)–D) template as the English GPQA diamond task, with
    the Finnish chain-of-thought instruction prepended.  As in
    :func:`gpqa_fi_prompt`, the gold answer position is seeded from the
    question text so it is reproducible and independent of evaluation
    order (the old module-global ``random.randint`` draw was not).

    Args:
        line: Dataset row with ``Question``, ``Correct Answer`` and
            ``Incorrect Answer 1..3`` fields.
        task_name: Task identifier forwarded to the ``Doc``.

    Returns:
        A ``Doc`` with the fully formatted query, letter choices, and
        the gold index of the correct answer.
    """
    rng = random.Random(line["Question"])
    gold_index = rng.randint(0, 3)
    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    choices.insert(gold_index, line["Correct Answer"])
    instruction = "Vastaa seuraavaan monivalintakysymykseen. Vastauksesi viimeisen rivin tulee olla muotoa: 'Vastaus: $KIRJAIN' (ilman lainausmerkkejä), jossa KIRJAIN on A, B, C tai D. Ajattele vaihe vaiheelta ennen vastaamista."
    query_template = "{Instruction}\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
    query = query_template.format(
        A=choices[0].strip(),
        B=choices[1].strip(),
        C=choices[2].strip(),
        D=choices[3].strip(),
        Question=line["Question"].strip(),
        Instruction=instruction,
    )

    return Doc(
        task_name=task_name,
        query=query,
        choices=list(ascii_uppercase)[: len(choices)],
        gold_index=gold_index,
        instruction=instruction,
    )
90+
91+
92+
# Log-likelihood based evaluation (matches French GPQA pattern)
gpqa_fi = LightevalTaskConfig(
    name="gpqa-fi",
    prompt_function=gpqa_fi_prompt,
    # inspect_ai-style fields: the same dataset exposed as Samples with
    # a multiple_choice solver scored by letter choice.
    sample_fields=record_to_sample,
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    hf_repo="LumiOpen/GPQA-FI",
    hf_subset="default",
    # The dataset ships a single "train" split, used directly for eval.
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    # Loglikelihood scoring only needs the answer letter — one token.
    generation_size=1,
    metrics=[Metrics.loglikelihood_acc],
    stop_sequence=["\n"],
    version=0,
)
110+
111+
# Instruct / generative evaluation (matches English GPQA diamond pattern)
gpqa_fi_diamond = LightevalTaskConfig(
    name="gpqa-fi:diamond",
    prompt_function=gpqa_fi_instruct_prompt,
    sample_fields=record_to_sample,
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    hf_repo="LumiOpen/GPQA-FI",
    hf_subset="default",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    # No few-shot examples for the instruct variant.
    few_shots_split=None,
    few_shots_select=None,
    # Large budget: the prompt asks for step-by-step reasoning before
    # the final answer line, so generations can be long.
    generation_size=32768,
    metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})],
    # No stop sequence — generation runs to EOS within the budget.
    stop_sequence=[],
    version=1,
)
129+
130+
# Tasks exported by this module (presumably discovered by lighteval's
# custom-task loader — confirm against the registry convention).
TASKS_TABLE = [gpqa_fi, gpqa_fi_diamond]

0 commit comments

Comments
 (0)