|
| 1 | +""" |
| 2 | +name: |
| 3 | +GPQA-FI |
| 4 | +
|
| 5 | +dataset: |
| 6 | +LumiOpen/GPQA-FI |
| 7 | +
|
| 8 | +abstract: |
| 9 | +Finnish translation of the GPQA Diamond benchmark (Graduate-Level Google-Proof |
| 10 | +Q&A). Contains 198 expert-written multiple-choice questions in biology, |
| 11 | +physics, and chemistry, professionally post-edited from machine translations |
| 12 | +by native Finnish speakers. |
| 13 | +
|
| 14 | +languages: |
| 15 | +finnish |
| 16 | +
|
| 17 | +tags: |
| 18 | +knowledge, multiple-choice, qa, science, multilingual |
| 19 | +
|
| 20 | +paper: |
| 21 | +https://arxiv.org/abs/2311.12022 |
| 22 | +""" |
| 23 | + |
| 24 | +import random |
| 25 | +from string import ascii_uppercase |
| 26 | + |
| 27 | +from inspect_ai.dataset import Sample |
| 28 | +from inspect_ai.scorer import choice |
| 29 | +from inspect_ai.solver import multiple_choice |
| 30 | + |
| 31 | +from lighteval.metrics.metrics import Metrics |
| 32 | +from lighteval.tasks.lighteval_task import LightevalTaskConfig |
| 33 | +from lighteval.tasks.requests import Doc |
| 34 | + |
| 35 | + |
def record_to_sample(record):
    """Convert one GPQA-FI dataset record into an inspect-ai ``Sample``.

    The correct answer is inserted at a pseudo-random position among the
    three distractors. The RNG is seeded with the question text so the
    gold position varies between records but is stable across runs,
    keeping evaluations reproducible (a bare ``random.randint`` would
    shuffle the answer key every run).

    Args:
        record: Dataset row with ``Question``, ``Correct Answer`` and the
            three ``Incorrect Answer N`` fields.

    Returns:
        Sample whose ``target`` is the letter (A-D) of the correct choice.
    """
    # Per-record deterministic RNG: same record -> same gold slot, always.
    rng = random.Random(record["Question"])
    gold_index = rng.randint(0, 3)
    choices = [
        record["Incorrect Answer 1"],
        record["Incorrect Answer 2"],
        record["Incorrect Answer 3"],
    ]
    choices.insert(gold_index, record["Correct Answer"])
    return Sample(
        input=record["Question"].strip(),
        choices=choices,
        target=ascii_uppercase[gold_index],
    )
| 45 | + |
| 46 | + |
def gpqa_fi_prompt(line, task_name: str = None):
    """Build a log-likelihood style lighteval ``Doc`` for one GPQA-FI record.

    Args:
        line: Dataset row with ``Question``, ``Correct Answer`` and the
            three ``Incorrect Answer N`` fields.
        task_name: Name of the lighteval task this doc belongs to.

    Returns:
        Doc whose ``choices`` are the letters A-D (as a list of one-char
        strings) and whose ``gold_index`` marks the correct letter.
    """
    # Per-record deterministic RNG: the gold position is varied between
    # records but reproducible across runs (plain random.randint is not).
    rng = random.Random(line["Question"])
    gold_index = rng.randint(0, 3)
    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    choices.insert(gold_index, line["Correct Answer"])

    instruction = "Vastaa seuraavaan monivalintakysymykseen. Vastauksesi viimeisen rivin tulee olla muotoa: 'Vastaus: $KIRJAIN' (ilman lainausmerkkejä), jossa KIRJAIN on A, B, C tai D. Ajattele vaihe vaiheelta ennen vastaamista.\n\n"

    query = f"Kysymys: {line['Question']}\n"
    query += "".join(f"{letter}. {option}\n" for letter, option in zip(ascii_uppercase, choices))
    query += "Vastaus: "
    return Doc(
        task_name=task_name,
        query=f"{instruction}{query}",
        # list(...) so Doc.choices is a list[str] like the instruct prompt
        # below, not a bare "ABCD" string slice.
        choices=list(ascii_uppercase[: len(choices)]),
        gold_index=gold_index,
        instruction=instruction,
    )
| 64 | + |
| 65 | + |
def gpqa_fi_instruct_prompt(line, task_name: str = None):
    """Build a generative/instruct lighteval ``Doc`` for one GPQA-FI record.

    Mirrors the English GPQA diamond instruct pattern: the English
    instruction asks for a final 'Answer: $LETTER' line while the question
    and options themselves are Finnish.

    Args:
        line: Dataset row with ``Question``, ``Correct Answer`` and the
            three ``Incorrect Answer N`` fields.
        task_name: Name of the lighteval task this doc belongs to.

    Returns:
        Doc whose ``choices`` are the letters A-D and whose ``gold_index``
        marks the correct letter.
    """
    # Per-record deterministic RNG: same record -> same gold slot across
    # runs, so results are reproducible (plain random.randint is not).
    rng = random.Random(line["Question"])
    gold_index = rng.randint(0, 3)
    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    choices.insert(gold_index, line["Correct Answer"])
    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
    query_template = "{Instruction}\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
    query = query_template.format(
        A=choices[0].strip(),
        B=choices[1].strip(),
        C=choices[2].strip(),
        D=choices[3].strip(),
        Question=line["Question"].strip(),
        Instruction=instruction,
    )

    return Doc(
        task_name=task_name,
        query=query,
        choices=list(ascii_uppercase)[: len(choices)],
        gold_index=gold_index,
        instruction=instruction,
    )
| 88 | + |
| 89 | + |
# Log-likelihood based evaluation (matches French GPQA pattern):
# the model scores single-letter continuations A-D, so generation is one
# token and accuracy comes from loglikelihood comparison.
gpqa_fi = LightevalTaskConfig(
    name="gpqa-fi",
    prompt_function=gpqa_fi_prompt,
    sample_fields=record_to_sample,  # inspect-ai Sample conversion for this task
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    hf_repo="LumiOpen/GPQA-FI",
    hf_subset="default",
    # The dataset ships only a "train" split; it doubles as the eval split.
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    generation_size=1,  # one token is enough: the answer is a single letter
    metrics=[Metrics.loglikelihood_acc],
    stop_sequence=["\n"],
    version=0,
)
| 108 | + |
# Instruct / generative evaluation (matches English GPQA diamond pattern):
# the model reasons step by step and must end with an 'Answer: $LETTER' line,
# which the gpqa_instruct metric extracts and scores.
gpqa_fi_diamond = LightevalTaskConfig(
    name="gpqa-fi:diamond",
    prompt_function=gpqa_fi_instruct_prompt,
    sample_fields=record_to_sample,  # inspect-ai Sample conversion for this task
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    hf_repo="LumiOpen/GPQA-FI",
    hf_subset="default",
    # The dataset ships only a "train" split; it doubles as the eval split.
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,  # long budget so chain-of-thought is not truncated
    metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})],
    stop_sequence=[],  # no stop sequences: let the model finish its reasoning
    version=1,
)
| 127 | + |
# Export list picked up by lighteval's custom-task loader.
TASKS_TABLE = [gpqa_fi, gpqa_fi_diamond]
0 commit comments