From b42fb73ca0ee692b7a92b7b5273daefb037a5391 Mon Sep 17 00:00:00 2001
From: mvanypersele
Date: Mon, 9 Mar 2026 10:52:33 +0000
Subject: [PATCH 1/2] squad_v2.py: include unanswerable questions in evaluation

Fixes #1184
---
 src/lighteval/tasks/tasks/squad_v2.py | 45 ++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/src/lighteval/tasks/tasks/squad_v2.py b/src/lighteval/tasks/tasks/squad_v2.py
index a70358b65..be9a33af6 100644
--- a/src/lighteval/tasks/tasks/squad_v2.py
+++ b/src/lighteval/tasks/tasks/squad_v2.py
@@ -16,6 +16,14 @@
 when possible, but also determine when no answer is supported by the paragraph
 and abstain from answering.
 
+note:
+This is an LLM-friendly adaptation of the original SQuAD 2.0 evaluation.
+The original evaluation uses extractive span selection with a confidence-based
+"no answer" threshold, which does not apply to generative models.
+Here, the model is instead instructed to generate "unanswerable" when the
+question cannot be answered from the context. EM and F1 metrics are computed
+over both answerable and unanswerable questions.
+
 languages:
 english
 
@@ -28,12 +36,46 @@
 
 from lighteval.metrics.metrics import Metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
 from lighteval.tasks.templates.qa import get_qa_prompt_function
 from lighteval.utils.language import Language
 
 
+UNANSWERABLE = "unanswerable"
+
+
+def squad_v2_prompt(line, task_name: str = None):
+    answers = list(set(ans for ans in line["answers"]["text"] if len(ans) > 0))
+    is_unanswerable = len(answers) == 0
+
+    if is_unanswerable:
+        choices = [f" {UNANSWERABLE}"]
+    else:
+        choices = [f" {ans}" for ans in answers]
+
+    return Doc(
+        task_name=task_name,
+        query=f"Context: {line['context']}\nQuestion: {line['question']}\n"
+        f"Answer with a span from the context, or \"{UNANSWERABLE}\" if the question cannot be answered.\nAnswer:",
+        choices=choices,
+        gold_index=list(range(len(choices))),
+    )
+
+
 squad_v2 = LightevalTaskConfig(
     name="squad_v2",
+    prompt_function=squad_v2_prompt,
+    hf_repo="rajpurkar/squad_v2",
+    hf_subset="squad_v2",
+    evaluation_splits=("validation",),
+    few_shots_split="train",
+    stop_sequence=["\n", "Question:", "question:"],
+    generation_size=200,
+    metrics=[Metrics.exact_match, Metrics.f1_score],
+    version=2,
+)
+
+squad_v2_answerable = LightevalTaskConfig(
+    name="squad_v2:answerable",
     prompt_function=get_qa_prompt_function(
         Language.ENGLISH,
         lambda line: {
@@ -49,10 +91,11 @@
     few_shots_split="train",
     stop_sequence=["\n", "Question:", "question:"],
     generation_size=200,
-    metrics=[Metrics.exact_match],
+    metrics=[Metrics.exact_match, Metrics.f1_score],
     version=1,
 )
 
 TASKS_TABLE = [
     squad_v2,
+    squad_v2_answerable,
 ]

From e59c8c8736f2b14386e24a9cc400312cb182179b Mon Sep 17 00:00:00 2001
From: mvanypersele
Date: Tue, 10 Mar 2026 12:44:49 +0000
Subject: [PATCH 2/2] Add faithfulness metric to squad_v2

---
 src/lighteval/tasks/tasks/squad_v2.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/lighteval/tasks/tasks/squad_v2.py b/src/lighteval/tasks/tasks/squad_v2.py
index be9a33af6..7e0f18414 100644
--- a/src/lighteval/tasks/tasks/squad_v2.py
+++ b/src/lighteval/tasks/tasks/squad_v2.py
@@ -58,6 +58,7 @@ def squad_v2_prompt(line, task_name: str = None):
         f"Answer with a span from the context, or \"{UNANSWERABLE}\" if the question cannot be answered.\nAnswer:",
         choices=choices,
         gold_index=list(range(len(choices))),
+        specific={"text": line["context"]},
     )
 
 
@@ -70,7 +71,7 @@ def squad_v2_prompt(line, task_name: str = None):
     few_shots_split="train",
     stop_sequence=["\n", "Question:", "question:"],
     generation_size=200,
-    metrics=[Metrics.exact_match, Metrics.f1_score],
+    metrics=[Metrics.exact_match, Metrics.f1_score, Metrics.faithfulness],
     version=2,
 )