Skip to content

Commit 27dc33f

Browse files
Rakshitha-Ireddi (Rakshitha Ireddi) and isaac-chung
authored
[MVEB] Add WorldSense 1min video-centric QA task (#4529)
* [MVEB] Add WorldSense1MinVideoAudioCentricQA task and fix modalities
* add init

---------

Co-authored-by: Rakshitha Ireddi <your.email@example.com>
Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
1 parent 30e6f28 commit 27dc33f

2 files changed

Lines changed: 156 additions & 0 deletions

File tree

mteb/tasks/multichoice/eng/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
PerceptionTestVideoCentricQA,
1111
)
1212
from .video_mme import VideoMMEShortVideoAudioCentricQA, VideoMMEShortVideoCentricQA
13+
from .worldsense import WorldSense1MinVideoAudioCentricQA, WorldSense1MinVideoCentricQA
1314

1415
__all__ = [
1516
"AVMemeExamVideoAudioCentricQA",
@@ -28,4 +29,6 @@
2829
"PerceptionTestVideoCentricQA",
2930
"VideoMMEShortVideoAudioCentricQA",
3031
"VideoMMEShortVideoCentricQA",
32+
"WorldSense1MinVideoAudioCentricQA",
33+
"WorldSense1MinVideoCentricQA",
3134
]
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
from __future__ import annotations
2+
3+
from datasets import Dataset, load_dataset
4+
5+
from mteb.abstasks.retrieval import AbsTaskRetrieval
6+
from mteb.abstasks.retrieval_dataset_loaders import RetrievalSplitData
7+
from mteb.abstasks.task_metadata import TaskMetadata
8+
9+
10+
class WorldSense1MinVideoCentricQA(AbsTaskRetrieval):
    """Multiple-choice retrieval task with (video, text) queries.

    Each query is a question plus its video; the corpus holds the textual
    answer candidates of every question, and ``top_ranked`` restricts each
    query to its own candidates.
    """

    metadata = TaskMetadata(
        name="WorldSense1MinVideoCentricQA",
        description="WorldSense_1min is a video question answering benchmark covering diverse real-world domains including sports, culture, music, and daily life. Each example pairs a ~1-minute video with a question and multiple candidate answers. The task is formulated as multiple-choice retrieval: given the (video, question) pair, retrieve the correct candidate.",
        reference="https://arxiv.org/abs/2502.04326",
        dataset={
            "path": "mteb/WorldSense_1min",
            "revision": "10c7ce0eb32d620f1f685bfedde2724066068a1c",
        },
        type="VideoCentricQA",
        category="vt2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-02-06", "2025-02-06"),
        domains=["Web"],
        task_subtypes=["Question answering"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        modalities=["video", "text"],
        sample_creation="found",
        is_beta=True,
        bibtex_citation=r"""
@article{hong2025worldsense,
author = {Hong, Jack and Yan, Shilin and Cai, Jiayin and Jiang, Xiaolong and Hu, Yao and Xie, Weidi},
journal = {arXiv preprint arXiv:2502.04326},
title = {WorldSense: Evaluating Real-world Omnimodal Understanding for Multimodal LLMs},
year = {2025},
}
""",
    )

    def load_data(self, **kwargs) -> None:
        """Load every evaluation split and reshape it for retrieval.

        For each split this builds:

        * ``queries``: the ``id``, ``video`` and question columns, with the
          question column renamed to ``text``;
        * ``corpus``: one ``{"id", "text"}`` row per answer candidate;
        * ``relevant_docs``: the candidate equal to the row's ``answer``;
        * ``top_ranked``: the ids of a query's own candidates, in order.

        The call is a no-op once ``self.data_loaded`` is set. ``kwargs`` are
        accepted but not used.
        """
        if self.data_loaded:
            return

        dataset_path = self.metadata.dataset["path"]
        dataset_revision = self.metadata.dataset["revision"]

        self.dataset = {"default": {}}
        for split in self.metadata.eval_splits:
            raw = load_dataset(
                dataset_path,
                revision=dataset_revision,
                split=split,
            )
            # Synthetic, position-based query ids: q0, q1, ...
            query_ids = [f"q{index}" for index in range(len(raw))]
            raw = raw.add_column("id", query_ids)

            query_columns = raw.select_columns(["id", "question", "video"])
            queries = query_columns.rename_column("question", "text")

            documents: list[dict] = []
            qrels: dict[str, dict[str, int]] = {}
            candidate_lists: dict[str, list[str]] = {}
            for record in raw.select_columns(["id", "candidates", "answer"]):
                query_id = record["id"]
                gold = record["answer"]
                option_ids: list[str] = []
                for position, option in enumerate(record["candidates"]):
                    option_id = f"{query_id}_c{position}"
                    documents.append({"id": option_id, "text": option})
                    option_ids.append(option_id)
                    # Exact string match; if several candidates equal the
                    # answer the last one wins, and if none does the query
                    # gets no relevance entry.
                    if option == gold:
                        qrels[query_id] = {option_id: 1}
                candidate_lists[query_id] = option_ids

            self.dataset["default"][split] = RetrievalSplitData(
                queries=queries,
                corpus=Dataset.from_list(documents),
                relevant_docs=qrels,
                top_ranked=candidate_lists,
            )

        self.data_loaded = True
81+
82+
83+
class WorldSense1MinVideoAudioCentricQA(AbsTaskRetrieval):
    """Multiple-choice retrieval task with (video, audio, text) queries.

    Each query is a question plus its video and audio; the corpus holds the
    textual answer candidates of every question, and ``top_ranked`` restricts
    each query to its own candidates.
    """

    metadata = TaskMetadata(
        name="WorldSense1MinVideoAudioCentricQA",
        description="WorldSense_1min is a video question answering benchmark covering diverse real-world domains including sports, culture, music, and daily life. Each example pairs a ~1-minute video with audio and a question and multiple candidate answers. The task is formulated as multiple-choice retrieval: given the (video, audio, question) tuple, retrieve the correct candidate.",
        reference="https://arxiv.org/abs/2502.04326",
        dataset={
            "path": "mteb/WorldSense_1min",
            "revision": "10c7ce0eb32d620f1f685bfedde2724066068a1c",
        },
        type="VideoCentricQA",
        category="vat2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-02-06", "2025-02-06"),
        domains=["Web"],
        task_subtypes=["Question answering"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        modalities=["video", "audio", "text"],
        sample_creation="found",
        is_beta=True,
        bibtex_citation=r"""
@article{hong2025worldsense,
author = {Hong, Jack and Yan, Shilin and Cai, Jiayin and Jiang, Xiaolong and Hu, Yao and Xie, Weidi},
journal = {arXiv preprint arXiv:2502.04326},
title = {WorldSense: Evaluating Real-world Omnimodal Understanding for Multimodal LLMs},
year = {2025},
}
""",
    )

    def load_data(self, **kwargs) -> None:
        """Load every evaluation split and reshape it for retrieval.

        For each split this builds:

        * ``queries``: the ``id``, ``video``, ``audio`` and question columns,
          with the question column renamed to ``text``;
        * ``corpus``: one ``{"id", "text"}`` row per answer candidate;
        * ``relevant_docs``: the candidate equal to the row's ``answer``;
        * ``top_ranked``: the ids of a query's own candidates, in order.

        The call is a no-op once ``self.data_loaded`` is set. ``kwargs`` are
        accepted but not used.
        """
        if self.data_loaded:
            return

        dataset_path = self.metadata.dataset["path"]
        dataset_revision = self.metadata.dataset["revision"]

        self.dataset = {"default": {}}
        for split in self.metadata.eval_splits:
            raw = load_dataset(
                dataset_path,
                revision=dataset_revision,
                split=split,
            )
            # Synthetic, position-based query ids: q0, q1, ...
            query_ids = [f"q{index}" for index in range(len(raw))]
            raw = raw.add_column("id", query_ids)

            query_columns = raw.select_columns(
                ["id", "question", "video", "audio"]
            )
            queries = query_columns.rename_column("question", "text")

            documents: list[dict] = []
            qrels: dict[str, dict[str, int]] = {}
            candidate_lists: dict[str, list[str]] = {}
            for record in raw.select_columns(["id", "candidates", "answer"]):
                query_id = record["id"]
                gold = record["answer"]
                option_ids: list[str] = []
                for position, option in enumerate(record["candidates"]):
                    option_id = f"{query_id}_c{position}"
                    documents.append({"id": option_id, "text": option})
                    option_ids.append(option_id)
                    # Exact string match; if several candidates equal the
                    # answer the last one wins, and if none does the query
                    # gets no relevance entry.
                    if option == gold:
                        qrels[query_id] = {option_id: 1}
                candidate_lists[query_id] = option_ids

            self.dataset["default"][split] = RetrievalSplitData(
                queries=queries,
                corpus=Dataset.from_list(documents),
                relevant_docs=qrels,
                top_ranked=candidate_lists,
            )

        self.data_loaded = True

0 commit comments

Comments
 (0)