
Commit 285e68c

Authored by Rakshitha Ireddi (Rakshitha-Ireddi) and Isaac Chung (isaac-chung)
[MVEB] Add Daily-Omni video-centric QA task (#4530)
* [MVEB] Add DailyOmniVideoAudioCentricQA task and fix modalities
* add to init

---------

Co-authored-by: Rakshitha Ireddi <your.email@example.com>
Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
1 parent c7d1bb3 commit 285e68c

2 files changed

Lines changed: 156 additions & 0 deletions

File tree

mteb/tasks/multichoice/eng/__init__.py
mteb/tasks/multichoice/eng/daily_omni.py

Lines changed: 3 additions & 0 deletions
@@ -3,6 +3,7 @@
 from .blink_it2i_multi_choice import BLINKIT2IMultiChoice
 from .blink_it2t_multi_choice import BLINKIT2TMultiChoice
 from .cv_bench import CVBenchCount, CVBenchDepth, CVBenchDistance, CVBenchRelation
+from .daily_omni import DailyOmniVideoAudioCentricQA, DailyOmniVideoCentricQA
 from .egoschema import EgoSchemaVideoCentricQA
 from .nextqa import NExTQAVideoCentricQA
 from .perception_test import (
@@ -23,6 +24,8 @@
     "CVBenchDepth",
     "CVBenchDistance",
     "CVBenchRelation",
+    "DailyOmniVideoAudioCentricQA",
+    "DailyOmniVideoCentricQA",
     "EgoSchemaVideoCentricQA",
     "NExTQAVideoCentricQA",
     "PerceptionTestVideoAudioCentricQA",
mteb/tasks/multichoice/eng/daily_omni.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
from __future__ import annotations

from datasets import Dataset, load_dataset

from mteb.abstasks.retrieval import AbsTaskRetrieval
from mteb.abstasks.retrieval_dataset_loaders import RetrievalSplitData
from mteb.abstasks.task_metadata import TaskMetadata


class DailyOmniVideoCentricQA(AbsTaskRetrieval):
    metadata = TaskMetadata(
        name="DailyOmniVideoCentricQA",
        description="Daily-Omni is a video question answering benchmark covering everyday scenarios with audio-visual content. Each example pairs a video with a question and multiple candidate answers. The task is formulated as multiple-choice retrieval: given the (video, question) pair, retrieve the correct candidate.",
        reference="https://arxiv.org/abs/2505.17862",
        dataset={
            "path": "mteb/Daily-Omni",
            "revision": "1209825141184353b668f8c205765e313b3d2a26",
        },
        type="VideoCentricQA",
        category="vt2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-05-23", "2025-05-23"),
        domains=["Web"],
        task_subtypes=["Question answering"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        modalities=["video", "text"],
        sample_creation="found",
        is_beta=True,
        bibtex_citation=r"""
@article{zhou2025dailyomni,
  author = {Zhou, Ziwei and Wang, Rui and Wu, Zuxuan and Jiang, Yu-Gang},
  journal = {arXiv preprint arXiv:2505.17862},
  title = {Daily-Omni: Towards Audio-Visual Reasoning with Temporal Alignment across Modalities},
  year = {2025},
}
""",
    )

    def load_data(self, **kwargs) -> None:
        if self.data_loaded:
            return
        self.dataset = {"default": {}}
        for split in self.metadata.eval_splits:
            ds = load_dataset(
                self.metadata.dataset["path"],
                revision=self.metadata.dataset["revision"],
                split=split,
            )
            # Assign a stable per-row query id.
            ds = ds.add_column("id", [f"q{i}" for i in range(len(ds))])

            queries = ds.select_columns(["id", "question", "video"]).rename_column(
                "question", "text"
            )

            # Each candidate answer becomes one corpus document; top_ranked
            # restricts every query to ranking only its own candidates.
            corpus_rows: list[dict] = []
            relevant_docs: dict[str, dict[str, int]] = {}
            top_ranked: dict[str, list[str]] = {}
            for row in ds.select_columns(["id", "candidates", "answer"]):
                qid = row["id"]
                answer = row["answer"]
                top_ranked[qid] = []
                for j, candidate in enumerate(row["candidates"]):
                    doc_id = f"{qid}_c{j}"
                    corpus_rows.append({"id": doc_id, "text": candidate})
                    top_ranked[qid].append(doc_id)
                    if candidate == answer:
                        relevant_docs[qid] = {doc_id: 1}

            corpus = Dataset.from_list(corpus_rows)
            self.dataset["default"][split] = RetrievalSplitData(
                queries=queries,
                corpus=corpus,
                relevant_docs=relevant_docs,
                top_ranked=top_ranked,
            )
        self.data_loaded = True


class DailyOmniVideoAudioCentricQA(AbsTaskRetrieval):
    metadata = TaskMetadata(
        name="DailyOmniVideoAudioCentricQA",
        description="Daily-Omni is a video question answering benchmark covering everyday scenarios with audio-visual content. Each example pairs a video with audio, a question, and multiple candidate answers. The task is formulated as multiple-choice retrieval: given the (video, audio, question) tuple, retrieve the correct candidate.",
        reference="https://arxiv.org/abs/2505.17862",
        dataset={
            "path": "mteb/Daily-Omni",
            "revision": "1209825141184353b668f8c205765e313b3d2a26",
        },
        type="VideoCentricQA",
        category="vat2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-05-23", "2025-05-23"),
        domains=["Web"],
        task_subtypes=["Question answering"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        modalities=["video", "audio", "text"],
        sample_creation="found",
        is_beta=True,
        bibtex_citation=r"""
@article{zhou2025dailyomni,
  author = {Zhou, Ziwei and Wang, Rui and Wu, Zuxuan and Jiang, Yu-Gang},
  journal = {arXiv preprint arXiv:2505.17862},
  title = {Daily-Omni: Towards Audio-Visual Reasoning with Temporal Alignment across Modalities},
  year = {2025},
}
""",
    )

    def load_data(self, **kwargs) -> None:
        if self.data_loaded:
            return
        self.dataset = {"default": {}}
        for split in self.metadata.eval_splits:
            ds = load_dataset(
                self.metadata.dataset["path"],
                revision=self.metadata.dataset["revision"],
                split=split,
            )
            # Assign a stable per-row query id.
            ds = ds.add_column("id", [f"q{i}" for i in range(len(ds))])

            # Audio-visual variant: queries also carry the audio column.
            queries = ds.select_columns(
                ["id", "question", "video", "audio"]
            ).rename_column("question", "text")

            corpus_rows: list[dict] = []
            relevant_docs: dict[str, dict[str, int]] = {}
            top_ranked: dict[str, list[str]] = {}
            for row in ds.select_columns(["id", "candidates", "answer"]):
                qid = row["id"]
                answer = row["answer"]
                top_ranked[qid] = []
                for j, candidate in enumerate(row["candidates"]):
                    doc_id = f"{qid}_c{j}"
                    corpus_rows.append({"id": doc_id, "text": candidate})
                    top_ranked[qid].append(doc_id)
                    if candidate == answer:
                        relevant_docs[qid] = {doc_id: 1}

            corpus = Dataset.from_list(corpus_rows)
            self.dataset["default"][split] = RetrievalSplitData(
                queries=queries,
                corpus=corpus,
                relevant_docs=relevant_docs,
                top_ranked=top_ranked,
            )
        self.data_loaded = True
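For context, a hedged end-to-end sketch of exercising the new loader. mteb.get_tasks is the library's task lookup; the remaining attribute accesses come directly from the load_data implementation above, and the whole snippet assumes an mteb version that ships this beta task (is_beta=True):

    import mteb

    # Hedged sketch; assumes an mteb build that includes this commit.
    task = mteb.get_tasks(tasks=["DailyOmniVideoCentricQA"])[0]
    task.load_data()  # builds queries, corpus, qrels for the "test" split
    print(task.data_loaded)                       # True
    split_data = task.dataset["default"]["test"]  # RetrievalSplitData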
