|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +from datasets import Dataset, load_dataset |
| 4 | + |
| 5 | +from mteb.abstasks.retrieval import AbsTaskRetrieval |
| 6 | +from mteb.abstasks.retrieval_dataset_loaders import RetrievalSplitData |
| 7 | +from mteb.abstasks.task_metadata import TaskMetadata |
| 8 | + |
| 9 | + |
class DailyOmniVideoCentricQA(AbsTaskRetrieval):
    """Multiple-choice retrieval over the Daily-Omni benchmark (video + text queries).

    Each (video, question) pair is a query; the question's candidate answers form
    that query's candidate document pool, and the gold answer is the relevant one.
    """

    metadata = TaskMetadata(
        name="DailyOmniVideoCentricQA",
        description="Daily-Omni is a video question answering benchmark covering everyday scenarios with audio-visual content. Each example pairs a video with a question and multiple candidate answers. The task is formulated as multiple-choice retrieval: given the (video, question) pair, retrieve the correct candidate.",
        reference="https://arxiv.org/abs/2505.17862",
        dataset={
            "path": "mteb/Daily-Omni",
            "revision": "1209825141184353b668f8c205765e313b3d2a26",
        },
        type="VideoCentricQA",
        category="vt2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-05-23", "2025-05-23"),
        domains=["Web"],
        task_subtypes=["Question answering"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        modalities=["video", "text"],
        sample_creation="found",
        is_beta=True,
        bibtex_citation=r"""
@article{zhou2025dailyomni,
  author = {Zhou, Ziwei and Wang, Rui and Wu, Zuxuan and Jiang, Yu-Gang},
  journal = {arXiv preprint arXiv:2505.17862},
  title = {Daily-Omni: Towards Audio-Visual Reasoning with Temporal Alignment across Modalities},
  year = {2025},
}
""",
    )

    def load_data(self, **kwargs) -> None:
        """Build per-split retrieval data from the Daily-Omni dataset.

        For each split listed in the metadata: every row becomes one query
        (``video`` plus ``question`` renamed to ``text``); each of its
        ``candidates`` becomes a per-query corpus document. ``top_ranked``
        restricts scoring to a query's own candidates, and ``relevant_docs``
        marks every candidate whose text equals the gold ``answer``.
        """
        if self.data_loaded:
            return
        self.dataset = {"default": {}}
        for split in self.metadata.eval_splits:
            ds = load_dataset(
                self.metadata.dataset["path"],
                revision=self.metadata.dataset["revision"],
                split=split,
            )
            # Synthetic stable ids: row order is deterministic for the pinned revision.
            ds = ds.add_column("id", [f"q{i}" for i in range(len(ds))])

            queries = ds.select_columns(["id", "question", "video"]).rename_column(
                "question", "text"
            )

            corpus_rows: list[dict] = []
            relevant_docs: dict[str, dict[str, int]] = {}
            top_ranked: dict[str, list[str]] = {}
            for row in ds.select_columns(["id", "candidates", "answer"]):
                qid = row["id"]
                answer = row["answer"]
                top_ranked[qid] = []
                for j, candidate in enumerate(row["candidates"]):
                    doc_id = f"{qid}_c{j}"
                    corpus_rows.append({"id": doc_id, "text": candidate})
                    top_ranked[qid].append(doc_id)
                    if candidate == answer:
                        # Accumulate instead of overwriting: if the answer text
                        # appears more than once among the candidates, every
                        # matching doc is relevant (assignment kept only the last).
                        relevant_docs.setdefault(qid, {})[doc_id] = 1

            corpus = Dataset.from_list(corpus_rows)
            self.dataset["default"][split] = RetrievalSplitData(
                queries=queries,
                corpus=corpus,
                relevant_docs=relevant_docs,
                top_ranked=top_ranked,
            )
        self.data_loaded = True
| 81 | + |
| 82 | + |
class DailyOmniVideoAudioCentricQA(AbsTaskRetrieval):
    """Multiple-choice retrieval over Daily-Omni with video, audio, and text queries.

    Each (video, audio, question) tuple is a query; the question's candidate
    answers form that query's candidate document pool, and the gold answer is
    the relevant one.
    """

    metadata = TaskMetadata(
        name="DailyOmniVideoAudioCentricQA",
        description="Daily-Omni is a video question answering benchmark covering everyday scenarios with audio-visual content. Each example pairs a video with audio and a question and multiple candidate answers. The task is formulated as multiple-choice retrieval: given the (video, audio, question) tuple, retrieve the correct candidate.",
        reference="https://arxiv.org/abs/2505.17862",
        dataset={
            "path": "mteb/Daily-Omni",
            "revision": "1209825141184353b668f8c205765e313b3d2a26",
        },
        type="VideoCentricQA",
        category="vat2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-05-23", "2025-05-23"),
        domains=["Web"],
        task_subtypes=["Question answering"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        modalities=["video", "audio", "text"],
        sample_creation="found",
        is_beta=True,
        bibtex_citation=r"""
@article{zhou2025dailyomni,
  author = {Zhou, Ziwei and Wang, Rui and Wu, Zuxuan and Jiang, Yu-Gang},
  journal = {arXiv preprint arXiv:2505.17862},
  title = {Daily-Omni: Towards Audio-Visual Reasoning with Temporal Alignment across Modalities},
  year = {2025},
}
""",
    )

    def load_data(self, **kwargs) -> None:
        """Build per-split retrieval data from the Daily-Omni dataset.

        For each split listed in the metadata: every row becomes one query
        (``video`` and ``audio`` plus ``question`` renamed to ``text``); each
        of its ``candidates`` becomes a per-query corpus document.
        ``top_ranked`` restricts scoring to a query's own candidates, and
        ``relevant_docs`` marks every candidate whose text equals the gold
        ``answer``.
        """
        if self.data_loaded:
            return
        self.dataset = {"default": {}}
        for split in self.metadata.eval_splits:
            ds = load_dataset(
                self.metadata.dataset["path"],
                revision=self.metadata.dataset["revision"],
                split=split,
            )
            # Synthetic stable ids: row order is deterministic for the pinned revision.
            ds = ds.add_column("id", [f"q{i}" for i in range(len(ds))])

            queries = ds.select_columns(
                ["id", "question", "video", "audio"]
            ).rename_column("question", "text")

            corpus_rows: list[dict] = []
            relevant_docs: dict[str, dict[str, int]] = {}
            top_ranked: dict[str, list[str]] = {}
            for row in ds.select_columns(["id", "candidates", "answer"]):
                qid = row["id"]
                answer = row["answer"]
                top_ranked[qid] = []
                for j, candidate in enumerate(row["candidates"]):
                    doc_id = f"{qid}_c{j}"
                    corpus_rows.append({"id": doc_id, "text": candidate})
                    top_ranked[qid].append(doc_id)
                    if candidate == answer:
                        # Accumulate instead of overwriting: if the answer text
                        # appears more than once among the candidates, every
                        # matching doc is relevant (assignment kept only the last).
                        relevant_docs.setdefault(qid, {})[doc_id] = 1

            corpus = Dataset.from_list(corpus_rows)
            self.dataset["default"][split] = RetrievalSplitData(
                queries=queries,
                corpus=corpus,
                relevant_docs=relevant_docs,
                top_ranked=top_ranked,
            )
        self.data_loaded = True
0 commit comments