|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +from mteb.abstasks.task_metadata import TaskMetadata |
| 4 | +from mteb.abstasks.zeroshot_classification import AbsTaskZeroShotClassification |
| 5 | + |
# BibTeX entry for the WorldSense paper; this string is attached verbatim to
# the task metadata (``bibtex_citation``) of both task classes below, so its
# content is runtime data and must not be reformatted.
# NOTE(review): the entry is typed ``@inproceedings`` but cites an arXiv
# preprint through a ``journal`` field — confirm whether ``@article`` (or
# ``@misc``) was intended.
CITATION = r"""
@inproceedings{hong2025worldsense,
  author = {Hong, Jack and Yan, Shilin and Cai, Jiayin and Jiang, Xiaolong and Hu, Yao and Xie, Weidi},
  journal = {arXiv preprint arXiv:2502.04326},
  title = {Worldsense: Evaluating real-world omnimodal understanding for multimodal llms},
  year = {2025},
}
"""
| 14 | + |
| 15 | + |
class WorldSenseAudioVideoZeroShotClassification(AbsTaskZeroShotClassification):
    """Zero-shot domain classification on WorldSense using video + audio input.

    Each sample provides a video clip and its audio track; the model matches
    the pair against the dataset's domain label names (candidate labels) and
    is scored by accuracy.
    """

    metadata = TaskMetadata(
        name="WorldSenseAudioVideoZeroShot",
        description="WorldSense is a multimodal video understanding benchmark encompassing visual, audio, and text inputs. Videos are categorized into 8 primary domains across 67 fine-grained subcategories. This zero-shot classification task predicts the domain category of a video clip",
        reference="https://arxiv.org/abs/2502.04326",
        dataset={
            "path": "mteb/WorldSense_1min",
            "revision": "10c7ce0eb32d620f1f685bfedde2724066068a1c",
        },
        type="VideoZeroshotClassification",
        category="va2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-02-06", "2026-03-01"),
        domains=["Scene", "AudioScene", "Music", "Entertainment"],
        task_subtypes=["Scene recognition"],
        license="not specified",
        annotations_creators="expert-annotated",
        dialect=[],
        modalities=["video", "audio"],
        sample_creation="found",
        bibtex_citation=CITATION,
        is_beta=True,
    )

    # Both modalities are read from the dataset; the tuple names the two
    # input columns consumed for each sample.
    input_column_name: tuple[str, str] = ("video", "audio")
    label_column_name: str = "domain"

    def get_candidate_labels(self) -> list[str]:
        """Return the human-readable domain names used as candidate labels.

        The split is taken from ``metadata.eval_splits`` instead of being
        hard-coded, so the method cannot drift out of sync with the metadata.
        """
        split = self.metadata.eval_splits[0]
        return self.dataset[split].features[self.label_column_name].names
| 47 | + |
| 48 | + |
class WorldSenseVideoZeroShotClassification(AbsTaskZeroShotClassification):
    """Zero-shot domain classification on WorldSense using video-only input.

    Visual-only counterpart of the audio+video task: the model matches each
    video clip against the dataset's domain label names (candidate labels)
    and is scored by accuracy.
    """

    metadata = TaskMetadata(
        name="WorldSenseVideoZeroShot",
        description="WorldSense is a multimodal video understanding benchmark encompassing visual, audio, and text inputs. Videos are categorized into 8 primary domains across 67 fine-grained subcategories. This zero-shot classification task predicts the domain category of a video clip",
        reference="https://arxiv.org/abs/2502.04326",
        dataset={
            "path": "mteb/WorldSense_1min",
            "revision": "10c7ce0eb32d620f1f685bfedde2724066068a1c",
        },
        type="VideoZeroshotClassification",
        category="v2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-02-06", "2026-03-01"),
        domains=["Scene", "AudioScene", "Music", "Entertainment"],
        task_subtypes=["Scene recognition"],
        license="not specified",
        annotations_creators="expert-annotated",
        dialect=[],
        modalities=["video"],
        sample_creation="found",
        bibtex_citation=CITATION,
        is_beta=True,
    )

    # Single input column: only the visual stream is consumed.
    input_column_name: str = "video"
    label_column_name: str = "domain"

    def get_candidate_labels(self) -> list[str]:
        """Return the human-readable domain names used as candidate labels.

        The split is taken from ``metadata.eval_splits`` instead of being
        hard-coded, so the method cannot drift out of sync with the metadata.
        """
        split = self.metadata.eval_splits[0]
        return self.dataset[split].features[self.label_column_name].names
0 commit comments