|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +from mteb.abstasks.task_metadata import TaskMetadata |
| 4 | +from mteb.abstasks.zeroshot_classification import AbsTaskZeroShotClassification |
| 5 | + |
# BibTeX entry for the WorldSense paper; this string is attached verbatim to
# the task metadata (``bibtex_citation``) of both task classes below, so its
# content is runtime data and must not be reformatted.
# NOTE(review): the entry is typed ``@inproceedings`` but cites an arXiv
# preprint through a ``journal`` field — confirm whether ``@article`` (or
# ``@misc``) was intended.
CITATION = r"""
@inproceedings{hong2025worldsense,
  author = {Hong, Jack and Yan, Shilin and Cai, Jiayin and Jiang, Xiaolong and Hu, Yao and Xie, Weidi},
  journal = {arXiv preprint arXiv:2502.04326},
  title = {Worldsense: Evaluating real-world omnimodal understanding for multimodal llms},
  year = {2025},
}
"""
| 14 | + |
| 15 | + |
class WorldSenseAudioVideoZeroShotClassification(AbsTaskZeroShotClassification):
    """Zero-shot domain classification on WorldSense using video + audio input.

    Each sample provides a video clip and its audio track; the model matches
    the pair against the dataset's domain label names (candidate labels) and
    is scored by accuracy.
    """

    metadata = TaskMetadata(
        name="WorldSenseAudioVideoZeroShot",
        description="WorldSense is a multimodal video understanding benchmark encompassing visual, audio, and text inputs. Videos are categorized into 8 primary domains across 67 fine-grained subcategories. This zero-shot classification task predicts the domain category of a video clip",
        reference="https://arxiv.org/abs/2502.04326",
        dataset={
            "path": "mteb/WorldSense_1min",
            "revision": "10c7ce0eb32d620f1f685bfedde2724066068a1c",
        },
        type="VideoZeroshotClassification",
        category="va2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-02-06", "2026-03-01"),
        domains=["Scene", "AudioScene", "Music", "Entertainment"],
        task_subtypes=["Scene recognition"],
        license="not specified",
        annotations_creators="expert-annotated",
        dialect=[],
        modalities=["video", "audio"],
        sample_creation="found",
        bibtex_citation=CITATION,
        is_beta=True,
    )

    # Both modalities are read from the dataset; the tuple names the two
    # input columns consumed for each sample.
    input_column_name: tuple[str, str] = ("video", "audio")
    label_column_name: str = "domain"

    def get_candidate_labels(self) -> list[str]:
        """Return the human-readable domain names used as candidate labels.

        The split is taken from ``metadata.eval_splits`` instead of being
        hard-coded, so the method cannot drift out of sync with the metadata.
        """
        split = self.metadata.eval_splits[0]
        return self.dataset[split].features[self.label_column_name].names
| 47 | + |
| 48 | + |
class WorldSenseVideoZeroShotClassification(AbsTaskZeroShotClassification):
    """Zero-shot domain classification on WorldSense using video-only input.

    Visual-only counterpart of the audio+video task: the model matches each
    video clip against the dataset's domain label names (candidate labels)
    and is scored by accuracy.
    """

    metadata = TaskMetadata(
        name="WorldSenseVideoZeroShot",
        description="WorldSense is a multimodal video understanding benchmark encompassing visual, audio, and text inputs. Videos are categorized into 8 primary domains across 67 fine-grained subcategories. This zero-shot classification task predicts the domain category of a video clip",
        reference="https://arxiv.org/abs/2502.04326",
        dataset={
            "path": "mteb/WorldSense_1min",
            "revision": "10c7ce0eb32d620f1f685bfedde2724066068a1c",
        },
        type="VideoZeroshotClassification",
        category="v2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-02-06", "2026-03-01"),
        domains=["Scene", "AudioScene", "Music", "Entertainment"],
        task_subtypes=["Scene recognition"],
        license="not specified",
        annotations_creators="expert-annotated",
        dialect=[],
        modalities=["video"],
        sample_creation="found",
        bibtex_citation=CITATION,
        is_beta=True,
    )

    # Single input column: only the visual stream is consumed.
    input_column_name: str = "video"
    label_column_name: str = "domain"

    def get_candidate_labels(self) -> list[str]:
        """Return the human-readable domain names used as candidate labels.

        The split is taken from ``metadata.eval_splits`` instead of being
        hard-coded, so the method cannot drift out of sync with the metadata.
        """
        split = self.metadata.eval_splits[0]
        return self.dataset[split].features[self.label_column_name].names
0 commit comments