embeddings-benchmark · isaac-chung · Apr 30, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 30, 2026
diff --git a/mteb/tasks/zeroshot_classification/eng/__init__.py b/mteb/tasks/zeroshot_classification/eng/__init__.py
@@ -20,6 +20,14 @@
     Kinetics400VAZeroShotClassification,
     Kinetics400ZeroShotClassification,
 )
+from .kinetics600 import (
+    Kinetics600VAZeroShotClassification,
+    Kinetics600VZeroShotClassification,
+)
+from .kinetics700 import (
+    Kinetics700VAZeroShotClassification,
+    Kinetics700VZeroShotClassification,
+)
 from .meld_classification import (
     MELDAudioVideoZeroShotClassification,
     MELDVideoZeroShotClassification,
@@ -73,6 +81,10 @@
     "Imagenet1kZeroShotClassification",
     "Kinetics400VAZeroShotClassification",
     "Kinetics400ZeroShotClassification",
+    "Kinetics600VAZeroShotClassification",
+    "Kinetics600VZeroShotClassification",
+    "Kinetics700VAZeroShotClassification",
+    "Kinetics700VZeroShotClassification",
     "MELDAudioVideoZeroShotClassification",
     "MELDVideoZeroShotClassification",
     "MNISTZeroShotClassification",

diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics600.py b/mteb/tasks/zeroshot_classification/eng/kinetics600.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from mteb.abstasks.task_metadata import TaskMetadata
+from mteb.abstasks.zeroshot_classification import AbsTaskZeroShotClassification
+
+CITATION = r"""
+@article{carreira2018short,
+  author = {Carreira, Joao and Noland, Eric and Banki-Horvath, Andras and Hillier, Chloe and Zisserman, Andrew},
+  journal = {arXiv preprint arXiv:1808.01340},
+  title = {A Short Note about Kinetics-600},
+  year = {2018},
+}
+"""  # BibTeX entry passed as bibtex_citation by both Kinetics-600 task classes in this module
+
+
+class Kinetics600VAZeroShotClassification(AbsTaskZeroShotClassification):  # Kinetics-600 zero-shot task, video + audio variant
+    metadata = TaskMetadata(
+        name="Kinetics600VAZeroShot",  # NOTE(review): omits the class name's "Classification" suffix — confirm intended
+        description="Kinetics-600 is a large-scale action recognition dataset containing 600 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses both video and audio modalities.",
+        reference="https://arxiv.org/abs/1808.01340",
+        dataset={
+            "path": "mteb/kinetics-600",
+            "revision": "a7be893c873e39341a96753e99bfd7b7025aaaf9",  # pinned dataset revision
+        },
+        type="VideoZeroshotClassification",
+        category="va2t",  # presumably "video+audio to text" — TODO confirm
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=(  # start and end are the same day; presumably the paper's release date — TODO confirm
+            "2018-08-03",
+            "2018-08-03",
+        ),
+        domains=["Web", "Scene"],
+        task_subtypes=["Activity recognition"],
+        license="cc-by-4.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["video", "audio", "text"],
+        sample_creation="found",
+        bibtex_citation=CITATION,
+        is_beta=True,
+    )
+
+    input_column_name = ("video", "audio")  # two input columns; presumably consumed together by the base class — verify
+    label_column_name: str = "label"  # column whose feature supplies the class names used below
+
+    def get_candidate_labels(self) -> list[str]:  # returns one "a video of <name>" prompt per class name
+        return [
+            f"a video of {name}"  # same "video" wording is used even though this variant also lists audio as input
+            for name in self.dataset["test"].features[self.label_column_name].names  # names read from the "test" split's label feature
+        ]
+
+
+class Kinetics600VZeroShotClassification(AbsTaskZeroShotClassification):  # Kinetics-600 zero-shot task, video-only variant
+    metadata = TaskMetadata(
+        name="Kinetics600VZeroShot",  # NOTE(review): omits the class name's "Classification" suffix — confirm intended
+        description="Kinetics-600 is a large-scale action recognition dataset containing 600 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses video only.",
+        reference="https://arxiv.org/abs/1808.01340",
+        dataset={
+            "path": "mteb/kinetics-600",
+            "revision": "a7be893c873e39341a96753e99bfd7b7025aaaf9",  # pinned dataset revision
+        },
+        type="VideoZeroshotClassification",
+        category="v2t",  # presumably "video to text" — TODO confirm
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=(  # start and end are the same day; presumably the paper's release date — TODO confirm
+            "2018-08-03",
+            "2018-08-03",
+        ),
+        domains=["Web", "Scene"],
+        task_subtypes=["Activity recognition"],
+        license="cc-by-4.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["video", "text"],
+        sample_creation="found",
+        bibtex_citation=CITATION,
+        is_beta=True,
+    )
+
+    input_column_name = "video"  # single input column for this variant
+    label_column_name: str = "label"  # column whose feature supplies the class names used below
+
+    def get_candidate_labels(self) -> list[str]:  # returns one "a video of <name>" prompt per class name
+        return [
+            f"a video of {name}"
+            for name in self.dataset["test"].features[self.label_column_name].names  # names read from the "test" split's label feature
+        ]
diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics700.py b/mteb/tasks/zeroshot_classification/eng/kinetics700.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from mteb.abstasks.task_metadata import TaskMetadata
+from mteb.abstasks.zeroshot_classification import AbsTaskZeroShotClassification
+
+CITATION = r"""
+@article{smaira2020short,
+  author = {Smaira, Lucas and Carreira, Joao and Noland, Eric and Clancy, Ellen and Wu, Amy and Zisserman, Andrew},
+  journal = {arXiv preprint arXiv:2010.10864},
+  title = {A Short Note on the Kinetics-700-2020 Human Action Dataset},
+  year = {2020},
+}
+"""  # BibTeX entry passed as bibtex_citation by both Kinetics-700 task classes in this module
+
+
+class Kinetics700VAZeroShotClassification(AbsTaskZeroShotClassification):  # Kinetics-700-2020 zero-shot task, video + audio variant
+    metadata = TaskMetadata(
+        name="Kinetics700VAZeroShot",  # NOTE(review): omits the class name's "Classification" suffix — confirm intended
+        description="Kinetics-700-2020 is a large-scale action recognition dataset containing 700 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses both video and audio modalities.",
+        reference="https://arxiv.org/abs/2010.10864",
+        dataset={
+            "path": "mteb/kinetics-700-2020",
+            "revision": "e9f50aa09759e014b8afc16cc27ec536d4c0747f",  # pinned dataset revision
+        },
+        type="VideoZeroshotClassification",
+        category="va2t",  # presumably "video+audio to text" — TODO confirm
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=(  # start and end are the same day; presumably the paper's release date — TODO confirm
+            "2020-10-21",
+            "2020-10-21",
+        ),
+        domains=["Web", "Scene"],
+        task_subtypes=["Activity recognition"],
+        license="cc-by-4.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["video", "audio", "text"],
+        sample_creation="found",
+        bibtex_citation=CITATION,
+        is_beta=True,
+    )
+
+    input_column_name = ("video", "audio")  # two input columns; presumably consumed together by the base class — verify
+    label_column_name: str = "label"  # column whose feature supplies the class names used below
+
+    def get_candidate_labels(self) -> list[str]:  # returns one "a video of <name>" prompt per class name
+        return [
+            f"a video of {name}"  # same "video" wording is used even though this variant also lists audio as input
+            for name in self.dataset["test"].features[self.label_column_name].names  # names read from the "test" split's label feature
+        ]
+
+
+class Kinetics700VZeroShotClassification(AbsTaskZeroShotClassification):  # Kinetics-700-2020 zero-shot task, video-only variant
+    metadata = TaskMetadata(
+        name="Kinetics700VZeroShot",  # NOTE(review): omits the class name's "Classification" suffix — confirm intended
+        description="Kinetics-700-2020 is a large-scale action recognition dataset containing 700 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses video only.",
+        reference="https://arxiv.org/abs/2010.10864",
+        dataset={
+            "path": "mteb/kinetics-700-2020",
+            "revision": "e9f50aa09759e014b8afc16cc27ec536d4c0747f",  # pinned dataset revision
+        },
+        type="VideoZeroshotClassification",
+        category="v2t",  # presumably "video to text" — TODO confirm
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=(  # start and end are the same day; presumably the paper's release date — TODO confirm
+            "2020-10-21",
+            "2020-10-21",
+        ),
+        domains=["Web", "Scene"],
+        task_subtypes=["Activity recognition"],
+        license="cc-by-4.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["video", "text"],
+        sample_creation="found",
+        bibtex_citation=CITATION,
+        is_beta=True,
+    )
+
+    input_column_name = "video"  # single input column for this variant
+    label_column_name: str = "label"  # column whose feature supplies the class names used below
+
+    def get_candidate_labels(self) -> list[str]:  # returns one "a video of <name>" prompt per class name
+        return [
+            f"a video of {name}"
+            for name in self.dataset["test"].features[self.label_column_name].names  # names read from the "test" split's label feature
+        ]