From c30ff04aac25ea31153d560de176c9f978c30117 Mon Sep 17 00:00:00 2001 From: deep9539 Date: Mon, 27 Apr 2026 22:59:41 -0700 Subject: [PATCH 1/6] Add ZeroShotClassification for Kinetics700 --- .../zeroshot_classification/eng/__init__.py | 6 ++ .../eng/kinetics700.py | 89 +++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 mteb/tasks/zeroshot_classification/eng/kinetics700.py diff --git a/mteb/tasks/zeroshot_classification/eng/__init__.py b/mteb/tasks/zeroshot_classification/eng/__init__.py index 31d2f88591..9caefcc9b4 100644 --- a/mteb/tasks/zeroshot_classification/eng/__init__.py +++ b/mteb/tasks/zeroshot_classification/eng/__init__.py @@ -24,6 +24,10 @@ MELDAudioVideoZeroShotClassification, MELDVideoZeroShotClassification, ) +from .kinetics700 import ( + Kinetics700VAZeroShotClassification, + Kinetics700VZeroShotClassification, +) from .mnist import MNISTZeroShotClassification from .music_avqa import ( MusicAVQACLSAudioVideoZeroShotClassification, @@ -73,6 +77,8 @@ "Imagenet1kZeroShotClassification", "Kinetics400VAZeroShotClassification", "Kinetics400ZeroShotClassification", + "Kinetics700VAZeroShotClassification", + "Kinetics700VZeroShotClassification", "MELDAudioVideoZeroShotClassification", "MELDVideoZeroShotClassification", "MNISTZeroShotClassification", diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics700.py b/mteb/tasks/zeroshot_classification/eng/kinetics700.py new file mode 100644 index 0000000000..4f8aa4cc7e --- /dev/null +++ b/mteb/tasks/zeroshot_classification/eng/kinetics700.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from mteb.abstasks.task_metadata import TaskMetadata +from mteb.abstasks.zeroshot_classification import AbsTaskZeroShotClassification + +CITATION = r""" +@article{smaira2020short, + author = {Smaira, Lucas and Carreira, Joao and Noland, Eric and Clancy, Ellen and Wu, Amy and Zisserman, Andrew}, + journal = {arXiv preprint arXiv:2010.10864}, + title = {A Short Note on the Kinetics-700-2020 Human Action Dataset}, + year = {2020}, +} +""" + + +class Kinetics700VAZeroShotClassification(AbsTaskZeroShotClassification): + metadata = TaskMetadata( + name="Kinetics700VAZeroShot", + description="Kinetics-700-2020 is a large-scale action recognition dataset containing 700 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses both video and audio modalities.", + reference="https://arxiv.org/abs/2010.10864", + dataset={ + "path": "mteb/kinetics-700-2020", + "revision": "e9f50aa09759e014b8afc16cc27ec536d4c0747f", + }, + type="VideoZeroshotClassification", + category="va2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2020-10-21", + "2020-10-21", + ), + domains=["Web", "Scene"], + task_subtypes=["Activity recognition"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["video", "audio"], + sample_creation="found", + bibtex_citation=CITATION, + is_beta=True, + ) + + input_column_name = ("video", "audio") + label_column_name: str = "label" + + def get_candidate_labels(self) -> list[str]: + return [ + name for name in self.dataset["test"].features[self.label_column_name].names + ] + + +class Kinetics700VZeroShotClassification(AbsTaskZeroShotClassification): + metadata = TaskMetadata( + name="Kinetics700VZeroShot", + description="Kinetics-700-2020 is a large-scale action recognition dataset containing 700 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses video only.", + reference="https://arxiv.org/abs/2010.10864", + dataset={ + "path": "mteb/kinetics-700-2020", + "revision": "e9f50aa09759e014b8afc16cc27ec536d4c0747f", + }, + type="VideoZeroshotClassification", + category="v2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2020-10-21", + "2020-10-21", + ), + domains=["Web", "Scene"], + task_subtypes=["Activity recognition"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["video"], + sample_creation="found", + bibtex_citation=CITATION, + is_beta=True, + ) + + input_column_name = "video" + label_column_name: str = "label" + + def get_candidate_labels(self) -> list[str]: + return [ + name for name in self.dataset["test"].features[self.label_column_name].names + ] From 3b01bfbcf3c5b96e305379e72b2fd43a3a4db705 Mon Sep 17 00:00:00 2001 From: deep9539 Date: Mon, 27 Apr 2026 23:04:54 -0700 Subject: [PATCH 2/6] Add ZeroShotClassification for Kinetics600 --- .../zeroshot_classification/eng/__init__.py | 12 ++- .../eng/kinetics600.py | 89 +++++++++++++++++++ 2 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 mteb/tasks/zeroshot_classification/eng/kinetics600.py diff --git a/mteb/tasks/zeroshot_classification/eng/__init__.py b/mteb/tasks/zeroshot_classification/eng/__init__.py index 9caefcc9b4..94d2105bc2 100644 --- a/mteb/tasks/zeroshot_classification/eng/__init__.py +++ b/mteb/tasks/zeroshot_classification/eng/__init__.py @@ -20,14 +20,18 @@ Kinetics400VAZeroShotClassification, Kinetics400ZeroShotClassification, ) -from .meld_classification import ( - MELDAudioVideoZeroShotClassification, - MELDVideoZeroShotClassification, +from .kinetics600 import ( + Kinetics600VAZeroShotClassification, + Kinetics600VZeroShotClassification, ) from .kinetics700 import ( Kinetics700VAZeroShotClassification, Kinetics700VZeroShotClassification, ) +from .meld_classification import ( + MELDAudioVideoZeroShotClassification, + MELDVideoZeroShotClassification, +) from .mnist import MNISTZeroShotClassification from .music_avqa import ( MusicAVQACLSAudioVideoZeroShotClassification, @@ -77,6 +81,8 @@ "Imagenet1kZeroShotClassification", "Kinetics400VAZeroShotClassification", "Kinetics400ZeroShotClassification", + "Kinetics600VAZeroShotClassification", + "Kinetics600VZeroShotClassification", "Kinetics700VAZeroShotClassification", "Kinetics700VZeroShotClassification", "MELDAudioVideoZeroShotClassification", diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics600.py b/mteb/tasks/zeroshot_classification/eng/kinetics600.py new file mode 100644 index 0000000000..ddf9742b78 --- /dev/null +++ b/mteb/tasks/zeroshot_classification/eng/kinetics600.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from mteb.abstasks.task_metadata import TaskMetadata +from mteb.abstasks.zeroshot_classification import AbsTaskZeroShotClassification + +CITATION = r""" +@article{carreira2018short, + author = {Carreira, Joao and Noland, Eric and Banki-Horvath, Andras and Hillier, Chloe and Zisserman, Andrew}, + journal = {arXiv preprint arXiv:1808.01340}, + title = {A Short Note about Kinetics-600}, + year = {2018}, +} +""" + + +class Kinetics600VAZeroShotClassification(AbsTaskZeroShotClassification): + metadata = TaskMetadata( + name="Kinetics600VAZeroShot", + description="Kinetics-600 is a large-scale action recognition dataset containing 600 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses both video and audio modalities.", + reference="https://arxiv.org/abs/1808.01340", + dataset={ + "path": "mteb/kinetics-600", + "revision": "a7be893c873e39341a96753e99bfd7b7025aaaf9", + }, + type="VideoZeroshotClassification", + category="va2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2018-08-03", + "2018-08-03", + ), + domains=["Web", "Scene"], + task_subtypes=["Activity recognition"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["video", "audio"], + sample_creation="found", + bibtex_citation=CITATION, + is_beta=True, + ) + + input_column_name = ("video", "audio") + label_column_name: str = "label" + + def get_candidate_labels(self) -> list[str]: + return [ + name for name in self.dataset["test"].features[self.label_column_name].names + ] + + +class Kinetics600VZeroShotClassification(AbsTaskZeroShotClassification): + metadata = TaskMetadata( + name="Kinetics600VZeroShot", + description="Kinetics-600 is a large-scale action recognition dataset containing 600 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses video only.", + reference="https://arxiv.org/abs/1808.01340", + dataset={ + "path": "mteb/kinetics-600", + "revision": "a7be893c873e39341a96753e99bfd7b7025aaaf9", + }, + type="VideoZeroshotClassification", + category="v2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2018-08-03", + "2018-08-03", + ), + domains=["Web", "Scene"], + task_subtypes=["Activity recognition"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["video"], + sample_creation="found", + bibtex_citation=CITATION, + is_beta=True, + ) + + input_column_name = "video" + label_column_name: str = "label" + + def get_candidate_labels(self) -> list[str]: + return [ + name for name in self.dataset["test"].features[self.label_column_name].names + ] \ No newline at end of file From fd657afe64573ce11e8e368b6fdad5eaf062d6a6 Mon Sep 17 00:00:00 2001 From: deep9539 Date: Wed, 29 Apr 2026 19:04:31 -0700 Subject: [PATCH 3/6] Add "a video of " prefix classification label for kinetics600 and kinetics700 --- mteb/tasks/zeroshot_classification/eng/kinetics600.py | 2 +- mteb/tasks/zeroshot_classification/eng/kinetics700.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics600.py b/mteb/tasks/zeroshot_classification/eng/kinetics600.py index ddf9742b78..83704a5575 100644 --- a/mteb/tasks/zeroshot_classification/eng/kinetics600.py +++ b/mteb/tasks/zeroshot_classification/eng/kinetics600.py @@ -85,5 +85,5 @@ class Kinetics600VZeroShotClassification(AbsTaskZeroShotClassification): def get_candidate_labels(self) -> list[str]: return [ - name for name in self.dataset["test"].features[self.label_column_name].names + f"a video of {name}" for name in self.dataset["test"].features[self.label_column_name].names ] \ No newline at end of file diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics700.py b/mteb/tasks/zeroshot_classification/eng/kinetics700.py index 4f8aa4cc7e..903e79497e 100644 --- a/mteb/tasks/zeroshot_classification/eng/kinetics700.py +++ b/mteb/tasks/zeroshot_classification/eng/kinetics700.py @@ -85,5 +85,5 @@ class Kinetics700VZeroShotClassification(AbsTaskZeroShotClassification): def get_candidate_labels(self) -> list[str]: return [ - name for name in self.dataset["test"].features[self.label_column_name].names + "a video of {name}" for name in self.dataset["test"].features[self.label_column_name].names ] From 7d78f9b669f241b201abeef9645b28ef11c4e2c5 Mon Sep 17 00:00:00 2001 From: deep9539 Date: Wed, 29 Apr 2026 20:44:26 -0700 Subject: [PATCH 4/6] Add a video of prefix to remaining task - kinetics600 and kinetics700. --- mteb/tasks/zeroshot_classification/eng/kinetics600.py | 2 +- mteb/tasks/zeroshot_classification/eng/kinetics700.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics600.py b/mteb/tasks/zeroshot_classification/eng/kinetics600.py index 83704a5575..ba0f0444b3 100644 --- a/mteb/tasks/zeroshot_classification/eng/kinetics600.py +++ b/mteb/tasks/zeroshot_classification/eng/kinetics600.py @@ -47,7 +47,7 @@ class Kinetics600VAZeroShotClassification(AbsTaskZeroShotClassification): def get_candidate_labels(self) -> list[str]: return [ - name for name in self.dataset["test"].features[self.label_column_name].names + f"a video of {name}" for name in self.dataset["test"].features[self.label_column_name].names ] diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics700.py b/mteb/tasks/zeroshot_classification/eng/kinetics700.py index 903e79497e..5db42a2e67 100644 --- a/mteb/tasks/zeroshot_classification/eng/kinetics700.py +++ b/mteb/tasks/zeroshot_classification/eng/kinetics700.py @@ -47,7 +47,7 @@ class Kinetics700VAZeroShotClassification(AbsTaskZeroShotClassification): def get_candidate_labels(self) -> list[str]: return [ - name for name in self.dataset["test"].features[self.label_column_name].names + f"a video of {name}" for name in self.dataset["test"].features[self.label_column_name].names ] From 6462b74a03df9726fcd6db49880d86554456b4f0 Mon Sep 17 00:00:00 2001 From: deep9539 Date: Wed, 29 Apr 2026 20:52:11 -0700 Subject: [PATCH 5/6] Ran make lint --- mteb/tasks/zeroshot_classification/eng/kinetics600.py | 8 +++++--- mteb/tasks/zeroshot_classification/eng/kinetics700.py | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics600.py b/mteb/tasks/zeroshot_classification/eng/kinetics600.py index ba0f0444b3..927718364f 100644 --- a/mteb/tasks/zeroshot_classification/eng/kinetics600.py +++ b/mteb/tasks/zeroshot_classification/eng/kinetics600.py @@ -47,7 +47,8 @@ class Kinetics600VAZeroShotClassification(AbsTaskZeroShotClassification): def get_candidate_labels(self) -> list[str]: return [ - f"a video of {name}" for name in self.dataset["test"].features[self.label_column_name].names + f"a video of {name}" + for name in self.dataset["test"].features[self.label_column_name].names ] @@ -85,5 +86,6 @@ class Kinetics600VZeroShotClassification(AbsTaskZeroShotClassification): def get_candidate_labels(self) -> list[str]: return [ - f"a video of {name}" for name in self.dataset["test"].features[self.label_column_name].names - ] \ No newline at end of file + f"a video of {name}" + for name in self.dataset["test"].features[self.label_column_name].names + ] diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics700.py b/mteb/tasks/zeroshot_classification/eng/kinetics700.py index 5db42a2e67..2abd281218 100644 --- a/mteb/tasks/zeroshot_classification/eng/kinetics700.py +++ b/mteb/tasks/zeroshot_classification/eng/kinetics700.py @@ -47,7 +47,8 @@ class Kinetics700VAZeroShotClassification(AbsTaskZeroShotClassification): def get_candidate_labels(self) -> list[str]: return [ - f"a video of {name}" for name in self.dataset["test"].features[self.label_column_name].names + f"a video of {name}" + for name in self.dataset["test"].features[self.label_column_name].names ] @@ -85,5 +86,6 @@ class Kinetics700VZeroShotClassification(AbsTaskZeroShotClassification): def get_candidate_labels(self) -> list[str]: return [ - "a video of {name}" for name in self.dataset["test"].features[self.label_column_name].names + "a video of {name}" + for name in self.dataset["test"].features[self.label_column_name].names ] From 0d22e64b597540807c1dc3ef35facfb8fde1e39f Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Thu, 30 Apr 2026 13:44:27 +0300 Subject: [PATCH 6/6] fix modalities Co-authored-by: Roman Solomatin --- mteb/tasks/zeroshot_classification/eng/kinetics600.py | 4 ++-- mteb/tasks/zeroshot_classification/eng/kinetics700.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics600.py b/mteb/tasks/zeroshot_classification/eng/kinetics600.py index 927718364f..f1b9038a2e 100644 --- a/mteb/tasks/zeroshot_classification/eng/kinetics600.py +++ b/mteb/tasks/zeroshot_classification/eng/kinetics600.py @@ -36,7 +36,7 @@ class Kinetics600VAZeroShotClassification(AbsTaskZeroShotClassification): license="cc-by-4.0", annotations_creators="human-annotated", dialect=[], - modalities=["video", "audio"], + modalities=["video", "audio", "text"], sample_creation="found", bibtex_citation=CITATION, is_beta=True, @@ -75,7 +75,7 @@ class Kinetics600VZeroShotClassification(AbsTaskZeroShotClassification): license="cc-by-4.0", annotations_creators="human-annotated", dialect=[], - modalities=["video"], + modalities=["video", "text"], sample_creation="found", bibtex_citation=CITATION, is_beta=True, diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics700.py b/mteb/tasks/zeroshot_classification/eng/kinetics700.py index 2abd281218..6bfe181122 100644 --- a/mteb/tasks/zeroshot_classification/eng/kinetics700.py +++ b/mteb/tasks/zeroshot_classification/eng/kinetics700.py @@ -36,7 +36,7 @@ class Kinetics700VAZeroShotClassification(AbsTaskZeroShotClassification): license="cc-by-4.0", annotations_creators="human-annotated", dialect=[], - modalities=["video", "audio"], + modalities=["video", "audio", "text"], sample_creation="found", bibtex_citation=CITATION, is_beta=True, @@ -75,7 +75,7 @@ class Kinetics700VZeroShotClassification(AbsTaskZeroShotClassification): license="cc-by-4.0", annotations_creators="human-annotated", dialect=[], - modalities=["video"], + modalities=["video", "text"], sample_creation="found", bibtex_citation=CITATION, is_beta=True, @@ -86,6 +86,6 @@ class Kinetics700VZeroShotClassification(AbsTaskZeroShotClassification): def get_candidate_labels(self) -> list[str]: return [ - "a video of {name}" + f"a video of {name}" for name in self.dataset["test"].features[self.label_column_name].names ]