diff --git a/mteb/tasks/zeroshot_classification/eng/__init__.py b/mteb/tasks/zeroshot_classification/eng/__init__.py index 31d2f88591..94d2105bc2 100644 --- a/mteb/tasks/zeroshot_classification/eng/__init__.py +++ b/mteb/tasks/zeroshot_classification/eng/__init__.py @@ -20,6 +20,14 @@ Kinetics400VAZeroShotClassification, Kinetics400ZeroShotClassification, ) +from .kinetics600 import ( + Kinetics600VAZeroShotClassification, + Kinetics600VZeroShotClassification, +) +from .kinetics700 import ( + Kinetics700VAZeroShotClassification, + Kinetics700VZeroShotClassification, +) from .meld_classification import ( MELDAudioVideoZeroShotClassification, MELDVideoZeroShotClassification, @@ -73,6 +81,10 @@ "Imagenet1kZeroShotClassification", "Kinetics400VAZeroShotClassification", "Kinetics400ZeroShotClassification", + "Kinetics600VAZeroShotClassification", + "Kinetics600VZeroShotClassification", + "Kinetics700VAZeroShotClassification", + "Kinetics700VZeroShotClassification", "MELDAudioVideoZeroShotClassification", "MELDVideoZeroShotClassification", "MNISTZeroShotClassification", diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics600.py b/mteb/tasks/zeroshot_classification/eng/kinetics600.py new file mode 100644 index 0000000000..f1b9038a2e --- /dev/null +++ b/mteb/tasks/zeroshot_classification/eng/kinetics600.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from mteb.abstasks.task_metadata import TaskMetadata +from mteb.abstasks.zeroshot_classification import AbsTaskZeroShotClassification + +CITATION = r""" +@article{carreira2018short, + author = {Carreira, Joao and Noland, Eric and Banki-Horvath, Andras and Hillier, Chloe and Zisserman, Andrew}, + journal = {arXiv preprint arXiv:1808.01340}, + title = {A Short Note about Kinetics-600}, + year = {2018}, +} +""" + + +class Kinetics600VAZeroShotClassification(AbsTaskZeroShotClassification): + metadata = TaskMetadata( + name="Kinetics600VAZeroShot", + description="Kinetics-600 is a large-scale action recognition dataset containing 600 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses both video and audio modalities.", + reference="https://arxiv.org/abs/1808.01340", + dataset={ + "path": "mteb/kinetics-600", + "revision": "a7be893c873e39341a96753e99bfd7b7025aaaf9", + }, + type="VideoZeroshotClassification", + category="va2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2018-08-03", + "2018-08-03", + ), + domains=["Web", "Scene"], + task_subtypes=["Activity recognition"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["video", "audio", "text"], + sample_creation="found", + bibtex_citation=CITATION, + is_beta=True, + ) + + input_column_name = ("video", "audio") + label_column_name: str = "label" + + def get_candidate_labels(self) -> list[str]: + return [ + f"a video of {name}" + for name in self.dataset["test"].features[self.label_column_name].names + ] + + +class Kinetics600VZeroShotClassification(AbsTaskZeroShotClassification): + metadata = TaskMetadata( + name="Kinetics600VZeroShot", + description="Kinetics-600 is a large-scale action recognition dataset containing 600 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses video only.", + reference="https://arxiv.org/abs/1808.01340", + dataset={ + "path": "mteb/kinetics-600", + "revision": "a7be893c873e39341a96753e99bfd7b7025aaaf9", + }, + type="VideoZeroshotClassification", + category="v2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2018-08-03", + "2018-08-03", + ), + domains=["Web", "Scene"], + task_subtypes=["Activity recognition"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["video", "text"], + sample_creation="found", + bibtex_citation=CITATION, + is_beta=True, + ) + + input_column_name = "video" + label_column_name: str = "label" + + def get_candidate_labels(self) -> list[str]: + return [ + f"a video of {name}" + for name in self.dataset["test"].features[self.label_column_name].names + ] diff --git a/mteb/tasks/zeroshot_classification/eng/kinetics700.py b/mteb/tasks/zeroshot_classification/eng/kinetics700.py new file mode 100644 index 0000000000..6bfe181122 --- /dev/null +++ b/mteb/tasks/zeroshot_classification/eng/kinetics700.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from mteb.abstasks.task_metadata import TaskMetadata +from mteb.abstasks.zeroshot_classification import AbsTaskZeroShotClassification + +CITATION = r""" +@article{smaira2020short, + author = {Smaira, Lucas and Carreira, Joao and Noland, Eric and Clancy, Ellen and Wu, Amy and Zisserman, Andrew}, + journal = {arXiv preprint arXiv:2010.10864}, + title = {A Short Note on the Kinetics-700-2020 Human Action Dataset}, + year = {2020}, +} +""" + + +class Kinetics700VAZeroShotClassification(AbsTaskZeroShotClassification): + metadata = TaskMetadata( + name="Kinetics700VAZeroShot", + description="Kinetics-700-2020 is a large-scale action recognition dataset containing 700 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses both video and audio modalities.", + reference="https://arxiv.org/abs/2010.10864", + dataset={ + "path": "mteb/kinetics-700-2020", + "revision": "e9f50aa09759e014b8afc16cc27ec536d4c0747f", + }, + type="VideoZeroshotClassification", + category="va2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2020-10-21", + "2020-10-21", + ), + domains=["Web", "Scene"], + task_subtypes=["Activity recognition"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["video", "audio", "text"], + sample_creation="found", + bibtex_citation=CITATION, + is_beta=True, + ) + + input_column_name = ("video", "audio") + label_column_name: str = "label" + + def get_candidate_labels(self) -> list[str]: + return [ + f"a video of {name}" + for name in self.dataset["test"].features[self.label_column_name].names + ] + + +class Kinetics700VZeroShotClassification(AbsTaskZeroShotClassification): + metadata = TaskMetadata( + name="Kinetics700VZeroShot", + description="Kinetics-700-2020 is a large-scale action recognition dataset containing 700 human action classes from YouTube videos. Each clip is approximately 10 seconds long. This variant uses video only.", + reference="https://arxiv.org/abs/2010.10864", + dataset={ + "path": "mteb/kinetics-700-2020", + "revision": "e9f50aa09759e014b8afc16cc27ec536d4c0747f", + }, + type="VideoZeroshotClassification", + category="v2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2020-10-21", + "2020-10-21", + ), + domains=["Web", "Scene"], + task_subtypes=["Activity recognition"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["video", "text"], + sample_creation="found", + bibtex_citation=CITATION, + is_beta=True, + ) + + input_column_name = "video" + label_column_name: str = "label" + + def get_candidate_labels(self) -> list[str]: + return [ + f"a video of {name}" + for name in self.dataset["test"].features[self.label_column_name].names + ]