Skip to content

Commit 22d1c00

Browse files
deep9539 and Samoed authored
Add Zeroshot classification for Worldsense classification dataset (#4538)
* Add Zeroshot classification for Worldsense classification dataset * format --------- Co-authored-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
1 parent b54dd54 commit 22d1c00

2 files changed

Lines changed: 85 additions & 0 deletions

File tree

mteb/tasks/zeroshot_classification/eng/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@
3939
UCF101VideoZeroShotClassification,
4040
UCF101ZeroShotClassification,
4141
)
42+
from .worldsense_classification import (
43+
WorldSenseAudioVideoZeroShotClassification,
44+
WorldSenseVideoZeroShotClassification,
45+
)
4246

4347
__all__ = [
4448
"CLEVR",
@@ -76,4 +80,6 @@
7680
"UCF101VideoAudioZeroShotClassification",
7781
"UCF101VideoZeroShotClassification",
7882
"UCF101ZeroShotClassification",
83+
"WorldSenseAudioVideoZeroShotClassification",
84+
"WorldSenseVideoZeroShotClassification",
7985
]
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
from __future__ import annotations
2+
3+
from mteb.abstasks.task_metadata import TaskMetadata
4+
from mteb.abstasks.zeroshot_classification import AbsTaskZeroShotClassification
5+
6+
# BibTeX citation for the WorldSense paper (arXiv:2502.04326).
# Fixed entry type: the record carries a `journal` field (arXiv preprint) and
# no `booktitle`, so the correct BibTeX entry type is @article, not
# @inproceedings — @inproceedings requires a booktitle and has no journal.
CITATION = r"""
@article{hong2025worldsense,
  author = {Hong, Jack and Yan, Shilin and Cai, Jiayin and Jiang, Xiaolong and Hu, Yao and Xie, Weidi},
  journal = {arXiv preprint arXiv:2502.04326},
  title = {Worldsense: Evaluating real-world omnimodal understanding for multimodal llms},
  year = {2025},
}
"""
14+
15+
16+
class WorldSenseAudioVideoZeroShotClassification(AbsTaskZeroShotClassification):
    """Zero-shot domain classification on WorldSense clips, video + audio variant.

    Predicts a clip's domain label from its video and audio streams
    (category ``va2t``). Candidate label strings come from the dataset's
    ClassLabel feature on the ``domain`` column.
    """

    metadata = TaskMetadata(
        name="WorldSenseAudioVideoZeroShot",
        description="WorldSense is a multimodal video understanding benchmark encompassing visual, audio, and text inputs. Videos are categorized into 8 primary domains across 67 fine-grained subcategories. This zero-shot classification task predicts the domain category of a video clip",
        reference="https://arxiv.org/abs/2502.04326",
        dataset={
            "path": "mteb/WorldSense_1min",
            "revision": "10c7ce0eb32d620f1f685bfedde2724066068a1c",
        },
        type="VideoZeroshotClassification",
        category="va2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-02-06", "2026-03-01"),
        domains=["Scene", "AudioScene", "Music", "Entertainment"],
        task_subtypes=["Scene recognition"],
        license="not specified",
        annotations_creators="expert-annotated",
        dialect=[],
        modalities=["video", "audio"],
        sample_creation="found",
        bibtex_citation=CITATION,
        is_beta=True,
    )

    # Two input streams are consumed per sample; labels are read from "domain".
    input_column_name = ("video", "audio")
    label_column_name: str = "domain"

    def get_candidate_labels(self) -> list[str]:
        """Return the human-readable domain names of the test split's label feature."""
        label_feature = self.dataset["test"].features[self.label_column_name]
        return label_feature.names
47+
48+
49+
class WorldSenseVideoZeroShotClassification(AbsTaskZeroShotClassification):
    """Zero-shot domain classification on WorldSense clips, video-only variant.

    Same benchmark and labels as the audio+video task, but restricted to the
    visual stream (category ``v2t``). Candidate label strings come from the
    dataset's ClassLabel feature on the ``domain`` column.
    """

    metadata = TaskMetadata(
        name="WorldSenseVideoZeroShot",
        description="WorldSense is a multimodal video understanding benchmark encompassing visual, audio, and text inputs. Videos are categorized into 8 primary domains across 67 fine-grained subcategories. This zero-shot classification task predicts the domain category of a video clip",
        reference="https://arxiv.org/abs/2502.04326",
        dataset={
            "path": "mteb/WorldSense_1min",
            "revision": "10c7ce0eb32d620f1f685bfedde2724066068a1c",
        },
        type="VideoZeroshotClassification",
        category="v2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2025-02-06", "2026-03-01"),
        domains=["Scene", "AudioScene", "Music", "Entertainment"],
        task_subtypes=["Scene recognition"],
        license="not specified",
        annotations_creators="expert-annotated",
        dialect=[],
        modalities=["video"],
        sample_creation="found",
        bibtex_citation=CITATION,
        is_beta=True,
    )

    # Single input stream per sample; labels are read from "domain".
    input_column_name = "video"
    label_column_name: str = "domain"

    def get_candidate_labels(self) -> list[str]:
        """Return the human-readable domain names of the test split's label feature."""
        label_feature = self.dataset["test"].features[self.label_column_name]
        return label_feature.names

0 commit comments

Comments
 (0)