Skip to content

Commit ed09b9a

Browse files
authored
feat: add VideoNet benchmark (#1308)
* support VideoNet
* prevent unnecessary downloads: a large portion of the HF dataset is not required to run evals on our benchmark, so the code was updated to download only the files needed for evals.
1 parent 55c7eba commit ed09b9a

11 files changed

Lines changed: 322 additions & 0 deletions
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
include: _default.yaml
2+
doc_to_visual: !function binary_utils.videonet_binary_doc_to_visual
3+
doc_to_text: !function binary_utils.videonet_binary_doc_to_text
4+
doc_to_messages: !function binary_utils.videonet_binary_doc_to_messages
5+
generation_kwargs:
6+
temperature: 0
7+
do_sample: False
8+
max_new_tokens: 8192
9+
process_results: !function binary_utils.videonet_binary_process_results
10+
metric_list:
11+
- metric: binary_acc
12+
aggregation: !function binary_utils.videonet_binary_aggregate_results
13+
higher_is_better: True
14+
lmms_eval_specific_kwargs:
15+
default:
16+
pre_prompt: ""
17+
post_prompt: ""
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
dataset_path: "raivn/VideoNet"
2+
dataset_name: "lmms-eval"
3+
output_type: generate_until
4+
doc_to_target: "answer"
5+
test_split: test

lmms_eval/tasks/videonet/_mcq.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
include: _default.yaml
2+
doc_to_visual: !function mcq_utils.videonet_mcq_doc_to_visual
3+
doc_to_text: !function mcq_utils.videonet_mcq_doc_to_text
4+
doc_to_messages: !function mcq_utils.videonet_mcq_doc_to_messages
5+
generation_kwargs:
6+
temperature: 0
7+
do_sample: False
8+
max_new_tokens: 8192
9+
process_results: !function mcq_utils.videonet_mcq_process_results
10+
metric_list:
11+
- metric: mcq_acc
12+
aggregation: !function mcq_utils.videonet_mcq_aggregate_results
13+
higher_is_better: True
14+
lmms_eval_specific_kwargs:
15+
default:
16+
pre_prompt: ""
17+
post_prompt: ""
18+
metadata:
19+
version: 0.0
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
from functools import cache
2+
from pathlib import Path
3+
4+
from huggingface_hub import snapshot_download
5+
6+
7+
@cache
8+
def _get_videos_dir() -> Path:
9+
dataset_root = snapshot_download(repo_id="raivn/VideoNet", repo_type="dataset", allow_patterns=["videos/*.mp4"])
10+
return Path(dataset_root) / "videos"
11+
12+
13+
def _get_video_path(video_fname) -> str:
14+
return str(_get_videos_dir() / video_fname)
15+
16+
17+
def videonet_binary_doc_to_visual(doc):
18+
question = doc["question"]
19+
video_fnames = [entry["video"] for entry in question if entry["type"] == "video"]
20+
video_paths = [_get_video_path(video_fname) for video_fname in video_fnames]
21+
return video_paths
22+
23+
24+
def videonet_binary_doc_to_text(doc, lmms_eval_specific_kwargs=None):
25+
question = doc["question"]
26+
texts = [entry["text"] for entry in question if entry["type"] == "text"]
27+
text = "\n".join(texts)
28+
if lmms_eval_specific_kwargs:
29+
pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
30+
post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
31+
text = pre_prompt + text + post_prompt
32+
return text
33+
34+
35+
def _process_video_entry(entry: dict) -> dict:
36+
video_fname = entry["video"]
37+
video_path = _get_video_path(video_fname)
38+
return {"type": "video", "url": video_path}
39+
40+
41+
def _process_text_entry(entry: dict, lmms_eval_specific_kwargs=None) -> dict:
42+
text = entry["text"]
43+
if lmms_eval_specific_kwargs:
44+
pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
45+
post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
46+
text = pre_prompt + text + post_prompt
47+
return {"type": "text", "text": text}
48+
49+
50+
def _question_to_content(question: list[dict], lmms_eval_specific_kwargs=None) -> list[dict]:
51+
content = []
52+
if lmms_eval_specific_kwargs and (pre_prompt := lmms_eval_specific_kwargs.get("pre_prompt")):
53+
content.append({"type": "text", "text": pre_prompt})
54+
for entry in question:
55+
if entry["type"] == "text":
56+
content.append(entry)
57+
elif entry["type"] == "video":
58+
content.append(_process_video_entry(entry))
59+
else:
60+
raise Exception("Your copy of the benchmark is corrupted. Please re-download the `benchmarks/` folder from HuggingFace.")
61+
if lmms_eval_specific_kwargs and (post_prompt := lmms_eval_specific_kwargs.get("post_prompt")):
62+
content.append({"type": "text", "text": post_prompt})
63+
return content
64+
65+
66+
def videonet_binary_doc_to_messages(doc, lmms_eval_specific_kwargs=None):
67+
question = doc["question"]
68+
content = _question_to_content(question, lmms_eval_specific_kwargs)
69+
return [{"role": "user", "content": content}]
70+
71+
72+
def _extract_binary_prediction(text: str) -> str:
73+
text = text.splitlines()[-1].strip().lower()
74+
pred = text.replace("*", "").replace("#", "").replace(",", "").replace(".", "").replace(":", "")
75+
76+
if pred == "yes" or pred == "no":
77+
return pred
78+
parts = pred.split(" ")
79+
if len(parts) > 1:
80+
first, last = parts[0], parts[-1]
81+
if first == "yes" or first == "no":
82+
return first
83+
elif last == "yes" or last == "no":
84+
return last
85+
if "boxed{yes}" in pred:
86+
return "yes"
87+
if "boxed{no}" in pred:
88+
return "no"
89+
if "the answer is yes" in pred:
90+
return "yes"
91+
if "the answer is no" in pred:
92+
return "no"
93+
if 'is "yes"' in pred:
94+
return "yes"
95+
if "is 'yes'" in pred:
96+
return "yes"
97+
if 'is "no"' in pred:
98+
return "no"
99+
if "is 'no'" in pred:
100+
return "no"
101+
if "does not show" in pred or "does not depict" in pred:
102+
return "no"
103+
return pred
104+
105+
106+
def videonet_binary_process_results(doc, results):
107+
model_output = results[0] if results else ""
108+
ground_truth = doc["answer"]
109+
110+
pred = _extract_binary_prediction(model_output)
111+
correct = 1.0 if pred == ground_truth else 0.0
112+
return {
113+
"binary_acc": {
114+
"question_key": doc["key"],
115+
"correct": correct,
116+
"ground_truth": ground_truth,
117+
"model_prediction": pred,
118+
"model_output": model_output,
119+
}
120+
}
121+
122+
123+
def videonet_binary_aggregate_results(results):
124+
if not results:
125+
return 0.0
126+
num_correct = sum(r["correct"] for r in results)
127+
total = len(results)
128+
return num_correct / total
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
from functools import cache
2+
from pathlib import Path
3+
4+
from huggingface_hub import snapshot_download
5+
6+
7+
@cache
8+
def _get_videos_dir() -> Path:
9+
dataset_root = snapshot_download(repo_id="raivn/VideoNet", repo_type="dataset", allow_patterns=["videos/*.mp4"])
10+
return Path(dataset_root) / "videos"
11+
12+
13+
def _get_video_path(video_fname) -> str:
14+
return str(_get_videos_dir() / video_fname)
15+
16+
17+
def videonet_mcq_doc_to_visual(doc):
18+
question = doc["question"]
19+
video_fname = [entry["video"] for entry in question if entry["type"] == "video"][0]
20+
return [_get_video_path(video_fname)]
21+
22+
23+
def videonet_mcq_doc_to_text(doc, lmms_eval_specific_kwargs=None):
24+
question = doc["question"]
25+
text = [entry["text"] for entry in question if entry["type"] == "text"][0]
26+
if lmms_eval_specific_kwargs:
27+
pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
28+
post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
29+
text = pre_prompt + text + post_prompt
30+
return text
31+
32+
33+
def _process_video_entry(entry: dict) -> dict:
34+
video_fname = entry["video"]
35+
video_path = _get_video_path(video_fname)
36+
return {"type": "video", "url": video_path}
37+
38+
39+
def _process_text_entry(entry: dict, lmms_eval_specific_kwargs=None) -> dict:
40+
text = entry["text"]
41+
if lmms_eval_specific_kwargs:
42+
pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
43+
post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
44+
text = pre_prompt + text + post_prompt
45+
return {"type": "text", "text": text}
46+
47+
48+
def _question_to_content(question: list[dict], lmms_eval_specific_kwargs=None) -> list[dict]:
49+
content = []
50+
for entry in question:
51+
if entry["type"] == "text":
52+
content.append(_process_text_entry(entry, lmms_eval_specific_kwargs))
53+
elif entry["type"] == "video":
54+
content.append(_process_video_entry(entry))
55+
else:
56+
raise Exception("Your copy of the benchmark is corrupted. Please re-download the `benchmarks/` folder from HuggingFace.")
57+
return content
58+
59+
60+
def videonet_mcq_doc_to_messages(doc, lmms_eval_specific_kwargs=None):
61+
question = doc["question"]
62+
content = _question_to_content(question, lmms_eval_specific_kwargs)
63+
return [{"role": "user", "content": content}]
64+
65+
66+
def _extract_prediction_mcq(text: str) -> str:
67+
text = text.splitlines()[-1].strip().upper()
68+
pred = text.replace("*", "").replace("#", "").replace(",", "").replace(".", "").replace(":", "")
69+
if pred in {"A", "B", "C", "D"}:
70+
return pred
71+
if "BOXED{A}" in pred:
72+
return "A"
73+
if "BOXED{B}" in pred:
74+
return "B"
75+
if "BOXED{C}" in pred:
76+
return "C"
77+
if "BOXED{D}" in pred:
78+
return "D"
79+
return pred
80+
81+
82+
def videonet_mcq_process_results(doc, results):
83+
model_output = results[0] if results else ""
84+
ground_truth = doc["answer"]
85+
86+
pred = _extract_prediction_mcq(model_output)
87+
correct = 1.0 if pred == ground_truth else 0.0
88+
return {
89+
"mcq_acc": {
90+
"question_key": doc["key"],
91+
"correct": correct,
92+
"ground_truth": ground_truth,
93+
"model_prediction": pred,
94+
"model_output": model_output,
95+
}
96+
}
97+
98+
99+
def videonet_mcq_aggregate_results(results):
100+
if not results:
101+
return 0.0
102+
num_correct = sum(r["correct"] for r in results)
103+
total = len(results)
104+
return num_correct / total
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
include: _binary.yaml
2+
task: "videonet_binary_0shot"
3+
dataset_kwargs:
4+
cache_dir: videonet
5+
data_files:
6+
test: "benchmarks/binary_0shot.jsonl"
7+
metadata:
8+
num_fewshot: 0
9+
version: 0.0
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
include: _binary.yaml
2+
task: "videonet_binary_1shot"
3+
dataset_kwargs:
4+
cache_dir: videonet
5+
data_files:
6+
test: "benchmarks/binary_1shot.jsonl"
7+
metadata:
8+
num_fewshot: 1
9+
version: 0.0
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
include: _binary.yaml
2+
task: "videonet_binary_2shot"
3+
dataset_kwargs:
4+
cache_dir: videonet
5+
data_files:
6+
test: "benchmarks/binary_2shot.jsonl"
7+
metadata:
8+
num_fewshot: 2
9+
version: 0.0
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
include: _binary.yaml
2+
task: "videonet_binary_3shot"
3+
dataset_kwargs:
4+
cache_dir: videonet
5+
data_files:
6+
test: "benchmarks/binary_3shot.jsonl"
7+
metadata:
8+
num_fewshot: 3
9+
version: 0.0
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
include: _mcq.yaml
2+
task: "videonet_mcq_test"
3+
dataset_kwargs:
4+
cache_dir: videonet
5+
data_files:
6+
test: "benchmarks/mcq_test.jsonl"

0 commit comments

Comments
 (0)