Skip to content

Commit c337362

Browse files
authored
feat: add COVER and WM-aBench video understanding benchmarks (#1273)
Add two video understanding benchmarks: - COVER: Counterfactual Video Reasoning (ACL Findings 2025) - tests causal understanding in videos via counterfactual question generation - WM-aBench: World Models aBench with 36+ task variants covering spatial reasoning, motion understanding, object interactions, physical properties, temporal reasoning, and visual attributes
1 parent cfc260b commit c337362

File tree

46 files changed

+887
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+887
-0
lines changed

lmms_eval/tasks/cover/cover.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# COVER: Counterfactual Video Reasoning (ACL Findings 2025)
# Paper: https://arxiv.org/abs/2503.10691
# Original: PeterPanonly/COVER (incompatible format, zips only)
# Clean copy: lmms-lab-eval/COVER (pre-parsed QA from JSONL)
#
# Videos: VIDEO.zip must be in lmms-lab-eval/COVER or PeterPanonly/COVER.
# Set COVER_DATA_DIR env var to override video directory.
dataset_path: lmms-lab-eval/COVER
dataset_kwargs:
  token: True
  cache_dir: cover
  video: True
task: cover
test_split: test
output_type: generate_until
doc_to_visual: !function utils.cover_doc_to_visual
doc_to_text: !function utils.cover_doc_to_text
doc_to_target: "answer"
# Greedy decoding; the expected answer is a single option letter, so a
# small max_new_tokens budget is sufficient.
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
process_results: !function utils.cover_process_results
metric_list:
  - metric: cover_accuracy
    aggregation: !function utils.cover_aggregate_results
    higher_is_better: true
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
metadata:
  version: 0.2
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
"""Pre-extract COVER videos and optionally dump the QA JSON.
2+
3+
This is a convenience script for manual setup. During normal lmms-eval
4+
runs, the task's utils.py handles everything automatically (downloads
5+
the HF repo, extracts videos, loads JSONL data in process_docs).
6+
7+
Usage:
8+
# Extract videos only (recommended before first eval run):
9+
python -m lmms_eval.tasks.cover.generate_qa --extract-videos
10+
11+
# Also dump a standalone QA JSON for inspection:
12+
python -m lmms_eval.tasks.cover.generate_qa \
13+
--extract-videos --output $HF_HOME/cover/cover_qa.json
14+
"""
15+
16+
import argparse
17+
import io
18+
import json
19+
import os
20+
import zipfile
21+
from collections import defaultdict
22+
23+
from huggingface_hub import snapshot_download
24+
25+
DATASET_REPO_ID = "PeterPanonly/COVER"
26+
27+
28+
def _extract_video_archive(repo_dir, cache_dir):
    """Extract VIDEO.zip from the downloaded repo snapshot into *cache_dir*.

    Idempotent: does nothing when ``cache_dir/VIDEO`` already exists.

    Returns:
        True on success or when there is nothing to do; False when the
        snapshot does not contain VIDEO.zip.
    """
    video_dir = os.path.join(cache_dir, "VIDEO")
    if os.path.exists(video_dir):
        print(f" VIDEO directory already exists: {video_dir}")
        return True

    video_zip = os.path.join(repo_dir, "VIDEO.zip")
    if not os.path.exists(video_zip):
        print(f" ERROR: VIDEO.zip not found at {video_zip}")
        return False

    print(f" Extracting VIDEO.zip to {cache_dir} ...")
    os.makedirs(cache_dir, exist_ok=True)
    with zipfile.ZipFile(video_zip) as zf:
        zf.extractall(cache_dir)
    print(" Done.")
    return True


def _load_samples(jsonl_zip_path):
    """Parse every per-aspect ``*.jsonl`` file inside jsonl.zip into flat QA dicts.

    Each JSONL entry yields two samples sharing one video: the original QA
    pair and its counterfactual variant. ``idx`` is a running counter over
    all emitted samples.
    """
    samples = []
    idx = 0
    with zipfile.ZipFile(jsonl_zip_path) as zf:
        for name in sorted(zf.namelist()):
            if not name.endswith(".jsonl"):
                continue
            # Aspect name is encoded in the JSONL filename.
            aspect = os.path.basename(name).replace(".jsonl", "")
            with zf.open(name) as f:
                for line in f:
                    entry = json.loads(line)
                    src = entry["src_dataset"]
                    vname = entry["video_name"]
                    text = entry["text"]
                    video_path = f"VIDEO/{src}/{vname}"
                    # The two QA variants differ only in their source key
                    # and qa_type label; build both with one loop.
                    for qa_type, qa_key in (
                        ("original", "original_qa"),
                        ("counterfactual", "counterfactual_qa"),
                    ):
                        qa = text[qa_key]
                        samples.append(
                            {
                                "idx": idx,
                                "video_path": video_path,
                                "src_dataset": src,
                                "video_name": vname,
                                "question": qa["qs"],
                                "choices": qa["choice"],
                                "answer": qa["ans"],
                                "qa_type": qa_type,
                                "aspect": aspect,
                            }
                        )
                        idx += 1
    return samples


def _print_stats(samples):
    """Print sample counts broken down by qa_type and by aspect."""
    by_type = defaultdict(int)
    by_aspect = defaultdict(int)
    for s in samples:
        by_type[s["qa_type"]] += 1
        by_aspect[s["aspect"]] += 1

    print(f" By qa_type: {dict(by_type)}")
    print(f" By aspect ({len(by_aspect)} categories):")
    for aspect in sorted(by_aspect):
        print(f" {aspect}: {by_aspect[aspect]}")


def main():
    """CLI entry point: download the COVER HF repo, then optionally extract
    videos and/or dump a flat QA JSON (see the module docstring for usage)."""
    parser = argparse.ArgumentParser(description="COVER dataset setup")
    parser.add_argument(
        "--output",
        type=str,
        default="",
        help="Output JSON file path (optional). If set, writes a flat " "QA JSON with all samples.",
    )
    parser.add_argument(
        "--cache-dir",
        type=str,
        default="",
        help="Directory for extracted videos. Default: $HF_HOME/cover/",
    )
    parser.add_argument(
        "--extract-videos",
        action="store_true",
        help="Extract VIDEO.zip into the cache directory.",
    )
    args = parser.parse_args()

    hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))
    cache_dir = args.cache_dir or os.path.join(hf_home, "cover")

    # Download (or reuse the locally cached copy of) the HF dataset repo.
    print(f"Downloading {DATASET_REPO_ID} ...")
    repo_dir = snapshot_download(
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        etag_timeout=60,
    )
    print(f" Repo cached at: {repo_dir}")

    # Extract videos; abort entirely if the archive is missing (matches
    # the original behavior of returning before the QA dump).
    if args.extract_videos and not _extract_video_archive(repo_dir, cache_dir):
        return

    # Optionally dump QA JSON for inspection.
    if args.output:
        jsonl_zip_path = os.path.join(repo_dir, "jsonl.zip")
        if not os.path.exists(jsonl_zip_path):
            print(f" ERROR: jsonl.zip not found at {jsonl_zip_path}")
            return

        samples = _load_samples(jsonl_zip_path)
        os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
        with open(args.output, "w") as f:
            json.dump(samples, f, indent=2)
        print(f"\n {len(samples)} QA samples written to {args.output}")
        _print_stats(samples)
    elif not args.extract_videos:
        print("Nothing to do. Use --extract-videos and/or --output.")


if __name__ == "__main__":
    main()

lmms_eval/tasks/cover/utils.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
"""COVER benchmark -- counterfactual video reasoning (ACL Findings 2025).
2+
3+
Dataset: lmms-lab-eval/COVER (clean QA data, parsed from PeterPanonly/COVER)
4+
Paper: https://arxiv.org/abs/2503.10691
5+
6+
The clean dataset has pre-parsed QA pairs with columns:
7+
src_dataset, video_name, question, choices (JSON), answer, qa_type, aspect.
8+
9+
Videos: VIDEO.zip must be extracted to $HF_HOME/cover/VIDEO/ or set
10+
COVER_DATA_DIR to the video root directory.
11+
"""
12+
13+
import json
14+
import os
15+
import zipfile
16+
from collections import defaultdict
17+
18+
from huggingface_hub import snapshot_download
19+
from loguru import logger as eval_logger
20+
21+
# ---------------------------------------------------------------------------
22+
# Video directory resolution
23+
# ---------------------------------------------------------------------------
24+
_hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface/"))
25+
_CACHE_DIR = os.path.join(_hf_home, "cover")
26+
27+
28+
def _get_cache_dir():
29+
explicit = os.getenv("COVER_DATA_DIR", "").strip()
30+
if explicit:
31+
return os.path.expanduser(explicit)
32+
return _CACHE_DIR
33+
34+
35+
_videos_ready = False
36+
37+
38+
def _ensure_videos():
    """Download VIDEO.zip from PeterPanonly/COVER and extract if needed.

    Idempotent: the module-level ``_videos_ready`` flag short-circuits
    repeated calls, and an already-extracted VIDEO/ directory is left
    untouched. If the snapshot lacks VIDEO.zip, a warning is emitted once
    and the flag is still set (best effort).
    """
    global _videos_ready
    if _videos_ready:
        return

    root = _get_cache_dir()
    extracted_dir = os.path.join(root, "VIDEO")
    if os.path.exists(extracted_dir):
        _videos_ready = True
        return

    eval_logger.info("COVER: downloading VIDEO.zip from PeterPanonly/COVER ...")
    repo_dir = snapshot_download(
        repo_id="PeterPanonly/COVER",
        repo_type="dataset",
        etag_timeout=60,
    )

    archive = os.path.join(repo_dir, "VIDEO.zip")
    if not os.path.exists(archive):
        # Warn once; cover_doc_to_visual will report individual missing files.
        eval_logger.warning(f"COVER: VIDEO.zip not found at {archive}. " f"Videos must be placed manually in {extracted_dir}.")
    else:
        eval_logger.info(f"COVER: extracting VIDEO.zip to {root} ...")
        os.makedirs(root, exist_ok=True)
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(root)
        eval_logger.info("COVER: extraction complete.")
    _videos_ready = True
67+
68+
69+
# ---------------------------------------------------------------------------
70+
# doc_to_visual / doc_to_text
71+
# ---------------------------------------------------------------------------
72+
def cover_doc_to_visual(doc):
    """Resolve the on-disk video file for a COVER sample.

    Returns a single-element list containing the video path. When the
    exact filename is absent, common extension/case variants are probed;
    as a last resort the unresolved path is returned with a warning so
    the caller surfaces a clear error.
    """
    _ensure_videos()
    root = _get_cache_dir()
    video_path = os.path.join(root, "VIDEO", doc["src_dataset"], doc["video_name"])

    if os.path.exists(video_path):
        return [video_path]

    # The archive is inconsistent about extensions/casing; probe variants.
    stem, current_ext = os.path.splitext(video_path)
    for candidate_ext in (".mp4", ".MP4", ".avi", ".AVI", ".mkv"):
        if candidate_ext == current_ext:
            continue
        candidate = stem + candidate_ext
        if os.path.exists(candidate):
            return [candidate]

    eval_logger.warning(f"COVER video not found: {video_path}")
    return [video_path]
90+
91+
92+
def cover_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Format a COVER sample as a multiple-choice prompt.

    ``doc["choices"]`` may arrive as a dict or as a JSON-encoded string;
    both forms are accepted. Options are listed in sorted key order, and
    the optional ``post_prompt`` from ``lmms_eval_specific_kwargs["default"]``
    is appended verbatim.
    """
    options = doc["choices"]
    if isinstance(options, str):
        options = json.loads(options)

    option_lines = [f"{letter}. {options[letter]}" for letter in sorted(options)]

    suffix = ""
    if lmms_eval_specific_kwargs:
        suffix = lmms_eval_specific_kwargs.get("default", {}).get("post_prompt", "")

    return doc["question"] + "\n" + "\n".join(option_lines) + suffix
106+
107+
108+
# ---------------------------------------------------------------------------
109+
# process_results / aggregate_results
110+
# ---------------------------------------------------------------------------
111+
def _extract_answer(response):
112+
import re
113+
114+
response = response.strip()
115+
if not response:
116+
return ""
117+
# Direct single letter
118+
if len(response) == 1 and response.upper() in "ABCDEF":
119+
return response.upper()
120+
# Pattern: (A), A., A:
121+
m = re.match(r"[\(\s]*([A-Fa-f])[\)\.\:\s]", response)
122+
if m:
123+
return m.group(1).upper()
124+
# First letter
125+
m = re.match(r"^([A-Fa-f])\b", response)
126+
if m:
127+
return m.group(1).upper()
128+
# Search in short response
129+
if len(response) < 50:
130+
m = re.search(r"\b([A-Da-d])\b", response)
131+
if m:
132+
return m.group(1).upper()
133+
return response[:1].upper()
134+
135+
136+
def cover_process_results(doc, results):
    """Score one COVER prediction.

    Extracts an option letter from the model's first response, compares it
    against the gold answer (case-insensitive), and returns the per-sample
    record consumed by ``cover_aggregate_results``.
    """
    predicted = _extract_answer(results[0])
    gold = doc["answer"].strip().upper()

    record = {
        "pred_answer": predicted,
        "answer": gold,
        "qa_type": doc.get("qa_type", ""),
        "aspect": doc.get("aspect", ""),
        "score": 1.0 if predicted == gold else 0.0,
    }
    return {"cover_accuracy": record}
149+
150+
151+
def cover_aggregate_results(results):
    """Aggregate per-sample records into an overall accuracy percentage.

    Logs per-qa_type and per-aspect breakdowns as a side effect, then
    returns overall accuracy in percent (0.0 for empty input).
    """

    def _pct(correct, total):
        # Guard against empty subsets.
        return 100.0 * correct / total if total > 0 else 0.0

    # Breakdown by question type (logged only).
    for qt in ("original", "counterfactual"):
        subset = [r for r in results if r["qa_type"] == qt]
        if subset:
            acc = _pct(sum(r["score"] for r in subset), len(subset))
            eval_logger.info(f"COVER {qt}: {acc:.1f}% ({len(subset)} samples)")

    # Breakdown by aspect (logged only).
    aspect_scores = defaultdict(lambda: {"correct": 0, "total": 0})
    for r in results:
        bucket = aspect_scores[r["aspect"]]
        bucket["total"] += 1
        bucket["correct"] += r["score"]
    for aspect in sorted(aspect_scores):
        s = aspect_scores[aspect]
        acc = _pct(s["correct"], s["total"])
        eval_logger.info(f"COVER {aspect}: {acc:.1f}% ({s['total']} samples)")

    total = len(results)
    overall = _pct(sum(r["score"] for r in results), total)
    eval_logger.info(f"COVER Overall: {overall:.1f}% ({total} samples)")
    return overall
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# WM-ABench (maitrix-org/WM-ABench): world-model benchmark with task
# variants covering spatial reasoning, motion understanding, object
# interactions, physical properties, temporal reasoning, and visual
# attributes (per the introducing commit).
dataset_path: maitrix-org/WM-ABench
dataset_kwargs:
  token: True
test_split: test
output_type: generate_until
doc_to_visual: !function utils.wm_abench_doc_to_visual
doc_to_text: !function utils.wm_abench_doc_to_text
doc_to_target: !function utils.wm_abench_doc_to_target

process_results: !function utils.wm_abench_process_results

# Three aggregations over the same per-sample records: raw accuracy,
# accuracy excluding blocked/refused responses, and the blocked rate
# itself (lower is better).
metric_list:
  - metric: wm_abench_acc
    aggregation: !function utils.wm_abench_aggregate_results
    higher_is_better: true
  - metric: wm_abench_acc_clean
    aggregation: !function utils.wm_abench_aggregate_results_clean
    higher_is_better: true
  - metric: wm_abench_blocked_rate
    aggregation: !function utils.wm_abench_aggregate_blocked_rate
    higher_is_better: false

# Greedy decoding; short answers, so a small token budget suffices.
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false

lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""

0 commit comments

Comments
 (0)