Skip to content

Commit c337362

Browse files
authored
feat: add COVER and WM-aBench video understanding benchmarks (#1273)
Add two video understanding benchmarks: - COVER: Counterfactual Video Reasoning (ACL Findings 2025) - tests causal understanding in videos via counterfactual question generation - WM-aBench: World Models aBench with 36+ task variants covering spatial reasoning, motion understanding, object interactions, physical properties, temporal reasoning, and visual attributes
1 parent cfc260b commit c337362

File tree

46 files changed

+887
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+887
-0
lines changed

lmms_eval/tasks/cover/cover.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# COVER: Counterfactual Video Reasoning (ACL Findings 2025)
# Paper: https://arxiv.org/abs/2503.10691
# Original: PeterPanonly/COVER (incompatible format, zips only)
# Clean copy: lmms-lab-eval/COVER (pre-parsed QA from JSONL)
#
# Videos: VIDEO.zip must be in lmms-lab-eval/COVER or PeterPanonly/COVER.
# Set COVER_DATA_DIR env var to override video directory.
dataset_path: lmms-lab-eval/COVER
dataset_kwargs:
  token: True
  cache_dir: cover
  video: True
task: cover
test_split: test
output_type: generate_until
doc_to_visual: !function utils.cover_doc_to_visual
doc_to_text: !function utils.cover_doc_to_text
doc_to_target: "answer"
# Greedy decoding; the expected answer is a single option letter, so a
# small max_new_tokens budget is sufficient.
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
process_results: !function utils.cover_process_results
metric_list:
  - metric: cover_accuracy
    aggregation: !function utils.cover_aggregate_results
    higher_is_better: true
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
metadata:
  version: 0.2
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
"""Pre-extract COVER videos and optionally dump the QA JSON.
2+
3+
This is a convenience script for manual setup. During normal lmms-eval
4+
runs, the task's utils.py handles everything automatically (downloads
5+
the HF repo, extracts videos, loads JSONL data in process_docs).
6+
7+
Usage:
8+
# Extract videos only (recommended before first eval run):
9+
python -m lmms_eval.tasks.cover.generate_qa --extract-videos
10+
11+
# Also dump a standalone QA JSON for inspection:
12+
python -m lmms_eval.tasks.cover.generate_qa \
13+
--extract-videos --output $HF_HOME/cover/cover_qa.json
14+
"""
15+
16+
import argparse
17+
import io
18+
import json
19+
import os
20+
import zipfile
21+
from collections import defaultdict
22+
23+
from huggingface_hub import snapshot_download
24+
25+
DATASET_REPO_ID = "PeterPanonly/COVER"
26+
27+
28+
def _extract_video_archive(repo_dir, cache_dir):
    """Extract VIDEO.zip from the downloaded repo snapshot into *cache_dir*.

    Idempotent: does nothing when ``cache_dir/VIDEO`` already exists.

    Returns:
        True on success or when there is nothing to do; False when the
        snapshot does not contain VIDEO.zip.
    """
    video_dir = os.path.join(cache_dir, "VIDEO")
    if os.path.exists(video_dir):
        print(f" VIDEO directory already exists: {video_dir}")
        return True

    video_zip = os.path.join(repo_dir, "VIDEO.zip")
    if not os.path.exists(video_zip):
        print(f" ERROR: VIDEO.zip not found at {video_zip}")
        return False

    print(f" Extracting VIDEO.zip to {cache_dir} ...")
    os.makedirs(cache_dir, exist_ok=True)
    with zipfile.ZipFile(video_zip) as zf:
        zf.extractall(cache_dir)
    print(" Done.")
    return True


def _load_samples(jsonl_zip_path):
    """Parse every per-aspect ``*.jsonl`` file inside jsonl.zip into flat QA dicts.

    Each JSONL entry yields two samples sharing one video: the original QA
    pair and its counterfactual variant. ``idx`` is a running counter over
    all emitted samples.
    """
    samples = []
    idx = 0
    with zipfile.ZipFile(jsonl_zip_path) as zf:
        for name in sorted(zf.namelist()):
            if not name.endswith(".jsonl"):
                continue
            # Aspect name is encoded in the JSONL filename.
            aspect = os.path.basename(name).replace(".jsonl", "")
            with zf.open(name) as f:
                for line in f:
                    entry = json.loads(line)
                    src = entry["src_dataset"]
                    vname = entry["video_name"]
                    text = entry["text"]
                    video_path = f"VIDEO/{src}/{vname}"
                    # The two QA variants differ only in their source key
                    # and qa_type label; build both with one loop.
                    for qa_type, qa_key in (
                        ("original", "original_qa"),
                        ("counterfactual", "counterfactual_qa"),
                    ):
                        qa = text[qa_key]
                        samples.append(
                            {
                                "idx": idx,
                                "video_path": video_path,
                                "src_dataset": src,
                                "video_name": vname,
                                "question": qa["qs"],
                                "choices": qa["choice"],
                                "answer": qa["ans"],
                                "qa_type": qa_type,
                                "aspect": aspect,
                            }
                        )
                        idx += 1
    return samples


def _print_stats(samples):
    """Print sample counts broken down by qa_type and by aspect."""
    by_type = defaultdict(int)
    by_aspect = defaultdict(int)
    for s in samples:
        by_type[s["qa_type"]] += 1
        by_aspect[s["aspect"]] += 1

    print(f" By qa_type: {dict(by_type)}")
    print(f" By aspect ({len(by_aspect)} categories):")
    for aspect in sorted(by_aspect):
        print(f" {aspect}: {by_aspect[aspect]}")


def main():
    """CLI entry point: download the COVER HF repo, then optionally extract
    videos and/or dump a flat QA JSON (see the module docstring for usage)."""
    parser = argparse.ArgumentParser(description="COVER dataset setup")
    parser.add_argument(
        "--output",
        type=str,
        default="",
        help="Output JSON file path (optional). If set, writes a flat " "QA JSON with all samples.",
    )
    parser.add_argument(
        "--cache-dir",
        type=str,
        default="",
        help="Directory for extracted videos. Default: $HF_HOME/cover/",
    )
    parser.add_argument(
        "--extract-videos",
        action="store_true",
        help="Extract VIDEO.zip into the cache directory.",
    )
    args = parser.parse_args()

    hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))
    cache_dir = args.cache_dir or os.path.join(hf_home, "cover")

    # Download (or reuse the locally cached copy of) the HF dataset repo.
    print(f"Downloading {DATASET_REPO_ID} ...")
    repo_dir = snapshot_download(
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        etag_timeout=60,
    )
    print(f" Repo cached at: {repo_dir}")

    # Extract videos; abort entirely if the archive is missing (matches
    # the original behavior of returning before the QA dump).
    if args.extract_videos and not _extract_video_archive(repo_dir, cache_dir):
        return

    # Optionally dump QA JSON for inspection.
    if args.output:
        jsonl_zip_path = os.path.join(repo_dir, "jsonl.zip")
        if not os.path.exists(jsonl_zip_path):
            print(f" ERROR: jsonl.zip not found at {jsonl_zip_path}")
            return

        samples = _load_samples(jsonl_zip_path)
        os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
        with open(args.output, "w") as f:
            json.dump(samples, f, indent=2)
        print(f"\n {len(samples)} QA samples written to {args.output}")
        _print_stats(samples)
    elif not args.extract_videos:
        print("Nothing to do. Use --extract-videos and/or --output.")


if __name__ == "__main__":
    main()

lmms_eval/tasks/cover/utils.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
"""COVER benchmark -- counterfactual video reasoning (ACL Findings 2025).
2+
3+
Dataset: lmms-lab-eval/COVER (clean QA data, parsed from PeterPanonly/COVER)
4+
Paper: https://arxiv.org/abs/2503.10691
5+
6+
The clean dataset has pre-parsed QA pairs with columns:
7+
src_dataset, video_name, question, choices (JSON), answer, qa_type, aspect.
8+
9+
Videos: VIDEO.zip must be extracted to $HF_HOME/cover/VIDEO/ or set
10+
COVER_DATA_DIR to the video root directory.
11+
"""
12+
13+
import json
14+
import os
15+
import zipfile
16+
from collections import defaultdict
17+
18+
from huggingface_hub import snapshot_download
19+
from loguru import logger as eval_logger
20+
21+
# ---------------------------------------------------------------------------
22+
# Video directory resolution
23+
# ---------------------------------------------------------------------------
24+
_hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface/"))
25+
_CACHE_DIR = os.path.join(_hf_home, "cover")
26+
27+
28+
def _get_cache_dir():
29+
explicit = os.getenv("COVER_DATA_DIR", "").strip()
30+
if explicit:
31+
return os.path.expanduser(explicit)
32+
return _CACHE_DIR
33+
34+
35+
_videos_ready = False
36+
37+
38+
def _ensure_videos():
    """Download VIDEO.zip from PeterPanonly/COVER and extract if needed.

    Idempotent: the module-level ``_videos_ready`` flag short-circuits
    repeated calls, and an already-extracted VIDEO/ directory is left
    untouched. If the snapshot lacks VIDEO.zip, a warning is emitted once
    and the flag is still set (best effort).
    """
    global _videos_ready
    if _videos_ready:
        return

    root = _get_cache_dir()
    extracted_dir = os.path.join(root, "VIDEO")
    if os.path.exists(extracted_dir):
        _videos_ready = True
        return

    eval_logger.info("COVER: downloading VIDEO.zip from PeterPanonly/COVER ...")
    repo_dir = snapshot_download(
        repo_id="PeterPanonly/COVER",
        repo_type="dataset",
        etag_timeout=60,
    )

    archive = os.path.join(repo_dir, "VIDEO.zip")
    if not os.path.exists(archive):
        # Warn once; cover_doc_to_visual will report individual missing files.
        eval_logger.warning(f"COVER: VIDEO.zip not found at {archive}. " f"Videos must be placed manually in {extracted_dir}.")
    else:
        eval_logger.info(f"COVER: extracting VIDEO.zip to {root} ...")
        os.makedirs(root, exist_ok=True)
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(root)
        eval_logger.info("COVER: extraction complete.")
    _videos_ready = True
67+
68+
69+
# ---------------------------------------------------------------------------
70+
# doc_to_visual / doc_to_text
71+
# ---------------------------------------------------------------------------
72+
def cover_doc_to_visual(doc):
    """Resolve the on-disk video file for a COVER sample.

    Returns a single-element list containing the video path. When the
    exact filename is absent, common extension/case variants are probed;
    as a last resort the unresolved path is returned with a warning so
    the caller surfaces a clear error.
    """
    _ensure_videos()
    root = _get_cache_dir()
    video_path = os.path.join(root, "VIDEO", doc["src_dataset"], doc["video_name"])

    if os.path.exists(video_path):
        return [video_path]

    # The archive is inconsistent about extensions/casing; probe variants.
    stem, current_ext = os.path.splitext(video_path)
    for candidate_ext in (".mp4", ".MP4", ".avi", ".AVI", ".mkv"):
        if candidate_ext == current_ext:
            continue
        candidate = stem + candidate_ext
        if os.path.exists(candidate):
            return [candidate]

    eval_logger.warning(f"COVER video not found: {video_path}")
    return [video_path]
90+
91+
92+
def cover_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Format a COVER sample as a multiple-choice prompt.

    ``doc["choices"]`` may arrive as a dict or as a JSON-encoded string;
    both forms are accepted. Options are listed in sorted key order, and
    the optional ``post_prompt`` from ``lmms_eval_specific_kwargs["default"]``
    is appended verbatim.
    """
    options = doc["choices"]
    if isinstance(options, str):
        options = json.loads(options)

    option_lines = [f"{letter}. {options[letter]}" for letter in sorted(options)]

    suffix = ""
    if lmms_eval_specific_kwargs:
        suffix = lmms_eval_specific_kwargs.get("default", {}).get("post_prompt", "")

    return doc["question"] + "\n" + "\n".join(option_lines) + suffix
106+
107+
108+
# ---------------------------------------------------------------------------
109+
# process_results / aggregate_results
110+
# ---------------------------------------------------------------------------
111+
def _extract_answer(response):
112+
import re
113+
114+
response = response.strip()
115+
if not response:
116+
return ""
117+
# Direct single letter
118+
if len(response) == 1 and response.upper() in "ABCDEF":
119+
return response.upper()
120+
# Pattern: (A), A., A:
121+
m = re.match(r"[\(\s]*([A-Fa-f])[\)\.\:\s]", response)
122+
if m:
123+
return m.group(1).upper()
124+
# First letter
125+
m = re.match(r"^([A-Fa-f])\b", response)
126+
if m:
127+
return m.group(1).upper()
128+
# Search in short response
129+
if len(response) < 50:
130+
m = re.search(r"\b([A-Da-d])\b", response)
131+
if m:
132+
return m.group(1).upper()
133+
return response[:1].upper()
134+
135+
136+
def cover_process_results(doc, results):
    """Score one COVER prediction.

    Extracts an option letter from the model's first response, compares it
    against the gold answer (case-insensitive), and returns the per-sample
    record consumed by ``cover_aggregate_results``.
    """
    predicted = _extract_answer(results[0])
    gold = doc["answer"].strip().upper()

    record = {
        "pred_answer": predicted,
        "answer": gold,
        "qa_type": doc.get("qa_type", ""),
        "aspect": doc.get("aspect", ""),
        "score": 1.0 if predicted == gold else 0.0,
    }
    return {"cover_accuracy": record}
149+
150+
151+
def cover_aggregate_results(results):
    """Aggregate per-sample records into an overall accuracy percentage.

    Logs per-qa_type and per-aspect breakdowns as a side effect, then
    returns overall accuracy in percent (0.0 for empty input).
    """

    def _pct(correct, total):
        # Guard against empty subsets.
        return 100.0 * correct / total if total > 0 else 0.0

    # Breakdown by question type (logged only).
    for qt in ("original", "counterfactual"):
        subset = [r for r in results if r["qa_type"] == qt]
        if subset:
            acc = _pct(sum(r["score"] for r in subset), len(subset))
            eval_logger.info(f"COVER {qt}: {acc:.1f}% ({len(subset)} samples)")

    # Breakdown by aspect (logged only).
    aspect_scores = defaultdict(lambda: {"correct": 0, "total": 0})
    for r in results:
        bucket = aspect_scores[r["aspect"]]
        bucket["total"] += 1
        bucket["correct"] += r["score"]
    for aspect in sorted(aspect_scores):
        s = aspect_scores[aspect]
        acc = _pct(s["correct"], s["total"])
        eval_logger.info(f"COVER {aspect}: {acc:.1f}% ({s['total']} samples)")

    total = len(results)
    overall = _pct(sum(r["score"] for r in results), total)
    eval_logger.info(f"COVER Overall: {overall:.1f}% ({total} samples)")
    return overall
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# WM-ABench (maitrix-org/WM-ABench): world-model benchmark with task
# variants covering spatial reasoning, motion understanding, object
# interactions, physical properties, temporal reasoning, and visual
# attributes (per the introducing commit).
dataset_path: maitrix-org/WM-ABench
dataset_kwargs:
  token: True
test_split: test
output_type: generate_until
doc_to_visual: !function utils.wm_abench_doc_to_visual
doc_to_text: !function utils.wm_abench_doc_to_text
doc_to_target: !function utils.wm_abench_doc_to_target

process_results: !function utils.wm_abench_process_results

# Three aggregations over the same per-sample records: raw accuracy,
# accuracy excluding blocked/refused responses, and the blocked rate
# itself (lower is better).
metric_list:
  - metric: wm_abench_acc
    aggregation: !function utils.wm_abench_aggregate_results
    higher_is_better: true
  - metric: wm_abench_acc_clean
    aggregation: !function utils.wm_abench_aggregate_results_clean
    higher_is_better: true
  - metric: wm_abench_blocked_rate
    aggregation: !function utils.wm_abench_aggregate_blocked_rate
    higher_is_better: false

# Greedy decoding; short answers, so a small token budget suffices.
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false

lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""

0 commit comments

Comments
 (0)