feat: add VideoNet benchmark (#1308)

yadavta · web-flow · commit ed09b9af46b0 · 2026-05-06T11:14:58.000+08:00
* support VideoNet

* prevent unnecessary downloads

A large portion of the HF Dataset is not required
to run evals on our benchmark. Updated the code
so that only files required for evals are downloaded.
diff --git a/lmms_eval/tasks/videonet/_binary.yaml b/lmms_eval/tasks/videonet/_binary.yaml
@@ -0,0 +1,17 @@
+# Shared settings for the VideoNet binary tasks; the videonet_binary_*shot.yaml files include this file.
+include: _default.yaml
+# Prompt construction and scoring hooks are implemented in binary_utils.py.
+doc_to_visual: !function binary_utils.videonet_binary_doc_to_visual
+doc_to_text: !function binary_utils.videonet_binary_doc_to_text
+doc_to_messages: !function binary_utils.videonet_binary_doc_to_messages
+generation_kwargs:
+  # Sampling is turned off.
+  temperature: 0
+  do_sample: False
+  # The scorer in binary_utils.py only inspects the last line of the generated text.
+  max_new_tokens: 8192
+process_results: !function binary_utils.videonet_binary_process_results
+metric_list:
+  # binary_acc is the key of the per-question record returned by videonet_binary_process_results.
+  - metric: binary_acc
+    aggregation: !function binary_utils.videonet_binary_aggregate_results
+    higher_is_better: True
+lmms_eval_specific_kwargs:
+  default:
+    # Empty by default, so no extra text is added around the question.
+    pre_prompt: ""
+    post_prompt: ""
diff --git a/lmms_eval/tasks/videonet/_default.yaml b/lmms_eval/tasks/videonet/_default.yaml
@@ -0,0 +1,5 @@
+# Settings shared by every VideoNet task; included by _binary.yaml and _mcq.yaml.
+dataset_path: "raivn/VideoNet"
+dataset_name: "lmms-eval"
+output_type: generate_until
+# The scoring hooks read the ground truth from the "answer" field of each doc.
+doc_to_target: "answer"
+test_split: test
diff --git a/lmms_eval/tasks/videonet/_mcq.yaml b/lmms_eval/tasks/videonet/_mcq.yaml
@@ -0,0 +1,19 @@
+# Shared settings for the VideoNet multiple-choice tasks; videonet_mcq_test.yaml and videonet_mcq_val.yaml include this file.
+include: _default.yaml
+# Prompt construction and scoring hooks are implemented in mcq_utils.py.
+doc_to_visual: !function mcq_utils.videonet_mcq_doc_to_visual
+doc_to_text: !function mcq_utils.videonet_mcq_doc_to_text
+doc_to_messages: !function mcq_utils.videonet_mcq_doc_to_messages
+generation_kwargs:
+  # Sampling is turned off.
+  temperature: 0
+  do_sample: False
+  # The scorer in mcq_utils.py only inspects the last line of the generated text.
+  max_new_tokens: 8192
+process_results: !function mcq_utils.videonet_mcq_process_results
+metric_list:
+  # mcq_acc is the key of the per-question record returned by videonet_mcq_process_results.
+  - metric: mcq_acc
+    aggregation: !function mcq_utils.videonet_mcq_aggregate_results
+    higher_is_better: True
+lmms_eval_specific_kwargs:
+  default:
+    # Empty by default, so no extra text is added around the question.
+    pre_prompt: ""
+    post_prompt: ""
+metadata:
+  version: 0.0
diff --git a/lmms_eval/tasks/videonet/binary_utils.py b/lmms_eval/tasks/videonet/binary_utils.py
@@ -0,0 +1,128 @@
+from functools import cache
+from pathlib import Path
+
+from huggingface_hub import snapshot_download
+
+
+@cache
+def _get_videos_dir() -> Path:
+    dataset_root = snapshot_download(repo_id="raivn/VideoNet", repo_type="dataset", allow_patterns=["videos/*.mp4"])
+    return Path(dataset_root) / "videos"
+
+
+def _get_video_path(video_fname) -> str:
+    return str(_get_videos_dir() / video_fname)
+
+
+def videonet_binary_doc_to_visual(doc):
+    question = doc["question"]
+    video_fnames = [entry["video"] for entry in question if entry["type"] == "video"]
+    video_paths = [_get_video_path(video_fname) for video_fname in video_fnames]
+    return video_paths
+
+
+def videonet_binary_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    question = doc["question"]
+    texts = [entry["text"] for entry in question if entry["type"] == "text"]
+    text = "\n".join(texts)
+    if lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+        post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+        text = pre_prompt + text + post_prompt
+    return text
+
+
+def _process_video_entry(entry: dict) -> dict:
+    video_fname = entry["video"]
+    video_path = _get_video_path(video_fname)
+    return {"type": "video", "url": video_path}
+
+
+def _process_text_entry(entry: dict, lmms_eval_specific_kwargs=None) -> dict:
+    text = entry["text"]
+    if lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+        post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+        text = pre_prompt + text + post_prompt
+    return {"type": "text", "text": text}
+
+
+def _question_to_content(question: list[dict], lmms_eval_specific_kwargs=None) -> list[dict]:
+    content = []
+    if lmms_eval_specific_kwargs and (pre_prompt := lmms_eval_specific_kwargs.get("pre_prompt")):
+        content.append({"type": "text", "text": pre_prompt})
+    for entry in question:
+        if entry["type"] == "text":
+            content.append(entry)
+        elif entry["type"] == "video":
+            content.append(_process_video_entry(entry))
+        else:
+            raise Exception("Your copy of the benchmark is corrupted. Please re-download the `benchmarks/` folder from HuggingFace.")
+    if lmms_eval_specific_kwargs and (post_prompt := lmms_eval_specific_kwargs.get("post_prompt")):
+        content.append({"type": "text", "text": post_prompt})
+    return content
+
+
+def videonet_binary_doc_to_messages(doc, lmms_eval_specific_kwargs=None):
+    question = doc["question"]
+    content = _question_to_content(question, lmms_eval_specific_kwargs)
+    return [{"role": "user", "content": content}]
+
+
+def _extract_binary_prediction(text: str) -> str:
+    text = text.splitlines()[-1].strip().lower()
+    pred = text.replace("*", "").replace("#", "").replace(",", "").replace(".", "").replace(":", "")
+
+    if pred == "yes" or pred == "no":
+        return pred
+    parts = pred.split(" ")
+    if len(parts) > 1:
+        first, last = parts[0], parts[-1]
+        if first == "yes" or first == "no":
+            return first
+        elif last == "yes" or last == "no":
+            return last
+    if "boxed{yes}" in pred:
+        return "yes"
+    if "boxed{no}" in pred:
+        return "no"
+    if "the answer is yes" in pred:
+        return "yes"
+    if "the answer is no" in pred:
+        return "no"
+    if 'is "yes"' in pred:
+        return "yes"
+    if "is 'yes'" in pred:
+        return "yes"
+    if 'is "no"' in pred:
+        return "no"
+    if "is 'no'" in pred:
+        return "no"
+    if "does not show" in pred or "does not depict" in pred:
+        return "no"
+    return pred
+
+
+def videonet_binary_process_results(doc, results):
+    model_output = results[0] if results else ""
+    ground_truth = doc["answer"]
+
+    pred = _extract_binary_prediction(model_output)
+    correct = 1.0 if pred == ground_truth else 0.0
+    return {
+        "binary_acc": {
+            "question_key": doc["key"],
+            "correct": correct,
+            "ground_truth": ground_truth,
+            "model_prediction": pred,
+            "model_output": model_output,
+        }
+    }
+
+
+def videonet_binary_aggregate_results(results):
+    if not results:
+        return 0.0
+    num_correct = sum(r["correct"] for r in results)
+    total = len(results)
+    return num_correct / total
diff --git a/lmms_eval/tasks/videonet/mcq_utils.py b/lmms_eval/tasks/videonet/mcq_utils.py
@@ -0,0 +1,104 @@
+from functools import cache
+from pathlib import Path
+
+from huggingface_hub import snapshot_download
+
+
+@cache
+def _get_videos_dir() -> Path:
+    dataset_root = snapshot_download(repo_id="raivn/VideoNet", repo_type="dataset", allow_patterns=["videos/*.mp4"])
+    return Path(dataset_root) / "videos"
+
+
+def _get_video_path(video_fname) -> str:
+    return str(_get_videos_dir() / video_fname)
+
+
+def videonet_mcq_doc_to_visual(doc):
+    question = doc["question"]
+    video_fname = [entry["video"] for entry in question if entry["type"] == "video"][0]
+    return [_get_video_path(video_fname)]
+
+
+def videonet_mcq_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    question = doc["question"]
+    text = [entry["text"] for entry in question if entry["type"] == "text"][0]
+    if lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+        post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+        text = pre_prompt + text + post_prompt
+    return text
+
+
+def _process_video_entry(entry: dict) -> dict:
+    video_fname = entry["video"]
+    video_path = _get_video_path(video_fname)
+    return {"type": "video", "url": video_path}
+
+
+def _process_text_entry(entry: dict, lmms_eval_specific_kwargs=None) -> dict:
+    text = entry["text"]
+    if lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+        post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+        text = pre_prompt + text + post_prompt
+    return {"type": "text", "text": text}
+
+
+def _question_to_content(question: list[dict], lmms_eval_specific_kwargs=None) -> list[dict]:
+    content = []
+    for entry in question:
+        if entry["type"] == "text":
+            content.append(_process_text_entry(entry, lmms_eval_specific_kwargs))
+        elif entry["type"] == "video":
+            content.append(_process_video_entry(entry))
+        else:
+            raise Exception("Your copy of the benchmark is corrupted. Please re-download the `benchmarks/` folder from HuggingFace.")
+    return content
+
+
+def videonet_mcq_doc_to_messages(doc, lmms_eval_specific_kwargs=None):
+    question = doc["question"]
+    content = _question_to_content(question, lmms_eval_specific_kwargs)
+    return [{"role": "user", "content": content}]
+
+
+def _extract_prediction_mcq(text: str) -> str:
+    text = text.splitlines()[-1].strip().upper()
+    pred = text.replace("*", "").replace("#", "").replace(",", "").replace(".", "").replace(":", "")
+    if pred in {"A", "B", "C", "D"}:
+        return pred
+    if "BOXED{A}" in pred:
+        return "A"
+    if "BOXED{B}" in pred:
+        return "B"
+    if "BOXED{C}" in pred:
+        return "C"
+    if "BOXED{D}" in pred:
+        return "D"
+    return pred
+
+
+def videonet_mcq_process_results(doc, results):
+    model_output = results[0] if results else ""
+    ground_truth = doc["answer"]
+
+    pred = _extract_prediction_mcq(model_output)
+    correct = 1.0 if pred == ground_truth else 0.0
+    return {
+        "mcq_acc": {
+            "question_key": doc["key"],
+            "correct": correct,
+            "ground_truth": ground_truth,
+            "model_prediction": pred,
+            "model_output": model_output,
+        }
+    }
+
+
+def videonet_mcq_aggregate_results(results):
+    if not results:
+        return 0.0
+    num_correct = sum(r["correct"] for r in results)
+    total = len(results)
+    return num_correct / total
diff --git a/lmms_eval/tasks/videonet/videonet_binary_0shot.yaml b/lmms_eval/tasks/videonet/videonet_binary_0shot.yaml
@@ -0,0 +1,9 @@
+# VideoNet binary task, 0-shot variant; the test split is read from benchmarks/binary_0shot.jsonl.
+include: _binary.yaml
+task: "videonet_binary_0shot"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    test: "benchmarks/binary_0shot.jsonl"
+metadata:
+  num_fewshot: 0
+  version: 0.0
diff --git a/lmms_eval/tasks/videonet/videonet_binary_1shot.yaml b/lmms_eval/tasks/videonet/videonet_binary_1shot.yaml
@@ -0,0 +1,9 @@
+# VideoNet binary task, 1-shot variant; the test split is read from benchmarks/binary_1shot.jsonl.
+include: _binary.yaml
+task: "videonet_binary_1shot"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    test: "benchmarks/binary_1shot.jsonl"
+metadata:
+  num_fewshot: 1
+  version: 0.0
diff --git a/lmms_eval/tasks/videonet/videonet_binary_2shot.yaml b/lmms_eval/tasks/videonet/videonet_binary_2shot.yaml
@@ -0,0 +1,9 @@
+# VideoNet binary task, 2-shot variant; the test split is read from benchmarks/binary_2shot.jsonl.
+include: _binary.yaml
+task: "videonet_binary_2shot"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    test: "benchmarks/binary_2shot.jsonl"
+metadata:
+  num_fewshot: 2
+  version: 0.0
diff --git a/lmms_eval/tasks/videonet/videonet_binary_3shot.yaml b/lmms_eval/tasks/videonet/videonet_binary_3shot.yaml
@@ -0,0 +1,9 @@
+# VideoNet binary task, 3-shot variant; the test split is read from benchmarks/binary_3shot.jsonl.
+include: _binary.yaml
+task: "videonet_binary_3shot"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    test: "benchmarks/binary_3shot.jsonl"
+metadata:
+  num_fewshot: 3
+  version: 0.0
diff --git a/lmms_eval/tasks/videonet/videonet_mcq_test.yaml b/lmms_eval/tasks/videonet/videonet_mcq_test.yaml
@@ -0,0 +1,6 @@
+# VideoNet multiple-choice task over benchmarks/mcq_test.jsonl; all other settings come from _mcq.yaml.
+include: _mcq.yaml
+task: "videonet_mcq_test"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    test: "benchmarks/mcq_test.jsonl"
diff --git a/lmms_eval/tasks/videonet/videonet_mcq_val.yaml b/lmms_eval/tasks/videonet/videonet_mcq_val.yaml
@@ -0,0 +1,7 @@
+# VideoNet multiple-choice task over benchmarks/mcq_val.jsonl; all other settings come from _mcq.yaml.
+include: _mcq.yaml
+task: "videonet_mcq_val"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    # The validation file is exposed under the "test" split name (given here as a one-item list).
+    test:
+      - "benchmarks/mcq_val.jsonl"