Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions lmms_eval/tasks/videonet/_binary.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Shared configuration for the VideoNet binary (yes/no) tasks; the
# videonet_binary_*shot task files include this file.
include: _default.yaml
# Input builders implemented in binary_utils.py.
doc_to_visual: !function binary_utils.videonet_binary_doc_to_visual
doc_to_text: !function binary_utils.videonet_binary_doc_to_text
doc_to_messages: !function binary_utils.videonet_binary_doc_to_messages
# Generation settings: temperature 0 with sampling disabled.
generation_kwargs:
temperature: 0
do_sample: False
max_new_tokens: 8192
# Scoring: process_results emits a per-question "binary_acc" record and the
# aggregation function averages its "correct" field.
process_results: !function binary_utils.videonet_binary_process_results
metric_list:
- metric: binary_acc
aggregation: !function binary_utils.videonet_binary_aggregate_results
higher_is_better: True
# Default prompt affixes are empty strings.
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
5 changes: 5 additions & 0 deletions lmms_eval/tasks/videonet/_default.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Base settings included by _binary.yaml and _mcq.yaml.
dataset_path: "raivn/VideoNet"
dataset_name: "lmms-eval"
output_type: generate_until
# The "answer" field of each doc is the target.
doc_to_target: "answer"
test_split: test
19 changes: 19 additions & 0 deletions lmms_eval/tasks/videonet/_mcq.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Shared configuration for the VideoNet multiple-choice tasks; the
# videonet_mcq_test and videonet_mcq_val task files include this file.
include: _default.yaml
# Input builders implemented in mcq_utils.py.
doc_to_visual: !function mcq_utils.videonet_mcq_doc_to_visual
doc_to_text: !function mcq_utils.videonet_mcq_doc_to_text
doc_to_messages: !function mcq_utils.videonet_mcq_doc_to_messages
# Generation settings: temperature 0 with sampling disabled.
generation_kwargs:
temperature: 0
do_sample: False
max_new_tokens: 8192
# Scoring: process_results emits a per-question "mcq_acc" record and the
# aggregation function averages its "correct" field.
process_results: !function mcq_utils.videonet_mcq_process_results
metric_list:
- metric: mcq_acc
aggregation: !function mcq_utils.videonet_mcq_aggregate_results
higher_is_better: True
# Default prompt affixes are empty strings.
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
metadata:
version: 0.0
128 changes: 128 additions & 0 deletions lmms_eval/tasks/videonet/binary_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from functools import cache
from pathlib import Path

from huggingface_hub import snapshot_download


@cache
def _get_videos_dir() -> Path:
    """Return the local ``videos`` directory of the VideoNet dataset snapshot.

    The snapshot is requested from the ``raivn/VideoNet`` dataset repository,
    restricted to files matching ``videos/*.mp4``. Because of ``@cache`` the
    body runs at most once per process; later calls reuse the same ``Path``.
    """
    snapshot_root = snapshot_download(
        repo_id="raivn/VideoNet",
        repo_type="dataset",
        allow_patterns=["videos/*.mp4"],
    )
    return Path(snapshot_root).joinpath("videos")


def _get_video_path(video_fname) -> str:
    """Return the path of ``video_fname`` inside the videos directory, as a string."""
    videos_dir = _get_videos_dir()
    return str(videos_dir.joinpath(video_fname))


def videonet_binary_doc_to_visual(doc):
    """Return local paths for every video entry in ``doc["question"]``, in order."""
    video_entries = filter(lambda part: part["type"] == "video", doc["question"])
    # Collect all file names first, then resolve them, so a malformed entry is
    # reported before any path resolution takes place.
    file_names = [part["video"] for part in video_entries]
    return list(map(_get_video_path, file_names))


def videonet_binary_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the plain-text prompt for a binary question.

    All ``"text"`` entries of ``doc["question"]`` are joined with newlines.
    When ``lmms_eval_specific_kwargs`` is non-empty, its ``"pre_prompt"`` and
    ``"post_prompt"`` values (default ``""``) are placed around the result.
    """
    pieces = []
    for part in doc["question"]:
        if part["type"] == "text":
            pieces.append(part["text"])
    joined = "\n".join(pieces)

    if not lmms_eval_specific_kwargs:
        return joined

    prefix = lmms_eval_specific_kwargs.get("pre_prompt", "")
    suffix = lmms_eval_specific_kwargs.get("post_prompt", "")
    return prefix + joined + suffix


def _process_video_entry(entry: dict) -> dict:
    """Turn a question video entry into a ``{"type": "video", "url": <path>}`` part."""
    return {"type": "video", "url": _get_video_path(entry["video"])}


def _process_text_entry(entry: dict, lmms_eval_specific_kwargs=None) -> dict:
text = entry["text"]
if lmms_eval_specific_kwargs:
pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
text = pre_prompt + text + post_prompt
return {"type": "text", "text": text}


def _question_to_content(question: list[dict], lmms_eval_specific_kwargs=None) -> list[dict]:
content = []
if lmms_eval_specific_kwargs and (pre_prompt := lmms_eval_specific_kwargs.get("pre_prompt")):
content.append({"type": "text", "text": pre_prompt})
for entry in question:
if entry["type"] == "text":
content.append(entry)
elif entry["type"] == "video":
content.append(_process_video_entry(entry))
else:
raise Exception("Your copy of the benchmark is corrupted. Please re-download the `benchmarks/` folder from HuggingFace.")
if lmms_eval_specific_kwargs and (post_prompt := lmms_eval_specific_kwargs.get("post_prompt")):
content.append({"type": "text", "text": post_prompt})
return content


def videonet_binary_doc_to_messages(doc, lmms_eval_specific_kwargs=None):
    """Wrap the question of ``doc`` in a single user chat message."""
    user_message = {
        "role": "user",
        "content": _question_to_content(doc["question"], lmms_eval_specific_kwargs),
    }
    return [user_message]


def _extract_binary_prediction(text: str) -> str:
text = text.splitlines()[-1].strip().lower()
pred = text.replace("*", "").replace("#", "").replace(",", "").replace(".", "").replace(":", "")

if pred == "yes" or pred == "no":
return pred
parts = pred.split(" ")
if len(parts) > 1:
first, last = parts[0], parts[-1]
if first == "yes" or first == "no":
return first
elif last == "yes" or last == "no":
return last
if "boxed{yes}" in pred:
return "yes"
if "boxed{no}" in pred:
return "no"
if "the answer is yes" in pred:
return "yes"
if "the answer is no" in pred:
return "no"
if 'is "yes"' in pred:
return "yes"
if "is 'yes'" in pred:
return "yes"
if 'is "no"' in pred:
return "no"
if "is 'no'" in pred:
return "no"
if "does not show" in pred or "does not depict" in pred:
return "no"
return pred


def videonet_binary_process_results(doc, results):
    """Score one binary question.

    The first element of ``results`` (or ``""`` when ``results`` is empty) is
    reduced to a prediction and compared with ``doc["answer"]``. Returns a dict
    whose ``"binary_acc"`` entry holds the per-question record.
    """
    if results:
        raw_response = results[0]
    else:
        raw_response = ""
    expected = doc["answer"]

    extracted = _extract_binary_prediction(raw_response)
    record = {
        "question_key": doc["key"],
        "correct": 1.0 if extracted == expected else 0.0,
        "ground_truth": expected,
        "model_prediction": extracted,
        "model_output": raw_response,
    }
    return {"binary_acc": record}


def videonet_binary_aggregate_results(results):
    """Return the mean of the ``"correct"`` field over ``results`` (0.0 if empty)."""
    if not results:
        return 0.0
    hits = 0
    for record in results:
        hits += record["correct"]
    return hits / len(results)
104 changes: 104 additions & 0 deletions lmms_eval/tasks/videonet/mcq_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from functools import cache
from pathlib import Path

from huggingface_hub import snapshot_download


@cache
def _get_videos_dir() -> Path:
    """Return the local ``videos`` directory of the VideoNet dataset snapshot.

    Requests the ``videos/*.mp4`` files of the ``raivn/VideoNet`` dataset
    repository. ``@cache`` makes the body run at most once per process.
    """
    download_kwargs = {
        "repo_id": "raivn/VideoNet",
        "repo_type": "dataset",
        "allow_patterns": ["videos/*.mp4"],
    }
    snapshot_root = Path(snapshot_download(**download_kwargs))
    return snapshot_root / "videos"


def _get_video_path(video_fname) -> str:
    """Return the string path of ``video_fname`` under the videos directory."""
    full_path = _get_videos_dir().joinpath(video_fname)
    return str(full_path)


def videonet_mcq_doc_to_visual(doc):
    """Return a one-element list with the path of the first video in the question.

    Raises ``IndexError`` when the question has no video entry.
    """
    video_names = []
    for part in doc["question"]:
        if part["type"] == "video":
            video_names.append(part["video"])
    first_video = video_names[0]
    return [_get_video_path(first_video)]


def videonet_mcq_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the plain-text prompt for a multiple-choice question.

    Uses the first ``"text"`` entry of ``doc["question"]`` (``IndexError`` if
    there is none). When ``lmms_eval_specific_kwargs`` is non-empty, its
    ``"pre_prompt"`` and ``"post_prompt"`` values (default ``""``) are placed
    around that text.
    """
    text_parts = []
    for part in doc["question"]:
        if part["type"] == "text":
            text_parts.append(part["text"])
    prompt = text_parts[0]

    if not lmms_eval_specific_kwargs:
        return prompt

    prefix = lmms_eval_specific_kwargs.get("pre_prompt", "")
    suffix = lmms_eval_specific_kwargs.get("post_prompt", "")
    return prefix + prompt + suffix


def _process_video_entry(entry: dict) -> dict:
    """Map a question video entry to a ``{"type": "video", "url": <path>}`` part."""
    local_path = _get_video_path(entry["video"])
    part = {"type": "video"}
    part["url"] = local_path
    return part


def _process_text_entry(entry: dict, lmms_eval_specific_kwargs=None) -> dict:
text = entry["text"]
if lmms_eval_specific_kwargs:
pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
text = pre_prompt + text + post_prompt
return {"type": "text", "text": text}


def _question_to_content(question: list[dict], lmms_eval_specific_kwargs=None) -> list[dict]:
    """Convert a question into the ``content`` list of a single chat message.

    Text entries go through ``_process_text_entry`` (which applies the prompt
    affixes to each text entry) and video entries through
    ``_process_video_entry``. Any other entry type raises ``Exception``.
    """
    parts = []
    for item in question:
        kind = item["type"]
        if kind == "text":
            part = _process_text_entry(item, lmms_eval_specific_kwargs)
        elif kind == "video":
            part = _process_video_entry(item)
        else:
            raise Exception("Your copy of the benchmark is corrupted. Please re-download the `benchmarks/` folder from HuggingFace.")
        parts.append(part)
    return parts


def videonet_mcq_doc_to_messages(doc, lmms_eval_specific_kwargs=None):
    """Wrap the question of ``doc`` in a single user chat message."""
    parts = _question_to_content(doc["question"], lmms_eval_specific_kwargs)
    return [dict(role="user", content=parts)]


def _extract_prediction_mcq(text: str) -> str:
text = text.splitlines()[-1].strip().upper()
pred = text.replace("*", "").replace("#", "").replace(",", "").replace(".", "").replace(":", "")
if pred in {"A", "B", "C", "D"}:
return pred
if "BOXED{A}" in pred:
return "A"
if "BOXED{B}" in pred:
return "B"
if "BOXED{C}" in pred:
return "C"
if "BOXED{D}" in pred:
return "D"
return pred


def videonet_mcq_process_results(doc, results):
    """Score one multiple-choice question.

    The first element of ``results`` (or ``""`` when ``results`` is empty) is
    reduced to a prediction and compared with ``doc["answer"]``. Returns a dict
    whose ``"mcq_acc"`` entry holds the per-question record.
    """
    raw_response = "" if not results else results[0]
    expected = doc["answer"]

    choice = _extract_prediction_mcq(raw_response)
    is_match = choice == expected
    record = {
        "question_key": doc["key"],
        "correct": 1.0 if is_match else 0.0,
        "ground_truth": expected,
        "model_prediction": choice,
        "model_output": raw_response,
    }
    return {"mcq_acc": record}


def videonet_mcq_aggregate_results(results):
    """Return the mean of the ``"correct"`` field over ``results`` (0.0 if empty)."""
    if not results:
        return 0.0
    scores = [record["correct"] for record in results]
    return sum(scores) / len(results)
9 changes: 9 additions & 0 deletions lmms_eval/tasks/videonet/videonet_binary_0shot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# VideoNet binary task, 0-shot variant: builds on _binary.yaml and uses
# benchmarks/binary_0shot.jsonl as the test split.
include: _binary.yaml
task: "videonet_binary_0shot"
dataset_kwargs:
cache_dir: videonet
data_files:
test: "benchmarks/binary_0shot.jsonl"
metadata:
num_fewshot: 0
version: 0.0
9 changes: 9 additions & 0 deletions lmms_eval/tasks/videonet/videonet_binary_1shot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# VideoNet binary task, 1-shot variant: builds on _binary.yaml and uses
# benchmarks/binary_1shot.jsonl as the test split.
include: _binary.yaml
task: "videonet_binary_1shot"
dataset_kwargs:
cache_dir: videonet
data_files:
test: "benchmarks/binary_1shot.jsonl"
metadata:
num_fewshot: 1
version: 0.0
9 changes: 9 additions & 0 deletions lmms_eval/tasks/videonet/videonet_binary_2shot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# VideoNet binary task, 2-shot variant: builds on _binary.yaml and uses
# benchmarks/binary_2shot.jsonl as the test split.
include: _binary.yaml
task: "videonet_binary_2shot"
dataset_kwargs:
cache_dir: videonet
data_files:
test: "benchmarks/binary_2shot.jsonl"
metadata:
num_fewshot: 2
version: 0.0
9 changes: 9 additions & 0 deletions lmms_eval/tasks/videonet/videonet_binary_3shot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# VideoNet binary task, 3-shot variant: builds on _binary.yaml and uses
# benchmarks/binary_3shot.jsonl as the test split.
include: _binary.yaml
task: "videonet_binary_3shot"
dataset_kwargs:
cache_dir: videonet
data_files:
test: "benchmarks/binary_3shot.jsonl"
metadata:
num_fewshot: 3
version: 0.0
6 changes: 6 additions & 0 deletions lmms_eval/tasks/videonet/videonet_mcq_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# VideoNet multiple-choice task over benchmarks/mcq_test.jsonl; builds on
# _mcq.yaml.
include: _mcq.yaml
task: "videonet_mcq_test"
dataset_kwargs:
cache_dir: videonet
data_files:
test: "benchmarks/mcq_test.jsonl"
7 changes: 7 additions & 0 deletions lmms_eval/tasks/videonet/videonet_mcq_val.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# VideoNet multiple-choice task over benchmarks/mcq_val.jsonl; builds on
# _mcq.yaml.
include: _mcq.yaml
task: "videonet_mcq_val"
dataset_kwargs:
cache_dir: videonet
data_files:
# The validation file is registered under the "test" key, given here as a
# one-item list.
test:
- "benchmarks/mcq_val.jsonl"
Loading