diff --git a/lmms_eval/tasks/videonet/_binary.yaml b/lmms_eval/tasks/videonet/_binary.yaml
new file mode 100644
index 000000000..fc53d4cd4
--- /dev/null
+++ b/lmms_eval/tasks/videonet/_binary.yaml
@@ -0,0 +1,17 @@
+include: _default.yaml
+doc_to_visual: !function binary_utils.videonet_binary_doc_to_visual
+doc_to_text: !function binary_utils.videonet_binary_doc_to_text
+doc_to_messages: !function binary_utils.videonet_binary_doc_to_messages
+generation_kwargs:
+  temperature: 0
+  do_sample: False
+  max_new_tokens: 8192
+process_results: !function binary_utils.videonet_binary_process_results
+metric_list:
+  - metric: binary_acc
+    aggregation: !function binary_utils.videonet_binary_aggregate_results
+    higher_is_better: True
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: ""
diff --git a/lmms_eval/tasks/videonet/_default.yaml b/lmms_eval/tasks/videonet/_default.yaml
new file mode 100644
index 000000000..86edcc722
--- /dev/null
+++ b/lmms_eval/tasks/videonet/_default.yaml
@@ -0,0 +1,5 @@
+dataset_path: "raivn/VideoNet"
+dataset_name: "lmms-eval"
+output_type: generate_until
+doc_to_target: "answer"
+test_split: test
diff --git a/lmms_eval/tasks/videonet/_mcq.yaml b/lmms_eval/tasks/videonet/_mcq.yaml
new file mode 100644
index 000000000..9dc18fe67
--- /dev/null
+++ b/lmms_eval/tasks/videonet/_mcq.yaml
@@ -0,0 +1,19 @@
+include: _default.yaml
+doc_to_visual: !function mcq_utils.videonet_mcq_doc_to_visual
+doc_to_text: !function mcq_utils.videonet_mcq_doc_to_text
+doc_to_messages: !function mcq_utils.videonet_mcq_doc_to_messages
+generation_kwargs:
+  temperature: 0
+  do_sample: False
+  max_new_tokens: 8192
+process_results: !function mcq_utils.videonet_mcq_process_results
+metric_list:
+  - metric: mcq_acc
+    aggregation: !function mcq_utils.videonet_mcq_aggregate_results
+    higher_is_better: True
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: ""
+metadata:
+  version: 0.0
diff --git a/lmms_eval/tasks/videonet/binary_utils.py b/lmms_eval/tasks/videonet/binary_utils.py
new file mode 100644
index 000000000..c9e509100
--- /dev/null
+++ b/lmms_eval/tasks/videonet/binary_utils.py
@@ -0,0 +1,140 @@
+from functools import cache
+from pathlib import Path
+
+from huggingface_hub import snapshot_download
+
+
+@cache
+def _get_videos_dir() -> Path:
+    """Download the dataset's videos (at most once, via @cache) and return the local `videos/` dir."""
+    dataset_root = snapshot_download(repo_id="raivn/VideoNet", repo_type="dataset", allow_patterns=["videos/*.mp4"])
+    return Path(dataset_root) / "videos"
+
+
+def _get_video_path(video_fname) -> str:
+    """Return the absolute path of `video_fname` inside the cached snapshot."""
+    return str(_get_videos_dir() / video_fname)
+
+
+def videonet_binary_doc_to_visual(doc):
+    """Return local file paths for every video entry in the doc's question."""
+    question = doc["question"]
+    video_fnames = [entry["video"] for entry in question if entry["type"] == "video"]
+    video_paths = [_get_video_path(video_fname) for video_fname in video_fnames]
+    return video_paths
+
+
+def videonet_binary_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Join the question's text entries with newlines, wrapped in optional pre/post prompts."""
+    question = doc["question"]
+    texts = [entry["text"] for entry in question if entry["type"] == "text"]
+    text = "\n".join(texts)
+    if lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+        post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+        text = pre_prompt + text + post_prompt
+    return text
+
+
+def _process_video_entry(entry: dict) -> dict:
+    """Turn a raw video entry into a chat-content video part pointing at a local file."""
+    video_fname = entry["video"]
+    video_path = _get_video_path(video_fname)
+    return {"type": "video", "url": video_path}
+
+
+def _process_text_entry(entry: dict, lmms_eval_specific_kwargs=None) -> dict:
+    """Turn a raw text entry into a chat text part with prompts applied. NOTE(review): unused in this module (prompts are added once in _question_to_content); kept in sync with mcq_utils."""
+    text = entry["text"]
+    if lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+        post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+        text = pre_prompt + text + post_prompt
+    return {"type": "text", "text": text}
+
+
+def _question_to_content(question: list[dict], lmms_eval_specific_kwargs=None) -> list[dict]:
+    """Build the interleaved text/video chat content; pre/post prompts are added once around the whole question."""
+    content = []
+    if lmms_eval_specific_kwargs and (pre_prompt := lmms_eval_specific_kwargs.get("pre_prompt")):
+        content.append({"type": "text", "text": pre_prompt})
+    for entry in question:
+        if entry["type"] == "text":
+            content.append(entry)
+        elif entry["type"] == "video":
+            content.append(_process_video_entry(entry))
+        else:
+            raise Exception("Your copy of the benchmark is corrupted. Please re-download the `benchmarks/` folder from HuggingFace.")
+    if lmms_eval_specific_kwargs and (post_prompt := lmms_eval_specific_kwargs.get("post_prompt")):
+        content.append({"type": "text", "text": post_prompt})
+    return content
+
+
+def videonet_binary_doc_to_messages(doc, lmms_eval_specific_kwargs=None):
+    """Wrap the question content in a single-turn user message."""
+    question = doc["question"]
+    content = _question_to_content(question, lmms_eval_specific_kwargs)
+    return [{"role": "user", "content": content}]
+
+
+def _extract_binary_prediction(text: str) -> str:
+    """Normalize a model response to 'yes'/'no' when recognizable; otherwise return the cleaned last line."""
+    # Use the last non-empty line; splitlines()[-1] would raise IndexError on empty output.
+    lines = [line.strip() for line in text.splitlines() if line.strip()]
+    text = lines[-1].lower() if lines else ""
+    pred = text.replace("*", "").replace("#", "").replace(",", "").replace(".", "").replace(":", "")
+
+    if pred == "yes" or pred == "no":
+        return pred
+    parts = pred.split(" ")
+    if len(parts) > 1:
+        first, last = parts[0], parts[-1]
+        if first == "yes" or first == "no":
+            return first
+        elif last == "yes" or last == "no":
+            return last
+    if "boxed{yes}" in pred:
+        return "yes"
+    if "boxed{no}" in pred:
+        return "no"
+    if "the answer is yes" in pred:
+        return "yes"
+    if "the answer is no" in pred:
+        return "no"
+    if 'is "yes"' in pred:
+        return "yes"
+    if "is 'yes'" in pred:
+        return "yes"
+    if 'is "no"' in pred:
+        return "no"
+    if "is 'no'" in pred:
+        return "no"
+    if "does not show" in pred or "does not depict" in pred:
+        return "no"
+    return pred
+
+
+def videonet_binary_process_results(doc, results):
+    """Score one doc: compare the extracted yes/no prediction against the gold answer."""
+    model_output = results[0] if results else ""
+    ground_truth = doc["answer"]
+
+    pred = _extract_binary_prediction(model_output)
+    correct = 1.0 if pred == ground_truth else 0.0
+    return {
+        "binary_acc": {
+            "question_key": doc["key"],
+            "correct": correct,
+            "ground_truth": ground_truth,
+            "model_prediction": pred,
+            "model_output": model_output,
+        }
+    }
+
+
+def videonet_binary_aggregate_results(results):
+    """Return mean accuracy over per-example result dicts (0.0 if empty)."""
+    if not results:
+        return 0.0
+    num_correct = sum(r["correct"] for r in results)
+    return num_correct / len(results)
diff --git a/lmms_eval/tasks/videonet/mcq_utils.py b/lmms_eval/tasks/videonet/mcq_utils.py
new file mode 100644
index 000000000..a67e3ed8e
--- /dev/null
+++ b/lmms_eval/tasks/videonet/mcq_utils.py
@@ -0,0 +1,116 @@
+from functools import cache
+from pathlib import Path
+
+from huggingface_hub import snapshot_download
+
+
+@cache
+def _get_videos_dir() -> Path:
+    """Download the dataset's videos (at most once, via @cache) and return the local `videos/` dir."""
+    dataset_root = snapshot_download(repo_id="raivn/VideoNet", repo_type="dataset", allow_patterns=["videos/*.mp4"])
+    return Path(dataset_root) / "videos"
+
+
+def _get_video_path(video_fname) -> str:
+    """Return the absolute path of `video_fname` inside the cached snapshot."""
+    return str(_get_videos_dir() / video_fname)
+
+
+def videonet_mcq_doc_to_visual(doc):
+    """Return the local path of the question's (single) video entry."""
+    question = doc["question"]
+    video_fname = [entry["video"] for entry in question if entry["type"] == "video"][0]
+    return [_get_video_path(video_fname)]
+
+
+def videonet_mcq_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Return the question's (single) text entry, wrapped in optional pre/post prompts."""
+    question = doc["question"]
+    text = [entry["text"] for entry in question if entry["type"] == "text"][0]
+    if lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+        post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+        text = pre_prompt + text + post_prompt
+    return text
+
+
+def _process_video_entry(entry: dict) -> dict:
+    """Turn a raw video entry into a chat-content video part pointing at a local file."""
+    video_fname = entry["video"]
+    video_path = _get_video_path(video_fname)
+    return {"type": "video", "url": video_path}
+
+
+def _process_text_entry(entry: dict, lmms_eval_specific_kwargs=None) -> dict:
+    """Turn a raw text entry into a chat text part, with pre/post prompts applied per entry."""
+    text = entry["text"]
+    if lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+        post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+        text = pre_prompt + text + post_prompt
+    return {"type": "text", "text": text}
+
+
+def _question_to_content(question: list[dict], lmms_eval_specific_kwargs=None) -> list[dict]:
+    """Build the interleaved text/video chat content for one question."""
+    content = []
+    for entry in question:
+        if entry["type"] == "text":
+            content.append(_process_text_entry(entry, lmms_eval_specific_kwargs))
+        elif entry["type"] == "video":
+            content.append(_process_video_entry(entry))
+        else:
+            raise Exception("Your copy of the benchmark is corrupted. Please re-download the `benchmarks/` folder from HuggingFace.")
+    return content
+
+
+def videonet_mcq_doc_to_messages(doc, lmms_eval_specific_kwargs=None):
+    """Wrap the question content in a single-turn user message."""
+    question = doc["question"]
+    content = _question_to_content(question, lmms_eval_specific_kwargs)
+    return [{"role": "user", "content": content}]
+
+
+def _extract_prediction_mcq(text: str) -> str:
+    """Normalize a model response to a choice letter A-D when recognizable; otherwise return the cleaned last line."""
+    # Use the last non-empty line; splitlines()[-1] would raise IndexError on empty output.
+    lines = [line.strip() for line in text.splitlines() if line.strip()]
+    text = lines[-1].upper() if lines else ""
+    pred = text.replace("*", "").replace("#", "").replace(",", "").replace(".", "").replace(":", "")
+    if pred in {"A", "B", "C", "D"}:
+        return pred
+    if "BOXED{A}" in pred:
+        return "A"
+    if "BOXED{B}" in pred:
+        return "B"
+    if "BOXED{C}" in pred:
+        return "C"
+    if "BOXED{D}" in pred:
+        return "D"
+    return pred
+
+
+def videonet_mcq_process_results(doc, results):
+    """Score one doc: compare the extracted choice letter against the gold answer."""
+    model_output = results[0] if results else ""
+    ground_truth = doc["answer"]
+
+    pred = _extract_prediction_mcq(model_output)
+    correct = 1.0 if pred == ground_truth else 0.0
+    return {
+        "mcq_acc": {
+            "question_key": doc["key"],
+            "correct": correct,
+            "ground_truth": ground_truth,
+            "model_prediction": pred,
+            "model_output": model_output,
+        }
+    }
+
+
+def videonet_mcq_aggregate_results(results):
+    """Return mean accuracy over per-example result dicts (0.0 if empty)."""
+    if not results:
+        return 0.0
+    num_correct = sum(r["correct"] for r in results)
+    return num_correct / len(results)
diff --git a/lmms_eval/tasks/videonet/videonet_binary_0shot.yaml b/lmms_eval/tasks/videonet/videonet_binary_0shot.yaml
new file mode 100644
index 000000000..ed156beb0
--- /dev/null
+++ b/lmms_eval/tasks/videonet/videonet_binary_0shot.yaml
@@ -0,0 +1,9 @@
+include: _binary.yaml
+task: "videonet_binary_0shot"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    test: "benchmarks/binary_0shot.jsonl"
+metadata:
+  num_fewshot: 0
+  version: 0.0
diff --git a/lmms_eval/tasks/videonet/videonet_binary_1shot.yaml b/lmms_eval/tasks/videonet/videonet_binary_1shot.yaml
new file mode 100644
index 000000000..7b834f8d4
--- /dev/null
+++ b/lmms_eval/tasks/videonet/videonet_binary_1shot.yaml
@@ -0,0 +1,9 @@
+include: _binary.yaml
+task: "videonet_binary_1shot"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    test: "benchmarks/binary_1shot.jsonl"
+metadata:
+  num_fewshot: 1
+  version: 0.0
diff --git a/lmms_eval/tasks/videonet/videonet_binary_2shot.yaml b/lmms_eval/tasks/videonet/videonet_binary_2shot.yaml
new file mode 100644
index 000000000..e14cc6cb6
--- /dev/null
+++ b/lmms_eval/tasks/videonet/videonet_binary_2shot.yaml
@@ -0,0 +1,9 @@
+include: _binary.yaml
+task: "videonet_binary_2shot"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    test: "benchmarks/binary_2shot.jsonl"
+metadata:
+  num_fewshot: 2
+  version: 0.0
diff --git a/lmms_eval/tasks/videonet/videonet_binary_3shot.yaml b/lmms_eval/tasks/videonet/videonet_binary_3shot.yaml
new file mode 100644
index 000000000..ba1b51f7d
--- /dev/null
+++ b/lmms_eval/tasks/videonet/videonet_binary_3shot.yaml
@@ -0,0 +1,9 @@
+include: _binary.yaml
+task: "videonet_binary_3shot"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    test: "benchmarks/binary_3shot.jsonl"
+metadata:
+  num_fewshot: 3
+  version: 0.0
diff --git a/lmms_eval/tasks/videonet/videonet_mcq_test.yaml b/lmms_eval/tasks/videonet/videonet_mcq_test.yaml
new file mode 100644
index 000000000..2f41e6758
--- /dev/null
+++ b/lmms_eval/tasks/videonet/videonet_mcq_test.yaml
@@ -0,0 +1,6 @@
+include: _mcq.yaml
+task: "videonet_mcq_test"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    test: "benchmarks/mcq_test.jsonl"
diff --git a/lmms_eval/tasks/videonet/videonet_mcq_val.yaml b/lmms_eval/tasks/videonet/videonet_mcq_val.yaml
new file mode 100644
index 000000000..bee4b5641
--- /dev/null
+++ b/lmms_eval/tasks/videonet/videonet_mcq_val.yaml
@@ -0,0 +1,7 @@
+include: _mcq.yaml
+task: "videonet_mcq_val"
+dataset_kwargs:
+  cache_dir: videonet
+  data_files:
+    test:
+      - "benchmarks/mcq_val.jsonl"