feat: add ReVSI evaluation (#1307)

eamonn-zh · Copilot · web-flow · commit 78d6c72b690e · 2026-05-06T15:12:29.000+08:00
* add the ReVSI benchmark

* add the ReVSI benchmark

* feat: enhance ReVSI metrics and aggregation functions

Co-authored-by: Copilot <copilot@github.com>

---------

Co-authored-by: Copilot <copilot@github.com>
diff --git a/lmms_eval/tasks/revsi/_default_template_yaml b/lmms_eval/tasks/revsi/_default_template_yaml
@@ -0,0 +1,48 @@
+# Shared task template for the ReVSI tasks; each revsi_*_frame.yaml includes
+# this file and only adds its own dataset_name and task name.
+dataset_path: 3dlg-hcvc/ReVSI
+test_split: test
+dataset_kwargs:
+  token: True
+  # utils.py reads this value (config["dataset_kwargs"]["cache_dir"]) to build
+  # the directory in which revsi_doc_to_visual looks for the .mp4 files.
+  cache_dir: revsi
+  video: True
+output_type: generate_until
+process_docs: !function utils.process_docs
+doc_to_visual: !function utils.revsi_doc_to_visual
+doc_to_text: !function utils.revsi_doc_to_text
+doc_to_target: "ground_truth"
+# Short, greedy generations: the post prompts below ask for a single option
+# letter or a single number.
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  do_sample: false
+process_results: !function utils.revsi_process_results
+# The metric names below mirror REVSI_METRICS in utils.py, which
+# revsi_process_results uses as the keys of its returned dict.
+metric_list:
+  - metric: overall_acc
+    aggregation: !function utils.revsi_aggregate_overall
+    higher_is_better: true
+  - metric: object_abs_distance_acc
+    aggregation: !function utils.revsi_aggregate_object_abs_distance_acc
+    higher_is_better: true
+  - metric: object_counting_acc
+    aggregation: !function utils.revsi_aggregate_object_counting_acc
+    higher_is_better: true
+  - metric: object_rel_direction_acc
+    aggregation: !function utils.revsi_aggregate_object_rel_direction_acc
+    higher_is_better: true
+  - metric: object_rel_distance_acc
+    aggregation: !function utils.revsi_aggregate_object_rel_distance_acc
+    higher_is_better: true
+  - metric: object_size_estimation_acc
+    aggregation: !function utils.revsi_aggregate_object_size_estimation_acc
+    higher_is_better: true
+  - metric: room_size_estimation_acc
+    aggregation: !function utils.revsi_aggregate_room_size_estimation_acc
+    higher_is_better: true
+  - metric: route_planning_acc
+    aggregation: !function utils.revsi_aggregate_route_planning_acc
+    higher_is_better: true
+# Prompt fragments looked up by utils.revsi_doc_to_text: pre_prompt is always
+# prepended; mcq_post_prompt / nq_post_prompt are appended for multiple-choice
+# and numeric question types respectively.
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: "These are frames of a video."
+    mcq_post_prompt: "Answer with the option's letter from the given choices directly."
+    nq_post_prompt: "Answer the question using a single integer or decimal number."
+metadata:
+  - version: 1.0
diff --git a/lmms_eval/tasks/revsi/revsi.yaml b/lmms_eval/tasks/revsi/revsi.yaml
@@ -0,0 +1,6 @@
+group: revsi
+task:
+- revsi_all_frame
+- revsi_64_frame
+- revsi_32_frame
+- revsi_16_frame
diff --git a/lmms_eval/tasks/revsi/revsi_16_frame.yaml b/lmms_eval/tasks/revsi/revsi_16_frame.yaml
@@ -0,0 +1,3 @@
+dataset_name: 16_frame
+task: revsi_16_frame
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/revsi/revsi_32_frame.yaml b/lmms_eval/tasks/revsi/revsi_32_frame.yaml
@@ -0,0 +1,3 @@
+dataset_name: 32_frame
+task: revsi_32_frame
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/revsi/revsi_64_frame.yaml b/lmms_eval/tasks/revsi/revsi_64_frame.yaml
@@ -0,0 +1,3 @@
+dataset_name: 64_frame
+task: revsi_64_frame
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/revsi/revsi_all_frame.yaml b/lmms_eval/tasks/revsi/revsi_all_frame.yaml
@@ -0,0 +1,3 @@
+dataset_name: all_frame
+task: revsi_all_frame
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/revsi/utils.py b/lmms_eval/tasks/revsi/utils.py
@@ -0,0 +1,171 @@
+import os
+import datasets
+import numpy as np
+import pandas as pd
+from huggingface_hub.constants import HF_HOME
+from lmms_eval.utils import resolve_cache_dir
+from lmms_eval.tasks._task_utils.default_template_yaml import load_default_template_yaml
+
+
+# Multiple-choice question types: revsi_doc_to_text appends the options and the
+# MCQ post prompt for these, and revsi_process_results scores them by
+# case-insensitive exact match against the ground truth.
+MCQ_QUESTION_TYPES = [
+    "object_rel_direction_forward_easy",
+    "object_rel_direction_backward_easy",
+    "object_rel_direction_forward_hard",
+    "object_rel_direction_backward_hard",
+    "object_rel_distance_closest",
+    "object_rel_distance_farthest",
+    "route_planning",
+]
+
+
+# Numeric question types: revsi_doc_to_text appends the numeric post prompt for
+# these, and revsi_process_results scores them with _mean_relative_accuracy.
+NQ_QUESTION_TYPES = [
+    "object_counting_single",
+    "object_counting_multiple",
+    "object_abs_distance",
+    "object_size_estimation",
+    "room_size_estimation_single",
+    "room_size_estimation_multiple"
+]
+
+
+# Metric names used as the keys of the dict returned by revsi_process_results.
+REVSI_METRICS = [
+    "overall_acc",
+    "object_abs_distance_acc",
+    "object_counting_acc",
+    "object_rel_direction_acc",
+    "object_rel_distance_acc",
+    "object_size_estimation_acc",
+    "room_size_estimation_acc",
+    "route_planning_acc",
+]
+
+
+# Reported metric name -> fine-grained question types whose per-type accuracies
+# _compute_all_subscores averages (unweighted) into that single metric.
+COMPOSITE_METRICS = {
+    "object_rel_direction_acc": [
+        "object_rel_direction_forward_easy",
+        "object_rel_direction_backward_easy",
+        "object_rel_direction_forward_hard",
+        "object_rel_direction_backward_hard",
+    ],
+    "object_rel_distance_acc": [
+        "object_rel_distance_closest",
+        "object_rel_distance_farthest",
+    ],
+    "object_counting_acc": [
+        "object_counting_single",
+        "object_counting_multiple",
+    ],
+    "room_size_estimation_acc": [
+        "room_size_estimation_single",
+        "room_size_estimation_multiple",
+    ],
+}
+
+
+# Executed at import time. `config` is whatever load_default_template_yaml
+# returns for this module's path -- presumably the parsed _default_template_yaml
+# that sits next to this file (TODO confirm against the helper).
+config = load_default_template_yaml(__file__)
+# Directory under which revsi_doc_to_visual looks for videos, derived from the
+# template's dataset_kwargs.cache_dir with HF_HOME passed as the base directory.
+cache_dir = resolve_cache_dir(config["dataset_kwargs"]["cache_dir"], base_dir=HF_HOME)
+
+
+def revsi_doc_to_visual(doc):
+    """Return the local video path for a ReVSI document.
+
+    The path is ``<cache_dir>/<num_frames>_frame/<scene_id>.mp4``.
+
+    Args:
+        doc: Dataset record providing ``num_frames`` and ``scene_id``.
+
+    Returns:
+        A single-element list containing the video path.
+
+    Raises:
+        FileNotFoundError: If the expected video file is not on disk.
+    """
+    video_path = os.path.join(cache_dir, f"{doc['num_frames']}_frame", f"{doc['scene_id']}.mp4")
+    if not os.path.exists(video_path):
+        # FileExistsError means "already exists"; a missing file is FileNotFoundError.
+        raise FileNotFoundError(f"video path:{video_path} does not exist.")
+    return [video_path]
+
+
+def revsi_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the text prompt for a ReVSI document.
+
+    Numeric questions become ``pre_prompt / question / nq_post_prompt``;
+    multiple-choice questions additionally list the options before
+    ``mcq_post_prompt``. Parts are joined with newlines and the result is stripped.
+
+    Args:
+        doc: Dataset record with ``question``, ``question_type`` and, for
+            multiple-choice questions, ``options``.
+        lmms_eval_specific_kwargs: Optional mapping with ``pre_prompt``,
+            ``nq_post_prompt`` and ``mcq_post_prompt``; missing keys default to "".
+
+    Returns:
+        The prompt string.
+
+    Raises:
+        ValueError: If ``question_type`` is neither numeric nor multiple-choice.
+    """
+    # The parameter defaults to None, so fall back to an empty mapping rather
+    # than failing on ``None.get``.
+    prompt_kwargs = lmms_eval_specific_kwargs or {}
+    question = doc["question"]
+    question_type = doc["question_type"]
+    pre_prompt = prompt_kwargs.get("pre_prompt", "")
+    if question_type in NQ_QUESTION_TYPES:
+        post_prompt = prompt_kwargs.get("nq_post_prompt", "")
+        return "\n".join([pre_prompt, question, post_prompt]).strip()
+    if question_type in MCQ_QUESTION_TYPES:
+        options = "Options:\n" + "\n".join(doc["options"])
+        post_prompt = prompt_kwargs.get("mcq_post_prompt", "")
+        return "\n".join([pre_prompt, question, options, post_prompt]).strip()
+    # Previously fell through and returned None, handing the model an empty prompt.
+    raise ValueError(f"Unsupported ReVSI question type: {question_type!r}")
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    """Optionally shuffle the dataset before evaluation.
+
+    When the ``LMMS_EVAL_SHUFFLE_DOCS`` environment variable holds a non-empty
+    value, the documents are shuffled with the fixed seed 42; otherwise the
+    dataset is returned unchanged.
+    """
+    shuffle_flag = os.getenv("LMMS_EVAL_SHUFFLE_DOCS", None)
+    if not shuffle_flag:
+        return dataset
+    return dataset.shuffle(seed=42)
+
+
+def _mean_relative_accuracy(pred, target, start, end, interval):
+    """Return the mean relative accuracy of ``pred`` against ``target``.
+
+    The relative error ``|pred - target| / |target|`` is compared against the
+    tolerance ``1 - c`` for every confidence level ``c`` in
+    ``start, start + interval, ..., end``; the score is the fraction of levels
+    whose tolerance is met.
+
+    Args:
+        pred: Predicted numeric value.
+        target: Ground-truth numeric value.
+        start: Lowest confidence level (inclusive).
+        end: Highest confidence level (inclusive).
+        interval: Spacing between consecutive confidence levels.
+
+    Returns:
+        A float in ``[0, 1]``.
+    """
+    if target == 0:
+        # Relative error is undefined for a zero target; only an exact match counts.
+        return float(pred == target)
+    # Round before counting: (0.95 - 0.5) / 0.05 evaluates to 8.999..., so the
+    # previous ``int(... + 2)`` produced the intended 10 levels only by truncation
+    # and yielded one level too many whenever the division was exact.
+    num_pts = int(round((end - start) / interval)) + 1
+    conf_intervs = np.linspace(start, end, num_pts)
+    # abs(target) keeps the relative error non-negative for negative targets.
+    acc = (abs(pred - target) / abs(target)) <= (1 - conf_intervs)
+    return float(acc.mean())
+
+
+def revsi_process_results(doc, results):
+    """Score one model response and fan the scored record out to every metric.
+
+    The prediction is the first space-separated token of ``results[0]`` with a
+    trailing period removed. Multiple-choice questions are scored by
+    case-insensitive exact match; numeric questions by mean relative accuracy
+    over confidence levels 0.5 to 0.95 in steps of 0.05, with unparsable
+    predictions scoring 0.0.
+
+    Args:
+        doc: Dataset record with ``ground_truth`` and ``question_type``.
+        results: Model outputs; only the first element is used.
+
+    Returns:
+        A dict mapping every name in ``REVSI_METRICS`` to the same scored
+        record, which is a copy of ``doc`` plus an ``acc`` field.
+
+    Raises:
+        ValueError: If ``question_type`` is neither numeric nor multiple-choice.
+    """
+    pred_answer = str(results[0]).strip().split(" ")[0].rstrip(".").strip()
+    gt_answer = doc["ground_truth"]
+    question_type = doc["question_type"]
+    if question_type in MCQ_QUESTION_TYPES:
+        acc = 1.0 if pred_answer.lower() == gt_answer.lower() else 0.0
+    elif question_type in NQ_QUESTION_TYPES:
+        try:
+            acc = _mean_relative_accuracy(float(pred_answer), float(gt_answer), 0.5, 0.95, 0.05)
+        except (TypeError, ValueError, ZeroDivisionError):
+            # Non-numeric prediction or ground truth, or a zero ground truth:
+            # count as wrong instead of swallowing every exception.
+            acc = 0.0
+    else:
+        # Previously surfaced as an UnboundLocalError on ``acc``.
+        raise ValueError(f"Unsupported ReVSI question type: {question_type!r}")
+    # Score a shallow copy so the caller's dataset record is not mutated.
+    scored_doc = dict(doc)
+    scored_doc["acc"] = acc
+    return {metric: scored_doc for metric in REVSI_METRICS}
+
+
+def _collapse_question_types(output, metric_name, question_types):
+    """Merge per-question-type accuracies into one composite metric, in place.
+
+    Every ``<question_type>_acc`` entry present in ``output`` is removed and
+    their unweighted mean is stored under ``metric_name``. ``output`` is left
+    untouched when none of the entries are present.
+    """
+    present_keys = []
+    for question_type in question_types:
+        key = f"{question_type}_acc"
+        if key in output:
+            present_keys.append(key)
+    if present_keys:
+        popped_scores = [output.pop(key) for key in present_keys]
+        output[metric_name] = np.mean(popped_scores)
+
+
+def _compute_all_subscores(results) -> dict:
+    """Compute every reported ReVSI score from the per-sample records.
+
+    Accuracy is first averaged per ``question_type``; the fine-grained types
+    listed in ``COMPOSITE_METRICS`` are then collapsed into their composite
+    metric. ``overall_acc`` is the unweighted mean of the resulting scores, so
+    each reported category counts equally regardless of its sample count.
+
+    Args:
+        results: Iterable of scored records, each with ``question_type`` and ``acc``.
+
+    Returns:
+        A dict mapping metric names to scores, always containing ``overall_acc``.
+    """
+    results = pd.DataFrame(results)
+    if results.empty:
+        # With no samples there is no "question_type" column and groupby would
+        # raise KeyError; return the same 0.0 fallback used below.
+        return {"overall_acc": 0.0}
+    output = {
+        f"{question_type}_acc": per_question_type["acc"].mean()
+        for question_type, per_question_type in results.groupby("question_type")
+    }
+
+    for metric_name, question_types in COMPOSITE_METRICS.items():
+        _collapse_question_types(output, metric_name, question_types)
+
+    output["overall_acc"] = sum(output.values()) / len(output) if output else 0.0
+    return output
+
+
+def _aggregate_metric(results, metric_name):
+    """Return the score named ``metric_name`` for ``results``, or 0.0 if it is absent."""
+    subscores = _compute_all_subscores(results)
+    if metric_name in subscores:
+        return subscores[metric_name]
+    return 0.0
+
+
+def revsi_aggregate_overall(results):
+    """Aggregate per-sample records into the ``overall_acc`` score."""
+    overall_score = _aggregate_metric(results, metric_name="overall_acc")
+    return overall_score
+
+
+def revsi_aggregate_object_abs_distance_acc(results):
+    """Aggregate per-sample records into the ``object_abs_distance_acc`` score."""
+    category_score = _aggregate_metric(results, metric_name="object_abs_distance_acc")
+    return category_score
+
+
+def revsi_aggregate_object_counting_acc(results):
+    """Aggregate per-sample records into the ``object_counting_acc`` score."""
+    category_score = _aggregate_metric(results, metric_name="object_counting_acc")
+    return category_score
+
+
+def revsi_aggregate_object_rel_direction_acc(results):
+    """Aggregate per-sample records into the ``object_rel_direction_acc`` score."""
+    category_score = _aggregate_metric(results, metric_name="object_rel_direction_acc")
+    return category_score
+
+
+def revsi_aggregate_object_rel_distance_acc(results):
+    """Aggregate per-sample records into the ``object_rel_distance_acc`` score."""
+    category_score = _aggregate_metric(results, metric_name="object_rel_distance_acc")
+    return category_score
+
+
+def revsi_aggregate_object_size_estimation_acc(results):
+    """Aggregate per-sample records into the ``object_size_estimation_acc`` score."""
+    category_score = _aggregate_metric(results, metric_name="object_size_estimation_acc")
+    return category_score
+
+
+def revsi_aggregate_room_size_estimation_acc(results):
+    """Aggregate per-sample records into the ``room_size_estimation_acc`` score."""
+    category_score = _aggregate_metric(results, metric_name="room_size_estimation_acc")
+    return category_score
+
+
+def revsi_aggregate_route_planning_acc(results):
+    """Aggregate per-sample records into the ``route_planning_acc`` score."""
+    category_score = _aggregate_metric(results, metric_name="route_planning_acc")
+    return category_score

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+dataset_name: 16_frame`
	`2`	`+task: revsi_16_frame`
	`3`	`+include: _default_template_yaml`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+dataset_name: 32_frame`
	`2`	`+task: revsi_32_frame`
	`3`	`+include: _default_template_yaml`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+dataset_name: 64_frame`
	`2`	`+task: revsi_64_frame`
	`3`	`+include: _default_template_yaml`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+dataset_name: all_frame`
	`2`	`+task: revsi_all_frame`
	`3`	`+include: _default_template_yaml`