From fa366a8314097fde1fb7f184143b79a5b0d22505 Mon Sep 17 00:00:00 2001
From: akawincent
Date: Sat, 28 Mar 2026 14:31:17 +0800
Subject: [PATCH 1/2] fix: enable frame sampling in internvl_hf

---
 lmms_eval/models/chat/internvl_hf.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lmms_eval/models/chat/internvl_hf.py b/lmms_eval/models/chat/internvl_hf.py
index e4f73f155..a8d26f004 100644
--- a/lmms_eval/models/chat/internvl_hf.py
+++ b/lmms_eval/models/chat/internvl_hf.py
@@ -249,6 +249,9 @@ def _collate(x):
             images_kwargs["min_patches"] = self.min_patches
         if self.max_patches is not None:
             images_kwargs["max_patches"] = self.max_patches
+        if self.num_frames is not None or self.fps is not None:
+            # InternVL only applies num_frames/fps when frame sampling is explicitly enabled.
+            videos_kwargs["do_sample_frames"] = True
         if self.num_frames is not None:
             videos_kwargs["num_frames"] = self.num_frames
         if self.fps is not None:

From 24b7ea9ddd28465ff36ab1dead768c02b7941141 Mon Sep 17 00:00:00 2001
From: akawincent
Date: Sat, 28 Mar 2026 14:31:17 +0800
Subject: [PATCH 2/2] fix: handle video-only internvl_hf inputs

---
 lmms_eval/models/chat/internvl_hf.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lmms_eval/models/chat/internvl_hf.py b/lmms_eval/models/chat/internvl_hf.py
index a8d26f004..ba3298782 100644
--- a/lmms_eval/models/chat/internvl_hf.py
+++ b/lmms_eval/models/chat/internvl_hf.py
@@ -263,6 +263,8 @@ def _collate(x):
         if self.accelerator.is_main_process and doc_id[0] % 100 == 0:
             eval_logger.debug(f"Prompt for doc ID {doc_id[0]}:\n\n{text}\n")

+        if len(visuals) == 0:
+            visuals = None
         if len(videos) == 0:
             videos = None
         inputs = self.processor(
@@ -278,7 +280,7 @@ def _collate(x):
         # this is safe to assume because the `grouper` object ensures it.
         gen_kwargs = all_gen_kwargs[0]
-        gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
+        gen_kwargs["image_sizes"] = [visual.size for visual in visuals] if visuals is not None else []
         if "max_new_tokens" not in gen_kwargs:
             gen_kwargs["max_new_tokens"] = 1024
         if "temperature" not in gen_kwargs: