diff --git a/lmms_eval/models/chat/internvl_hf.py b/lmms_eval/models/chat/internvl_hf.py
index e4f73f155..ba3298782 100644
--- a/lmms_eval/models/chat/internvl_hf.py
+++ b/lmms_eval/models/chat/internvl_hf.py
@@ -249,6 +249,9 @@ def _collate(x):
                 images_kwargs["min_patches"] = self.min_patches
             if self.max_patches is not None:
                 images_kwargs["max_patches"] = self.max_patches
+            if self.num_frames is not None or self.fps is not None:
+                # InternVL only applies num_frames/fps when frame sampling is explicitly enabled.
+                videos_kwargs["do_sample_frames"] = True
             if self.num_frames is not None:
                 videos_kwargs["num_frames"] = self.num_frames
             if self.fps is not None:
@@ -260,6 +263,8 @@ def _collate(x):
             if self.accelerator.is_main_process and doc_id[0] % 100 == 0:
                 eval_logger.debug(f"Prompt for doc ID {doc_id[0]}:\n\n{text}\n")
 
+            if len(visuals) == 0:
+                visuals = None
             if len(videos) == 0:
                 videos = None
             inputs = self.processor(
@@ -275,7 +280,7 @@ def _collate(x):
             # this is safe to assume because the `grouper` object ensures it.
             gen_kwargs = all_gen_kwargs[0]
 
-            gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
+            gen_kwargs["image_sizes"] = [visual.size for visual in visuals] if visuals is not None else []
             if "max_new_tokens" not in gen_kwargs:
                 gen_kwargs["max_new_tokens"] = 1024
             if "temperature" not in gen_kwargs: