diff --git a/lmms_eval/models/chat/internvl_hf.py b/lmms_eval/models/chat/internvl_hf.py index e4f73f155..ba3298782 100644 --- a/lmms_eval/models/chat/internvl_hf.py +++ b/lmms_eval/models/chat/internvl_hf.py @@ -249,6 +249,9 @@ def _collate(x): images_kwargs["min_patches"] = self.min_patches if self.max_patches is not None: images_kwargs["max_patches"] = self.max_patches + if self.num_frames is not None or self.fps is not None: + # InternVL only applies num_frames/fps when frame sampling is explicitly enabled. + videos_kwargs["do_sample_frames"] = True if self.num_frames is not None: videos_kwargs["num_frames"] = self.num_frames if self.fps is not None: @@ -260,6 +263,8 @@ def _collate(x): if self.accelerator.is_main_process and doc_id[0] % 100 == 0: eval_logger.debug(f"Prompt for doc ID {doc_id[0]}:\n\n{text}\n") + if len(visuals) == 0: + visuals = None if len(videos) == 0: videos = None inputs = self.processor( @@ -275,7 +280,7 @@ def _collate(x): # this is safe to assume because the `grouper` object ensures it. gen_kwargs = all_gen_kwargs[0] - gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + gen_kwargs["image_sizes"] = [visual.size for visual in visuals] if visuals is not None else [] if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 if "temperature" not in gen_kwargs: