refactor: extract CodecVideoLoadingMixin from LlavaOv2IterableDataset

kcz358 · kcz358 · commit 714856feefcc · 2026-06-26T00:29:03.000-07:00
Move the dataset-level codec video orchestration (collecting canvases
plus their CodecVideoOutput metadata across a message list) into a
reusable CodecVideoLoadingMixin alongside multimodal_mixin. The backend
implementation (load_video_lmms_video_utils) stays in
MultiModalDataLoadingMixin; LlavaOv2IterableDataset now inherits
CodecVideoLoadingMixin and only handles image collection plus processor
dispatch.
diff --git a/src/lmms_engine/datasets/codec_video_mixin.py b/src/lmms_engine/datasets/codec_video_mixin.py
@@ -0,0 +1,92 @@
+"""Mixin for loading videos through the ``lmms_video_utils`` codec backend.
+
+Separates the dataset-level codec orchestration (collecting canvases plus
+their ``CodecVideoOutput`` metadata across a message list) from both the
+backend implementation (``load_video_lmms_video_utils`` lives in
+``MultiModalDataLoadingMixin``) and the concrete dataset class. A dataset
+that mixes this in can pass the collected ``video_metadata`` straight into a
+codec-aware processor (e.g. LLaVA-OneVision-2).
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+
+class CodecVideoLoadingMixin:
+    """Collects codec-stream video inputs from OpenAI-style messages.
+
+    Requires the host class to also provide
+    ``load_video_lmms_video_utils`` (from ``MultiModalDataLoadingMixin``)
+    and a ``config`` with ``video_backend`` / ``fps``.
+    """
+
+    def load_codec_videos(
+        self,
+        video_path: str,
+        data_folder: Optional[str] = None,
+        fps: int = 1,
+        video_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[Any, float, Any]:
+        """Load one video through ``load_video_lmms_video_utils``.
+
+        Args:
+            video_path: Path of the video; joined onto ``data_folder`` when
+                one is given.
+            data_folder: Optional directory prepended to ``video_path``.
+            fps: Frames-per-second value forwarded to the backend loader.
+            video_kwargs: Optional extra options forwarded to the backend
+                loader unchanged.
+
+        Returns:
+            The backend loader's result, which callers in this mixin unpack
+            as ``(frames, sample_fps, codec_output)``.
+
+        Raises:
+            AssertionError: If ``config.video_backend`` is not
+                ``"lmms_video_utils"``.
+        """
+        # Raise explicitly rather than via ``assert`` so the backend check
+        # still runs under ``python -O``; the exception type is unchanged.
+        if self.config.video_backend != "lmms_video_utils":
+            raise AssertionError("CodecVideoLoadingMixin requires video_backend='lmms_video_utils'")
+        if data_folder is not None:
+            video_path = os.path.join(data_folder, video_path)
+        return self.load_video_lmms_video_utils(video_path, fps, video_kwargs=video_kwargs)
+
+    def collect_codec_video_inputs(
+        self,
+        messages: List[dict],
+        data_folder: Optional[str] = None,
+    ) -> Tuple[List[Any], List[Any], Optional[float]]:
+        """Walk ``messages`` and load every ``video_url`` via the codec
+        backend.
+
+        Returns ``(videos, video_metadata_list, sample_fps)`` where
+        ``videos`` are the canvas arrays, ``video_metadata_list`` holds the
+        matching ``CodecVideoOutput`` objects, and ``sample_fps`` is the fps
+        of the last loaded video (or ``None`` if no video was present).
+        """
+        videos: List[Any] = []
+        video_metadata_list: List[Any] = []
+        sample_fps: Optional[float] = None
+
+        for message in messages:
+            for content in message["content"]:
+                if content.get("type") != "video_url":
+                    continue
+                video_url = content["video_url"]
+                # Forward every non-null key other than "url" as loader
+                # options; an empty dict is passed on as ``None``.
+                extra = {k: v for k, v in video_url.items() if k != "url" and v is not None}
+                frames, sample_fps, codec_output = self.load_codec_videos(
+                    video_url["url"],
+                    data_folder=data_folder,
+                    fps=self.config.fps,
+                    video_kwargs=extra or None,
+                )
+                videos.append(frames)
+                video_metadata_list.append(codec_output)
+
+        return videos, video_metadata_list, sample_fps
diff --git a/src/lmms_engine/datasets/iterable/llava_ov2_iterable_dataset.py b/src/lmms_engine/datasets/iterable/llava_ov2_iterable_dataset.py
@@ -1,11 +1,11 @@
 import json
 import os
-from typing import Any, Dict, Tuple
+from typing import Any, Dict
 
-import numpy as np
 import torch
 from PIL import Image
 
+from lmms_engine.datasets.codec_video_mixin import CodecVideoLoadingMixin
 from lmms_engine.datasets.iterable.vision_iterable_dataset import (
     VisionSFTIterableDataset,
 )
@@ -14,40 +14,31 @@
 
 
 @register_dataset("llava_ov2_iterable")
-class LlavaOv2IterableDataset(VisionSFTIterableDataset):
+class LlavaOv2IterableDataset(CodecVideoLoadingMixin, VisionSFTIterableDataset):
     """Iterable dataset for LLaVA-OneVision-2 with codec-stream video input.
 
     Reuses ``VisionSFTIterableDataset`` plumbing but routes video loading
-    through the ``lmms_video_utils`` backend so each video produces a
-    ``CodecVideoOutput`` (canvases + patch_positions + source_pts) that the
-    downstream processor can consume directly instead of re-deriving
-    timestamps from frame index.
+    through the ``lmms_video_utils`` backend (via ``CodecVideoLoadingMixin``)
+    so each video produces a ``CodecVideoOutput`` (canvases + patch_positions
+    + source_pts) that the downstream processor can consume directly instead
+    of re-deriving timestamps from frame index.
     """
 
     def load_from_json(self, data, data_folder=None) -> Dict[str, torch.Tensor]:
         images_list = []
-        videos = []
-        video_metadata_list = []
         kwargs: Dict[str, Any] = {}
         messages = data["messages"]
         if isinstance(messages, str):
             messages = json.loads(messages)
+
         for message in messages:
             for content in message["content"]:
                 if content["type"] == "image_url":
                     images_list.append(content["image_url"]["url"])
-                elif content["type"] == "video_url":
-                    video_url = content["video_url"]
-                    extra = {k: v for k, v in video_url.items() if k != "url" and v is not None}
-                    frames, sample_fps, codec_output = self.load_videos(
-                        video_url["url"],
-                        data_folder=data_folder,
-                        fps=self.config.fps,
-                        video_kwargs=extra or None,
-                    )
-                    videos.append(frames)
-                    video_metadata_list.append(codec_output)
-                    kwargs["fps"] = sample_fps
+
+        videos, video_metadata_list, sample_fps = self.collect_codec_video_inputs(messages, data_folder=data_folder)
+        if sample_fps is not None:
+            kwargs["fps"] = sample_fps
 
         hf_messages = TrainUtilities.convert_open_to_hf(messages)
         if data_folder is not None:
@@ -58,23 +49,8 @@ def load_from_json(self, data, data_folder=None) -> Dict[str, torch.Tensor]:
             images = None
         if len(videos) == 0:
             videos = None
-            video_metadata_list = None
-        if video_metadata_list is not None:
+        else:
             kwargs["video_metadata"] = video_metadata_list
 
         inputs = self.processor.process(images=images, hf_messages=hf_messages, videos=videos, **kwargs)
         return inputs
-
-    def load_videos(
-        self,
-        video_path: str,
-        data_folder=None,
-        fps: int = 1,
-        video_kwargs=None,
-    ) -> Tuple[np.ndarray, float, Any]:
-        assert (
-            self.config.video_backend == "lmms_video_utils"
-        ), "LlavaOv2IterableDataset only supports lmms_video_utils backend"
-        if data_folder is not None:
-            video_path = os.path.join(data_folder, video_path)
-        return self.load_video_lmms_video_utils(video_path, fps, video_kwargs=video_kwargs)