Skip to content

Commit 714856f

Browse files
committed
refactor: extract CodecVideoLoadingMixin from LlavaOv2IterableDataset
Move the dataset-level codec video orchestration (collecting canvases plus their CodecVideoOutput metadata across a message list) into a reusable CodecVideoLoadingMixin alongside multimodal_mixin. The backend implementation (load_video_lmms_video_utils) stays in MultiModalDataLoadingMixin; LlavaOv2IterableDataset now inherits the mixin and only handles image collection plus processor dispatch.
1 parent 9ca9323 commit 714856f

2 files changed

Lines changed: 84 additions & 37 deletions

File tree

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
"""Mixin for loading videos through the ``lmms_video_utils`` codec backend.
2+
3+
Separates the dataset-level codec orchestration (collecting canvases plus
4+
their ``CodecVideoOutput`` metadata across a message list) from both the
5+
backend implementation (``load_video_lmms_video_utils`` lives in
6+
``MultiModalDataLoadingMixin``) and the concrete dataset class. A dataset
7+
that mixes this in can pass the collected ``video_metadata`` straight into a
8+
codec-aware processor (e.g. LLaVA-OneVision-2).
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import os
14+
from typing import Any, Dict, List, Optional, Tuple
15+
16+
17+
class CodecVideoLoadingMixin:
    """Collects codec-stream video inputs from OpenAI-style messages.

    Requires the host class to also provide
    ``load_video_lmms_video_utils`` (from ``MultiModalDataLoadingMixin``)
    and a ``config`` with ``video_backend`` / ``fps``.
    """

    def load_codec_videos(
        self,
        video_path: str,
        data_folder: Optional[str] = None,
        fps: float = 1,
        video_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Any, float, Any]:
        """Load a single video through the ``lmms_video_utils`` backend.

        Args:
            video_path: Path of the video; joined onto ``data_folder`` when
                one is given.
            data_folder: Optional root directory prepended to ``video_path``.
            fps: Sampling rate forwarded to the backend loader.
            video_kwargs: Optional extra options forwarded to the backend
                loader as ``video_kwargs``.

        Returns:
            The result of ``self.load_video_lmms_video_utils`` unchanged;
            ``collect_codec_video_inputs`` unpacks it as
            ``(frames, sample_fps, codec_output)``.

        Raises:
            AssertionError: If ``self.config.video_backend`` is not
                ``"lmms_video_utils"``.
        """
        # Explicit raise rather than an ``assert`` statement: asserts are
        # stripped under ``python -O``, which would let a misconfigured
        # backend fall through to the codec loader. The exception type is
        # kept as AssertionError so existing callers see the same error.
        if self.config.video_backend != "lmms_video_utils":
            raise AssertionError("CodecVideoLoadingMixin requires video_backend='lmms_video_utils'")
        if data_folder is not None:
            video_path = os.path.join(data_folder, video_path)
        return self.load_video_lmms_video_utils(video_path, fps, video_kwargs=video_kwargs)

    def collect_codec_video_inputs(
        self,
        messages: List[dict],
        data_folder: Optional[str] = None,
    ) -> Tuple[List[Any], List[Any], Optional[float]]:
        """Walk ``messages`` and load every ``video_url`` via the codec
        backend.

        Messages whose ``content`` is a plain string or ``None`` (both are
        allowed in OpenAI-style chats) carry no video parts and are skipped.

        Args:
            messages: OpenAI-style message dicts, each with a ``content``
                entry that is normally a list of typed content parts.
            data_folder: Optional root directory passed on to
                ``load_codec_videos``.

        Returns:
            ``(videos, video_metadata_list, sample_fps)`` where ``videos``
            are the canvas arrays, ``video_metadata_list`` holds the matching
            ``CodecVideoOutput`` objects, and ``sample_fps`` is the fps of
            the last loaded video (or ``None`` if no video was present).
        """
        videos: List[Any] = []
        video_metadata_list: List[Any] = []
        sample_fps: Optional[float] = None

        for message in messages:
            parts = message["content"]
            # A string would be iterated character by character and fail on
            # ``.get``; ``None`` is not iterable at all. Neither can hold a
            # ``video_url`` part, so there is nothing to load.
            if parts is None or isinstance(parts, str):
                continue
            for content in parts:
                if content.get("type") != "video_url":
                    continue
                video_url = content["video_url"]
                # Every key other than "url" with a non-None value is
                # forwarded to the loader as a per-video option.
                extra = {k: v for k, v in video_url.items() if k != "url" and v is not None}
                frames, sample_fps, codec_output = self.load_codec_videos(
                    video_url["url"],
                    data_folder=data_folder,
                    fps=self.config.fps,
                    video_kwargs=extra or None,
                )
                videos.append(frames)
                video_metadata_list.append(codec_output)

        return videos, video_metadata_list, sample_fps
Lines changed: 13 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import json
22
import os
3-
from typing import Any, Dict, Tuple
3+
from typing import Any, Dict
44

5-
import numpy as np
65
import torch
76
from PIL import Image
87

8+
from lmms_engine.datasets.codec_video_mixin import CodecVideoLoadingMixin
99
from lmms_engine.datasets.iterable.vision_iterable_dataset import (
1010
VisionSFTIterableDataset,
1111
)
@@ -14,40 +14,31 @@
1414

1515

1616
@register_dataset("llava_ov2_iterable")
17-
class LlavaOv2IterableDataset(VisionSFTIterableDataset):
17+
class LlavaOv2IterableDataset(CodecVideoLoadingMixin, VisionSFTIterableDataset):
1818
"""Iterable dataset for LLaVA-OneVision-2 with codec-stream video input.
1919
2020
Reuses ``VisionSFTIterableDataset`` plumbing but routes video loading
21-
through the ``lmms_video_utils`` backend so each video produces a
22-
``CodecVideoOutput`` (canvases + patch_positions + source_pts) that the
23-
downstream processor can consume directly instead of re-deriving
24-
timestamps from frame index.
21+
through the ``lmms_video_utils`` backend (via ``CodecVideoLoadingMixin``)
22+
so each video produces a ``CodecVideoOutput`` (canvases + patch_positions
23+
+ source_pts) that the downstream processor can consume directly instead
24+
of re-deriving timestamps from frame index.
2525
"""
2626

2727
def load_from_json(self, data, data_folder=None) -> Dict[str, torch.Tensor]:
2828
images_list = []
29-
videos = []
30-
video_metadata_list = []
3129
kwargs: Dict[str, Any] = {}
3230
messages = data["messages"]
3331
if isinstance(messages, str):
3432
messages = json.loads(messages)
33+
3534
for message in messages:
3635
for content in message["content"]:
3736
if content["type"] == "image_url":
3837
images_list.append(content["image_url"]["url"])
39-
elif content["type"] == "video_url":
40-
video_url = content["video_url"]
41-
extra = {k: v for k, v in video_url.items() if k != "url" and v is not None}
42-
frames, sample_fps, codec_output = self.load_videos(
43-
video_url["url"],
44-
data_folder=data_folder,
45-
fps=self.config.fps,
46-
video_kwargs=extra or None,
47-
)
48-
videos.append(frames)
49-
video_metadata_list.append(codec_output)
50-
kwargs["fps"] = sample_fps
38+
39+
videos, video_metadata_list, sample_fps = self.collect_codec_video_inputs(messages, data_folder=data_folder)
40+
if sample_fps is not None:
41+
kwargs["fps"] = sample_fps
5142

5243
hf_messages = TrainUtilities.convert_open_to_hf(messages)
5344
if data_folder is not None:
@@ -58,23 +49,8 @@ def load_from_json(self, data, data_folder=None) -> Dict[str, torch.Tensor]:
5849
images = None
5950
if len(videos) == 0:
6051
videos = None
61-
video_metadata_list = None
62-
if video_metadata_list is not None:
52+
else:
6353
kwargs["video_metadata"] = video_metadata_list
6454

6555
inputs = self.processor.process(images=images, hf_messages=hf_messages, videos=videos, **kwargs)
6656
return inputs
67-
68-
def load_videos(
69-
self,
70-
video_path: str,
71-
data_folder=None,
72-
fps: int = 1,
73-
video_kwargs=None,
74-
) -> Tuple[np.ndarray, float, Any]:
75-
assert (
76-
self.config.video_backend == "lmms_video_utils"
77-
), "LlavaOv2IterableDataset only supports lmms_video_utils backend"
78-
if data_folder is not None:
79-
video_path = os.path.join(data_folder, video_path)
80-
return self.load_video_lmms_video_utils(video_path, fps, video_kwargs=video_kwargs)

0 commit comments

Comments
 (0)