Skip to content

Commit fea2e57

Browse files
authored
feat(datasets): lmms_video_utils video backend for LLaVA-OneVision-2 codec stream (#183)
* chore(datasets): allow lmms_video_utils as video_backend literal * feat(datasets): add lmms_video_utils video backend New load_video_lmms_video_utils returns (canvases, fps, CodecVideoOutput) so downstream processors aware of codec metadata can use per-patch positions and per-canvas timestamps directly. Decoder backend/device read from extra_kwargs (video_decode_backend / video_decode_device), default pyav/cpu; 'cuda' resolves to cuda:LOCAL_RANK. * feat: LlavaOv2IterableDataset + codec metadata in OV2 processor Adds llava_ov2_iterable dataset that pipes lmms_video_utils canvases plus their CodecVideoOutput metadata into LlavaOnevision2DataProcessor. The processor gains a video_metadata kwarg: when supplied, it bypasses OV2's video_processor and constructs pixel_values_videos / video_grid_thw / patch_positions / frame_timestamps directly from the codec output, preserving real source-frame coordinates instead of falling back to arange(T). The frame-sampling path is untouched. * chore(deps): add lmms-video-utils[all] as optional video extra Pulled in by [video] and rolled up into [all] so existing 'install .[all]' workflows get torchcodec-backed codec-stream support for free, while base installs stay slim. * refactor: extract CodecVideoLoadingMixin from LlavaOv2IterableDataset Move the dataset-level codec video orchestration (collecting canvases plus their CodecVideoOutput metadata across a message list) into a reusable CodecVideoLoadingMixin alongside multimodal_mixin. The backend implementation (load_video_lmms_video_utils) stays in MultiModalDataLoadingMixin; LlavaOv2IterableDataset now inherits the mixin and only handles image collection plus processor dispatch.
1 parent e7cdc94 commit fea2e57

7 files changed

Lines changed: 378 additions & 9 deletions

File tree

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,15 @@ eval = [
7373
"httpx",
7474
]
7575

76+
video = [
77+
"lmms-video-utils[all]>=0.1.0",
78+
]
79+
7680
all = [
7781
"lmms_engine[pref]",
7882
"lmms_engine[storage]",
7983
"lmms_engine[eval]",
84+
"lmms_engine[video]",
8085
]
8186

8287

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
"""Mixin for loading videos through the ``lmms_video_utils`` codec backend.
2+
3+
Separates the dataset-level codec orchestration (collecting canvases plus
4+
their ``CodecVideoOutput`` metadata across a message list) from both the
5+
backend implementation (``load_video_lmms_video_utils`` lives in
6+
``MultiModalDataLoadingMixin``) and the concrete dataset class. A dataset
7+
that mixes this in can pass the collected ``video_metadata`` straight into a
8+
codec-aware processor (e.g. LLaVA-OneVision-2).
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import os
14+
from typing import Any, Dict, List, Optional, Tuple
15+
16+
17+
class CodecVideoLoadingMixin:
18+
"""Collects codec-stream video inputs from OpenAI-style messages.
19+
20+
Requires the host class to also provide
21+
``load_video_lmms_video_utils`` (from ``MultiModalDataLoadingMixin``)
22+
and a ``config`` with ``video_backend`` / ``fps``.
23+
"""
24+
25+
def load_codec_videos(
26+
self,
27+
video_path: str,
28+
data_folder: Optional[str] = None,
29+
fps: int = 1,
30+
video_kwargs: Optional[Dict[str, Any]] = None,
31+
) -> Tuple[Any, float, Any]:
32+
assert (
33+
self.config.video_backend == "lmms_video_utils"
34+
), "CodecVideoLoadingMixin requires video_backend='lmms_video_utils'"
35+
if data_folder is not None:
36+
video_path = os.path.join(data_folder, video_path)
37+
return self.load_video_lmms_video_utils(video_path, fps, video_kwargs=video_kwargs)
38+
39+
def collect_codec_video_inputs(
40+
self,
41+
messages: List[dict],
42+
data_folder: Optional[str] = None,
43+
) -> Tuple[List[Any], List[Any], Optional[float]]:
44+
"""Walk ``messages`` and load every ``video_url`` via the codec
45+
backend.
46+
47+
Returns ``(videos, video_metadata_list, sample_fps)`` where
48+
``videos`` are the canvas arrays, ``video_metadata_list`` holds the
49+
matching ``CodecVideoOutput`` objects, and ``sample_fps`` is the fps
50+
of the last loaded video (or ``None`` if no video was present).
51+
"""
52+
videos: List[Any] = []
53+
video_metadata_list: List[Any] = []
54+
sample_fps: Optional[float] = None
55+
56+
for message in messages:
57+
for content in message["content"]:
58+
if content.get("type") != "video_url":
59+
continue
60+
video_url = content["video_url"]
61+
extra = {k: v for k, v in video_url.items() if k != "url" and v is not None}
62+
frames, sample_fps, codec_output = self.load_codec_videos(
63+
video_url["url"],
64+
data_folder=data_folder,
65+
fps=self.config.fps,
66+
video_kwargs=extra or None,
67+
)
68+
videos.append(frames)
69+
video_metadata_list.append(codec_output)
70+
71+
return videos, video_metadata_list, sample_fps

src/lmms_engine/datasets/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ class DatasetConfig(Args):
3838
video_min_pixels: Optional[int] = 3136
3939
frame_num: Optional[int] = 64
4040
fps: Optional[int] = 1
41-
video_backend: Optional[Literal["decord", "qwen_vl_utils", "qwen_omni_utils"]] = "qwen_vl_utils"
41+
video_backend: Optional[Literal["decord", "qwen_vl_utils", "qwen_omni_utils", "lmms_video_utils"]] = "qwen_vl_utils"
4242

4343
@field_validator(
4444
"video_max_pixels",

src/lmms_engine/datasets/iterable/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .bagel_iterable_dataset import BagelIterableDataset
22
from .base_iterable_dataset import BaseIterableDataset
33
from .fineweb_edu_dataset import FinewebEduDataset
4+
from .llava_ov2_iterable_dataset import LlavaOv2IterableDataset
45
from .multimodal_iterable_dataset import MultiModalIterableDataset
56
from .qwen3_vl_iterable_dataset import Qwen3VLIterableDataset
67
from .qwen_omni_iterable_dataset import QwenOmniIterableDataset
@@ -12,6 +13,7 @@
1213
"MultiModalIterableDataset",
1314
"VisionSFTIterableDataset",
1415
"BagelIterableDataset",
16+
"LlavaOv2IterableDataset",
1517
"Qwen3VLIterableDataset",
1618
"QwenOmniIterableDataset",
1719
]
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import json
2+
import os
3+
from typing import Any, Dict
4+
5+
import torch
6+
from PIL import Image
7+
8+
from lmms_engine.datasets.codec_video_mixin import CodecVideoLoadingMixin
9+
from lmms_engine.datasets.iterable.vision_iterable_dataset import (
10+
VisionSFTIterableDataset,
11+
)
12+
from lmms_engine.mapping_func import register_dataset
13+
from lmms_engine.utils.train_utils import TrainUtilities
14+
15+
16+
@register_dataset("llava_ov2_iterable")
17+
class LlavaOv2IterableDataset(CodecVideoLoadingMixin, VisionSFTIterableDataset):
18+
"""Iterable dataset for LLaVA-OneVision-2 with codec-stream video input.
19+
20+
Reuses ``VisionSFTIterableDataset`` plumbing but routes video loading
21+
through the ``lmms_video_utils`` backend (via ``CodecVideoLoadingMixin``)
22+
so each video produces a ``CodecVideoOutput`` (canvases + patch_positions
23+
+ source_pts) that the downstream processor can consume directly instead
24+
of re-deriving timestamps from frame index.
25+
"""
26+
27+
def load_from_json(self, data, data_folder=None) -> Dict[str, torch.Tensor]:
28+
images_list = []
29+
kwargs: Dict[str, Any] = {}
30+
messages = data["messages"]
31+
if isinstance(messages, str):
32+
messages = json.loads(messages)
33+
34+
for message in messages:
35+
for content in message["content"]:
36+
if content["type"] == "image_url":
37+
images_list.append(content["image_url"]["url"])
38+
39+
videos, video_metadata_list, sample_fps = self.collect_codec_video_inputs(messages, data_folder=data_folder)
40+
if sample_fps is not None:
41+
kwargs["fps"] = sample_fps
42+
43+
hf_messages = TrainUtilities.convert_open_to_hf(messages)
44+
if data_folder is not None:
45+
images = [Image.open(os.path.join(data_folder, image)) for image in images_list]
46+
else:
47+
images = [Image.open(image) for image in images_list]
48+
if len(images) == 0:
49+
images = None
50+
if len(videos) == 0:
51+
videos = None
52+
else:
53+
kwargs["video_metadata"] = video_metadata_list
54+
55+
inputs = self.processor.process(images=images, hf_messages=hf_messages, videos=videos, **kwargs)
56+
return inputs

src/lmms_engine/datasets/multimodal_mixin.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,8 @@ def load_videos(
144144
return self.load_video_qwen_vl_utils(video_path, fps, video_kwargs=video_kwargs)
145145
elif self.config.video_backend == "qwen_omni_utils":
146146
return self.load_video_qwen_omni_utils(video_path, fps)
147+
elif self.config.video_backend == "lmms_video_utils":
148+
return self.load_video_lmms_video_utils(video_path, fps, video_kwargs=video_kwargs)
147149
else:
148150
raise ValueError(f"Video backend {self.config.video_backend} not supported")
149151

@@ -280,3 +282,80 @@ def load_video_qwen_omni_utils(
280282
return video_frames, sample_fps
281283
else:
282284
raise ValueError("No video frames returned from process_mm_info")
285+
286+
def load_video_lmms_video_utils(
287+
self,
288+
video_path: str,
289+
fps: int,
290+
video_kwargs: Optional[Dict[str, Any]] = None,
291+
) -> Tuple[np.ndarray, float, Any]:
292+
"""
293+
Load video using lmms_video_utils codec-stream backend.
294+
295+
Returns canvases (codec-packed "frames") plus the full
296+
``CodecVideoOutput`` metadata so a downstream processor that
297+
understands per-patch positions (e.g. LLaVA-OneVision-2) can use
298+
them instead of re-deriving timestamps from frame index.
299+
300+
Decoder selection is read from ``config.extra_kwargs``:
301+
``video_decode_backend``: "auto" | "torchcodec" | "pyav"
302+
(default: "pyav")
303+
``video_decode_device``: "cpu" | "cuda" | "cuda:N"
304+
(default: "cpu"; "cuda" resolves to cuda:LOCAL_RANK)
305+
306+
Args:
307+
video_path: Path to video file.
308+
fps: Target fps; mapped to ``target_fps`` when sampling strategy
309+
is ``"fps"``.
310+
video_kwargs: Optional extra fields. Supported keys mirror
311+
qwen-vl-utils (``video_start``/``video_end``/``fps``/
312+
``nframes``/``max_pixels``/``min_pixels``); plus any
313+
``CodecConfig`` field (``score_mode``, ``gop_mode``,
314+
``target_canvas`` etc.).
315+
316+
Returns:
317+
Tuple of (frames [T, H, W, C] np.uint8, sample_fps, codec_output).
318+
"""
319+
from lmms_video_utils import fetch_codec_video
320+
321+
extra = self.config.extra_kwargs or {}
322+
decode_backend = extra.get("video_decode_backend", "pyav")
323+
decode_device = extra.get("video_decode_device", "cpu")
324+
if decode_device == "cuda":
325+
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
326+
decode_device = f"cuda:{local_rank}"
327+
328+
overrides: Dict[str, Any] = {
329+
"backend": decode_backend,
330+
"device": decode_device,
331+
}
332+
if self.config.video_max_pixels is not None:
333+
overrides["max_pixels"] = int(self.config.video_max_pixels)
334+
if self.config.video_min_pixels is not None:
335+
overrides["min_pixels"] = int(self.config.video_min_pixels)
336+
if self.config.video_max_frames is not None:
337+
overrides["max_frames"] = int(self.config.video_max_frames)
338+
339+
if self.config.video_sampling_strategy == "fps":
340+
overrides["target_fps"] = float(fps)
341+
elif self.config.video_sampling_strategy == "frame_num":
342+
overrides["max_frames"] = int(self.config.frame_num)
343+
344+
if video_kwargs:
345+
qwen_to_ours = {
346+
"video_start": "start_time",
347+
"video_end": "end_time",
348+
"fps": "target_fps",
349+
"nframes": "max_frames",
350+
"max_pixels": "max_pixels",
351+
"min_pixels": "min_pixels",
352+
}
353+
for k, v in video_kwargs.items():
354+
overrides[qwen_to_ours.get(k, k)] = v
355+
356+
codec_output = fetch_codec_video(video_path, **overrides)
357+
canvases = codec_output.canvases.cpu().numpy()
358+
if canvases.ndim == 4 and canvases.shape[1] in (1, 3, 4):
359+
canvases = np.transpose(canvases, (0, 2, 3, 1))
360+
sample_fps = float(codec_output.fps)
361+
return canvases, sample_fps, codec_output

0 commit comments

Comments
 (0)