Skip to content

Commit b20464d

Browse files
authored
Support video inputs (#4360)
* support video inputs * refactor; fix video mrope * reorganize media io * rename media * cleanups; support qwen3.5 video inputs * fix ut * better modality * refactor media io; add opencv video loader; remove qwen-vl-utils codes * support interns1-pro video inputs * fix and add more ut; unify video loader outputs * minor * make copilot happy * support image_data inputs * fix ut * add init file * fix video/jpeg loading * abstract make data func * media io kwargs for video * fix kwargs for load_file * remove role assertion * fix review comments * fix ut * remove copy * fix lint
1 parent d80a6b4 commit b20464d

71 files changed

Lines changed: 1693 additions & 814 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

autotest/tools/pipeline/mllm_case.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55
from PIL import Image
66

77
from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline
8-
from lmdeploy.vl import load_image
8+
from lmdeploy.vl import encode_image_base64, load_image
99
from lmdeploy.vl.constants import IMAGE_TOKEN
10-
from lmdeploy.vl.utils import encode_image_base64
1110

1211
gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10)
1312

docs/en/multi_modal/internvl.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ import numpy as np
116116
from lmdeploy import pipeline, GenerationConfig
117117
from decord import VideoReader, cpu
118118
from lmdeploy.vl.constants import IMAGE_TOKEN
119-
from lmdeploy.vl.utils import encode_image_base64
119+
from lmdeploy.vl import encode_image_base64
120120
from PIL import Image
121121
pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')
122122

docs/en/multi_modal/minicpmv.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ print(out.text)
9797

9898
```python
9999
from lmdeploy import pipeline, GenerationConfig
100-
from lmdeploy.vl.utils import encode_image_base64
100+
from lmdeploy.vl import encode_image_base64
101101
import torch
102102
from PIL import Image
103103
from transformers import AutoModel, AutoTokenizer

docs/en/multi_modal/qwen2_5_vl.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ import numpy as np
9999
from lmdeploy import pipeline, GenerationConfig
100100
from decord import VideoReader, cpu
101101
from lmdeploy.vl.constants import IMAGE_TOKEN
102-
from lmdeploy.vl.utils import encode_image_base64
102+
from lmdeploy.vl import encode_image_base64
103103
from PIL import Image
104104
pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO')
105105

docs/zh_cn/multi_modal/internvl.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ import numpy as np
116116
from lmdeploy import pipeline, GenerationConfig
117117
from decord import VideoReader, cpu
118118
from lmdeploy.vl.constants import IMAGE_TOKEN
119-
from lmdeploy.vl.utils import encode_image_base64
119+
from lmdeploy.vl import encode_image_base64
120120
from PIL import Image
121121
pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')
122122

docs/zh_cn/multi_modal/minicpmv.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ print(out.text)
9797

9898
```python
9999
from lmdeploy import pipeline, GenerationConfig
100-
from lmdeploy.vl.utils import encode_image_base64
100+
from lmdeploy.vl import encode_image_base64
101101
import torch
102102
from PIL import Image
103103
from transformers import AutoModel, AutoTokenizer

docs/zh_cn/multi_modal/qwen2_5_vl.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ import numpy as np
9999
from lmdeploy import pipeline, GenerationConfig
100100
from decord import VideoReader, cpu
101101
from lmdeploy.vl.constants import IMAGE_TOKEN
102-
from lmdeploy.vl.utils import encode_image_base64
102+
from lmdeploy.vl import encode_image_base64
103103
from PIL import Image
104104
pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO')
105105

lmdeploy/pytorch/model_inputs.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import lmdeploy.pytorch.distributed as dist
1212
from lmdeploy.pytorch.backends import get_backend
1313
from lmdeploy.pytorch.config import CacheConfig, DLLMConfig, ModelConfig, QuantizationConfig
14-
from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
14+
from lmdeploy.pytorch.multimodal.data_type import MultiModalData
1515
from lmdeploy.pytorch.utils import CtxMgrBase, singleton
1616

1717
if TYPE_CHECKING:
@@ -66,7 +66,7 @@ class VisionModelInputs:
6666
input_embeddings: List[List[torch.Tensor]] = None
6767
input_embedding_ranges: List[torch.LongTensor] = None
6868
input_embedding_indexing: torch.BoolTensor = None
69-
input_multimodals: List[MultiModalTensor] = None
69+
input_multimodals: List[MultiModalData] = None
7070

7171
def to_device(self, device: str, non_blocking: bool = False):
7272
"""To device."""
@@ -255,7 +255,7 @@ class StepContext:
255255
local_adapter_ids: torch.LongTensor = None
256256
input_embeddings: torch.Tensor = None
257257
input_embedding_indexing: torch.Tensor = None
258-
input_multimodals: List[MultiModalTensor] = None
258+
input_multimodals: List[MultiModalData] = None
259259
vision_inputs: VisionModelInputs = None
260260
attn_metadata: Any = None
261261
kv_quant_policy: Literal[0, 4, 8] = 0

lmdeploy/pytorch/models/chatglm2.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
1111
from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
12-
from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
12+
from lmdeploy.pytorch.multimodal.data_type import MultiModalData
1313
from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding,
1414
build_rotary_params)
1515
from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_down_linear, build_gateup_linear, build_o_proj,
@@ -866,10 +866,10 @@ def preprocess_input(self,
866866
if isinstance(num_pad, torch.Tensor):
867867
num_pad = num_pad.item()
868868

869-
mm_data = MultiModalTensor(data=pixel_values,
870-
start=offset,
871-
end=offset + num_pad,
872-
meta=dict(image_token_id=image_token_id))
869+
mm_data = MultiModalData(data=pixel_values,
870+
start=offset,
871+
end=offset + num_pad,
872+
meta=dict(image_token_id=image_token_id))
873873
input_imgs.append(mm_data)
874874

875875
result = PreprocessInputResult(

lmdeploy/pytorch/models/cogvlm.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from lmdeploy.pytorch.distributed import get_tp_world_rank
1313
from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
1414
from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
15-
from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
15+
from lmdeploy.pytorch.multimodal.data_type import MultiModalData
1616
from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
1717
from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj,
1818
build_rowwise_linear)
@@ -901,10 +901,10 @@ def preprocess_input(self, input_ids: List[int], input_multimodals=None, **kwarg
901901
if isinstance(num_pad, torch.Tensor):
902902
num_pad = num_pad.item()
903903

904-
mm_data = MultiModalTensor(data=pixel_values,
905-
start=offset,
906-
end=offset + num_pad,
907-
meta=dict(image_token_id=image_token_id))
904+
mm_data = MultiModalData(data=pixel_values,
905+
start=offset,
906+
end=offset + num_pad,
907+
meta=dict(image_token_id=image_token_id))
908908
input_imgs.append(mm_data)
909909

910910
result = PreprocessInputResult(

0 commit comments

Comments (0)