Commit a62dac2

WIP: support video and audio, refactor

1 parent 9101c71

9 files changed: +453 -132 lines

lmdeploy/pytorch/models/qwen3_omni_moe_thinker.py (+232 -47)

Large diff not rendered by default.

lmdeploy/pytorch/models/qwen3_vl.py (+2 -2)

@@ -899,7 +899,7 @@ def _make_image_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData:
         offset = input_mm['offset']
         start = offset
         image_token_id = input_mm['image_token_id']
-        num_pad = input_mm['image_tokens']
+        num_pad = input_mm['mm_token_num']
         if isinstance(num_pad, torch.Tensor):
             num_pad = num_pad.item()

@@ -917,7 +917,7 @@ def _make_video_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData:
         offset = input_mm['offset']
         start = offset
         video_token_id = input_mm['video_token_id']
-        num_pad = input_mm['video_tokens']
+        num_pad = input_mm['mm_token_num']
         if isinstance(num_pad, torch.Tensor):
             num_pad = num_pad.item()

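Both `_make_image_mm_data` and `_make_video_mm_data` now read the same modality-agnostic key, so the pad-count logic no longer branches on modality. A minimal sketch of the dict shape the two helpers expect after this change (field values are illustrative, not taken from the commit):

```python
import torch

# Hypothetical preprocessing results; only the key layout mirrors the commit.
image_mm = dict(offset=5, image_token_id=151655, mm_token_num=729)
video_mm = dict(offset=40, video_token_id=151656, mm_token_num=torch.tensor(1458))

for input_mm in (image_mm, video_mm):
    num_pad = input_mm['mm_token_num']  # unified key, was image_tokens/video_tokens
    if isinstance(num_pad, torch.Tensor):
        num_pad = num_pad.item()
```
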
lmdeploy/pytorch/multimodal/data_type.py (+0 -1)

@@ -15,7 +15,6 @@ class MultiModalData:
     start: int
     end: int = None
     meta: Dict[str, Any] = None
-
     modality: Modality = Modality.IMAGE

     def __post_init__(self):

lmdeploy/serve/processors/multimodal.py (+6 -1)

@@ -8,6 +8,7 @@
 from lmdeploy.tokenizer import Tokenizer
 from lmdeploy.utils import get_logger
 from lmdeploy.vl.constants import Modality
+from lmdeploy.vl.media.audio import AudioMediaIO
 from lmdeploy.vl.media.connection import load_from_url
 from lmdeploy.vl.media.image import ImageMediaIO
 from lmdeploy.vl.media.time_series import TimeSeriesMediaIO

@@ -124,6 +125,10 @@ def _parse_multimodal_item(i: int, in_messages: List[Dict], out_messages: List[D
             vid_io = VideoMediaIO(image_io=ImageMediaIO(), **media_io_kwargs.get('video', {}))
             data, metadata = load_from_url(data_src, vid_io)
             item_params['video_metadata'] = metadata
+        elif item_type == 'audio_url':
+            modality = Modality.AUDIO
+            audio_io = AudioMediaIO(**media_io_kwargs.get('audio', {}))
+            data = load_from_url(data_src, audio_io)
         elif item_type == 'time_series_url':
             modality = Modality.TIME_SERIES
             ts_io = TimeSeriesMediaIO(**media_io_kwargs.get('time_series', {}))

@@ -304,7 +309,7 @@ def _re_format_prompt_images_pair(prompt: Tuple) -> Dict:

     def _has_multimodal_input(self, messages: List[Dict]) -> bool:
         """Check if messages contain multimodal input (images)."""
-        multimodal_types = ['image_url', 'image_data', 'video_url', 'time_series_url']
+        multimodal_types = ['image_url', 'image_data', 'video_url', 'audio_url', 'time_series_url']
         return any(
             isinstance(message.get('content'), list) and any(
                 item.get('type') in multimodal_types for item in message['content']) for message in messages)
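
With the new `audio_url` branch, an OpenAI-style request can carry audio items alongside text. A hedged sketch of such a message (the field layout mirrors the existing `image_url`/`video_url` items by analogy; the URL is a placeholder):

```python
# Hypothetical request body exercising the new audio_url branch.
messages = [{
    'role': 'user',
    'content': [
        {'type': 'text', 'text': 'Transcribe this clip.'},
        {'type': 'audio_url', 'audio_url': {'url': 'https://example.com/clip.wav'}},
    ],
}]
```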

lmdeploy/utils.py (+4 -0)

@@ -305,6 +305,10 @@ def _get_and_verify_max_len(
     for key in llm_keys:
         hf_config = getattr(hf_config, key, hf_config)

+    # for qwen3-omni thinker
+    if hasattr(hf_config, 'thinker_config'):
+        hf_config = hf_config.thinker_config.text_config
+
     logger = get_logger('lmdeploy')
     derived_max_model_len = float('inf')
     possible_keys = [
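
The hook exists because Qwen3-Omni style configs nest the language model under `thinker_config.text_config`, so length keys such as `max_position_embeddings` do not live on the top-level config. A minimal sketch of the traversal (the config shape below is an assumption, not the full HF config):

```python
from types import SimpleNamespace

# Illustrative Qwen3-Omni-like config nesting; values are made up.
hf_config = SimpleNamespace(thinker_config=SimpleNamespace(
    text_config=SimpleNamespace(max_position_embeddings=32768)))

# The added lines descend into the text config before length keys are probed.
if hasattr(hf_config, 'thinker_config'):
    hf_config = hf_config.thinker_config.text_config
assert hf_config.max_position_embeddings == 32768
```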

lmdeploy/vl/media/audio.py (new file, +57)

@@ -0,0 +1,57 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/media/audio.py
+
+import base64
+from io import BytesIO
+from pathlib import Path
+
+import numpy.typing as npt
+
+from .base import MediaIO
+
+try:
+    import librosa
+except ImportError:
+    raise ImportError('Please install librosa via `pip install librosa`.')
+
+try:
+    import soundfile
+except ImportError:
+    raise ImportError('Please install soundfile via `pip install soundfile`.')
+
+
+class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__()
+
+        # for potential custom arguments from --media-io-kwargs
+        self.kwargs = kwargs
+
+    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
+        # sr=None preserves the original sampling rate of the audio file
+        return librosa.load(BytesIO(data), sr=None)
+
+    def load_base64(
+        self,
+        media_type: str,
+        data: str,
+    ) -> tuple[npt.NDArray, float]:
+        return self.load_bytes(base64.b64decode(data))
+
+    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
+        return librosa.load(filepath, sr=None)
+
+    def encode_base64(
+        self,
+        media: tuple[npt.NDArray, int],
+        *,
+        audio_format: str = 'WAV',
+    ) -> str:
+        audio, sr = media
+
+        with BytesIO() as buffer:
+            soundfile.write(buffer, audio, sr, format=audio_format)
+            data = buffer.getvalue()
+
+        return base64.b64encode(data).decode('utf-8')
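
A quick usage sketch of the new class, round-tripping an in-memory waveform through `encode_base64` and `load_base64` (assumes librosa and soundfile are installed; the sine wave is a stand-in for real audio):

```python
import numpy as np

from lmdeploy.vl.media.audio import AudioMediaIO

audio_io = AudioMediaIO()

# One second of a 440 Hz sine at 16 kHz as a stand-in waveform.
sr = 16000
wave = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)

b64 = audio_io.encode_base64((wave, sr), audio_format='WAV')
decoded, decoded_sr = audio_io.load_base64('audio/wav', b64)
assert decoded_sr == sr  # sr=None in librosa.load keeps the original rate
```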

lmdeploy/vl/model/base.py (+18 -26)

@@ -252,40 +252,32 @@ def to_pytorch_with_input_ids(self, messages):

         return dict(prompt=None, input_ids=input_ids, multimodal=preps)

-    def to_pytorch_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start):
+    def to_pytorch_aux(self, messages, prompt, mm_placeholder, tokenizer, sequence_start):
         """Auxiliary function to pack the preprocessing results in a format
-        compatible with what is required by pytorch engine.
-
-        Args:
-            messages(List[Dict]): the output of `preprocess`
-            prompt(str): the prompt after applying chat template
-            IMAGE_TOKEN(str): a placeholder where image tokens will be
-                inserted
-            tokenzer: the tokenizer model
-            sequence_start: starting flag of a sequence
-        """
-        # collect all preprocessing result from messages
-        preps = [x['content'] for x in messages if x['role'] == 'preprocess']
-        assert len(preps) == 1
-        preps = preps[0]
+        compatible with what is required by pytorch engine."""
+        # collect all multi-modal preprocessing result from messages, keyed by 'preprocess'
+        mm_items = [x['content'] for x in messages if x['role'] == 'preprocess']
+        assert len(mm_items) == 1
+        mm_items = mm_items[0]

         # split prompt into segments and validate data
-        segs = prompt.split(IMAGE_TOKEN)
-        assert len(segs) == len(preps) + 1, (f'the number of {IMAGE_TOKEN} is not equal '
-                                             f'to input images, {len(segs) - 1} vs {len(preps)}')
+        prompt_segments = prompt.split(mm_placeholder)
+        assert len(prompt_segments) == len(mm_items) + 1, (
+            f'the number of {mm_placeholder} is not equal '
+            f'to input multi modal items, {len(mm_items) - 1} vs {len(prompt_segments)}')

-        # calculate the image token offset for each image
+        # calculate the token offset for each multi modal item
         input_ids = []
-        for i, seg in enumerate(segs):
-            if i > 0 and i <= len(preps):
-                preps[i - 1].update(offset=len(input_ids))
-                image_tokens = preps[i - 1]['image_tokens']
-                assert self.image_token_id == preps[i - 1]['image_token_id']
-                input_ids.extend([self.image_token_id] * image_tokens)
+        mm_placeholder_id = tokenizer.encode(mm_placeholder, add_special_tokens=False)[-1]
+        for i, seg in enumerate(prompt_segments):
+            if i > 0 and i <= len(mm_items):
+                mm_items[i - 1].update(offset=len(input_ids))
+                mm_token_num = mm_items[i - 1]['mm_token_num']
+                input_ids.extend([mm_placeholder_id] * mm_token_num)
             token_ids = tokenizer.encode(seg, add_bos=((i == 0) and sequence_start))
             input_ids.extend(token_ids)

-        return dict(prompt=prompt, input_ids=input_ids, multimodal=preps)
+        return dict(prompt=prompt, input_ids=input_ids, multimodal=mm_items)

     def to_turbomind_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start):
         """Auxiliary function to pack the forwarding results in a format

lmdeploy/vl/model/qwen3.py (+2 -2)

@@ -93,7 +93,7 @@ def _preprocess_image(self,
                                 return_tensors='pt')
         merge_length = self.processor.image_processor.merge_size**2
         image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length
-        result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id))
+        result.update(dict(image_size=image.size, mm_token_num=image_tokens, image_token_id=self.image_token_id))
         return result

     def _preprocess_video(self,

@@ -206,7 +206,7 @@ def to_pytorch_aux_video(self, messages, prompt, VIDEO_TOKEN, tokenizer, sequenc
             video_token_ids = tokenizer.encode(video_placeholder)
             input_ids.extend(video_token_ids)

-            preps[i - 1].update(video_tokens=len(video_token_ids))
+            preps[i - 1].update(mm_token_num=len(video_token_ids))

             token_ids = tokenizer.encode(seg, add_bos=((i == 0) and sequence_start))
             input_ids.extend(token_ids)
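
For reference, the `mm_token_num` written by `_preprocess_image` is the patch grid volume divided by the spatial merge factor. A worked example with illustrative numbers (merge_size 2, as in Qwen-style image processors; the grid below is made up):

```python
import torch

image_grid_thw = torch.tensor([[1, 36, 36]])  # 1 temporal step, 36 x 36 patches
merge_length = 2**2                           # merge_size ** 2

mm_token_num = image_grid_thw.prod(dim=1) // merge_length
print(mm_token_num)  # tensor([324]): 1 * 36 * 36 / 4 placeholder tokens
```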
