Skip to content

Commit f177f41

Browse files
committed
Fix TurboMind Qwen3 VL family vision support
1 parent 539e55c commit f177f41

4 files changed

Lines changed: 117 additions & 54 deletions

File tree

lmdeploy/turbomind/deploy/source_model/qwen.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,17 @@ def moe_ffn_shared_gate(self, i):
153153
return self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.shared_expert_gate.weight')
154154

155155

156+
def _configure_nested_language_model_prefix(reader):
157+
"""Handle VL checkpoints whose text weights live under
158+
``model.language_model``."""
159+
if any(k.startswith('model.language_model.') for k in reader.params.keys()):
160+
reader.attn_layer_prefix = 'model.language_model.layers'
161+
reader.tok_embeddings_key = 'model.language_model.embed_tokens.weight'
162+
reader.norm_weight_key = 'model.language_model.norm.weight'
163+
if reader.model_cfg.get('tie_word_embeddings', False):
164+
reader.output_weight_key = reader.tok_embeddings_key
165+
166+
156167
@INPUT_MODELS.register_module(name='qwen2-moe')
157168
class Qwen2MoeModel(LlamaModel):
158169

@@ -172,6 +183,11 @@ def model_info(self):
172183

173184

174185
class Qwen3Reader(LlamaReader):
186+
attn_layer_patten = r'(?:model\.language_model\.|model\.)layers\.([0-9]+)\.'
187+
188+
def __init__(self, *args, **kwargs):
189+
super().__init__(*args, **kwargs)
190+
_configure_nested_language_model_prefix(self)
175191

176192
def qk_norm(self, i: int):
177193
result = []
@@ -193,6 +209,11 @@ def model_info(self):
193209

194210

195211
class Qwen3MoeReader(Qwen2MoeReader):
212+
attn_layer_patten = r'(?:model\.language_model\.|model\.)layers\.([0-9]+)\.'
213+
214+
def __init__(self, *args, **kwargs):
215+
super().__init__(*args, **kwargs)
216+
_configure_nested_language_model_prefix(self)
196217

197218
def qk_norm(self, i: int):
198219
result = []
@@ -236,13 +257,7 @@ class Qwen3_5ReaderMixin:
236257

237258
def __init__(self, *args, **kwargs):
238259
super().__init__(*args, **kwargs)
239-
if any(k.startswith('model.language_model.') for k in self.params.keys()):
240-
self.attn_layer_prefix = 'model.language_model.layers'
241-
self.tok_embeddings_key = 'model.language_model.embed_tokens.weight'
242-
self.norm_weight_key = 'model.language_model.norm.weight'
243-
tie_word_embeddings = self.model_cfg.get('tie_word_embeddings', False)
244-
if tie_word_embeddings:
245-
self.output_weight_key = self.tok_embeddings_key
260+
_configure_nested_language_model_prefix(self)
246261

247262
# ---- zero-centered RMSNorm: add 1 to weights during export ----
248263
def attn_norm(self, i: int):

lmdeploy/turbomind/supported_models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
# Qwen3
3333
Qwen3ForCausalLM='qwen3',
3434
Qwen3MoeForCausalLM='qwen3-moe',
35+
Qwen3VLForConditionalGeneration='qwen3',
36+
Qwen3VLMoeForConditionalGeneration='qwen3-moe',
3537
# Qwen 3.5
3638
Qwen3_5ForConditionalGeneration='qwen3_5',
3739
Qwen3_5MoeForConditionalGeneration='qwen3_5-moe',

lmdeploy/vl/model/qwen3.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,13 @@ def proc_messages(self, messages, chat_template, sequence_start, chat_template_k
256256
else:
257257
prompt_messages = messages
258258
prompt = chat_template.messages2prompt(prompt_messages, sequence_start, **chat_template_kwargs)
259-
return prompt, None
259+
return prompt, self.image_token
260+
261+
def _ensure_turbomind_image_only(self, inputs: list[dict]):
262+
"""TurboMind split vision currently supports image inputs only."""
263+
has_video = self.contains_video_input or any('video_grid_thw' in item for item in inputs)
264+
if has_video:
265+
raise NotImplementedError('TurboMind split vision for the Qwen3 VL family currently supports images only.')
260266

261267
def to_pytorch_aux_video(self, messages, prompt, VIDEO_TOKEN, tokenizer, sequence_start):
262268
"""Pack the video input to the compatible format with pytorch
@@ -332,6 +338,7 @@ def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
332338
"""Run vision encoder for TurboMind split path (shared Qwen3 VL
333339
family)."""
334340
inputs = [x['content'] for x in messages if x['role'] == 'preprocess'][0]
341+
self._ensure_turbomind_image_only(inputs)
335342
dtype = torch.half
336343
device = next(self.model.visual.parameters()).device
337344
outputs = []
@@ -381,8 +388,9 @@ def to_turbomind(self,
381388
chat_template_kwargs: dict | None = None,
382389
**kwargs):
383390
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start, chat_template_kwargs)
384-
info = super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
385391
inputs = [x['content'] for x in messages if x['role'] == 'preprocess'][0]
392+
self._ensure_turbomind_image_only(inputs)
393+
info = super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
386394
grid_thws = [x['image_grid_thw'].tolist()[0] for x in inputs]
387395
seq_len = len(info['input_ids'])
388396
ranges = info['input_embedding_ranges']
Lines changed: 83 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
# Copyright (c) OpenMMLab. All rights reserved.
2-
import os
2+
from types import SimpleNamespace
33

44
import pytest
5+
import torch
56

6-
from lmdeploy.messages import PytorchEngineConfig
7-
from lmdeploy.serve.core.vl_async_engine import adjust_vl_backend_config_for_prefix_caching
8-
from lmdeploy.vl.model.qwen3 import resolve_qwen_vl_family_automodel
7+
from lmdeploy.archs import get_task
8+
from lmdeploy.messages import TurbomindEngineConfig
9+
from lmdeploy.vl.model.qwen3 import Qwen3VLModel, resolve_qwen_vl_family_automodel
910

1011

1112
@pytest.mark.parametrize('arch,expected_block', [
@@ -25,44 +26,81 @@ def test_resolve_unknown_arch_raises():
2526
resolve_qwen_vl_family_automodel('NotAModel')
2627

2728

28-
def test_adjust_prefix_caching_leaves_qwen3vl_untouched(monkeypatch):
29-
monkeypatch.setattr(
30-
'lmdeploy.archs.get_model_arch',
31-
lambda _path: ('Qwen3VLForConditionalGeneration', None),
32-
)
33-
cfg = PytorchEngineConfig(enable_prefix_caching=True)
34-
out = adjust_vl_backend_config_for_prefix_caching('/fake', cfg)
35-
assert out is cfg
36-
assert cfg.enable_prefix_caching is True
37-
38-
39-
def test_adjust_prefix_caching_disables_for_other_vl_without_mutation(monkeypatch):
40-
monkeypatch.setattr(
41-
'lmdeploy.archs.get_model_arch',
42-
lambda _path: ('InternVLChatModel', None),
43-
)
44-
cfg = PytorchEngineConfig(enable_prefix_caching=True)
45-
out = adjust_vl_backend_config_for_prefix_caching('/fake', cfg)
46-
assert out is not cfg
47-
assert out.enable_prefix_caching is False
48-
assert cfg.enable_prefix_caching is True
49-
50-
51-
_HUB_SMOKE_MODEL_IDS = (
52-
'Qwen/Qwen3-VL-4B-Instruct',
53-
'Qwen/Qwen3-VL-8B-Instruct',
54-
)
55-
56-
57-
def test_hub_config_resolves_qwen_vl_family_smoke():
58-
"""Optional: verify published configs use an architecture handled by the shared loader."""
59-
if os.environ.get('LMDEPLOY_RUN_VL_LOAD_TESTS') != '1':
60-
pytest.skip('HF hub config fetch; enable with LMDEPLOY_RUN_VL_LOAD_TESTS=1')
61-
pytest.importorskip('transformers')
62-
from transformers import AutoConfig
63-
64-
for model_id in _HUB_SMOKE_MODEL_IDS:
65-
cfg = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
66-
cls, no_split = resolve_qwen_vl_family_automodel(cfg.architectures[0])
67-
assert cls is not None
68-
assert len(no_split) >= 1
29+
def test_get_task_routes_qwen3_vl_to_vl_engine(monkeypatch):
    """Qwen3 VL architectures must be routed to the VL async engine."""
    cfg = SimpleNamespace(to_dict=lambda: {'architectures': ['Qwen3VLForConditionalGeneration']})
    monkeypatch.setattr('lmdeploy.archs.get_model_arch', lambda _path: ('Qwen3VLForConditionalGeneration', cfg))

    task, pipeline_class = get_task('/fake-model', TurbomindEngineConfig())
    assert task == 'vlm'
    assert pipeline_class.__name__ == 'VLAsyncEngine'
36+
37+
38+
class _DummyChatTemplate:
39+
40+
def __init__(self, prompt):
41+
self.prompt = prompt
42+
43+
def messages2prompt(self, messages, sequence_start, **kwargs):
44+
return self.prompt
45+
46+
47+
class _DummyTokenizer:
48+
49+
def encode(self, text, add_bos=False):
50+
tokens = [] if not text else [len(text)]
51+
if add_bos:
52+
return [0] + tokens
53+
return tokens
54+
55+
56+
def _build_qwen3_vl_stub():
    """Build a ``Qwen3VLModel`` without running its heavyweight ``__init__``.

    ``__new__`` bypasses model loading; only the attributes the tested code
    paths read are assigned.
    """
    model = Qwen3VLModel.__new__(Qwen3VLModel)
    model.image_token = '<|image_pad|>'
    model.image_token_id = 151655
    model.contains_video_input = False
    return model
62+
63+
64+
def test_qwen3_vl_to_turbomind_uses_image_token_placeholder():
    """``to_turbomind`` must align embedding ranges with the image tokens."""
    model = _build_qwen3_vl_stub()
    tokenizer = _DummyTokenizer()
    prompt = 'prefix<|vision_start|><|image_pad|><|vision_end|>suffix'
    chat_template = _DummyChatTemplate(prompt)
    image_grid_thw = torch.tensor([[1, 2, 2]])
    image_embed = torch.randn(1, 4)
    messages = [{
        'role': 'user',
        'content': [{
            'type': 'image',
            'data': object(),
        }],
    }, {
        'role': 'preprocess',
        'content': [{
            'image_grid_thw': image_grid_thw,
        }],
    }, {
        'role': 'forward',
        'content': [image_embed],
    }]

    info = model.to_turbomind(messages, chat_template, tokenizer, sequence_start=True)

    # The embedding range starts right after the BOS + prefix tokens.
    begin = len(tokenizer.encode('prefix<|vision_start|>', add_bos=True))
    assert info['input_embedding_ranges'] == [[begin, begin + image_embed.shape[0]]]
    assert len(info['input_embeddings']) == 1
    # mrope position ids must cover every input token.
    assert info['input_meta']['mrope_position_ids'].shape[1] == len(info['input_ids'])
93+
94+
95+
def test_qwen3_vl_to_turbomind_rejects_video():
    """Video inputs are rejected on the TurboMind split-vision path."""
    model = _build_qwen3_vl_stub()
    model.contains_video_input = True
    messages = [{
        'role': 'preprocess',
        'content': [{
            'video_grid_thw': torch.tensor([[1, 2, 2]]),
        }],
    }]

    with pytest.raises(NotImplementedError, match='supports images only'):
        model.to_turbomind(messages, _DummyChatTemplate(''), _DummyTokenizer(), sequence_start=True)

0 commit comments

Comments (0)