Skip to content

Commit 13f0ae2

Browse files
committed
feat: implement Turbomind vision encoder support for Qwen3VL/3.5 families
1 parent 81627e3 commit 13f0ae2

File tree

12 files changed

+357
-41
lines changed

12 files changed

+357
-41
lines changed

lmdeploy/archs.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def autoget_backend_config(
9393
return backend, config
9494

9595

96-
def check_vl_llm(backend: str, config: dict) -> bool:
96+
def check_vl_llm(config: dict) -> bool:
9797
"""Check if the model is a vl model from model config."""
9898
if 'auto_map' in config:
9999
for _, v in config['auto_map'].items():
@@ -121,22 +121,22 @@ def check_vl_llm(backend: str, config: dict) -> bool:
121121
return True
122122
elif arch in ['ChatGLMModel', 'ChatGLMForConditionalGeneration'] and 'vision_config' in config:
123123
return True
124-
elif arch in ['Qwen3_5ForConditionalGeneration', 'Qwen3_5MoeForConditionalGeneration'] and backend == 'turbomind':
125-
return False
126124
elif arch in supported_archs:
127125
return True
128126
return False
129127

130128

131-
def get_task(backend: str, model_path: str):
129+
def get_task(model_path: str, backend_config: PytorchEngineConfig | TurbomindEngineConfig | None = None):
132130
"""Get pipeline type and pipeline class from model config."""
133131
from lmdeploy.serve.core import AsyncEngine
134132

135133
if os.path.exists(os.path.join(model_path, 'triton_models', 'weights')):
136134
# workspace model
137135
return 'llm', AsyncEngine
138136
_, config = get_model_arch(model_path)
139-
if check_vl_llm(backend, config.to_dict()):
137+
if check_vl_llm(config.to_dict()):
138+
if backend_config and backend_config.disable_vision_encoder:
139+
return 'llm', AsyncEngine
140140
from lmdeploy.serve.core import VLAsyncEngine
141141
return 'vlm', VLAsyncEngine
142142

lmdeploy/cli/serve.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,8 @@ def api_server(args):
272272
async_=args.async_,
273273
communicator=args.communicator,
274274
enable_metrics=not args.disable_metrics,
275-
hf_overrides=args.hf_overrides)
275+
hf_overrides=args.hf_overrides,
276+
disable_vision_encoder=args.disable_vision_encoder)
276277
chat_template_config = get_chat_template(args.chat_template, args.model_path)
277278
speculative_config = get_speculative_config(args)
278279

lmdeploy/lite/apis/calibrate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ def calibrate(model: str,
241241
'Support only `wikitext2`, `c4`, `pileval`, `gsm8k`, ' \
242242
'`neuralmagic_calibration`, `open-platypus`, `openwebtext`.'
243243

244-
model_type, _ = get_task(backend='turbomind', model_path=model)
244+
model_type, _ = get_task(model_path=model)
245245
make_compatible_internvl_config(model)
246246

247247
# Load tokenizer and configuration

lmdeploy/messages.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,8 @@ class TurbomindEngineConfig:
253253
it to True if you want to update weights after creating the pipeline
254254
hf_overrides: Huggingface overrides for the model.
255255
It can be used to override the default config of the model
256+
disable_vision_encoder: Whether to disable loading vision
257+
encoder. Defaults to False.
256258
enable_metrics: enable metrics system
257259
"""
258260

@@ -291,6 +293,7 @@ class TurbomindEngineConfig:
291293
empty_init: bool = False
292294
communicator: str = 'nccl'
293295
hf_overrides: dict[str, Any] | None = None
296+
disable_vision_encoder: bool = False
294297
enable_metrics: bool = True
295298

296299
def __post_init__(self):

lmdeploy/pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def __init__(self,
6969

7070
# Create inference engine
7171
backend, backend_config = autoget_backend_config(model_path, backend_config)
72-
_, pipeline_class = get_task(backend, model_path)
72+
_, pipeline_class = get_task(model_path, backend_config)
7373
self.async_engine = pipeline_class(model_path,
7474
backend=backend,
7575
backend_config=backend_config,

lmdeploy/serve/openai/api_server.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1485,7 +1485,7 @@ def serve(model_path: str,
14851485
http_or_https = 'https'
14861486

14871487
handle_torchrun()
1488-
_, pipeline_class = get_task(backend, model_path)
1488+
_, pipeline_class = get_task(model_path, backend_config)
14891489
if isinstance(backend_config, PytorchEngineConfig):
14901490
backend_config.enable_mp_engine = True
14911491
# router replay

lmdeploy/turbomind/deploy/source_model/qwen.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,17 @@ def moe_ffn_shared_gate(self, i):
153153
return self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.shared_expert_gate.weight')
154154

155155

156+
def _configure_nested_language_model_prefix(reader):
157+
"""Handle VL checkpoints whose text weights live under
158+
``model.language_model``."""
159+
if any(k.startswith('model.language_model.') for k in reader.params.keys()):
160+
reader.attn_layer_prefix = 'model.language_model.layers'
161+
reader.tok_embeddings_key = 'model.language_model.embed_tokens.weight'
162+
reader.norm_weight_key = 'model.language_model.norm.weight'
163+
if reader.model_cfg.get('tie_word_embeddings', False):
164+
reader.output_weight_key = reader.tok_embeddings_key
165+
166+
156167
@INPUT_MODELS.register_module(name='qwen2-moe')
157168
class Qwen2MoeModel(LlamaModel):
158169

@@ -172,6 +183,11 @@ def model_info(self):
172183

173184

174185
class Qwen3Reader(LlamaReader):
186+
attn_layer_patten = r'(?:model\.language_model\.|model\.)layers\.([0-9]+)\.'
187+
188+
def __init__(self, *args, **kwargs):
189+
super().__init__(*args, **kwargs)
190+
_configure_nested_language_model_prefix(self)
175191

176192
def qk_norm(self, i: int):
177193
result = []
@@ -193,6 +209,11 @@ def model_info(self):
193209

194210

195211
class Qwen3MoeReader(Qwen2MoeReader):
212+
attn_layer_patten = r'(?:model\.language_model\.|model\.)layers\.([0-9]+)\.'
213+
214+
def __init__(self, *args, **kwargs):
215+
super().__init__(*args, **kwargs)
216+
_configure_nested_language_model_prefix(self)
196217

197218
def qk_norm(self, i: int):
198219
result = []
@@ -236,13 +257,7 @@ class Qwen3_5ReaderMixin:
236257

237258
def __init__(self, *args, **kwargs):
238259
super().__init__(*args, **kwargs)
239-
if any(k.startswith('model.language_model.') for k in self.params.keys()):
240-
self.attn_layer_prefix = 'model.language_model.layers'
241-
self.tok_embeddings_key = 'model.language_model.embed_tokens.weight'
242-
self.norm_weight_key = 'model.language_model.norm.weight'
243-
tie_word_embeddings = self.model_cfg.get('tie_word_embeddings', False)
244-
if tie_word_embeddings:
245-
self.output_weight_key = self.tok_embeddings_key
260+
_configure_nested_language_model_prefix(self)
246261

247262
# ---- zero-centered RMSNorm: add 1 to weights during export ----
248263
def attn_norm(self, i: int):

lmdeploy/turbomind/supported_models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
# Qwen3
3333
Qwen3ForCausalLM='qwen3',
3434
Qwen3MoeForCausalLM='qwen3-moe',
35+
Qwen3VLForConditionalGeneration='qwen3',
36+
Qwen3VLMoeForConditionalGeneration='qwen3-moe',
3537
# Qwen 3.5
3638
Qwen3_5ForConditionalGeneration='qwen3_5',
3739
Qwen3_5MoeForConditionalGeneration='qwen3_5-moe',

lmdeploy/vl/model/qwen3.py

Lines changed: 163 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,105 @@
77
from lmdeploy.utils import get_logger
88
from lmdeploy.vl.constants import Modality
99
from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
10+
from lmdeploy.vl.model.utils import disable_logging
1011

1112
logger = get_logger('lmdeploy')
1213

1314

14-
def check_transformers():
15+
def check_qwen3_vl_deps_install():
16+
"""Check dependencies for Qwen3-VL / Qwen3.5 (same vision stack as
17+
Qwen2-VL's ``check_qwen_vl_deps_install``).
18+
19+
- **Transformers**: recent build with Qwen3-VL and Qwen3.5 classes (see Qwen3-VL model card on HF).
20+
- **Accelerate**: required for TurboMind split vision loading (`load_checkpoint_and_dispatch`).
21+
- **qwen-vl-utils** (optional): pip package ``qwen-vl-utils``; many upstream Qwen-VL recipes use it for
22+
video helpers. LMDeploy's Qwen3 preprocessor uses ``AutoProcessor`` only; warn if missing so users
23+
can align with `Qwen2VLModel` / official docs when needed.
24+
"""
1525
try:
16-
from transformers import Qwen3VLForConditionalGeneration, Qwen3VLMoeForConditionalGeneration # noqa: F401
26+
from transformers import ( # noqa: F401
27+
Qwen3_5ForConditionalGeneration,
28+
Qwen3_5MoeForConditionalGeneration,
29+
Qwen3VLForConditionalGeneration,
30+
Qwen3VLMoeForConditionalGeneration,
31+
)
1732
except ImportError:
18-
raise ImportError('please install latest transformers by '
33+
raise ImportError('please install a recent transformers with Qwen3-VL / Qwen3.5 support, e.g. '
1934
'pip install git+https://github.com/huggingface/transformers.git')
35+
try:
36+
import accelerate # noqa: F401
37+
except ImportError:
38+
raise ImportError('please install accelerate for TurboMind vision loading: pip install accelerate')
39+
try:
40+
import qwen_vl_utils # noqa: F401
41+
except ImportError:
42+
logger.warning_once(
43+
'qwen-vl-utils is not installed. Install with `pip install qwen-vl-utils` if you use '
44+
'video pipelines or helpers from the Qwen-VL examples (optional for LMDeploy Qwen3 preprocess).')
45+
46+
47+
def resolve_qwen_vl_family_automodel(arch: str) -> tuple[type, list[str]]:
48+
"""Map HF architecture name to the model class and accelerate no-split
49+
vision block names.
50+
51+
Qwen3-VL introduced this TurboMind split-vision path; Qwen3.5 reuses the same stack.
52+
"""
53+
if arch == 'Qwen3VLForConditionalGeneration':
54+
from transformers import Qwen3VLForConditionalGeneration as AutoModelCls
55+
56+
no_split = ['Qwen3VLVisionBlock', 'Qwen3VLMoeVisionBlock']
57+
elif arch == 'Qwen3VLMoeForConditionalGeneration':
58+
from transformers import Qwen3VLMoeForConditionalGeneration as AutoModelCls
59+
60+
no_split = ['Qwen3VLVisionBlock', 'Qwen3VLMoeVisionBlock']
61+
elif arch == 'Qwen3_5ForConditionalGeneration':
62+
from transformers import Qwen3_5ForConditionalGeneration as AutoModelCls
63+
64+
no_split = ['Qwen3_5VisionBlock', 'Qwen3_5MoeVisionBlock']
65+
elif arch == 'Qwen3_5MoeForConditionalGeneration':
66+
from transformers import Qwen3_5MoeForConditionalGeneration as AutoModelCls
67+
68+
no_split = ['Qwen3_5VisionBlock', 'Qwen3_5MoeVisionBlock']
69+
else:
70+
raise ValueError(f'Unsupported Qwen VL family architecture: {arch}')
71+
return AutoModelCls, no_split
72+
73+
74+
def load_qwen_vl_family_vision_backbone(
75+
model_path: str,
76+
hf_config: Any,
77+
with_llm: bool,
78+
max_memory: dict[int, int] | None,
79+
) -> Any:
80+
"""Load vision tower only (TurboMind path) for Qwen3-VL and Qwen3.5."""
81+
arch = hf_config.architectures[0]
82+
AutoModelCls, no_split = resolve_qwen_vl_family_automodel(arch)
83+
84+
if with_llm:
85+
return AutoModelCls.from_pretrained(model_path, device_map='cpu')
86+
87+
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
88+
89+
with init_empty_weights():
90+
config = hf_config
91+
config.tie_word_embeddings = False
92+
if hasattr(config, 'text_config'):
93+
config.text_config.tie_word_embeddings = False
94+
model = AutoModelCls._from_config(config)
95+
del model.model.language_model
96+
del model.lm_head
97+
model.half()
98+
99+
with disable_logging():
100+
load_checkpoint_and_dispatch(
101+
model=model,
102+
checkpoint=model_path,
103+
device_map='auto',
104+
max_memory=max_memory,
105+
no_split_module_classes=no_split,
106+
dtype=torch.half,
107+
)
108+
return model.model.eval()
20109

21110

22111
@VISION_MODELS.register_module()
@@ -26,7 +115,7 @@ class Qwen3VLModel(VisionModel):
26115
_arch = ['Qwen3VLForConditionalGeneration', 'Qwen3VLMoeForConditionalGeneration']
27116

28117
def build_preprocessor(self):
29-
check_transformers()
118+
check_qwen3_vl_deps_install()
30119
self.processor = AutoProcessor.from_pretrained(self.model_path)
31120

32121
# image tokens
@@ -167,7 +256,13 @@ def proc_messages(self, messages, chat_template, sequence_start, chat_template_k
167256
else:
168257
prompt_messages = messages
169258
prompt = chat_template.messages2prompt(prompt_messages, sequence_start, **chat_template_kwargs)
170-
return prompt, None
259+
return prompt, self.image_token
260+
261+
def _ensure_turbomind_image_only(self, inputs: list[dict]):
262+
"""TurboMind split vision currently supports image inputs only."""
263+
has_video = self.contains_video_input or any('video_grid_thw' in item for item in inputs)
264+
if has_video:
265+
raise NotImplementedError('TurboMind split vision for the Qwen3 VL family currently supports images only.')
171266

172267
def to_pytorch_aux_video(self, messages, prompt, VIDEO_TOKEN, tokenizer, sequence_start):
173268
"""Pack the video input to the compatible format with pytorch
@@ -229,13 +324,61 @@ def to_pytorch(self,
229324
return self.to_pytorch_aux(messages, prompt, self.image_token, tokenizer, sequence_start)
230325

231326
def build_model(self):
232-
# TODO: implement for turbomind
233-
pass
327+
"""Load vision tower for TurboMind split path (Qwen3-VL and Qwen3.5
328+
share the same stack)."""
329+
loaded = load_qwen_vl_family_vision_backbone(self.model_path, self.hf_config, self.with_llm,
330+
self.max_memory)
331+
if self.with_llm:
332+
self.vl_model = loaded
333+
else:
334+
self.model = loaded
234335

235336
@torch.no_grad()
236337
def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
237-
# TODO: implement for turbomind
238-
pass
338+
"""Run vision encoder for TurboMind split path (shared Qwen3 VL
339+
family)."""
340+
inputs = [x['content'] for x in messages if x['role'] == 'preprocess'][0]
341+
self._ensure_turbomind_image_only(inputs)
342+
dtype = torch.half
343+
device = next(self.model.visual.parameters()).device
344+
outputs = []
345+
for idx in range(0, len(inputs), max_batch_size):
346+
pixel_values = [x['pixel_values'].type(dtype) for x in inputs[idx:idx + max_batch_size]]
347+
image_grid_thw = [x['image_grid_thw'] for x in inputs[idx:idx + max_batch_size]]
348+
pixel_values = torch.cat(pixel_values, dim=0).to(device)
349+
image_grid_thw = torch.cat(image_grid_thw, dim=0).to(device)
350+
image_embeds = self.model.visual(pixel_values, grid_thw=image_grid_thw)
351+
if hasattr(image_embeds, 'pooler_output'):
352+
image_embeds = image_embeds.pooler_output
353+
merge_length = self.processor.image_processor.merge_size**2
354+
split_size = image_grid_thw.prod(dim=1) // merge_length
355+
image_embeds = image_embeds.split(split_size.tolist())
356+
outputs.extend(image_embeds)
357+
messages.append(dict(role='forward', content=outputs))
358+
return messages
359+
360+
@staticmethod
361+
def get_mrope_info(seq_len: int, grid_thws: list[tuple] | None = None, ranges: list[tuple] | None = None):
362+
mrope_position_ids = [torch.arange(ranges[0][0]).expand(3, -1)]
363+
st_idx = ranges[0][0]
364+
for i, (grid_thw, embedding_range) in enumerate(zip(grid_thws, ranges)):
365+
llm_grid_t, llm_grid_h, llm_grid_w = grid_thw
366+
llm_grid_h //= 2
367+
llm_grid_w //= 2
368+
t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
369+
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
370+
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
371+
mrope_position_ids.append(torch.stack([t_index, h_index, w_index]) + st_idx)
372+
st_idx += max(llm_grid_h, llm_grid_w)
373+
if i < len(ranges) - 1:
374+
text_len = ranges[i + 1][0] - ranges[i][1]
375+
else:
376+
text_len = seq_len - embedding_range[1]
377+
mrope_position_ids.append(torch.arange(text_len).expand(3, -1) + st_idx)
378+
st_idx += text_len
379+
mrope_position_ids = torch.cat(mrope_position_ids, dim=-1)
380+
mrope_position_delta = torch.tensor([st_idx - seq_len], dtype=torch.long)
381+
return mrope_position_ids, mrope_position_delta
239382

240383
def to_turbomind(self,
241384
messages,
@@ -244,5 +387,14 @@ def to_turbomind(self,
244387
sequence_start,
245388
chat_template_kwargs: dict | None = None,
246389
**kwargs):
247-
# TODO: implement for turbomind
248-
pass
390+
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start, chat_template_kwargs)
391+
inputs = [x['content'] for x in messages if x['role'] == 'preprocess'][0]
392+
self._ensure_turbomind_image_only(inputs)
393+
info = super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
394+
grid_thws = [x['image_grid_thw'].tolist()[0] for x in inputs]
395+
seq_len = len(info['input_ids'])
396+
ranges = info['input_embedding_ranges']
397+
mrope_position_ids, mrope_position_delta = self.get_mrope_info(seq_len, grid_thws, ranges)
398+
meta = dict(mrope_position_ids=mrope_position_ids, mrope_position_delta=mrope_position_delta)
399+
info.update(dict(input_meta=meta))
400+
return info

0 commit comments

Comments
 (0)