Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion tensorrt_llm/_torch/configs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
from tensorrt_llm._torch.configs.deepseek_v3 import DeepseekV3Config
from tensorrt_llm._torch.configs.qwen3_5 import Qwen3_5MoeConfig

__all__ = ["DeepseekV3Config"]
__all__ = [
"DeepseekV3Config",
"Qwen3_5MoeConfig",
]
95 changes: 93 additions & 2 deletions tensorrt_llm/_torch/models/modeling_qwen3_5.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,26 @@
import re

from typing import Dict, List

import torch
from transformers import PretrainedConfig

from ...inputs import (
MultimodalPlaceholderMetadata,
MultimodalPlaceholderPlacement,
register_input_processor,
support_multimodal_disaggregated,
)
from .checkpoints.base_weight_mapper import BaseWeightMapper
from .checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper
from .modeling_multimodal_utils import _is_disagg
from .modeling_qwen3_next import Qwen3NextForCausalLM
from .modeling_utils import register_auto_model
from .modeling_qwen3vl import (
Qwen3VisionModel,
Qwen3VisionModelBase,
Qwen3VLInputProcessorBase,
Qwen3VLModelBase,
)
from .modeling_utils import ModelConfig, register_auto_model, register_vision_encoder

_LANG_PREFIX = "model.language_model."

Expand Down Expand Up @@ -34,6 +53,32 @@ def _normalize_qwen35_exclude_modules(model_config):
qc.exclude_modules = sorted(normalized)


def _ensure_qwen35_mrope_compat(text_config: PretrainedConfig) -> None:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

QQ) Was there any case that Qwen3.5 has top-level rope_parameters?

"""Normalize Qwen3.5 mRoPE fields for the shared Qwen3-VL wrapper.

Qwen3.5 stores RoPE metadata in ``rope_parameters``. Some config classes
may also materialize default top-level ``rope_theta`` or
``partial_rotary_factor`` values, so prefer the checkpoint-provided nested
values unconditionally here.
"""
rope_parameters = getattr(text_config, "rope_parameters", None)
if not rope_parameters:
return

rope_params = dict(rope_parameters)
rope_theta = rope_params.pop("rope_theta", None)
if rope_theta is not None:
text_config.rope_theta = rope_theta

partial_rotary_factor = rope_params.pop("partial_rotary_factor", None)
if partial_rotary_factor is not None:
text_config.partial_rotary_factor = partial_rotary_factor

if not getattr(text_config, "rope_scaling", None):
rope_params.pop("rope_type", None)
text_config.rope_scaling = rope_params
Comment on lines +56 to +79

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Merge nested mRoPE fields even when rope_scaling already exists.

Qwen3VLModelBase.init_mrope_embedding() reads mrope_section and mrope_interleaved from text_config.rope_scaling, but this helper only copies the nested rope_parameters payload when rope_scaling is falsy. Any config class that prepopulates a top-level rope_scaling will therefore lose the checkpoint-provided mRoPE metadata on the VLM path.

Suggested change
-    if not getattr(text_config, "rope_scaling", None):
-        rope_params.pop("rope_type", None)
-        text_config.rope_scaling = rope_params
+    rope_params.pop("rope_type", None)
+    if rope_params:
+        rope_scaling = dict(getattr(text_config, "rope_scaling", None) or {})
+        rope_scaling.update(rope_params)
+        text_config.rope_scaling = rope_scaling
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tensorrt_llm/_torch/models/modeling_qwen3_5.py` around lines 56 - 79, The
helper _ensure_qwen35_mrope_compat currently only assigns rope_parameters into
text_config.rope_scaling when rope_scaling is falsy, which discards checkpoint
mRoPE metadata if rope_scaling is prepopulated; change it to always merge the
nested rope_parameters into the top-level rope_scaling dict: after extracting
rope_params (and popping rope_type), obtain the existing
text_config.rope_scaling (or an empty dict), update/override it with rope_params
so checkpoint fields (e.g., mrope_section, mrope_interleaved) replace or augment
existing keys, and then assign the merged dict back to text_config.rope_scaling
while leaving rope_theta and partial_rotary_factor handling unchanged.



@register_auto_model("Qwen3_5MoeForCausalLM")
class Qwen3_5MoeForCausalLM(Qwen3NextForCausalLM):
"""Thin wrapper that registers the Qwen3.5 MoE text architecture.
Expand Down Expand Up @@ -74,3 +119,49 @@ class Qwen3_5ForCausalLM(Qwen3NextForCausalLM):
def __init__(self, model_config):
_normalize_qwen35_exclude_modules(model_config)
super().__init__(model_config)


@support_multimodal_disaggregated
@register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel)
@register_auto_model("Qwen3_5MoeForConditionalGeneration")
@register_input_processor(
    Qwen3VLInputProcessorBase,
    model_type="qwen3_5_moe",
    placeholder_metadata=MultimodalPlaceholderMetadata(
        placeholder_map={
            "image": "<|vision_start|><|image_pad|><|vision_end|>",
            "video": "<|vision_start|><|video_pad|><|vision_end|>",
        },
        placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT,
        placeholders_separator="",
    ),
)
class Qwen3_5MoeVLModel(Qwen3VLModelBase):
    """VLM wrapper composing Qwen3 vision encoder with Qwen3.5 MoE text decoder."""

    # NOTE(review): only the MoE architecture is registered here. Review asked
    # whether the dense Qwen3.5 models (e.g. https://huggingface.co/Qwen/Qwen3.5-27B)
    # should get an equivalent wrapper.

    def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs):
        """Normalize the text config's RoPE fields, then build the shared VL base.

        Args:
            model_config: Model config whose ``pretrained_config.text_config``
                is normalized in place before the base class is initialized.
            *args: Forwarded unchanged to ``Qwen3VLModelBase.__init__``.
            **kwargs: Forwarded to ``Qwen3VLModelBase.__init__``.
                ``vision_model_class`` is always overridden; ``disable_fuse_rope``
                defaults to ``False`` when the caller did not supply it.
        """
        text_config = model_config.pretrained_config.text_config
        _ensure_qwen35_mrope_compat(text_config)

        kwargs["vision_model_class"] = Qwen3VisionModel
        kwargs.setdefault("disable_fuse_rope", False)
        super().__init__(model_config, *args, **kwargs)

    @property
    def multimodal_data_device_paths(self) -> List[str]:
        """Return the dotted multimodal data paths exposed by this model."""
        device_paths = [
            "image.pixel_values",
            "video.pixel_values_videos",
            "multimodal_embedding",
        ]
        return device_paths

    def load_weights(self, weights: Dict[str, torch.Tensor], weight_mapper: BaseWeightMapper):
        """Load the vision-encoder and language-model weights from ``weights``.

        Args:
            weights: Checkpoint tensors keyed by parameter name.
            weight_mapper: Accepted for interface compatibility; a fresh
                ``Qwen3_5MoeHfWeightMapper`` is always used for the language
                model instead of the mapper passed in.
        """
        # The encoder is only loaded here when ``_is_disagg()`` is false.
        if not _is_disagg():
            self.mm_encoder.load_weights(weights)

        hf_mapper = Qwen3_5MoeHfWeightMapper()
        hf_mapper.init_model_and_config(self.llm, self.model_config)

        # Everything under ``model.visual.`` is withheld from the language model.
        llm_weights = {}
        for key, tensor in weights.items():
            if key.startswith("model.visual."):
                continue
            llm_weights[key] = tensor

        # Rewrite ``model.language_model.<rest>`` to ``model.<rest>``.
        self.llm.load_weights(
            llm_weights,
            hf_mapper,
            params_map={r"^model\.language_model\.(.*)$": r"model.\1"},
        )
13 changes: 11 additions & 2 deletions tensorrt_llm/_torch/models/modeling_qwen3_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -1337,9 +1337,18 @@ def get_model_defaults(cls, llm_args: 'TorchLlmArgs') -> dict:
# is supported for Mamba/SSM-based models
return {"kv_cache_config": {"enable_block_reuse": False}}

def load_weights(self, weights: dict, weight_mapper: BaseWeightMapper):
def load_weights(self,
weights: dict,
weight_mapper: BaseWeightMapper,
params_map: Optional[Dict[str, str]] = None,
allow_partial_loading: bool = False):
new_weights = weight_mapper.preprocess_weights(weights)
super().load_weights(new_weights, weight_mapper)
super().load_weights(
new_weights,
weight_mapper=weight_mapper,
params_map=params_map,
allow_partial_loading=allow_partial_loading,
)

def post_load_weights(self):
for idx, layer in enumerate(
Expand Down
7 changes: 6 additions & 1 deletion tensorrt_llm/_torch/models/modeling_qwen3vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,6 +929,8 @@ def __init__(
llm_model_config.pretrained_config.architectures = ["Qwen3ForCausalLM"]
elif self.original_arch == "Qwen3VLMoeForConditionalGeneration":
llm_model_config.pretrained_config.architectures = ["Qwen3MoeForCausalLM"]
elif self.original_arch == "Qwen3_5MoeForConditionalGeneration":
llm_model_config.pretrained_config.architectures = ["Qwen3_5MoeForCausalLM"]
else:
raise ValueError(f"Unsupported architecture: {self.original_arch}")
# Qwen3ForCausalLM.
Expand Down Expand Up @@ -962,9 +964,12 @@ def init_mrope_embedding(self, model_config: ModelConfig[PretrainedConfig]):
mrope_section=config.rope_scaling.get("mrope_section", None),
mrope_interleaved=config.rope_scaling.get("mrope_interleaved", False),
)
head_dim = getattr(config, "head_dim", None)
if not isinstance(head_dim, int):
head_dim = config.hidden_size // config.num_attention_heads
self.rotary_emb = MRotaryEmbedding(
pos_embd_params.rope,
head_dim=config.hidden_size // config.num_attention_heads,
head_dim=head_dim,
is_neox=pos_embd_params.is_neox,
mrope_section=pos_embd_params.mrope_section,
mrope_interleaved=pos_embd_params.mrope_interleaved,
Expand Down
1 change: 1 addition & 0 deletions tensorrt_llm/_torch/pyexecutor/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ def __getitem__(self, key):
deepseek_v32="DeepseekV3Config",
kimi_k2="DeepseekV3Config",
glm_moe_dsa="DeepseekV3Config",
qwen3_5_moe="Qwen3_5MoeConfig",

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q) What is the reason for having a separate Qwen3_5MoeConfig?

) # NOTE: HF config.json uses deepseek_v32 as model_type but with same DSV3 config class


Expand Down
2 changes: 2 additions & 0 deletions tests/integration/defs/accuracy/references/mmmu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,5 @@ Qwen/Qwen3-VL-8B-Instruct:
- accuracy: 55.11
mistralai/Mistral-Small-3.1-24B-Instruct-2503:
- accuracy: 57.0
Qwen/Qwen3.5-35B-A3B:
- accuracy: 59.0
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,29 @@ def test_nvfp4_4gpus(
task.evaluate(llm, sampling_params=self.sampling_params)


class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):
    """MMMU accuracy test for the Qwen3.5-35B-A3B checkpoint.

    NOTE(review): review feedback on this class asked for (1) the case to be
    added to the QA test list and (2) a device-memory skip guard mirroring the
    30B A3B test; neither is applied here.
    """

    # Name passed to the MMMU task when evaluating.
    MODEL_NAME = "Qwen/Qwen3.5-35B-A3B"
    # Local checkpoint location under the shared models root.
    MODEL_PATH = f"{llm_models_root()}/Qwen3.5-35B-A3B"
    # Used both as the LLM's max_num_tokens and as the generation max_tokens.
    MAX_NUM_TOKENS = 16384

    # Prompts are truncated to the MMMU input limit; generation stops at the
    # "<|endoftext|>" string.
    sampling_params = SamplingParams(
        max_tokens=MAX_NUM_TOKENS,
        truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
        stop="<|endoftext|>",
    )

    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)

    def test_auto_dtype(self):
        """Build the LLM from MODEL_PATH and run the MMMU evaluation on it."""
        with LLM(
            self.MODEL_PATH,
            max_num_tokens=self.MAX_NUM_TOKENS,
            kv_cache_config=self.kv_cache_config,
        ) as llm:
            task = MMMU(self.MODEL_NAME)
            task.evaluate(llm, sampling_params=self.sampling_params)
Comment on lines +400 to +420

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Add a device-memory guard for this 35B MMMU case.

The 30B A3B test right above already skips below a memory threshold, but this larger model has no equivalent guard. That will turn low-memory jobs into predictable OOM failures instead of clean skips.

Suggested change
+@pytest.mark.skip_less_device_memory(140000)
 class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):

At minimum, mirror the 30B case above and bump the threshold if this model needs more headroom.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py` around
lines 400 - 420, The 35B test (class TestQwen3_5_35B_A3B_VL, method
test_auto_dtype) lacks a device-memory guard and will OOM on low-memory CI;
mirror the 30B test’s skip logic by checking available GPU memory (using the
same helper used in the 30B case) at the start of test_auto_dtype and skip the
test if free GPU memory is below the threshold, increasing the threshold value
compared to the 30B test to provide extra headroom for this larger model; ensure
the check runs before creating the LLM (before LLM(...) context) so the job
cleanly skips on low-memory hosts.



class TestQwen3VL(LlmapiAccuracyTestHarness):
MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct"
MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-VL-8B-Instruct"
Expand Down
Loading