def _ensure_qwen35_mrope_compat(text_config: PretrainedConfig) -> None:
    """Expose Qwen3.5 nested RoPE settings through the top-level config fields.

    Qwen3.5 keeps its RoPE metadata under ``rope_parameters``, while the shared
    Qwen3-VL wrapper reads ``rope_theta``, ``partial_rotary_factor`` and
    ``rope_scaling`` directly from the text config. Config classes may already
    carry default top-level ``rope_theta`` / ``partial_rotary_factor`` values,
    so whenever the nested mapping provides a non-``None`` value it overwrites
    the top-level attribute.

    Args:
        text_config: Text-decoder config object; it is modified in place. When
            ``rope_parameters`` is missing or empty nothing is changed.
    """
    nested = getattr(text_config, "rope_parameters", None)
    if not nested:
        return

    # Shallow copy: the pops below must not alter the mapping stored on the config.
    remaining = dict(nested)

    # These keys are always removed from the copy, but only non-None values
    # are promoted to top-level attributes.
    for field in ("rope_theta", "partial_rotary_factor"):
        value = remaining.pop(field, None)
        if value is not None:
            setattr(text_config, field, value)

    # An existing, non-empty rope_scaling is left exactly as it is.
    if getattr(text_config, "rope_scaling", None):
        return

    # Whatever is left (minus rope_type) becomes the rope_scaling mapping.
    remaining.pop("rope_type", None)
    text_config.rope_scaling = remaining
def load_weights(self, weights: Dict[str, torch.Tensor], weight_mapper: BaseWeightMapper):
    """Load the checkpoint into the vision encoder and the text decoder.

    Args:
        weights: Checkpoint tensors keyed by parameter name.
        weight_mapper: Accepted for interface compatibility only; the text
            decoder is always loaded through a freshly built
            ``Qwen3_5MoeHfWeightMapper``.
    """
    # The vision encoder is loaded only when _is_disagg() is false.
    disaggregated = _is_disagg()
    if not disaggregated:
        self.mm_encoder.load_weights(weights)

    # The text decoder always goes through the Qwen3.5 MoE mapper, bound to
    # the wrapped LLM and this model's config.
    text_mapper = Qwen3_5MoeHfWeightMapper()
    text_mapper.init_model_and_config(self.llm, self.model_config)

    # Drop every vision-tower tensor before handing weights to the decoder.
    text_weights = {}
    for name, tensor in weights.items():
        if name.startswith("model.visual."):
            continue
        text_weights[name] = tensor

    # Checkpoint names carry a "model.language_model." prefix that the
    # decoder expects as plain "model.".
    rename_rules = {r"^model\.language_model\.(.*)$": r"model.\1"}
    self.llm.load_weights(text_weights, text_mapper, params_map=rename_rules)
a/tensorrt_llm/_torch/models/modeling_qwen3vl.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3vl.py @@ -929,6 +929,8 @@ def __init__( llm_model_config.pretrained_config.architectures = ["Qwen3ForCausalLM"] elif self.original_arch == "Qwen3VLMoeForConditionalGeneration": llm_model_config.pretrained_config.architectures = ["Qwen3MoeForCausalLM"] + elif self.original_arch == "Qwen3_5MoeForConditionalGeneration": + llm_model_config.pretrained_config.architectures = ["Qwen3_5MoeForCausalLM"] else: raise ValueError(f"Unsupported architecture: {self.original_arch}") # Qwen3ForCausalLM. @@ -962,9 +964,12 @@ def init_mrope_embedding(self, model_config: ModelConfig[PretrainedConfig]): mrope_section=config.rope_scaling.get("mrope_section", None), mrope_interleaved=config.rope_scaling.get("mrope_interleaved", False), ) + head_dim = getattr(config, "head_dim", None) + if not isinstance(head_dim, int): + head_dim = config.hidden_size // config.num_attention_heads self.rotary_emb = MRotaryEmbedding( pos_embd_params.rope, - head_dim=config.hidden_size // config.num_attention_heads, + head_dim=head_dim, is_neox=pos_embd_params.is_neox, mrope_section=pos_embd_params.mrope_section, mrope_interleaved=pos_embd_params.mrope_interleaved, diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py index 9c3b4c37560f..f8b30d260dfc 100644 --- a/tensorrt_llm/_torch/pyexecutor/config_utils.py +++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py @@ -248,6 +248,7 @@ def __getitem__(self, key): deepseek_v32="DeepseekV3Config", kimi_k2="DeepseekV3Config", glm_moe_dsa="DeepseekV3Config", + qwen3_5_moe="Qwen3_5MoeConfig", ) # NOTE: HF config.json uses deepseek_v32 as model_type but with same DSV3 config class diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml index 53e00ff87e9d..3ca1883589c9 100644 --- a/tests/integration/defs/accuracy/references/mmmu.yaml +++ 
class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):
    """MMMU accuracy test for the Qwen3.5-35B-A3B checkpoint."""

    # Name passed to the MMMU task; checkpoint location under the models root.
    MODEL_NAME = "Qwen/Qwen3.5-35B-A3B"
    MODEL_PATH = f"{llm_models_root()}/Qwen3.5-35B-A3B"

    # Used both as the engine's max_num_tokens and as the generation cap.
    MAX_NUM_TOKENS = 16384

    sampling_params = SamplingParams(
        max_tokens=MAX_NUM_TOKENS,
        truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
        stop="<|endoftext|>",
    )

    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)

    def test_auto_dtype(self):
        """Run the MMMU evaluation with the class-level engine settings."""
        engine_options = {
            "max_num_tokens": self.MAX_NUM_TOKENS,
            "kv_cache_config": self.kv_cache_config,
        }
        with LLM(self.MODEL_PATH, **engine_options) as engine:
            benchmark = MMMU(self.MODEL_NAME)
            benchmark.evaluate(engine, sampling_params=self.sampling_params)