From d917e03c42ba5558ae1190c69e496fd1a919e562 Mon Sep 17 00:00:00 2001 From: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> Date: Tue, 31 Mar 2026 03:08:11 +0000 Subject: [PATCH 1/9] [None][feat] Add the Qwen3.5 multimodal support. Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> --- tensorrt_llm/_torch/configs/__init__.py | 5 +- .../_torch/models/modeling_qwen3_5.py | 95 ++++++++++++++++++- .../_torch/models/modeling_qwen3_next.py | 13 ++- .../_torch/models/modeling_qwen3vl.py | 7 +- .../_torch/pyexecutor/config_utils.py | 1 + .../defs/accuracy/references/mmmu.yaml | 2 + .../test_llm_api_pytorch_multimodal.py | 23 +++++ 7 files changed, 140 insertions(+), 6 deletions(-) diff --git a/tensorrt_llm/_torch/configs/__init__.py b/tensorrt_llm/_torch/configs/__init__.py index 6496e3283451..b4ba4c5183f6 100644 --- a/tensorrt_llm/_torch/configs/__init__.py +++ b/tensorrt_llm/_torch/configs/__init__.py @@ -24,4 +24,7 @@ def _register_custom_configs_with_transformers() -> None: _register_custom_configs_with_transformers() del _register_custom_configs_with_transformers -__all__ = ["DeepseekV3Config"] +__all__ = [ + "DeepseekV3Config", + "Qwen3_5MoeConfig", +] diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py index bf83e916db29..2c15b851d511 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py @@ -1,7 +1,26 @@ import re - +from typing import Dict, List + +import torch +from transformers import PretrainedConfig + +from ...inputs import ( + MultimodalPlaceholderMetadata, + MultimodalPlaceholderPlacement, + register_input_processor, + support_multimodal_disaggregated, +) +from .checkpoints.base_weight_mapper import BaseWeightMapper +from .checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper +from .modeling_multimodal_utils import _is_disagg from .modeling_qwen3_next import Qwen3NextForCausalLM -from 
.modeling_utils import register_auto_model +from .modeling_qwen3vl import ( + Qwen3VisionModel, + Qwen3VisionModelBase, + Qwen3VLInputProcessorBase, + Qwen3VLModelBase, +) +from .modeling_utils import ModelConfig, register_auto_model, register_vision_encoder _LANG_PREFIX = "model.language_model." @@ -93,6 +112,32 @@ def _normalize_qwen35_exclude_modules(model_config): qc.exclude_modules = sorted(normalized) +def _ensure_qwen35_mrope_compat(text_config: PretrainedConfig) -> None: + """Normalize Qwen3.5 mRoPE fields for the shared Qwen3-VL wrapper. + + Qwen3.5 stores RoPE metadata in ``rope_parameters``. Some config classes + may also materialize default top-level ``rope_theta`` or + ``partial_rotary_factor`` values, so prefer the checkpoint-provided nested + values unconditionally here. + """ + rope_parameters = getattr(text_config, "rope_parameters", None) + if not rope_parameters: + return + + rope_params = dict(rope_parameters) + rope_theta = rope_params.pop("rope_theta", None) + if rope_theta is not None: + text_config.rope_theta = rope_theta + + partial_rotary_factor = rope_params.pop("partial_rotary_factor", None) + if partial_rotary_factor is not None: + text_config.partial_rotary_factor = partial_rotary_factor + + if not getattr(text_config, "rope_scaling", None): + rope_params.pop("rope_type", None) + text_config.rope_scaling = rope_params + + @register_auto_model("Qwen3_5MoeForCausalLM") class Qwen3_5MoeForCausalLM(Qwen3NextForCausalLM): """Thin wrapper that registers the Qwen3.5 MoE text architecture. 
@@ -133,3 +178,49 @@ class Qwen3_5ForCausalLM(Qwen3NextForCausalLM): def __init__(self, model_config): _normalize_qwen35_exclude_modules(model_config) super().__init__(model_config) + + +@support_multimodal_disaggregated +@register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel) +@register_auto_model("Qwen3_5MoeForConditionalGeneration") +@register_input_processor( + Qwen3VLInputProcessorBase, + model_type="qwen3_5_moe", + placeholder_metadata=MultimodalPlaceholderMetadata( + placeholder_map={ + "image": "<|vision_start|><|image_pad|><|vision_end|>", + "video": "<|vision_start|><|video_pad|><|vision_end|>", + }, + placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT, + placeholders_separator="", + ), +) +class Qwen3_5MoeVLModel(Qwen3VLModelBase): + """VLM wrapper composing Qwen3 vision encoder with Qwen3.5 MoE text decoder.""" + + def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs): + _ensure_qwen35_mrope_compat(model_config.pretrained_config.text_config) + + kwargs["vision_model_class"] = Qwen3VisionModel + kwargs["disable_fuse_rope"] = kwargs.get("disable_fuse_rope", False) + super().__init__(model_config, *args, **kwargs) + + @property + def multimodal_data_device_paths(self) -> List[str]: + return [ + "image.pixel_values", + "video.pixel_values_videos", + "multimodal_embedding", + ] + + def load_weights(self, weights: Dict[str, torch.Tensor], weight_mapper: BaseWeightMapper): + if not _is_disagg(): + self.mm_encoder.load_weights(weights) + + weight_mapper = Qwen3_5MoeHfWeightMapper() + weight_mapper.init_model_and_config(self.llm, self.model_config) + filtered_weights = {k: v for k, v in weights.items() if not k.startswith("model.visual.")} + params_map = { + r"^model\.language_model\.(.*)$": r"model.\1", + } + self.llm.load_weights(filtered_weights, weight_mapper, params_map=params_map) diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_next.py 
b/tensorrt_llm/_torch/models/modeling_qwen3_next.py index d6f4fd57794f..5d8ca8e81cbd 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_next.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_next.py @@ -973,9 +973,18 @@ def get_model_defaults(cls, llm_args: 'TorchLlmArgs') -> dict: # is supported for Mamba/SSM-based models return {"kv_cache_config": {"enable_block_reuse": False}} - def load_weights(self, weights: dict, weight_mapper: BaseWeightMapper): + def load_weights(self, + weights: dict, + weight_mapper: BaseWeightMapper, + params_map: Optional[Dict[str, str]] = None, + allow_partial_loading: bool = False): new_weights = weight_mapper.preprocess_weights(weights) - super().load_weights(new_weights, weight_mapper) + super().load_weights( + new_weights, + weight_mapper=weight_mapper, + params_map=params_map, + allow_partial_loading=allow_partial_loading, + ) def post_load_weights(self): for idx, layer in enumerate( diff --git a/tensorrt_llm/_torch/models/modeling_qwen3vl.py b/tensorrt_llm/_torch/models/modeling_qwen3vl.py index 2031a4b7dc18..526b84dbc216 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3vl.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3vl.py @@ -998,6 +998,8 @@ def __init__( llm_model_config.pretrained_config.architectures = ["Qwen3ForCausalLM"] elif self.original_arch == "Qwen3VLMoeForConditionalGeneration": llm_model_config.pretrained_config.architectures = ["Qwen3MoeForCausalLM"] + elif self.original_arch == "Qwen3_5MoeForConditionalGeneration": + llm_model_config.pretrained_config.architectures = ["Qwen3_5MoeForCausalLM"] else: raise ValueError(f"Unsupported architecture: {self.original_arch}") # Qwen3ForCausalLM. 
@@ -1035,9 +1037,12 @@ def init_mrope_embedding(self, model_config: ModelConfig[PretrainedConfig]): mrope_section=config.rope_scaling.get("mrope_section", None), mrope_interleaved=config.rope_scaling.get("mrope_interleaved", False), ) + head_dim = getattr(config, "head_dim", None) + if not isinstance(head_dim, int): + head_dim = config.hidden_size // config.num_attention_heads self.rotary_emb = MRotaryEmbedding( pos_embd_params.rope, - head_dim=config.hidden_size // config.num_attention_heads, + head_dim=head_dim, is_neox=pos_embd_params.is_neox, mrope_section=pos_embd_params.mrope_section, mrope_interleaved=pos_embd_params.mrope_interleaved, diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py index f4ec50639715..7626d839e92e 100644 --- a/tensorrt_llm/_torch/pyexecutor/config_utils.py +++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py @@ -427,6 +427,7 @@ def __getitem__(self, key): deepseek_v32="DeepseekV3Config", kimi_k2="DeepseekV3Config", glm_moe_dsa="DeepseekV3Config", + qwen3_5_moe="Qwen3_5MoeConfig", ) # NOTE: HF config.json uses deepseek_v32 as model_type but with same DSV3 config class diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml index 34083c6ba5d5..2ecb68d691b6 100644 --- a/tests/integration/defs/accuracy/references/mmmu.yaml +++ b/tests/integration/defs/accuracy/references/mmmu.yaml @@ -68,3 +68,5 @@ moonshotai/Kimi-K2.5: - quant_algo: NVFP4 kv_cache_quant_algo: FP8 accuracy: 81.56 +Qwen/Qwen3.5-35B-A3B: + - accuracy: 59.0 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py index d7623cd828ae..37c4b349f102 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py @@ -426,6 +426,29 @@ def test_nvfp4_4gpus( task.evaluate(llm, 
sampling_params=self.sampling_params) +class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness): + MODEL_NAME = "Qwen/Qwen3.5-35B-A3B" + MODEL_PATH = f"{llm_models_root()}/Qwen3.5-35B-A3B" + MAX_NUM_TOKENS = 16384 + + sampling_params = SamplingParams( + max_tokens=MAX_NUM_TOKENS, + truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, + stop="<|endoftext|>", + ) + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) + + def test_auto_dtype(self): + with LLM( + self.MODEL_PATH, + max_num_tokens=self.MAX_NUM_TOKENS, + kv_cache_config=self.kv_cache_config, + ) as llm: + task = MMMU(self.MODEL_NAME) + task.evaluate(llm, sampling_params=self.sampling_params) + + class TestQwen3VL(LlmapiAccuracyTestHarness): MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct" MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-VL-8B-Instruct" From fef5361b7a13dc1136b74efd1050f27b60273f54 Mon Sep 17 00:00:00 2001 From: Michal Guzek Date: Thu, 14 May 2026 18:32:41 -0700 Subject: [PATCH 2/9] Qwen3.5 VL MoE Working draft Signed-off-by: Michal Guzek --- tensorrt_llm/_torch/configs/__init__.py | 1 - tensorrt_llm/_torch/models/__init__.py | 4 +- .../checkpoints/hf/qwen3_5_weight_mapper.py | 1 + .../_torch/models/modeling_qwen3_5.py | 30 +--- .../_torch/pyexecutor/config_utils.py | 151 +++++++++++++++++- .../_torch/pyexecutor/model_loader.py | 11 +- .../defs/accuracy/references/mmmu.yaml | 3 +- .../test_llm_api_pytorch_multimodal.py | 5 +- .../modeling/test_modeling_qwen3_5_vl_moe.py | 144 +++++++++++++++++ 9 files changed, 307 insertions(+), 43 deletions(-) create mode 100644 tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py diff --git a/tensorrt_llm/_torch/configs/__init__.py b/tensorrt_llm/_torch/configs/__init__.py index b4ba4c5183f6..0ab6bc3fcacf 100644 --- a/tensorrt_llm/_torch/configs/__init__.py +++ b/tensorrt_llm/_torch/configs/__init__.py @@ -26,5 +26,4 @@ def _register_custom_configs_with_transformers() -> None: __all__ = [ "DeepseekV3Config", - "Qwen3_5MoeConfig", ] diff --git 
a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py index 55177767ac5a..42229246ae08 100644 --- a/tensorrt_llm/_torch/models/__init__.py +++ b/tensorrt_llm/_torch/models/__init__.py @@ -35,7 +35,8 @@ Qwen2ForRewardModel) from .modeling_qwen2vl import Qwen2_5_VLModel, Qwen2VLModel from .modeling_qwen3 import Qwen3ForCausalLM -from .modeling_qwen3_5 import Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM +from .modeling_qwen3_5 import (Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM, + Qwen3_5MoeVLModel) from .modeling_qwen3_moe import Qwen3MoeForCausalLM from .modeling_qwen3_next import Qwen3NextForCausalLM from .modeling_qwen3vl import Qwen3VLModel @@ -86,6 +87,7 @@ "Qwen3MoeForCausalLM", "Qwen3_5ForCausalLM", "Qwen3_5MoeForCausalLM", + "Qwen3_5MoeVLModel", "Qwen3NextForCausalLM", "Qwen3MoeVLModel", "GptOssForCausalLM", diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py index fa2f161bdc4f..65e0168bec55 100644 --- a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py +++ b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py @@ -13,6 +13,7 @@ @register_mapper("HF", "Qwen3_5MoeForCausalLM") +@register_mapper("HF", "Qwen3_5MoeForConditionalGeneration") @register_mapper("HF", "Qwen3_5ForCausalLM") class Qwen3_5MoeHfWeightMapper(Qwen3NextHfWeightMapper): """Weight mapper for Qwen3.5 MoE text checkpoints. 
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py index 2c15b851d511..e815c94bd063 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py @@ -5,6 +5,7 @@ from transformers import PretrainedConfig from ...inputs import ( + ContentFormat, MultimodalPlaceholderMetadata, MultimodalPlaceholderPlacement, register_input_processor, @@ -112,32 +113,6 @@ def _normalize_qwen35_exclude_modules(model_config): qc.exclude_modules = sorted(normalized) -def _ensure_qwen35_mrope_compat(text_config: PretrainedConfig) -> None: - """Normalize Qwen3.5 mRoPE fields for the shared Qwen3-VL wrapper. - - Qwen3.5 stores RoPE metadata in ``rope_parameters``. Some config classes - may also materialize default top-level ``rope_theta`` or - ``partial_rotary_factor`` values, so prefer the checkpoint-provided nested - values unconditionally here. - """ - rope_parameters = getattr(text_config, "rope_parameters", None) - if not rope_parameters: - return - - rope_params = dict(rope_parameters) - rope_theta = rope_params.pop("rope_theta", None) - if rope_theta is not None: - text_config.rope_theta = rope_theta - - partial_rotary_factor = rope_params.pop("partial_rotary_factor", None) - if partial_rotary_factor is not None: - text_config.partial_rotary_factor = partial_rotary_factor - - if not getattr(text_config, "rope_scaling", None): - rope_params.pop("rope_type", None) - text_config.rope_scaling = rope_params - - @register_auto_model("Qwen3_5MoeForCausalLM") class Qwen3_5MoeForCausalLM(Qwen3NextForCausalLM): """Thin wrapper that registers the Qwen3.5 MoE text architecture. 
@@ -193,14 +168,13 @@ def __init__(self, model_config): }, placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT, placeholders_separator="", + content_format=ContentFormat.STRING, ), ) class Qwen3_5MoeVLModel(Qwen3VLModelBase): """VLM wrapper composing Qwen3 vision encoder with Qwen3.5 MoE text decoder.""" def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs): - _ensure_qwen35_mrope_compat(model_config.pretrained_config.text_config) - kwargs["vision_model_class"] = Qwen3VisionModel kwargs["disable_fuse_rope"] = kwargs.get("disable_fuse_rope", False) super().__init__(model_config, *args, **kwargs) diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py index 7626d839e92e..761efebbf7cc 100644 --- a/tensorrt_llm/_torch/pyexecutor/config_utils.py +++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py @@ -6,6 +6,7 @@ import torch import transformers +from tensorrt_llm._utils import str_dtype_to_torch from tensorrt_llm.logger import logger @@ -21,6 +22,57 @@ def is_hybrid_linear(config): return is_nemotron_hybrid(config) or is_qwen3_hybrid(config) +def _coerce_torch_dtype(dtype): + """Normalize dtype values from HF configs into torch dtype objects. + + HF configs may store dtype fields as torch dtypes, strings, or the sentinel + value "auto". Returning None for "auto" lets the caller keep its normal + fallback path instead of treating "auto" as a concrete cache dtype. + """ + if isinstance(dtype, torch.dtype): + return dtype + if dtype == "auto": + return None + if isinstance(dtype, str): + return str_dtype_to_torch(dtype) + return dtype + + +def resolve_hf_torch_dtype(config): + """Return the model's regular tensor dtype from common HF config fields. + + Transformers has used both dtype and torch_dtype across versions and model + families. This helper checks both names and coerces whichever one is present + into the form expected by TRT-LLM runtime code. 
+ """ + for attr in ("dtype", "torch_dtype"): + dtype = getattr(config, attr, None) + if dtype is not None: + return _coerce_torch_dtype(dtype) + return None + + +def resolve_mamba_ssm_cache_dtype(config): + """Return the dtype to use for hybrid Mamba/SSM cache allocations. + + Qwen3.5-style configs may store this field on the top-level config or the + nested text_config, and may call it either mamba_ssm_cache_dtype or + mamba_ssm_dtype. This helper centralizes that lookup so cache creation does + not fail later with a missing dtype. + """ + configs = [config] + text_config = getattr(config, "text_config", None) + if text_config is not None: + configs.append(text_config) + + for candidate_config in configs: + for attr in ("mamba_ssm_cache_dtype", "mamba_ssm_dtype"): + dtype = getattr(candidate_config, attr, None) + if dtype is not None: + return _coerce_torch_dtype(dtype) + return None + + def is_nemotron_hybrid(config): if hasattr(config, "hybrid_override_pattern" ) and config.hybrid_override_pattern is not None and len( @@ -251,6 +303,12 @@ def extract_mamba_kv_cache_params( mamba_ssm_cache_dtype = (quant_config.mamba_ssm_cache_dtype if quant_config is not None else None) + if mamba_ssm_cache_dtype is not None: + mamba_ssm_cache_dtype = _coerce_torch_dtype(mamba_ssm_cache_dtype) + else: + mamba_ssm_cache_dtype = (resolve_mamba_ssm_cache_dtype(config) + or resolve_hf_torch_dtype(config) + or torch.bfloat16) return MambaKVCacheParams( state_size=state_size, @@ -262,16 +320,21 @@ def extract_mamba_kv_cache_params( full_attention_layer_mask=full_attn_mask, num_mamba_layers=sum(mamba_mask), num_full_attention_layers=sum(full_attn_mask), - dtype=config.torch_dtype, + dtype=resolve_hf_torch_dtype(config) or torch.bfloat16, mamba_ssm_cache_dtype=mamba_ssm_cache_dtype, ) class _Qwen35ConfigCompat: - """Temporary shim that normalizes Qwen3.5 HF configs into Qwen3NextConfig. + """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig. 
+ + This is used for Qwen3.5 text-only configs and for shared helper logic such + as RoPE and quantization exclude-module normalization. Qwen3.5-MoE VLM + configs should stay composite and use transformers.Qwen3_5MoeConfig plus + _normalize_qwen35_moe_vl_config instead. To remove: delete this class and the elif branch in - load_pretrained_config that references it. + load_pretrained_config that flattens Qwen3.5 text configs. """ @staticmethod @@ -415,6 +478,80 @@ def _flatten_rope(text_config: dict) -> dict: return text_config +def _normalize_qwen35_mrope_config(text_config) -> None: + """Materialize Qwen3.5 mRoPE aliases needed by the Qwen3-VL path. + + HF stores RoPE metadata under ``rope_parameters``; the shared Qwen3-VL + wrapper reads ``rope_theta``, ``partial_rotary_factor``, and + ``rope_scaling`` directly on the text config. + """ + rope_parameters = getattr(text_config, "rope_parameters", None) + if not rope_parameters: + return + if hasattr(rope_parameters, "to_dict"): + rope_parameters = rope_parameters.to_dict() + flattened = _Qwen35ConfigCompat._flatten_rope({ + "rope_parameters": + dict(rope_parameters), + "rope_scaling": + dict(getattr(text_config, "rope_scaling", None) or {}), + }) + for attr in ("rope_theta", "partial_rotary_factor", "rope_scaling"): + value = flattened.get(attr) + if value is not None: + setattr(text_config, attr, value) + + +def _normalize_qwen35_qwen3next_text_aliases(text_config) -> None: + """Materialize Qwen3Next-style text aliases used by the shared runtime.""" + if getattr(text_config, "intermediate_size", None) is None: + moe_intermediate_size = getattr(text_config, "moe_intermediate_size", + None) + num_experts_per_tok = getattr(text_config, "num_experts_per_tok", None) + shared_expert_intermediate_size = getattr( + text_config, "shared_expert_intermediate_size", 0) or 0 + if (moe_intermediate_size is not None + and num_experts_per_tok is not None): + text_config.intermediate_size = ( + num_experts_per_tok * 
moe_intermediate_size + + shared_expert_intermediate_size) + + +def _normalize_qwen35_quantization_config(model_config) -> None: + quantization_config = getattr(model_config, "quantization_config", None) + if not isinstance(quantization_config, dict): + return + + modules = quantization_config.get("modules_to_not_convert") + if modules is None: + return + + text_config = getattr(model_config, "text_config", None) + normalized_modules = _Qwen35ConfigCompat._normalize_exclude_modules(modules) + if text_config is not None: + normalized_modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround( + text_config.to_dict(), normalized_modules) + quantization_config["modules_to_not_convert"] = sorted( + set(normalized_modules)) + + +def _normalize_qwen35_moe_vl_config(model_config) -> None: + """Adapt HF Qwen3.5-MoE VLM config to TRT-LLM runtime conventions.""" + if not getattr(model_config, "architectures", None): + model_config.architectures = ["Qwen3_5MoeForConditionalGeneration"] + + text_config = getattr(model_config, "text_config", None) + if text_config is None: + raise ValueError("Qwen3.5-MoE VLM config is missing text_config") + + text_config.architectures = ["Qwen3_5MoeForCausalLM"] + _normalize_qwen35_qwen3next_text_aliases(text_config) + _normalize_qwen35_mrope_config(text_config) + + model_config.get_text_config = lambda decoder=False: text_config + _normalize_qwen35_quantization_config(model_config) + + # TODO: remove this once the transformers can support all of those models in _CONFIG_REGISTRY class LazyConfigDict(dict): @@ -427,7 +564,6 @@ def __getitem__(self, key): deepseek_v32="DeepseekV3Config", kimi_k2="DeepseekV3Config", glm_moe_dsa="DeepseekV3Config", - qwen3_5_moe="Qwen3_5MoeConfig", ) # NOTE: HF config.json uses deepseek_v32 as model_type but with same DSV3 config class @@ -445,6 +581,13 @@ def load_pretrained_config(model_name_or_path: str, MistralConfigLoader model_config = MistralConfigLoader().load( model_name_or_path).pretrained_config + elif 
(model_type == "qwen3_5_moe" and + (("text_config" in config_dict and "vision_config" in config_dict) or + (architectures + and architectures[0] == "Qwen3_5MoeForConditionalGeneration"))): + model_config = transformers.Qwen3_5MoeConfig.from_pretrained( + model_name_or_path, **kwargs) + _normalize_qwen35_moe_vl_config(model_config) elif model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] model_config = config_class.from_pretrained(model_name_or_path, diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index 14d813a99dfd..54c02754f12d 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -29,6 +29,7 @@ MoeLoadBalancer, maybe_create_moe_load_balancer) from ..virtual_memory import RestoreMode from ..virtual_memory import scope as virtual_memory_scope +from .config_utils import resolve_hf_torch_dtype, resolve_mamba_ssm_cache_dtype _KV_CACHE_MAP = { "fp8": QuantAlgo.FP8.value, @@ -44,12 +45,10 @@ def validate_and_set_mamba_ssm_cache_dtype( mamba_ssm_stochastic_rounding: bool = False, mamba_ssm_philox_rounds: int = 10) -> None: if mamba_ssm_cache_dtype == "auto": - hf_dtype = getattr(config.pretrained_config, "mamba_ssm_cache_dtype", - None) - if hf_dtype is not None: - mamba_ssm_cache_dtype = str_dtype_to_torch(hf_dtype) - else: - mamba_ssm_cache_dtype = config.pretrained_config.torch_dtype + mamba_ssm_cache_dtype = ( + resolve_mamba_ssm_cache_dtype(config.pretrained_config) + or resolve_hf_torch_dtype(config.pretrained_config) + or config.torch_dtype) else: mamba_ssm_cache_dtype = str_dtype_to_torch(mamba_ssm_cache_dtype) diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml index 2ecb68d691b6..1bfb7c4869a8 100644 --- a/tests/integration/defs/accuracy/references/mmmu.yaml +++ b/tests/integration/defs/accuracy/references/mmmu.yaml @@ -60,6 +60,7 @@ 
Qwen/Qwen3-VL-8B-Instruct: mistralai/Mistral-Small-3.1-24B-Instruct-2503: - accuracy: 57.0 Qwen/Qwen3.5-35B-A3B: + - accuracy: 59.0 - dtype: bfloat16 accuracy: 60.444 # Kimi K2.5 multimodal (MoonViT + DeepSeek-V3 MoE backbone, ~1T params). @@ -68,5 +69,3 @@ moonshotai/Kimi-K2.5: - quant_algo: NVFP4 kv_cache_quant_algo: FP8 accuracy: 81.56 -Qwen/Qwen3.5-35B-A3B: - - accuracy: 59.0 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py index 37c4b349f102..cbce9be563ac 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py @@ -426,10 +426,12 @@ def test_nvfp4_4gpus( task.evaluate(llm, sampling_params=self.sampling_params) +@pytest.mark.skip_less_device_memory(80000) class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness): MODEL_NAME = "Qwen/Qwen3.5-35B-A3B" MODEL_PATH = f"{llm_models_root()}/Qwen3.5-35B-A3B" MAX_NUM_TOKENS = 16384 + MAX_BATCH_SIZE = 32 sampling_params = SamplingParams( max_tokens=MAX_NUM_TOKENS, @@ -437,12 +439,13 @@ class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness): stop="<|endoftext|>", ) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False) def test_auto_dtype(self): with LLM( self.MODEL_PATH, max_num_tokens=self.MAX_NUM_TOKENS, + max_batch_size=self.MAX_BATCH_SIZE, kv_cache_config=self.kv_cache_config, ) as llm: task = MMMU(self.MODEL_NAME) diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py new file mode 100644 index 000000000000..49fc4cbe4902 --- /dev/null +++ b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py @@ -0,0 +1,144 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import json +from pathlib import Path + +import torch +import transformers + +from tensorrt_llm._torch.model_config import ModelConfig +from tensorrt_llm._torch.models import Qwen3_5MoeVLModel +from tensorrt_llm._torch.models.checkpoints.auto_mapper import AutoCheckpointMapper +from tensorrt_llm._torch.models.checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper +from tensorrt_llm._torch.models.modeling_auto import AutoModelForCausalLM +from tensorrt_llm._torch.pyexecutor.config_utils import ( + extract_mamba_kv_cache_params, + load_pretrained_config, +) +from tensorrt_llm._torch.pyexecutor.model_loader import validate_and_set_mamba_ssm_cache_dtype +from tensorrt_llm.inputs import ContentFormat +from tensorrt_llm.inputs.registry import MULTIMODAL_PLACEHOLDER_REGISTRY + + +def _write_qwen35_moe_vl_config(tmp_path: Path) -> Path: + config = { + "architectures": ["Qwen3_5MoeForConditionalGeneration"], + "image_token_id": 248056, + "model_type": "qwen3_5_moe", + "text_config": { + "attention_bias": False, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151645, + "full_attention_interval": 4, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "linear_conv_kernel_dim": 4, + "linear_key_head_dim": 128, + "linear_num_key_heads": 16, + "linear_num_value_heads": 32, + "linear_value_head_dim": 128, + "mamba_ssm_dtype": "float32", + "max_position_embeddings": 262144, + "mlp_only_layers": [], + "model_type": "qwen3_5_moe_text", + "moe_intermediate_size": 512, + "norm_topk_prob": True, + "num_attention_heads": 32, + "num_experts": 128, + "num_experts_per_tok": 8, + "num_hidden_layers": 2, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-6, + "shared_expert_intermediate_size": 512, + "rope_parameters": { + "mrope_section": [11, 11, 10], + "partial_rotary_factor": 0.25, + "rope_theta": 1000000.0, + "rope_type": "default", + }, + "use_cache": True, + 
"vocab_size": 151936, + }, + "tie_word_embeddings": False, + "video_token_id": 248057, + "vision_config": { + "deepstack_visual_indexes": [8, 16, 24], + "depth": 27, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "in_channels": 3, + "intermediate_size": 4304, + "model_type": "qwen3_5_moe", + "num_heads": 16, + "num_position_embeddings": 2304, + "out_hidden_size": 2048, + "patch_size": 16, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + "vision_end_token_id": 248054, + "vision_start_token_id": 248053, + } + (tmp_path / "config.json").write_text(json.dumps(config), encoding="utf-8") + return tmp_path + + +def test_qwen35_moe_vl_config_preserves_vlm_architecture( + tmp_path: Path, +) -> None: + config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path))) + + assert isinstance(config, transformers.Qwen3_5MoeConfig) + assert config.architectures == ["Qwen3_5MoeForConditionalGeneration"] + assert config.text_config.architectures == ["Qwen3_5MoeForCausalLM"] + assert config.text_config.num_experts == 128 + assert config.text_config.intermediate_size == 4608 + assert config.text_config.rope_theta == 1000000.0 + assert config.text_config.partial_rotary_factor == 0.25 + assert config.text_config.rope_scaling["type"] == "mrope" + assert config.text_config.rope_scaling["mrope_section"] == [11, 11, 10] + assert config.text_config.mamba_ssm_dtype == "float32" + assert config.get_text_config() is config.text_config + + +def test_qwen35_moe_vl_resolves_mamba_ssm_cache_dtype( + tmp_path: Path, +) -> None: + config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path))) + model_config = ModelConfig(pretrained_config=config) + + validate_and_set_mamba_ssm_cache_dtype(model_config, "auto") + assert model_config.quant_config.mamba_ssm_cache_dtype is torch.float32 + + mamba_params = extract_mamba_kv_cache_params( + config.text_config, + quant_config=model_config.quant_config, + ) + assert mamba_params.dtype is torch.bfloat16 + 
assert mamba_params.mamba_ssm_cache_dtype is torch.float32 + + +def test_qwen35_moe_vl_resolves_model_and_mapper(tmp_path: Path) -> None: + config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path))) + model_config = ModelConfig(pretrained_config=config) + + assert AutoModelForCausalLM._resolve_class(model_config) is Qwen3_5MoeVLModel + assert isinstance( + AutoCheckpointMapper.get("HF", "Qwen3_5MoeForConditionalGeneration"), + Qwen3_5MoeHfWeightMapper, + ) + + +def test_qwen35_moe_vl_placeholder_metadata_registered() -> None: + metadata = MULTIMODAL_PLACEHOLDER_REGISTRY.get_placeholder_metadata("qwen3_5_moe") + + assert metadata.placeholder_map == { + "image": "<|vision_start|><|image_pad|><|vision_end|>", + "video": "<|vision_start|><|video_pad|><|vision_end|>", + } + assert metadata.placeholders_separator == "" + assert metadata.content_format is ContentFormat.STRING From 1e9a86617e779b985ae902fc3b008257a6c098aa Mon Sep 17 00:00:00 2001 From: Michal Guzek Date: Fri, 15 May 2026 10:58:11 -0700 Subject: [PATCH 3/9] Address review comments Signed-off-by: Michal Guzek --- .../_torch/pyexecutor/config_utils.py | 31 ++++++++++--------- .../test_llm_api_pytorch_multimodal.py | 2 +- .../test_lists/qa/llm_function_core.txt | 1 + 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py index 761efebbf7cc..d58a7db52f43 100644 --- a/tensorrt_llm/_torch/pyexecutor/config_utils.py +++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py @@ -27,7 +27,7 @@ def _coerce_torch_dtype(dtype): HF configs may store dtype fields as torch dtypes, strings, or the sentinel value "auto". Returning None for "auto" lets the caller keep its normal - fallback path instead of treating "auto" as a concrete cache dtype. + fallback path instead of treating "auto" as a concrete dtype. 
""" if isinstance(dtype, torch.dtype): return dtype @@ -43,12 +43,14 @@ def resolve_hf_torch_dtype(config): Transformers has used both dtype and torch_dtype across versions and model families. This helper checks both names and coerces whichever one is present - into the form expected by TRT-LLM runtime code. + into the form expected by TRT-LLM runtime code. An "auto" value in any + field is treated the same as missing, so scanning continues to the next + field instead of stopping with None. """ for attr in ("dtype", "torch_dtype"): - dtype = getattr(config, attr, None) - if dtype is not None: - return _coerce_torch_dtype(dtype) + coerced = _coerce_torch_dtype(getattr(config, attr, None)) + if coerced is not None: + return coerced return None @@ -58,7 +60,8 @@ def resolve_mamba_ssm_cache_dtype(config): Qwen3.5-style configs may store this field on the top-level config or the nested text_config, and may call it either mamba_ssm_cache_dtype or mamba_ssm_dtype. This helper centralizes that lookup so cache creation does - not fail later with a missing dtype. + not fail later with a missing dtype. An "auto" value in any field is + treated the same as missing. 
""" configs = [config] text_config = getattr(config, "text_config", None) @@ -67,9 +70,9 @@ def resolve_mamba_ssm_cache_dtype(config): for candidate_config in configs: for attr in ("mamba_ssm_cache_dtype", "mamba_ssm_dtype"): - dtype = getattr(candidate_config, attr, None) - if dtype is not None: - return _coerce_torch_dtype(dtype) + coerced = _coerce_torch_dtype(getattr(candidate_config, attr, None)) + if coerced is not None: + return coerced return None @@ -301,11 +304,11 @@ def extract_mamba_kv_cache_params( full_attn_mask.extend([True] * num_spec_layers) mamba_mask.extend([False] * num_spec_layers) - mamba_ssm_cache_dtype = (quant_config.mamba_ssm_cache_dtype - if quant_config is not None else None) - if mamba_ssm_cache_dtype is not None: - mamba_ssm_cache_dtype = _coerce_torch_dtype(mamba_ssm_cache_dtype) - else: + mamba_ssm_cache_dtype = None + if quant_config is not None: + mamba_ssm_cache_dtype = _coerce_torch_dtype( + quant_config.mamba_ssm_cache_dtype) + if mamba_ssm_cache_dtype is None: mamba_ssm_cache_dtype = (resolve_mamba_ssm_cache_dtype(config) or resolve_hf_torch_dtype(config) or torch.bfloat16) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py index cbce9be563ac..2a715fc33124 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py @@ -441,7 +441,7 @@ class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False) - def test_auto_dtype(self): + def test_auto_dtype(self) -> None: with LLM( self.MODEL_PATH, max_num_tokens=self.MAX_NUM_TOKENS, diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 6ac7f64897b9..e52c36078273 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ 
b/tests/integration/test_lists/qa/llm_function_core.txt @@ -801,6 +801,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL::test_auto_dtype[forced_chunked_prefill] accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestKimiK25::test_nvfp4[dep8] +accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray unittest/disaggregated/test_openai_disagg_server.py From 05c59b2c554de4729b16a19ebf786773edc92f86 Mon Sep 17 00:00:00 2001 From: Michal Guzek Date: Mon, 18 May 2026 11:16:38 -0700 Subject: [PATCH 4/9] Address review comments Signed-off-by: Michal Guzek --- .../_torch/models/modeling_qwen3_5.py | 244 ++++++++++++++++++ .../_torch/pyexecutor/config_utils.py | 235 +---------------- .../defs/accuracy/references/mmmu.yaml | 1 + 3 files changed, 251 insertions(+), 229 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py index e815c94bd063..a1d8aaa69a08 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py @@ -1,4 +1,5 @@ import re +from types import SimpleNamespace from typing import Dict, List import torch @@ -11,6 +12,7 @@ register_input_processor, support_multimodal_disaggregated, ) +from ..pyexecutor.config_utils import get_qwen3_hybrid_layer_types from .checkpoints.base_weight_mapper import BaseWeightMapper from .checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper from .modeling_multimodal_utils import _is_disagg @@ -71,6 +73,248 @@ def _translate_mtp_pattern(name, n_hidden_layers): return None +# --- Config adapters -------------------------------------------------------- +# +# These run 
from ``load_pretrained_config`` in +# ``tensorrt_llm/_torch/pyexecutor/config_utils.py`` via lazy import — the +# runtime layer asks the model module how to load its own config. +# +# There are two entry points: +# - ``_Qwen35ConfigCompat.normalize(config_dict)`` — for text-only +# Qwen3.5 (MoE and dense). Returns a dict that +# ``transformers.Qwen3NextConfig.from_dict(...)`` can consume, so the +# existing Qwen3Next runtime is reused unchanged. +# - ``_normalize_qwen35_moe_vl_config(model_config)`` — for the +# Qwen3.5-MoE VLM. Mutates the HF-native ``transformers.Qwen3_5MoeConfig`` +# in place, attaching the runtime aliases the Qwen3Next-based LM expects +# while keeping ``text_config`` / ``vision_config`` composite. + + +class _Qwen35ConfigCompat: + """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig. + + We normalize to ``Qwen3NextConfig`` (rather than to a Qwen3.5-native + schema) so the runtime can reuse the existing ``Qwen3NextForCausalLM`` + model implementation unchanged — Qwen3.5 text is structurally identical + to Qwen3Next, so matching the config schema lets the same code serve + both. + + This is used for Qwen3.5 text-only configs and for shared helper logic such + as RoPE and quantization exclude-module normalization. Qwen3.5-MoE VLM + configs should stay composite and use transformers.Qwen3_5MoeConfig plus + _normalize_qwen35_moe_vl_config instead. + + To remove: delete this class and the elif branch in + load_pretrained_config that flattens Qwen3.5 text configs. 
+ """ + + @staticmethod + def normalize(config_dict: dict) -> dict: + """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict.""" + text_config = _Qwen35ConfigCompat._extract_text_config(config_dict) + text_config = _Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config) + text_config = _Qwen35ConfigCompat._flatten_rope(text_config) + + # Detect dense vs MoE and set architecture + MoE defaults accordingly + is_moe = "num_experts" in text_config and text_config["num_experts"] > 0 + if is_moe: + text_config["architectures"] = ["Qwen3_5MoeForCausalLM"] + else: + text_config["architectures"] = ["Qwen3_5ForCausalLM"] + # Ensure MoE fields are zeroed so Qwen3NextConfig defaults don't + # accidentally enable MoE for the dense model. + text_config.setdefault("num_experts", 0) + text_config.setdefault("num_experts_per_tok", 0) + text_config.setdefault("moe_intermediate_size", 0) + text_config.setdefault("shared_expert_intermediate_size", 0) + return text_config + + _VLM_ARCHITECTURES = { + "Qwen3_5MoeForConditionalGeneration", + "Qwen3_5ForConditionalGeneration", + } + + @staticmethod + def _extract_text_config(config_dict: dict) -> dict: + """Pull nested text_config from VLM checkpoints, or use dict as-is.""" + architectures = config_dict.get("architectures") or [] + if architectures and architectures[0] in _Qwen35ConfigCompat._VLM_ARCHITECTURES: + text_config = dict(config_dict.get("text_config") or {}) + else: + text_config = dict(config_dict) + if not text_config: + raise ValueError("Qwen3.5 config is missing a usable text_config") + return text_config + + @staticmethod + def _inherit_quantization_config(config_dict: dict, text_config: dict) -> dict: + """Copy top-level quantization_config into text_config with name normalization. + + Also adds a temporary workaround that keeps packed linear-attention + in_proj_qkvz on the bf16 path until FP8 block-scale TP loading is + fixed for that layout. 
+ """ + if "quantization_config" in text_config: + return text_config + if "quantization_config" not in config_dict: + return text_config + + quantization_config = dict(config_dict["quantization_config"]) + if "modules_to_not_convert" in quantization_config: + modules = _Qwen35ConfigCompat._normalize_exclude_modules( + quantization_config["modules_to_not_convert"] + ) + modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(text_config, modules) + quantization_config["modules_to_not_convert"] = sorted(set(modules)) + text_config["quantization_config"] = quantization_config + return text_config + + @staticmethod + def _normalize_exclude_modules(modules: list[str]) -> list[str]: + """Translate HF quantization exclude-module paths to TRT-LLM names. + + - Strip model.language_model. prefix -> model. + - Drop model.visual.* and mtp.* entries + - Map split projection names to packed TRT-LLM names + """ + normalized = set() + for name in modules: + if name.startswith("model.language_model."): + name = "model." + name[len("model.language_model.") :] + if name.startswith("model.visual.") or name.startswith("mtp."): + continue + name = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", name) + name = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", name) + normalized.add(name) + return sorted(normalized) + + @staticmethod + def _add_qkvz_bf16_workaround(text_config: dict, modules: list[str]) -> list[str]: + """Keep packed linear-attention qkvz on bf16 path for all linear-attention layers. + + Temporary until FP8 block-scale TP loading is fixed for this layout. 
+ """ + try: + layer_types = get_qwen3_hybrid_layer_types(SimpleNamespace(**text_config)) + except (ValueError, AttributeError): + return modules + for layer_idx, layer_type in enumerate(layer_types): + if layer_type == "linear_attention": + modules.append(f"model.layers.{layer_idx}.linear_attn.in_proj_qkvz") + return modules + + @staticmethod + def _flatten_rope(text_config: dict) -> dict: + """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling. + + Qwen3.5 nests these inside a rope_parameters dict and uses rope_type + instead of type in rope_scaling. Qwen3NextConfig expects them as + top-level fields with rope_scaling.type. + """ + rope_parameters = dict(text_config.pop("rope_parameters", {}) or {}) + rope_scaling = dict(text_config.get("rope_scaling") or {}) + if rope_parameters: + rope_theta = rope_parameters.pop("rope_theta", None) + if rope_theta is not None: + text_config.setdefault("rope_theta", rope_theta) + partial_rotary_factor = rope_parameters.pop("partial_rotary_factor", None) + if partial_rotary_factor is not None: + text_config.setdefault("partial_rotary_factor", partial_rotary_factor) + if rope_parameters: + rope_scaling = rope_parameters | rope_scaling + if rope_scaling: + has_mrope = "mrope_section" in rope_scaling or rope_scaling.get( + "mrope_interleaved", False + ) + if has_mrope: + rope_scaling["type"] = "mrope" + rope_scaling.pop("rope_type", None) + elif "type" not in rope_scaling and "rope_type" in rope_scaling: + rope_type = rope_scaling.pop("rope_type") + # "default" means standard RoPE (no scaling) — don't set + # rope_scaling to avoid triggering scaling code paths. + if rope_type == "default": + rope_scaling = {} + else: + rope_scaling["type"] = rope_type + if rope_scaling: + text_config["rope_scaling"] = rope_scaling + return text_config + + +def _normalize_qwen35_mrope_config(text_config) -> None: + """Materialize Qwen3.5 mRoPE aliases needed by the Qwen3-VL path. 
+ + HF stores RoPE metadata under `rope_parameters`; the shared Qwen3-VL + wrapper reads `rope_theta`, `partial_rotary_factor`, and + `rope_scaling` directly on the text config. + """ + rope_parameters = getattr(text_config, "rope_parameters", None) + if not rope_parameters: + return + if hasattr(rope_parameters, "to_dict"): + rope_parameters = rope_parameters.to_dict() + flattened = _Qwen35ConfigCompat._flatten_rope( + { + "rope_parameters": dict(rope_parameters), + "rope_scaling": dict(getattr(text_config, "rope_scaling", None) or {}), + } + ) + for attr in ("rope_theta", "partial_rotary_factor", "rope_scaling"): + value = flattened.get(attr) + if value is not None: + setattr(text_config, attr, value) + + +def _normalize_qwen35_qwen3next_text_aliases(text_config) -> None: + """Materialize Qwen3Next-style text aliases used by the shared runtime.""" + if getattr(text_config, "intermediate_size", None) is None: + moe_intermediate_size = getattr(text_config, "moe_intermediate_size", None) + num_experts_per_tok = getattr(text_config, "num_experts_per_tok", None) + shared_expert_intermediate_size = ( + getattr(text_config, "shared_expert_intermediate_size", 0) or 0 + ) + if moe_intermediate_size is not None and num_experts_per_tok is not None: + text_config.intermediate_size = ( + num_experts_per_tok * moe_intermediate_size + shared_expert_intermediate_size + ) + + +def _normalize_qwen35_quantization_config(model_config) -> None: + quantization_config = getattr(model_config, "quantization_config", None) + if not isinstance(quantization_config, dict): + return + + modules = quantization_config.get("modules_to_not_convert") + if modules is None: + return + + text_config = getattr(model_config, "text_config", None) + normalized_modules = _Qwen35ConfigCompat._normalize_exclude_modules(modules) + if text_config is not None: + normalized_modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround( + text_config.to_dict(), normalized_modules + ) + 
quantization_config["modules_to_not_convert"] = sorted(set(normalized_modules)) + + +def _normalize_qwen35_moe_vl_config(model_config) -> None: + """Adapt HF Qwen3.5-MoE VLM config to TRT-LLM runtime conventions.""" + if not getattr(model_config, "architectures", None): + model_config.architectures = ["Qwen3_5MoeForConditionalGeneration"] + + text_config = getattr(model_config, "text_config", None) + if text_config is None: + raise ValueError("Qwen3.5-MoE VLM config is missing text_config") + + text_config.architectures = ["Qwen3_5MoeForCausalLM"] + _normalize_qwen35_qwen3next_text_aliases(text_config) + _normalize_qwen35_mrope_config(text_config) + + model_config.get_text_config = lambda decoder=False: text_config + _normalize_qwen35_quantization_config(model_config) + + def _normalize_qwen35_exclude_modules(model_config): """Normalize NVFP4/FP8 exclude_modules from HF naming to TRT-LLM naming. diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py index d58a7db52f43..978796f26d46 100644 --- a/tensorrt_llm/_torch/pyexecutor/config_utils.py +++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py @@ -1,6 +1,4 @@ import dataclasses -import re -from types import SimpleNamespace from typing import List, Optional import torch @@ -328,233 +326,6 @@ def extract_mamba_kv_cache_params( ) -class _Qwen35ConfigCompat: - """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig. - - This is used for Qwen3.5 text-only configs and for shared helper logic such - as RoPE and quantization exclude-module normalization. Qwen3.5-MoE VLM - configs should stay composite and use transformers.Qwen3_5MoeConfig plus - _normalize_qwen35_moe_vl_config instead. - - To remove: delete this class and the elif branch in - load_pretrained_config that flattens Qwen3.5 text configs. 
- """ - - @staticmethod - def normalize(config_dict: dict) -> dict: - """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict.""" - text_config = _Qwen35ConfigCompat._extract_text_config(config_dict) - text_config = _Qwen35ConfigCompat._inherit_quantization_config( - config_dict, text_config) - text_config = _Qwen35ConfigCompat._flatten_rope(text_config) - - # Detect dense vs MoE and set architecture + MoE defaults accordingly - is_moe = "num_experts" in text_config and text_config["num_experts"] > 0 - if is_moe: - text_config["architectures"] = ["Qwen3_5MoeForCausalLM"] - else: - text_config["architectures"] = ["Qwen3_5ForCausalLM"] - # Ensure MoE fields are zeroed so Qwen3NextConfig defaults don't - # accidentally enable MoE for the dense model. - text_config.setdefault("num_experts", 0) - text_config.setdefault("num_experts_per_tok", 0) - text_config.setdefault("moe_intermediate_size", 0) - text_config.setdefault("shared_expert_intermediate_size", 0) - return text_config - - _VLM_ARCHITECTURES = { - "Qwen3_5MoeForConditionalGeneration", - "Qwen3_5ForConditionalGeneration", - } - - @staticmethod - def _extract_text_config(config_dict: dict) -> dict: - """Pull nested text_config from VLM checkpoints, or use dict as-is.""" - architectures = config_dict.get("architectures") or [] - if architectures and architectures[ - 0] in _Qwen35ConfigCompat._VLM_ARCHITECTURES: - text_config = dict(config_dict.get("text_config") or {}) - else: - text_config = dict(config_dict) - if not text_config: - raise ValueError("Qwen3.5 config is missing a usable text_config") - return text_config - - @staticmethod - def _inherit_quantization_config(config_dict: dict, - text_config: dict) -> dict: - """Copy top-level quantization_config into text_config with name normalization. - - Also adds a temporary workaround that keeps packed linear-attention - in_proj_qkvz on the bf16 path until FP8 block-scale TP loading is - fixed for that layout. 
- """ - if "quantization_config" in text_config: - return text_config - if "quantization_config" not in config_dict: - return text_config - - quantization_config = dict(config_dict["quantization_config"]) - if "modules_to_not_convert" in quantization_config: - modules = _Qwen35ConfigCompat._normalize_exclude_modules( - quantization_config["modules_to_not_convert"]) - modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround( - text_config, modules) - quantization_config["modules_to_not_convert"] = sorted(set(modules)) - text_config["quantization_config"] = quantization_config - return text_config - - @staticmethod - def _normalize_exclude_modules(modules: list[str]) -> list[str]: - """Translate HF quantization exclude-module paths to TRT-LLM names. - - - Strip model.language_model. prefix -> model. - - Drop model.visual.* and mtp.* entries - - Map split projection names to packed TRT-LLM names - """ - normalized = set() - for name in modules: - if name.startswith("model.language_model."): - name = "model." + name[len("model.language_model."):] - if name.startswith("model.visual.") or name.startswith("mtp."): - continue - name = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", name) - name = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", name) - normalized.add(name) - return sorted(normalized) - - @staticmethod - def _add_qkvz_bf16_workaround(text_config: dict, - modules: list[str]) -> list[str]: - """Keep packed linear-attention qkvz on bf16 path for all linear-attention layers. - - Temporary until FP8 block-scale TP loading is fixed for this layout. 
- """ - try: - layer_types = get_qwen3_hybrid_layer_types( - SimpleNamespace(**text_config)) - except (ValueError, AttributeError): - return modules - for layer_idx, layer_type in enumerate(layer_types): - if layer_type == "linear_attention": - modules.append( - f"model.layers.{layer_idx}.linear_attn.in_proj_qkvz") - return modules - - @staticmethod - def _flatten_rope(text_config: dict) -> dict: - """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling. - - Qwen3.5 nests these inside a rope_parameters dict and uses rope_type - instead of type in rope_scaling. Qwen3NextConfig expects them as - top-level fields with rope_scaling.type. - """ - rope_parameters = dict(text_config.pop("rope_parameters", {}) or {}) - rope_scaling = dict(text_config.get("rope_scaling") or {}) - if rope_parameters: - rope_theta = rope_parameters.pop("rope_theta", None) - if rope_theta is not None: - text_config.setdefault("rope_theta", rope_theta) - partial_rotary_factor = rope_parameters.pop("partial_rotary_factor", - None) - if partial_rotary_factor is not None: - text_config.setdefault("partial_rotary_factor", - partial_rotary_factor) - if rope_parameters: - rope_scaling = rope_parameters | rope_scaling - if rope_scaling: - has_mrope = ("mrope_section" in rope_scaling - or rope_scaling.get("mrope_interleaved", False)) - if has_mrope: - rope_scaling["type"] = "mrope" - rope_scaling.pop("rope_type", None) - elif "type" not in rope_scaling and "rope_type" in rope_scaling: - rope_type = rope_scaling.pop("rope_type") - # "default" means standard RoPE (no scaling) — don't set - # rope_scaling to avoid triggering scaling code paths. - if rope_type == "default": - rope_scaling = {} - else: - rope_scaling["type"] = rope_type - if rope_scaling: - text_config["rope_scaling"] = rope_scaling - return text_config - - -def _normalize_qwen35_mrope_config(text_config) -> None: - """Materialize Qwen3.5 mRoPE aliases needed by the Qwen3-VL path. 
- - HF stores RoPE metadata under ``rope_parameters``; the shared Qwen3-VL - wrapper reads ``rope_theta``, ``partial_rotary_factor``, and - ``rope_scaling`` directly on the text config. - """ - rope_parameters = getattr(text_config, "rope_parameters", None) - if not rope_parameters: - return - if hasattr(rope_parameters, "to_dict"): - rope_parameters = rope_parameters.to_dict() - flattened = _Qwen35ConfigCompat._flatten_rope({ - "rope_parameters": - dict(rope_parameters), - "rope_scaling": - dict(getattr(text_config, "rope_scaling", None) or {}), - }) - for attr in ("rope_theta", "partial_rotary_factor", "rope_scaling"): - value = flattened.get(attr) - if value is not None: - setattr(text_config, attr, value) - - -def _normalize_qwen35_qwen3next_text_aliases(text_config) -> None: - """Materialize Qwen3Next-style text aliases used by the shared runtime.""" - if getattr(text_config, "intermediate_size", None) is None: - moe_intermediate_size = getattr(text_config, "moe_intermediate_size", - None) - num_experts_per_tok = getattr(text_config, "num_experts_per_tok", None) - shared_expert_intermediate_size = getattr( - text_config, "shared_expert_intermediate_size", 0) or 0 - if (moe_intermediate_size is not None - and num_experts_per_tok is not None): - text_config.intermediate_size = ( - num_experts_per_tok * moe_intermediate_size + - shared_expert_intermediate_size) - - -def _normalize_qwen35_quantization_config(model_config) -> None: - quantization_config = getattr(model_config, "quantization_config", None) - if not isinstance(quantization_config, dict): - return - - modules = quantization_config.get("modules_to_not_convert") - if modules is None: - return - - text_config = getattr(model_config, "text_config", None) - normalized_modules = _Qwen35ConfigCompat._normalize_exclude_modules(modules) - if text_config is not None: - normalized_modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround( - text_config.to_dict(), normalized_modules) - 
quantization_config["modules_to_not_convert"] = sorted( - set(normalized_modules)) - - -def _normalize_qwen35_moe_vl_config(model_config) -> None: - """Adapt HF Qwen3.5-MoE VLM config to TRT-LLM runtime conventions.""" - if not getattr(model_config, "architectures", None): - model_config.architectures = ["Qwen3_5MoeForConditionalGeneration"] - - text_config = getattr(model_config, "text_config", None) - if text_config is None: - raise ValueError("Qwen3.5-MoE VLM config is missing text_config") - - text_config.architectures = ["Qwen3_5MoeForCausalLM"] - _normalize_qwen35_qwen3next_text_aliases(text_config) - _normalize_qwen35_mrope_config(text_config) - - model_config.get_text_config = lambda decoder=False: text_config - _normalize_qwen35_quantization_config(model_config) - - # TODO: remove this once the transformers can support all of those models in _CONFIG_REGISTRY class LazyConfigDict(dict): @@ -588,6 +359,9 @@ def load_pretrained_config(model_name_or_path: str, (("text_config" in config_dict and "vision_config" in config_dict) or (architectures and architectures[0] == "Qwen3_5MoeForConditionalGeneration"))): + # Qwen3.5-MoE VLM: HF native composite config + model-side normalizer. + from tensorrt_llm._torch.models.modeling_qwen3_5 import \ + _normalize_qwen35_moe_vl_config model_config = transformers.Qwen3_5MoeConfig.from_pretrained( model_name_or_path, **kwargs) _normalize_qwen35_moe_vl_config(model_config) @@ -603,6 +377,9 @@ def load_pretrained_config(model_name_or_path: str, "Qwen3_5ForCausalLM", "Qwen3_5ForConditionalGeneration", )): + # Qwen3.5 text-only: flatten to Qwen3NextConfig via the model-side shim. 
+ from tensorrt_llm._torch.models.modeling_qwen3_5 import \ + _Qwen35ConfigCompat model_config = transformers.Qwen3NextConfig.from_dict( _Qwen35ConfigCompat.normalize(config_dict)) elif (model_type == "exaone4" and config_dict.get("sliding_window") is None diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml index 1bfb7c4869a8..15d77e50e2de 100644 --- a/tests/integration/defs/accuracy/references/mmmu.yaml +++ b/tests/integration/defs/accuracy/references/mmmu.yaml @@ -60,6 +60,7 @@ Qwen/Qwen3-VL-8B-Instruct: mistralai/Mistral-Small-3.1-24B-Instruct-2503: - accuracy: 57.0 Qwen/Qwen3.5-35B-A3B: + # The default accuracy for `test_auto_dtype` tests. - accuracy: 59.0 - dtype: bfloat16 accuracy: 60.444 From 8cf91a626d13f6dea10114daf369a37e8e0e7e28 Mon Sep 17 00:00:00 2001 From: Michal Guzek Date: Mon, 18 May 2026 11:17:47 -0700 Subject: [PATCH 5/9] Add tests Signed-off-by: Michal Guzek --- .../modeling/test_modeling_multimodal.py | 133 +++++++- .../modeling/test_modeling_qwen3_5_vl_moe.py | 286 ++++++++++++++++++ 2 files changed, 412 insertions(+), 7 deletions(-) diff --git a/tests/unittest/_torch/modeling/test_modeling_multimodal.py b/tests/unittest/_torch/modeling/test_modeling_multimodal.py index 53fe5e044fc6..ab7166b68bf3 100644 --- a/tests/unittest/_torch/modeling/test_modeling_multimodal.py +++ b/tests/unittest/_torch/modeling/test_modeling_multimodal.py @@ -18,6 +18,12 @@ from tensorrt_llm._torch.metadata import KVCacheParams from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.models.modeling_multimodal_utils import bypass_processor_output_validation +from tensorrt_llm._torch.pyexecutor.config_utils import ( + extract_mamba_kv_cache_params, + is_nemotron_hybrid, + is_qwen3_hybrid, +) +from tensorrt_llm._torch.pyexecutor.mamba_cache_manager import CppMambaHybridCacheManager from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager from 
tensorrt_llm._utils import str_dtype_to_torch from tensorrt_llm.bindings.executor import KvCacheConfig @@ -28,6 +34,7 @@ prompt_inputs, ) from tensorrt_llm.inputs.multimodal import MultimodalParams, MultimodalRuntimeData +from tensorrt_llm.llmapi.llm_args import KvCacheConfig as PyKvCacheConfig from tensorrt_llm.mapping import Mapping @@ -518,6 +525,13 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario): Note: This method uses get_kv_cache_config() to obtain configuration. Override get_kv_cache_config() to customize cache settings. + + For hybrid linear-attention models (Qwen3Next, Qwen3.5, + Nemotron-Hybrid) this dispatches to + `get_hybrid_kv_cache_manager` so the linear-attention layers + get a `CppMambaHybridCacheManager` for SSM/conv state. + Mirrors the production dispatch in + `_util.py:_create_kv_cache_manager`. """ # Get cache configuration from the configurable method cache_config = self.get_kv_cache_config(scenario) @@ -527,17 +541,114 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario): num_blocks = (max_seq_len + tokens_per_block - 1) // tokens_per_block - self.kv_cache_manager = self.get_kv_cache_manager( - dtype=self.model_config.pretrained_config.torch_dtype, - config=self.model_config.pretrained_config, + config = self.model_config.pretrained_config + text_config = getattr(config, "text_config", config) + + if is_qwen3_hybrid(text_config) or is_nemotron_hybrid(text_config): + self.kv_cache_manager = self.get_hybrid_kv_cache_manager( + text_config=text_config, + tokens_per_block=tokens_per_block, + max_seq_len=max_seq_len, + batch_size=batch_size, + num_blocks=num_blocks, + ) + else: + self.kv_cache_manager = self.get_kv_cache_manager( + dtype=self.model_config.pretrained_config.torch_dtype, + config=self.model_config.pretrained_config, + tokens_per_block=tokens_per_block, + max_seq_len=max_seq_len, + batch_size=batch_size, + num_blocks=num_blocks, + ) + + self.kv_cache_manager.add_dummy_requests( + request_ids=[1], + 
token_nums=[max_seq_len], + **self._dummy_request_kwargs(scenario), + ) + + def _dummy_request_kwargs(self, scenario: MultimodalScenario) -> Dict: + """Optional override hook for extra kwargs to `add_dummy_requests`. + + Subclasses for mRoPE-using models (Qwen2.5-VL, Qwen3-VL, Qwen3.5-VL, + …) should return `{"use_mrope": True}` here so the cache manager + allocates the mRoPE position-id buffer at dummy-request time. + Defaults to an empty dict, preserving existing behavior for tests + that don't care. + """ + return {} + + def get_hybrid_kv_cache_manager( + self, + text_config: PretrainedConfig, + tokens_per_block: int, + max_seq_len: int, + batch_size: int, + num_blocks: int, + ): + """Build a `CppMambaHybridCacheManager` for hybrid linear-attention + models (Qwen3Next, Qwen3.5, Nemotron-Hybrid). + + Mirrors the production construction in + `_util.py:_create_kv_cache_manager` for `is_qwen3_hybrid` / + `is_nemotron_hybrid` configs: pulls the state-shape / dtype / + layer-mask parameters from `extract_mamba_kv_cache_params` and + threads them through the constructor. Tests that need a different + concrete manager (e.g. `MixedMambaHybridCacheManager` for + disagg-style coverage) should override this method. + """ + dtype_map = { + torch.half: tensorrt_llm.bindings.DataType.HALF, + torch.float16: tensorrt_llm.bindings.DataType.HALF, + torch.bfloat16: tensorrt_llm.bindings.DataType.BF16, + } + + mamba_params = extract_mamba_kv_cache_params(text_config) + if mamba_params.dtype not in dtype_map: + raise ValueError( + f"Unsupported dtype for hybrid cache manager: " + f"{mamba_params.dtype}. 
Supported: {list(dtype_map.keys())}" + ) + kv_cache_dtype = dtype_map[mamba_params.dtype] + + head_dim = getattr(text_config, "head_dim", None) + if not isinstance(head_dim, int): + head_dim = text_config.hidden_size // text_config.num_attention_heads + + # CppMambaHybridCacheManager reads Pydantic-only fields + # (mamba_state_cache_interval, enable_block_reuse) so we have to + # construct the llmapi.llm_args.KvCacheConfig here, not the C++ + # bindings KvCacheConfig that the standard KVCacheManager path uses. + kv_cache_config = PyKvCacheConfig(max_tokens=num_blocks * tokens_per_block) + mapping = Mapping(world_size=1, tp_size=1, rank=0) + + return CppMambaHybridCacheManager( + # mamba cache parameters (positional) + mamba_params.state_size, + mamba_params.conv_kernel, + mamba_params.num_heads, + mamba_params.n_groups, + mamba_params.head_dim, + mamba_params.num_mamba_layers, + mamba_params.mamba_layer_mask, + mamba_params.dtype, + mamba_params.mamba_ssm_cache_dtype, + # kv cache parameters (positional) + kv_cache_config, + tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF, + # kw-only + num_layers=mamba_params.num_full_attention_layers, + layer_mask=mamba_params.full_attention_layer_mask, + num_kv_heads=text_config.num_key_value_heads, + head_dim=head_dim, tokens_per_block=tokens_per_block, max_seq_len=max_seq_len, - batch_size=batch_size, - num_blocks=num_blocks, + max_batch_size=batch_size, + mapping=mapping, + dtype=kv_cache_dtype, ) - self.kv_cache_manager.add_dummy_requests(request_ids=[1], token_nums=[max_seq_len]) - def get_max_num_tokens(self, scenario: MultimodalScenario) -> int: """Get maximum number of tokens for attention metadata.""" if scenario.chunked_prefill: @@ -695,6 +806,14 @@ def setUp(self): # TODO: Add multi-GPU support self.device = torch.device("cuda:0") + # Pre-initialize fields that tearDown / setup_scenario expect to + # exist. Without this, a test method that doesn't run + # setup_scenario (e.g. 
a setUp-only smoke test) leaves + # self.kv_cache_manager unset and tearDown errors with + # AttributeError on the ``is not None`` check. + self.kv_cache_manager = None + self.attn_metadata = None + self.hf_config = self.create_hf_config() if self.skip_hf_inference: # Create a dummy torch module if skipping HF inference. diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py index 49fc4cbe4902..b102d231b810 100644 --- a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py +++ b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py @@ -2,16 +2,23 @@ # SPDX-License-Identifier: Apache-2.0 import json +import os +from copy import deepcopy from pathlib import Path +from typing import List, Optional import torch import transformers +from test_modeling_multimodal import MultimodalScenario, TestModelingMultimodal +from transformers import Qwen3_5MoeForConditionalGeneration as HFQwen3_5MoeForConditionalGeneration +from utils.llm_data import llm_models_root from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.models import Qwen3_5MoeVLModel from tensorrt_llm._torch.models.checkpoints.auto_mapper import AutoCheckpointMapper from tensorrt_llm._torch.models.checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper from tensorrt_llm._torch.models.modeling_auto import AutoModelForCausalLM +from tensorrt_llm._torch.models.modeling_qwen3_5 import _normalize_qwen35_moe_vl_config from tensorrt_llm._torch.pyexecutor.config_utils import ( extract_mamba_kv_cache_params, load_pretrained_config, @@ -142,3 +149,282 @@ def test_qwen35_moe_vl_placeholder_metadata_registered() -> None: } assert metadata.placeholders_separator == "" assert metadata.content_format is ContentFormat.STRING + + +# --- Layered parity test scaffold ------------------------------------------- +# +# Tiny synthetic config used by TestQwen3_5MoeVL below. 
Same architecture as +# the real Qwen/Qwen3.5-35B-A3B checkpoint but with much smaller dimensions +# where possible. +# +# Shapes that have to match real Qwen3.5 (can't shrink without breaking +# things downstream): +# +# - head_dim=256, partial_rotary_factor=0.25 --> rotary tensor width is +# `head_dim * 0.25 / 2 = 32`, which equals `sum(mrope_section)`. +# A smaller head_dim (e.g. 128) yields a 16-wide tensor that mRoPE +# can't split with section sum 32. +# - num_attention_heads=16, num_key_value_heads=2 match the real +# model's 8:1 GQA layout; Q proj is 2048 --> 4096, K/V are 2048 --> 512. +# - Vision deepstack indices [8, 16, 24] match the real config, and +# depth=27 is the smallest value that hosts those indices. Disabling +# deepstack (indices=[], depth=2) produces fewer vision embeddings +# than the HF processor reserves placeholder tokens for, which +# breaks `fuse_input_embeds`. +# - vocab_size=248320 matches the real Qwen3.5 tokenizer. The +# tokenizer (loaded via _name_or_path) emits special-token ids in +# the 248k range; `fuse_input_embeds` uses `vocab_size` as the +# OOV threshold to identify image-pad tokens. A smaller vocab_size +# would misclassify regular chat-template specials as mm tokens +# and trip the placeholder/embedding count check. +# +# Shapes that can be shrunk for tests: +# +# - num_hidden_layers: 2 (vs 40+). +# - num_experts: 128 (vs 256). Still moderate so MoE routing runs. +# - full_attention_interval=2 with 2 LM layers yields the pattern +# [linear_attention, full_attention] — one of each kind, exercising +# both the regular KV cache and the Mamba SSM/conv state via the +# base-class dispatch. +# +# `_name_or_path` points at the real checkpoint dir so the test can load +# the tokenizer/processor (only the processor; not the full model weights). 
+QWEN3_5_VL_MOE_PARITY_CONFIG = { + "architectures": ["Qwen3_5MoeForConditionalGeneration"], + "image_token_id": 248056, + "model_type": "qwen3_5_moe", + "text_config": { + "attention_bias": False, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151645, + "full_attention_interval": 2, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 2048, + "linear_conv_kernel_dim": 4, + "linear_key_head_dim": 128, + "linear_num_key_heads": 16, + "linear_num_value_heads": 32, + "linear_value_head_dim": 128, + "mamba_ssm_dtype": "float32", + "max_position_embeddings": 8192, + "mlp_only_layers": [], + "model_type": "qwen3_5_moe_text", + "moe_intermediate_size": 512, + "norm_topk_prob": True, + "num_attention_heads": 16, + "num_experts": 128, + "num_experts_per_tok": 8, + "num_hidden_layers": 2, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-6, + "shared_expert_intermediate_size": 512, + "rope_parameters": { + "mrope_section": [11, 11, 10], + "partial_rotary_factor": 0.25, + "rope_theta": 1000000.0, + "rope_type": "default", + }, + "use_cache": True, + "vocab_size": 248320, + }, + "tie_word_embeddings": False, + "video_token_id": 248057, + "vision_config": { + "deepstack_visual_indexes": [8, 16, 24], + "depth": 27, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "in_channels": 3, + "initializer_range": 0.02, + "intermediate_size": 4304, + "model_type": "qwen3_5_moe", + "num_heads": 16, + "num_position_embeddings": 2304, + "out_hidden_size": 2048, + "patch_size": 16, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + "vision_end_token_id": 248054, + "vision_start_token_id": 248053, + "_name_or_path": str(os.path.join(llm_models_root(), "Qwen3.5-35B-A3B")), +} + + +class TestQwen3_5MoeVL(TestModelingMultimodal): + """Forward-parity test for Qwen3.5-MoE-VL against HuggingFace. 
+ + Tiny-synthetic-config parity test in the same shape as + `TestQwen3VLMoe` / `TestQwen2_5VL`: both stacks are constructed + from `QWEN3_5_VL_MOE_PARITY_CONFIG` (2 LM layers, 1 linear + 1 full + attention, 128 experts, 27 vision layers), HF weights are copied + into TRT-LLM via `Qwen3_5MoeHfWeightMapper`, then `test_all` + sweeps the default `MultimodalScenario`s comparing last-position + logits at context + generation phases. + + Two-config design: + - `self.hf_config` is the raw `Qwen3_5MoeConfig.from_dict(...)` + result. HF model construction sees the native HF schema + (`rope_parameters` intact with `rope_type`, + `moe_intermediate_size`, …). + - TRT-LLM gets a deep-copied + normalized version via the + `create_trtllm_model` override below. That copy goes through + `_normalize_qwen35_moe_vl_config` exactly the same way + production `load_pretrained_config` does, so the Qwen3Next + runtime sees the flat aliases it expects + (`intermediate_size`, `rope_theta`, `rope_scaling`, …). + + Keeping the two configs separate means the production normalizer + doesn't need to be HF-safe — production only ever constructs the + TRT-LLM model from a normalized config, and the test mirrors that + boundary explicitly. The hybrid-cache path is handled by the base + class's `init_kv_cache_manager` dispatch on + `is_qwen3_hybrid` / `is_nemotron_hybrid`. + """ + + def get_model_config(self): + return QWEN3_5_VL_MOE_PARITY_CONFIG + + def get_trtllm_model_class(self): + return Qwen3_5MoeVLModel + + def get_hf_model_class(self): + return HFQwen3_5MoeForConditionalGeneration + + def get_weight_mapper_class(self): + return Qwen3_5MoeHfWeightMapper + + def get_model_type(self): + return "qwen3_5_moe" + + def get_model_config_class(self): + return transformers.Qwen3_5MoeConfig + + def create_trtllm_model( + self, + load_weights: bool = False, + hf_model_state_dict: Optional[dict] = None, + **kwargs, + ): + """Build the TRT-LLM model from a *normalized copy* of `self.hf_config`.
+ + Mirrors the base-class body but swaps in + `_normalize_qwen35_moe_vl_config(trtllm_config)` before + wrapping in `ModelConfig`. `self.hf_config` itself stays + raw so the HF model that the base class builds in `setUp` + sees native HF schema. + """ + trtllm_config = deepcopy(self.hf_config) + _normalize_qwen35_moe_vl_config(trtllm_config) + + model_config = ModelConfig(pretrained_config=trtllm_config) + model_class = self.get_trtllm_model_class() + model = model_class(model_config, **kwargs).to("cuda") + + if load_weights: + weight_mapper_class = self.get_weight_mapper_class() + if weight_mapper_class is not None: + weight_mapper = weight_mapper_class() + weight_mapper.init_model_and_config(model, trtllm_config) + model.load_weights(hf_model_state_dict, weight_mapper) + else: + model.load_weights(hf_model_state_dict) + + for module in model.modules(): + if hasattr(module, "post_load_weights") and not getattr( + module, "_weights_removed", False + ): + module.post_load_weights() + + return model, model_config + + def _dummy_request_kwargs(self, scenario): + """Qwen3.5-VL uses mRoPE; the cache manager needs the mRoPE + position-id buffer allocated at dummy-request time.""" + return {"use_mrope": True} + + def get_trtllm_inputs( + self, + input_ids, + multimodal_params_list, + is_gen: bool = False, + num_cached_tokens_per_seq: Optional[List[int]] = None, + total_prompt_len: Optional[int] = None, + ): + """Override position_ids with mRoPE position IDs from the + multimodal params. Same pattern as `TestQwen3VLMoe` — the + VLM wrapper feeds mRoPE-shaped position IDs to the decoder, + not the simple range-based default the base class produces. 
+ """ + trtllm_inputs = super().get_trtllm_inputs( + input_ids, + multimodal_params_list, + is_gen, + num_cached_tokens_per_seq, + total_prompt_len=total_prompt_len, + ) + + if is_gen: + mrope_gen_position_ids = [] + for multimodal_param in multimodal_params_list: + mrope_gen_position_ids.append( + multimodal_param.multimodal_data["mrope_config"]["mrope_position_deltas"] + ) + mrope_gen_position_ids = torch.cat(mrope_gen_position_ids, dim=-1).to(self.device) + trtllm_inputs["position_ids"] = ( + (trtllm_inputs["position_ids"] + mrope_gen_position_ids).expand(3, -1, 1).cuda() + ) + gen_multimodal_params_list = [] + for multimodal_param in multimodal_params_list: + multimodal_param.strip_for_generation() + multimodal_param.to_device( + "multimodal_data", + self.device, + pin_memory=True, + target_keywords=["mrope_config.mrope_position_deltas"], + ) + gen_multimodal_params_list.append(multimodal_param) + trtllm_inputs["multimodal_params"] = gen_multimodal_params_list + else: + mrope_position_ids = [] + for multimodal_param in multimodal_params_list: + mrope_position_ids.append( + multimodal_param.multimodal_data["mrope_config"]["mrope_position_ids"] + ) + position_ids = torch.cat(mrope_position_ids, dim=-1).cuda() + trtllm_inputs["position_ids"] = position_ids + + return trtllm_inputs + + def get_scenarios(self) -> List[MultimodalScenario]: + """Minimal scenario sweep for the initial coverage. + + Starts with one image scenario, no CUDA graph / chunked + prefill / kv-cache reuse — those add additional surface area + (mRoPE handling under graph capture, multimodal cumsum under + chunking, etc.) that's worth adding incrementally once the + baseline parity passes. + """ + return [ + MultimodalScenario( + modality="image", + use_cuda_graph=False, + chunked_prefill=False, + kv_cache_reuse=False, + ), + ] + + def test_construction_and_weight_loading_smoke(self): + """Smoke test: setUp built HF + TRT-LLM models and copied HF + weights into TRT-LLM via the weight mapper. 
Detailed + assertions on the normalizer's outputs live in the routing + tests above (e.g. `test_qwen35_moe_vl_config_preserves_vlm_architecture`) + — this one just confirms construction reached the end without + exception. + """ + self.assertIsNotNone(self.hf_model) + self.assertIsNotNone(self.trtllm_model) + self.assertIsNotNone(self.model_config) From 6b72d9d7b2b8b00ea3479c37ba9510f0c76e5470 Mon Sep 17 00:00:00 2001 From: Michal Guzek Date: Mon, 18 May 2026 11:23:33 -0700 Subject: [PATCH 6/9] Formatting Signed-off-by: Michal Guzek --- tensorrt_llm/_torch/models/modeling_qwen3_5.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py index a1d8aaa69a08..28922b23bb6e 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py @@ -75,26 +75,26 @@ def _translate_mtp_pattern(name, n_hidden_layers): # --- Config adapters -------------------------------------------------------- # -# These run from ``load_pretrained_config`` in -# ``tensorrt_llm/_torch/pyexecutor/config_utils.py`` via lazy import — the +# These run from `load_pretrained_config` in +# `tensorrt_llm/_torch/pyexecutor/config_utils.py` via lazy import — the # runtime layer asks the model module how to load its own config. # # There are two entry points: -# - ``_Qwen35ConfigCompat.normalize(config_dict)`` — for text-only +# - `_Qwen35ConfigCompat.normalize(config_dict)` — for text-only # Qwen3.5 (MoE and dense). Returns a dict that -# ``transformers.Qwen3NextConfig.from_dict(...)`` can consume, so the +# `transformers.Qwen3NextConfig.from_dict(...)` can consume, so the # existing Qwen3Next runtime is reused unchanged. -# - ``_normalize_qwen35_moe_vl_config(model_config)`` — for the -# Qwen3.5-MoE VLM. 
Mutates the HF-native ``transformers.Qwen3_5MoeConfig`` +# - `_normalize_qwen35_moe_vl_config(model_config)` — for the +# Qwen3.5-MoE VLM. Mutates the HF-native `transformers.Qwen3_5MoeConfig` # in place, attaching the runtime aliases the Qwen3Next-based LM expects -# while keeping ``text_config`` / ``vision_config`` composite. +# while keeping `text_config` / `vision_config` composite. class _Qwen35ConfigCompat: """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig. - We normalize to ``Qwen3NextConfig`` (rather than to a Qwen3.5-native - schema) so the runtime can reuse the existing ``Qwen3NextForCausalLM`` + We normalize to `Qwen3NextConfig` (rather than to a Qwen3.5-native + schema) so the runtime can reuse the existing `Qwen3NextForCausalLM` model implementation unchanged — Qwen3.5 text is structurally identical to Qwen3Next, so matching the config schema lets the same code serve both. From 8da13df7da73e60f47fb9e12e2df18bfd1134fe5 Mon Sep 17 00:00:00 2001 From: Michal Guzek Date: Tue, 19 May 2026 12:57:26 -0700 Subject: [PATCH 7/9] Address CodeRabbit review Signed-off-by: Michal Guzek --- .../defs/accuracy/references/mmmu.yaml | 2 ++ .../test_llm_api_pytorch_multimodal.py | 19 +++++++++--- .../test_lists/qa/llm_function_core.txt | 1 + .../modeling/test_modeling_qwen3_5_vl_moe.py | 31 ++++++++++++++----- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml index 15d77e50e2de..69e8c050e440 100644 --- a/tests/integration/defs/accuracy/references/mmmu.yaml +++ b/tests/integration/defs/accuracy/references/mmmu.yaml @@ -64,6 +64,8 @@ Qwen/Qwen3.5-35B-A3B: - accuracy: 59.0 - dtype: bfloat16 accuracy: 60.444 + - quant_algo: FP8_BLOCK_SCALES + accuracy: 58.889 # Kimi K2.5 multimodal (MoonViT + DeepSeek-V3 MoE backbone, ~1T params). # Values below are measured with NVFP4 checkpoint (thinking mode enabled). 
moonshotai/Kimi-K2.5: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py index 2a715fc33124..76c2b532e006 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py @@ -441,13 +441,24 @@ class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False) - def test_auto_dtype(self) -> None: - with LLM( - self.MODEL_PATH, + def _make_llm(self, model_path: str) -> LLM: + return LLM( + model_path, max_num_tokens=self.MAX_NUM_TOKENS, max_batch_size=self.MAX_BATCH_SIZE, kv_cache_config=self.kv_cache_config, - ) as llm: + ) + + def test_auto_dtype(self) -> None: + with self._make_llm(self.MODEL_PATH) as llm: + task = MMMU(self.MODEL_NAME) + task.evaluate(llm, sampling_params=self.sampling_params) + + @skip_pre_hopper + def test_fp8_prequantized(self) -> None: + model_path = f"{llm_models_root()}/Qwen3.5-35B-A3B-FP8" + with self._make_llm(model_path) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES task = MMMU(self.MODEL_NAME) task.evaluate(llm, sampling_params=self.sampling_params) diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index e52c36078273..360170d89f68 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -802,6 +802,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL::test_auto_dtype[forced accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestKimiK25::test_nvfp4[dep8] accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_auto_dtype 
+accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_fp8_prequantized accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray unittest/disaggregated/test_openai_disagg_server.py diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py index b102d231b810..6956d497e3a6 100644 --- a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py +++ b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py @@ -400,13 +400,18 @@ def get_trtllm_inputs( return trtllm_inputs def get_scenarios(self) -> List[MultimodalScenario]: - """Minimal scenario sweep for the initial coverage. - - Starts with one image scenario, no CUDA graph / chunked - prefill / kv-cache reuse — those add additional surface area - (mRoPE handling under graph capture, multimodal cumsum under - chunking, etc.) that's worth adding incrementally once the - baseline parity passes. + """Modality-sanity sweep (image / multiple_image / video). + + These three catch differences in placeholder counts and the + multimodal-cumsum path between single-image, multi-image, and + video inputs. + + CUDA-graph capture is intentionally not exercised here. The + standard `attn_metadata.create_cuda_graph_metadata` path only + addresses attention metadata; the Mamba SSM state buffer of the + hybrid (Mamba + attention) cache is not threaded through, so + replayed logits diverge from the HF reference. Adding that path + is dedicated harness work and tracked separately. 
""" return [ MultimodalScenario( @@ -415,6 +420,18 @@ def get_scenarios(self) -> List[MultimodalScenario]: chunked_prefill=False, kv_cache_reuse=False, ), + MultimodalScenario( + modality="multiple_image", + use_cuda_graph=False, + chunked_prefill=False, + kv_cache_reuse=False, + ), + MultimodalScenario( + modality="video", + use_cuda_graph=False, + chunked_prefill=False, + kv_cache_reuse=False, + ), ] def test_construction_and_weight_loading_smoke(self): From d5e221a64b500880381ca0b0b9934a6d208b63ae Mon Sep 17 00:00:00 2001 From: Michal Guzek Date: Tue, 19 May 2026 13:53:11 -0700 Subject: [PATCH 8/9] Address review comments Signed-off-by: Michal Guzek --- docs/source/models/supported-models.md | 1 + .../_torch/models/modeling_qwen3_5.py | 25 ++++++++++--------- .../_torch/pyexecutor/config_utils.py | 4 +-- .../test_lists/test-db/l0_l40s.yml | 1 + .../modeling/test_modeling_qwen3_5_vl_moe.py | 17 +++++++------ 5 files changed, 27 insertions(+), 21 deletions(-) diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md index 6670e8366b96..6c839c97ac4f 100644 --- a/docs/source/models/supported-models.md +++ b/docs/source/models/supported-models.md @@ -95,6 +95,7 @@ Note: Support for other models may vary. 
Features marked "N/A" are not applicabl | `Qwen2_5_VLForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | L + I + V | | `Qwen3VLForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | L + I + V | | `Qwen3VLMoeForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | L + I + V | +| `Qwen3_5MoeForConditionalGeneration` | Yes | Yes | Untested | Yes | Yes | No | Untested | Yes | L + I + V | Note: - L: Language diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py index 28922b23bb6e..4f325dbb0bcb 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py @@ -80,7 +80,7 @@ def _translate_mtp_pattern(name, n_hidden_layers): # runtime layer asks the model module how to load its own config. # # There are two entry points: -# - `_Qwen35ConfigCompat.normalize(config_dict)` — for text-only +# - `Qwen35ConfigCompat.normalize(config_dict)` — for text-only # Qwen3.5 (MoE and dense). Returns a dict that # `transformers.Qwen3NextConfig.from_dict(...)` can consume, so the # existing Qwen3Next runtime is reused unchanged. @@ -90,7 +90,7 @@ def _translate_mtp_pattern(name, n_hidden_layers): # while keeping `text_config` / `vision_config` composite. -class _Qwen35ConfigCompat: +class Qwen35ConfigCompat: """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig. 
We normalize to `Qwen3NextConfig` (rather than to a Qwen3.5-native @@ -111,9 +111,9 @@ class _Qwen35ConfigCompat: @staticmethod def normalize(config_dict: dict) -> dict: """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict.""" - text_config = _Qwen35ConfigCompat._extract_text_config(config_dict) - text_config = _Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config) - text_config = _Qwen35ConfigCompat._flatten_rope(text_config) + text_config = Qwen35ConfigCompat._extract_text_config(config_dict) + text_config = Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config) + text_config = Qwen35ConfigCompat._flatten_rope(text_config) # Detect dense vs MoE and set architecture + MoE defaults accordingly is_moe = "num_experts" in text_config and text_config["num_experts"] > 0 @@ -138,7 +138,7 @@ def normalize(config_dict: dict) -> dict: def _extract_text_config(config_dict: dict) -> dict: """Pull nested text_config from VLM checkpoints, or use dict as-is.""" architectures = config_dict.get("architectures") or [] - if architectures and architectures[0] in _Qwen35ConfigCompat._VLM_ARCHITECTURES: + if architectures and architectures[0] in Qwen35ConfigCompat._VLM_ARCHITECTURES: text_config = dict(config_dict.get("text_config") or {}) else: text_config = dict(config_dict) @@ -161,10 +161,10 @@ def _inherit_quantization_config(config_dict: dict, text_config: dict) -> dict: quantization_config = dict(config_dict["quantization_config"]) if "modules_to_not_convert" in quantization_config: - modules = _Qwen35ConfigCompat._normalize_exclude_modules( + modules = Qwen35ConfigCompat._normalize_exclude_modules( quantization_config["modules_to_not_convert"] ) - modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(text_config, modules) + modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround(text_config, modules) quantization_config["modules_to_not_convert"] = sorted(set(modules)) text_config["quantization_config"] = 
quantization_config return text_config @@ -254,7 +254,7 @@ def _normalize_qwen35_mrope_config(text_config) -> None: return if hasattr(rope_parameters, "to_dict"): rope_parameters = rope_parameters.to_dict() - flattened = _Qwen35ConfigCompat._flatten_rope( + flattened = Qwen35ConfigCompat._flatten_rope( { "rope_parameters": dict(rope_parameters), "rope_scaling": dict(getattr(text_config, "rope_scaling", None) or {}), @@ -290,9 +290,9 @@ def _normalize_qwen35_quantization_config(model_config) -> None: return text_config = getattr(model_config, "text_config", None) - normalized_modules = _Qwen35ConfigCompat._normalize_exclude_modules(modules) + normalized_modules = Qwen35ConfigCompat._normalize_exclude_modules(modules) if text_config is not None: - normalized_modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround( + normalized_modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround( text_config.to_dict(), normalized_modules ) quantization_config["modules_to_not_convert"] = sorted(set(normalized_modules)) @@ -390,7 +390,7 @@ class Qwen3_5ForCausalLM(Qwen3NextForCausalLM): Same reuse pattern as Qwen3_5MoeForCausalLM, but for the dense 27B variant which uses GatedMLP instead of SparseMoeBlock. The config - normalizer (_Qwen35ConfigCompat) sets num_experts=0 so that + normalizer (Qwen35ConfigCompat) sets num_experts=0 so that Qwen3NextModel selects GatedMLP for the feed-forward layers. """ @@ -399,6 +399,7 @@ def __init__(self, model_config): super().__init__(model_config) +# TODO: Add tests for disaggregated support. 
@support_multimodal_disaggregated @register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel) @register_auto_model("Qwen3_5MoeForConditionalGeneration") diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py index 978796f26d46..c6790ebacf45 100644 --- a/tensorrt_llm/_torch/pyexecutor/config_utils.py +++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py @@ -379,9 +379,9 @@ def load_pretrained_config(model_name_or_path: str, )): # Qwen3.5 text-only: flatten to Qwen3NextConfig via the model-side shim. from tensorrt_llm._torch.models.modeling_qwen3_5 import \ - _Qwen35ConfigCompat + Qwen35ConfigCompat model_config = transformers.Qwen3NextConfig.from_dict( - _Qwen35ConfigCompat.normalize(config_dict)) + Qwen35ConfigCompat.normalize(config_dict)) elif (model_type == "exaone4" and config_dict.get("sliding_window") is None and config_dict.get("layer_types") is None): # transformers 5.5.x Exaone4Config.__post_init__ first forces diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml index 9c72f9dccb86..a82a62e2c77d 100644 --- a/tests/integration/test_lists/test-db/l0_l40s.yml +++ b/tests/integration/test_lists/test-db/l0_l40s.yml @@ -23,6 +23,7 @@ l0_l40s: - unittest/_torch/modeling/test_modeling_qwen2_5vl.py::TestQwen2_5_VL::test_all - unittest/_torch/modeling/test_modeling_qwen3vl_moe.py::TestQwen3VLMoe::test_all - unittest/_torch/modeling/test_modeling_qwen3vl.py::TestQwen3VL::test_all + - unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py::TestQwen3_5MoeVL::test_all - test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] - unittest/llmapi/apps/_test_openai_chat_multimodal.py::test_single_chat_session_image_embeds -m needs_l40s # MMMU sanity check diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py 
index 6956d497e3a6..df30e93d89e0 100644 --- a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py +++ b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py @@ -325,13 +325,9 @@ def create_trtllm_model( model = model_class(model_config, **kwargs).to("cuda") if load_weights: - weight_mapper_class = self.get_weight_mapper_class() - if weight_mapper_class is not None: - weight_mapper = weight_mapper_class() - weight_mapper.init_model_and_config(model, trtllm_config) - model.load_weights(hf_model_state_dict, weight_mapper) - else: - model.load_weights(hf_model_state_dict) + weight_mapper = self.get_weight_mapper_class()() + weight_mapper.init_model_and_config(model, trtllm_config) + model.load_weights(hf_model_state_dict, weight_mapper) for module in model.modules(): if hasattr(module, "post_load_weights") and not getattr( @@ -346,6 +342,13 @@ def _dummy_request_kwargs(self, scenario): position-id buffer allocated at dummy-request time.""" return {"use_mrope": True} + def get_tolerance(self): + """Tighten `rtol` to `0.1` (4x tighter than the base 0.4 + default) while keeping `atol` at `0.4` to absorb single-logit + tail outliers seen on `multiple_image` / `video`. 
+ """ + return 0.4, 0.1 + def get_trtllm_inputs( self, input_ids, From ee6511e349831e89e07b09ed77bc5fd846aff0ae Mon Sep 17 00:00:00 2001 From: Michal Guzek Date: Wed, 20 May 2026 20:38:57 -0700 Subject: [PATCH 9/9] Restore tensorrt_llm/_torch/configs/__init__.py from main Signed-off-by: Michal Guzek --- tensorrt_llm/_torch/configs/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/configs/__init__.py b/tensorrt_llm/_torch/configs/__init__.py index 0ab6bc3fcacf..6496e3283451 100644 --- a/tensorrt_llm/_torch/configs/__init__.py +++ b/tensorrt_llm/_torch/configs/__init__.py @@ -24,6 +24,4 @@ def _register_custom_configs_with_transformers() -> None: _register_custom_configs_with_transformers() del _register_custom_configs_with_transformers -__all__ = [ - "DeepseekV3Config", -] +__all__ = ["DeepseekV3Config"]