From d917e03c42ba5558ae1190c69e496fd1a919e562 Mon Sep 17 00:00:00 2001
From: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
Date: Tue, 31 Mar 2026 03:08:11 +0000
Subject: [PATCH 1/9] [None][feat] Add Qwen3.5 multimodal support.

Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
---
 tensorrt_llm/_torch/configs/__init__.py       |  5 +-
 .../_torch/models/modeling_qwen3_5.py         | 95 ++++++++++++++++++-
 .../_torch/models/modeling_qwen3_next.py      | 13 ++-
 .../_torch/models/modeling_qwen3vl.py         |  7 +-
 .../_torch/pyexecutor/config_utils.py         |  1 +
 .../defs/accuracy/references/mmmu.yaml        |  2 +
 .../test_llm_api_pytorch_multimodal.py        | 23 +++++
 7 files changed, 140 insertions(+), 6 deletions(-)

diff --git a/tensorrt_llm/_torch/configs/__init__.py b/tensorrt_llm/_torch/configs/__init__.py
index 6496e3283451..b4ba4c5183f6 100644
--- a/tensorrt_llm/_torch/configs/__init__.py
+++ b/tensorrt_llm/_torch/configs/__init__.py
@@ -24,4 +24,7 @@ def _register_custom_configs_with_transformers() -> None:
 _register_custom_configs_with_transformers()
 del _register_custom_configs_with_transformers
 
-__all__ = ["DeepseekV3Config"]
+__all__ = [
+    "DeepseekV3Config",
+    "Qwen3_5MoeConfig",
+]
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
index bf83e916db29..2c15b851d511 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
@@ -1,7 +1,26 @@
 import re
-
+from typing import Dict, List
+
+import torch
+from transformers import PretrainedConfig
+
+from ...inputs import (
+    MultimodalPlaceholderMetadata,
+    MultimodalPlaceholderPlacement,
+    register_input_processor,
+    support_multimodal_disaggregated,
+)
+from .checkpoints.base_weight_mapper import BaseWeightMapper
+from .checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper
+from .modeling_multimodal_utils import _is_disagg
 from .modeling_qwen3_next import Qwen3NextForCausalLM
-from .modeling_utils import register_auto_model
+from .modeling_qwen3vl import (
+    Qwen3VisionModel,
+    Qwen3VisionModelBase,
+    Qwen3VLInputProcessorBase,
+    Qwen3VLModelBase,
+)
+from .modeling_utils import ModelConfig, register_auto_model, register_vision_encoder
 
 _LANG_PREFIX = "model.language_model."
 
@@ -93,6 +112,32 @@ def _normalize_qwen35_exclude_modules(model_config):
     qc.exclude_modules = sorted(normalized)
 
 
+def _ensure_qwen35_mrope_compat(text_config: PretrainedConfig) -> None:
+    """Normalize Qwen3.5 mRoPE fields for the shared Qwen3-VL wrapper.
+
+    Qwen3.5 stores RoPE metadata in ``rope_parameters``.  Some config classes
+    may also materialize default top-level ``rope_theta`` or
+    ``partial_rotary_factor`` values, so prefer the checkpoint-provided nested
+    values unconditionally here.
+    """
+    rope_parameters = getattr(text_config, "rope_parameters", None)
+    if not rope_parameters:
+        return
+
+    rope_params = dict(rope_parameters)
+    rope_theta = rope_params.pop("rope_theta", None)
+    if rope_theta is not None:
+        text_config.rope_theta = rope_theta
+
+    partial_rotary_factor = rope_params.pop("partial_rotary_factor", None)
+    if partial_rotary_factor is not None:
+        text_config.partial_rotary_factor = partial_rotary_factor
+
+    if not getattr(text_config, "rope_scaling", None):
+        rope_params.pop("rope_type", None)
+        text_config.rope_scaling = rope_params
+
+
 @register_auto_model("Qwen3_5MoeForCausalLM")
 class Qwen3_5MoeForCausalLM(Qwen3NextForCausalLM):
     """Thin wrapper that registers the Qwen3.5 MoE text architecture.
@@ -133,3 +178,49 @@ class Qwen3_5ForCausalLM(Qwen3NextForCausalLM):
     def __init__(self, model_config):
         _normalize_qwen35_exclude_modules(model_config)
         super().__init__(model_config)
+
+
+@support_multimodal_disaggregated
+@register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel)
+@register_auto_model("Qwen3_5MoeForConditionalGeneration")
+@register_input_processor(
+    Qwen3VLInputProcessorBase,
+    model_type="qwen3_5_moe",
+    placeholder_metadata=MultimodalPlaceholderMetadata(
+        placeholder_map={
+            "image": "<|vision_start|><|image_pad|><|vision_end|>",
+            "video": "<|vision_start|><|video_pad|><|vision_end|>",
+        },
+        placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT,
+        placeholders_separator="",
+    ),
+)
+class Qwen3_5MoeVLModel(Qwen3VLModelBase):
+    """VLM wrapper composing Qwen3 vision encoder with Qwen3.5 MoE text decoder."""
+
+    def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs):
+        _ensure_qwen35_mrope_compat(model_config.pretrained_config.text_config)
+
+        kwargs["vision_model_class"] = Qwen3VisionModel
+        kwargs["disable_fuse_rope"] = kwargs.get("disable_fuse_rope", False)
+        super().__init__(model_config, *args, **kwargs)
+
+    @property
+    def multimodal_data_device_paths(self) -> List[str]:
+        return [
+            "image.pixel_values",
+            "video.pixel_values_videos",
+            "multimodal_embedding",
+        ]
+
+    def load_weights(self, weights: Dict[str, torch.Tensor], weight_mapper: BaseWeightMapper):
+        if not _is_disagg():
+            self.mm_encoder.load_weights(weights)
+        # NOTE: the weight_mapper argument is replaced by a mapper bound to self.llm.
+        weight_mapper = Qwen3_5MoeHfWeightMapper()
+        weight_mapper.init_model_and_config(self.llm, self.model_config)
+        filtered_weights = {k: v for k, v in weights.items() if not k.startswith("model.visual.")}
+        params_map = {
+            r"^model\.language_model\.(.*)$": r"model.\1",
+        }
+        self.llm.load_weights(filtered_weights, weight_mapper, params_map=params_map)
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_next.py b/tensorrt_llm/_torch/models/modeling_qwen3_next.py
index d6f4fd57794f..5d8ca8e81cbd 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_next.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_next.py
@@ -973,9 +973,18 @@ def get_model_defaults(cls, llm_args: 'TorchLlmArgs') -> dict:
         # is supported for Mamba/SSM-based models
         return {"kv_cache_config": {"enable_block_reuse": False}}
 
-    def load_weights(self, weights: dict, weight_mapper: BaseWeightMapper):
+    def load_weights(self,
+                     weights: dict,
+                     weight_mapper: BaseWeightMapper,
+                     params_map: Optional[Dict[str, str]] = None,
+                     allow_partial_loading: bool = False):
         new_weights = weight_mapper.preprocess_weights(weights)
-        super().load_weights(new_weights, weight_mapper)
+        super().load_weights(
+            new_weights,
+            weight_mapper=weight_mapper,
+            params_map=params_map,
+            allow_partial_loading=allow_partial_loading,
+        )
 
     def post_load_weights(self):
         for idx, layer in enumerate(
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3vl.py b/tensorrt_llm/_torch/models/modeling_qwen3vl.py
index 2031a4b7dc18..526b84dbc216 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3vl.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3vl.py
@@ -998,6 +998,8 @@ def __init__(
             llm_model_config.pretrained_config.architectures = ["Qwen3ForCausalLM"]
         elif self.original_arch == "Qwen3VLMoeForConditionalGeneration":
             llm_model_config.pretrained_config.architectures = ["Qwen3MoeForCausalLM"]
+        elif self.original_arch == "Qwen3_5MoeForConditionalGeneration":
+            llm_model_config.pretrained_config.architectures = ["Qwen3_5MoeForCausalLM"]
         else:
             raise ValueError(f"Unsupported architecture: {self.original_arch}")
         # Qwen3ForCausalLM.
@@ -1035,9 +1037,12 @@ def init_mrope_embedding(self, model_config: ModelConfig[PretrainedConfig]):
             mrope_section=config.rope_scaling.get("mrope_section", None),
             mrope_interleaved=config.rope_scaling.get("mrope_interleaved", False),
         )
+        head_dim = getattr(config, "head_dim", None)
+        if not isinstance(head_dim, int):
+            head_dim = config.hidden_size // config.num_attention_heads
         self.rotary_emb = MRotaryEmbedding(
             pos_embd_params.rope,
-            head_dim=config.hidden_size // config.num_attention_heads,
+            head_dim=head_dim,
             is_neox=pos_embd_params.is_neox,
             mrope_section=pos_embd_params.mrope_section,
             mrope_interleaved=pos_embd_params.mrope_interleaved,
diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py
index f4ec50639715..7626d839e92e 100644
--- a/tensorrt_llm/_torch/pyexecutor/config_utils.py
+++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py
@@ -427,6 +427,7 @@ def __getitem__(self, key):
     deepseek_v32="DeepseekV3Config",
     kimi_k2="DeepseekV3Config",
     glm_moe_dsa="DeepseekV3Config",
+    qwen3_5_moe="Qwen3_5MoeConfig",
 )  # NOTE: HF config.json uses deepseek_v32 as model_type but with same DSV3 config class
 
 
diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml
index 34083c6ba5d5..2ecb68d691b6 100644
--- a/tests/integration/defs/accuracy/references/mmmu.yaml
+++ b/tests/integration/defs/accuracy/references/mmmu.yaml
@@ -68,3 +68,5 @@ moonshotai/Kimi-K2.5:
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 81.56
+Qwen/Qwen3.5-35B-A3B:
+  - accuracy: 59.0
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
index d7623cd828ae..37c4b349f102 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
@@ -426,6 +426,29 @@ def test_nvfp4_4gpus(
             task.evaluate(llm, sampling_params=self.sampling_params)
 
 
+class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "Qwen/Qwen3.5-35B-A3B"
+    MODEL_PATH = f"{llm_models_root()}/Qwen3.5-35B-A3B"
+    MAX_NUM_TOKENS = 16384
+
+    sampling_params = SamplingParams(
+        max_tokens=MAX_NUM_TOKENS,
+        truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
+        stop="<|endoftext|>",
+    )
+
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+
+    def test_auto_dtype(self):
+        with LLM(
+            self.MODEL_PATH,
+            max_num_tokens=self.MAX_NUM_TOKENS,
+            kv_cache_config=self.kv_cache_config,
+        ) as llm:
+            task = MMMU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=self.sampling_params)
+
+
 class TestQwen3VL(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-VL-8B-Instruct"

From fef5361b7a13dc1136b74efd1050f27b60273f54 Mon Sep 17 00:00:00 2001
From: Michal Guzek <mguzek@nvidia.com>
Date: Thu, 14 May 2026 18:32:41 -0700
Subject: [PATCH 2/9] Qwen3.5 VL MoE Working draft

Signed-off-by: Michal Guzek <mguzek@nvidia.com>
---
 tensorrt_llm/_torch/configs/__init__.py       |   1 -
 tensorrt_llm/_torch/models/__init__.py        |   4 +-
 .../checkpoints/hf/qwen3_5_weight_mapper.py   |   1 +
 .../_torch/models/modeling_qwen3_5.py         |  30 +---
 .../_torch/pyexecutor/config_utils.py         | 151 +++++++++++++++++-
 .../_torch/pyexecutor/model_loader.py         |  11 +-
 .../defs/accuracy/references/mmmu.yaml        |   3 +-
 .../test_llm_api_pytorch_multimodal.py        |   5 +-
 .../modeling/test_modeling_qwen3_5_vl_moe.py  | 144 +++++++++++++++++
 9 files changed, 307 insertions(+), 43 deletions(-)
 create mode 100644 tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py

diff --git a/tensorrt_llm/_torch/configs/__init__.py b/tensorrt_llm/_torch/configs/__init__.py
index b4ba4c5183f6..0ab6bc3fcacf 100644
--- a/tensorrt_llm/_torch/configs/__init__.py
+++ b/tensorrt_llm/_torch/configs/__init__.py
@@ -26,5 +26,4 @@ def _register_custom_configs_with_transformers() -> None:
 
 __all__ = [
     "DeepseekV3Config",
-    "Qwen3_5MoeConfig",
 ]
diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py
index 55177767ac5a..42229246ae08 100644
--- a/tensorrt_llm/_torch/models/__init__.py
+++ b/tensorrt_llm/_torch/models/__init__.py
@@ -35,7 +35,8 @@
                             Qwen2ForRewardModel)
 from .modeling_qwen2vl import Qwen2_5_VLModel, Qwen2VLModel
 from .modeling_qwen3 import Qwen3ForCausalLM
-from .modeling_qwen3_5 import Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM
+from .modeling_qwen3_5 import (Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM,
+                               Qwen3_5MoeVLModel)
 from .modeling_qwen3_moe import Qwen3MoeForCausalLM
 from .modeling_qwen3_next import Qwen3NextForCausalLM
 from .modeling_qwen3vl import Qwen3VLModel
@@ -86,6 +87,7 @@
     "Qwen3MoeForCausalLM",
     "Qwen3_5ForCausalLM",
     "Qwen3_5MoeForCausalLM",
+    "Qwen3_5MoeVLModel",
     "Qwen3NextForCausalLM",
     "Qwen3MoeVLModel",
     "GptOssForCausalLM",
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py
index fa2f161bdc4f..65e0168bec55 100644
--- a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py
@@ -13,6 +13,7 @@
 
 
 @register_mapper("HF", "Qwen3_5MoeForCausalLM")
+@register_mapper("HF", "Qwen3_5MoeForConditionalGeneration")
 @register_mapper("HF", "Qwen3_5ForCausalLM")
 class Qwen3_5MoeHfWeightMapper(Qwen3NextHfWeightMapper):
     """Weight mapper for Qwen3.5 MoE text checkpoints.
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
index 2c15b851d511..e815c94bd063 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
@@ -5,6 +5,7 @@
 from transformers import PretrainedConfig
 
 from ...inputs import (
+    ContentFormat,
     MultimodalPlaceholderMetadata,
     MultimodalPlaceholderPlacement,
     register_input_processor,
@@ -112,32 +113,6 @@ def _normalize_qwen35_exclude_modules(model_config):
     qc.exclude_modules = sorted(normalized)
 
 
-def _ensure_qwen35_mrope_compat(text_config: PretrainedConfig) -> None:
-    """Normalize Qwen3.5 mRoPE fields for the shared Qwen3-VL wrapper.
-
-    Qwen3.5 stores RoPE metadata in ``rope_parameters``.  Some config classes
-    may also materialize default top-level ``rope_theta`` or
-    ``partial_rotary_factor`` values, so prefer the checkpoint-provided nested
-    values unconditionally here.
-    """
-    rope_parameters = getattr(text_config, "rope_parameters", None)
-    if not rope_parameters:
-        return
-
-    rope_params = dict(rope_parameters)
-    rope_theta = rope_params.pop("rope_theta", None)
-    if rope_theta is not None:
-        text_config.rope_theta = rope_theta
-
-    partial_rotary_factor = rope_params.pop("partial_rotary_factor", None)
-    if partial_rotary_factor is not None:
-        text_config.partial_rotary_factor = partial_rotary_factor
-
-    if not getattr(text_config, "rope_scaling", None):
-        rope_params.pop("rope_type", None)
-        text_config.rope_scaling = rope_params
-
-
 @register_auto_model("Qwen3_5MoeForCausalLM")
 class Qwen3_5MoeForCausalLM(Qwen3NextForCausalLM):
     """Thin wrapper that registers the Qwen3.5 MoE text architecture.
@@ -193,14 +168,13 @@ def __init__(self, model_config):
         },
         placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT,
         placeholders_separator="",
+        content_format=ContentFormat.STRING,
     ),
 )
 class Qwen3_5MoeVLModel(Qwen3VLModelBase):
     """VLM wrapper composing Qwen3 vision encoder with Qwen3.5 MoE text decoder."""
 
     def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs):
-        _ensure_qwen35_mrope_compat(model_config.pretrained_config.text_config)
-
         kwargs["vision_model_class"] = Qwen3VisionModel
         kwargs["disable_fuse_rope"] = kwargs.get("disable_fuse_rope", False)
         super().__init__(model_config, *args, **kwargs)
diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py
index 7626d839e92e..761efebbf7cc 100644
--- a/tensorrt_llm/_torch/pyexecutor/config_utils.py
+++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py
@@ -6,6 +6,7 @@
 import torch
 import transformers
 
+from tensorrt_llm._utils import str_dtype_to_torch
 from tensorrt_llm.logger import logger
 
 
@@ -21,6 +22,57 @@ def is_hybrid_linear(config):
     return is_nemotron_hybrid(config) or is_qwen3_hybrid(config)
 
 
+def _coerce_torch_dtype(dtype):
+    """Normalize dtype values from HF configs into torch dtype objects.
+
+    HF configs may store dtype fields as torch dtypes, strings, or the sentinel
+    value "auto". Returning None for "auto" lets the caller keep its normal
+    fallback path instead of treating "auto" as a concrete cache dtype.
+    """
+    if isinstance(dtype, torch.dtype):
+        return dtype
+    if dtype == "auto":
+        return None
+    if isinstance(dtype, str):
+        return str_dtype_to_torch(dtype)
+    return dtype
+
+
+def resolve_hf_torch_dtype(config):
+    """Return the model's regular tensor dtype from common HF config fields.
+
+    Transformers has used both dtype and torch_dtype across versions and model
+    families. This helper checks both names and coerces whichever one is present
+    into the form expected by TRT-LLM runtime code.
+    """
+    for attr in ("dtype", "torch_dtype"):
+        dtype = getattr(config, attr, None)
+        if dtype is not None:
+            return _coerce_torch_dtype(dtype)
+    return None
+
+
+def resolve_mamba_ssm_cache_dtype(config):
+    """Return the dtype to use for hybrid Mamba/SSM cache allocations.
+
+    Qwen3.5-style configs may store this field on the top-level config or the
+    nested text_config, and may call it either mamba_ssm_cache_dtype or
+    mamba_ssm_dtype. This helper centralizes that lookup so cache creation does
+    not fail later with a missing dtype.
+    """
+    configs = [config]
+    text_config = getattr(config, "text_config", None)
+    if text_config is not None:
+        configs.append(text_config)
+
+    for candidate_config in configs:
+        for attr in ("mamba_ssm_cache_dtype", "mamba_ssm_dtype"):
+            dtype = getattr(candidate_config, attr, None)
+            if dtype is not None:
+                return _coerce_torch_dtype(dtype)
+    return None
+
+
 def is_nemotron_hybrid(config):
     if hasattr(config, "hybrid_override_pattern"
                ) and config.hybrid_override_pattern is not None and len(
@@ -251,6 +303,12 @@ def extract_mamba_kv_cache_params(
 
     mamba_ssm_cache_dtype = (quant_config.mamba_ssm_cache_dtype
                              if quant_config is not None else None)
+    if mamba_ssm_cache_dtype is not None:
+        mamba_ssm_cache_dtype = _coerce_torch_dtype(mamba_ssm_cache_dtype)
+    else:
+        mamba_ssm_cache_dtype = (resolve_mamba_ssm_cache_dtype(config)
+                                 or resolve_hf_torch_dtype(config)
+                                 or torch.bfloat16)
 
     return MambaKVCacheParams(
         state_size=state_size,
@@ -262,16 +320,21 @@ def extract_mamba_kv_cache_params(
         full_attention_layer_mask=full_attn_mask,
         num_mamba_layers=sum(mamba_mask),
         num_full_attention_layers=sum(full_attn_mask),
-        dtype=config.torch_dtype,
+        dtype=resolve_hf_torch_dtype(config) or torch.bfloat16,
         mamba_ssm_cache_dtype=mamba_ssm_cache_dtype,
     )
 
 
 class _Qwen35ConfigCompat:
-    """Temporary shim that normalizes Qwen3.5 HF configs into Qwen3NextConfig.
+    """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig.
+
+    This is used for Qwen3.5 text-only configs and for shared helper logic such
+    as RoPE and quantization exclude-module normalization. Qwen3.5-MoE VLM
+    configs should stay composite and use transformers.Qwen3_5MoeConfig plus
+    _normalize_qwen35_moe_vl_config instead.
 
     To remove: delete this class and the elif branch in
-    load_pretrained_config that references it.
+    load_pretrained_config that flattens Qwen3.5 text configs.
     """
 
     @staticmethod
@@ -415,6 +478,80 @@ def _flatten_rope(text_config: dict) -> dict:
         return text_config
 
 
+def _normalize_qwen35_mrope_config(text_config) -> None:
+    """Materialize Qwen3.5 mRoPE aliases needed by the Qwen3-VL path.
+
+    HF stores RoPE metadata under ``rope_parameters``; the shared Qwen3-VL
+    wrapper reads ``rope_theta``, ``partial_rotary_factor``, and
+    ``rope_scaling`` directly on the text config.
+    """
+    rope_parameters = getattr(text_config, "rope_parameters", None)
+    if not rope_parameters:
+        return
+    if hasattr(rope_parameters, "to_dict"):
+        rope_parameters = rope_parameters.to_dict()
+    flattened = _Qwen35ConfigCompat._flatten_rope({
+        "rope_parameters":
+        dict(rope_parameters),
+        "rope_scaling":
+        dict(getattr(text_config, "rope_scaling", None) or {}),
+    })
+    for attr in ("rope_theta", "partial_rotary_factor", "rope_scaling"):
+        value = flattened.get(attr)
+        if value is not None:
+            setattr(text_config, attr, value)
+
+
+def _normalize_qwen35_qwen3next_text_aliases(text_config) -> None:
+    """Materialize Qwen3Next-style text aliases used by the shared runtime."""
+    if getattr(text_config, "intermediate_size", None) is None:
+        moe_intermediate_size = getattr(text_config, "moe_intermediate_size",
+                                        None)
+        num_experts_per_tok = getattr(text_config, "num_experts_per_tok", None)
+        shared_expert_intermediate_size = getattr(
+            text_config, "shared_expert_intermediate_size", 0) or 0
+        if (moe_intermediate_size is not None
+                and num_experts_per_tok is not None):
+            text_config.intermediate_size = (
+                num_experts_per_tok * moe_intermediate_size +
+                shared_expert_intermediate_size)
+
+
+def _normalize_qwen35_quantization_config(model_config) -> None:
+    quantization_config = getattr(model_config, "quantization_config", None)
+    if not isinstance(quantization_config, dict):
+        return
+    # Nothing to rewrite unless the HF exclusion list is present.
+    modules = quantization_config.get("modules_to_not_convert")
+    if modules is None:
+        return
+    # Normalize names; the stored list ends up sorted and de-duplicated.
+    text_config = getattr(model_config, "text_config", None)
+    normalized_modules = _Qwen35ConfigCompat._normalize_exclude_modules(modules)
+    if text_config is not None:
+        normalized_modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(
+            text_config.to_dict(), normalized_modules)
+    quantization_config["modules_to_not_convert"] = sorted(
+        set(normalized_modules))
+
+
+def _normalize_qwen35_moe_vl_config(model_config) -> None:
+    """Adapt HF Qwen3.5-MoE VLM config to TRT-LLM runtime conventions."""
+    if not getattr(model_config, "architectures", None):
+        model_config.architectures = ["Qwen3_5MoeForConditionalGeneration"]
+
+    text_config = getattr(model_config, "text_config", None)
+    if text_config is None:
+        raise ValueError("Qwen3.5-MoE VLM config is missing text_config")
+
+    text_config.architectures = ["Qwen3_5MoeForCausalLM"]
+    _normalize_qwen35_qwen3next_text_aliases(text_config)
+    _normalize_qwen35_mrope_config(text_config)
+    # Tolerate any HF call signature, e.g. get_text_config(encoder=True).
+    model_config.get_text_config = lambda *args, **kwargs: text_config
+    _normalize_qwen35_quantization_config(model_config)
+
+
 # TODO: remove this once the transformers can support all of those models in _CONFIG_REGISTRY
 class LazyConfigDict(dict):
 
@@ -427,7 +564,6 @@ def __getitem__(self, key):
     deepseek_v32="DeepseekV3Config",
     kimi_k2="DeepseekV3Config",
     glm_moe_dsa="DeepseekV3Config",
-    qwen3_5_moe="Qwen3_5MoeConfig",
 )  # NOTE: HF config.json uses deepseek_v32 as model_type but with same DSV3 config class
 
 
@@ -445,6 +581,13 @@ def load_pretrained_config(model_name_or_path: str,
             MistralConfigLoader
         model_config = MistralConfigLoader().load(
             model_name_or_path).pretrained_config
+    elif (model_type == "qwen3_5_moe" and
+          (("text_config" in config_dict and "vision_config" in config_dict) or
+           (architectures
+            and architectures[0] == "Qwen3_5MoeForConditionalGeneration"))):
+        model_config = transformers.Qwen3_5MoeConfig.from_pretrained(
+            model_name_or_path, **kwargs)
+        _normalize_qwen35_moe_vl_config(model_config)
     elif model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[model_type]
         model_config = config_class.from_pretrained(model_name_or_path,
diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py
index 14d813a99dfd..54c02754f12d 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_loader.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py
@@ -29,6 +29,7 @@
     MoeLoadBalancer, maybe_create_moe_load_balancer)
 from ..virtual_memory import RestoreMode
 from ..virtual_memory import scope as virtual_memory_scope
+from .config_utils import resolve_hf_torch_dtype, resolve_mamba_ssm_cache_dtype
 
 _KV_CACHE_MAP = {
     "fp8": QuantAlgo.FP8.value,
@@ -44,12 +45,10 @@ def validate_and_set_mamba_ssm_cache_dtype(
         mamba_ssm_stochastic_rounding: bool = False,
         mamba_ssm_philox_rounds: int = 10) -> None:
     if mamba_ssm_cache_dtype == "auto":
-        hf_dtype = getattr(config.pretrained_config, "mamba_ssm_cache_dtype",
-                           None)
-        if hf_dtype is not None:
-            mamba_ssm_cache_dtype = str_dtype_to_torch(hf_dtype)
-        else:
-            mamba_ssm_cache_dtype = config.pretrained_config.torch_dtype
+        mamba_ssm_cache_dtype = (
+            resolve_mamba_ssm_cache_dtype(config.pretrained_config)
+            or resolve_hf_torch_dtype(config.pretrained_config)
+            or config.torch_dtype)
     else:
         mamba_ssm_cache_dtype = str_dtype_to_torch(mamba_ssm_cache_dtype)
 
diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml
index 2ecb68d691b6..1bfb7c4869a8 100644
--- a/tests/integration/defs/accuracy/references/mmmu.yaml
+++ b/tests/integration/defs/accuracy/references/mmmu.yaml
@@ -60,6 +60,7 @@ Qwen/Qwen3-VL-8B-Instruct:
 mistralai/Mistral-Small-3.1-24B-Instruct-2503:
   - accuracy: 57.0
 Qwen/Qwen3.5-35B-A3B:
+  - accuracy: 59.0
   - dtype: bfloat16
     accuracy: 60.444
 # Kimi K2.5 multimodal (MoonViT + DeepSeek-V3 MoE backbone, ~1T params).
@@ -68,5 +69,3 @@ moonshotai/Kimi-K2.5:
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 81.56
-Qwen/Qwen3.5-35B-A3B:
-  - accuracy: 59.0
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
index 37c4b349f102..cbce9be563ac 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
@@ -426,10 +426,12 @@ def test_nvfp4_4gpus(
             task.evaluate(llm, sampling_params=self.sampling_params)
 
 
+@pytest.mark.skip_less_device_memory(80000)
 class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen3.5-35B-A3B"
     MODEL_PATH = f"{llm_models_root()}/Qwen3.5-35B-A3B"
     MAX_NUM_TOKENS = 16384
+    MAX_BATCH_SIZE = 32
 
     sampling_params = SamplingParams(
         max_tokens=MAX_NUM_TOKENS,
@@ -437,12 +439,13 @@ class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):
         stop="<|endoftext|>",
     )
 
-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False)
 
     def test_auto_dtype(self):
         with LLM(
             self.MODEL_PATH,
             max_num_tokens=self.MAX_NUM_TOKENS,
+            max_batch_size=self.MAX_BATCH_SIZE,
             kv_cache_config=self.kv_cache_config,
         ) as llm:
             task = MMMU(self.MODEL_NAME)
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
new file mode 100644
index 000000000000..49fc4cbe4902
--- /dev/null
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
@@ -0,0 +1,144 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+from pathlib import Path
+
+import torch
+import transformers
+
+from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models import Qwen3_5MoeVLModel
+from tensorrt_llm._torch.models.checkpoints.auto_mapper import AutoCheckpointMapper
+from tensorrt_llm._torch.models.checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper
+from tensorrt_llm._torch.models.modeling_auto import AutoModelForCausalLM
+from tensorrt_llm._torch.pyexecutor.config_utils import (
+    extract_mamba_kv_cache_params,
+    load_pretrained_config,
+)
+from tensorrt_llm._torch.pyexecutor.model_loader import validate_and_set_mamba_ssm_cache_dtype
+from tensorrt_llm.inputs import ContentFormat
+from tensorrt_llm.inputs.registry import MULTIMODAL_PLACEHOLDER_REGISTRY
+
+
+def _write_qwen35_moe_vl_config(tmp_path: Path) -> Path:
+    config = {
+        "architectures": ["Qwen3_5MoeForConditionalGeneration"],
+        "image_token_id": 248056,
+        "model_type": "qwen3_5_moe",
+        "text_config": {
+            "attention_bias": False,
+            "attention_dropout": 0.0,
+            "bos_token_id": 151643,
+            "dtype": "bfloat16",
+            "eos_token_id": 151645,
+            "full_attention_interval": 4,
+            "head_dim": 128,
+            "hidden_act": "silu",
+            "hidden_size": 2048,
+            "linear_conv_kernel_dim": 4,
+            "linear_key_head_dim": 128,
+            "linear_num_key_heads": 16,
+            "linear_num_value_heads": 32,
+            "linear_value_head_dim": 128,
+            "mamba_ssm_dtype": "float32",
+            "max_position_embeddings": 262144,
+            "mlp_only_layers": [],
+            "model_type": "qwen3_5_moe_text",
+            "moe_intermediate_size": 512,
+            "norm_topk_prob": True,
+            "num_attention_heads": 32,
+            "num_experts": 128,
+            "num_experts_per_tok": 8,
+            "num_hidden_layers": 2,
+            "num_key_value_heads": 4,
+            "rms_norm_eps": 1e-6,
+            "shared_expert_intermediate_size": 512,
+            "rope_parameters": {
+                "mrope_section": [11, 11, 10],
+                "partial_rotary_factor": 0.25,
+                "rope_theta": 1000000.0,
+                "rope_type": "default",
+            },
+            "use_cache": True,
+            "vocab_size": 151936,
+        },
+        "tie_word_embeddings": False,
+        "video_token_id": 248057,
+        "vision_config": {
+            "deepstack_visual_indexes": [8, 16, 24],
+            "depth": 27,
+            "hidden_act": "gelu_pytorch_tanh",
+            "hidden_size": 1152,
+            "in_channels": 3,
+            "intermediate_size": 4304,
+            "model_type": "qwen3_5_moe",
+            "num_heads": 16,
+            "num_position_embeddings": 2304,
+            "out_hidden_size": 2048,
+            "patch_size": 16,
+            "spatial_merge_size": 2,
+            "temporal_patch_size": 2,
+        },
+        "vision_end_token_id": 248054,
+        "vision_start_token_id": 248053,
+    }
+    (tmp_path / "config.json").write_text(json.dumps(config), encoding="utf-8")
+    return tmp_path
+
+
+def test_qwen35_moe_vl_config_preserves_vlm_architecture(
+    tmp_path: Path,
+) -> None:
+    config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path)))
+
+    assert isinstance(config, transformers.Qwen3_5MoeConfig)
+    assert config.architectures == ["Qwen3_5MoeForConditionalGeneration"]
+    assert config.text_config.architectures == ["Qwen3_5MoeForCausalLM"]
+    assert config.text_config.num_experts == 128
+    assert config.text_config.intermediate_size == 4608
+    assert config.text_config.rope_theta == 1000000.0
+    assert config.text_config.partial_rotary_factor == 0.25
+    assert config.text_config.rope_scaling["type"] == "mrope"
+    assert config.text_config.rope_scaling["mrope_section"] == [11, 11, 10]
+    assert config.text_config.mamba_ssm_dtype == "float32"
+    assert config.get_text_config() is config.text_config
+
+
+def test_qwen35_moe_vl_resolves_mamba_ssm_cache_dtype(
+    tmp_path: Path,
+) -> None:
+    config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path)))
+    model_config = ModelConfig(pretrained_config=config)
+
+    validate_and_set_mamba_ssm_cache_dtype(model_config, "auto")
+    assert model_config.quant_config.mamba_ssm_cache_dtype is torch.float32
+
+    mamba_params = extract_mamba_kv_cache_params(
+        config.text_config,
+        quant_config=model_config.quant_config,
+    )
+    assert mamba_params.dtype is torch.bfloat16
+    assert mamba_params.mamba_ssm_cache_dtype is torch.float32
+
+
+def test_qwen35_moe_vl_resolves_model_and_mapper(tmp_path: Path) -> None:
+    config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path)))
+    model_config = ModelConfig(pretrained_config=config)
+
+    assert AutoModelForCausalLM._resolve_class(model_config) is Qwen3_5MoeVLModel
+    assert isinstance(
+        AutoCheckpointMapper.get("HF", "Qwen3_5MoeForConditionalGeneration"),
+        Qwen3_5MoeHfWeightMapper,
+    )
+
+
+def test_qwen35_moe_vl_placeholder_metadata_registered() -> None:
+    metadata = MULTIMODAL_PLACEHOLDER_REGISTRY.get_placeholder_metadata("qwen3_5_moe")
+
+    assert metadata.placeholder_map == {
+        "image": "<|vision_start|><|image_pad|><|vision_end|>",
+        "video": "<|vision_start|><|video_pad|><|vision_end|>",
+    }
+    assert metadata.placeholders_separator == ""
+    assert metadata.content_format is ContentFormat.STRING

From 1e9a86617e779b985ae902fc3b008257a6c098aa Mon Sep 17 00:00:00 2001
From: Michal Guzek <mguzek@nvidia.com>
Date: Fri, 15 May 2026 10:58:11 -0700
Subject: [PATCH 3/9] Address review comments

Signed-off-by: Michal Guzek <mguzek@nvidia.com>
---
 .../_torch/pyexecutor/config_utils.py         | 31 ++++++++++---------
 .../test_llm_api_pytorch_multimodal.py        |  2 +-
 .../test_lists/qa/llm_function_core.txt       |  1 +
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py
index 761efebbf7cc..d58a7db52f43 100644
--- a/tensorrt_llm/_torch/pyexecutor/config_utils.py
+++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py
@@ -27,7 +27,7 @@ def _coerce_torch_dtype(dtype):
 
     HF configs may store dtype fields as torch dtypes, strings, or the sentinel
     value "auto". Returning None for "auto" lets the caller keep its normal
-    fallback path instead of treating "auto" as a concrete cache dtype.
+    fallback path instead of treating "auto" as a concrete dtype.
     """
     if isinstance(dtype, torch.dtype):
         return dtype
@@ -43,12 +43,14 @@ def resolve_hf_torch_dtype(config):
 
     Transformers has used both dtype and torch_dtype across versions and model
     families. This helper checks both names and coerces whichever one is present
-    into the form expected by TRT-LLM runtime code.
+    into the form expected by TRT-LLM runtime code. An "auto" value in any
+    field is treated the same as missing, so scanning continues to the next
+    field instead of stopping with None.
     """
     for attr in ("dtype", "torch_dtype"):
-        dtype = getattr(config, attr, None)
-        if dtype is not None:
-            return _coerce_torch_dtype(dtype)
+        coerced = _coerce_torch_dtype(getattr(config, attr, None))
+        if coerced is not None:
+            return coerced
     return None
 
 
@@ -58,7 +60,8 @@ def resolve_mamba_ssm_cache_dtype(config):
     Qwen3.5-style configs may store this field on the top-level config or the
     nested text_config, and may call it either mamba_ssm_cache_dtype or
     mamba_ssm_dtype. This helper centralizes that lookup so cache creation does
-    not fail later with a missing dtype.
+    not fail later with a missing dtype. An "auto" value in any field is
+    treated the same as missing.
     """
     configs = [config]
     text_config = getattr(config, "text_config", None)
@@ -67,9 +70,9 @@ def resolve_mamba_ssm_cache_dtype(config):
 
     for candidate_config in configs:
         for attr in ("mamba_ssm_cache_dtype", "mamba_ssm_dtype"):
-            dtype = getattr(candidate_config, attr, None)
-            if dtype is not None:
-                return _coerce_torch_dtype(dtype)
+            coerced = _coerce_torch_dtype(getattr(candidate_config, attr, None))
+            if coerced is not None:
+                return coerced
     return None
 
 
@@ -301,11 +304,11 @@ def extract_mamba_kv_cache_params(
             full_attn_mask.extend([True] * num_spec_layers)
             mamba_mask.extend([False] * num_spec_layers)
 
-    mamba_ssm_cache_dtype = (quant_config.mamba_ssm_cache_dtype
-                             if quant_config is not None else None)
-    if mamba_ssm_cache_dtype is not None:
-        mamba_ssm_cache_dtype = _coerce_torch_dtype(mamba_ssm_cache_dtype)
-    else:
+    mamba_ssm_cache_dtype = None
+    if quant_config is not None:
+        mamba_ssm_cache_dtype = _coerce_torch_dtype(
+            quant_config.mamba_ssm_cache_dtype)
+    if mamba_ssm_cache_dtype is None:
         mamba_ssm_cache_dtype = (resolve_mamba_ssm_cache_dtype(config)
                                  or resolve_hf_torch_dtype(config)
                                  or torch.bfloat16)
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
index cbce9be563ac..2a715fc33124 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
@@ -441,7 +441,7 @@ class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):
 
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False)
 
-    def test_auto_dtype(self):
+    def test_auto_dtype(self) -> None:
         with LLM(
             self.MODEL_PATH,
             max_num_tokens=self.MAX_NUM_TOKENS,
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
index 6ac7f64897b9..e52c36078273 100644
--- a/tests/integration/test_lists/qa/llm_function_core.txt
+++ b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -801,6 +801,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL::test_auto_dtype[forced_chunked_prefill]
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestKimiK25::test_nvfp4[dep8]
+accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray
 unittest/disaggregated/test_openai_disagg_server.py

From 05c59b2c554de4729b16a19ebf786773edc92f86 Mon Sep 17 00:00:00 2001
From: Michal Guzek <mguzek@nvidia.com>
Date: Mon, 18 May 2026 11:16:38 -0700
Subject: [PATCH 4/9] Address review comments

Signed-off-by: Michal Guzek <mguzek@nvidia.com>
---
 .../_torch/models/modeling_qwen3_5.py         | 244 ++++++++++++++++++
 .../_torch/pyexecutor/config_utils.py         | 235 +----------------
 .../defs/accuracy/references/mmmu.yaml        |   1 +
 3 files changed, 251 insertions(+), 229 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
index e815c94bd063..a1d8aaa69a08 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
@@ -1,4 +1,5 @@
 import re
+from types import SimpleNamespace
 from typing import Dict, List
 
 import torch
@@ -11,6 +12,7 @@
     register_input_processor,
     support_multimodal_disaggregated,
 )
+from ..pyexecutor.config_utils import get_qwen3_hybrid_layer_types
 from .checkpoints.base_weight_mapper import BaseWeightMapper
 from .checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper
 from .modeling_multimodal_utils import _is_disagg
@@ -71,6 +73,248 @@ def _translate_mtp_pattern(name, n_hidden_layers):
     return None
 
 
+# --- Config adapters --------------------------------------------------------
+#
+# These run from ``load_pretrained_config`` in
+# ``tensorrt_llm/_torch/pyexecutor/config_utils.py`` via lazy import — the
+# runtime layer asks the model module how to load its own config.
+#
+# There are two entry points:
+#   - ``_Qwen35ConfigCompat.normalize(config_dict)`` — for text-only
+#     Qwen3.5 (MoE and dense). Returns a dict that
+#     ``transformers.Qwen3NextConfig.from_dict(...)`` can consume, so the
+#     existing Qwen3Next runtime is reused unchanged.
+#   - ``_normalize_qwen35_moe_vl_config(model_config)`` — for the
+#     Qwen3.5-MoE VLM. Mutates the HF-native ``transformers.Qwen3_5MoeConfig``
+#     in place, attaching the runtime aliases the Qwen3Next-based LM expects
+#     while keeping ``text_config`` / ``vision_config`` composite.
+
+
+class _Qwen35ConfigCompat:
+    """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig.
+
+    We normalize to ``Qwen3NextConfig`` (rather than to a Qwen3.5-native
+    schema) so the runtime can reuse the existing ``Qwen3NextForCausalLM``
+    model implementation unchanged — Qwen3.5 text is structurally identical
+    to Qwen3Next, so matching the config schema lets the same code serve
+    both.
+
+    This is used for Qwen3.5 text-only configs and for shared helper logic such
+    as RoPE and quantization exclude-module normalization. Qwen3.5-MoE VLM
+    configs should stay composite and use transformers.Qwen3_5MoeConfig plus
+    _normalize_qwen35_moe_vl_config instead.
+
+    To remove: delete this class and the elif branch in
+    load_pretrained_config that flattens Qwen3.5 text configs.
+    """
+
+    @staticmethod
+    def normalize(config_dict: dict) -> dict:
+        """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict."""
+        text_config = _Qwen35ConfigCompat._extract_text_config(config_dict)
+        text_config = _Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config)
+        text_config = _Qwen35ConfigCompat._flatten_rope(text_config)
+
+        # Dense vs MoE: a missing, null, or zero num_experts all mean "dense".
+        is_moe = (text_config.get("num_experts") or 0) > 0
+        if is_moe:
+            text_config["architectures"] = ["Qwen3_5MoeForCausalLM"]
+        else:
+            text_config["architectures"] = ["Qwen3_5ForCausalLM"]
+            # Force num_experts to 0 (it may be null) and default the other MoE
+            # fields to 0 so Qwen3NextConfig defaults don't enable MoE for dense.
+            text_config["num_experts"] = 0
+            text_config.setdefault("num_experts_per_tok", 0)
+            text_config.setdefault("moe_intermediate_size", 0)
+            text_config.setdefault("shared_expert_intermediate_size", 0)
+        return text_config
+
+    _VLM_ARCHITECTURES = {
+        "Qwen3_5MoeForConditionalGeneration",
+        "Qwen3_5ForConditionalGeneration",
+    }
+
+    @staticmethod
+    def _extract_text_config(config_dict: dict) -> dict:
+        """Pull nested text_config from VLM checkpoints, or use dict as-is."""
+        architectures = config_dict.get("architectures") or []
+        if architectures and architectures[0] in _Qwen35ConfigCompat._VLM_ARCHITECTURES:
+            text_config = dict(config_dict.get("text_config") or {})
+        else:
+            text_config = dict(config_dict)
+        if not text_config:
+            raise ValueError("Qwen3.5 config is missing a usable text_config")
+        return text_config
+
+    @staticmethod
+    def _inherit_quantization_config(config_dict: dict, text_config: dict) -> dict:
+        """Copy top-level quantization_config into text_config with name normalization.
+
+        Also adds a temporary workaround that keeps packed linear-attention
+        in_proj_qkvz on the bf16 path until FP8 block-scale TP loading is
+        fixed for that layout.
+        """
+        if "quantization_config" in text_config:
+            return text_config
+        if "quantization_config" not in config_dict:
+            return text_config
+
+        quantization_config = dict(config_dict["quantization_config"])
+        excluded = quantization_config.get("modules_to_not_convert")
+        # A null/None exclude list has nothing to rename (and is not iterable).
+        if excluded is not None:
+            modules = _Qwen35ConfigCompat._normalize_exclude_modules(excluded)
+            modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(text_config, modules)
+            quantization_config["modules_to_not_convert"] = sorted(set(modules))
+        text_config["quantization_config"] = quantization_config
+        return text_config
+
+    @staticmethod
+    def _normalize_exclude_modules(modules: list[str]) -> list[str]:
+        """Translate HF quantization exclude-module paths to TRT-LLM names.
+
+        - Strip model.language_model. prefix -> model.
+        - Drop model.visual.* and mtp.* entries
+        - Map split projection names to packed TRT-LLM names
+        """
+        normalized = set()
+        for name in modules:
+            if name.startswith("model.language_model."):
+                name = "model." + name[len("model.language_model.") :]
+            if name.startswith("model.visual.") or name.startswith("mtp."):
+                continue
+            name = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", name)
+            name = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", name)
+            normalized.add(name)
+        return sorted(normalized)
+
+    @staticmethod
+    def _add_qkvz_bf16_workaround(text_config: dict, modules: list[str]) -> list[str]:
+        """Keep packed linear-attention qkvz on bf16 path for all linear-attention layers.
+
+        Temporary until FP8 block-scale TP loading is fixed for this layout.
+        """
+        try:
+            layer_types = get_qwen3_hybrid_layer_types(SimpleNamespace(**text_config))
+        except (ValueError, AttributeError):
+            return modules
+        # Build a new list rather than appending in place, so the caller's
+        # list is never mutated behind its back.
+        linear_idx = (i for i, kind in enumerate(layer_types) if kind == "linear_attention")
+        return [*modules, *(f"model.layers.{i}.linear_attn.in_proj_qkvz" for i in linear_idx)]
+
+    @staticmethod
+    def _flatten_rope(text_config: dict) -> dict:
+        """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling.
+
+        Qwen3.5 nests these inside a rope_parameters dict and uses rope_type
+        instead of type in rope_scaling.  Qwen3NextConfig expects them as
+        top-level fields with rope_scaling.type.
+        """
+        rope_parameters = dict(text_config.pop("rope_parameters", {}) or {})
+        rope_scaling = dict(text_config.get("rope_scaling") or {})
+        if rope_parameters:
+            rope_theta = rope_parameters.pop("rope_theta", None)
+            if rope_theta is not None:
+                text_config.setdefault("rope_theta", rope_theta)
+            partial_rotary_factor = rope_parameters.pop("partial_rotary_factor", None)
+            if partial_rotary_factor is not None:
+                text_config.setdefault("partial_rotary_factor", partial_rotary_factor)
+            if rope_parameters:
+                rope_scaling = rope_parameters | rope_scaling
+        if rope_scaling:
+            has_mrope = "mrope_section" in rope_scaling or rope_scaling.get(
+                "mrope_interleaved", False
+            )
+            if has_mrope:
+                rope_scaling["type"] = "mrope"
+                rope_scaling.pop("rope_type", None)
+            elif "type" not in rope_scaling and "rope_type" in rope_scaling:
+                rope_type = rope_scaling.pop("rope_type")
+                # "default" means standard RoPE (no scaling) — don't set
+                # rope_scaling to avoid triggering scaling code paths.
+                if rope_type == "default":
+                    rope_scaling = {}
+                else:
+                    rope_scaling["type"] = rope_type
+            if rope_scaling:
+                text_config["rope_scaling"] = rope_scaling
+        return text_config
+
+
+def _normalize_qwen35_mrope_config(text_config) -> None:
+    """Expose nested Qwen3.5 RoPE settings as flat attributes on ``text_config``.
+
+    Reads ``text_config.rope_parameters`` (a mapping, or an object offering
+    ``to_dict()``), runs it through ``_Qwen35ConfigCompat._flatten_rope`` and
+    copies any resulting ``rope_theta``, ``partial_rotary_factor`` and
+    ``rope_scaling`` back onto ``text_config``. No-op when it is unset/empty.
+    """
+    raw_rope = getattr(text_config, "rope_parameters", None)
+    if not raw_rope:
+        return
+    if hasattr(raw_rope, "to_dict"):
+        raw_rope = raw_rope.to_dict()
+    rope_dict = dict(raw_rope)
+    scaling_dict = dict(getattr(text_config, "rope_scaling", None) or {})
+    flattened = _Qwen35ConfigCompat._flatten_rope(
+        {"rope_parameters": rope_dict, "rope_scaling": scaling_dict}
+    )
+    # Only overwrite attributes the flattening step actually produced.
+    for attr in ("rope_theta", "partial_rotary_factor", "rope_scaling"):
+        if flattened.get(attr) is not None:
+            setattr(text_config, attr, flattened[attr])
+
+
+def _normalize_qwen35_qwen3next_text_aliases(text_config) -> None:
+    """Derive ``intermediate_size`` from the MoE sizes when it is not already set."""
+    if getattr(text_config, "intermediate_size", None) is not None:
+        return
+    per_expert = getattr(text_config, "moe_intermediate_size", None)
+    top_k = getattr(text_config, "num_experts_per_tok", None)
+    # A missing or falsy shared-expert size contributes nothing to the total.
+    shared = getattr(text_config, "shared_expert_intermediate_size", 0) or 0
+    if per_expert is None or top_k is None:
+        # Not enough information to derive a value; leave the config untouched.
+        return
+    text_config.intermediate_size = top_k * per_expert + shared
+
+
+def _normalize_qwen35_quantization_config(model_config) -> None:
+    """Rewrite ``modules_to_not_convert`` in place using TRT-LLM module names.
+
+    Does nothing unless ``model_config.quantization_config`` is a dict that
+    carries a non-None ``modules_to_not_convert`` entry.
+    """
+    quant_cfg = getattr(model_config, "quantization_config", None)
+    excluded = quant_cfg.get("modules_to_not_convert") if isinstance(quant_cfg, dict) else None
+    if excluded is None:
+        return
+    language_cfg = getattr(model_config, "text_config", None)
+    names = _Qwen35ConfigCompat._normalize_exclude_modules(excluded)
+    if language_cfg is not None:
+        as_dict = language_cfg.to_dict()
+        names = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(as_dict, names)
+    quant_cfg["modules_to_not_convert"] = sorted(set(names))
+
+
+def _normalize_qwen35_moe_vl_config(model_config) -> None:
+    """Adapt an HF Qwen3.5-MoE VLM config in place; raise ValueError without text_config."""
+    if not getattr(model_config, "architectures", None):
+        model_config.architectures = ["Qwen3_5MoeForConditionalGeneration"]
+    text_config = getattr(model_config, "text_config", None)
+    if text_config is None:
+        raise ValueError("Qwen3.5-MoE VLM config is missing text_config")
+
+    text_config.architectures = ["Qwen3_5MoeForCausalLM"]
+    _normalize_qwen35_qwen3next_text_aliases(text_config)
+    _normalize_qwen35_mrope_config(text_config)
+    # Always hand back this text_config. Accept arbitrary arguments so callers
+    # using other get_text_config keywords (e.g. ``encoder=``) do not raise.
+    model_config.get_text_config = lambda *_args, **_kwargs: text_config
+    _normalize_qwen35_quantization_config(model_config)
+
+
 def _normalize_qwen35_exclude_modules(model_config):
     """Normalize NVFP4/FP8 exclude_modules from HF naming to TRT-LLM naming.
 
diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py
index d58a7db52f43..978796f26d46 100644
--- a/tensorrt_llm/_torch/pyexecutor/config_utils.py
+++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py
@@ -1,6 +1,4 @@
 import dataclasses
-import re
-from types import SimpleNamespace
 from typing import List, Optional
 
 import torch
@@ -328,233 +326,6 @@ def extract_mamba_kv_cache_params(
     )
 
 
-class _Qwen35ConfigCompat:
-    """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig.
-
-    This is used for Qwen3.5 text-only configs and for shared helper logic such
-    as RoPE and quantization exclude-module normalization. Qwen3.5-MoE VLM
-    configs should stay composite and use transformers.Qwen3_5MoeConfig plus
-    _normalize_qwen35_moe_vl_config instead.
-
-    To remove: delete this class and the elif branch in
-    load_pretrained_config that flattens Qwen3.5 text configs.
-    """
-
-    @staticmethod
-    def normalize(config_dict: dict) -> dict:
-        """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict."""
-        text_config = _Qwen35ConfigCompat._extract_text_config(config_dict)
-        text_config = _Qwen35ConfigCompat._inherit_quantization_config(
-            config_dict, text_config)
-        text_config = _Qwen35ConfigCompat._flatten_rope(text_config)
-
-        # Detect dense vs MoE and set architecture + MoE defaults accordingly
-        is_moe = "num_experts" in text_config and text_config["num_experts"] > 0
-        if is_moe:
-            text_config["architectures"] = ["Qwen3_5MoeForCausalLM"]
-        else:
-            text_config["architectures"] = ["Qwen3_5ForCausalLM"]
-            # Ensure MoE fields are zeroed so Qwen3NextConfig defaults don't
-            # accidentally enable MoE for the dense model.
-            text_config.setdefault("num_experts", 0)
-            text_config.setdefault("num_experts_per_tok", 0)
-            text_config.setdefault("moe_intermediate_size", 0)
-            text_config.setdefault("shared_expert_intermediate_size", 0)
-        return text_config
-
-    _VLM_ARCHITECTURES = {
-        "Qwen3_5MoeForConditionalGeneration",
-        "Qwen3_5ForConditionalGeneration",
-    }
-
-    @staticmethod
-    def _extract_text_config(config_dict: dict) -> dict:
-        """Pull nested text_config from VLM checkpoints, or use dict as-is."""
-        architectures = config_dict.get("architectures") or []
-        if architectures and architectures[
-                0] in _Qwen35ConfigCompat._VLM_ARCHITECTURES:
-            text_config = dict(config_dict.get("text_config") or {})
-        else:
-            text_config = dict(config_dict)
-        if not text_config:
-            raise ValueError("Qwen3.5 config is missing a usable text_config")
-        return text_config
-
-    @staticmethod
-    def _inherit_quantization_config(config_dict: dict,
-                                     text_config: dict) -> dict:
-        """Copy top-level quantization_config into text_config with name normalization.
-
-        Also adds a temporary workaround that keeps packed linear-attention
-        in_proj_qkvz on the bf16 path until FP8 block-scale TP loading is
-        fixed for that layout.
-        """
-        if "quantization_config" in text_config:
-            return text_config
-        if "quantization_config" not in config_dict:
-            return text_config
-
-        quantization_config = dict(config_dict["quantization_config"])
-        if "modules_to_not_convert" in quantization_config:
-            modules = _Qwen35ConfigCompat._normalize_exclude_modules(
-                quantization_config["modules_to_not_convert"])
-            modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(
-                text_config, modules)
-            quantization_config["modules_to_not_convert"] = sorted(set(modules))
-        text_config["quantization_config"] = quantization_config
-        return text_config
-
-    @staticmethod
-    def _normalize_exclude_modules(modules: list[str]) -> list[str]:
-        """Translate HF quantization exclude-module paths to TRT-LLM names.
-
-        - Strip model.language_model. prefix -> model.
-        - Drop model.visual.* and mtp.* entries
-        - Map split projection names to packed TRT-LLM names
-        """
-        normalized = set()
-        for name in modules:
-            if name.startswith("model.language_model."):
-                name = "model." + name[len("model.language_model."):]
-            if name.startswith("model.visual.") or name.startswith("mtp."):
-                continue
-            name = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", name)
-            name = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", name)
-            normalized.add(name)
-        return sorted(normalized)
-
-    @staticmethod
-    def _add_qkvz_bf16_workaround(text_config: dict,
-                                  modules: list[str]) -> list[str]:
-        """Keep packed linear-attention qkvz on bf16 path for all linear-attention layers.
-
-        Temporary until FP8 block-scale TP loading is fixed for this layout.
-        """
-        try:
-            layer_types = get_qwen3_hybrid_layer_types(
-                SimpleNamespace(**text_config))
-        except (ValueError, AttributeError):
-            return modules
-        for layer_idx, layer_type in enumerate(layer_types):
-            if layer_type == "linear_attention":
-                modules.append(
-                    f"model.layers.{layer_idx}.linear_attn.in_proj_qkvz")
-        return modules
-
-    @staticmethod
-    def _flatten_rope(text_config: dict) -> dict:
-        """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling.
-
-        Qwen3.5 nests these inside a rope_parameters dict and uses rope_type
-        instead of type in rope_scaling.  Qwen3NextConfig expects them as
-        top-level fields with rope_scaling.type.
-        """
-        rope_parameters = dict(text_config.pop("rope_parameters", {}) or {})
-        rope_scaling = dict(text_config.get("rope_scaling") or {})
-        if rope_parameters:
-            rope_theta = rope_parameters.pop("rope_theta", None)
-            if rope_theta is not None:
-                text_config.setdefault("rope_theta", rope_theta)
-            partial_rotary_factor = rope_parameters.pop("partial_rotary_factor",
-                                                        None)
-            if partial_rotary_factor is not None:
-                text_config.setdefault("partial_rotary_factor",
-                                       partial_rotary_factor)
-            if rope_parameters:
-                rope_scaling = rope_parameters | rope_scaling
-        if rope_scaling:
-            has_mrope = ("mrope_section" in rope_scaling
-                         or rope_scaling.get("mrope_interleaved", False))
-            if has_mrope:
-                rope_scaling["type"] = "mrope"
-                rope_scaling.pop("rope_type", None)
-            elif "type" not in rope_scaling and "rope_type" in rope_scaling:
-                rope_type = rope_scaling.pop("rope_type")
-                # "default" means standard RoPE (no scaling) — don't set
-                # rope_scaling to avoid triggering scaling code paths.
-                if rope_type == "default":
-                    rope_scaling = {}
-                else:
-                    rope_scaling["type"] = rope_type
-            if rope_scaling:
-                text_config["rope_scaling"] = rope_scaling
-        return text_config
-
-
-def _normalize_qwen35_mrope_config(text_config) -> None:
-    """Materialize Qwen3.5 mRoPE aliases needed by the Qwen3-VL path.
-
-    HF stores RoPE metadata under ``rope_parameters``; the shared Qwen3-VL
-    wrapper reads ``rope_theta``, ``partial_rotary_factor``, and
-    ``rope_scaling`` directly on the text config.
-    """
-    rope_parameters = getattr(text_config, "rope_parameters", None)
-    if not rope_parameters:
-        return
-    if hasattr(rope_parameters, "to_dict"):
-        rope_parameters = rope_parameters.to_dict()
-    flattened = _Qwen35ConfigCompat._flatten_rope({
-        "rope_parameters":
-        dict(rope_parameters),
-        "rope_scaling":
-        dict(getattr(text_config, "rope_scaling", None) or {}),
-    })
-    for attr in ("rope_theta", "partial_rotary_factor", "rope_scaling"):
-        value = flattened.get(attr)
-        if value is not None:
-            setattr(text_config, attr, value)
-
-
-def _normalize_qwen35_qwen3next_text_aliases(text_config) -> None:
-    """Materialize Qwen3Next-style text aliases used by the shared runtime."""
-    if getattr(text_config, "intermediate_size", None) is None:
-        moe_intermediate_size = getattr(text_config, "moe_intermediate_size",
-                                        None)
-        num_experts_per_tok = getattr(text_config, "num_experts_per_tok", None)
-        shared_expert_intermediate_size = getattr(
-            text_config, "shared_expert_intermediate_size", 0) or 0
-        if (moe_intermediate_size is not None
-                and num_experts_per_tok is not None):
-            text_config.intermediate_size = (
-                num_experts_per_tok * moe_intermediate_size +
-                shared_expert_intermediate_size)
-
-
-def _normalize_qwen35_quantization_config(model_config) -> None:
-    quantization_config = getattr(model_config, "quantization_config", None)
-    if not isinstance(quantization_config, dict):
-        return
-
-    modules = quantization_config.get("modules_to_not_convert")
-    if modules is None:
-        return
-
-    text_config = getattr(model_config, "text_config", None)
-    normalized_modules = _Qwen35ConfigCompat._normalize_exclude_modules(modules)
-    if text_config is not None:
-        normalized_modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(
-            text_config.to_dict(), normalized_modules)
-    quantization_config["modules_to_not_convert"] = sorted(
-        set(normalized_modules))
-
-
-def _normalize_qwen35_moe_vl_config(model_config) -> None:
-    """Adapt HF Qwen3.5-MoE VLM config to TRT-LLM runtime conventions."""
-    if not getattr(model_config, "architectures", None):
-        model_config.architectures = ["Qwen3_5MoeForConditionalGeneration"]
-
-    text_config = getattr(model_config, "text_config", None)
-    if text_config is None:
-        raise ValueError("Qwen3.5-MoE VLM config is missing text_config")
-
-    text_config.architectures = ["Qwen3_5MoeForCausalLM"]
-    _normalize_qwen35_qwen3next_text_aliases(text_config)
-    _normalize_qwen35_mrope_config(text_config)
-
-    model_config.get_text_config = lambda decoder=False: text_config
-    _normalize_qwen35_quantization_config(model_config)
-
-
 # TODO: remove this once the transformers can support all of those models in _CONFIG_REGISTRY
 class LazyConfigDict(dict):
 
@@ -588,6 +359,9 @@ def load_pretrained_config(model_name_or_path: str,
           (("text_config" in config_dict and "vision_config" in config_dict) or
            (architectures
             and architectures[0] == "Qwen3_5MoeForConditionalGeneration"))):
+        # Qwen3.5-MoE VLM: HF native composite config + model-side normalizer.
+        from tensorrt_llm._torch.models.modeling_qwen3_5 import \
+            _normalize_qwen35_moe_vl_config
         model_config = transformers.Qwen3_5MoeConfig.from_pretrained(
             model_name_or_path, **kwargs)
         _normalize_qwen35_moe_vl_config(model_config)
@@ -603,6 +377,9 @@ def load_pretrained_config(model_name_or_path: str,
                                 "Qwen3_5ForCausalLM",
                                 "Qwen3_5ForConditionalGeneration",
                             )):
+        # Qwen3.5 text-only: flatten to Qwen3NextConfig via the model-side shim.
+        from tensorrt_llm._torch.models.modeling_qwen3_5 import \
+            _Qwen35ConfigCompat
         model_config = transformers.Qwen3NextConfig.from_dict(
             _Qwen35ConfigCompat.normalize(config_dict))
     elif (model_type == "exaone4" and config_dict.get("sliding_window") is None
diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml
index 1bfb7c4869a8..15d77e50e2de 100644
--- a/tests/integration/defs/accuracy/references/mmmu.yaml
+++ b/tests/integration/defs/accuracy/references/mmmu.yaml
@@ -60,6 +60,7 @@ Qwen/Qwen3-VL-8B-Instruct:
 mistralai/Mistral-Small-3.1-24B-Instruct-2503:
   - accuracy: 57.0
 Qwen/Qwen3.5-35B-A3B:
+  # The default accuracy for `test_auto_dtype` tests.
   - accuracy: 59.0
   - dtype: bfloat16
     accuracy: 60.444

From 8cf91a626d13f6dea10114daf369a37e8e0e7e28 Mon Sep 17 00:00:00 2001
From: Michal Guzek <mguzek@nvidia.com>
Date: Mon, 18 May 2026 11:17:47 -0700
Subject: [PATCH 5/9] Add tests

Signed-off-by: Michal Guzek <mguzek@nvidia.com>
---
 .../modeling/test_modeling_multimodal.py      | 133 +++++++-
 .../modeling/test_modeling_qwen3_5_vl_moe.py  | 286 ++++++++++++++++++
 2 files changed, 412 insertions(+), 7 deletions(-)

diff --git a/tests/unittest/_torch/modeling/test_modeling_multimodal.py b/tests/unittest/_torch/modeling/test_modeling_multimodal.py
index 53fe5e044fc6..ab7166b68bf3 100644
--- a/tests/unittest/_torch/modeling/test_modeling_multimodal.py
+++ b/tests/unittest/_torch/modeling/test_modeling_multimodal.py
@@ -18,6 +18,12 @@
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_multimodal_utils import bypass_processor_output_validation
+from tensorrt_llm._torch.pyexecutor.config_utils import (
+    extract_mamba_kv_cache_params,
+    is_nemotron_hybrid,
+    is_qwen3_hybrid,
+)
+from tensorrt_llm._torch.pyexecutor.mamba_cache_manager import CppMambaHybridCacheManager
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm._utils import str_dtype_to_torch
 from tensorrt_llm.bindings.executor import KvCacheConfig
@@ -28,6 +34,7 @@
     prompt_inputs,
 )
 from tensorrt_llm.inputs.multimodal import MultimodalParams, MultimodalRuntimeData
+from tensorrt_llm.llmapi.llm_args import KvCacheConfig as PyKvCacheConfig
 from tensorrt_llm.mapping import Mapping
 
 
@@ -518,6 +525,13 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario):
         Note:
             This method uses get_kv_cache_config() to obtain configuration.
             Override get_kv_cache_config() to customize cache settings.
+
+            For hybrid linear-attention models (Qwen3Next, Qwen3.5,
+            Nemotron-Hybrid) this dispatches to
+            `get_hybrid_kv_cache_manager` so the linear-attention layers
+            get a `CppMambaHybridCacheManager` for SSM/conv state.
+            Mirrors the production dispatch in
+            `_util.py:_create_kv_cache_manager`.
         """
         # Get cache configuration from the configurable method
         cache_config = self.get_kv_cache_config(scenario)
@@ -527,17 +541,114 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario):
 
         num_blocks = (max_seq_len + tokens_per_block - 1) // tokens_per_block
 
-        self.kv_cache_manager = self.get_kv_cache_manager(
-            dtype=self.model_config.pretrained_config.torch_dtype,
-            config=self.model_config.pretrained_config,
+        config = self.model_config.pretrained_config
+        text_config = getattr(config, "text_config", config)
+
+        if is_qwen3_hybrid(text_config) or is_nemotron_hybrid(text_config):
+            self.kv_cache_manager = self.get_hybrid_kv_cache_manager(
+                text_config=text_config,
+                tokens_per_block=tokens_per_block,
+                max_seq_len=max_seq_len,
+                batch_size=batch_size,
+                num_blocks=num_blocks,
+            )
+        else:
+            self.kv_cache_manager = self.get_kv_cache_manager(
+                dtype=self.model_config.pretrained_config.torch_dtype,
+                config=self.model_config.pretrained_config,
+                tokens_per_block=tokens_per_block,
+                max_seq_len=max_seq_len,
+                batch_size=batch_size,
+                num_blocks=num_blocks,
+            )
+
+        self.kv_cache_manager.add_dummy_requests(
+            request_ids=[1],
+            token_nums=[max_seq_len],
+            **self._dummy_request_kwargs(scenario),
+        )
+
+    def _dummy_request_kwargs(self, scenario: MultimodalScenario) -> Dict:
+        """Optional override hook for extra kwargs to `add_dummy_requests`.
+
+        Subclasses for mRoPE-using models (Qwen2.5-VL, Qwen3-VL, Qwen3.5-VL,
+        …) should return `{"use_mrope": True}` here so the cache manager
+        allocates the mRoPE position-id buffer at dummy-request time.
+        Defaults to an empty dict, preserving existing behavior for tests
+        that don't care.
+        """
+        return {}
+
+    def get_hybrid_kv_cache_manager(
+        self,
+        text_config: PretrainedConfig,
+        tokens_per_block: int,
+        max_seq_len: int,
+        batch_size: int,
+        num_blocks: int,
+    ):
+        """Build a `CppMambaHybridCacheManager` for hybrid linear-attention
+        models (Qwen3Next, Qwen3.5, Nemotron-Hybrid).
+
+        Mirrors the production construction in
+        `_util.py:_create_kv_cache_manager` for `is_qwen3_hybrid` /
+        `is_nemotron_hybrid` configs: pulls the state-shape / dtype /
+        layer-mask parameters from `extract_mamba_kv_cache_params` and
+        threads them through the constructor. Tests that need a different
+        concrete manager (e.g. `MixedMambaHybridCacheManager` for
+        disagg-style coverage) should override this method.
+        """
+        dtype_map = {
+            torch.half: tensorrt_llm.bindings.DataType.HALF,
+            torch.float16: tensorrt_llm.bindings.DataType.HALF,
+            torch.bfloat16: tensorrt_llm.bindings.DataType.BF16,
+        }
+
+        mamba_params = extract_mamba_kv_cache_params(text_config)
+        if mamba_params.dtype not in dtype_map:
+            raise ValueError(
+                f"Unsupported dtype for hybrid cache manager: "
+                f"{mamba_params.dtype}. Supported: {list(dtype_map.keys())}"
+            )
+        kv_cache_dtype = dtype_map[mamba_params.dtype]
+
+        head_dim = getattr(text_config, "head_dim", None)
+        if not isinstance(head_dim, int):
+            head_dim = text_config.hidden_size // text_config.num_attention_heads
+
+        # CppMambaHybridCacheManager reads Pydantic-only fields
+        # (mamba_state_cache_interval, enable_block_reuse) so we have to
+        # construct the llmapi.llm_args.KvCacheConfig here, not the C++
+        # bindings KvCacheConfig that the standard KVCacheManager path uses.
+        kv_cache_config = PyKvCacheConfig(max_tokens=num_blocks * tokens_per_block)
+        mapping = Mapping(world_size=1, tp_size=1, rank=0)
+
+        return CppMambaHybridCacheManager(
+            # mamba cache parameters (positional)
+            mamba_params.state_size,
+            mamba_params.conv_kernel,
+            mamba_params.num_heads,
+            mamba_params.n_groups,
+            mamba_params.head_dim,
+            mamba_params.num_mamba_layers,
+            mamba_params.mamba_layer_mask,
+            mamba_params.dtype,
+            mamba_params.mamba_ssm_cache_dtype,
+            # kv cache parameters (positional)
+            kv_cache_config,
+            tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF,
+            # kw-only
+            num_layers=mamba_params.num_full_attention_layers,
+            layer_mask=mamba_params.full_attention_layer_mask,
+            num_kv_heads=text_config.num_key_value_heads,
+            head_dim=head_dim,
             tokens_per_block=tokens_per_block,
             max_seq_len=max_seq_len,
-            batch_size=batch_size,
-            num_blocks=num_blocks,
+            max_batch_size=batch_size,
+            mapping=mapping,
+            dtype=kv_cache_dtype,
         )
 
-        self.kv_cache_manager.add_dummy_requests(request_ids=[1], token_nums=[max_seq_len])
-
     def get_max_num_tokens(self, scenario: MultimodalScenario) -> int:
         """Get maximum number of tokens for attention metadata."""
         if scenario.chunked_prefill:
@@ -695,6 +806,14 @@ def setUp(self):
         # TODO: Add multi-GPU support
         self.device = torch.device("cuda:0")
 
+        # Pre-initialize fields that tearDown / setup_scenario expect to
+        # exist. Without this, a test method that doesn't run
+        # setup_scenario (e.g. a setUp-only smoke test) leaves
+        # self.kv_cache_manager unset and tearDown errors with
+        # AttributeError on the ``is not None`` check.
+        self.kv_cache_manager = None
+        self.attn_metadata = None
+
         self.hf_config = self.create_hf_config()
         if self.skip_hf_inference:
             # Create a dummy torch module if skipping HF inference.
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
index 49fc4cbe4902..b102d231b810 100644
--- a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
@@ -2,16 +2,23 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
+import os
+from copy import deepcopy
 from pathlib import Path
+from typing import List, Optional
 
 import torch
 import transformers
+from test_modeling_multimodal import MultimodalScenario, TestModelingMultimodal
+from transformers import Qwen3_5MoeForConditionalGeneration as HFQwen3_5MoeForConditionalGeneration
+from utils.llm_data import llm_models_root
 
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models import Qwen3_5MoeVLModel
 from tensorrt_llm._torch.models.checkpoints.auto_mapper import AutoCheckpointMapper
 from tensorrt_llm._torch.models.checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper
 from tensorrt_llm._torch.models.modeling_auto import AutoModelForCausalLM
+from tensorrt_llm._torch.models.modeling_qwen3_5 import _normalize_qwen35_moe_vl_config
 from tensorrt_llm._torch.pyexecutor.config_utils import (
     extract_mamba_kv_cache_params,
     load_pretrained_config,
@@ -142,3 +149,282 @@ def test_qwen35_moe_vl_placeholder_metadata_registered() -> None:
     }
     assert metadata.placeholders_separator == ""
     assert metadata.content_format is ContentFormat.STRING
+
+
+# --- Layered parity test scaffold -------------------------------------------
+#
+# Tiny synthetic config used by TestQwen3_5MoeVL below. Same architecture as
+# the real Qwen/Qwen3.5-35B-A3B checkpoint but with much smaller dimensions
+# where possible.
+#
+# Shapes that have to match real Qwen3.5 (can't shrink without breaking
+# things downstream):
+#
+#   - head_dim=256, partial_rotary_factor=0.25 --> rotary tensor width is
+#     `head_dim * 0.25 / 2 = 32`, which equals `sum(mrope_section)`.
+#     A smaller head_dim (e.g. 128) yields a 16-wide tensor that mRoPE
+#     can't split with section sum 32.
+#   - num_attention_heads=16, num_key_value_heads=2 match the real
+#     model's 8:1 GQA layout; Q proj is 2048 --> 4096, K/V are 2048 --> 512.
+#   - Vision deepstack indices [8, 16, 24] match the real config, and
+#     depth=27 is deep enough to host all of those indices. Disabling
+#     deepstack (indices=[], depth=2) produces fewer vision embeddings
+#     than the HF processor reserves placeholder tokens for, which
+#     breaks `fuse_input_embeds`.
+#   - vocab_size=248320 matches the real Qwen3.5 tokenizer. The
+#     tokenizer (loaded via _name_or_path) emits special-token ids in
+#     the 248k range; `fuse_input_embeds` uses `vocab_size` as the
+#     OOV threshold to identify image-pad tokens. A smaller vocab_size
+#     would misclassify regular chat-template specials as mm tokens
+#     and trip the placeholder/embedding count check.
+#
+# Shapes that can be shrunk for tests:
+#
+#   - num_hidden_layers: 2 (vs 40+).
+#   - num_experts: 128 (vs 256). Still enough experts to exercise MoE routing.
+#   - full_attention_interval=2 with 2 LM layers yields the pattern
+#     [linear_attention, full_attention] — one of each kind, exercising
+#     both the regular KV cache and the Mamba SSM/conv state via the
+#     base-class dispatch.
+#
+# `_name_or_path` points at the real checkpoint dir so the test can load
+# the tokenizer/processor (only the processor; not the full model weights).
+QWEN3_5_VL_MOE_PARITY_CONFIG = {
+    "architectures": ["Qwen3_5MoeForConditionalGeneration"],
+    "image_token_id": 248056,
+    "model_type": "qwen3_5_moe",
+    "text_config": {
+        "attention_bias": False,
+        "attention_dropout": 0.0,
+        "bos_token_id": 151643,
+        "dtype": "bfloat16",
+        "eos_token_id": 151645,
+        "full_attention_interval": 2,
+        "head_dim": 256,
+        "hidden_act": "silu",
+        "hidden_size": 2048,
+        "linear_conv_kernel_dim": 4,
+        "linear_key_head_dim": 128,
+        "linear_num_key_heads": 16,
+        "linear_num_value_heads": 32,
+        "linear_value_head_dim": 128,
+        "mamba_ssm_dtype": "float32",
+        "max_position_embeddings": 8192,
+        "mlp_only_layers": [],
+        "model_type": "qwen3_5_moe_text",
+        "moe_intermediate_size": 512,
+        "norm_topk_prob": True,
+        "num_attention_heads": 16,
+        "num_experts": 128,
+        "num_experts_per_tok": 8,
+        "num_hidden_layers": 2,
+        "num_key_value_heads": 2,
+        "rms_norm_eps": 1e-6,
+        "shared_expert_intermediate_size": 512,
+        "rope_parameters": {
+            "mrope_section": [11, 11, 10],
+            "partial_rotary_factor": 0.25,
+            "rope_theta": 1000000.0,
+            "rope_type": "default",
+        },
+        "use_cache": True,
+        "vocab_size": 248320,
+    },
+    "tie_word_embeddings": False,
+    "video_token_id": 248057,
+    "vision_config": {
+        "deepstack_visual_indexes": [8, 16, 24],
+        "depth": 27,
+        "hidden_act": "gelu_pytorch_tanh",
+        "hidden_size": 1152,
+        "in_channels": 3,
+        "initializer_range": 0.02,
+        "intermediate_size": 4304,
+        "model_type": "qwen3_5_moe",
+        "num_heads": 16,
+        "num_position_embeddings": 2304,
+        "out_hidden_size": 2048,
+        "patch_size": 16,
+        "spatial_merge_size": 2,
+        "temporal_patch_size": 2,
+    },
+    "vision_end_token_id": 248054,
+    "vision_start_token_id": 248053,
+    "_name_or_path": str(os.path.join(llm_models_root(), "Qwen3.5-35B-A3B")),
+}
+
+
+class TestQwen3_5MoeVL(TestModelingMultimodal):
+    """Forward-parity test for Qwen3.5-MoE-VL against HuggingFace.
+
+    Tiny-synthetic-config parity test in the same shape as
+    `TestQwen3VLMoe` / `TestQwen2_5VL`: both stacks are constructed
+    from `QWEN3_5_VL_MOE_PARITY_CONFIG` (2 LM layers, 1 linear + 1 full
+    attention, 128 experts, 27 vision layers), HF weights are copied
+    into TRT-LLM via `Qwen3_5MoeHfWeightMapper`, then `test_all`
+    sweeps the default `MultimodalScenario`s comparing last-position
+    logits at context + generation phases.
+
+    Two-config design:
+      - `self.hf_config` is the raw `Qwen3_5MoeConfig.from_dict(...)`
+        result. HF model construction sees the native HF schema
+        (`rope_parameters` intact with `rope_type`,
+        `moe_intermediate_size`, …).
+      - TRT-LLM gets a deep-copied + normalized version via the
+        `create_trtllm_model` override below. That copy goes through
+        `_normalize_qwen35_moe_vl_config` exactly the same way
+        production `load_pretrained_config` does, so the Qwen3Next
+        runtime sees the flat aliases it expects
+        (`intermediate_size`, `rope_theta`, `rope_scaling`, …).
+
+    Keeping the two configs separate means the production normalizer
+    doesn't need to be HF-safe — production only ever constructs the
+    TRT-LLM model from a normalized config, and the test mirrors that
+    boundary explicitly. The hybrid-cache path is handled by the base
+    class's `init_kv_cache_manager` dispatch on
+    `is_qwen3_hybrid` / `is_nemotron_hybrid`.
+    """
+
+    def get_model_config(self):
+        return QWEN3_5_VL_MOE_PARITY_CONFIG
+
+    def get_trtllm_model_class(self):
+        return Qwen3_5MoeVLModel
+
+    def get_hf_model_class(self):
+        return HFQwen3_5MoeForConditionalGeneration
+
+    def get_weight_mapper_class(self):
+        return Qwen3_5MoeHfWeightMapper
+
+    def get_model_type(self):
+        return "qwen3_5_moe"
+
+    def get_model_config_class(self):
+        return transformers.Qwen3_5MoeConfig
+
+    def create_trtllm_model(
+        self,
+        load_weights: bool = False,
+        hf_model_state_dict: Optional[dict] = None,
+        **kwargs,
+    ):
+        """Build the TRT-LLM model from a *normalized copy* of `self.hf_config`.
+
+        Mirrors the base-class body but swaps in
+        `_normalize_qwen35_moe_vl_config(trtllm_config)` before
+        wrapping in `ModelConfig`. `self.hf_config` itself stays
+        raw so the HF model that the base class builds in `setUp`
+        sees native HF schema.
+        """
+        trtllm_config = deepcopy(self.hf_config)
+        _normalize_qwen35_moe_vl_config(trtllm_config)
+
+        model_config = ModelConfig(pretrained_config=trtllm_config)
+        model_class = self.get_trtllm_model_class()
+        model = model_class(model_config, **kwargs).to("cuda")
+
+        if load_weights:
+            weight_mapper_class = self.get_weight_mapper_class()
+            if weight_mapper_class is not None:
+                weight_mapper = weight_mapper_class()
+                weight_mapper.init_model_and_config(model, trtllm_config)
+                model.load_weights(hf_model_state_dict, weight_mapper)
+            else:
+                model.load_weights(hf_model_state_dict)
+
+            for module in model.modules():
+                if hasattr(module, "post_load_weights") and not getattr(
+                    module, "_weights_removed", False
+                ):
+                    module.post_load_weights()
+
+        return model, model_config
+
+    def _dummy_request_kwargs(self, scenario):
+        """Qwen3.5-VL uses mRoPE; the cache manager needs the mRoPE
+        position-id buffer allocated at dummy-request time."""
+        return {"use_mrope": True}
+
+    def get_trtllm_inputs(
+        self,
+        input_ids,
+        multimodal_params_list,
+        is_gen: bool = False,
+        num_cached_tokens_per_seq: Optional[List[int]] = None,
+        total_prompt_len: Optional[int] = None,
+    ):
+        """Override position_ids with mRoPE position IDs from the
+        multimodal params. Same pattern as `TestQwen3VLMoe` — the
+        VLM wrapper feeds mRoPE-shaped position IDs to the decoder,
+        not the simple range-based default the base class produces.
+        """
+        trtllm_inputs = super().get_trtllm_inputs(
+            input_ids,
+            multimodal_params_list,
+            is_gen,
+            num_cached_tokens_per_seq,
+            total_prompt_len=total_prompt_len,
+        )
+
+        if is_gen:
+            mrope_gen_position_ids = []
+            for multimodal_param in multimodal_params_list:
+                mrope_gen_position_ids.append(
+                    multimodal_param.multimodal_data["mrope_config"]["mrope_position_deltas"]
+                )
+            mrope_gen_position_ids = torch.cat(mrope_gen_position_ids, dim=-1).to(self.device)
+            trtllm_inputs["position_ids"] = (
+                (trtllm_inputs["position_ids"] + mrope_gen_position_ids).expand(3, -1, 1).cuda()
+            )
+            gen_multimodal_params_list = []
+            for multimodal_param in multimodal_params_list:
+                multimodal_param.strip_for_generation()
+                multimodal_param.to_device(
+                    "multimodal_data",
+                    self.device,
+                    pin_memory=True,
+                    target_keywords=["mrope_config.mrope_position_deltas"],
+                )
+                gen_multimodal_params_list.append(multimodal_param)
+            trtllm_inputs["multimodal_params"] = gen_multimodal_params_list
+        else:
+            mrope_position_ids = []
+            for multimodal_param in multimodal_params_list:
+                mrope_position_ids.append(
+                    multimodal_param.multimodal_data["mrope_config"]["mrope_position_ids"]
+                )
+            position_ids = torch.cat(mrope_position_ids, dim=-1).cuda()
+            trtllm_inputs["position_ids"] = position_ids
+
+        return trtllm_inputs
+
+    def get_scenarios(self) -> List[MultimodalScenario]:
+        """Minimal scenario sweep for the initial coverage.
+
+        Starts with one image scenario, no CUDA graph / chunked
+        prefill / kv-cache reuse — those add additional surface area
+        (mRoPE handling under graph capture, multimodal cumsum under
+        chunking, etc.) that's worth adding incrementally once the
+        baseline parity passes.
+        """
+        return [
+            MultimodalScenario(
+                modality="image",
+                use_cuda_graph=False,
+                chunked_prefill=False,
+                kv_cache_reuse=False,
+            ),
+        ]
+
+    def test_construction_and_weight_loading_smoke(self):
+        """Smoke test: setUp built HF + TRT-LLM models and copied HF
+        weights into TRT-LLM via the weight mapper. Detailed
+        assertions on the normalizer's outputs live in the routing
+        tests above (e.g. `test_qwen35_moe_vl_config_preserves_vlm_architecture`)
+        — this one just confirms construction reached the end without
+        exception.
+        """
+        self.assertIsNotNone(self.hf_model)
+        self.assertIsNotNone(self.trtllm_model)
+        self.assertIsNotNone(self.model_config)

From 6b72d9d7b2b8b00ea3479c37ba9510f0c76e5470 Mon Sep 17 00:00:00 2001
From: Michal Guzek <mguzek@nvidia.com>
Date: Mon, 18 May 2026 11:23:33 -0700
Subject: [PATCH 6/9] Formatting

Signed-off-by: Michal Guzek <mguzek@nvidia.com>
---
 tensorrt_llm/_torch/models/modeling_qwen3_5.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
index a1d8aaa69a08..28922b23bb6e 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
@@ -75,26 +75,26 @@ def _translate_mtp_pattern(name, n_hidden_layers):
 
 # --- Config adapters --------------------------------------------------------
 #
-# These run from ``load_pretrained_config`` in
-# ``tensorrt_llm/_torch/pyexecutor/config_utils.py`` via lazy import — the
+# These run from `load_pretrained_config` in
+# `tensorrt_llm/_torch/pyexecutor/config_utils.py` via lazy import — the
 # runtime layer asks the model module how to load its own config.
 #
 # There are two entry points:
-#   - ``_Qwen35ConfigCompat.normalize(config_dict)`` — for text-only
+#   - `_Qwen35ConfigCompat.normalize(config_dict)` — for text-only
 #     Qwen3.5 (MoE and dense). Returns a dict that
-#     ``transformers.Qwen3NextConfig.from_dict(...)`` can consume, so the
+#     `transformers.Qwen3NextConfig.from_dict(...)` can consume, so the
 #     existing Qwen3Next runtime is reused unchanged.
-#   - ``_normalize_qwen35_moe_vl_config(model_config)`` — for the
-#     Qwen3.5-MoE VLM. Mutates the HF-native ``transformers.Qwen3_5MoeConfig``
+#   - `_normalize_qwen35_moe_vl_config(model_config)` — for the
+#     Qwen3.5-MoE VLM. Mutates the HF-native `transformers.Qwen3_5MoeConfig`
 #     in place, attaching the runtime aliases the Qwen3Next-based LM expects
-#     while keeping ``text_config`` / ``vision_config`` composite.
+#     while keeping `text_config` / `vision_config` composite.
 
 
 class _Qwen35ConfigCompat:
     """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig.
 
-    We normalize to ``Qwen3NextConfig`` (rather than to a Qwen3.5-native
-    schema) so the runtime can reuse the existing ``Qwen3NextForCausalLM``
+    We normalize to `Qwen3NextConfig` (rather than to a Qwen3.5-native
+    schema) so the runtime can reuse the existing `Qwen3NextForCausalLM`
     model implementation unchanged — Qwen3.5 text is structurally identical
     to Qwen3Next, so matching the config schema lets the same code serve
     both.

From 8da13df7da73e60f47fb9e12e2df18bfd1134fe5 Mon Sep 17 00:00:00 2001
From: Michal Guzek <mguzek@nvidia.com>
Date: Tue, 19 May 2026 12:57:26 -0700
Subject: [PATCH 7/9] Address CodeRabbit review

Signed-off-by: Michal Guzek <mguzek@nvidia.com>
---
 .../defs/accuracy/references/mmmu.yaml        |  2 ++
 .../test_llm_api_pytorch_multimodal.py        | 19 +++++++++---
 .../test_lists/qa/llm_function_core.txt       |  1 +
 .../modeling/test_modeling_qwen3_5_vl_moe.py  | 31 ++++++++++++++-----
 4 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml
index 15d77e50e2de..69e8c050e440 100644
--- a/tests/integration/defs/accuracy/references/mmmu.yaml
+++ b/tests/integration/defs/accuracy/references/mmmu.yaml
@@ -64,6 +64,8 @@ Qwen/Qwen3.5-35B-A3B:
   - accuracy: 59.0
   - dtype: bfloat16
     accuracy: 60.444
+  - quant_algo: FP8_BLOCK_SCALES
+    accuracy: 58.889
 # Kimi K2.5 multimodal (MoonViT + DeepSeek-V3 MoE backbone, ~1T params).
 # Values below are measured with NVFP4 checkpoint (thinking mode enabled).
 moonshotai/Kimi-K2.5:
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
index 2a715fc33124..76c2b532e006 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
@@ -441,13 +441,24 @@ class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):
 
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False)
 
-    def test_auto_dtype(self) -> None:
-        with LLM(
-            self.MODEL_PATH,
+    def _make_llm(self, model_path: str) -> LLM:
+        return LLM(
+            model_path,
             max_num_tokens=self.MAX_NUM_TOKENS,
             max_batch_size=self.MAX_BATCH_SIZE,
             kv_cache_config=self.kv_cache_config,
-        ) as llm:
+        )
+
+    def test_auto_dtype(self) -> None:
+        with self._make_llm(self.MODEL_PATH) as llm:
+            task = MMMU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=self.sampling_params)
+
+    @skip_pre_hopper
+    def test_fp8_prequantized(self) -> None:
+        model_path = f"{llm_models_root()}/Qwen3.5-35B-A3B-FP8"
+        with self._make_llm(model_path) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
 
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
index e52c36078273..360170d89f68 100644
--- a/tests/integration/test_lists/qa/llm_function_core.txt
+++ b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -802,6 +802,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL::test_auto_dtype[forced
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestKimiK25::test_nvfp4[dep8]
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_auto_dtype
+accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_fp8_prequantized
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray
 unittest/disaggregated/test_openai_disagg_server.py
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
index b102d231b810..6956d497e3a6 100644
--- a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
@@ -400,13 +400,18 @@ def get_trtllm_inputs(
         return trtllm_inputs
 
     def get_scenarios(self) -> List[MultimodalScenario]:
-        """Minimal scenario sweep for the initial coverage.
-
-        Starts with one image scenario, no CUDA graph / chunked
-        prefill / kv-cache reuse — those add additional surface area
-        (mRoPE handling under graph capture, multimodal cumsum under
-        chunking, etc.) that's worth adding incrementally once the
-        baseline parity passes.
+        """Modality-sanity sweep (image / multiple_image / video).
+
+        These three catch differences in placeholder counts and the
+        multimodal-cumsum path between single-image, multi-image, and
+        video inputs.
+
+        CUDA-graph capture is intentionally not exercised here. The
+        standard `attn_metadata.create_cuda_graph_metadata` path only
+        addresses attention metadata; the Mamba SSM state buffer of the
+        hybrid (Mamba + attention) cache is not threaded through, so
+        replayed logits diverge from the HF reference. Adding that path
+        is dedicated harness work and tracked separately.
         """
         return [
             MultimodalScenario(
@@ -415,6 +420,18 @@ def get_scenarios(self) -> List[MultimodalScenario]:
                 chunked_prefill=False,
                 kv_cache_reuse=False,
             ),
+            MultimodalScenario(
+                modality="multiple_image",
+                use_cuda_graph=False,
+                chunked_prefill=False,
+                kv_cache_reuse=False,
+            ),
+            MultimodalScenario(
+                modality="video",
+                use_cuda_graph=False,
+                chunked_prefill=False,
+                kv_cache_reuse=False,
+            ),
         ]
 
     def test_construction_and_weight_loading_smoke(self):

From d5e221a64b500880381ca0b0b9934a6d208b63ae Mon Sep 17 00:00:00 2001
From: Michal Guzek <mguzek@nvidia.com>
Date: Tue, 19 May 2026 13:53:11 -0700
Subject: [PATCH 8/9] Address review comments

Signed-off-by: Michal Guzek <mguzek@nvidia.com>
---
 docs/source/models/supported-models.md        |  1 +
 .../_torch/models/modeling_qwen3_5.py         | 25 ++++++++++---------
 .../_torch/pyexecutor/config_utils.py         |  4 +--
 .../test_lists/test-db/l0_l40s.yml            |  1 +
 .../modeling/test_modeling_qwen3_5_vl_moe.py  | 17 +++++++------
 5 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md
index 6670e8366b96..6c839c97ac4f 100644
--- a/docs/source/models/supported-models.md
+++ b/docs/source/models/supported-models.md
@@ -95,6 +95,7 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
 | `Qwen2_5_VLForConditionalGeneration` | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
 | `Qwen3VLForConditionalGeneration`    | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
 | `Qwen3VLMoeForConditionalGeneration` | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
+| `Qwen3_5MoeForConditionalGeneration` | Yes               | Yes        | Untested        | Yes           | Yes              | No             | Untested              | Yes                       | L + I + V |
 
 Note:
 - L: Language
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
index 28922b23bb6e..4f325dbb0bcb 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
@@ -80,7 +80,7 @@ def _translate_mtp_pattern(name, n_hidden_layers):
 # runtime layer asks the model module how to load its own config.
 #
 # There are two entry points:
-#   - `_Qwen35ConfigCompat.normalize(config_dict)` — for text-only
+#   - `Qwen35ConfigCompat.normalize(config_dict)` — for text-only
 #     Qwen3.5 (MoE and dense). Returns a dict that
 #     `transformers.Qwen3NextConfig.from_dict(...)` can consume, so the
 #     existing Qwen3Next runtime is reused unchanged.
@@ -90,7 +90,7 @@ def _translate_mtp_pattern(name, n_hidden_layers):
 #     while keeping `text_config` / `vision_config` composite.
 
 
-class _Qwen35ConfigCompat:
+class Qwen35ConfigCompat:
     """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig.
 
     We normalize to `Qwen3NextConfig` (rather than to a Qwen3.5-native
@@ -111,9 +111,9 @@ class _Qwen35ConfigCompat:
     @staticmethod
     def normalize(config_dict: dict) -> dict:
         """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict."""
-        text_config = _Qwen35ConfigCompat._extract_text_config(config_dict)
-        text_config = _Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config)
-        text_config = _Qwen35ConfigCompat._flatten_rope(text_config)
+        text_config = Qwen35ConfigCompat._extract_text_config(config_dict)
+        text_config = Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config)
+        text_config = Qwen35ConfigCompat._flatten_rope(text_config)
 
         # Detect dense vs MoE and set architecture + MoE defaults accordingly
         is_moe = "num_experts" in text_config and text_config["num_experts"] > 0
@@ -138,7 +138,7 @@ def normalize(config_dict: dict) -> dict:
     def _extract_text_config(config_dict: dict) -> dict:
         """Pull nested text_config from VLM checkpoints, or use dict as-is."""
         architectures = config_dict.get("architectures") or []
-        if architectures and architectures[0] in _Qwen35ConfigCompat._VLM_ARCHITECTURES:
+        if architectures and architectures[0] in Qwen35ConfigCompat._VLM_ARCHITECTURES:
             text_config = dict(config_dict.get("text_config") or {})
         else:
             text_config = dict(config_dict)
@@ -161,10 +161,10 @@ def _inherit_quantization_config(config_dict: dict, text_config: dict) -> dict:
 
         quantization_config = dict(config_dict["quantization_config"])
         if "modules_to_not_convert" in quantization_config:
-            modules = _Qwen35ConfigCompat._normalize_exclude_modules(
+            modules = Qwen35ConfigCompat._normalize_exclude_modules(
                 quantization_config["modules_to_not_convert"]
             )
-            modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(text_config, modules)
+            modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround(text_config, modules)
             quantization_config["modules_to_not_convert"] = sorted(set(modules))
         text_config["quantization_config"] = quantization_config
         return text_config
@@ -254,7 +254,7 @@ def _normalize_qwen35_mrope_config(text_config) -> None:
         return
     if hasattr(rope_parameters, "to_dict"):
         rope_parameters = rope_parameters.to_dict()
-    flattened = _Qwen35ConfigCompat._flatten_rope(
+    flattened = Qwen35ConfigCompat._flatten_rope(
         {
             "rope_parameters": dict(rope_parameters),
             "rope_scaling": dict(getattr(text_config, "rope_scaling", None) or {}),
@@ -290,9 +290,9 @@ def _normalize_qwen35_quantization_config(model_config) -> None:
         return
 
     text_config = getattr(model_config, "text_config", None)
-    normalized_modules = _Qwen35ConfigCompat._normalize_exclude_modules(modules)
+    normalized_modules = Qwen35ConfigCompat._normalize_exclude_modules(modules)
     if text_config is not None:
-        normalized_modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(
+        normalized_modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround(
             text_config.to_dict(), normalized_modules
         )
     quantization_config["modules_to_not_convert"] = sorted(set(normalized_modules))
@@ -390,7 +390,7 @@ class Qwen3_5ForCausalLM(Qwen3NextForCausalLM):
 
     Same reuse pattern as Qwen3_5MoeForCausalLM, but for the dense 27B
     variant which uses GatedMLP instead of SparseMoeBlock.  The config
-    normalizer (_Qwen35ConfigCompat) sets num_experts=0 so that
+    normalizer (Qwen35ConfigCompat) sets num_experts=0 so that
     Qwen3NextModel selects GatedMLP for the feed-forward layers.
     """
 
@@ -399,6 +399,7 @@ def __init__(self, model_config):
         super().__init__(model_config)
 
 
+# TODO: Add tests for disaggregated support.
 @support_multimodal_disaggregated
 @register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel)
 @register_auto_model("Qwen3_5MoeForConditionalGeneration")
diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py
index 978796f26d46..c6790ebacf45 100644
--- a/tensorrt_llm/_torch/pyexecutor/config_utils.py
+++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py
@@ -379,9 +379,9 @@ def load_pretrained_config(model_name_or_path: str,
                             )):
         # Qwen3.5 text-only: flatten to Qwen3NextConfig via the model-side shim.
         from tensorrt_llm._torch.models.modeling_qwen3_5 import \
-            _Qwen35ConfigCompat
+            Qwen35ConfigCompat
         model_config = transformers.Qwen3NextConfig.from_dict(
-            _Qwen35ConfigCompat.normalize(config_dict))
+            Qwen35ConfigCompat.normalize(config_dict))
     elif (model_type == "exaone4" and config_dict.get("sliding_window") is None
           and config_dict.get("layer_types") is None):
         # transformers 5.5.x Exaone4Config.__post_init__ first forces
diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml
index 9c72f9dccb86..a82a62e2c77d 100644
--- a/tests/integration/test_lists/test-db/l0_l40s.yml
+++ b/tests/integration/test_lists/test-db/l0_l40s.yml
@@ -23,6 +23,7 @@ l0_l40s:
   - unittest/_torch/modeling/test_modeling_qwen2_5vl.py::TestQwen2_5_VL::test_all
   - unittest/_torch/modeling/test_modeling_qwen3vl_moe.py::TestQwen3VLMoe::test_all
   - unittest/_torch/modeling/test_modeling_qwen3vl.py::TestQwen3VL::test_all
+  - unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py::TestQwen3_5MoeVL::test_all
   - test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B]
   - unittest/llmapi/apps/_test_openai_chat_multimodal.py::test_single_chat_session_image_embeds -m needs_l40s
   # MMMU sanity check
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
index 6956d497e3a6..df30e93d89e0 100644
--- a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
@@ -325,13 +325,9 @@ def create_trtllm_model(
         model = model_class(model_config, **kwargs).to("cuda")
 
         if load_weights:
-            weight_mapper_class = self.get_weight_mapper_class()
-            if weight_mapper_class is not None:
-                weight_mapper = weight_mapper_class()
-                weight_mapper.init_model_and_config(model, trtllm_config)
-                model.load_weights(hf_model_state_dict, weight_mapper)
-            else:
-                model.load_weights(hf_model_state_dict)
+            weight_mapper = self.get_weight_mapper_class()()
+            weight_mapper.init_model_and_config(model, trtllm_config)
+            model.load_weights(hf_model_state_dict, weight_mapper)
 
             for module in model.modules():
                 if hasattr(module, "post_load_weights") and not getattr(
@@ -346,6 +342,13 @@ def _dummy_request_kwargs(self, scenario):
         position-id buffer allocated at dummy-request time."""
         return {"use_mrope": True}
 
+    def get_tolerance(self):
+        """Return `(atol, rtol)`. Tighten `rtol` to `0.1` (4x tighter than
+        the base 0.4 default) while keeping `atol` at `0.4` to absorb
+        single-logit tail outliers seen on `multiple_image` / `video`.
+        """
+        return 0.4, 0.1
+
     def get_trtllm_inputs(
         self,
         input_ids,

From ee6511e349831e89e07b09ed77bc5fd846aff0ae Mon Sep 17 00:00:00 2001
From: Michal Guzek <mguzek@nvidia.com>
Date: Wed, 20 May 2026 20:38:57 -0700
Subject: [PATCH 9/9] Restore tensorrt_llm/_torch/configs/__init__.py from main

Signed-off-by: Michal Guzek <mguzek@nvidia.com>
---
 tensorrt_llm/_torch/configs/__init__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorrt_llm/_torch/configs/__init__.py b/tensorrt_llm/_torch/configs/__init__.py
index 0ab6bc3fcacf..6496e3283451 100644
--- a/tensorrt_llm/_torch/configs/__init__.py
+++ b/tensorrt_llm/_torch/configs/__init__.py
@@ -24,6 +24,4 @@ def _register_custom_configs_with_transformers() -> None:
 _register_custom_configs_with_transformers()
 del _register_custom_configs_with_transformers
 
-__all__ = [
-    "DeepseekV3Config",
-]
+__all__ = ["DeepseekV3Config"]