diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md
index 0d9e19e68104..fafaa1eb3e31 100644
--- a/docs/source/models/supported-models.md
+++ b/docs/source/models/supported-models.md
@@ -106,6 +106,7 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
 | `Qwen3VLMoeForConditionalGeneration` | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
 | `Step3p7ForConditionalGeneration`    | Yes               | Yes        | Untested        | Yes           | Untested         | Untested       | Untested              | Untested                  | L + I     |
 | `MiniMaxM3SparseForConditionalGeneration` [^11] | Yes               | Yes        | Untested        | Yes           | Untested         | No             | Untested              | Untested                  | L + I + V |
+| `Qwen3_5MoeForConditionalGeneration` | Yes               | Yes        | Untested        | Yes           | Yes              | No             | Untested              | Yes                       | L + I + V |
 
 Note:
 - L: Language
diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py
index f33fd49ff916..aa2fb03d0b38 100644
--- a/tensorrt_llm/_torch/models/__init__.py
+++ b/tensorrt_llm/_torch/models/__init__.py
@@ -42,7 +42,8 @@
                             Qwen2ForRewardModel)
 from .modeling_qwen2vl import Qwen2_5_VLModel, Qwen2VLModel
 from .modeling_qwen3 import Qwen3ForCausalLM
-from .modeling_qwen3_5 import Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM
+from .modeling_qwen3_5 import (Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM,
+                               Qwen3_5MoeVLModel)
 from .modeling_qwen3_moe import Qwen3MoeForCausalLM
 from .modeling_qwen3_next import Qwen3NextForCausalLM
 from .modeling_qwen3vl import Qwen3VLModel
@@ -102,6 +103,7 @@
     "Qwen3MoeForCausalLM",
     "Qwen3_5ForCausalLM",
     "Qwen3_5MoeForCausalLM",
+    "Qwen3_5MoeVLModel",
     "Qwen3NextForCausalLM",
     "Qwen3MoeVLModel",
     "GptOssForCausalLM",
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py
index e0e8dc9dbb40..a4226af7d87b 100644
--- a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py
@@ -16,6 +16,7 @@
 
 
 @register_mapper("HF", "Qwen3_5MoeForCausalLM")
+@register_mapper("HF", "Qwen3_5MoeForConditionalGeneration")
 @register_mapper("HF", "Qwen3_5ForCausalLM")
 class Qwen3_5MoeHfWeightMapper(Qwen3NextHfWeightMapper):
     """Weight mapper for Qwen3.5 MoE text checkpoints.
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
index 7c85f1cdf6fc..ce9fcefb9ece 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
@@ -1,7 +1,29 @@
 import re
+from types import SimpleNamespace
+from typing import Dict, List
 
+import torch
+from transformers import PretrainedConfig
+
+from ...inputs import (
+    ContentFormat,
+    MultimodalPlaceholderMetadata,
+    MultimodalPlaceholderPlacement,
+    register_input_processor,
+    support_multimodal_disaggregated,
+)
+from ..pyexecutor.config_utils import get_qwen3_hybrid_layer_types
+from .checkpoints.base_weight_mapper import BaseWeightMapper
+from .checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper
+from .modeling_multimodal_utils import _is_mm_disagg
 from .modeling_qwen3_next import Qwen3NextForCausalLM
-from .modeling_utils import register_auto_model
+from .modeling_qwen3vl import (
+    Qwen3VisionModel,
+    Qwen3VisionModelBase,
+    Qwen3VLInputProcessorBase,
+    Qwen3VLModelBase,
+)
+from .modeling_utils import ModelConfig, register_auto_model, register_vision_encoder
 
 _LANG_PREFIX = "model.language_model."
 
@@ -51,6 +73,248 @@ def _translate_mtp_pattern(name, n_hidden_layers):
     return None
 
 
+# --- Config adapters --------------------------------------------------------
+#
+# These run from `load_pretrained_config` in
+# `tensorrt_llm/_torch/pyexecutor/config_utils.py` via lazy import — the
+# runtime layer asks the model module how to load its own config.
+#
+# There are two entry points:
+#   - `Qwen35ConfigCompat.normalize(config_dict)` — for text-only
+#     Qwen3.5 (MoE and dense). Returns a dict that
+#     `transformers.Qwen3NextConfig.from_dict(...)` can consume, so the
+#     existing Qwen3Next runtime is reused unchanged.
+#   - `_normalize_qwen35_moe_vl_config(model_config)` — for the
+#     Qwen3.5-MoE VLM. Mutates the HF-native `transformers.Qwen3_5MoeConfig`
+#     in place, attaching the runtime aliases the Qwen3Next-based LM expects
+#     while keeping `text_config` / `vision_config` composite.
+
+
+class Qwen35ConfigCompat:
+    """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig.
+
+    We normalize to `Qwen3NextConfig` (rather than to a Qwen3.5-native
+    schema) so the runtime can reuse the existing `Qwen3NextForCausalLM`
+    model implementation unchanged — Qwen3.5 text is structurally identical
+    to Qwen3Next, so matching the config schema lets the same code serve
+    both.
+
+    This is used for Qwen3.5 text-only configs and for shared helper logic such
+    as RoPE and quantization exclude-module normalization. Qwen3.5-MoE VLM
+    configs should stay composite and use transformers.Qwen3_5MoeConfig plus
+    _normalize_qwen35_moe_vl_config instead.
+
+    To remove: delete this class and the elif branch in
+    load_pretrained_config that flattens Qwen3.5 text configs.
+    """
+
+    @staticmethod
+    def normalize(config_dict: dict) -> dict:
+        """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict."""
+        text_config = Qwen35ConfigCompat._extract_text_config(config_dict)
+        text_config = Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config)
+        text_config = Qwen35ConfigCompat._flatten_rope(text_config)
+
+        # Detect dense vs MoE. A missing, null, or zero num_experts all mean dense;
+        # comparing a JSON null directly with `> 0` would raise TypeError.
+        if (text_config.get("num_experts") or 0) > 0:
+            text_config["architectures"] = ["Qwen3_5MoeForCausalLM"]
+            return text_config
+
+        text_config["architectures"] = ["Qwen3_5ForCausalLM"]
+        # Zero the MoE fields (absent or null) so Qwen3NextConfig defaults don't
+        # accidentally enable MoE for the dense model; explicit values are kept.
+        for key in ("num_experts", "num_experts_per_tok", "moe_intermediate_size",
+                    "shared_expert_intermediate_size"):
+            text_config[key] = text_config.get(key) or 0
+        return text_config
+
+    _VLM_ARCHITECTURES = {
+        "Qwen3_5MoeForConditionalGeneration",
+        "Qwen3_5ForConditionalGeneration",
+    }
+
+    @staticmethod
+    def _extract_text_config(config_dict: dict) -> dict:
+        """Copy the nested text_config for VLM checkpoints, else the whole dict."""
+        archs = config_dict.get("architectures") or []
+        is_vlm = bool(archs) and archs[0] in Qwen35ConfigCompat._VLM_ARCHITECTURES
+        source = (config_dict.get("text_config") or {}) if is_vlm else config_dict
+        extracted = dict(source)
+        if extracted:
+            return extracted
+        # Nothing usable: the VLM text_config is absent/empty, or config_dict itself is empty.
+        raise ValueError("Qwen3.5 config is missing a usable text_config")
+
+    @staticmethod
+    def _inherit_quantization_config(config_dict: dict, text_config: dict) -> dict:
+        """Give text_config a normalized copy of the top-level quantization_config.
+
+        A quantization_config already on text_config wins and nothing is changed.
+        Otherwise modules_to_not_convert is translated to TRT-LLM names and extended
+        with the packed linear-attention in_proj_qkvz entries, a temporary workaround
+        that keeps them on bf16 until FP8 block-scale TP loading handles that layout.
+        """
+        key = "quantization_config"
+        if key in text_config or key not in config_dict:
+            return text_config
+
+        # Shallow copy so the caller's config_dict entry is not rebound below.
+        inherited = dict(config_dict[key])
+        if "modules_to_not_convert" in inherited:
+            compat = Qwen35ConfigCompat
+            excluded = compat._normalize_exclude_modules(inherited["modules_to_not_convert"])
+            excluded = compat._add_qkvz_bf16_workaround(text_config, excluded)
+            inherited["modules_to_not_convert"] = sorted(set(excluded))
+        text_config[key] = inherited
+        return text_config
+
+    @staticmethod
+    def _normalize_exclude_modules(modules: list[str]) -> list[str]:
+        """Rewrite HF quantization exclude-module paths using TRT-LLM module names.
+
+        The model.language_model. prefix collapses to model.; model.visual.* and
+        mtp.* entries are dropped; split in_proj_* names map to their packed
+        TRT-LLM names. The result is de-duplicated and sorted.
+        """
+        lm_prefix = "model.language_model."
+        unique = set()
+        for raw in modules:
+            path = "model." + raw[len(lm_prefix) :] if raw.startswith(lm_prefix) else raw
+            if path.startswith(("model.visual.", "mtp.")):
+                continue
+            path = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", path)
+            packed = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", path)
+            unique.add(packed)
+        return sorted(unique)
+
+    @staticmethod
+    def _add_qkvz_bf16_workaround(text_config: dict, modules: list[str]) -> list[str]:
+        """Return modules plus the packed in_proj_qkvz of every linear-attention layer.
+
+        Keeps those projections on the bf16 path; temporary until FP8 block-scale TP
+        loading is fixed for this layout. The caller's list is never mutated, and it
+        is returned as-is when the hybrid layer types cannot be derived.
+        """
+        try:
+            layer_types = get_qwen3_hybrid_layer_types(SimpleNamespace(**text_config))
+        except (ValueError, AttributeError):
+            return modules
+        qkvz = (i for i, kind in enumerate(layer_types) if kind == "linear_attention")
+        return [*modules, *(f"model.layers.{i}.linear_attn.in_proj_qkvz" for i in qkvz)]
+
+    @staticmethod
+    def _flatten_rope(text_config: dict) -> dict:
+        """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling.
+
+        Qwen3.5 nests these in rope_parameters and names the scaling kind rope_type;
+        Qwen3NextConfig expects top-level fields with rope_scaling.type. text_config is
+        mutated and returned; an existing top-level rope_theta / partial_rotary_factor wins.
+        """
+        rope_parameters = dict(text_config.pop("rope_parameters", {}) or {})
+        rope_scaling = dict(text_config.get("rope_scaling") or {})
+        if rope_parameters:
+            rope_theta = rope_parameters.pop("rope_theta", None)
+            if rope_theta is not None:
+                text_config.setdefault("rope_theta", rope_theta)  # existing value wins
+            partial_rotary_factor = rope_parameters.pop("partial_rotary_factor", None)
+            if partial_rotary_factor is not None:
+                text_config.setdefault("partial_rotary_factor", partial_rotary_factor)
+            if rope_parameters:
+                rope_scaling = rope_parameters | rope_scaling  # rope_scaling keys win on conflict
+        if rope_scaling:
+            has_mrope = "mrope_section" in rope_scaling or rope_scaling.get(
+                "mrope_interleaved", False
+            )
+            if has_mrope:
+                rope_scaling["type"] = "mrope"
+                rope_scaling.pop("rope_type", None)
+            elif "type" not in rope_scaling and "rope_type" in rope_scaling:
+                rope_type = rope_scaling.pop("rope_type")
+                # "default" means standard RoPE (no scaling) — don't set
+                # rope_scaling to avoid triggering scaling code paths.
+                if rope_type == "default":
+                    rope_scaling = {}
+                else:
+                    rope_scaling["type"] = rope_type
+            if rope_scaling:  # emptied above for "default": text_config is left as it was
+                text_config["rope_scaling"] = rope_scaling
+        return text_config
+
+
+def _normalize_qwen35_mrope_config(text_config) -> None:
+    """Expose flat RoPE attributes on the text config for the Qwen3-VL path.
+
+    HF keeps RoPE metadata under `rope_parameters`, whereas the shared Qwen3-VL
+    wrapper reads `rope_theta`, `partial_rotary_factor`, and `rope_scaling`
+    directly on the text config. Does nothing when `rope_parameters` is unset
+    or empty.
+    """
+    params = getattr(text_config, "rope_parameters", None)
+    if not params:
+        return
+    if hasattr(params, "to_dict"):
+        params = params.to_dict()
+    current_scaling = getattr(text_config, "rope_scaling", None) or {}
+
+    # Reuse the dict-based flattener on copies so neither source mapping is
+    # mutated, then copy back only the fields it actually produced.
+    scratch = {"rope_parameters": dict(params), "rope_scaling": dict(current_scaling)}
+    flattened = Qwen35ConfigCompat._flatten_rope(scratch)
+    for key in ("rope_theta", "partial_rotary_factor", "rope_scaling"):
+        if (value := flattened.get(key)) is not None:
+            setattr(text_config, key, value)
+
+
+def _normalize_qwen35_qwen3next_text_aliases(text_config) -> None:
+    """Derive `intermediate_size` for the shared Qwen3Next-style runtime when it is unset."""
+    # Only fill the alias in; an explicit value always wins.
+    if getattr(text_config, "intermediate_size", None) is not None:
+        return
+    expert_width = getattr(text_config, "moe_intermediate_size", None)
+    top_k = getattr(text_config, "num_experts_per_tok", None)
+    if expert_width is None or top_k is None:
+        return
+    shared_width = getattr(text_config, "shared_expert_intermediate_size", 0) or 0
+    # experts-per-token * per-expert width + optional shared-expert width
+    text_config.intermediate_size = top_k * expert_width + shared_width
+
+
+def _normalize_qwen35_quantization_config(model_config) -> None:
+    """Rewrite `modules_to_not_convert` in place using TRT-LLM module names.
+
+    Does nothing unless `quantization_config` is a dict that carries the list.
+    """
+    quant_cfg = getattr(model_config, "quantization_config", None)
+    excluded = quant_cfg.get("modules_to_not_convert") if isinstance(quant_cfg, dict) else None
+    if excluded is None:
+        return
+
+    excluded = Qwen35ConfigCompat._normalize_exclude_modules(excluded)
+    text_config = getattr(model_config, "text_config", None)
+    if text_config is not None:
+        as_dict = text_config.to_dict()
+        excluded = Qwen35ConfigCompat._add_qkvz_bf16_workaround(as_dict, excluded)
+    quant_cfg["modules_to_not_convert"] = sorted(set(excluded))
+
+
+def _normalize_qwen35_moe_vl_config(model_config) -> None:
+    """Adapt HF Qwen3.5-MoE VLM config to TRT-LLM runtime conventions (mutates in place)."""
+    if not getattr(model_config, "architectures", None):
+        model_config.architectures = ["Qwen3_5MoeForConditionalGeneration"]
+
+    text_config = getattr(model_config, "text_config", None)
+    if text_config is None:
+        raise ValueError("Qwen3.5-MoE VLM config is missing text_config")
+
+    text_config.architectures = ["Qwen3_5MoeForCausalLM"]
+    _normalize_qwen35_qwen3next_text_aliases(text_config)
+    _normalize_qwen35_mrope_config(text_config)
+    # Tolerate any call signature: get_text_config's keywords vary across transformers versions.
+    model_config.get_text_config = lambda *args, **kwargs: text_config
+    _normalize_qwen35_quantization_config(model_config)
+
+
 def _normalize_qwen35_exclude_modules(model_config):
     """Normalize NVFP4/FP8 exclude_modules from HF naming to TRT-LLM naming.
 
@@ -130,10 +394,58 @@ class Qwen3_5ForCausalLM(Qwen3NextForCausalLM):
 
     Same reuse pattern as Qwen3_5MoeForCausalLM, but for the dense 27B
     variant which uses GatedMLP instead of SparseMoeBlock.  The config
-    normalizer (_Qwen35ConfigCompat) sets num_experts=0 so that
+    normalizer (Qwen35ConfigCompat) sets num_experts=0 so that
     Qwen3NextModel selects GatedMLP for the feed-forward layers.
     """
 
     def __init__(self, model_config):
         _normalize_qwen35_exclude_modules(model_config)
         super().__init__(model_config)
+
+
+# TODO: Add tests for disaggregated support.
+@support_multimodal_disaggregated
+@register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel)
+@register_auto_model("Qwen3_5MoeForConditionalGeneration")
+@register_input_processor(
+    Qwen3VLInputProcessorBase,
+    model_type="qwen3_5_moe",
+    placeholder_metadata=MultimodalPlaceholderMetadata(
+        placeholder_map={
+            "image": "<|vision_start|><|image_pad|><|vision_end|>",
+            "video": "<|vision_start|><|video_pad|><|vision_end|>",
+        },
+        placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT,
+        placeholders_separator="",
+        content_format=ContentFormat.STRING,
+    ),
+)
+class Qwen3_5MoeVLModel(Qwen3VLModelBase):
+    """Qwen3.5-MoE VLM: the Qwen3 vision encoder paired with the Qwen3.5 MoE text decoder."""
+
+    def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs):
+        # Vision class is always forced; disable_fuse_rope only defaults to False.
+        kwargs["vision_model_class"] = Qwen3VisionModel
+        kwargs.setdefault("disable_fuse_rope", False)
+        super().__init__(model_config, *args, **kwargs)
+
+    @property
+    def multimodal_data_device_paths(self) -> List[str]:
+        """Dotted multimodal-data paths (presumably the tensors placed on device — confirm)."""
+        image_and_video = ["image.pixel_values", "video.pixel_values_videos"]
+        mrope = ["mrope_config.mrope_position_ids", "mrope_config.mrope_position_deltas"]
+        return [*image_and_video, "multimodal_embedding", *mrope]
+
+    def load_weights(self, weights: Dict[str, torch.Tensor], weight_mapper: BaseWeightMapper):
+        """Load vision weights unless disaggregated, then text weights with a fresh mapper."""
+        if not _is_mm_disagg():
+            self.mm_encoder.load_weights(weights)
+
+        # The `weight_mapper` argument is replaced by a mapper bound to the inner LLM.
+        llm_mapper = Qwen3_5MoeHfWeightMapper()
+        llm_mapper.init_model_and_config(self.llm, self.model_config)
+        # The LLM never receives vision-tower tensors.
+        text_weights = {k: v for k, v in weights.items() if not k.startswith("model.visual.")}
+        # Checkpoint keys under model.language_model.* are renamed to model.*.
+        rename = {r"^model\.language_model\.(.*)$": r"model.\1"}
+        self.llm.load_weights(text_weights, llm_mapper, params_map=rename)
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_next.py b/tensorrt_llm/_torch/models/modeling_qwen3_next.py
index 7667972804ad..cf8607fb59e2 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_next.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_next.py
@@ -976,9 +976,18 @@ def get_model_defaults(cls, llm_args: 'TorchLlmArgs') -> dict:
         # is supported for Mamba/SSM-based models
         return {"kv_cache_config": {"enable_block_reuse": False}}
 
-    def load_weights(self, weights: dict, weight_mapper: BaseWeightMapper):
+    def load_weights(self,
+                     weights: dict,
+                     weight_mapper: BaseWeightMapper,
+                     params_map: Optional[Dict[str, str]] = None,
+                     allow_partial_loading: bool = False):
         new_weights = weight_mapper.preprocess_weights(weights)
-        super().load_weights(new_weights, weight_mapper)
+        super().load_weights(
+            new_weights,
+            weight_mapper=weight_mapper,
+            params_map=params_map,
+            allow_partial_loading=allow_partial_loading,
+        )
 
     def setup_aliases(self) -> None:
         for idx, layer in enumerate(
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3vl.py b/tensorrt_llm/_torch/models/modeling_qwen3vl.py
index 82613d938982..19fb82811b4f 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3vl.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3vl.py
@@ -1119,6 +1119,8 @@ def __init__(
             llm_model_config.pretrained_config.architectures = ["Qwen3ForCausalLM"]
         elif self.original_arch == "Qwen3VLMoeForConditionalGeneration":
             llm_model_config.pretrained_config.architectures = ["Qwen3MoeForCausalLM"]
+        elif self.original_arch == "Qwen3_5MoeForConditionalGeneration":
+            llm_model_config.pretrained_config.architectures = ["Qwen3_5MoeForCausalLM"]
         else:
             raise ValueError(f"Unsupported architecture: {self.original_arch}")
         # Qwen3ForCausalLM.
@@ -1180,9 +1182,12 @@ def init_mrope_embedding(self, model_config: ModelConfig[PretrainedConfig]):
             mrope_section=config.rope_scaling.get("mrope_section", None),
             mrope_interleaved=config.rope_scaling.get("mrope_interleaved", False),
         )
+        head_dim = getattr(config, "head_dim", None)
+        if not isinstance(head_dim, int):
+            head_dim = config.hidden_size // config.num_attention_heads
         self.rotary_emb = MRotaryEmbedding(
             pos_embd_params.rope,
-            head_dim=config.hidden_size // config.num_attention_heads,
+            head_dim=head_dim,
             is_neox=pos_embd_params.is_neox,
             mrope_section=pos_embd_params.mrope_section,
             mrope_interleaved=pos_embd_params.mrope_interleaved,
@@ -1311,6 +1316,13 @@ def forward(
             )
             deepstack_embeds = list(deepstack_buffer.unbind(0))
 
+        # Preserve the pre-fusion token IDs. `fuse_input_embeds` collapses
+        # input_ids -> None when MM embeddings are fused in, but spec
+        # decoding (MTP / Eagle) still needs the original prompt token
+        # IDs for drafter context preparation; pass them through as a
+        # dedicated kwarg consumed by `SpecDecOneEngineForCausalLM.forward`.
+        orig_input_ids = input_ids
+
         input_ids, input_embeds = fuse_input_embeds(
             self.llm.model.embed_tokens,
             input_ids,
@@ -1327,8 +1339,14 @@ def forward(
             return_context_logits=return_context_logits,
             deepstack_embeds=deepstack_embeds,
             mrope_config=mrope_config,
+            spec_metadata=kwargs.get("spec_metadata"),
+            resource_manager=kwargs.get("resource_manager"),
+            orig_input_ids=orig_input_ids,
         )
-        logger.debug(f"output shape: {output_prob.shape}")
+        # Spec-decoding (MTP / Eagle) returns a dict (accepted tokens,
+        # draft tokens, logits); plain forward returns a tensor.
+        if hasattr(output_prob, "shape"):
+            logger.debug(f"output shape: {output_prob.shape}")
         return output_prob
 
     def _get_requests_with_mm_data(self, multimodal_params):
diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py b/tensorrt_llm/_torch/models/modeling_speculative.py
index 62683b3f62f2..552a8fbc4fbf 100755
--- a/tensorrt_llm/_torch/models/modeling_speculative.py
+++ b/tensorrt_llm/_torch/models/modeling_speculative.py
@@ -1765,12 +1765,17 @@ def forward(
                 True,
             )
 
-            spec_input_ids = input_ids
+            # VLM wrappers (e.g. Qwen3VLModelBase) replace input_ids with
+            # fused inputs_embeds; fall back to the pre-fusion token IDs
+            # they forward via `orig_input_ids` so MTP / Eagle drafters
+            # can still access the prompt tokens.
+            spec_input_ids = input_ids if input_ids is not None else kwargs.get(
+                "orig_input_ids")
             spec_position_ids = position_ids
             if attn_metadata.padded_num_tokens is not None:
-                if input_ids is not None:
+                if spec_input_ids is not None:
                     # Slice along the first dimension
-                    spec_input_ids = input_ids[:attn_metadata.num_tokens]
+                    spec_input_ids = spec_input_ids[:attn_metadata.num_tokens]
                 if position_ids is not None:
                     spec_position_ids = _slice_spec_position_ids(
                         position_ids, attn_metadata.num_tokens)
diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py
index 2ab710bf14ac..5ad48cb880a1 100644
--- a/tensorrt_llm/_torch/pyexecutor/config_utils.py
+++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py
@@ -1,11 +1,10 @@
 import dataclasses
-import re
-from types import SimpleNamespace
 from typing import List, Optional
 
 import torch
 import transformers
 
+from tensorrt_llm._utils import str_dtype_to_torch
 from tensorrt_llm.logger import logger
 
 
@@ -21,6 +20,60 @@ def is_hybrid_linear(config):
     return is_nemotron_hybrid(config) or is_qwen3_hybrid(config)
 
 
+def _coerce_torch_dtype(dtype):
+    """Convert a dtype value read from an HF config into a torch dtype.
+
+    Such fields may hold a torch dtype, a dtype name string, or the sentinel
+    "auto". The sentinel becomes None so the caller keeps its normal fallback
+    path; any other non-string value is returned unchanged.
+    """
+    # Real torch dtypes pass straight through.
+    if not isinstance(dtype, torch.dtype):
+        if dtype == "auto":
+            dtype = None
+        elif isinstance(dtype, str):
+            dtype = str_dtype_to_torch(dtype)
+    return dtype
+
+
+def resolve_hf_torch_dtype(config):
+    """Look up the model's regular tensor dtype on an HF config.
+
+    Transformers has used both `dtype` and `torch_dtype` across versions and
+    model families, so the two names are probed in that order and the first
+    usable value is coerced into the form TRT-LLM runtime code expects. "auto"
+    counts as missing, so the scan moves on to the next field. Returns None
+    when neither field yields a dtype.
+    """
+    coerced = (
+        _coerce_torch_dtype(getattr(config, name, None))
+        for name in ("dtype", "torch_dtype")
+    )
+    return next((found for found in coerced if found is not None), None)
+
+
+def resolve_mamba_ssm_cache_dtype(config):
+    """Find the dtype for hybrid Mamba/SSM cache allocations.
+
+    The value may sit on the top-level config or on its nested `text_config`,
+    under either `mamba_ssm_cache_dtype` or `mamba_ssm_dtype`. The top-level
+    config is searched first, and within each config the names are tried in
+    that order. "auto" is treated like a missing field; None is returned when
+    nothing usable is found.
+    """
+    nested = getattr(config, "text_config", None)
+    search_order = [config] if nested is None else [config, nested]
+    field_names = ("mamba_ssm_cache_dtype", "mamba_ssm_dtype")
+
+    for holder in search_order:
+        for field_name in field_names:
+            dtype = _coerce_torch_dtype(getattr(holder, field_name, None))
+            if dtype is None:
+                continue
+            return dtype
+    return None
+
+
 def is_nemotron_hybrid(config):
     if hasattr(config, "hybrid_override_pattern"
                ) and config.hybrid_override_pattern is not None and len(
@@ -249,8 +302,14 @@ def extract_mamba_kv_cache_params(
             full_attn_mask.extend([True] * num_spec_layers)
             mamba_mask.extend([False] * num_spec_layers)
 
-    mamba_ssm_cache_dtype = (quant_config.mamba_ssm_cache_dtype
-                             if quant_config is not None else None)
+    mamba_ssm_cache_dtype = None
+    if quant_config is not None:
+        mamba_ssm_cache_dtype = _coerce_torch_dtype(
+            quant_config.mamba_ssm_cache_dtype)
+    if mamba_ssm_cache_dtype is None:
+        mamba_ssm_cache_dtype = (resolve_mamba_ssm_cache_dtype(config)
+                                 or resolve_hf_torch_dtype(config)
+                                 or torch.bfloat16)
 
     return MambaKVCacheParams(
         state_size=state_size,
@@ -262,176 +321,11 @@ def extract_mamba_kv_cache_params(
         full_attention_layer_mask=full_attn_mask,
         num_mamba_layers=sum(mamba_mask),
         num_full_attention_layers=sum(full_attn_mask),
-        dtype=config.torch_dtype,
+        dtype=resolve_hf_torch_dtype(config) or torch.bfloat16,
         mamba_ssm_cache_dtype=mamba_ssm_cache_dtype,
     )
 
 
-class _Qwen35MoeVLMConfig(transformers.Qwen3NextConfig):
-    """Thin subclass that restores the top-level model_type for Qwen3.5 MoE.
-
-    ``_Qwen35ConfigCompat`` normalizes the HF config into Qwen3NextConfig
-    (needed by the PyTorch backend model), but that loses the original
-    ``model_type``.  The serving layer needs ``model_type = "qwen3_5_moe"``
-    for ``MULTIMODAL_PLACEHOLDER_REGISTRY`` lookup; without it,
-    ``resolve_top_level_model_type`` returns ``"qwen3_next"`` and multimodal
-    requests fail with "Unknown modality".
-
-    To remove: when ``_Qwen35ConfigCompat`` is removed and the PyTorch backend
-    consumes ``Qwen3_5MoeConfig`` directly.
-    """
-
-    model_type = "qwen3_5_moe"
-
-
-class _Qwen35ConfigCompat:
-    """Temporary shim that normalizes Qwen3.5 HF configs into Qwen3NextConfig.
-
-    To remove: delete this class and the elif branch in
-    load_pretrained_config that references it.
-    """
-
-    @staticmethod
-    def normalize(config_dict: dict) -> dict:
-        """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict."""
-        text_config = _Qwen35ConfigCompat._extract_text_config(config_dict)
-        text_config = _Qwen35ConfigCompat._inherit_quantization_config(
-            config_dict, text_config)
-        text_config = _Qwen35ConfigCompat._flatten_rope(text_config)
-
-        # Detect dense vs MoE and set architecture + MoE defaults accordingly
-        is_moe = "num_experts" in text_config and text_config["num_experts"] > 0
-        if is_moe:
-            text_config["architectures"] = ["Qwen3_5MoeForCausalLM"]
-        else:
-            text_config["architectures"] = ["Qwen3_5ForCausalLM"]
-            # Ensure MoE fields are zeroed so Qwen3NextConfig defaults don't
-            # accidentally enable MoE for the dense model.
-            text_config.setdefault("num_experts", 0)
-            text_config.setdefault("num_experts_per_tok", 0)
-            text_config.setdefault("moe_intermediate_size", 0)
-            text_config.setdefault("shared_expert_intermediate_size", 0)
-        return text_config
-
-    _VLM_ARCHITECTURES = {
-        "Qwen3_5MoeForConditionalGeneration",
-        "Qwen3_5ForConditionalGeneration",
-    }
-
-    @staticmethod
-    def _extract_text_config(config_dict: dict) -> dict:
-        """Pull nested text_config from VLM checkpoints, or use dict as-is."""
-        architectures = config_dict.get("architectures") or []
-        if architectures and architectures[
-                0] in _Qwen35ConfigCompat._VLM_ARCHITECTURES:
-            text_config = dict(config_dict.get("text_config") or {})
-        else:
-            text_config = dict(config_dict)
-        if not text_config:
-            raise ValueError("Qwen3.5 config is missing a usable text_config")
-        return text_config
-
-    @staticmethod
-    def _inherit_quantization_config(config_dict: dict,
-                                     text_config: dict) -> dict:
-        """Copy top-level quantization_config into text_config with name normalization.
-
-        Also adds a temporary workaround that keeps packed linear-attention
-        in_proj_qkvz on the bf16 path until FP8 block-scale TP loading is
-        fixed for that layout.
-        """
-        if "quantization_config" in text_config:
-            return text_config
-        if "quantization_config" not in config_dict:
-            return text_config
-
-        quantization_config = dict(config_dict["quantization_config"])
-        if "modules_to_not_convert" in quantization_config:
-            modules = _Qwen35ConfigCompat._normalize_exclude_modules(
-                quantization_config["modules_to_not_convert"])
-            modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(
-                text_config, modules)
-            quantization_config["modules_to_not_convert"] = sorted(set(modules))
-        text_config["quantization_config"] = quantization_config
-        return text_config
-
-    @staticmethod
-    def _normalize_exclude_modules(modules: list[str]) -> list[str]:
-        """Translate HF quantization exclude-module paths to TRT-LLM names.
-
-        - Strip model.language_model. prefix -> model.
-        - Drop model.visual.* and mtp.* entries
-        - Map split projection names to packed TRT-LLM names
-        """
-        normalized = set()
-        for name in modules:
-            if name.startswith("model.language_model."):
-                name = "model." + name[len("model.language_model."):]
-            if name.startswith("model.visual.") or name.startswith("mtp."):
-                continue
-            name = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", name)
-            name = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", name)
-            normalized.add(name)
-        return sorted(normalized)
-
-    @staticmethod
-    def _add_qkvz_bf16_workaround(text_config: dict,
-                                  modules: list[str]) -> list[str]:
-        """Keep packed linear-attention qkvz on bf16 path for all linear-attention layers.
-
-        Temporary until FP8 block-scale TP loading is fixed for this layout.
-        """
-        try:
-            layer_types = get_qwen3_hybrid_layer_types(
-                SimpleNamespace(**text_config))
-        except (ValueError, AttributeError):
-            return modules
-        for layer_idx, layer_type in enumerate(layer_types):
-            if layer_type == "linear_attention":
-                modules.append(
-                    f"model.layers.{layer_idx}.linear_attn.in_proj_qkvz")
-        return modules
-
-    @staticmethod
-    def _flatten_rope(text_config: dict) -> dict:
-        """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling.
-
-        Qwen3.5 nests these inside a rope_parameters dict and uses rope_type
-        instead of type in rope_scaling.  Qwen3NextConfig expects them as
-        top-level fields with rope_scaling.type.
-        """
-        rope_parameters = dict(text_config.pop("rope_parameters", {}) or {})
-        rope_scaling = dict(text_config.get("rope_scaling") or {})
-        if rope_parameters:
-            rope_theta = rope_parameters.pop("rope_theta", None)
-            if rope_theta is not None:
-                text_config.setdefault("rope_theta", rope_theta)
-            partial_rotary_factor = rope_parameters.pop("partial_rotary_factor",
-                                                        None)
-            if partial_rotary_factor is not None:
-                text_config.setdefault("partial_rotary_factor",
-                                       partial_rotary_factor)
-            if rope_parameters:
-                rope_scaling = rope_parameters | rope_scaling
-        if rope_scaling:
-            has_mrope = ("mrope_section" in rope_scaling
-                         or rope_scaling.get("mrope_interleaved", False))
-            if has_mrope:
-                rope_scaling["type"] = "mrope"
-                rope_scaling.pop("rope_type", None)
-            elif "type" not in rope_scaling and "rope_type" in rope_scaling:
-                rope_type = rope_scaling.pop("rope_type")
-                # "default" means standard RoPE (no scaling) — don't set
-                # rope_scaling to avoid triggering scaling code paths.
-                if rope_type == "default":
-                    rope_scaling = {}
-                else:
-                    rope_scaling["type"] = rope_type
-            if rope_scaling:
-                text_config["rope_scaling"] = rope_scaling
-        return text_config
-
-
 # TODO: remove this once the transformers can support all of those models in _CONFIG_REGISTRY
 class LazyConfigDict(dict):
 
@@ -462,6 +356,16 @@ def load_pretrained_config(model_name_or_path: str,
             MistralConfigLoader
         model_config = MistralConfigLoader().load(
             model_name_or_path).pretrained_config
+    elif (model_type == "qwen3_5_moe" and
+          (("text_config" in config_dict and "vision_config" in config_dict) or
+           (architectures
+            and architectures[0] == "Qwen3_5MoeForConditionalGeneration"))):
+        # Qwen3.5-MoE VLM: HF native composite config + model-side normalizer.
+        from tensorrt_llm._torch.models.modeling_qwen3_5 import \
+            _normalize_qwen35_moe_vl_config
+        model_config = transformers.Qwen3_5MoeConfig.from_pretrained(
+            model_name_or_path, **kwargs)
+        _normalize_qwen35_moe_vl_config(model_config)
     elif model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[model_type]
         model_config = config_class.from_pretrained(model_name_or_path,
@@ -474,11 +378,11 @@ def load_pretrained_config(model_name_or_path: str,
                                 "Qwen3_5ForCausalLM",
                                 "Qwen3_5ForConditionalGeneration",
                             )):
-        normalized = _Qwen35ConfigCompat.normalize(config_dict)
-        if model_type in ("qwen3_5_moe", "qwen3_5_moe_text"):
-            model_config = _Qwen35MoeVLMConfig.from_dict(normalized)
-        else:
-            model_config = transformers.Qwen3NextConfig.from_dict(normalized)
+        # Qwen3.5 text-only: flatten to Qwen3NextConfig via the model-side shim.
+        from tensorrt_llm._torch.models.modeling_qwen3_5 import \
+            Qwen35ConfigCompat
+        model_config = transformers.Qwen3NextConfig.from_dict(
+            Qwen35ConfigCompat.normalize(config_dict))
     elif (model_type == "exaone4" and config_dict.get("sliding_window") is None
           and config_dict.get("layer_types") is None):
         # transformers 5.5.x Exaone4Config.__post_init__ first forces
diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py
index 2c955793ba61..3002606ef44d 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_loader.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py
@@ -32,6 +32,7 @@
     MoeLoadBalancer, maybe_create_moe_load_balancer)
 from ..virtual_memory import RestoreMode
 from ..virtual_memory import scope as virtual_memory_scope
+from .config_utils import resolve_hf_torch_dtype, resolve_mamba_ssm_cache_dtype
 
 _KV_CACHE_MAP = {
     "fp8": QuantAlgo.FP8.value,
@@ -47,12 +48,10 @@ def validate_and_set_mamba_ssm_cache_dtype(
         mamba_ssm_stochastic_rounding: bool = False,
         mamba_ssm_philox_rounds: int = 10) -> None:
     if mamba_ssm_cache_dtype == "auto":
-        hf_dtype = getattr(config.pretrained_config, "mamba_ssm_cache_dtype",
-                           None)
-        if hf_dtype is not None:
-            mamba_ssm_cache_dtype = str_dtype_to_torch(hf_dtype)
-        else:
-            mamba_ssm_cache_dtype = config.pretrained_config.torch_dtype
+        mamba_ssm_cache_dtype = (
+            resolve_mamba_ssm_cache_dtype(config.pretrained_config)
+            or resolve_hf_torch_dtype(config.pretrained_config)
+            or config.torch_dtype)
     else:
         mamba_ssm_cache_dtype = str_dtype_to_torch(mamba_ssm_cache_dtype)
 
diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml
index e2cbb94aadfa..a7a0cc0aa8ce 100644
--- a/tests/integration/defs/accuracy/references/mmmu.yaml
+++ b/tests/integration/defs/accuracy/references/mmmu.yaml
@@ -72,8 +72,12 @@ Qwen/Qwen3-VL-8B-Instruct:
 mistralai/Mistral-Small-3.1-24B-Instruct-2503:
   - accuracy: 57.0
 Qwen/Qwen3.5-35B-A3B:
+  # The default accuracy for `test_auto_dtype` tests.
+  - accuracy: 59.0
   - dtype: bfloat16
     accuracy: 60.444
+  - quant_algo: FP8_BLOCK_SCALES
+    accuracy: 58.889
 # Kimi K2.5 multimodal (MoonViT + DeepSeek-V3 MoE backbone, ~1T params).
 # Values below are measured with NVFP4 checkpoint (thinking mode enabled).
 moonshotai/Kimi-K2.5:
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
index bdcfee64ce4f..31d61af3d004 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
@@ -480,6 +480,45 @@ def test_nvfp4_4gpus(
             task.evaluate(llm, sampling_params=self.sampling_params)
 
 
+# Qwen3.5-MoE-VL is hybrid (Mamba + attention);
+# the FlashInfer GDN prefill kernel is sm90+ only.
+@skip_pre_hopper
+@pytest.mark.skip_less_device_memory(80000)
+class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "Qwen/Qwen3.5-35B-A3B"
+    MODEL_PATH = f"{llm_models_root()}/Qwen3.5-35B-A3B"
+    MAX_NUM_TOKENS = 16384
+    MAX_BATCH_SIZE = 32
+
+    sampling_params = SamplingParams(
+        max_tokens=MAX_NUM_TOKENS,
+        truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
+        stop="<|endoftext|>",
+    )
+
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False)
+
+    def _make_llm(self, model_path: str) -> LLM:
+        return LLM(
+            model_path,
+            max_num_tokens=self.MAX_NUM_TOKENS,
+            max_batch_size=self.MAX_BATCH_SIZE,
+            kv_cache_config=self.kv_cache_config,
+        )
+
+    def test_auto_dtype(self) -> None:
+        with self._make_llm(self.MODEL_PATH) as llm:
+            task = MMMU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=self.sampling_params)
+
+    def test_fp8_prequantized(self) -> None:
+        model_path = f"{llm_models_root()}/Qwen3.5-35B-A3B-FP8"
+        with self._make_llm(model_path) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+            task = MMMU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=self.sampling_params)
+
+
 class TestQwen3VL(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-VL-8B-Instruct"
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
index 4be9bf6364b1..33308801e2f2 100644
--- a/tests/integration/test_lists/qa/llm_function_core.txt
+++ b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -825,6 +825,8 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_fp8_block_scales[
 accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_fp8_block_scales[mtp_nextn=3] TIMEOUT (120)
 accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_nvfp4[mtp_nextn=0] TIMEOUT (120)
 accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_nvfp4[mtp_nextn=3] TIMEOUT (120)
+accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_auto_dtype
+accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_fp8_prequantized
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray
 unittest/disaggregated/test_openai_disagg_server.py
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 3dde117a84cb..32262b4b6707 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -57,6 +57,11 @@ l0_h100:
   - unittest/_torch/modeling -k "modeling_gpt_oss"
   - unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_sanity
   - unittest/_torch/modeling/test_multimodal_encoder_graph.py
+  # Qwen3.5-MoE-VL is hybrid (Mamba SSM + attention); FlashInfer's
+  # chunk_gated_delta_rule GDN prefill kernel is sm90+ only, so this
+  # test must run on Hopper-or-newer GPUs. Peer Qwen3-VL / Qwen3-VL-MoE
+  # tests stay on L40s because they're pure attention and don't trigger the GDN kernel.
+  - unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py::TestQwen3_5MoeVL::test_all
   - unittest/disaggregated/test_disagg_utils.py
   - unittest/disaggregated/test_router.py
   - unittest/disaggregated/test_remoteDictionary.py
diff --git a/tests/unittest/_torch/modeling/test_modeling_multimodal.py b/tests/unittest/_torch/modeling/test_modeling_multimodal.py
index 5be764ca59d4..00d967fac180 100644
--- a/tests/unittest/_torch/modeling/test_modeling_multimodal.py
+++ b/tests/unittest/_torch/modeling/test_modeling_multimodal.py
@@ -17,6 +17,12 @@
 from tensorrt_llm._torch.attention_backend.utils import get_attention_backend
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.pyexecutor.config_utils import (
+    extract_mamba_kv_cache_params,
+    is_nemotron_hybrid,
+    is_qwen3_hybrid,
+)
+from tensorrt_llm._torch.pyexecutor.mamba_cache_manager import CppMambaHybridCacheManager
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm._utils import str_dtype_to_torch
 from tensorrt_llm.bindings.executor import KvCacheConfig
@@ -27,6 +33,7 @@
     prompt_inputs,
 )
 from tensorrt_llm.inputs.multimodal import MultimodalParams, MultimodalRuntimeData
+from tensorrt_llm.llmapi.llm_args import KvCacheConfig as PyKvCacheConfig
 from tensorrt_llm.mapping import Mapping
 
 
@@ -520,6 +527,13 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario):
         Note:
             This method uses get_kv_cache_config() to obtain configuration.
             Override get_kv_cache_config() to customize cache settings.
+
+            For hybrid linear-attention models (Qwen3Next, Qwen3.5,
+            Nemotron-Hybrid) this dispatches to
+            `get_hybrid_kv_cache_manager` so the linear-attention layers
+            get a `CppMambaHybridCacheManager` for SSM/conv state.
+            Mirrors the production dispatch in
+            `_util.py:_create_kv_cache_manager`.
         """
         # Get cache configuration from the configurable method
         cache_config = self.get_kv_cache_config(scenario)
@@ -529,17 +543,114 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario):
 
         num_blocks = (max_seq_len + tokens_per_block - 1) // tokens_per_block
 
-        self.kv_cache_manager = self.get_kv_cache_manager(
-            dtype=self.model_config.pretrained_config.torch_dtype,
-            config=self.model_config.pretrained_config,
+        config = self.model_config.pretrained_config
+        text_config = getattr(config, "text_config", config)
+
+        if is_qwen3_hybrid(text_config) or is_nemotron_hybrid(text_config):
+            self.kv_cache_manager = self.get_hybrid_kv_cache_manager(
+                text_config=text_config,
+                tokens_per_block=tokens_per_block,
+                max_seq_len=max_seq_len,
+                batch_size=batch_size,
+                num_blocks=num_blocks,
+            )
+        else:
+            self.kv_cache_manager = self.get_kv_cache_manager(
+                dtype=self.model_config.pretrained_config.torch_dtype,
+                config=self.model_config.pretrained_config,
+                tokens_per_block=tokens_per_block,
+                max_seq_len=max_seq_len,
+                batch_size=batch_size,
+                num_blocks=num_blocks,
+            )
+
+        self.kv_cache_manager.add_dummy_requests(
+            request_ids=[1],
+            token_nums=[max_seq_len],
+            **self._dummy_request_kwargs(scenario),
+        )
+
+    def _dummy_request_kwargs(self, scenario: MultimodalScenario) -> Dict:
+        """Optional override hook for extra kwargs to `add_dummy_requests`.
+
+        Subclasses for mRoPE-using models (Qwen2.5-VL, Qwen3-VL, Qwen3.5-VL,
+        …) should return `{"use_mrope": True}` here so the cache manager
+        allocates the mRoPE position-id buffer at dummy-request time.
+        Defaults to an empty dict, preserving existing behavior for tests
+        that don't care.
+        """
+        return {}
+
+    def get_hybrid_kv_cache_manager(
+        self,
+        text_config: PretrainedConfig,
+        tokens_per_block: int,
+        max_seq_len: int,
+        batch_size: int,
+        num_blocks: int,
+    ):
+        """Build a `CppMambaHybridCacheManager` for hybrid linear-attention
+        models (Qwen3Next, Qwen3.5, Nemotron-Hybrid).
+
+        Mirrors the production construction in
+        `_util.py:_create_kv_cache_manager` for `is_qwen3_hybrid` /
+        `is_nemotron_hybrid` configs: pulls the state-shape / dtype /
+        layer-mask parameters from `extract_mamba_kv_cache_params` and
+        threads them through the constructor. Tests that need a different
+        concrete manager (e.g. `MixedMambaHybridCacheManager` for
+        disagg-style coverage) should override this method.
+        """
+        dtype_map = {
+            torch.half: tensorrt_llm.bindings.DataType.HALF,
+            torch.float16: tensorrt_llm.bindings.DataType.HALF,
+            torch.bfloat16: tensorrt_llm.bindings.DataType.BF16,
+        }
+
+        mamba_params = extract_mamba_kv_cache_params(text_config)
+        if mamba_params.dtype not in dtype_map:
+            raise ValueError(
+                f"Unsupported dtype for hybrid cache manager: "
+                f"{mamba_params.dtype}. Supported: {list(dtype_map.keys())}"
+            )
+        kv_cache_dtype = dtype_map[mamba_params.dtype]
+
+        head_dim = getattr(text_config, "head_dim", None)
+        if not isinstance(head_dim, int):
+            head_dim = text_config.hidden_size // text_config.num_attention_heads
+
+        # CppMambaHybridCacheManager reads Pydantic-only fields
+        # (mamba_state_cache_interval, enable_block_reuse) so we have to
+        # construct the llmapi.llm_args.KvCacheConfig here, not the C++
+        # bindings KvCacheConfig that the standard KVCacheManager path uses.
+        kv_cache_config = PyKvCacheConfig(max_tokens=num_blocks * tokens_per_block)
+        mapping = Mapping(world_size=1, tp_size=1, rank=0)
+
+        return CppMambaHybridCacheManager(
+            # mamba cache parameters (positional)
+            mamba_params.state_size,
+            mamba_params.conv_kernel,
+            mamba_params.num_heads,
+            mamba_params.n_groups,
+            mamba_params.head_dim,
+            mamba_params.num_mamba_layers,
+            mamba_params.mamba_layer_mask,
+            mamba_params.dtype,
+            mamba_params.mamba_ssm_cache_dtype,
+            # kv cache parameters (positional)
+            kv_cache_config,
+            tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF,
+            # kw-only
+            num_layers=mamba_params.num_full_attention_layers,
+            layer_mask=mamba_params.full_attention_layer_mask,
+            num_kv_heads=text_config.num_key_value_heads,
+            head_dim=head_dim,
             tokens_per_block=tokens_per_block,
             max_seq_len=max_seq_len,
-            batch_size=batch_size,
-            num_blocks=num_blocks,
+            max_batch_size=batch_size,
+            mapping=mapping,
+            dtype=kv_cache_dtype,
         )
 
-        self.kv_cache_manager.add_dummy_requests(request_ids=[1], token_nums=[max_seq_len])
-
     def get_max_num_tokens(self, scenario: MultimodalScenario) -> int:
         """Get maximum number of tokens for attention metadata."""
         if scenario.chunked_prefill:
@@ -697,6 +808,14 @@ def setUp(self):
         # TODO: Add multi-GPU support
         self.device = torch.device("cuda:0")
 
+        # Pre-initialize fields that tearDown / setup_scenario expect to
+        # exist. Without this, a test method that doesn't run
+        # setup_scenario (e.g. a setUp-only smoke test) leaves
+        # self.kv_cache_manager unset and tearDown errors with
+        # AttributeError on the ``is not None`` check.
+        self.kv_cache_manager = None
+        self.attn_metadata = None
+
         self.hf_config = self.create_hf_config()
         if self.skip_hf_inference:
             # Create a dummy torch module if skipping HF inference.
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
new file mode 100644
index 000000000000..0349bae2264a
--- /dev/null
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
@@ -0,0 +1,452 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+from copy import deepcopy
+from pathlib import Path
+from typing import List, Optional
+
+import torch
+import transformers
+from test_modeling_multimodal import MultimodalScenario, TestModelingMultimodal
+from transformers import Qwen3_5MoeForConditionalGeneration as HFQwen3_5MoeForConditionalGeneration
+from utils.llm_data import llm_models_root
+from utils.util import skip_pre_hopper
+
+from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models import Qwen3_5MoeVLModel
+from tensorrt_llm._torch.models.checkpoints.auto_mapper import AutoCheckpointMapper
+from tensorrt_llm._torch.models.checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper
+from tensorrt_llm._torch.models.modeling_auto import AutoModelForCausalLM
+from tensorrt_llm._torch.models.modeling_qwen3_5 import _normalize_qwen35_moe_vl_config
+from tensorrt_llm._torch.pyexecutor.config_utils import (
+    extract_mamba_kv_cache_params,
+    load_pretrained_config,
+)
+from tensorrt_llm._torch.pyexecutor.model_loader import validate_and_set_mamba_ssm_cache_dtype
+from tensorrt_llm.inputs import ContentFormat
+from tensorrt_llm.inputs.registry import MULTIMODAL_PLACEHOLDER_REGISTRY
+
+
+def _write_qwen35_moe_vl_config(tmp_path: Path) -> Path:
+    config = {
+        "architectures": ["Qwen3_5MoeForConditionalGeneration"],
+        "image_token_id": 248056,
+        "model_type": "qwen3_5_moe",
+        "text_config": {
+            "attention_bias": False,
+            "attention_dropout": 0.0,
+            "bos_token_id": 151643,
+            "dtype": "bfloat16",
+            "eos_token_id": 151645,
+            "full_attention_interval": 4,
+            "head_dim": 128,
+            "hidden_act": "silu",
+            "hidden_size": 2048,
+            "linear_conv_kernel_dim": 4,
+            "linear_key_head_dim": 128,
+            "linear_num_key_heads": 16,
+            "linear_num_value_heads": 32,
+            "linear_value_head_dim": 128,
+            "mamba_ssm_dtype": "float32",
+            "max_position_embeddings": 262144,
+            "mlp_only_layers": [],
+            "model_type": "qwen3_5_moe_text",
+            "moe_intermediate_size": 512,
+            "norm_topk_prob": True,
+            "num_attention_heads": 32,
+            "num_experts": 128,
+            "num_experts_per_tok": 8,
+            "num_hidden_layers": 2,
+            "num_key_value_heads": 4,
+            "rms_norm_eps": 1e-6,
+            "shared_expert_intermediate_size": 512,
+            "rope_parameters": {
+                "mrope_section": [11, 11, 10],
+                "partial_rotary_factor": 0.25,
+                "rope_theta": 1000000.0,
+                "rope_type": "default",
+            },
+            "use_cache": True,
+            "vocab_size": 151936,
+        },
+        "tie_word_embeddings": False,
+        "video_token_id": 248057,
+        "vision_config": {
+            "deepstack_visual_indexes": [8, 16, 24],
+            "depth": 27,
+            "hidden_act": "gelu_pytorch_tanh",
+            "hidden_size": 1152,
+            "in_channels": 3,
+            "intermediate_size": 4304,
+            "model_type": "qwen3_5_moe",
+            "num_heads": 16,
+            "num_position_embeddings": 2304,
+            "out_hidden_size": 2048,
+            "patch_size": 16,
+            "spatial_merge_size": 2,
+            "temporal_patch_size": 2,
+        },
+        "vision_end_token_id": 248054,
+        "vision_start_token_id": 248053,
+    }
+    (tmp_path / "config.json").write_text(json.dumps(config), encoding="utf-8")
+    return tmp_path
+
+
+def test_qwen35_moe_vl_config_preserves_vlm_architecture(
+    tmp_path: Path,
+) -> None:
+    config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path)))
+
+    assert isinstance(config, transformers.Qwen3_5MoeConfig)
+    assert config.architectures == ["Qwen3_5MoeForConditionalGeneration"]
+    assert config.text_config.architectures == ["Qwen3_5MoeForCausalLM"]
+    assert config.text_config.num_experts == 128
+    assert config.text_config.intermediate_size == 4608
+    assert config.text_config.rope_theta == 1000000.0
+    assert config.text_config.partial_rotary_factor == 0.25
+    assert config.text_config.rope_scaling["type"] == "mrope"
+    assert config.text_config.rope_scaling["mrope_section"] == [11, 11, 10]
+    assert config.text_config.mamba_ssm_dtype == "float32"
+    assert config.get_text_config() is config.text_config
+
+
+def test_qwen35_moe_vl_resolves_mamba_ssm_cache_dtype(
+    tmp_path: Path,
+) -> None:
+    config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path)))
+    model_config = ModelConfig(pretrained_config=config)
+
+    validate_and_set_mamba_ssm_cache_dtype(model_config, "auto")
+    assert model_config.quant_config.mamba_ssm_cache_dtype is torch.float32
+
+    mamba_params = extract_mamba_kv_cache_params(
+        config.text_config,
+        quant_config=model_config.quant_config,
+    )
+    assert mamba_params.dtype is torch.bfloat16
+    assert mamba_params.mamba_ssm_cache_dtype is torch.float32
+
+
+def test_qwen35_moe_vl_resolves_model_and_mapper(tmp_path: Path) -> None:
+    config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path)))
+    model_config = ModelConfig(pretrained_config=config)
+
+    assert AutoModelForCausalLM._resolve_class(model_config) is Qwen3_5MoeVLModel
+    assert isinstance(
+        AutoCheckpointMapper.get("HF", "Qwen3_5MoeForConditionalGeneration"),
+        Qwen3_5MoeHfWeightMapper,
+    )
+
+
+def test_qwen35_moe_vl_placeholder_metadata_registered() -> None:
+    metadata = MULTIMODAL_PLACEHOLDER_REGISTRY.get_placeholder_metadata("qwen3_5_moe")
+
+    assert metadata.placeholder_map == {
+        "image": "<|vision_start|><|image_pad|><|vision_end|>",
+        "video": "<|vision_start|><|video_pad|><|vision_end|>",
+    }
+    assert metadata.placeholders_separator == ""
+    assert metadata.content_format is ContentFormat.STRING
+
+
+# --- Layered parity test scaffold -------------------------------------------
+#
+# Tiny synthetic config used by TestQwen3_5MoeVL below. Same architecture as
+# the real Qwen/Qwen3.5-35B-A3B checkpoint but with much smaller dimensions
+# where possible.
+#
+# Shapes that have to match real Qwen3.5 (can't shrink without breaking
+# things downstream):
+#
+#   - head_dim=256, partial_rotary_factor=0.25 --> rotary tensor width is
+#     `head_dim * 0.25 / 2 = 32`, which equals `sum(mrope_section)`.
+#     A smaller head_dim (e.g. 128) yields a 16-wide tensor that mRoPE
+#     can't split with section sum 32.
+#   - num_attention_heads=16, num_key_value_heads=2 match the real
+#     model's 8:1 GQA layout; Q proj is 2048 --> 4096, K/V are 2048 --> 512.
+#   - Vision deepstack indices [8, 16, 24] match the real config, and
+#     depth=27 is the smallest value that hosts those indices. Disabling
+#     deepstack (indices=[], depth=2) produces fewer vision embeddings
+#     than the HF processor reserves placeholder tokens for, which
+#     breaks `fuse_input_embeds`.
+#   - vocab_size=248320 matches the real Qwen3.5 tokenizer. The
+#     tokenizer (loaded via _name_or_path) emits special-token ids in
+#     the 248k range; `fuse_input_embeds` uses `vocab_size` as the
+#     OOV threshold to identify image-pad tokens. A smaller vocab_size
+#     would misclassify regular chat-template specials as mm tokens
+#     and trip the placeholder/embedding count check.
+#
+# Shapes that can be shrunk for tests:
+#
+#   - num_hidden_layers: 2 (vs 40+).
+#   - num_experts: 128 (vs 256). Still moderate so MoE routing runs.
+#   - full_attention_interval=2 with 2 LM layers yields the pattern
+#     [linear_attention, full_attention] — one of each kind, exercising
+#     both the regular KV cache and the Mamba SSM/conv state via the
+#     base-class dispatch.
+#
+# `_name_or_path` points at the real checkpoint dir so the test can load
+# the tokenizer/processor (only the processor; not the full model weights).
+QWEN3_5_VL_MOE_PARITY_CONFIG = {
+    "architectures": ["Qwen3_5MoeForConditionalGeneration"],
+    "image_token_id": 248056,
+    "model_type": "qwen3_5_moe",
+    "text_config": {
+        "attention_bias": False,
+        "attention_dropout": 0.0,
+        "bos_token_id": 151643,
+        "dtype": "bfloat16",
+        "eos_token_id": 151645,
+        "full_attention_interval": 2,
+        "head_dim": 256,
+        "hidden_act": "silu",
+        "hidden_size": 2048,
+        "linear_conv_kernel_dim": 4,
+        "linear_key_head_dim": 128,
+        "linear_num_key_heads": 16,
+        "linear_num_value_heads": 32,
+        "linear_value_head_dim": 128,
+        "mamba_ssm_dtype": "float32",
+        "max_position_embeddings": 8192,
+        "mlp_only_layers": [],
+        "model_type": "qwen3_5_moe_text",
+        "moe_intermediate_size": 512,
+        "norm_topk_prob": True,
+        "num_attention_heads": 16,
+        "num_experts": 128,
+        "num_experts_per_tok": 8,
+        "num_hidden_layers": 2,
+        "num_key_value_heads": 2,
+        "rms_norm_eps": 1e-6,
+        "shared_expert_intermediate_size": 512,
+        "rope_parameters": {
+            "mrope_section": [11, 11, 10],
+            "partial_rotary_factor": 0.25,
+            "rope_theta": 1000000.0,
+            "rope_type": "default",
+        },
+        "use_cache": True,
+        "vocab_size": 248320,
+    },
+    "tie_word_embeddings": False,
+    "video_token_id": 248057,
+    "vision_config": {
+        "deepstack_visual_indexes": [8, 16, 24],
+        "depth": 27,
+        "hidden_act": "gelu_pytorch_tanh",
+        "hidden_size": 1152,
+        "in_channels": 3,
+        "initializer_range": 0.02,
+        "intermediate_size": 4304,
+        "model_type": "qwen3_5_moe",
+        "num_heads": 16,
+        "num_position_embeddings": 2304,
+        "out_hidden_size": 2048,
+        "patch_size": 16,
+        "spatial_merge_size": 2,
+        "temporal_patch_size": 2,
+    },
+    "vision_end_token_id": 248054,
+    "vision_start_token_id": 248053,
+    "_name_or_path": str(os.path.join(llm_models_root(), "Qwen3.5-35B-A3B")),
+}
+
+
+@skip_pre_hopper
+class TestQwen3_5MoeVL(TestModelingMultimodal):
+    """Forward-parity test for Qwen3.5-MoE-VL against HuggingFace.
+
+    Tiny-synthetic-config parity test in the same shape as
+    `TestQwen3VLMoe` / `TestQwen2_5VL`: both stacks are constructed
+    from `QWEN3_5_VL_MOE_PARITY_CONFIG` (2 LM layers, 1 linear + 1 full
+    attention, 128 experts, 27 vision layers), HF weights are copied
+    into TRT-LLM via `Qwen3_5MoeHfWeightMapper`, then `test_all`
+    sweeps the default `MultimodalScenario`s comparing last-position
+    logits at context + generation phases.
+
+    Two-config design:
+      - `self.hf_config` is the raw `Qwen3_5MoeConfig.from_dict(...)`
+        result. HF model construction sees the native HF schema
+        (`rope_parameters` intact with `rope_type`,
+        `moe_intermediate_size`, …).
+      - TRT-LLM gets a deep-copied + normalized version via the
+        `create_trtllm_model` override below. That copy goes through
+        `_normalize_qwen35_moe_vl_config` exactly the same way
+        production `load_pretrained_config` does, so the Qwen3Next
+        runtime sees the flat aliases it expects
+        (`intermediate_size`, `rope_theta`, `rope_scaling`, …).
+
+    Keeping the two configs separate means the production normalizer
+    doesn't need to be HF-safe — production only ever constructs the
+    TRT-LLM model from a normalized config, and the test mirrors that
+    boundary explicitly. The hybrid-cache path is handled by the base
+    class's `init_kv_cache_manager` dispatch on
+    `is_qwen3_hybrid` / `is_nemotron_hybrid`.
+    """
+
+    def get_model_config(self):
+        return QWEN3_5_VL_MOE_PARITY_CONFIG
+
+    def get_trtllm_model_class(self):
+        return Qwen3_5MoeVLModel
+
+    def get_hf_model_class(self):
+        return HFQwen3_5MoeForConditionalGeneration
+
+    def get_weight_mapper_class(self):
+        return Qwen3_5MoeHfWeightMapper
+
+    def get_model_type(self):
+        return "qwen3_5_moe"
+
+    def get_model_config_class(self):
+        return transformers.Qwen3_5MoeConfig
+
+    def create_trtllm_model(
+        self,
+        load_weights: bool = False,
+        hf_model_state_dict: Optional[dict] = None,
+        **kwargs,
+    ):
+        """Build the TRT-LLM model from a *normalized copy* of `self.hf_config`.
+
+        Mirrors the base-class body but swaps in
+        `_normalize_qwen35_moe_vl_config(trtllm_config)` before
+        wrapping in `ModelConfig`. `self.hf_config` itself stays
+        raw so the HF model that the base class builds in `setUp`
+        sees native HF schema.
+        """
+        trtllm_config = deepcopy(self.hf_config)
+        _normalize_qwen35_moe_vl_config(trtllm_config)
+
+        model_config = ModelConfig(pretrained_config=trtllm_config)
+        model_class = self.get_trtllm_model_class()
+        model = model_class(model_config, **kwargs).to("cuda")
+
+        if load_weights:
+            weight_mapper = self.get_weight_mapper_class()()
+            weight_mapper.init_model_and_config(model, trtllm_config)
+            model.load_weights(hf_model_state_dict, weight_mapper)
+
+            for module in model.modules():
+                if hasattr(module, "post_load_weights") and not getattr(
+                    module, "_weights_removed", False
+                ):
+                    module.post_load_weights()
+
+        return model, model_config
+
+    def _dummy_request_kwargs(self, scenario):
+        """Qwen3.5-VL uses mRoPE; the cache manager needs the mRoPE
+        position-id buffer allocated at dummy-request time."""
+        return {"use_mrope": True}
+
+    def get_tolerance(self):
+        """Tighten `rtol` to `0.1` (4x tighter than the base 0.4
+        default) while keeping `atol` at `0.4` to absorb single-logit
+        tail outliers on `multiple_image` / `video`. Returns `(atol, rtol)`.
+        """
+        return 0.4, 0.1
+
+    def get_trtllm_inputs(
+        self,
+        input_ids,
+        multimodal_params_list,
+        is_gen: bool = False,
+        num_cached_tokens_per_seq: Optional[List[int]] = None,
+        total_prompt_len: Optional[int] = None,
+    ):
+        """Build the base-class inputs, then swap in mRoPE `position_ids`.
+
+        When `is_gen` is false, `position_ids` becomes the per-param
+        `mrope_position_ids` concatenated along the last dim. When `is_gen` is
+        true, the per-param `mrope_position_deltas` are added to the base
+        `position_ids` and expanded to `(3, -1, 1)`; each param then has
+        `strip_for_generation()` applied in place. Same pattern as `TestQwen3VLMoe`.
+        """
+        inputs = super().get_trtllm_inputs(
+            input_ids,
+            multimodal_params_list,
+            is_gen,
+            num_cached_tokens_per_seq,
+            total_prompt_len=total_prompt_len,
+        )
+
+        if not is_gen:
+            # Non-generation call: use the mRoPE position IDs carried by each param.
+            prefill_ids = [
+                param.multimodal_data["mrope_config"]["mrope_position_ids"]
+                for param in multimodal_params_list
+            ]
+            inputs["position_ids"] = torch.cat(prefill_ids, dim=-1).cuda()
+            return inputs
+
+        # Generation step: offset the base position IDs by each param's mRoPE delta.
+        deltas = [
+            param.multimodal_data["mrope_config"]["mrope_position_deltas"]
+            for param in multimodal_params_list
+        ]
+        delta_offsets = torch.cat(deltas, dim=-1).to(self.device)
+        shifted_ids = inputs["position_ids"] + delta_offsets
+        inputs["position_ids"] = shifted_ids.expand(3, -1, 1).cuda()
+
+        for param in multimodal_params_list:
+            param.strip_for_generation()
+            param.to_device(
+                "multimodal_data",
+                self.device,
+                pin_memory=True,
+                target_keywords=["mrope_config.mrope_position_deltas"],
+            )
+        inputs["multimodal_params"] = list(multimodal_params_list)
+        return inputs
+
+    def get_scenarios(self) -> List[MultimodalScenario]:
+        """Return the modality-sanity sweep (image / multiple_image / video).
+
+        The three modalities catch differences in placeholder counts and
+        the multimodal-cumsum path between single-image, multi-image, and
+        video inputs. Every scenario disables CUDA graphs, chunked prefill,
+        and KV-cache reuse.
+
+        CUDA-graph capture is intentionally not exercised here. The
+        standard `attn_metadata.create_cuda_graph_metadata` path only
+        addresses attention metadata; the Mamba SSM state buffer of the
+        hybrid (Mamba + attention) cache is not threaded through, so
+        replayed logits diverge from the HF reference. Adding that path
+        is dedicated harness work and tracked separately.
+
+        Returns:
+            One `MultimodalScenario` per modality, in the order listed above.
+        """
+        # Feature flags shared by every scenario in the sweep.
+        shared_flags = {
+            "use_cuda_graph": False,
+            "chunked_prefill": False,
+            "kv_cache_reuse": False,
+        }
+        # One scenario is emitted per modality, in this order.
+        modalities = (
+            "image",
+            "multiple_image",
+            "video",
+        )
+        return [
+            MultimodalScenario(modality=modality, **shared_flags)
+            for modality in modalities
+        ]
+
+    def test_construction_and_weight_loading_smoke(self):
+        """Smoke-check that setUp finished building both models.
+
+        setUp built the HF + TRT-LLM models and copied HF weights into
+        TRT-LLM via the weight mapper; detailed assertions on the
+        normalizer's outputs live in the routing tests above (e.g.
+        `test_qwen35_moe_vl_config_preserves_vlm_architecture`).
+        """
+        for attr_name in ("hf_model", "trtllm_model", "model_config"):
+            # getattr raises AttributeError if setUp never set the attribute.
+            self.assertIsNotNone(getattr(self, attr_name))