diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md
index 685680f5b0b0..e14d62b6f7c6 100644
--- a/docs/source/models/supported-models.md
+++ b/docs/source/models/supported-models.md
@@ -97,7 +97,6 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
 | `Qwen2_5_VLForConditionalGeneration` | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
 | `Qwen3VLForConditionalGeneration`    | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
 | `Qwen3VLMoeForConditionalGeneration` | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
-| `Qwen3_5MoeForConditionalGeneration` | Yes               | Yes        | Untested        | Yes           | Yes              | No             | Untested              | Yes                       | L + I + V |
 
 Note:
 - L: Language
diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py
index 8f67788bba40..9c3b032421b2 100644
--- a/tensorrt_llm/_torch/models/__init__.py
+++ b/tensorrt_llm/_torch/models/__init__.py
@@ -37,8 +37,7 @@
                             Qwen2ForRewardModel)
 from .modeling_qwen2vl import Qwen2_5_VLModel, Qwen2VLModel
 from .modeling_qwen3 import Qwen3ForCausalLM
-from .modeling_qwen3_5 import (Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM,
-                               Qwen3_5MoeVLModel)
+from .modeling_qwen3_5 import Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM
 from .modeling_qwen3_moe import Qwen3MoeForCausalLM
 from .modeling_qwen3_next import Qwen3NextForCausalLM
 from .modeling_qwen3vl import Qwen3VLModel
@@ -91,7 +90,6 @@
     "Qwen3MoeForCausalLM",
     "Qwen3_5ForCausalLM",
     "Qwen3_5MoeForCausalLM",
-    "Qwen3_5MoeVLModel",
     "Qwen3NextForCausalLM",
     "Qwen3MoeVLModel",
     "GptOssForCausalLM",
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py
index 65e0168bec55..fa2f161bdc4f 100644
--- a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py
@@ -13,7 +13,6 @@
 
 
 @register_mapper("HF", "Qwen3_5MoeForCausalLM")
-@register_mapper("HF", "Qwen3_5MoeForConditionalGeneration")
 @register_mapper("HF", "Qwen3_5ForCausalLM")
 class Qwen3_5MoeHfWeightMapper(Qwen3NextHfWeightMapper):
     """Weight mapper for Qwen3.5 MoE text checkpoints.
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
index 4f325dbb0bcb..bf83e916db29 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
@@ -1,29 +1,7 @@
 import re
-from types import SimpleNamespace
-from typing import Dict, List
 
-import torch
-from transformers import PretrainedConfig
-
-from ...inputs import (
-    ContentFormat,
-    MultimodalPlaceholderMetadata,
-    MultimodalPlaceholderPlacement,
-    register_input_processor,
-    support_multimodal_disaggregated,
-)
-from ..pyexecutor.config_utils import get_qwen3_hybrid_layer_types
-from .checkpoints.base_weight_mapper import BaseWeightMapper
-from .checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper
-from .modeling_multimodal_utils import _is_disagg
 from .modeling_qwen3_next import Qwen3NextForCausalLM
-from .modeling_qwen3vl import (
-    Qwen3VisionModel,
-    Qwen3VisionModelBase,
-    Qwen3VLInputProcessorBase,
-    Qwen3VLModelBase,
-)
-from .modeling_utils import ModelConfig, register_auto_model, register_vision_encoder
+from .modeling_utils import register_auto_model
 
 _LANG_PREFIX = "model.language_model."
 
@@ -73,248 +51,6 @@ def _translate_mtp_pattern(name, n_hidden_layers):
     return None
 
 
-# --- Config adapters --------------------------------------------------------
-#
-# These run from `load_pretrained_config` in
-# `tensorrt_llm/_torch/pyexecutor/config_utils.py` via lazy import — the
-# runtime layer asks the model module how to load its own config.
-#
-# There are two entry points:
-#   - `Qwen35ConfigCompat.normalize(config_dict)` — for text-only
-#     Qwen3.5 (MoE and dense). Returns a dict that
-#     `transformers.Qwen3NextConfig.from_dict(...)` can consume, so the
-#     existing Qwen3Next runtime is reused unchanged.
-#   - `_normalize_qwen35_moe_vl_config(model_config)` — for the
-#     Qwen3.5-MoE VLM. Mutates the HF-native `transformers.Qwen3_5MoeConfig`
-#     in place, attaching the runtime aliases the Qwen3Next-based LM expects
-#     while keeping `text_config` / `vision_config` composite.
-
-
-class Qwen35ConfigCompat:
-    """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig.
-
-    We normalize to `Qwen3NextConfig` (rather than to a Qwen3.5-native
-    schema) so the runtime can reuse the existing `Qwen3NextForCausalLM`
-    model implementation unchanged — Qwen3.5 text is structurally identical
-    to Qwen3Next, so matching the config schema lets the same code serve
-    both.
-
-    This is used for Qwen3.5 text-only configs and for shared helper logic such
-    as RoPE and quantization exclude-module normalization. Qwen3.5-MoE VLM
-    configs should stay composite and use transformers.Qwen3_5MoeConfig plus
-    _normalize_qwen35_moe_vl_config instead.
-
-    To remove: delete this class and the elif branch in
-    load_pretrained_config that flattens Qwen3.5 text configs.
-    """
-
-    @staticmethod
-    def normalize(config_dict: dict) -> dict:
-        """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict."""
-        text_config = Qwen35ConfigCompat._extract_text_config(config_dict)
-        text_config = Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config)
-        text_config = Qwen35ConfigCompat._flatten_rope(text_config)
-
-        # Detect dense vs MoE and set architecture + MoE defaults accordingly
-        is_moe = "num_experts" in text_config and text_config["num_experts"] > 0
-        if is_moe:
-            text_config["architectures"] = ["Qwen3_5MoeForCausalLM"]
-        else:
-            text_config["architectures"] = ["Qwen3_5ForCausalLM"]
-            # Ensure MoE fields are zeroed so Qwen3NextConfig defaults don't
-            # accidentally enable MoE for the dense model.
-            text_config.setdefault("num_experts", 0)
-            text_config.setdefault("num_experts_per_tok", 0)
-            text_config.setdefault("moe_intermediate_size", 0)
-            text_config.setdefault("shared_expert_intermediate_size", 0)
-        return text_config
-
-    _VLM_ARCHITECTURES = {
-        "Qwen3_5MoeForConditionalGeneration",
-        "Qwen3_5ForConditionalGeneration",
-    }
-
-    @staticmethod
-    def _extract_text_config(config_dict: dict) -> dict:
-        """Pull nested text_config from VLM checkpoints, or use dict as-is."""
-        architectures = config_dict.get("architectures") or []
-        if architectures and architectures[0] in Qwen35ConfigCompat._VLM_ARCHITECTURES:
-            text_config = dict(config_dict.get("text_config") or {})
-        else:
-            text_config = dict(config_dict)
-        if not text_config:
-            raise ValueError("Qwen3.5 config is missing a usable text_config")
-        return text_config
-
-    @staticmethod
-    def _inherit_quantization_config(config_dict: dict, text_config: dict) -> dict:
-        """Copy top-level quantization_config into text_config with name normalization.
-
-        Also adds a temporary workaround that keeps packed linear-attention
-        in_proj_qkvz on the bf16 path until FP8 block-scale TP loading is
-        fixed for that layout.
-        """
-        if "quantization_config" in text_config:
-            return text_config
-        if "quantization_config" not in config_dict:
-            return text_config
-
-        quantization_config = dict(config_dict["quantization_config"])
-        if "modules_to_not_convert" in quantization_config:
-            modules = Qwen35ConfigCompat._normalize_exclude_modules(
-                quantization_config["modules_to_not_convert"]
-            )
-            modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround(text_config, modules)
-            quantization_config["modules_to_not_convert"] = sorted(set(modules))
-        text_config["quantization_config"] = quantization_config
-        return text_config
-
-    @staticmethod
-    def _normalize_exclude_modules(modules: list[str]) -> list[str]:
-        """Translate HF quantization exclude-module paths to TRT-LLM names.
-
-        - Strip model.language_model. prefix -> model.
-        - Drop model.visual.* and mtp.* entries
-        - Map split projection names to packed TRT-LLM names
-        """
-        normalized = set()
-        for name in modules:
-            if name.startswith("model.language_model."):
-                name = "model." + name[len("model.language_model.") :]
-            if name.startswith("model.visual.") or name.startswith("mtp."):
-                continue
-            name = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", name)
-            name = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", name)
-            normalized.add(name)
-        return sorted(normalized)
-
-    @staticmethod
-    def _add_qkvz_bf16_workaround(text_config: dict, modules: list[str]) -> list[str]:
-        """Keep packed linear-attention qkvz on bf16 path for all linear-attention layers.
-
-        Temporary until FP8 block-scale TP loading is fixed for this layout.
-        """
-        try:
-            layer_types = get_qwen3_hybrid_layer_types(SimpleNamespace(**text_config))
-        except (ValueError, AttributeError):
-            return modules
-        for layer_idx, layer_type in enumerate(layer_types):
-            if layer_type == "linear_attention":
-                modules.append(f"model.layers.{layer_idx}.linear_attn.in_proj_qkvz")
-        return modules
-
-    @staticmethod
-    def _flatten_rope(text_config: dict) -> dict:
-        """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling.
-
-        Qwen3.5 nests these inside a rope_parameters dict and uses rope_type
-        instead of type in rope_scaling.  Qwen3NextConfig expects them as
-        top-level fields with rope_scaling.type.
-        """
-        rope_parameters = dict(text_config.pop("rope_parameters", {}) or {})
-        rope_scaling = dict(text_config.get("rope_scaling") or {})
-        if rope_parameters:
-            rope_theta = rope_parameters.pop("rope_theta", None)
-            if rope_theta is not None:
-                text_config.setdefault("rope_theta", rope_theta)
-            partial_rotary_factor = rope_parameters.pop("partial_rotary_factor", None)
-            if partial_rotary_factor is not None:
-                text_config.setdefault("partial_rotary_factor", partial_rotary_factor)
-            if rope_parameters:
-                rope_scaling = rope_parameters | rope_scaling
-        if rope_scaling:
-            has_mrope = "mrope_section" in rope_scaling or rope_scaling.get(
-                "mrope_interleaved", False
-            )
-            if has_mrope:
-                rope_scaling["type"] = "mrope"
-                rope_scaling.pop("rope_type", None)
-            elif "type" not in rope_scaling and "rope_type" in rope_scaling:
-                rope_type = rope_scaling.pop("rope_type")
-                # "default" means standard RoPE (no scaling) — don't set
-                # rope_scaling to avoid triggering scaling code paths.
-                if rope_type == "default":
-                    rope_scaling = {}
-                else:
-                    rope_scaling["type"] = rope_type
-            if rope_scaling:
-                text_config["rope_scaling"] = rope_scaling
-        return text_config
-
-
-def _normalize_qwen35_mrope_config(text_config) -> None:
-    """Materialize Qwen3.5 mRoPE aliases needed by the Qwen3-VL path.
-
-    HF stores RoPE metadata under `rope_parameters`; the shared Qwen3-VL
-    wrapper reads `rope_theta`, `partial_rotary_factor`, and
-    `rope_scaling` directly on the text config.
-    """
-    rope_parameters = getattr(text_config, "rope_parameters", None)
-    if not rope_parameters:
-        return
-    if hasattr(rope_parameters, "to_dict"):
-        rope_parameters = rope_parameters.to_dict()
-    flattened = Qwen35ConfigCompat._flatten_rope(
-        {
-            "rope_parameters": dict(rope_parameters),
-            "rope_scaling": dict(getattr(text_config, "rope_scaling", None) or {}),
-        }
-    )
-    for attr in ("rope_theta", "partial_rotary_factor", "rope_scaling"):
-        value = flattened.get(attr)
-        if value is not None:
-            setattr(text_config, attr, value)
-
-
-def _normalize_qwen35_qwen3next_text_aliases(text_config) -> None:
-    """Materialize Qwen3Next-style text aliases used by the shared runtime."""
-    if getattr(text_config, "intermediate_size", None) is None:
-        moe_intermediate_size = getattr(text_config, "moe_intermediate_size", None)
-        num_experts_per_tok = getattr(text_config, "num_experts_per_tok", None)
-        shared_expert_intermediate_size = (
-            getattr(text_config, "shared_expert_intermediate_size", 0) or 0
-        )
-        if moe_intermediate_size is not None and num_experts_per_tok is not None:
-            text_config.intermediate_size = (
-                num_experts_per_tok * moe_intermediate_size + shared_expert_intermediate_size
-            )
-
-
-def _normalize_qwen35_quantization_config(model_config) -> None:
-    quantization_config = getattr(model_config, "quantization_config", None)
-    if not isinstance(quantization_config, dict):
-        return
-
-    modules = quantization_config.get("modules_to_not_convert")
-    if modules is None:
-        return
-
-    text_config = getattr(model_config, "text_config", None)
-    normalized_modules = Qwen35ConfigCompat._normalize_exclude_modules(modules)
-    if text_config is not None:
-        normalized_modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround(
-            text_config.to_dict(), normalized_modules
-        )
-    quantization_config["modules_to_not_convert"] = sorted(set(normalized_modules))
-
-
-def _normalize_qwen35_moe_vl_config(model_config) -> None:
-    """Adapt HF Qwen3.5-MoE VLM config to TRT-LLM runtime conventions."""
-    if not getattr(model_config, "architectures", None):
-        model_config.architectures = ["Qwen3_5MoeForConditionalGeneration"]
-
-    text_config = getattr(model_config, "text_config", None)
-    if text_config is None:
-        raise ValueError("Qwen3.5-MoE VLM config is missing text_config")
-
-    text_config.architectures = ["Qwen3_5MoeForCausalLM"]
-    _normalize_qwen35_qwen3next_text_aliases(text_config)
-    _normalize_qwen35_mrope_config(text_config)
-
-    model_config.get_text_config = lambda decoder=False: text_config
-    _normalize_qwen35_quantization_config(model_config)
-
-
 def _normalize_qwen35_exclude_modules(model_config):
     """Normalize NVFP4/FP8 exclude_modules from HF naming to TRT-LLM naming.
 
@@ -390,56 +126,10 @@ class Qwen3_5ForCausalLM(Qwen3NextForCausalLM):
 
     Same reuse pattern as Qwen3_5MoeForCausalLM, but for the dense 27B
     variant which uses GatedMLP instead of SparseMoeBlock.  The config
-    normalizer (Qwen35ConfigCompat) sets num_experts=0 so that
+    normalizer (config_utils._Qwen35ConfigCompat) defaults num_experts to 0 so that
     Qwen3NextModel selects GatedMLP for the feed-forward layers.
     """
 
     def __init__(self, model_config):
         _normalize_qwen35_exclude_modules(model_config)
         super().__init__(model_config)
-
-
-# TODO: Add tests for disaggregated support.
-@support_multimodal_disaggregated
-@register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel)
-@register_auto_model("Qwen3_5MoeForConditionalGeneration")
-@register_input_processor(
-    Qwen3VLInputProcessorBase,
-    model_type="qwen3_5_moe",
-    placeholder_metadata=MultimodalPlaceholderMetadata(
-        placeholder_map={
-            "image": "<|vision_start|><|image_pad|><|vision_end|>",
-            "video": "<|vision_start|><|video_pad|><|vision_end|>",
-        },
-        placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT,
-        placeholders_separator="",
-        content_format=ContentFormat.STRING,
-    ),
-)
-class Qwen3_5MoeVLModel(Qwen3VLModelBase):
-    """VLM wrapper composing Qwen3 vision encoder with Qwen3.5 MoE text decoder."""
-
-    def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs):
-        kwargs["vision_model_class"] = Qwen3VisionModel
-        kwargs["disable_fuse_rope"] = kwargs.get("disable_fuse_rope", False)
-        super().__init__(model_config, *args, **kwargs)
-
-    @property
-    def multimodal_data_device_paths(self) -> List[str]:
-        return [
-            "image.pixel_values",
-            "video.pixel_values_videos",
-            "multimodal_embedding",
-        ]
-
-    def load_weights(self, weights: Dict[str, torch.Tensor], weight_mapper: BaseWeightMapper):
-        if not _is_disagg():
-            self.mm_encoder.load_weights(weights)
-
-        weight_mapper = Qwen3_5MoeHfWeightMapper()
-        weight_mapper.init_model_and_config(self.llm, self.model_config)
-        filtered_weights = {k: v for k, v in weights.items() if not k.startswith("model.visual.")}
-        params_map = {
-            r"^model\.language_model\.(.*)$": r"model.\1",
-        }
-        self.llm.load_weights(filtered_weights, weight_mapper, params_map=params_map)
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_next.py b/tensorrt_llm/_torch/models/modeling_qwen3_next.py
index 5d8ca8e81cbd..d6f4fd57794f 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_next.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_next.py
@@ -973,18 +973,9 @@ def get_model_defaults(cls, llm_args: 'TorchLlmArgs') -> dict:
         # is supported for Mamba/SSM-based models
         return {"kv_cache_config": {"enable_block_reuse": False}}
 
-    def load_weights(self,
-                     weights: dict,
-                     weight_mapper: BaseWeightMapper,
-                     params_map: Optional[Dict[str, str]] = None,
-                     allow_partial_loading: bool = False):
+    def load_weights(self, weights: dict, weight_mapper: BaseWeightMapper):
         new_weights = weight_mapper.preprocess_weights(weights)
-        super().load_weights(
-            new_weights,
-            weight_mapper=weight_mapper,
-            params_map=params_map,
-            allow_partial_loading=allow_partial_loading,
-        )
+        super().load_weights(new_weights, weight_mapper)
 
     def post_load_weights(self):
         for idx, layer in enumerate(
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3vl.py b/tensorrt_llm/_torch/models/modeling_qwen3vl.py
index ed724b0a6307..ecdbc5fde4b3 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3vl.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3vl.py
@@ -1053,8 +1053,6 @@ def __init__(
             llm_model_config.pretrained_config.architectures = ["Qwen3ForCausalLM"]
         elif self.original_arch == "Qwen3VLMoeForConditionalGeneration":
             llm_model_config.pretrained_config.architectures = ["Qwen3MoeForCausalLM"]
-        elif self.original_arch == "Qwen3_5MoeForConditionalGeneration":
-            llm_model_config.pretrained_config.architectures = ["Qwen3_5MoeForCausalLM"]
         else:
             raise ValueError(f"Unsupported architecture: {self.original_arch}")
         # Qwen3ForCausalLM.
@@ -1092,12 +1090,9 @@ def init_mrope_embedding(self, model_config: ModelConfig[PretrainedConfig]):
             mrope_section=config.rope_scaling.get("mrope_section", None),
             mrope_interleaved=config.rope_scaling.get("mrope_interleaved", False),
         )
-        head_dim = getattr(config, "head_dim", None)
-        if not isinstance(head_dim, int):
-            head_dim = config.hidden_size // config.num_attention_heads
         self.rotary_emb = MRotaryEmbedding(
             pos_embd_params.rope,
-            head_dim=head_dim,
+            head_dim=config.hidden_size // config.num_attention_heads,
             is_neox=pos_embd_params.is_neox,
             mrope_section=pos_embd_params.mrope_section,
             mrope_interleaved=pos_embd_params.mrope_interleaved,
diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py
index 6a46f0f984d9..ef270040fc26 100644
--- a/tensorrt_llm/_torch/pyexecutor/config_utils.py
+++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py
@@ -1,10 +1,11 @@
 import dataclasses
+import re
+from types import SimpleNamespace
 from typing import List, Optional
 
 import torch
 import transformers
 
-from tensorrt_llm._utils import str_dtype_to_torch
 from tensorrt_llm.logger import logger
 
 
@@ -20,60 +21,6 @@ def is_hybrid_linear(config):
     return is_nemotron_hybrid(config) or is_qwen3_hybrid(config)
 
 
-def _coerce_torch_dtype(dtype):
-    """Normalize dtype values from HF configs into torch dtype objects.
-
-    HF configs may store dtype fields as torch dtypes, strings, or the sentinel
-    value "auto". Returning None for "auto" lets the caller keep its normal
-    fallback path instead of treating "auto" as a concrete dtype.
-    """
-    if isinstance(dtype, torch.dtype):
-        return dtype
-    if dtype == "auto":
-        return None
-    if isinstance(dtype, str):
-        return str_dtype_to_torch(dtype)
-    return dtype
-
-
-def resolve_hf_torch_dtype(config):
-    """Return the model's regular tensor dtype from common HF config fields.
-
-    Transformers has used both dtype and torch_dtype across versions and model
-    families. This helper checks both names and coerces whichever one is present
-    into the form expected by TRT-LLM runtime code. An "auto" value in any
-    field is treated the same as missing, so scanning continues to the next
-    field instead of stopping with None.
-    """
-    for attr in ("dtype", "torch_dtype"):
-        coerced = _coerce_torch_dtype(getattr(config, attr, None))
-        if coerced is not None:
-            return coerced
-    return None
-
-
-def resolve_mamba_ssm_cache_dtype(config):
-    """Return the dtype to use for hybrid Mamba/SSM cache allocations.
-
-    Qwen3.5-style configs may store this field on the top-level config or the
-    nested text_config, and may call it either mamba_ssm_cache_dtype or
-    mamba_ssm_dtype. This helper centralizes that lookup so cache creation does
-    not fail later with a missing dtype. An "auto" value in any field is
-    treated the same as missing.
-    """
-    configs = [config]
-    text_config = getattr(config, "text_config", None)
-    if text_config is not None:
-        configs.append(text_config)
-
-    for candidate_config in configs:
-        for attr in ("mamba_ssm_cache_dtype", "mamba_ssm_dtype"):
-            coerced = _coerce_torch_dtype(getattr(candidate_config, attr, None))
-            if coerced is not None:
-                return coerced
-    return None
-
-
 def is_nemotron_hybrid(config):
     if hasattr(config, "hybrid_override_pattern"
                ) and config.hybrid_override_pattern is not None and len(
@@ -302,14 +249,8 @@ def extract_mamba_kv_cache_params(
             full_attn_mask.extend([True] * num_spec_layers)
             mamba_mask.extend([False] * num_spec_layers)
 
-    mamba_ssm_cache_dtype = None
-    if quant_config is not None:
-        mamba_ssm_cache_dtype = _coerce_torch_dtype(
-            quant_config.mamba_ssm_cache_dtype)
-    if mamba_ssm_cache_dtype is None:
-        mamba_ssm_cache_dtype = (resolve_mamba_ssm_cache_dtype(config)
-                                 or resolve_hf_torch_dtype(config)
-                                 or torch.bfloat16)
+    mamba_ssm_cache_dtype = (quant_config.mamba_ssm_cache_dtype
+                             if quant_config is not None else None)
 
     return MambaKVCacheParams(
         state_size=state_size,
@@ -321,11 +262,159 @@ def extract_mamba_kv_cache_params(
         full_attention_layer_mask=full_attn_mask,
         num_mamba_layers=sum(mamba_mask),
         num_full_attention_layers=sum(full_attn_mask),
-        dtype=resolve_hf_torch_dtype(config) or torch.bfloat16,
+        dtype=config.torch_dtype,
         mamba_ssm_cache_dtype=mamba_ssm_cache_dtype,
     )
 
 
+class _Qwen35ConfigCompat:
+    """Temporary shim that normalizes Qwen3.5 HF configs into Qwen3NextConfig.
+
+    VLM configs are reduced to their nested text_config. To remove: delete this
+    class and the elif branch in load_pretrained_config that references it.
+    """
+
+    @staticmethod
+    def normalize(config_dict: dict) -> dict:
+        """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict."""
+        text_config = _Qwen35ConfigCompat._extract_text_config(config_dict)
+        text_config = _Qwen35ConfigCompat._inherit_quantization_config(
+            config_dict, text_config)
+        text_config = _Qwen35ConfigCompat._flatten_rope(text_config)
+
+        # Detect dense vs MoE and set architecture + MoE defaults accordingly
+        is_moe = "num_experts" in text_config and text_config["num_experts"] > 0
+        if is_moe:
+            text_config["architectures"] = ["Qwen3_5MoeForCausalLM"]
+        else:
+            text_config["architectures"] = ["Qwen3_5ForCausalLM"]
+            # Ensure MoE fields are zeroed so Qwen3NextConfig defaults don't
+            # accidentally enable MoE for the dense model.
+            text_config.setdefault("num_experts", 0)
+            text_config.setdefault("num_experts_per_tok", 0)
+            text_config.setdefault("moe_intermediate_size", 0)
+            text_config.setdefault("shared_expert_intermediate_size", 0)
+        return text_config
+
+    _VLM_ARCHITECTURES = {
+        "Qwen3_5MoeForConditionalGeneration",
+        "Qwen3_5ForConditionalGeneration",
+    }
+
+    @staticmethod
+    def _extract_text_config(config_dict: dict) -> dict:
+        """Pull nested text_config from VLM checkpoints, or use dict as-is."""
+        architectures = config_dict.get("architectures") or []
+        if architectures and architectures[
+                0] in _Qwen35ConfigCompat._VLM_ARCHITECTURES:
+            text_config = dict(config_dict.get("text_config") or {})
+        else:
+            text_config = dict(config_dict)
+        if not text_config:
+            raise ValueError("Qwen3.5 config is missing a usable text_config")
+        return text_config
+
+    @staticmethod
+    def _inherit_quantization_config(config_dict: dict,
+                                     text_config: dict) -> dict:
+        """Copy top-level quantization_config into text_config with name normalization.
+
+        No-op when text_config already carries its own quantization_config.
+        Also keeps packed linear-attention in_proj_qkvz on the bf16 path until
+        FP8 block-scale TP loading is fixed for that layout (temporary).
+        """
+        if "quantization_config" in text_config:
+            return text_config
+        if "quantization_config" not in config_dict:
+            return text_config
+
+        quantization_config = dict(config_dict["quantization_config"])
+        if "modules_to_not_convert" in quantization_config:
+            modules = _Qwen35ConfigCompat._normalize_exclude_modules(
+                quantization_config["modules_to_not_convert"])
+            modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(
+                text_config, modules)
+            quantization_config["modules_to_not_convert"] = sorted(set(modules))
+        text_config["quantization_config"] = quantization_config
+        return text_config
+
+    @staticmethod
+    def _normalize_exclude_modules(modules: list[str]) -> list[str]:
+        """Translate HF quantization exclude-module paths to TRT-LLM names.
+
+        - Rewrite the model.language_model. prefix to model.
+        - Drop model.visual.* and mtp.* entries
+        - Map in_proj_{a,b} -> in_proj_ba, in_proj_{q,k,v,z,qkv} -> in_proj_qkvz
+        """
+        normalized = set()
+        for name in modules:
+            if name.startswith("model.language_model."):
+                name = "model." + name[len("model.language_model."):]
+            if name.startswith("model.visual.") or name.startswith("mtp."):
+                continue
+            name = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", name)
+            name = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", name)
+            normalized.add(name)
+        return sorted(normalized)
+
+    @staticmethod
+    def _add_qkvz_bf16_workaround(text_config: dict,
+                                  modules: list[str]) -> list[str]:
+        """Keep packed linear-attention qkvz on bf16 path for all linear-attention layers.
+
+        Temporary until FP8 block-scale TP loading is fixed for this layout.
+        """
+        try:
+            layer_types = get_qwen3_hybrid_layer_types(
+                SimpleNamespace(**text_config))
+        except (ValueError, AttributeError):
+            return modules
+        for layer_idx, layer_type in enumerate(layer_types):
+            if layer_type == "linear_attention":
+                modules.append(
+                    f"model.layers.{layer_idx}.linear_attn.in_proj_qkvz")
+        return modules
+
+    @staticmethod
+    def _flatten_rope(text_config: dict) -> dict:
+        """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling.
+
+        Qwen3.5 nests these inside a rope_parameters dict and uses rope_type
+        instead of type in rope_scaling.  Qwen3NextConfig expects them as
+        top-level fields with rope_scaling.type.
+        """
+        rope_parameters = dict(text_config.pop("rope_parameters", {}) or {})
+        rope_scaling = dict(text_config.get("rope_scaling") or {})
+        if rope_parameters:
+            rope_theta = rope_parameters.pop("rope_theta", None)
+            if rope_theta is not None:
+                text_config.setdefault("rope_theta", rope_theta)
+            partial_rotary_factor = rope_parameters.pop("partial_rotary_factor",
+                                                        None)
+            if partial_rotary_factor is not None:
+                text_config.setdefault("partial_rotary_factor",
+                                       partial_rotary_factor)
+            if rope_parameters:
+                rope_scaling = rope_parameters | rope_scaling
+        if rope_scaling:
+            has_mrope = ("mrope_section" in rope_scaling
+                         or rope_scaling.get("mrope_interleaved", False))
+            if has_mrope:
+                rope_scaling["type"] = "mrope"
+                rope_scaling.pop("rope_type", None)
+            elif "type" not in rope_scaling and "rope_type" in rope_scaling:
+                rope_type = rope_scaling.pop("rope_type")
+                # "default" means standard RoPE (no scaling) — don't set
+                # rope_scaling to avoid triggering scaling code paths.
+                if rope_type == "default":
+                    rope_scaling = {}
+                else:
+                    rope_scaling["type"] = rope_type
+            if rope_scaling:
+                text_config["rope_scaling"] = rope_scaling
+        return text_config
+
+
 # TODO: remove this once the transformers can support all of those models in _CONFIG_REGISTRY
 class LazyConfigDict(dict):
 
@@ -356,16 +445,6 @@ def load_pretrained_config(model_name_or_path: str,
             MistralConfigLoader
         model_config = MistralConfigLoader().load(
             model_name_or_path).pretrained_config
-    elif (model_type == "qwen3_5_moe" and
-          (("text_config" in config_dict and "vision_config" in config_dict) or
-           (architectures
-            and architectures[0] == "Qwen3_5MoeForConditionalGeneration"))):
-        # Qwen3.5-MoE VLM: HF native composite config + model-side normalizer.
-        from tensorrt_llm._torch.models.modeling_qwen3_5 import \
-            _normalize_qwen35_moe_vl_config
-        model_config = transformers.Qwen3_5MoeConfig.from_pretrained(
-            model_name_or_path, **kwargs)
-        _normalize_qwen35_moe_vl_config(model_config)
     elif model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[model_type]
         model_config = config_class.from_pretrained(model_name_or_path,
@@ -378,11 +457,8 @@ def load_pretrained_config(model_name_or_path: str,
                                 "Qwen3_5ForCausalLM",
                                 "Qwen3_5ForConditionalGeneration",
                             )):
-        # Qwen3.5 text-only: flatten to Qwen3NextConfig via the model-side shim.
-        from tensorrt_llm._torch.models.modeling_qwen3_5 import \
-            Qwen35ConfigCompat
         model_config = transformers.Qwen3NextConfig.from_dict(
-            Qwen35ConfigCompat.normalize(config_dict))
+            _Qwen35ConfigCompat.normalize(config_dict))
     elif (model_type == "exaone4" and config_dict.get("sliding_window") is None
           and config_dict.get("layer_types") is None):
         # transformers 5.5.x Exaone4Config.__post_init__ first forces
diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py
index 54c02754f12d..14d813a99dfd 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_loader.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py
@@ -29,7 +29,6 @@
     MoeLoadBalancer, maybe_create_moe_load_balancer)
 from ..virtual_memory import RestoreMode
 from ..virtual_memory import scope as virtual_memory_scope
-from .config_utils import resolve_hf_torch_dtype, resolve_mamba_ssm_cache_dtype
 
 _KV_CACHE_MAP = {
     "fp8": QuantAlgo.FP8.value,
@@ -45,10 +44,12 @@ def validate_and_set_mamba_ssm_cache_dtype(
         mamba_ssm_stochastic_rounding: bool = False,
         mamba_ssm_philox_rounds: int = 10) -> None:
     if mamba_ssm_cache_dtype == "auto":
-        mamba_ssm_cache_dtype = (
-            resolve_mamba_ssm_cache_dtype(config.pretrained_config)
-            or resolve_hf_torch_dtype(config.pretrained_config)
-            or config.torch_dtype)
+        hf_dtype = getattr(config.pretrained_config, "mamba_ssm_cache_dtype",
+                           None)
+        if hf_dtype is not None:
+            mamba_ssm_cache_dtype = str_dtype_to_torch(hf_dtype)
+        else:
+            mamba_ssm_cache_dtype = config.pretrained_config.torch_dtype
     else:
         mamba_ssm_cache_dtype = str_dtype_to_torch(mamba_ssm_cache_dtype)
 
diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml
index 21aa37d6642c..fc23b4cbc6eb 100644
--- a/tests/integration/defs/accuracy/references/mmmu.yaml
+++ b/tests/integration/defs/accuracy/references/mmmu.yaml
@@ -62,12 +62,8 @@ Qwen/Qwen3-VL-8B-Instruct:
 mistralai/Mistral-Small-3.1-24B-Instruct-2503:
   - accuracy: 57.0
 Qwen/Qwen3.5-35B-A3B:
-  # The default accuracy for `test_auto_dtype` tests.
-  - accuracy: 59.0
   - dtype: bfloat16
     accuracy: 60.444
-  - quant_algo: FP8_BLOCK_SCALES
-    accuracy: 58.889
 # Kimi K2.5 multimodal (MoonViT + DeepSeek-V3 MoE backbone, ~1T params).
 # Values below are measured with NVFP4 checkpoint (thinking mode enabled).
 moonshotai/Kimi-K2.5:
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
index 3f2026c65b0a..39ce3d05d54d 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
@@ -459,43 +459,6 @@ def test_nvfp4_4gpus(
             task.evaluate(llm, sampling_params=self.sampling_params)
 
 
-@pytest.mark.skip_less_device_memory(80000)
-class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):
-    MODEL_NAME = "Qwen/Qwen3.5-35B-A3B"
-    MODEL_PATH = f"{llm_models_root()}/Qwen3.5-35B-A3B"
-    MAX_NUM_TOKENS = 16384
-    MAX_BATCH_SIZE = 32
-
-    sampling_params = SamplingParams(
-        max_tokens=MAX_NUM_TOKENS,
-        truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
-        stop="<|endoftext|>",
-    )
-
-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False)
-
-    def _make_llm(self, model_path: str) -> LLM:
-        return LLM(
-            model_path,
-            max_num_tokens=self.MAX_NUM_TOKENS,
-            max_batch_size=self.MAX_BATCH_SIZE,
-            kv_cache_config=self.kv_cache_config,
-        )
-
-    def test_auto_dtype(self) -> None:
-        with self._make_llm(self.MODEL_PATH) as llm:
-            task = MMMU(self.MODEL_NAME)
-            task.evaluate(llm, sampling_params=self.sampling_params)
-
-    @skip_pre_hopper
-    def test_fp8_prequantized(self) -> None:
-        model_path = f"{llm_models_root()}/Qwen3.5-35B-A3B-FP8"
-        with self._make_llm(model_path) as llm:
-            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
-            task = MMMU(self.MODEL_NAME)
-            task.evaluate(llm, sampling_params=self.sampling_params)
-
-
 class TestQwen3VL(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-VL-8B-Instruct"
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
index 5fc18da8b40a..9fd6d2c0c74c 100644
--- a/tests/integration/test_lists/qa/llm_function_core.txt
+++ b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -807,8 +807,6 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL::test_auto_dtype[forced_chunked_prefill]
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestKimiK25::test_nvfp4[dep8]
-accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_auto_dtype
-accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_fp8_prequantized
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray
 unittest/disaggregated/test_openai_disagg_server.py
diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml
index a82a62e2c77d..9c72f9dccb86 100644
--- a/tests/integration/test_lists/test-db/l0_l40s.yml
+++ b/tests/integration/test_lists/test-db/l0_l40s.yml
@@ -23,7 +23,6 @@ l0_l40s:
   - unittest/_torch/modeling/test_modeling_qwen2_5vl.py::TestQwen2_5_VL::test_all
   - unittest/_torch/modeling/test_modeling_qwen3vl_moe.py::TestQwen3VLMoe::test_all
   - unittest/_torch/modeling/test_modeling_qwen3vl.py::TestQwen3VL::test_all
-  - unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py::TestQwen3_5MoeVL::test_all
   - test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B]
   - unittest/llmapi/apps/_test_openai_chat_multimodal.py::test_single_chat_session_image_embeds -m needs_l40s
   # MMMU sanity check
diff --git a/tests/unittest/_torch/modeling/test_modeling_multimodal.py b/tests/unittest/_torch/modeling/test_modeling_multimodal.py
index ab7166b68bf3..53fe5e044fc6 100644
--- a/tests/unittest/_torch/modeling/test_modeling_multimodal.py
+++ b/tests/unittest/_torch/modeling/test_modeling_multimodal.py
@@ -18,12 +18,6 @@
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_multimodal_utils import bypass_processor_output_validation
-from tensorrt_llm._torch.pyexecutor.config_utils import (
-    extract_mamba_kv_cache_params,
-    is_nemotron_hybrid,
-    is_qwen3_hybrid,
-)
-from tensorrt_llm._torch.pyexecutor.mamba_cache_manager import CppMambaHybridCacheManager
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm._utils import str_dtype_to_torch
 from tensorrt_llm.bindings.executor import KvCacheConfig
@@ -34,7 +28,6 @@
     prompt_inputs,
 )
 from tensorrt_llm.inputs.multimodal import MultimodalParams, MultimodalRuntimeData
-from tensorrt_llm.llmapi.llm_args import KvCacheConfig as PyKvCacheConfig
 from tensorrt_llm.mapping import Mapping
 
 
@@ -525,13 +518,6 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario):
         Note:
             This method uses get_kv_cache_config() to obtain configuration.
             Override get_kv_cache_config() to customize cache settings.
-
-            For hybrid linear-attention models (Qwen3Next, Qwen3.5,
-            Nemotron-Hybrid) this dispatches to
-            `get_hybrid_kv_cache_manager` so the linear-attention layers
-            get a `CppMambaHybridCacheManager` for SSM/conv state.
-            Mirrors the production dispatch in
-            `_util.py:_create_kv_cache_manager`.
         """
         # Get cache configuration from the configurable method
         cache_config = self.get_kv_cache_config(scenario)
@@ -541,114 +527,17 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario):
 
         num_blocks = (max_seq_len + tokens_per_block - 1) // tokens_per_block
 
-        config = self.model_config.pretrained_config
-        text_config = getattr(config, "text_config", config)
-
-        if is_qwen3_hybrid(text_config) or is_nemotron_hybrid(text_config):
-            self.kv_cache_manager = self.get_hybrid_kv_cache_manager(
-                text_config=text_config,
-                tokens_per_block=tokens_per_block,
-                max_seq_len=max_seq_len,
-                batch_size=batch_size,
-                num_blocks=num_blocks,
-            )
-        else:
-            self.kv_cache_manager = self.get_kv_cache_manager(
-                dtype=self.model_config.pretrained_config.torch_dtype,
-                config=self.model_config.pretrained_config,
-                tokens_per_block=tokens_per_block,
-                max_seq_len=max_seq_len,
-                batch_size=batch_size,
-                num_blocks=num_blocks,
-            )
-
-        self.kv_cache_manager.add_dummy_requests(
-            request_ids=[1],
-            token_nums=[max_seq_len],
-            **self._dummy_request_kwargs(scenario),
-        )
-
-    def _dummy_request_kwargs(self, scenario: MultimodalScenario) -> Dict:
-        """Optional override hook for extra kwargs to `add_dummy_requests`.
-
-        Subclasses for mRoPE-using models (Qwen2.5-VL, Qwen3-VL, Qwen3.5-VL,
-        …) should return `{"use_mrope": True}` here so the cache manager
-        allocates the mRoPE position-id buffer at dummy-request time.
-        Defaults to an empty dict, preserving existing behavior for tests
-        that don't care.
-        """
-        return {}
-
-    def get_hybrid_kv_cache_manager(
-        self,
-        text_config: PretrainedConfig,
-        tokens_per_block: int,
-        max_seq_len: int,
-        batch_size: int,
-        num_blocks: int,
-    ):
-        """Build a `CppMambaHybridCacheManager` for hybrid linear-attention
-        models (Qwen3Next, Qwen3.5, Nemotron-Hybrid).
-
-        Mirrors the production construction in
-        `_util.py:_create_kv_cache_manager` for `is_qwen3_hybrid` /
-        `is_nemotron_hybrid` configs: pulls the state-shape / dtype /
-        layer-mask parameters from `extract_mamba_kv_cache_params` and
-        threads them through the constructor. Tests that need a different
-        concrete manager (e.g. `MixedMambaHybridCacheManager` for
-        disagg-style coverage) should override this method.
-        """
-        dtype_map = {
-            torch.half: tensorrt_llm.bindings.DataType.HALF,
-            torch.float16: tensorrt_llm.bindings.DataType.HALF,
-            torch.bfloat16: tensorrt_llm.bindings.DataType.BF16,
-        }
-
-        mamba_params = extract_mamba_kv_cache_params(text_config)
-        if mamba_params.dtype not in dtype_map:
-            raise ValueError(
-                f"Unsupported dtype for hybrid cache manager: "
-                f"{mamba_params.dtype}. Supported: {list(dtype_map.keys())}"
-            )
-        kv_cache_dtype = dtype_map[mamba_params.dtype]
-
-        head_dim = getattr(text_config, "head_dim", None)
-        if not isinstance(head_dim, int):
-            head_dim = text_config.hidden_size // text_config.num_attention_heads
-
-        # CppMambaHybridCacheManager reads Pydantic-only fields
-        # (mamba_state_cache_interval, enable_block_reuse) so we have to
-        # construct the llmapi.llm_args.KvCacheConfig here, not the C++
-        # bindings KvCacheConfig that the standard KVCacheManager path uses.
-        kv_cache_config = PyKvCacheConfig(max_tokens=num_blocks * tokens_per_block)
-        mapping = Mapping(world_size=1, tp_size=1, rank=0)
-
-        return CppMambaHybridCacheManager(
-            # mamba cache parameters (positional)
-            mamba_params.state_size,
-            mamba_params.conv_kernel,
-            mamba_params.num_heads,
-            mamba_params.n_groups,
-            mamba_params.head_dim,
-            mamba_params.num_mamba_layers,
-            mamba_params.mamba_layer_mask,
-            mamba_params.dtype,
-            mamba_params.mamba_ssm_cache_dtype,
-            # kv cache parameters (positional)
-            kv_cache_config,
-            tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF,
-            # kw-only
-            num_layers=mamba_params.num_full_attention_layers,
-            layer_mask=mamba_params.full_attention_layer_mask,
-            num_kv_heads=text_config.num_key_value_heads,
-            head_dim=head_dim,
+        self.kv_cache_manager = self.get_kv_cache_manager(
+            dtype=self.model_config.pretrained_config.torch_dtype,
+            config=self.model_config.pretrained_config,
             tokens_per_block=tokens_per_block,
             max_seq_len=max_seq_len,
-            max_batch_size=batch_size,
-            mapping=mapping,
-            dtype=kv_cache_dtype,
+            batch_size=batch_size,
+            num_blocks=num_blocks,
         )
 
+        self.kv_cache_manager.add_dummy_requests(request_ids=[1], token_nums=[max_seq_len])
+
     def get_max_num_tokens(self, scenario: MultimodalScenario) -> int:
         """Get maximum number of tokens for attention metadata."""
         if scenario.chunked_prefill:
@@ -806,14 +695,6 @@ def setUp(self):
         # TODO: Add multi-GPU support
         self.device = torch.device("cuda:0")
 
-        # Pre-initialize fields that tearDown / setup_scenario expect to
-        # exist. Without this, a test method that doesn't run
-        # setup_scenario (e.g. a setUp-only smoke test) leaves
-        # self.kv_cache_manager unset and tearDown errors with
-        # AttributeError on the ``is not None`` check.
-        self.kv_cache_manager = None
-        self.attn_metadata = None
-
         self.hf_config = self.create_hf_config()
         if self.skip_hf_inference:
             # Create a dummy torch module if skipping HF inference.
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
deleted file mode 100644
index df30e93d89e0..000000000000
--- a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
+++ /dev/null
@@ -1,450 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import json
-import os
-from copy import deepcopy
-from pathlib import Path
-from typing import List, Optional
-
-import torch
-import transformers
-from test_modeling_multimodal import MultimodalScenario, TestModelingMultimodal
-from transformers import Qwen3_5MoeForConditionalGeneration as HFQwen3_5MoeForConditionalGeneration
-from utils.llm_data import llm_models_root
-
-from tensorrt_llm._torch.model_config import ModelConfig
-from tensorrt_llm._torch.models import Qwen3_5MoeVLModel
-from tensorrt_llm._torch.models.checkpoints.auto_mapper import AutoCheckpointMapper
-from tensorrt_llm._torch.models.checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper
-from tensorrt_llm._torch.models.modeling_auto import AutoModelForCausalLM
-from tensorrt_llm._torch.models.modeling_qwen3_5 import _normalize_qwen35_moe_vl_config
-from tensorrt_llm._torch.pyexecutor.config_utils import (
-    extract_mamba_kv_cache_params,
-    load_pretrained_config,
-)
-from tensorrt_llm._torch.pyexecutor.model_loader import validate_and_set_mamba_ssm_cache_dtype
-from tensorrt_llm.inputs import ContentFormat
-from tensorrt_llm.inputs.registry import MULTIMODAL_PLACEHOLDER_REGISTRY
-
-
-def _write_qwen35_moe_vl_config(tmp_path: Path) -> Path:
-    config = {
-        "architectures": ["Qwen3_5MoeForConditionalGeneration"],
-        "image_token_id": 248056,
-        "model_type": "qwen3_5_moe",
-        "text_config": {
-            "attention_bias": False,
-            "attention_dropout": 0.0,
-            "bos_token_id": 151643,
-            "dtype": "bfloat16",
-            "eos_token_id": 151645,
-            "full_attention_interval": 4,
-            "head_dim": 128,
-            "hidden_act": "silu",
-            "hidden_size": 2048,
-            "linear_conv_kernel_dim": 4,
-            "linear_key_head_dim": 128,
-            "linear_num_key_heads": 16,
-            "linear_num_value_heads": 32,
-            "linear_value_head_dim": 128,
-            "mamba_ssm_dtype": "float32",
-            "max_position_embeddings": 262144,
-            "mlp_only_layers": [],
-            "model_type": "qwen3_5_moe_text",
-            "moe_intermediate_size": 512,
-            "norm_topk_prob": True,
-            "num_attention_heads": 32,
-            "num_experts": 128,
-            "num_experts_per_tok": 8,
-            "num_hidden_layers": 2,
-            "num_key_value_heads": 4,
-            "rms_norm_eps": 1e-6,
-            "shared_expert_intermediate_size": 512,
-            "rope_parameters": {
-                "mrope_section": [11, 11, 10],
-                "partial_rotary_factor": 0.25,
-                "rope_theta": 1000000.0,
-                "rope_type": "default",
-            },
-            "use_cache": True,
-            "vocab_size": 151936,
-        },
-        "tie_word_embeddings": False,
-        "video_token_id": 248057,
-        "vision_config": {
-            "deepstack_visual_indexes": [8, 16, 24],
-            "depth": 27,
-            "hidden_act": "gelu_pytorch_tanh",
-            "hidden_size": 1152,
-            "in_channels": 3,
-            "intermediate_size": 4304,
-            "model_type": "qwen3_5_moe",
-            "num_heads": 16,
-            "num_position_embeddings": 2304,
-            "out_hidden_size": 2048,
-            "patch_size": 16,
-            "spatial_merge_size": 2,
-            "temporal_patch_size": 2,
-        },
-        "vision_end_token_id": 248054,
-        "vision_start_token_id": 248053,
-    }
-    (tmp_path / "config.json").write_text(json.dumps(config), encoding="utf-8")
-    return tmp_path
-
-
-def test_qwen35_moe_vl_config_preserves_vlm_architecture(
-    tmp_path: Path,
-) -> None:
-    config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path)))
-
-    assert isinstance(config, transformers.Qwen3_5MoeConfig)
-    assert config.architectures == ["Qwen3_5MoeForConditionalGeneration"]
-    assert config.text_config.architectures == ["Qwen3_5MoeForCausalLM"]
-    assert config.text_config.num_experts == 128
-    assert config.text_config.intermediate_size == 4608
-    assert config.text_config.rope_theta == 1000000.0
-    assert config.text_config.partial_rotary_factor == 0.25
-    assert config.text_config.rope_scaling["type"] == "mrope"
-    assert config.text_config.rope_scaling["mrope_section"] == [11, 11, 10]
-    assert config.text_config.mamba_ssm_dtype == "float32"
-    assert config.get_text_config() is config.text_config
-
-
-def test_qwen35_moe_vl_resolves_mamba_ssm_cache_dtype(
-    tmp_path: Path,
-) -> None:
-    config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path)))
-    model_config = ModelConfig(pretrained_config=config)
-
-    validate_and_set_mamba_ssm_cache_dtype(model_config, "auto")
-    assert model_config.quant_config.mamba_ssm_cache_dtype is torch.float32
-
-    mamba_params = extract_mamba_kv_cache_params(
-        config.text_config,
-        quant_config=model_config.quant_config,
-    )
-    assert mamba_params.dtype is torch.bfloat16
-    assert mamba_params.mamba_ssm_cache_dtype is torch.float32
-
-
-def test_qwen35_moe_vl_resolves_model_and_mapper(tmp_path: Path) -> None:
-    config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path)))
-    model_config = ModelConfig(pretrained_config=config)
-
-    assert AutoModelForCausalLM._resolve_class(model_config) is Qwen3_5MoeVLModel
-    assert isinstance(
-        AutoCheckpointMapper.get("HF", "Qwen3_5MoeForConditionalGeneration"),
-        Qwen3_5MoeHfWeightMapper,
-    )
-
-
-def test_qwen35_moe_vl_placeholder_metadata_registered() -> None:
-    metadata = MULTIMODAL_PLACEHOLDER_REGISTRY.get_placeholder_metadata("qwen3_5_moe")
-
-    assert metadata.placeholder_map == {
-        "image": "<|vision_start|><|image_pad|><|vision_end|>",
-        "video": "<|vision_start|><|video_pad|><|vision_end|>",
-    }
-    assert metadata.placeholders_separator == ""
-    assert metadata.content_format is ContentFormat.STRING
-
-
-# --- Layered parity test scaffold -------------------------------------------
-#
-# Tiny synthetic config used by TestQwen3_5MoeVL below. Same architecture as
-# the real Qwen/Qwen3.5-35B-A3B checkpoint but with much smaller dimensions
-# where possible.
-#
-# Shapes that have to match real Qwen3.5 (can't shrink without breaking
-# things downstream):
-#
-#   - head_dim=256, partial_rotary_factor=0.25 --> rotary tensor width is
-#     `head_dim * 0.25 / 2 = 32`, which equals `sum(mrope_section)`.
-#     A smaller head_dim (e.g. 128) yields a 16-wide tensor that mRoPE
-#     can't split with section sum 32.
-#   - num_attention_heads=16, num_key_value_heads=2 match the real
-#     model's 8:1 GQA layout; Q proj is 2048 --> 4096, K/V are 2048 --> 512.
-#   - Vision deepstack indices [8, 16, 24] match the real config, and
-#     depth=27 is the smallest value that hosts those indices. Disabling
-#     deepstack (indices=[], depth=2) produces fewer vision embeddings
-#     than the HF processor reserves placeholder tokens for, which
-#     breaks `fuse_input_embeds`.
-#   - vocab_size=248320 matches the real Qwen3.5 tokenizer. The
-#     tokenizer (loaded via _name_or_path) emits special-token ids in
-#     the 248k range; `fuse_input_embeds` uses `vocab_size` as the
-#     OOV threshold to identify image-pad tokens. A smaller vocab_size
-#     would misclassify regular chat-template specials as mm tokens
-#     and trip the placeholder/embedding count check.
-#
-# Shapes that can be shrunk for tests:
-#
-#   - num_hidden_layers: 2 (vs 40+).
-#   - num_experts: 128 (vs 256). Still moderate so MoE routing runs.
-#   - full_attention_interval=2 with 2 LM layers yields the pattern
-#     [linear_attention, full_attention] — one of each kind, exercising
-#     both the regular KV cache and the Mamba SSM/conv state via the
-#     base-class dispatch.
-#
-# `_name_or_path` points at the real checkpoint dir so the test can load
-# the tokenizer/processor (only the processor; not the full model weights).
-QWEN3_5_VL_MOE_PARITY_CONFIG = {
-    "architectures": ["Qwen3_5MoeForConditionalGeneration"],
-    "image_token_id": 248056,
-    "model_type": "qwen3_5_moe",
-    "text_config": {
-        "attention_bias": False,
-        "attention_dropout": 0.0,
-        "bos_token_id": 151643,
-        "dtype": "bfloat16",
-        "eos_token_id": 151645,
-        "full_attention_interval": 2,
-        "head_dim": 256,
-        "hidden_act": "silu",
-        "hidden_size": 2048,
-        "linear_conv_kernel_dim": 4,
-        "linear_key_head_dim": 128,
-        "linear_num_key_heads": 16,
-        "linear_num_value_heads": 32,
-        "linear_value_head_dim": 128,
-        "mamba_ssm_dtype": "float32",
-        "max_position_embeddings": 8192,
-        "mlp_only_layers": [],
-        "model_type": "qwen3_5_moe_text",
-        "moe_intermediate_size": 512,
-        "norm_topk_prob": True,
-        "num_attention_heads": 16,
-        "num_experts": 128,
-        "num_experts_per_tok": 8,
-        "num_hidden_layers": 2,
-        "num_key_value_heads": 2,
-        "rms_norm_eps": 1e-6,
-        "shared_expert_intermediate_size": 512,
-        "rope_parameters": {
-            "mrope_section": [11, 11, 10],
-            "partial_rotary_factor": 0.25,
-            "rope_theta": 1000000.0,
-            "rope_type": "default",
-        },
-        "use_cache": True,
-        "vocab_size": 248320,
-    },
-    "tie_word_embeddings": False,
-    "video_token_id": 248057,
-    "vision_config": {
-        "deepstack_visual_indexes": [8, 16, 24],
-        "depth": 27,
-        "hidden_act": "gelu_pytorch_tanh",
-        "hidden_size": 1152,
-        "in_channels": 3,
-        "initializer_range": 0.02,
-        "intermediate_size": 4304,
-        "model_type": "qwen3_5_moe",
-        "num_heads": 16,
-        "num_position_embeddings": 2304,
-        "out_hidden_size": 2048,
-        "patch_size": 16,
-        "spatial_merge_size": 2,
-        "temporal_patch_size": 2,
-    },
-    "vision_end_token_id": 248054,
-    "vision_start_token_id": 248053,
-    "_name_or_path": str(os.path.join(llm_models_root(), "Qwen3.5-35B-A3B")),
-}
-
-
-class TestQwen3_5MoeVL(TestModelingMultimodal):
-    """Forward-parity test for Qwen3.5-MoE-VL against HuggingFace.
-
-    Tiny-synthetic-config parity test in the same shape as
-    `TestQwen3VLMoe` / `TestQwen2_5VL`: both stacks are constructed
-    from `QWEN3_5_VL_MOE_PARITY_CONFIG` (2 LM layers, 1 linear + 1 full
-    attention, 128 experts, 2 vision layers), HF weights are copied
-    into TRT-LLM via `Qwen3_5MoeHfWeightMapper`, then `test_all`
-    sweeps the default `MultimodalScenario`s comparing last-position
-    logits at context + generation phases.
-
-    Two-config design:
-      - `self.hf_config` is the raw `Qwen3_5MoeConfig.from_dict(...)`
-        result. HF model construction sees the native HF schema
-        (`rope_parameters` intact with `rope_type`,
-        `moe_intermediate_size`, …).
-      - TRT-LLM gets a deep-copied + normalized version via the
-        `create_trtllm_model` override below. That copy goes through
-        `_normalize_qwen35_moe_vl_config` exactly the same way
-        production `load_pretrained_config` does, so the Qwen3Next
-        runtime sees the flat aliases it expects
-        (`intermediate_size`, `rope_theta`, `rope_scaling`, …).
-
-    Keeping the two configs separate means the production normalizer
-    doesn't need to be HF-safe — production only ever constructs the
-    TRT-LLM model from a normalized config, and the test mirrors that
-    boundary explicitly. The hybrid-cache path is handled by the base
-    class's `init_kv_cache_manager` dispatch on
-    `is_qwen3_hybrid` / `is_nemotron_hybrid`.
-    """
-
-    def get_model_config(self):
-        return QWEN3_5_VL_MOE_PARITY_CONFIG
-
-    def get_trtllm_model_class(self):
-        return Qwen3_5MoeVLModel
-
-    def get_hf_model_class(self):
-        return HFQwen3_5MoeForConditionalGeneration
-
-    def get_weight_mapper_class(self):
-        return Qwen3_5MoeHfWeightMapper
-
-    def get_model_type(self):
-        return "qwen3_5_moe"
-
-    def get_model_config_class(self):
-        return transformers.Qwen3_5MoeConfig
-
-    def create_trtllm_model(
-        self,
-        load_weights: bool = False,
-        hf_model_state_dict: Optional[dict] = None,
-        **kwargs,
-    ):
-        """Build the TRT-LLM model from a *normalized copy* of `self.hf_config`.
-
-        Mirrors the base-class body but swaps in
-        `_normalize_qwen35_moe_vl_config(trtllm_config)` before
-        wrapping in `ModelConfig`. `self.hf_config` itself stays
-        raw so the HF model that the base class builds in `setUp`
-        sees native HF schema.
-        """
-        trtllm_config = deepcopy(self.hf_config)
-        _normalize_qwen35_moe_vl_config(trtllm_config)
-
-        model_config = ModelConfig(pretrained_config=trtllm_config)
-        model_class = self.get_trtllm_model_class()
-        model = model_class(model_config, **kwargs).to("cuda")
-
-        if load_weights:
-            weight_mapper = self.get_weight_mapper_class()()
-            weight_mapper.init_model_and_config(model, trtllm_config)
-            model.load_weights(hf_model_state_dict, weight_mapper)
-
-            for module in model.modules():
-                if hasattr(module, "post_load_weights") and not getattr(
-                    module, "_weights_removed", False
-                ):
-                    module.post_load_weights()
-
-        return model, model_config
-
-    def _dummy_request_kwargs(self, scenario):
-        """Qwen3.5-VL uses mRoPE; the cache manager needs the mRoPE
-        position-id buffer allocated at dummy-request time."""
-        return {"use_mrope": True}
-
-    def get_tolerance(self):
-        """Tighten `rtol` to `0.1` (4x tighter than the base 0.4
-        default) while keeping `atol` at `0.4` to absorb single-logit
-        tail outliers seen on `multiple_image` / `video`.
-        """
-        return 0.4, 0.1
-
-    def get_trtllm_inputs(
-        self,
-        input_ids,
-        multimodal_params_list,
-        is_gen: bool = False,
-        num_cached_tokens_per_seq: Optional[List[int]] = None,
-        total_prompt_len: Optional[int] = None,
-    ):
-        """Override position_ids with mRoPE position IDs from the
-        multimodal params. Same pattern as `TestQwen3VLMoe` — the
-        VLM wrapper feeds mRoPE-shaped position IDs to the decoder,
-        not the simple range-based default the base class produces.
-        """
-        trtllm_inputs = super().get_trtllm_inputs(
-            input_ids,
-            multimodal_params_list,
-            is_gen,
-            num_cached_tokens_per_seq,
-            total_prompt_len=total_prompt_len,
-        )
-
-        if is_gen:
-            mrope_gen_position_ids = []
-            for multimodal_param in multimodal_params_list:
-                mrope_gen_position_ids.append(
-                    multimodal_param.multimodal_data["mrope_config"]["mrope_position_deltas"]
-                )
-            mrope_gen_position_ids = torch.cat(mrope_gen_position_ids, dim=-1).to(self.device)
-            trtllm_inputs["position_ids"] = (
-                (trtllm_inputs["position_ids"] + mrope_gen_position_ids).expand(3, -1, 1).cuda()
-            )
-            gen_multimodal_params_list = []
-            for multimodal_param in multimodal_params_list:
-                multimodal_param.strip_for_generation()
-                multimodal_param.to_device(
-                    "multimodal_data",
-                    self.device,
-                    pin_memory=True,
-                    target_keywords=["mrope_config.mrope_position_deltas"],
-                )
-                gen_multimodal_params_list.append(multimodal_param)
-            trtllm_inputs["multimodal_params"] = gen_multimodal_params_list
-        else:
-            mrope_position_ids = []
-            for multimodal_param in multimodal_params_list:
-                mrope_position_ids.append(
-                    multimodal_param.multimodal_data["mrope_config"]["mrope_position_ids"]
-                )
-            position_ids = torch.cat(mrope_position_ids, dim=-1).cuda()
-            trtllm_inputs["position_ids"] = position_ids
-
-        return trtllm_inputs
-
-    def get_scenarios(self) -> List[MultimodalScenario]:
-        """Modality-sanity sweep (image / multiple_image / video).
-
-        These three catch differences in placeholder counts and the
-        multimodal-cumsum path between single-image, multi-image, and
-        video inputs.
-
-        CUDA-graph capture is intentionally not exercised here. The
-        standard `attn_metadata.create_cuda_graph_metadata` path only
-        addresses attention metadata; the Mamba SSM state buffer of the
-        hybrid (Mamba + attention) cache is not threaded through, so
-        replayed logits diverge from the HF reference. Adding that path
-        is dedicated harness work and tracked separately.
-        """
-        return [
-            MultimodalScenario(
-                modality="image",
-                use_cuda_graph=False,
-                chunked_prefill=False,
-                kv_cache_reuse=False,
-            ),
-            MultimodalScenario(
-                modality="multiple_image",
-                use_cuda_graph=False,
-                chunked_prefill=False,
-                kv_cache_reuse=False,
-            ),
-            MultimodalScenario(
-                modality="video",
-                use_cuda_graph=False,
-                chunked_prefill=False,
-                kv_cache_reuse=False,
-            ),
-        ]
-
-    def test_construction_and_weight_loading_smoke(self):
-        """Smoke test: setUp built HF + TRT-LLM models and copied HF
-        weights into TRT-LLM via the weight mapper. Detailed
-        assertions on the normalizer's outputs live in the routing
-        tests above (e.g. `test_qwen35_moe_vl_config_preserves_vlm_architecture`)
-        — this one just confirms construction reached the end without
-        exception.
-        """
-        self.assertIsNotNone(self.hf_model)
-        self.assertIsNotNone(self.trtllm_model)
-        self.assertIsNotNone(self.model_config)