diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md index 0d9e19e68104..fafaa1eb3e31 100644 --- a/docs/source/models/supported-models.md +++ b/docs/source/models/supported-models.md @@ -106,6 +106,7 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl | `Qwen3VLMoeForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | L + I + V | | `Step3p7ForConditionalGeneration` | Yes | Yes | Untested | Yes | Untested | Untested | Untested | Untested | L + I | | `MiniMaxM3SparseForConditionalGeneration` [^11] | Yes | Yes | Untested | Yes | Untested | No | Untested | Untested | L + I + V | +| `Qwen3_5MoeForConditionalGeneration` | Yes | Yes | Untested | Yes | Yes | No | Untested | Yes | L + I + V | Note: - L: Language diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py index f33fd49ff916..aa2fb03d0b38 100644 --- a/tensorrt_llm/_torch/models/__init__.py +++ b/tensorrt_llm/_torch/models/__init__.py @@ -42,7 +42,8 @@ Qwen2ForRewardModel) from .modeling_qwen2vl import Qwen2_5_VLModel, Qwen2VLModel from .modeling_qwen3 import Qwen3ForCausalLM -from .modeling_qwen3_5 import Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM +from .modeling_qwen3_5 import (Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM, + Qwen3_5MoeVLModel) from .modeling_qwen3_moe import Qwen3MoeForCausalLM from .modeling_qwen3_next import Qwen3NextForCausalLM from .modeling_qwen3vl import Qwen3VLModel @@ -102,6 +103,7 @@ "Qwen3MoeForCausalLM", "Qwen3_5ForCausalLM", "Qwen3_5MoeForCausalLM", + "Qwen3_5MoeVLModel", "Qwen3NextForCausalLM", "Qwen3MoeVLModel", "GptOssForCausalLM", diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py index e0e8dc9dbb40..a4226af7d87b 100644 --- a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py +++ 
b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py @@ -16,6 +16,7 @@ @register_mapper("HF", "Qwen3_5MoeForCausalLM") +@register_mapper("HF", "Qwen3_5MoeForConditionalGeneration") @register_mapper("HF", "Qwen3_5ForCausalLM") class Qwen3_5MoeHfWeightMapper(Qwen3NextHfWeightMapper): """Weight mapper for Qwen3.5 MoE text checkpoints. diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py index 7c85f1cdf6fc..ce9fcefb9ece 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py @@ -1,7 +1,29 @@ import re +from types import SimpleNamespace +from typing import Dict, List +import torch +from transformers import PretrainedConfig + +from ...inputs import ( + ContentFormat, + MultimodalPlaceholderMetadata, + MultimodalPlaceholderPlacement, + register_input_processor, + support_multimodal_disaggregated, +) +from ..pyexecutor.config_utils import get_qwen3_hybrid_layer_types +from .checkpoints.base_weight_mapper import BaseWeightMapper +from .checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper +from .modeling_multimodal_utils import _is_mm_disagg from .modeling_qwen3_next import Qwen3NextForCausalLM -from .modeling_utils import register_auto_model +from .modeling_qwen3vl import ( + Qwen3VisionModel, + Qwen3VisionModelBase, + Qwen3VLInputProcessorBase, + Qwen3VLModelBase, +) +from .modeling_utils import ModelConfig, register_auto_model, register_vision_encoder _LANG_PREFIX = "model.language_model." @@ -51,6 +73,248 @@ def _translate_mtp_pattern(name, n_hidden_layers): return None +# --- Config adapters -------------------------------------------------------- +# +# These run from `load_pretrained_config` in +# `tensorrt_llm/_torch/pyexecutor/config_utils.py` via lazy import — the +# runtime layer asks the model module how to load its own config. 
+# +# There are two entry points: +# - `Qwen35ConfigCompat.normalize(config_dict)` — for text-only +# Qwen3.5 (MoE and dense). Returns a dict that +# `transformers.Qwen3NextConfig.from_dict(...)` can consume, so the +# existing Qwen3Next runtime is reused unchanged. +# - `_normalize_qwen35_moe_vl_config(model_config)` — for the +# Qwen3.5-MoE VLM. Mutates the HF-native `transformers.Qwen3_5MoeConfig` +# in place, attaching the runtime aliases the Qwen3Next-based LM expects +# while keeping `text_config` / `vision_config` composite. + + +class Qwen35ConfigCompat: + """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig. + + We normalize to `Qwen3NextConfig` (rather than to a Qwen3.5-native + schema) so the runtime can reuse the existing `Qwen3NextForCausalLM` + model implementation unchanged — Qwen3.5 text is structurally identical + to Qwen3Next, so matching the config schema lets the same code serve + both. + + This is used for Qwen3.5 text-only configs and for shared helper logic such + as RoPE and quantization exclude-module normalization. Qwen3.5-MoE VLM + configs should stay composite and use transformers.Qwen3_5MoeConfig plus + _normalize_qwen35_moe_vl_config instead. + + To remove: delete this class and the elif branch in + load_pretrained_config that flattens Qwen3.5 text configs. 
+ """ + + @staticmethod + def normalize(config_dict: dict) -> dict: + """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict.""" + text_config = Qwen35ConfigCompat._extract_text_config(config_dict) + text_config = Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config) + text_config = Qwen35ConfigCompat._flatten_rope(text_config) + + # Detect dense vs MoE and set architecture + MoE defaults accordingly + is_moe = "num_experts" in text_config and text_config["num_experts"] > 0 + if is_moe: + text_config["architectures"] = ["Qwen3_5MoeForCausalLM"] + else: + text_config["architectures"] = ["Qwen3_5ForCausalLM"] + # Ensure MoE fields are zeroed so Qwen3NextConfig defaults don't + # accidentally enable MoE for the dense model. + text_config.setdefault("num_experts", 0) + text_config.setdefault("num_experts_per_tok", 0) + text_config.setdefault("moe_intermediate_size", 0) + text_config.setdefault("shared_expert_intermediate_size", 0) + return text_config + + _VLM_ARCHITECTURES = { + "Qwen3_5MoeForConditionalGeneration", + "Qwen3_5ForConditionalGeneration", + } + + @staticmethod + def _extract_text_config(config_dict: dict) -> dict: + """Pull nested text_config from VLM checkpoints, or use dict as-is.""" + architectures = config_dict.get("architectures") or [] + if architectures and architectures[0] in Qwen35ConfigCompat._VLM_ARCHITECTURES: + text_config = dict(config_dict.get("text_config") or {}) + else: + text_config = dict(config_dict) + if not text_config: + raise ValueError("Qwen3.5 config is missing a usable text_config") + return text_config + + @staticmethod + def _inherit_quantization_config(config_dict: dict, text_config: dict) -> dict: + """Copy top-level quantization_config into text_config with name normalization. + + Also adds a temporary workaround that keeps packed linear-attention + in_proj_qkvz on the bf16 path until FP8 block-scale TP loading is + fixed for that layout. 
+ """ + if "quantization_config" in text_config: + return text_config + if "quantization_config" not in config_dict: + return text_config + + quantization_config = dict(config_dict["quantization_config"]) + if "modules_to_not_convert" in quantization_config: + modules = Qwen35ConfigCompat._normalize_exclude_modules( + quantization_config["modules_to_not_convert"] + ) + modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround(text_config, modules) + quantization_config["modules_to_not_convert"] = sorted(set(modules)) + text_config["quantization_config"] = quantization_config + return text_config + + @staticmethod + def _normalize_exclude_modules(modules: list[str]) -> list[str]: + """Translate HF quantization exclude-module paths to TRT-LLM names. + + - Strip model.language_model. prefix -> model. + - Drop model.visual.* and mtp.* entries + - Map split projection names to packed TRT-LLM names + """ + normalized = set() + for name in modules: + if name.startswith("model.language_model."): + name = "model." + name[len("model.language_model.") :] + if name.startswith("model.visual.") or name.startswith("mtp."): + continue + name = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", name) + name = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", name) + normalized.add(name) + return sorted(normalized) + + @staticmethod + def _add_qkvz_bf16_workaround(text_config: dict, modules: list[str]) -> list[str]: + """Keep packed linear-attention qkvz on bf16 path for all linear-attention layers. + + Temporary until FP8 block-scale TP loading is fixed for this layout. 
+ """ + try: + layer_types = get_qwen3_hybrid_layer_types(SimpleNamespace(**text_config)) + except (ValueError, AttributeError): + return modules + for layer_idx, layer_type in enumerate(layer_types): + if layer_type == "linear_attention": + modules.append(f"model.layers.{layer_idx}.linear_attn.in_proj_qkvz") + return modules + + @staticmethod + def _flatten_rope(text_config: dict) -> dict: + """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling. + + Qwen3.5 nests these inside a rope_parameters dict and uses rope_type + instead of type in rope_scaling. Qwen3NextConfig expects them as + top-level fields with rope_scaling.type. + """ + rope_parameters = dict(text_config.pop("rope_parameters", {}) or {}) + rope_scaling = dict(text_config.get("rope_scaling") or {}) + if rope_parameters: + rope_theta = rope_parameters.pop("rope_theta", None) + if rope_theta is not None: + text_config.setdefault("rope_theta", rope_theta) + partial_rotary_factor = rope_parameters.pop("partial_rotary_factor", None) + if partial_rotary_factor is not None: + text_config.setdefault("partial_rotary_factor", partial_rotary_factor) + if rope_parameters: + rope_scaling = rope_parameters | rope_scaling + if rope_scaling: + has_mrope = "mrope_section" in rope_scaling or rope_scaling.get( + "mrope_interleaved", False + ) + if has_mrope: + rope_scaling["type"] = "mrope" + rope_scaling.pop("rope_type", None) + elif "type" not in rope_scaling and "rope_type" in rope_scaling: + rope_type = rope_scaling.pop("rope_type") + # "default" means standard RoPE (no scaling) — don't set + # rope_scaling to avoid triggering scaling code paths. + if rope_type == "default": + rope_scaling = {} + else: + rope_scaling["type"] = rope_type + if rope_scaling: + text_config["rope_scaling"] = rope_scaling + return text_config + + +def _normalize_qwen35_mrope_config(text_config) -> None: + """Materialize Qwen3.5 mRoPE aliases needed by the Qwen3-VL path. 
+ + HF stores RoPE metadata under `rope_parameters`; the shared Qwen3-VL + wrapper reads `rope_theta`, `partial_rotary_factor`, and + `rope_scaling` directly on the text config. + """ + rope_parameters = getattr(text_config, "rope_parameters", None) + if not rope_parameters: + return + if hasattr(rope_parameters, "to_dict"): + rope_parameters = rope_parameters.to_dict() + flattened = Qwen35ConfigCompat._flatten_rope( + { + "rope_parameters": dict(rope_parameters), + "rope_scaling": dict(getattr(text_config, "rope_scaling", None) or {}), + } + ) + for attr in ("rope_theta", "partial_rotary_factor", "rope_scaling"): + value = flattened.get(attr) + if value is not None: + setattr(text_config, attr, value) + + +def _normalize_qwen35_qwen3next_text_aliases(text_config) -> None: + """Materialize Qwen3Next-style text aliases used by the shared runtime.""" + if getattr(text_config, "intermediate_size", None) is None: + moe_intermediate_size = getattr(text_config, "moe_intermediate_size", None) + num_experts_per_tok = getattr(text_config, "num_experts_per_tok", None) + shared_expert_intermediate_size = ( + getattr(text_config, "shared_expert_intermediate_size", 0) or 0 + ) + if moe_intermediate_size is not None and num_experts_per_tok is not None: + text_config.intermediate_size = ( + num_experts_per_tok * moe_intermediate_size + shared_expert_intermediate_size + ) + + +def _normalize_qwen35_quantization_config(model_config) -> None: + quantization_config = getattr(model_config, "quantization_config", None) + if not isinstance(quantization_config, dict): + return + + modules = quantization_config.get("modules_to_not_convert") + if modules is None: + return + + text_config = getattr(model_config, "text_config", None) + normalized_modules = Qwen35ConfigCompat._normalize_exclude_modules(modules) + if text_config is not None: + normalized_modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround( + text_config.to_dict(), normalized_modules + ) + 
quantization_config["modules_to_not_convert"] = sorted(set(normalized_modules)) + + +def _normalize_qwen35_moe_vl_config(model_config) -> None: + """Adapt HF Qwen3.5-MoE VLM config to TRT-LLM runtime conventions.""" + if not getattr(model_config, "architectures", None): + model_config.architectures = ["Qwen3_5MoeForConditionalGeneration"] + + text_config = getattr(model_config, "text_config", None) + if text_config is None: + raise ValueError("Qwen3.5-MoE VLM config is missing text_config") + + text_config.architectures = ["Qwen3_5MoeForCausalLM"] + _normalize_qwen35_qwen3next_text_aliases(text_config) + _normalize_qwen35_mrope_config(text_config) + + model_config.get_text_config = lambda decoder=False: text_config + _normalize_qwen35_quantization_config(model_config) + + def _normalize_qwen35_exclude_modules(model_config): """Normalize NVFP4/FP8 exclude_modules from HF naming to TRT-LLM naming. @@ -130,10 +394,58 @@ class Qwen3_5ForCausalLM(Qwen3NextForCausalLM): Same reuse pattern as Qwen3_5MoeForCausalLM, but for the dense 27B variant which uses GatedMLP instead of SparseMoeBlock. The config - normalizer (_Qwen35ConfigCompat) sets num_experts=0 so that + normalizer (Qwen35ConfigCompat) sets num_experts=0 so that Qwen3NextModel selects GatedMLP for the feed-forward layers. """ def __init__(self, model_config): _normalize_qwen35_exclude_modules(model_config) super().__init__(model_config) + + +# TODO: Add tests for disaggregated support. 
+@support_multimodal_disaggregated +@register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel) +@register_auto_model("Qwen3_5MoeForConditionalGeneration") +@register_input_processor( + Qwen3VLInputProcessorBase, + model_type="qwen3_5_moe", + placeholder_metadata=MultimodalPlaceholderMetadata( + placeholder_map={ + "image": "<|vision_start|><|image_pad|><|vision_end|>", + "video": "<|vision_start|><|video_pad|><|vision_end|>", + }, + placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT, + placeholders_separator="", + content_format=ContentFormat.STRING, + ), +) +class Qwen3_5MoeVLModel(Qwen3VLModelBase): + """VLM wrapper composing Qwen3 vision encoder with Qwen3.5 MoE text decoder.""" + + def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs): + kwargs["vision_model_class"] = Qwen3VisionModel + kwargs["disable_fuse_rope"] = kwargs.get("disable_fuse_rope", False) + super().__init__(model_config, *args, **kwargs) + + @property + def multimodal_data_device_paths(self) -> List[str]: + return [ + "image.pixel_values", + "video.pixel_values_videos", + "multimodal_embedding", + "mrope_config.mrope_position_ids", + "mrope_config.mrope_position_deltas", + ] + + def load_weights(self, weights: Dict[str, torch.Tensor], weight_mapper: BaseWeightMapper): + if not _is_mm_disagg(): + self.mm_encoder.load_weights(weights) + + weight_mapper = Qwen3_5MoeHfWeightMapper() + weight_mapper.init_model_and_config(self.llm, self.model_config) + filtered_weights = {k: v for k, v in weights.items() if not k.startswith("model.visual.")} + params_map = { + r"^model\.language_model\.(.*)$": r"model.\1", + } + self.llm.load_weights(filtered_weights, weight_mapper, params_map=params_map) diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_next.py b/tensorrt_llm/_torch/models/modeling_qwen3_next.py index 7667972804ad..cf8607fb59e2 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_next.py +++ 
b/tensorrt_llm/_torch/models/modeling_qwen3_next.py @@ -976,9 +976,18 @@ def get_model_defaults(cls, llm_args: 'TorchLlmArgs') -> dict: # is supported for Mamba/SSM-based models return {"kv_cache_config": {"enable_block_reuse": False}} - def load_weights(self, weights: dict, weight_mapper: BaseWeightMapper): + def load_weights(self, + weights: dict, + weight_mapper: BaseWeightMapper, + params_map: Optional[Dict[str, str]] = None, + allow_partial_loading: bool = False): new_weights = weight_mapper.preprocess_weights(weights) - super().load_weights(new_weights, weight_mapper) + super().load_weights( + new_weights, + weight_mapper=weight_mapper, + params_map=params_map, + allow_partial_loading=allow_partial_loading, + ) def setup_aliases(self) -> None: for idx, layer in enumerate( diff --git a/tensorrt_llm/_torch/models/modeling_qwen3vl.py b/tensorrt_llm/_torch/models/modeling_qwen3vl.py index 82613d938982..19fb82811b4f 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3vl.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3vl.py @@ -1119,6 +1119,8 @@ def __init__( llm_model_config.pretrained_config.architectures = ["Qwen3ForCausalLM"] elif self.original_arch == "Qwen3VLMoeForConditionalGeneration": llm_model_config.pretrained_config.architectures = ["Qwen3MoeForCausalLM"] + elif self.original_arch == "Qwen3_5MoeForConditionalGeneration": + llm_model_config.pretrained_config.architectures = ["Qwen3_5MoeForCausalLM"] else: raise ValueError(f"Unsupported architecture: {self.original_arch}") # Qwen3ForCausalLM. 
@@ -1180,9 +1182,12 @@ def init_mrope_embedding(self, model_config: ModelConfig[PretrainedConfig]): mrope_section=config.rope_scaling.get("mrope_section", None), mrope_interleaved=config.rope_scaling.get("mrope_interleaved", False), ) + head_dim = getattr(config, "head_dim", None) + if not isinstance(head_dim, int): + head_dim = config.hidden_size // config.num_attention_heads self.rotary_emb = MRotaryEmbedding( pos_embd_params.rope, - head_dim=config.hidden_size // config.num_attention_heads, + head_dim=head_dim, is_neox=pos_embd_params.is_neox, mrope_section=pos_embd_params.mrope_section, mrope_interleaved=pos_embd_params.mrope_interleaved, @@ -1311,6 +1316,13 @@ def forward( ) deepstack_embeds = list(deepstack_buffer.unbind(0)) + # Preserve the pre-fusion token IDs. `fuse_input_embeds` collapses + # input_ids -> None when MM embeddings are fused in, but spec + # decoding (MTP / Eagle) still needs the original prompt token + # IDs for drafter context preparation; pass them through as a + # dedicated kwarg consumed by `SpecDecOneEngineForCausalLM.forward`. + orig_input_ids = input_ids + input_ids, input_embeds = fuse_input_embeds( self.llm.model.embed_tokens, input_ids, @@ -1327,8 +1339,14 @@ def forward( return_context_logits=return_context_logits, deepstack_embeds=deepstack_embeds, mrope_config=mrope_config, + spec_metadata=kwargs.get("spec_metadata"), + resource_manager=kwargs.get("resource_manager"), + orig_input_ids=orig_input_ids, ) - logger.debug(f"output shape: {output_prob.shape}") + # Spec-decoding (MTP / Eagle) returns a dict (accepted tokens, + # draft tokens, logits); plain forward returns a tensor. 
+ if hasattr(output_prob, "shape"): + logger.debug(f"output shape: {output_prob.shape}") return output_prob def _get_requests_with_mm_data(self, multimodal_params): diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py b/tensorrt_llm/_torch/models/modeling_speculative.py index 62683b3f62f2..552a8fbc4fbf 100755 --- a/tensorrt_llm/_torch/models/modeling_speculative.py +++ b/tensorrt_llm/_torch/models/modeling_speculative.py @@ -1765,12 +1765,17 @@ def forward( True, ) - spec_input_ids = input_ids + # VLM wrappers (e.g. Qwen3VLModelBase) replace input_ids with + # fused inputs_embeds; fall back to the pre-fusion token IDs + # they forward via `orig_input_ids` so MTP / Eagle drafters + # can still access the prompt tokens. + spec_input_ids = input_ids if input_ids is not None else kwargs.get( + "orig_input_ids") spec_position_ids = position_ids if attn_metadata.padded_num_tokens is not None: - if input_ids is not None: + if spec_input_ids is not None: # Slice along the first dimension - spec_input_ids = input_ids[:attn_metadata.num_tokens] + spec_input_ids = spec_input_ids[:attn_metadata.num_tokens] if position_ids is not None: spec_position_ids = _slice_spec_position_ids( position_ids, attn_metadata.num_tokens) diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py index 2ab710bf14ac..5ad48cb880a1 100644 --- a/tensorrt_llm/_torch/pyexecutor/config_utils.py +++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py @@ -1,11 +1,10 @@ import dataclasses -import re -from types import SimpleNamespace from typing import List, Optional import torch import transformers +from tensorrt_llm._utils import str_dtype_to_torch from tensorrt_llm.logger import logger @@ -21,6 +20,60 @@ def is_hybrid_linear(config): return is_nemotron_hybrid(config) or is_qwen3_hybrid(config) +def _coerce_torch_dtype(dtype): + """Normalize dtype values from HF configs into torch dtype objects. 
+ + HF configs may store dtype fields as torch dtypes, strings, or the sentinel + value "auto". Returning None for "auto" lets the caller keep its normal + fallback path instead of treating "auto" as a concrete dtype. + """ + if isinstance(dtype, torch.dtype): + return dtype + if dtype == "auto": + return None + if isinstance(dtype, str): + return str_dtype_to_torch(dtype) + return dtype + + +def resolve_hf_torch_dtype(config): + """Return the model's regular tensor dtype from common HF config fields. + + Transformers has used both dtype and torch_dtype across versions and model + families. This helper checks both names and coerces whichever one is present + into the form expected by TRT-LLM runtime code. An "auto" value in any + field is treated the same as missing, so scanning continues to the next + field instead of stopping with None. + """ + for attr in ("dtype", "torch_dtype"): + coerced = _coerce_torch_dtype(getattr(config, attr, None)) + if coerced is not None: + return coerced + return None + + +def resolve_mamba_ssm_cache_dtype(config): + """Return the dtype to use for hybrid Mamba/SSM cache allocations. + + Qwen3.5-style configs may store this field on the top-level config or the + nested text_config, and may call it either mamba_ssm_cache_dtype or + mamba_ssm_dtype. This helper centralizes that lookup so cache creation does + not fail later with a missing dtype. An "auto" value in any field is + treated the same as missing. 
+ """ + configs = [config] + text_config = getattr(config, "text_config", None) + if text_config is not None: + configs.append(text_config) + + for candidate_config in configs: + for attr in ("mamba_ssm_cache_dtype", "mamba_ssm_dtype"): + coerced = _coerce_torch_dtype(getattr(candidate_config, attr, None)) + if coerced is not None: + return coerced + return None + + def is_nemotron_hybrid(config): if hasattr(config, "hybrid_override_pattern" ) and config.hybrid_override_pattern is not None and len( @@ -249,8 +302,14 @@ def extract_mamba_kv_cache_params( full_attn_mask.extend([True] * num_spec_layers) mamba_mask.extend([False] * num_spec_layers) - mamba_ssm_cache_dtype = (quant_config.mamba_ssm_cache_dtype - if quant_config is not None else None) + mamba_ssm_cache_dtype = None + if quant_config is not None: + mamba_ssm_cache_dtype = _coerce_torch_dtype( + quant_config.mamba_ssm_cache_dtype) + if mamba_ssm_cache_dtype is None: + mamba_ssm_cache_dtype = (resolve_mamba_ssm_cache_dtype(config) + or resolve_hf_torch_dtype(config) + or torch.bfloat16) return MambaKVCacheParams( state_size=state_size, @@ -262,176 +321,11 @@ def extract_mamba_kv_cache_params( full_attention_layer_mask=full_attn_mask, num_mamba_layers=sum(mamba_mask), num_full_attention_layers=sum(full_attn_mask), - dtype=config.torch_dtype, + dtype=resolve_hf_torch_dtype(config) or torch.bfloat16, mamba_ssm_cache_dtype=mamba_ssm_cache_dtype, ) -class _Qwen35MoeVLMConfig(transformers.Qwen3NextConfig): - """Thin subclass that restores the top-level model_type for Qwen3.5 MoE. - - ``_Qwen35ConfigCompat`` normalizes the HF config into Qwen3NextConfig - (needed by the PyTorch backend model), but that loses the original - ``model_type``. The serving layer needs ``model_type = "qwen3_5_moe"`` - for ``MULTIMODAL_PLACEHOLDER_REGISTRY`` lookup; without it, - ``resolve_top_level_model_type`` returns ``"qwen3_next"`` and multimodal - requests fail with "Unknown modality". 
- - To remove: when ``_Qwen35ConfigCompat`` is removed and the PyTorch backend - consumes ``Qwen3_5MoeConfig`` directly. - """ - - model_type = "qwen3_5_moe" - - -class _Qwen35ConfigCompat: - """Temporary shim that normalizes Qwen3.5 HF configs into Qwen3NextConfig. - - To remove: delete this class and the elif branch in - load_pretrained_config that references it. - """ - - @staticmethod - def normalize(config_dict: dict) -> dict: - """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict.""" - text_config = _Qwen35ConfigCompat._extract_text_config(config_dict) - text_config = _Qwen35ConfigCompat._inherit_quantization_config( - config_dict, text_config) - text_config = _Qwen35ConfigCompat._flatten_rope(text_config) - - # Detect dense vs MoE and set architecture + MoE defaults accordingly - is_moe = "num_experts" in text_config and text_config["num_experts"] > 0 - if is_moe: - text_config["architectures"] = ["Qwen3_5MoeForCausalLM"] - else: - text_config["architectures"] = ["Qwen3_5ForCausalLM"] - # Ensure MoE fields are zeroed so Qwen3NextConfig defaults don't - # accidentally enable MoE for the dense model. 
- text_config.setdefault("num_experts", 0) - text_config.setdefault("num_experts_per_tok", 0) - text_config.setdefault("moe_intermediate_size", 0) - text_config.setdefault("shared_expert_intermediate_size", 0) - return text_config - - _VLM_ARCHITECTURES = { - "Qwen3_5MoeForConditionalGeneration", - "Qwen3_5ForConditionalGeneration", - } - - @staticmethod - def _extract_text_config(config_dict: dict) -> dict: - """Pull nested text_config from VLM checkpoints, or use dict as-is.""" - architectures = config_dict.get("architectures") or [] - if architectures and architectures[ - 0] in _Qwen35ConfigCompat._VLM_ARCHITECTURES: - text_config = dict(config_dict.get("text_config") or {}) - else: - text_config = dict(config_dict) - if not text_config: - raise ValueError("Qwen3.5 config is missing a usable text_config") - return text_config - - @staticmethod - def _inherit_quantization_config(config_dict: dict, - text_config: dict) -> dict: - """Copy top-level quantization_config into text_config with name normalization. - - Also adds a temporary workaround that keeps packed linear-attention - in_proj_qkvz on the bf16 path until FP8 block-scale TP loading is - fixed for that layout. - """ - if "quantization_config" in text_config: - return text_config - if "quantization_config" not in config_dict: - return text_config - - quantization_config = dict(config_dict["quantization_config"]) - if "modules_to_not_convert" in quantization_config: - modules = _Qwen35ConfigCompat._normalize_exclude_modules( - quantization_config["modules_to_not_convert"]) - modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround( - text_config, modules) - quantization_config["modules_to_not_convert"] = sorted(set(modules)) - text_config["quantization_config"] = quantization_config - return text_config - - @staticmethod - def _normalize_exclude_modules(modules: list[str]) -> list[str]: - """Translate HF quantization exclude-module paths to TRT-LLM names. - - - Strip model.language_model. prefix -> model. 
- - Drop model.visual.* and mtp.* entries - - Map split projection names to packed TRT-LLM names - """ - normalized = set() - for name in modules: - if name.startswith("model.language_model."): - name = "model." + name[len("model.language_model."):] - if name.startswith("model.visual.") or name.startswith("mtp."): - continue - name = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", name) - name = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", name) - normalized.add(name) - return sorted(normalized) - - @staticmethod - def _add_qkvz_bf16_workaround(text_config: dict, - modules: list[str]) -> list[str]: - """Keep packed linear-attention qkvz on bf16 path for all linear-attention layers. - - Temporary until FP8 block-scale TP loading is fixed for this layout. - """ - try: - layer_types = get_qwen3_hybrid_layer_types( - SimpleNamespace(**text_config)) - except (ValueError, AttributeError): - return modules - for layer_idx, layer_type in enumerate(layer_types): - if layer_type == "linear_attention": - modules.append( - f"model.layers.{layer_idx}.linear_attn.in_proj_qkvz") - return modules - - @staticmethod - def _flatten_rope(text_config: dict) -> dict: - """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling. - - Qwen3.5 nests these inside a rope_parameters dict and uses rope_type - instead of type in rope_scaling. Qwen3NextConfig expects them as - top-level fields with rope_scaling.type. 
- """ - rope_parameters = dict(text_config.pop("rope_parameters", {}) or {}) - rope_scaling = dict(text_config.get("rope_scaling") or {}) - if rope_parameters: - rope_theta = rope_parameters.pop("rope_theta", None) - if rope_theta is not None: - text_config.setdefault("rope_theta", rope_theta) - partial_rotary_factor = rope_parameters.pop("partial_rotary_factor", - None) - if partial_rotary_factor is not None: - text_config.setdefault("partial_rotary_factor", - partial_rotary_factor) - if rope_parameters: - rope_scaling = rope_parameters | rope_scaling - if rope_scaling: - has_mrope = ("mrope_section" in rope_scaling - or rope_scaling.get("mrope_interleaved", False)) - if has_mrope: - rope_scaling["type"] = "mrope" - rope_scaling.pop("rope_type", None) - elif "type" not in rope_scaling and "rope_type" in rope_scaling: - rope_type = rope_scaling.pop("rope_type") - # "default" means standard RoPE (no scaling) — don't set - # rope_scaling to avoid triggering scaling code paths. - if rope_type == "default": - rope_scaling = {} - else: - rope_scaling["type"] = rope_type - if rope_scaling: - text_config["rope_scaling"] = rope_scaling - return text_config - - # TODO: remove this once the transformers can support all of those models in _CONFIG_REGISTRY class LazyConfigDict(dict): @@ -462,6 +356,16 @@ def load_pretrained_config(model_name_or_path: str, MistralConfigLoader model_config = MistralConfigLoader().load( model_name_or_path).pretrained_config + elif (model_type == "qwen3_5_moe" and + (("text_config" in config_dict and "vision_config" in config_dict) or + (architectures + and architectures[0] == "Qwen3_5MoeForConditionalGeneration"))): + # Qwen3.5-MoE VLM: HF native composite config + model-side normalizer. 
+ from tensorrt_llm._torch.models.modeling_qwen3_5 import \ + _normalize_qwen35_moe_vl_config + model_config = transformers.Qwen3_5MoeConfig.from_pretrained( + model_name_or_path, **kwargs) + _normalize_qwen35_moe_vl_config(model_config) elif model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] model_config = config_class.from_pretrained(model_name_or_path, @@ -474,11 +378,11 @@ def load_pretrained_config(model_name_or_path: str, "Qwen3_5ForCausalLM", "Qwen3_5ForConditionalGeneration", )): - normalized = _Qwen35ConfigCompat.normalize(config_dict) - if model_type in ("qwen3_5_moe", "qwen3_5_moe_text"): - model_config = _Qwen35MoeVLMConfig.from_dict(normalized) - else: - model_config = transformers.Qwen3NextConfig.from_dict(normalized) + # Qwen3.5 text-only: flatten to Qwen3NextConfig via the model-side shim. + from tensorrt_llm._torch.models.modeling_qwen3_5 import \ + Qwen35ConfigCompat + model_config = transformers.Qwen3NextConfig.from_dict( + Qwen35ConfigCompat.normalize(config_dict)) elif (model_type == "exaone4" and config_dict.get("sliding_window") is None and config_dict.get("layer_types") is None): # transformers 5.5.x Exaone4Config.__post_init__ first forces diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index 2c955793ba61..3002606ef44d 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -32,6 +32,7 @@ MoeLoadBalancer, maybe_create_moe_load_balancer) from ..virtual_memory import RestoreMode from ..virtual_memory import scope as virtual_memory_scope +from .config_utils import resolve_hf_torch_dtype, resolve_mamba_ssm_cache_dtype _KV_CACHE_MAP = { "fp8": QuantAlgo.FP8.value, @@ -47,12 +48,10 @@ def validate_and_set_mamba_ssm_cache_dtype( mamba_ssm_stochastic_rounding: bool = False, mamba_ssm_philox_rounds: int = 10) -> None: if mamba_ssm_cache_dtype == "auto": - hf_dtype = getattr(config.pretrained_config, 
"mamba_ssm_cache_dtype", - None) - if hf_dtype is not None: - mamba_ssm_cache_dtype = str_dtype_to_torch(hf_dtype) - else: - mamba_ssm_cache_dtype = config.pretrained_config.torch_dtype + mamba_ssm_cache_dtype = ( + resolve_mamba_ssm_cache_dtype(config.pretrained_config) + or resolve_hf_torch_dtype(config.pretrained_config) + or config.torch_dtype) else: mamba_ssm_cache_dtype = str_dtype_to_torch(mamba_ssm_cache_dtype) diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml index e2cbb94aadfa..a7a0cc0aa8ce 100644 --- a/tests/integration/defs/accuracy/references/mmmu.yaml +++ b/tests/integration/defs/accuracy/references/mmmu.yaml @@ -72,8 +72,12 @@ Qwen/Qwen3-VL-8B-Instruct: mistralai/Mistral-Small-3.1-24B-Instruct-2503: - accuracy: 57.0 Qwen/Qwen3.5-35B-A3B: + # The default accuracy for `test_auto_dtype` tests. + - accuracy: 59.0 - dtype: bfloat16 accuracy: 60.444 + - quant_algo: FP8_BLOCK_SCALES + accuracy: 58.889 # Kimi K2.5 multimodal (MoonViT + DeepSeek-V3 MoE backbone, ~1T params). # Values below are measured with NVFP4 checkpoint (thinking mode enabled). moonshotai/Kimi-K2.5: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py index bdcfee64ce4f..31d61af3d004 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py @@ -480,6 +480,45 @@ def test_nvfp4_4gpus( task.evaluate(llm, sampling_params=self.sampling_params) +# Qwen3.5-MoE-VL is hybrid (Mamba + attention); +# the FlashInfer GDN prefill kernel is sm90+ only. 
+@skip_pre_hopper +@pytest.mark.skip_less_device_memory(80000) +class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness): + MODEL_NAME = "Qwen/Qwen3.5-35B-A3B" + MODEL_PATH = f"{llm_models_root()}/Qwen3.5-35B-A3B" + MAX_NUM_TOKENS = 16384 + MAX_BATCH_SIZE = 32 + + sampling_params = SamplingParams( + max_tokens=MAX_NUM_TOKENS, + truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, + stop="<|endoftext|>", + ) + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False) + + def _make_llm(self, model_path: str) -> LLM: + return LLM( + model_path, + max_num_tokens=self.MAX_NUM_TOKENS, + max_batch_size=self.MAX_BATCH_SIZE, + kv_cache_config=self.kv_cache_config, + ) + + def test_auto_dtype(self) -> None: + with self._make_llm(self.MODEL_PATH) as llm: + task = MMMU(self.MODEL_NAME) + task.evaluate(llm, sampling_params=self.sampling_params) + + def test_fp8_prequantized(self) -> None: + model_path = f"{llm_models_root()}/Qwen3.5-35B-A3B-FP8" + with self._make_llm(model_path) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + task = MMMU(self.MODEL_NAME) + task.evaluate(llm, sampling_params=self.sampling_params) + + class TestQwen3VL(LlmapiAccuracyTestHarness): MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct" MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-VL-8B-Instruct" diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 4be9bf6364b1..33308801e2f2 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -825,6 +825,8 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_fp8_block_scales[ accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_fp8_block_scales[mtp_nextn=3] TIMEOUT (120) accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_nvfp4[mtp_nextn=0] TIMEOUT (120) accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_nvfp4[mtp_nextn=3] 
TIMEOUT (120) +accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_fp8_prequantized accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray unittest/disaggregated/test_openai_disagg_server.py diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 3dde117a84cb..32262b4b6707 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -57,6 +57,11 @@ l0_h100: - unittest/_torch/modeling -k "modeling_gpt_oss" - unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_sanity - unittest/_torch/modeling/test_multimodal_encoder_graph.py + # Qwen3.5-MoE-VL is hybrid (Mamba SSM + attention); FlashInfer's + # chunk_gated_delta_rule GDN prefill kernel is sm90+ only, so this + # test must run on Hopper-or-newer GPUs. Peer Qwen3-VL / Qwen3-VL-MoE + # tests stay on L40s because they're pure attention and don't trigger the GDN kernel. 
+ - unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py::TestQwen3_5MoeVL::test_all - unittest/disaggregated/test_disagg_utils.py - unittest/disaggregated/test_router.py - unittest/disaggregated/test_remoteDictionary.py diff --git a/tests/unittest/_torch/modeling/test_modeling_multimodal.py b/tests/unittest/_torch/modeling/test_modeling_multimodal.py index 5be764ca59d4..00d967fac180 100644 --- a/tests/unittest/_torch/modeling/test_modeling_multimodal.py +++ b/tests/unittest/_torch/modeling/test_modeling_multimodal.py @@ -17,6 +17,12 @@ from tensorrt_llm._torch.attention_backend.utils import get_attention_backend from tensorrt_llm._torch.metadata import KVCacheParams from tensorrt_llm._torch.model_config import ModelConfig +from tensorrt_llm._torch.pyexecutor.config_utils import ( + extract_mamba_kv_cache_params, + is_nemotron_hybrid, + is_qwen3_hybrid, +) +from tensorrt_llm._torch.pyexecutor.mamba_cache_manager import CppMambaHybridCacheManager from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager from tensorrt_llm._utils import str_dtype_to_torch from tensorrt_llm.bindings.executor import KvCacheConfig @@ -27,6 +33,7 @@ prompt_inputs, ) from tensorrt_llm.inputs.multimodal import MultimodalParams, MultimodalRuntimeData +from tensorrt_llm.llmapi.llm_args import KvCacheConfig as PyKvCacheConfig from tensorrt_llm.mapping import Mapping @@ -520,6 +527,13 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario): Note: This method uses get_kv_cache_config() to obtain configuration. Override get_kv_cache_config() to customize cache settings. + + For hybrid linear-attention models (Qwen3Next, Qwen3.5, + Nemotron-Hybrid) this dispatches to + `get_hybrid_kv_cache_manager` so the linear-attention layers + get a `CppMambaHybridCacheManager` for SSM/conv state. + Mirrors the production dispatch in + `_util.py:_create_kv_cache_manager`. 
""" # Get cache configuration from the configurable method cache_config = self.get_kv_cache_config(scenario) @@ -529,17 +543,114 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario): num_blocks = (max_seq_len + tokens_per_block - 1) // tokens_per_block - self.kv_cache_manager = self.get_kv_cache_manager( - dtype=self.model_config.pretrained_config.torch_dtype, - config=self.model_config.pretrained_config, + config = self.model_config.pretrained_config + text_config = getattr(config, "text_config", config) + + if is_qwen3_hybrid(text_config) or is_nemotron_hybrid(text_config): + self.kv_cache_manager = self.get_hybrid_kv_cache_manager( + text_config=text_config, + tokens_per_block=tokens_per_block, + max_seq_len=max_seq_len, + batch_size=batch_size, + num_blocks=num_blocks, + ) + else: + self.kv_cache_manager = self.get_kv_cache_manager( + dtype=self.model_config.pretrained_config.torch_dtype, + config=self.model_config.pretrained_config, + tokens_per_block=tokens_per_block, + max_seq_len=max_seq_len, + batch_size=batch_size, + num_blocks=num_blocks, + ) + + self.kv_cache_manager.add_dummy_requests( + request_ids=[1], + token_nums=[max_seq_len], + **self._dummy_request_kwargs(scenario), + ) + + def _dummy_request_kwargs(self, scenario: MultimodalScenario) -> Dict: + """Optional override hook for extra kwargs to `add_dummy_requests`. + + Subclasses for mRoPE-using models (Qwen2.5-VL, Qwen3-VL, Qwen3.5-VL, + …) should return `{"use_mrope": True}` here so the cache manager + allocates the mRoPE position-id buffer at dummy-request time. + Defaults to an empty dict, preserving existing behavior for tests + that don't care. + """ + return {} + + def get_hybrid_kv_cache_manager( + self, + text_config: PretrainedConfig, + tokens_per_block: int, + max_seq_len: int, + batch_size: int, + num_blocks: int, + ): + """Build a `CppMambaHybridCacheManager` for hybrid linear-attention + models (Qwen3Next, Qwen3.5, Nemotron-Hybrid). 
+ + Mirrors the production construction in + `_util.py:_create_kv_cache_manager` for `is_qwen3_hybrid` / + `is_nemotron_hybrid` configs: pulls the state-shape / dtype / + layer-mask parameters from `extract_mamba_kv_cache_params` and + threads them through the constructor. Tests that need a different + concrete manager (e.g. `MixedMambaHybridCacheManager` for + disagg-style coverage) should override this method. + """ + dtype_map = { + torch.half: tensorrt_llm.bindings.DataType.HALF, + torch.float16: tensorrt_llm.bindings.DataType.HALF, + torch.bfloat16: tensorrt_llm.bindings.DataType.BF16, + } + + mamba_params = extract_mamba_kv_cache_params(text_config) + if mamba_params.dtype not in dtype_map: + raise ValueError( + f"Unsupported dtype for hybrid cache manager: " + f"{mamba_params.dtype}. Supported: {list(dtype_map.keys())}" + ) + kv_cache_dtype = dtype_map[mamba_params.dtype] + + head_dim = getattr(text_config, "head_dim", None) + if not isinstance(head_dim, int): + head_dim = text_config.hidden_size // text_config.num_attention_heads + + # CppMambaHybridCacheManager reads Pydantic-only fields + # (mamba_state_cache_interval, enable_block_reuse) so we have to + # construct the llmapi.llm_args.KvCacheConfig here, not the C++ + # bindings KvCacheConfig that the standard KVCacheManager path uses. 
+ kv_cache_config = PyKvCacheConfig(max_tokens=num_blocks * tokens_per_block) + mapping = Mapping(world_size=1, tp_size=1, rank=0) + + return CppMambaHybridCacheManager( + # mamba cache parameters (positional) + mamba_params.state_size, + mamba_params.conv_kernel, + mamba_params.num_heads, + mamba_params.n_groups, + mamba_params.head_dim, + mamba_params.num_mamba_layers, + mamba_params.mamba_layer_mask, + mamba_params.dtype, + mamba_params.mamba_ssm_cache_dtype, + # kv cache parameters (positional) + kv_cache_config, + tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF, + # kw-only + num_layers=mamba_params.num_full_attention_layers, + layer_mask=mamba_params.full_attention_layer_mask, + num_kv_heads=text_config.num_key_value_heads, + head_dim=head_dim, tokens_per_block=tokens_per_block, max_seq_len=max_seq_len, - batch_size=batch_size, - num_blocks=num_blocks, + max_batch_size=batch_size, + mapping=mapping, + dtype=kv_cache_dtype, ) - self.kv_cache_manager.add_dummy_requests(request_ids=[1], token_nums=[max_seq_len]) - def get_max_num_tokens(self, scenario: MultimodalScenario) -> int: """Get maximum number of tokens for attention metadata.""" if scenario.chunked_prefill: @@ -697,6 +808,14 @@ def setUp(self): # TODO: Add multi-GPU support self.device = torch.device("cuda:0") + # Pre-initialize fields that tearDown / setup_scenario expect to + # exist. Without this, a test method that doesn't run + # setup_scenario (e.g. a setUp-only smoke test) leaves + # self.kv_cache_manager unset and tearDown errors with + # AttributeError on the ``is not None`` check. + self.kv_cache_manager = None + self.attn_metadata = None + self.hf_config = self.create_hf_config() if self.skip_hf_inference: # Create a dummy torch module if skipping HF inference. 
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py new file mode 100644 index 000000000000..0349bae2264a --- /dev/null +++ b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py @@ -0,0 +1,452 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +from copy import deepcopy +from pathlib import Path +from typing import List, Optional + +import torch +import transformers +from test_modeling_multimodal import MultimodalScenario, TestModelingMultimodal +from transformers import Qwen3_5MoeForConditionalGeneration as HFQwen3_5MoeForConditionalGeneration +from utils.llm_data import llm_models_root +from utils.util import skip_pre_hopper + +from tensorrt_llm._torch.model_config import ModelConfig +from tensorrt_llm._torch.models import Qwen3_5MoeVLModel +from tensorrt_llm._torch.models.checkpoints.auto_mapper import AutoCheckpointMapper +from tensorrt_llm._torch.models.checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper +from tensorrt_llm._torch.models.modeling_auto import AutoModelForCausalLM +from tensorrt_llm._torch.models.modeling_qwen3_5 import _normalize_qwen35_moe_vl_config +from tensorrt_llm._torch.pyexecutor.config_utils import ( + extract_mamba_kv_cache_params, + load_pretrained_config, +) +from tensorrt_llm._torch.pyexecutor.model_loader import validate_and_set_mamba_ssm_cache_dtype +from tensorrt_llm.inputs import ContentFormat +from tensorrt_llm.inputs.registry import MULTIMODAL_PLACEHOLDER_REGISTRY + + +def _write_qwen35_moe_vl_config(tmp_path: Path) -> Path: + config = { + "architectures": ["Qwen3_5MoeForConditionalGeneration"], + "image_token_id": 248056, + "model_type": "qwen3_5_moe", + "text_config": { + "attention_bias": False, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 
151645, + "full_attention_interval": 4, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "linear_conv_kernel_dim": 4, + "linear_key_head_dim": 128, + "linear_num_key_heads": 16, + "linear_num_value_heads": 32, + "linear_value_head_dim": 128, + "mamba_ssm_dtype": "float32", + "max_position_embeddings": 262144, + "mlp_only_layers": [], + "model_type": "qwen3_5_moe_text", + "moe_intermediate_size": 512, + "norm_topk_prob": True, + "num_attention_heads": 32, + "num_experts": 128, + "num_experts_per_tok": 8, + "num_hidden_layers": 2, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-6, + "shared_expert_intermediate_size": 512, + "rope_parameters": { + "mrope_section": [11, 11, 10], + "partial_rotary_factor": 0.25, + "rope_theta": 1000000.0, + "rope_type": "default", + }, + "use_cache": True, + "vocab_size": 151936, + }, + "tie_word_embeddings": False, + "video_token_id": 248057, + "vision_config": { + "deepstack_visual_indexes": [8, 16, 24], + "depth": 27, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "in_channels": 3, + "intermediate_size": 4304, + "model_type": "qwen3_5_moe", + "num_heads": 16, + "num_position_embeddings": 2304, + "out_hidden_size": 2048, + "patch_size": 16, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + "vision_end_token_id": 248054, + "vision_start_token_id": 248053, + } + (tmp_path / "config.json").write_text(json.dumps(config), encoding="utf-8") + return tmp_path + + +def test_qwen35_moe_vl_config_preserves_vlm_architecture( + tmp_path: Path, +) -> None: + config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path))) + + assert isinstance(config, transformers.Qwen3_5MoeConfig) + assert config.architectures == ["Qwen3_5MoeForConditionalGeneration"] + assert config.text_config.architectures == ["Qwen3_5MoeForCausalLM"] + assert config.text_config.num_experts == 128 + assert config.text_config.intermediate_size == 4608 + assert config.text_config.rope_theta == 1000000.0 + assert 
config.text_config.partial_rotary_factor == 0.25 + assert config.text_config.rope_scaling["type"] == "mrope" + assert config.text_config.rope_scaling["mrope_section"] == [11, 11, 10] + assert config.text_config.mamba_ssm_dtype == "float32" + assert config.get_text_config() is config.text_config + + +def test_qwen35_moe_vl_resolves_mamba_ssm_cache_dtype( + tmp_path: Path, +) -> None: + config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path))) + model_config = ModelConfig(pretrained_config=config) + + validate_and_set_mamba_ssm_cache_dtype(model_config, "auto") + assert model_config.quant_config.mamba_ssm_cache_dtype is torch.float32 + + mamba_params = extract_mamba_kv_cache_params( + config.text_config, + quant_config=model_config.quant_config, + ) + assert mamba_params.dtype is torch.bfloat16 + assert mamba_params.mamba_ssm_cache_dtype is torch.float32 + + +def test_qwen35_moe_vl_resolves_model_and_mapper(tmp_path: Path) -> None: + config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path))) + model_config = ModelConfig(pretrained_config=config) + + assert AutoModelForCausalLM._resolve_class(model_config) is Qwen3_5MoeVLModel + assert isinstance( + AutoCheckpointMapper.get("HF", "Qwen3_5MoeForConditionalGeneration"), + Qwen3_5MoeHfWeightMapper, + ) + + +def test_qwen35_moe_vl_placeholder_metadata_registered() -> None: + metadata = MULTIMODAL_PLACEHOLDER_REGISTRY.get_placeholder_metadata("qwen3_5_moe") + + assert metadata.placeholder_map == { + "image": "<|vision_start|><|image_pad|><|vision_end|>", + "video": "<|vision_start|><|video_pad|><|vision_end|>", + } + assert metadata.placeholders_separator == "" + assert metadata.content_format is ContentFormat.STRING + + +# --- Layered parity test scaffold ------------------------------------------- +# +# Tiny synthetic config used by TestQwen3_5MoeVL below. Same architecture as +# the real Qwen/Qwen3.5-35B-A3B checkpoint but with much smaller dimensions +# where possible. 
+# +# Shapes that have to match real Qwen3.5 (can't shrink without breaking +# things downstream): +# +# - head_dim=256, partial_rotary_factor=0.25 --> rotary tensor width is +# `head_dim * 0.25 / 2 = 32`, which equals `sum(mrope_section)`. +# A smaller head_dim (e.g. 128) yields a 16-wide tensor that mRoPE +# can't split with section sum 32. +# - num_attention_heads=16, num_key_value_heads=2 match the real +# model's 8:1 GQA layout; Q proj is 2048 --> 4096, K/V are 2048 --> 512. +# - Vision deepstack indices [8, 16, 24] match the real config, and +# depth=27 is the smallest value that hosts those indices. Disabling +# deepstack (indices=[], depth=2) produces fewer vision embeddings +# than the HF processor reserves placeholder tokens for, which +# breaks `fuse_input_embeds`. +# - vocab_size=248320 matches the real Qwen3.5 tokenizer. The +# tokenizer (loaded via _name_or_path) emits special-token ids in +# the 248k range; `fuse_input_embeds` uses `vocab_size` as the +# OOV threshold to identify image-pad tokens. A smaller vocab_size +# would misclassify regular chat-template specials as mm tokens +# and trip the placeholder/embedding count check. +# +# Shapes that can be shrunk for tests: +# +# - num_hidden_layers: 2 (vs 40+). +# - num_experts: 128 (vs 256). Still moderate so MoE routing runs. +# - full_attention_interval=2 with 2 LM layers yields the pattern +# [linear_attention, full_attention] — one of each kind, exercising +# both the regular KV cache and the Mamba SSM/conv state via the +# base-class dispatch. +# +# `_name_or_path` points at the real checkpoint dir so the test can load +# the tokenizer/processor (only the processor; not the full model weights). 
+QWEN3_5_VL_MOE_PARITY_CONFIG = { + "architectures": ["Qwen3_5MoeForConditionalGeneration"], + "image_token_id": 248056, + "model_type": "qwen3_5_moe", + "text_config": { + "attention_bias": False, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151645, + "full_attention_interval": 2, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 2048, + "linear_conv_kernel_dim": 4, + "linear_key_head_dim": 128, + "linear_num_key_heads": 16, + "linear_num_value_heads": 32, + "linear_value_head_dim": 128, + "mamba_ssm_dtype": "float32", + "max_position_embeddings": 8192, + "mlp_only_layers": [], + "model_type": "qwen3_5_moe_text", + "moe_intermediate_size": 512, + "norm_topk_prob": True, + "num_attention_heads": 16, + "num_experts": 128, + "num_experts_per_tok": 8, + "num_hidden_layers": 2, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-6, + "shared_expert_intermediate_size": 512, + "rope_parameters": { + "mrope_section": [11, 11, 10], + "partial_rotary_factor": 0.25, + "rope_theta": 1000000.0, + "rope_type": "default", + }, + "use_cache": True, + "vocab_size": 248320, + }, + "tie_word_embeddings": False, + "video_token_id": 248057, + "vision_config": { + "deepstack_visual_indexes": [8, 16, 24], + "depth": 27, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "in_channels": 3, + "initializer_range": 0.02, + "intermediate_size": 4304, + "model_type": "qwen3_5_moe", + "num_heads": 16, + "num_position_embeddings": 2304, + "out_hidden_size": 2048, + "patch_size": 16, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + "vision_end_token_id": 248054, + "vision_start_token_id": 248053, + "_name_or_path": str(os.path.join(llm_models_root(), "Qwen3.5-35B-A3B")), +} + + +@skip_pre_hopper +class TestQwen3_5MoeVL(TestModelingMultimodal): + """Forward-parity test for Qwen3.5-MoE-VL against HuggingFace. 
+ + Tiny-synthetic-config parity test in the same shape as + `TestQwen3VLMoe` / `TestQwen2_5VL`: both stacks are constructed + from `QWEN3_5_VL_MOE_PARITY_CONFIG` (2 LM layers, 1 linear + 1 full + attention, 128 experts, 27 vision layers), HF weights are copied + into TRT-LLM via `Qwen3_5MoeHfWeightMapper`, then `test_all` + sweeps the default `MultimodalScenario`s comparing last-position + logits at context + generation phases. + + Two-config design: + - `self.hf_config` is the raw `Qwen3_5MoeConfig.from_dict(...)` + result. HF model construction sees the native HF schema + (`rope_parameters` intact with `rope_type`, + `moe_intermediate_size`, …). + - TRT-LLM gets a deep-copied + normalized version via the + `create_trtllm_model` override below. That copy goes through + `_normalize_qwen35_moe_vl_config` exactly the same way + production `load_pretrained_config` does, so the Qwen3Next + runtime sees the flat aliases it expects + (`intermediate_size`, `rope_theta`, `rope_scaling`, …). + + Keeping the two configs separate means the production normalizer + doesn't need to be HF-safe — production only ever constructs the + TRT-LLM model from a normalized config, and the test mirrors that + boundary explicitly. The hybrid-cache path is handled by the base + class's `init_kv_cache_manager` dispatch on + `is_qwen3_hybrid` / `is_nemotron_hybrid`. + """ + + def get_model_config(self): + return QWEN3_5_VL_MOE_PARITY_CONFIG + + def get_trtllm_model_class(self): + return Qwen3_5MoeVLModel + + def get_hf_model_class(self): + return HFQwen3_5MoeForConditionalGeneration + + def get_weight_mapper_class(self): + return Qwen3_5MoeHfWeightMapper + + def get_model_type(self): + return "qwen3_5_moe" + + def get_model_config_class(self): + return transformers.Qwen3_5MoeConfig + + def create_trtllm_model( + self, + load_weights: bool = False, + hf_model_state_dict: Optional[dict] = None, + **kwargs, + ): + """Build the TRT-LLM model from a *normalized copy* of `self.hf_config`. 
+ + Mirrors the base-class body but swaps in + `_normalize_qwen35_moe_vl_config(trtllm_config)` before + wrapping in `ModelConfig`. `self.hf_config` itself stays + raw so the HF model that the base class builds in `setUp` + sees native HF schema. + """ + trtllm_config = deepcopy(self.hf_config) + _normalize_qwen35_moe_vl_config(trtllm_config) + + model_config = ModelConfig(pretrained_config=trtllm_config) + model_class = self.get_trtllm_model_class() + model = model_class(model_config, **kwargs).to("cuda") + + if load_weights: + weight_mapper = self.get_weight_mapper_class()() + weight_mapper.init_model_and_config(model, trtllm_config) + model.load_weights(hf_model_state_dict, weight_mapper) + + for module in model.modules(): + if hasattr(module, "post_load_weights") and not getattr( + module, "_weights_removed", False + ): + module.post_load_weights() + + return model, model_config + + def _dummy_request_kwargs(self, scenario): + """Qwen3.5-VL uses mRoPE; the cache manager needs the mRoPE + position-id buffer allocated at dummy-request time.""" + return {"use_mrope": True} + + def get_tolerance(self): + """Tighten `rtol` to `0.1` (4x tighter than the base 0.4 + default) while keeping `atol` at `0.4` to absorb single-logit + tail outliers seen on `multiple_image` / `video`. + """ + return 0.4, 0.1 + + def get_trtllm_inputs( + self, + input_ids, + multimodal_params_list, + is_gen: bool = False, + num_cached_tokens_per_seq: Optional[List[int]] = None, + total_prompt_len: Optional[int] = None, + ): + """Override position_ids with mRoPE position IDs from the + multimodal params. Same pattern as `TestQwen3VLMoe` — the + VLM wrapper feeds mRoPE-shaped position IDs to the decoder, + not the simple range-based default the base class produces. 
+ """ + trtllm_inputs = super().get_trtllm_inputs( + input_ids, + multimodal_params_list, + is_gen, + num_cached_tokens_per_seq, + total_prompt_len=total_prompt_len, + ) + + if is_gen: + mrope_gen_position_ids = [] + for multimodal_param in multimodal_params_list: + mrope_gen_position_ids.append( + multimodal_param.multimodal_data["mrope_config"]["mrope_position_deltas"] + ) + mrope_gen_position_ids = torch.cat(mrope_gen_position_ids, dim=-1).to(self.device) + trtllm_inputs["position_ids"] = ( + (trtllm_inputs["position_ids"] + mrope_gen_position_ids).expand(3, -1, 1).cuda() + ) + gen_multimodal_params_list = [] + for multimodal_param in multimodal_params_list: + multimodal_param.strip_for_generation() + multimodal_param.to_device( + "multimodal_data", + self.device, + pin_memory=True, + target_keywords=["mrope_config.mrope_position_deltas"], + ) + gen_multimodal_params_list.append(multimodal_param) + trtllm_inputs["multimodal_params"] = gen_multimodal_params_list + else: + mrope_position_ids = [] + for multimodal_param in multimodal_params_list: + mrope_position_ids.append( + multimodal_param.multimodal_data["mrope_config"]["mrope_position_ids"] + ) + position_ids = torch.cat(mrope_position_ids, dim=-1).cuda() + trtllm_inputs["position_ids"] = position_ids + + return trtllm_inputs + + def get_scenarios(self) -> List[MultimodalScenario]: + """Modality-sanity sweep (image / multiple_image / video). + + These three catch differences in placeholder counts and the + multimodal-cumsum path between single-image, multi-image, and + video inputs. + + CUDA-graph capture is intentionally not exercised here. The + standard `attn_metadata.create_cuda_graph_metadata` path only + addresses attention metadata; the Mamba SSM state buffer of the + hybrid (Mamba + attention) cache is not threaded through, so + replayed logits diverge from the HF reference. Adding that path + is dedicated harness work and tracked separately. 
+ """ + return [ + MultimodalScenario( + modality="image", + use_cuda_graph=False, + chunked_prefill=False, + kv_cache_reuse=False, + ), + MultimodalScenario( + modality="multiple_image", + use_cuda_graph=False, + chunked_prefill=False, + kv_cache_reuse=False, + ), + MultimodalScenario( + modality="video", + use_cuda_graph=False, + chunked_prefill=False, + kv_cache_reuse=False, + ), + ] + + def test_construction_and_weight_loading_smoke(self): + """Smoke test: setUp built HF + TRT-LLM models and copied HF + weights into TRT-LLM via the weight mapper. Detailed + assertions on the normalizer's outputs live in the routing + tests above (e.g. `test_qwen35_moe_vl_config_preserves_vlm_architecture`) + — this one just confirms construction reached the end without + exception. + """ + self.assertIsNotNone(self.hf_model) + self.assertIsNotNone(self.trtllm_model) + self.assertIsNotNone(self.model_config)