diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md index 685680f5b0b0..e14d62b6f7c6 100644 --- a/docs/source/models/supported-models.md +++ b/docs/source/models/supported-models.md @@ -97,7 +97,6 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl | `Qwen2_5_VLForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | L + I + V | | `Qwen3VLForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | L + I + V | | `Qwen3VLMoeForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | L + I + V | -| `Qwen3_5MoeForConditionalGeneration` | Yes | Yes | Untested | Yes | Yes | No | Untested | Yes | L + I + V | Note: - L: Language diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py index 8f67788bba40..9c3b032421b2 100644 --- a/tensorrt_llm/_torch/models/__init__.py +++ b/tensorrt_llm/_torch/models/__init__.py @@ -37,8 +37,7 @@ Qwen2ForRewardModel) from .modeling_qwen2vl import Qwen2_5_VLModel, Qwen2VLModel from .modeling_qwen3 import Qwen3ForCausalLM -from .modeling_qwen3_5 import (Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM, - Qwen3_5MoeVLModel) +from .modeling_qwen3_5 import Qwen3_5ForCausalLM, Qwen3_5MoeForCausalLM from .modeling_qwen3_moe import Qwen3MoeForCausalLM from .modeling_qwen3_next import Qwen3NextForCausalLM from .modeling_qwen3vl import Qwen3VLModel @@ -91,7 +90,6 @@ "Qwen3MoeForCausalLM", "Qwen3_5ForCausalLM", "Qwen3_5MoeForCausalLM", - "Qwen3_5MoeVLModel", "Qwen3NextForCausalLM", "Qwen3MoeVLModel", "GptOssForCausalLM", diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py index 65e0168bec55..fa2f161bdc4f 100644 --- a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py +++ b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_5_weight_mapper.py @@ -13,7 +13,6 @@ @register_mapper("HF", "Qwen3_5MoeForCausalLM") -@register_mapper("HF", "Qwen3_5MoeForConditionalGeneration") @register_mapper("HF", "Qwen3_5ForCausalLM") class Qwen3_5MoeHfWeightMapper(Qwen3NextHfWeightMapper): """Weight mapper for Qwen3.5 MoE text checkpoints. diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py index 4f325dbb0bcb..bf83e916db29 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_5.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_5.py @@ -1,29 +1,7 @@ import re -from types import SimpleNamespace -from typing import Dict, List -import torch -from transformers import PretrainedConfig - -from ...inputs import ( - ContentFormat, - MultimodalPlaceholderMetadata, - MultimodalPlaceholderPlacement, - register_input_processor, - support_multimodal_disaggregated, -) -from ..pyexecutor.config_utils import get_qwen3_hybrid_layer_types -from .checkpoints.base_weight_mapper import BaseWeightMapper -from .checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper -from .modeling_multimodal_utils import _is_disagg from .modeling_qwen3_next import Qwen3NextForCausalLM -from .modeling_qwen3vl import ( - Qwen3VisionModel, - Qwen3VisionModelBase, - Qwen3VLInputProcessorBase, - Qwen3VLModelBase, -) -from .modeling_utils import ModelConfig, register_auto_model, register_vision_encoder +from .modeling_utils import register_auto_model _LANG_PREFIX = "model.language_model." @@ -73,248 +51,6 @@ def _translate_mtp_pattern(name, n_hidden_layers): return None -# --- Config adapters -------------------------------------------------------- -# -# These run from `load_pretrained_config` in -# `tensorrt_llm/_torch/pyexecutor/config_utils.py` via lazy import — the -# runtime layer asks the model module how to load its own config. -# -# There are two entry points: -# - `Qwen35ConfigCompat.normalize(config_dict)` — for text-only -# Qwen3.5 (MoE and dense). Returns a dict that -# `transformers.Qwen3NextConfig.from_dict(...)` can consume, so the -# existing Qwen3Next runtime is reused unchanged. -# - `_normalize_qwen35_moe_vl_config(model_config)` — for the -# Qwen3.5-MoE VLM. Mutates the HF-native `transformers.Qwen3_5MoeConfig` -# in place, attaching the runtime aliases the Qwen3Next-based LM expects -# while keeping `text_config` / `vision_config` composite. - - -class Qwen35ConfigCompat: - """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig. - - We normalize to `Qwen3NextConfig` (rather than to a Qwen3.5-native - schema) so the runtime can reuse the existing `Qwen3NextForCausalLM` - model implementation unchanged — Qwen3.5 text is structurally identical - to Qwen3Next, so matching the config schema lets the same code serve - both. - - This is used for Qwen3.5 text-only configs and for shared helper logic such - as RoPE and quantization exclude-module normalization. Qwen3.5-MoE VLM - configs should stay composite and use transformers.Qwen3_5MoeConfig plus - _normalize_qwen35_moe_vl_config instead. - - To remove: delete this class and the elif branch in - load_pretrained_config that flattens Qwen3.5 text configs. - """ - - @staticmethod - def normalize(config_dict: dict) -> dict: - """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict.""" - text_config = Qwen35ConfigCompat._extract_text_config(config_dict) - text_config = Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config) - text_config = Qwen35ConfigCompat._flatten_rope(text_config) - - # Detect dense vs MoE and set architecture + MoE defaults accordingly - is_moe = "num_experts" in text_config and text_config["num_experts"] > 0 - if is_moe: - text_config["architectures"] = ["Qwen3_5MoeForCausalLM"] - else: - text_config["architectures"] = ["Qwen3_5ForCausalLM"] - # Ensure MoE fields are zeroed so Qwen3NextConfig defaults don't - # accidentally enable MoE for the dense model. - text_config.setdefault("num_experts", 0) - text_config.setdefault("num_experts_per_tok", 0) - text_config.setdefault("moe_intermediate_size", 0) - text_config.setdefault("shared_expert_intermediate_size", 0) - return text_config - - _VLM_ARCHITECTURES = { - "Qwen3_5MoeForConditionalGeneration", - "Qwen3_5ForConditionalGeneration", - } - - @staticmethod - def _extract_text_config(config_dict: dict) -> dict: - """Pull nested text_config from VLM checkpoints, or use dict as-is.""" - architectures = config_dict.get("architectures") or [] - if architectures and architectures[0] in Qwen35ConfigCompat._VLM_ARCHITECTURES: - text_config = dict(config_dict.get("text_config") or {}) - else: - text_config = dict(config_dict) - if not text_config: - raise ValueError("Qwen3.5 config is missing a usable text_config") - return text_config - - @staticmethod - def _inherit_quantization_config(config_dict: dict, text_config: dict) -> dict: - """Copy top-level quantization_config into text_config with name normalization. - - Also adds a temporary workaround that keeps packed linear-attention - in_proj_qkvz on the bf16 path until FP8 block-scale TP loading is - fixed for that layout. - """ - if "quantization_config" in text_config: - return text_config - if "quantization_config" not in config_dict: - return text_config - - quantization_config = dict(config_dict["quantization_config"]) - if "modules_to_not_convert" in quantization_config: - modules = Qwen35ConfigCompat._normalize_exclude_modules( - quantization_config["modules_to_not_convert"] - ) - modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround(text_config, modules) - quantization_config["modules_to_not_convert"] = sorted(set(modules)) - text_config["quantization_config"] = quantization_config - return text_config - - @staticmethod - def _normalize_exclude_modules(modules: list[str]) -> list[str]: - """Translate HF quantization exclude-module paths to TRT-LLM names. - - - Strip model.language_model. prefix -> model. - - Drop model.visual.* and mtp.* entries - - Map split projection names to packed TRT-LLM names - """ - normalized = set() - for name in modules: - if name.startswith("model.language_model."): - name = "model." + name[len("model.language_model.") :] - if name.startswith("model.visual.") or name.startswith("mtp."): - continue - name = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", name) - name = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", name) - normalized.add(name) - return sorted(normalized) - - @staticmethod - def _add_qkvz_bf16_workaround(text_config: dict, modules: list[str]) -> list[str]: - """Keep packed linear-attention qkvz on bf16 path for all linear-attention layers. - - Temporary until FP8 block-scale TP loading is fixed for this layout. - """ - try: - layer_types = get_qwen3_hybrid_layer_types(SimpleNamespace(**text_config)) - except (ValueError, AttributeError): - return modules - for layer_idx, layer_type in enumerate(layer_types): - if layer_type == "linear_attention": - modules.append(f"model.layers.{layer_idx}.linear_attn.in_proj_qkvz") - return modules - - @staticmethod - def _flatten_rope(text_config: dict) -> dict: - """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling. - - Qwen3.5 nests these inside a rope_parameters dict and uses rope_type - instead of type in rope_scaling. Qwen3NextConfig expects them as - top-level fields with rope_scaling.type. - """ - rope_parameters = dict(text_config.pop("rope_parameters", {}) or {}) - rope_scaling = dict(text_config.get("rope_scaling") or {}) - if rope_parameters: - rope_theta = rope_parameters.pop("rope_theta", None) - if rope_theta is not None: - text_config.setdefault("rope_theta", rope_theta) - partial_rotary_factor = rope_parameters.pop("partial_rotary_factor", None) - if partial_rotary_factor is not None: - text_config.setdefault("partial_rotary_factor", partial_rotary_factor) - if rope_parameters: - rope_scaling = rope_parameters | rope_scaling - if rope_scaling: - has_mrope = "mrope_section" in rope_scaling or rope_scaling.get( - "mrope_interleaved", False - ) - if has_mrope: - rope_scaling["type"] = "mrope" - rope_scaling.pop("rope_type", None) - elif "type" not in rope_scaling and "rope_type" in rope_scaling: - rope_type = rope_scaling.pop("rope_type") - # "default" means standard RoPE (no scaling) — don't set - # rope_scaling to avoid triggering scaling code paths. - if rope_type == "default": - rope_scaling = {} - else: - rope_scaling["type"] = rope_type - if rope_scaling: - text_config["rope_scaling"] = rope_scaling - return text_config - - -def _normalize_qwen35_mrope_config(text_config) -> None: - """Materialize Qwen3.5 mRoPE aliases needed by the Qwen3-VL path. - - HF stores RoPE metadata under `rope_parameters`; the shared Qwen3-VL - wrapper reads `rope_theta`, `partial_rotary_factor`, and - `rope_scaling` directly on the text config. - """ - rope_parameters = getattr(text_config, "rope_parameters", None) - if not rope_parameters: - return - if hasattr(rope_parameters, "to_dict"): - rope_parameters = rope_parameters.to_dict() - flattened = Qwen35ConfigCompat._flatten_rope( - { - "rope_parameters": dict(rope_parameters), - "rope_scaling": dict(getattr(text_config, "rope_scaling", None) or {}), - } - ) - for attr in ("rope_theta", "partial_rotary_factor", "rope_scaling"): - value = flattened.get(attr) - if value is not None: - setattr(text_config, attr, value) - - -def _normalize_qwen35_qwen3next_text_aliases(text_config) -> None: - """Materialize Qwen3Next-style text aliases used by the shared runtime.""" - if getattr(text_config, "intermediate_size", None) is None: - moe_intermediate_size = getattr(text_config, "moe_intermediate_size", None) - num_experts_per_tok = getattr(text_config, "num_experts_per_tok", None) - shared_expert_intermediate_size = ( - getattr(text_config, "shared_expert_intermediate_size", 0) or 0 - ) - if moe_intermediate_size is not None and num_experts_per_tok is not None: - text_config.intermediate_size = ( - num_experts_per_tok * moe_intermediate_size + shared_expert_intermediate_size - ) - - -def _normalize_qwen35_quantization_config(model_config) -> None: - quantization_config = getattr(model_config, "quantization_config", None) - if not isinstance(quantization_config, dict): - return - - modules = quantization_config.get("modules_to_not_convert") - if modules is None: - return - - text_config = getattr(model_config, "text_config", None) - normalized_modules = Qwen35ConfigCompat._normalize_exclude_modules(modules) - if text_config is not None: - normalized_modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround( - text_config.to_dict(), normalized_modules - ) - quantization_config["modules_to_not_convert"] = sorted(set(normalized_modules)) - - -def _normalize_qwen35_moe_vl_config(model_config) -> None: - """Adapt HF Qwen3.5-MoE VLM config to TRT-LLM runtime conventions.""" - if not getattr(model_config, "architectures", None): - model_config.architectures = ["Qwen3_5MoeForConditionalGeneration"] - - text_config = getattr(model_config, "text_config", None) - if text_config is None: - raise ValueError("Qwen3.5-MoE VLM config is missing text_config") - - text_config.architectures = ["Qwen3_5MoeForCausalLM"] - _normalize_qwen35_qwen3next_text_aliases(text_config) - _normalize_qwen35_mrope_config(text_config) - - model_config.get_text_config = lambda decoder=False: text_config - _normalize_qwen35_quantization_config(model_config) - - def _normalize_qwen35_exclude_modules(model_config): """Normalize NVFP4/FP8 exclude_modules from HF naming to TRT-LLM naming. @@ -390,56 +126,10 @@ class Qwen3_5ForCausalLM(Qwen3NextForCausalLM): Same reuse pattern as Qwen3_5MoeForCausalLM, but for the dense 27B variant which uses GatedMLP instead of SparseMoeBlock. The config - normalizer (Qwen35ConfigCompat) sets num_experts=0 so that + normalizer (_Qwen35ConfigCompat) sets num_experts=0 so that Qwen3NextModel selects GatedMLP for the feed-forward layers. """ def __init__(self, model_config): _normalize_qwen35_exclude_modules(model_config) super().__init__(model_config) - - -# TODO: Add tests for disaggregated support. -@support_multimodal_disaggregated -@register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel) -@register_auto_model("Qwen3_5MoeForConditionalGeneration") -@register_input_processor( - Qwen3VLInputProcessorBase, - model_type="qwen3_5_moe", - placeholder_metadata=MultimodalPlaceholderMetadata( - placeholder_map={ - "image": "<|vision_start|><|image_pad|><|vision_end|>", - "video": "<|vision_start|><|video_pad|><|vision_end|>", - }, - placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT, - placeholders_separator="", - content_format=ContentFormat.STRING, - ), -) -class Qwen3_5MoeVLModel(Qwen3VLModelBase): - """VLM wrapper composing Qwen3 vision encoder with Qwen3.5 MoE text decoder.""" - - def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs): - kwargs["vision_model_class"] = Qwen3VisionModel - kwargs["disable_fuse_rope"] = kwargs.get("disable_fuse_rope", False) - super().__init__(model_config, *args, **kwargs) - - @property - def multimodal_data_device_paths(self) -> List[str]: - return [ - "image.pixel_values", - "video.pixel_values_videos", - "multimodal_embedding", - ] - - def load_weights(self, weights: Dict[str, torch.Tensor], weight_mapper: BaseWeightMapper): - if not _is_disagg(): - self.mm_encoder.load_weights(weights) - - weight_mapper = Qwen3_5MoeHfWeightMapper() - weight_mapper.init_model_and_config(self.llm, self.model_config) - filtered_weights = {k: v for k, v in weights.items() if not k.startswith("model.visual.")} - params_map = { - r"^model\.language_model\.(.*)$": r"model.\1", - } - self.llm.load_weights(filtered_weights, weight_mapper, params_map=params_map) diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_next.py b/tensorrt_llm/_torch/models/modeling_qwen3_next.py index 5d8ca8e81cbd..d6f4fd57794f 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_next.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_next.py @@ -973,18 +973,9 @@ def get_model_defaults(cls, llm_args: 'TorchLlmArgs') -> dict: # is supported for Mamba/SSM-based models return {"kv_cache_config": {"enable_block_reuse": False}} - def load_weights(self, - weights: dict, - weight_mapper: BaseWeightMapper, - params_map: Optional[Dict[str, str]] = None, - allow_partial_loading: bool = False): + def load_weights(self, weights: dict, weight_mapper: BaseWeightMapper): new_weights = weight_mapper.preprocess_weights(weights) - super().load_weights( - new_weights, - weight_mapper=weight_mapper, - params_map=params_map, - allow_partial_loading=allow_partial_loading, - ) + super().load_weights(new_weights, weight_mapper) def post_load_weights(self): for idx, layer in enumerate( diff --git a/tensorrt_llm/_torch/models/modeling_qwen3vl.py b/tensorrt_llm/_torch/models/modeling_qwen3vl.py index ed724b0a6307..ecdbc5fde4b3 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3vl.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3vl.py @@ -1053,8 +1053,6 @@ def __init__( llm_model_config.pretrained_config.architectures = ["Qwen3ForCausalLM"] elif self.original_arch == "Qwen3VLMoeForConditionalGeneration": llm_model_config.pretrained_config.architectures = ["Qwen3MoeForCausalLM"] - elif self.original_arch == "Qwen3_5MoeForConditionalGeneration": - llm_model_config.pretrained_config.architectures = ["Qwen3_5MoeForCausalLM"] else: raise ValueError(f"Unsupported architecture: {self.original_arch}") # Qwen3ForCausalLM. @@ -1092,12 +1090,9 @@ def init_mrope_embedding(self, model_config: ModelConfig[PretrainedConfig]): mrope_section=config.rope_scaling.get("mrope_section", None), mrope_interleaved=config.rope_scaling.get("mrope_interleaved", False), ) - head_dim = getattr(config, "head_dim", None) - if not isinstance(head_dim, int): - head_dim = config.hidden_size // config.num_attention_heads self.rotary_emb = MRotaryEmbedding( pos_embd_params.rope, - head_dim=head_dim, + head_dim=config.hidden_size // config.num_attention_heads, is_neox=pos_embd_params.is_neox, mrope_section=pos_embd_params.mrope_section, mrope_interleaved=pos_embd_params.mrope_interleaved, diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py index 6a46f0f984d9..ef270040fc26 100644 --- a/tensorrt_llm/_torch/pyexecutor/config_utils.py +++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py @@ -1,10 +1,11 @@ import dataclasses +import re +from types import SimpleNamespace from typing import List, Optional import torch import transformers -from tensorrt_llm._utils import str_dtype_to_torch from tensorrt_llm.logger import logger @@ -20,60 +21,6 @@ def is_hybrid_linear(config): return is_nemotron_hybrid(config) or is_qwen3_hybrid(config) -def _coerce_torch_dtype(dtype): - """Normalize dtype values from HF configs into torch dtype objects. - - HF configs may store dtype fields as torch dtypes, strings, or the sentinel - value "auto". Returning None for "auto" lets the caller keep its normal - fallback path instead of treating "auto" as a concrete dtype. - """ - if isinstance(dtype, torch.dtype): - return dtype - if dtype == "auto": - return None - if isinstance(dtype, str): - return str_dtype_to_torch(dtype) - return dtype - - -def resolve_hf_torch_dtype(config): - """Return the model's regular tensor dtype from common HF config fields. - - Transformers has used both dtype and torch_dtype across versions and model - families. This helper checks both names and coerces whichever one is present - into the form expected by TRT-LLM runtime code. An "auto" value in any - field is treated the same as missing, so scanning continues to the next - field instead of stopping with None. - """ - for attr in ("dtype", "torch_dtype"): - coerced = _coerce_torch_dtype(getattr(config, attr, None)) - if coerced is not None: - return coerced - return None - - -def resolve_mamba_ssm_cache_dtype(config): - """Return the dtype to use for hybrid Mamba/SSM cache allocations. - - Qwen3.5-style configs may store this field on the top-level config or the - nested text_config, and may call it either mamba_ssm_cache_dtype or - mamba_ssm_dtype. This helper centralizes that lookup so cache creation does - not fail later with a missing dtype. An "auto" value in any field is - treated the same as missing. - """ - configs = [config] - text_config = getattr(config, "text_config", None) - if text_config is not None: - configs.append(text_config) - - for candidate_config in configs: - for attr in ("mamba_ssm_cache_dtype", "mamba_ssm_dtype"): - coerced = _coerce_torch_dtype(getattr(candidate_config, attr, None)) - if coerced is not None: - return coerced - return None - - def is_nemotron_hybrid(config): if hasattr(config, "hybrid_override_pattern" ) and config.hybrid_override_pattern is not None and len( @@ -302,14 +249,8 @@ def extract_mamba_kv_cache_params( full_attn_mask.extend([True] * num_spec_layers) mamba_mask.extend([False] * num_spec_layers) - mamba_ssm_cache_dtype = None - if quant_config is not None: - mamba_ssm_cache_dtype = _coerce_torch_dtype( - quant_config.mamba_ssm_cache_dtype) - if mamba_ssm_cache_dtype is None: - mamba_ssm_cache_dtype = (resolve_mamba_ssm_cache_dtype(config) - or resolve_hf_torch_dtype(config) - or torch.bfloat16) + mamba_ssm_cache_dtype = (quant_config.mamba_ssm_cache_dtype + if quant_config is not None else None) return MambaKVCacheParams( state_size=state_size, @@ -321,11 +262,159 @@ def extract_mamba_kv_cache_params( full_attention_layer_mask=full_attn_mask, num_mamba_layers=sum(mamba_mask), num_full_attention_layers=sum(full_attn_mask), - dtype=resolve_hf_torch_dtype(config) or torch.bfloat16, + dtype=config.torch_dtype, mamba_ssm_cache_dtype=mamba_ssm_cache_dtype, ) +class _Qwen35ConfigCompat: + """Temporary shim that normalizes Qwen3.5 HF configs into Qwen3NextConfig. + + To remove: delete this class and the elif branch in + load_pretrained_config that references it. + """ + + @staticmethod + def normalize(config_dict: dict) -> dict: + """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict.""" + text_config = _Qwen35ConfigCompat._extract_text_config(config_dict) + text_config = _Qwen35ConfigCompat._inherit_quantization_config( + config_dict, text_config) + text_config = _Qwen35ConfigCompat._flatten_rope(text_config) + + # Detect dense vs MoE and set architecture + MoE defaults accordingly + is_moe = "num_experts" in text_config and text_config["num_experts"] > 0 + if is_moe: + text_config["architectures"] = ["Qwen3_5MoeForCausalLM"] + else: + text_config["architectures"] = ["Qwen3_5ForCausalLM"] + # Ensure MoE fields are zeroed so Qwen3NextConfig defaults don't + # accidentally enable MoE for the dense model. + text_config.setdefault("num_experts", 0) + text_config.setdefault("num_experts_per_tok", 0) + text_config.setdefault("moe_intermediate_size", 0) + text_config.setdefault("shared_expert_intermediate_size", 0) + return text_config + + _VLM_ARCHITECTURES = { + "Qwen3_5MoeForConditionalGeneration", + "Qwen3_5ForConditionalGeneration", + } + + @staticmethod + def _extract_text_config(config_dict: dict) -> dict: + """Pull nested text_config from VLM checkpoints, or use dict as-is.""" + architectures = config_dict.get("architectures") or [] + if architectures and architectures[ + 0] in _Qwen35ConfigCompat._VLM_ARCHITECTURES: + text_config = dict(config_dict.get("text_config") or {}) + else: + text_config = dict(config_dict) + if not text_config: + raise ValueError("Qwen3.5 config is missing a usable text_config") + return text_config + + @staticmethod + def _inherit_quantization_config(config_dict: dict, + text_config: dict) -> dict: + """Copy top-level quantization_config into text_config with name normalization. + + Also adds a temporary workaround that keeps packed linear-attention + in_proj_qkvz on the bf16 path until FP8 block-scale TP loading is + fixed for that layout. + """ + if "quantization_config" in text_config: + return text_config + if "quantization_config" not in config_dict: + return text_config + + quantization_config = dict(config_dict["quantization_config"]) + if "modules_to_not_convert" in quantization_config: + modules = _Qwen35ConfigCompat._normalize_exclude_modules( + quantization_config["modules_to_not_convert"]) + modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround( + text_config, modules) + quantization_config["modules_to_not_convert"] = sorted(set(modules)) + text_config["quantization_config"] = quantization_config + return text_config + + @staticmethod + def _normalize_exclude_modules(modules: list[str]) -> list[str]: + """Translate HF quantization exclude-module paths to TRT-LLM names. + + - Strip model.language_model. prefix -> model. + - Drop model.visual.* and mtp.* entries + - Map split projection names to packed TRT-LLM names + """ + normalized = set() + for name in modules: + if name.startswith("model.language_model."): + name = "model." + name[len("model.language_model."):] + if name.startswith("model.visual.") or name.startswith("mtp."): + continue + name = re.sub(r"\.in_proj_[ab]$", ".in_proj_ba", name) + name = re.sub(r"\.in_proj_(q|k|v|z|qkv)$", ".in_proj_qkvz", name) + normalized.add(name) + return sorted(normalized) + + @staticmethod + def _add_qkvz_bf16_workaround(text_config: dict, + modules: list[str]) -> list[str]: + """Keep packed linear-attention qkvz on bf16 path for all linear-attention layers. + + Temporary until FP8 block-scale TP loading is fixed for this layout. + """ + try: + layer_types = get_qwen3_hybrid_layer_types( + SimpleNamespace(**text_config)) + except (ValueError, AttributeError): + return modules + for layer_idx, layer_type in enumerate(layer_types): + if layer_type == "linear_attention": + modules.append( + f"model.layers.{layer_idx}.linear_attn.in_proj_qkvz") + return modules + + @staticmethod + def _flatten_rope(text_config: dict) -> dict: + """Flatten rope_parameters into top-level rope_theta / partial_rotary_factor / rope_scaling. + + Qwen3.5 nests these inside a rope_parameters dict and uses rope_type + instead of type in rope_scaling. Qwen3NextConfig expects them as + top-level fields with rope_scaling.type. + """ + rope_parameters = dict(text_config.pop("rope_parameters", {}) or {}) + rope_scaling = dict(text_config.get("rope_scaling") or {}) + if rope_parameters: + rope_theta = rope_parameters.pop("rope_theta", None) + if rope_theta is not None: + text_config.setdefault("rope_theta", rope_theta) + partial_rotary_factor = rope_parameters.pop("partial_rotary_factor", + None) + if partial_rotary_factor is not None: + text_config.setdefault("partial_rotary_factor", + partial_rotary_factor) + if rope_parameters: + rope_scaling = rope_parameters | rope_scaling + if rope_scaling: + has_mrope = ("mrope_section" in rope_scaling + or rope_scaling.get("mrope_interleaved", False)) + if has_mrope: + rope_scaling["type"] = "mrope" + rope_scaling.pop("rope_type", None) + elif "type" not in rope_scaling and "rope_type" in rope_scaling: + rope_type = rope_scaling.pop("rope_type") + # "default" means standard RoPE (no scaling) — don't set + # rope_scaling to avoid triggering scaling code paths. + if rope_type == "default": + rope_scaling = {} + else: + rope_scaling["type"] = rope_type + if rope_scaling: + text_config["rope_scaling"] = rope_scaling + return text_config + + # TODO: remove this once the transformers can support all of those models in _CONFIG_REGISTRY class LazyConfigDict(dict): @@ -356,16 +445,6 @@ def load_pretrained_config(model_name_or_path: str, MistralConfigLoader model_config = MistralConfigLoader().load( model_name_or_path).pretrained_config - elif (model_type == "qwen3_5_moe" and - (("text_config" in config_dict and "vision_config" in config_dict) or - (architectures - and architectures[0] == "Qwen3_5MoeForConditionalGeneration"))): - # Qwen3.5-MoE VLM: HF native composite config + model-side normalizer. - from tensorrt_llm._torch.models.modeling_qwen3_5 import \ - _normalize_qwen35_moe_vl_config - model_config = transformers.Qwen3_5MoeConfig.from_pretrained( - model_name_or_path, **kwargs) - _normalize_qwen35_moe_vl_config(model_config) elif model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] model_config = config_class.from_pretrained(model_name_or_path, @@ -378,11 +457,8 @@ def load_pretrained_config(model_name_or_path: str, "Qwen3_5ForCausalLM", "Qwen3_5ForConditionalGeneration", )): - # Qwen3.5 text-only: flatten to Qwen3NextConfig via the model-side shim. - from tensorrt_llm._torch.models.modeling_qwen3_5 import \ - Qwen35ConfigCompat model_config = transformers.Qwen3NextConfig.from_dict( - Qwen35ConfigCompat.normalize(config_dict)) + _Qwen35ConfigCompat.normalize(config_dict)) elif (model_type == "exaone4" and config_dict.get("sliding_window") is None and config_dict.get("layer_types") is None): # transformers 5.5.x Exaone4Config.__post_init__ first forces diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index 54c02754f12d..14d813a99dfd 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -29,7 +29,6 @@ MoeLoadBalancer, maybe_create_moe_load_balancer) from ..virtual_memory import RestoreMode from ..virtual_memory import scope as virtual_memory_scope -from .config_utils import resolve_hf_torch_dtype, resolve_mamba_ssm_cache_dtype _KV_CACHE_MAP = { "fp8": QuantAlgo.FP8.value, @@ -45,10 +44,12 @@ def validate_and_set_mamba_ssm_cache_dtype( mamba_ssm_stochastic_rounding: bool = False, mamba_ssm_philox_rounds: int = 10) -> None: if mamba_ssm_cache_dtype == "auto": - mamba_ssm_cache_dtype = ( - resolve_mamba_ssm_cache_dtype(config.pretrained_config) - or resolve_hf_torch_dtype(config.pretrained_config) - or config.torch_dtype) + hf_dtype = getattr(config.pretrained_config, "mamba_ssm_cache_dtype", + None) + if hf_dtype is not None: + mamba_ssm_cache_dtype = str_dtype_to_torch(hf_dtype) + else: + mamba_ssm_cache_dtype = config.pretrained_config.torch_dtype else: mamba_ssm_cache_dtype = str_dtype_to_torch(mamba_ssm_cache_dtype) diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml index 21aa37d6642c..fc23b4cbc6eb 100644 --- a/tests/integration/defs/accuracy/references/mmmu.yaml +++ b/tests/integration/defs/accuracy/references/mmmu.yaml @@ -62,12 +62,8 @@ Qwen/Qwen3-VL-8B-Instruct: mistralai/Mistral-Small-3.1-24B-Instruct-2503: - accuracy: 57.0 Qwen/Qwen3.5-35B-A3B: - # The default accuracy for `test_auto_dtype` tests. - - accuracy: 59.0 - dtype: bfloat16 accuracy: 60.444 - - quant_algo: FP8_BLOCK_SCALES - accuracy: 58.889 # Kimi K2.5 multimodal (MoonViT + DeepSeek-V3 MoE backbone, ~1T params). # Values below are measured with NVFP4 checkpoint (thinking mode enabled). moonshotai/Kimi-K2.5: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py index 3f2026c65b0a..39ce3d05d54d 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py @@ -459,43 +459,6 @@ def test_nvfp4_4gpus( task.evaluate(llm, sampling_params=self.sampling_params) -@pytest.mark.skip_less_device_memory(80000) -class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness): - MODEL_NAME = "Qwen/Qwen3.5-35B-A3B" - MODEL_PATH = f"{llm_models_root()}/Qwen3.5-35B-A3B" - MAX_NUM_TOKENS = 16384 - MAX_BATCH_SIZE = 32 - - sampling_params = SamplingParams( - max_tokens=MAX_NUM_TOKENS, - truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, - stop="<|endoftext|>", - ) - - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False) - - def _make_llm(self, model_path: str) -> LLM: - return LLM( - model_path, - max_num_tokens=self.MAX_NUM_TOKENS, - max_batch_size=self.MAX_BATCH_SIZE, - kv_cache_config=self.kv_cache_config, - ) - - def test_auto_dtype(self) -> None: - with self._make_llm(self.MODEL_PATH) as llm: - task = MMMU(self.MODEL_NAME) - task.evaluate(llm, sampling_params=self.sampling_params) - - @skip_pre_hopper - def test_fp8_prequantized(self) -> None: - model_path = f"{llm_models_root()}/Qwen3.5-35B-A3B-FP8" - with self._make_llm(model_path) as llm: - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES - task = MMMU(self.MODEL_NAME) - task.evaluate(llm, sampling_params=self.sampling_params) - - class TestQwen3VL(LlmapiAccuracyTestHarness): MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct" MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-VL-8B-Instruct" diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 5fc18da8b40a..9fd6d2c0c74c 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -807,8 +807,6 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL::test_auto_dtype[forced_chunked_prefill] accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestKimiK25::test_nvfp4[dep8] -accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_auto_dtype -accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_fp8_prequantized accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray unittest/disaggregated/test_openai_disagg_server.py diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml index a82a62e2c77d..9c72f9dccb86 100644 --- a/tests/integration/test_lists/test-db/l0_l40s.yml +++ b/tests/integration/test_lists/test-db/l0_l40s.yml @@ -23,7 +23,6 @@ l0_l40s: - unittest/_torch/modeling/test_modeling_qwen2_5vl.py::TestQwen2_5_VL::test_all - unittest/_torch/modeling/test_modeling_qwen3vl_moe.py::TestQwen3VLMoe::test_all - unittest/_torch/modeling/test_modeling_qwen3vl.py::TestQwen3VL::test_all - - unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py::TestQwen3_5MoeVL::test_all - test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] - unittest/llmapi/apps/_test_openai_chat_multimodal.py::test_single_chat_session_image_embeds -m needs_l40s # MMMU sanity check diff --git a/tests/unittest/_torch/modeling/test_modeling_multimodal.py b/tests/unittest/_torch/modeling/test_modeling_multimodal.py index ab7166b68bf3..53fe5e044fc6 100644 --- a/tests/unittest/_torch/modeling/test_modeling_multimodal.py +++ b/tests/unittest/_torch/modeling/test_modeling_multimodal.py @@ -18,12 +18,6 @@ from tensorrt_llm._torch.metadata import KVCacheParams from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.models.modeling_multimodal_utils import bypass_processor_output_validation -from tensorrt_llm._torch.pyexecutor.config_utils import ( - extract_mamba_kv_cache_params, - is_nemotron_hybrid, - is_qwen3_hybrid, -) -from tensorrt_llm._torch.pyexecutor.mamba_cache_manager import CppMambaHybridCacheManager from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager from tensorrt_llm._utils import str_dtype_to_torch from tensorrt_llm.bindings.executor import KvCacheConfig @@ -34,7 +28,6 @@ prompt_inputs, ) from tensorrt_llm.inputs.multimodal import MultimodalParams, MultimodalRuntimeData -from tensorrt_llm.llmapi.llm_args import KvCacheConfig as PyKvCacheConfig from tensorrt_llm.mapping import Mapping @@ -525,13 +518,6 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario): Note: This method uses get_kv_cache_config() to obtain configuration. Override get_kv_cache_config() to customize cache settings. - - For hybrid linear-attention models (Qwen3Next, Qwen3.5, - Nemotron-Hybrid) this dispatches to - `get_hybrid_kv_cache_manager` so the linear-attention layers - get a `CppMambaHybridCacheManager` for SSM/conv state. - Mirrors the production dispatch in - `_util.py:_create_kv_cache_manager`. """ # Get cache configuration from the configurable method cache_config = self.get_kv_cache_config(scenario) @@ -541,114 +527,17 @@ def init_kv_cache_manager(self, scenario: MultimodalScenario): num_blocks = (max_seq_len + tokens_per_block - 1) // tokens_per_block - config = self.model_config.pretrained_config - text_config = getattr(config, "text_config", config) - - if is_qwen3_hybrid(text_config) or is_nemotron_hybrid(text_config): - self.kv_cache_manager = self.get_hybrid_kv_cache_manager( - text_config=text_config, - tokens_per_block=tokens_per_block, - max_seq_len=max_seq_len, - batch_size=batch_size, - num_blocks=num_blocks, - ) - else: - self.kv_cache_manager = self.get_kv_cache_manager( - dtype=self.model_config.pretrained_config.torch_dtype, - config=self.model_config.pretrained_config, - tokens_per_block=tokens_per_block, - max_seq_len=max_seq_len, - batch_size=batch_size, - num_blocks=num_blocks, - ) - - self.kv_cache_manager.add_dummy_requests( - request_ids=[1], - token_nums=[max_seq_len], - **self._dummy_request_kwargs(scenario), - ) - - def _dummy_request_kwargs(self, scenario: MultimodalScenario) -> Dict: - """Optional override hook for extra kwargs to `add_dummy_requests`. - - Subclasses for mRoPE-using models (Qwen2.5-VL, Qwen3-VL, Qwen3.5-VL, - …) should return `{"use_mrope": True}` here so the cache manager - allocates the mRoPE position-id buffer at dummy-request time. - Defaults to an empty dict, preserving existing behavior for tests - that don't care. - """ - return {} - - def get_hybrid_kv_cache_manager( - self, - text_config: PretrainedConfig, - tokens_per_block: int, - max_seq_len: int, - batch_size: int, - num_blocks: int, - ): - """Build a `CppMambaHybridCacheManager` for hybrid linear-attention - models (Qwen3Next, Qwen3.5, Nemotron-Hybrid). - - Mirrors the production construction in - `_util.py:_create_kv_cache_manager` for `is_qwen3_hybrid` / - `is_nemotron_hybrid` configs: pulls the state-shape / dtype / - layer-mask parameters from `extract_mamba_kv_cache_params` and - threads them through the constructor. Tests that need a different - concrete manager (e.g. `MixedMambaHybridCacheManager` for - disagg-style coverage) should override this method. - """ - dtype_map = { - torch.half: tensorrt_llm.bindings.DataType.HALF, - torch.float16: tensorrt_llm.bindings.DataType.HALF, - torch.bfloat16: tensorrt_llm.bindings.DataType.BF16, - } - - mamba_params = extract_mamba_kv_cache_params(text_config) - if mamba_params.dtype not in dtype_map: - raise ValueError( - f"Unsupported dtype for hybrid cache manager: " - f"{mamba_params.dtype}. Supported: {list(dtype_map.keys())}" - ) - kv_cache_dtype = dtype_map[mamba_params.dtype] - - head_dim = getattr(text_config, "head_dim", None) - if not isinstance(head_dim, int): - head_dim = text_config.hidden_size // text_config.num_attention_heads - - # CppMambaHybridCacheManager reads Pydantic-only fields - # (mamba_state_cache_interval, enable_block_reuse) so we have to - # construct the llmapi.llm_args.KvCacheConfig here, not the C++ - # bindings KvCacheConfig that the standard KVCacheManager path uses. - kv_cache_config = PyKvCacheConfig(max_tokens=num_blocks * tokens_per_block) - mapping = Mapping(world_size=1, tp_size=1, rank=0) - - return CppMambaHybridCacheManager( - # mamba cache parameters (positional) - mamba_params.state_size, - mamba_params.conv_kernel, - mamba_params.num_heads, - mamba_params.n_groups, - mamba_params.head_dim, - mamba_params.num_mamba_layers, - mamba_params.mamba_layer_mask, - mamba_params.dtype, - mamba_params.mamba_ssm_cache_dtype, - # kv cache parameters (positional) - kv_cache_config, - tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF, - # kw-only - num_layers=mamba_params.num_full_attention_layers, - layer_mask=mamba_params.full_attention_layer_mask, - num_kv_heads=text_config.num_key_value_heads, - head_dim=head_dim, + self.kv_cache_manager = self.get_kv_cache_manager( + dtype=self.model_config.pretrained_config.torch_dtype, + config=self.model_config.pretrained_config, tokens_per_block=tokens_per_block, max_seq_len=max_seq_len, - max_batch_size=batch_size, - mapping=mapping, - dtype=kv_cache_dtype, + batch_size=batch_size, + num_blocks=num_blocks, ) + self.kv_cache_manager.add_dummy_requests(request_ids=[1], token_nums=[max_seq_len]) + def get_max_num_tokens(self, scenario: MultimodalScenario) -> int: """Get maximum number of tokens for attention metadata.""" if scenario.chunked_prefill: @@ -806,14 +695,6 @@ def setUp(self): # TODO: Add multi-GPU support self.device = torch.device("cuda:0") - # Pre-initialize fields that tearDown / setup_scenario expect to - # exist. Without this, a test method that doesn't run - # setup_scenario (e.g. a setUp-only smoke test) leaves - # self.kv_cache_manager unset and tearDown errors with - # AttributeError on the ``is not None`` check. - self.kv_cache_manager = None - self.attn_metadata = None - self.hf_config = self.create_hf_config() if self.skip_hf_inference: # Create a dummy torch module if skipping HF inference. diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py deleted file mode 100644 index df30e93d89e0..000000000000 --- a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py +++ /dev/null @@ -1,450 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -import json -import os -from copy import deepcopy -from pathlib import Path -from typing import List, Optional - -import torch -import transformers -from test_modeling_multimodal import MultimodalScenario, TestModelingMultimodal -from transformers import Qwen3_5MoeForConditionalGeneration as HFQwen3_5MoeForConditionalGeneration -from utils.llm_data import llm_models_root - -from tensorrt_llm._torch.model_config import ModelConfig -from tensorrt_llm._torch.models import Qwen3_5MoeVLModel -from tensorrt_llm._torch.models.checkpoints.auto_mapper import AutoCheckpointMapper -from tensorrt_llm._torch.models.checkpoints.hf.qwen3_5_weight_mapper import Qwen3_5MoeHfWeightMapper -from tensorrt_llm._torch.models.modeling_auto import AutoModelForCausalLM -from tensorrt_llm._torch.models.modeling_qwen3_5 import _normalize_qwen35_moe_vl_config -from tensorrt_llm._torch.pyexecutor.config_utils import ( - extract_mamba_kv_cache_params, - load_pretrained_config, -) -from tensorrt_llm._torch.pyexecutor.model_loader import validate_and_set_mamba_ssm_cache_dtype -from tensorrt_llm.inputs import ContentFormat -from tensorrt_llm.inputs.registry import MULTIMODAL_PLACEHOLDER_REGISTRY - - -def _write_qwen35_moe_vl_config(tmp_path: Path) -> Path: - config = { - "architectures": ["Qwen3_5MoeForConditionalGeneration"], - "image_token_id": 248056, - "model_type": "qwen3_5_moe", - "text_config": { - "attention_bias": False, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "dtype": "bfloat16", - "eos_token_id": 151645, - "full_attention_interval": 4, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 2048, - "linear_conv_kernel_dim": 4, - "linear_key_head_dim": 128, - "linear_num_key_heads": 16, - "linear_num_value_heads": 32, - "linear_value_head_dim": 128, - "mamba_ssm_dtype": "float32", - "max_position_embeddings": 262144, - "mlp_only_layers": [], - "model_type": "qwen3_5_moe_text", - "moe_intermediate_size": 512, - "norm_topk_prob": True, - "num_attention_heads": 32, - "num_experts": 128, - "num_experts_per_tok": 8, - "num_hidden_layers": 2, - "num_key_value_heads": 4, - "rms_norm_eps": 1e-6, - "shared_expert_intermediate_size": 512, - "rope_parameters": { - "mrope_section": [11, 11, 10], - "partial_rotary_factor": 0.25, - "rope_theta": 1000000.0, - "rope_type": "default", - }, - "use_cache": True, - "vocab_size": 151936, - }, - "tie_word_embeddings": False, - "video_token_id": 248057, - "vision_config": { - "deepstack_visual_indexes": [8, 16, 24], - "depth": 27, - "hidden_act": "gelu_pytorch_tanh", - "hidden_size": 1152, - "in_channels": 3, - "intermediate_size": 4304, - "model_type": "qwen3_5_moe", - "num_heads": 16, - "num_position_embeddings": 2304, - "out_hidden_size": 2048, - "patch_size": 16, - "spatial_merge_size": 2, - "temporal_patch_size": 2, - }, - "vision_end_token_id": 248054, - "vision_start_token_id": 248053, - } - (tmp_path / "config.json").write_text(json.dumps(config), encoding="utf-8") - return tmp_path - - -def test_qwen35_moe_vl_config_preserves_vlm_architecture( - tmp_path: Path, -) -> None: - config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path))) - - assert isinstance(config, transformers.Qwen3_5MoeConfig) - assert config.architectures == ["Qwen3_5MoeForConditionalGeneration"] - assert config.text_config.architectures == ["Qwen3_5MoeForCausalLM"] - assert config.text_config.num_experts == 128 - assert config.text_config.intermediate_size == 4608 - assert config.text_config.rope_theta == 1000000.0 - assert config.text_config.partial_rotary_factor == 0.25 - assert config.text_config.rope_scaling["type"] == "mrope" - assert config.text_config.rope_scaling["mrope_section"] == [11, 11, 10] - assert config.text_config.mamba_ssm_dtype == "float32" - assert config.get_text_config() is config.text_config - - -def test_qwen35_moe_vl_resolves_mamba_ssm_cache_dtype( - tmp_path: Path, -) -> None: - config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path))) - model_config = ModelConfig(pretrained_config=config) - - validate_and_set_mamba_ssm_cache_dtype(model_config, "auto") - assert model_config.quant_config.mamba_ssm_cache_dtype is torch.float32 - - mamba_params = extract_mamba_kv_cache_params( - config.text_config, - quant_config=model_config.quant_config, - ) - assert mamba_params.dtype is torch.bfloat16 - assert mamba_params.mamba_ssm_cache_dtype is torch.float32 - - -def test_qwen35_moe_vl_resolves_model_and_mapper(tmp_path: Path) -> None: - config = load_pretrained_config(str(_write_qwen35_moe_vl_config(tmp_path))) - model_config = ModelConfig(pretrained_config=config) - - assert AutoModelForCausalLM._resolve_class(model_config) is Qwen3_5MoeVLModel - assert isinstance( - AutoCheckpointMapper.get("HF", "Qwen3_5MoeForConditionalGeneration"), - Qwen3_5MoeHfWeightMapper, - ) - - -def test_qwen35_moe_vl_placeholder_metadata_registered() -> None: - metadata = MULTIMODAL_PLACEHOLDER_REGISTRY.get_placeholder_metadata("qwen3_5_moe") - - assert metadata.placeholder_map == { - "image": "<|vision_start|><|image_pad|><|vision_end|>", - "video": "<|vision_start|><|video_pad|><|vision_end|>", - } - assert metadata.placeholders_separator == "" - assert metadata.content_format is ContentFormat.STRING - - -# --- Layered parity test scaffold ------------------------------------------- -# -# Tiny synthetic config used by TestQwen3_5MoeVL below. Same architecture as -# the real Qwen/Qwen3.5-35B-A3B checkpoint but with much smaller dimensions -# where possible. -# -# Shapes that have to match real Qwen3.5 (can't shrink without breaking -# things downstream): -# -# - head_dim=256, partial_rotary_factor=0.25 --> rotary tensor width is -# `head_dim * 0.25 / 2 = 32`, which equals `sum(mrope_section)`. -# A smaller head_dim (e.g. 128) yields a 16-wide tensor that mRoPE -# can't split with section sum 32. -# - num_attention_heads=16, num_key_value_heads=2 match the real -# model's 8:1 GQA layout; Q proj is 2048 --> 4096, K/V are 2048 --> 512. -# - Vision deepstack indices [8, 16, 24] match the real config, and -# depth=27 is the smallest value that hosts those indices. Disabling -# deepstack (indices=[], depth=2) produces fewer vision embeddings -# than the HF processor reserves placeholder tokens for, which -# breaks `fuse_input_embeds`. -# - vocab_size=248320 matches the real Qwen3.5 tokenizer. The -# tokenizer (loaded via _name_or_path) emits special-token ids in -# the 248k range; `fuse_input_embeds` uses `vocab_size` as the -# OOV threshold to identify image-pad tokens. A smaller vocab_size -# would misclassify regular chat-template specials as mm tokens -# and trip the placeholder/embedding count check. -# -# Shapes that can be shrunk for tests: -# -# - num_hidden_layers: 2 (vs 40+). -# - num_experts: 128 (vs 256). Still moderate so MoE routing runs. -# - full_attention_interval=2 with 2 LM layers yields the pattern -# [linear_attention, full_attention] — one of each kind, exercising -# both the regular KV cache and the Mamba SSM/conv state via the -# base-class dispatch. -# -# `_name_or_path` points at the real checkpoint dir so the test can load -# the tokenizer/processor (only the processor; not the full model weights). -QWEN3_5_VL_MOE_PARITY_CONFIG = { - "architectures": ["Qwen3_5MoeForConditionalGeneration"], - "image_token_id": 248056, - "model_type": "qwen3_5_moe", - "text_config": { - "attention_bias": False, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "dtype": "bfloat16", - "eos_token_id": 151645, - "full_attention_interval": 2, - "head_dim": 256, - "hidden_act": "silu", - "hidden_size": 2048, - "linear_conv_kernel_dim": 4, - "linear_key_head_dim": 128, - "linear_num_key_heads": 16, - "linear_num_value_heads": 32, - "linear_value_head_dim": 128, - "mamba_ssm_dtype": "float32", - "max_position_embeddings": 8192, - "mlp_only_layers": [], - "model_type": "qwen3_5_moe_text", - "moe_intermediate_size": 512, - "norm_topk_prob": True, - "num_attention_heads": 16, - "num_experts": 128, - "num_experts_per_tok": 8, - "num_hidden_layers": 2, - "num_key_value_heads": 2, - "rms_norm_eps": 1e-6, - "shared_expert_intermediate_size": 512, - "rope_parameters": { - "mrope_section": [11, 11, 10], - "partial_rotary_factor": 0.25, - "rope_theta": 1000000.0, - "rope_type": "default", - }, - "use_cache": True, - "vocab_size": 248320, - }, - "tie_word_embeddings": False, - "video_token_id": 248057, - "vision_config": { - "deepstack_visual_indexes": [8, 16, 24], - "depth": 27, - "hidden_act": "gelu_pytorch_tanh", - "hidden_size": 1152, - "in_channels": 3, - "initializer_range": 0.02, - "intermediate_size": 4304, - "model_type": "qwen3_5_moe", - "num_heads": 16, - "num_position_embeddings": 2304, - "out_hidden_size": 2048, - "patch_size": 16, - "spatial_merge_size": 2, - "temporal_patch_size": 2, - }, - "vision_end_token_id": 248054, - "vision_start_token_id": 248053, - "_name_or_path": str(os.path.join(llm_models_root(), "Qwen3.5-35B-A3B")), -} - - -class TestQwen3_5MoeVL(TestModelingMultimodal): - """Forward-parity test for Qwen3.5-MoE-VL against HuggingFace. - - Tiny-synthetic-config parity test in the same shape as - `TestQwen3VLMoe` / `TestQwen2_5VL`: both stacks are constructed - from `QWEN3_5_VL_MOE_PARITY_CONFIG` (2 LM layers, 1 linear + 1 full - attention, 128 experts, 2 vision layers), HF weights are copied - into TRT-LLM via `Qwen3_5MoeHfWeightMapper`, then `test_all` - sweeps the default `MultimodalScenario`s comparing last-position - logits at context + generation phases. - - Two-config design: - - `self.hf_config` is the raw `Qwen3_5MoeConfig.from_dict(...)` - result. HF model construction sees the native HF schema - (`rope_parameters` intact with `rope_type`, - `moe_intermediate_size`, …). - - TRT-LLM gets a deep-copied + normalized version via the - `create_trtllm_model` override below. That copy goes through - `_normalize_qwen35_moe_vl_config` exactly the same way - production `load_pretrained_config` does, so the Qwen3Next - runtime sees the flat aliases it expects - (`intermediate_size`, `rope_theta`, `rope_scaling`, …). - - Keeping the two configs separate means the production normalizer - doesn't need to be HF-safe — production only ever constructs the - TRT-LLM model from a normalized config, and the test mirrors that - boundary explicitly. The hybrid-cache path is handled by the base - class's `init_kv_cache_manager` dispatch on - `is_qwen3_hybrid` / `is_nemotron_hybrid`. - """ - - def get_model_config(self): - return QWEN3_5_VL_MOE_PARITY_CONFIG - - def get_trtllm_model_class(self): - return Qwen3_5MoeVLModel - - def get_hf_model_class(self): - return HFQwen3_5MoeForConditionalGeneration - - def get_weight_mapper_class(self): - return Qwen3_5MoeHfWeightMapper - - def get_model_type(self): - return "qwen3_5_moe" - - def get_model_config_class(self): - return transformers.Qwen3_5MoeConfig - - def create_trtllm_model( - self, - load_weights: bool = False, - hf_model_state_dict: Optional[dict] = None, - **kwargs, - ): - """Build the TRT-LLM model from a *normalized copy* of `self.hf_config`. - - Mirrors the base-class body but swaps in - `_normalize_qwen35_moe_vl_config(trtllm_config)` before - wrapping in `ModelConfig`. `self.hf_config` itself stays - raw so the HF model that the base class builds in `setUp` - sees native HF schema. - """ - trtllm_config = deepcopy(self.hf_config) - _normalize_qwen35_moe_vl_config(trtllm_config) - - model_config = ModelConfig(pretrained_config=trtllm_config) - model_class = self.get_trtllm_model_class() - model = model_class(model_config, **kwargs).to("cuda") - - if load_weights: - weight_mapper = self.get_weight_mapper_class()() - weight_mapper.init_model_and_config(model, trtllm_config) - model.load_weights(hf_model_state_dict, weight_mapper) - - for module in model.modules(): - if hasattr(module, "post_load_weights") and not getattr( - module, "_weights_removed", False - ): - module.post_load_weights() - - return model, model_config - - def _dummy_request_kwargs(self, scenario): - """Qwen3.5-VL uses mRoPE; the cache manager needs the mRoPE - position-id buffer allocated at dummy-request time.""" - return {"use_mrope": True} - - def get_tolerance(self): - """Tighten `rtol` to `0.1` (4x tighter than the base 0.4 - default) while keeping `atol` at `0.4` to absorb single-logit - tail outliers seen on `multiple_image` / `video`. - """ - return 0.4, 0.1 - - def get_trtllm_inputs( - self, - input_ids, - multimodal_params_list, - is_gen: bool = False, - num_cached_tokens_per_seq: Optional[List[int]] = None, - total_prompt_len: Optional[int] = None, - ): - """Override position_ids with mRoPE position IDs from the - multimodal params. Same pattern as `TestQwen3VLMoe` — the - VLM wrapper feeds mRoPE-shaped position IDs to the decoder, - not the simple range-based default the base class produces. - """ - trtllm_inputs = super().get_trtllm_inputs( - input_ids, - multimodal_params_list, - is_gen, - num_cached_tokens_per_seq, - total_prompt_len=total_prompt_len, - ) - - if is_gen: - mrope_gen_position_ids = [] - for multimodal_param in multimodal_params_list: - mrope_gen_position_ids.append( - multimodal_param.multimodal_data["mrope_config"]["mrope_position_deltas"] - ) - mrope_gen_position_ids = torch.cat(mrope_gen_position_ids, dim=-1).to(self.device) - trtllm_inputs["position_ids"] = ( - (trtllm_inputs["position_ids"] + mrope_gen_position_ids).expand(3, -1, 1).cuda() - ) - gen_multimodal_params_list = [] - for multimodal_param in multimodal_params_list: - multimodal_param.strip_for_generation() - multimodal_param.to_device( - "multimodal_data", - self.device, - pin_memory=True, - target_keywords=["mrope_config.mrope_position_deltas"], - ) - gen_multimodal_params_list.append(multimodal_param) - trtllm_inputs["multimodal_params"] = gen_multimodal_params_list - else: - mrope_position_ids = [] - for multimodal_param in multimodal_params_list: - mrope_position_ids.append( - multimodal_param.multimodal_data["mrope_config"]["mrope_position_ids"] - ) - position_ids = torch.cat(mrope_position_ids, dim=-1).cuda() - trtllm_inputs["position_ids"] = position_ids - - return trtllm_inputs - - def get_scenarios(self) -> List[MultimodalScenario]: - """Modality-sanity sweep (image / multiple_image / video). - - These three catch differences in placeholder counts and the - multimodal-cumsum path between single-image, multi-image, and - video inputs. - - CUDA-graph capture is intentionally not exercised here. The - standard `attn_metadata.create_cuda_graph_metadata` path only - addresses attention metadata; the Mamba SSM state buffer of the - hybrid (Mamba + attention) cache is not threaded through, so - replayed logits diverge from the HF reference. Adding that path - is dedicated harness work and tracked separately. - """ - return [ - MultimodalScenario( - modality="image", - use_cuda_graph=False, - chunked_prefill=False, - kv_cache_reuse=False, - ), - MultimodalScenario( - modality="multiple_image", - use_cuda_graph=False, - chunked_prefill=False, - kv_cache_reuse=False, - ), - MultimodalScenario( - modality="video", - use_cuda_graph=False, - chunked_prefill=False, - kv_cache_reuse=False, - ), - ] - - def test_construction_and_weight_loading_smoke(self): - """Smoke test: setUp built HF + TRT-LLM models and copied HF - weights into TRT-LLM via the weight mapper. Detailed - assertions on the normalizer's outputs live in the routing - tests above (e.g. `test_qwen35_moe_vl_config_preserves_vlm_architecture`) - — this one just confirms construction reached the end without - exception. - """ - self.assertIsNotNone(self.hf_model) - self.assertIsNotNone(self.trtllm_model) - self.assertIsNotNone(self.model_config)