3535# runtime layer asks the model module how to load its own config.
3636#
3737# There are two entry points:
38- # - `_Qwen35ConfigCompat .normalize(config_dict)` — for text-only
38+ # - `Qwen35ConfigCompat .normalize(config_dict)` — for text-only
3939# Qwen3.5 (MoE and dense). Returns a dict that
4040# `transformers.Qwen3NextConfig.from_dict(...)` can consume, so the
4141# existing Qwen3Next runtime is reused unchanged.
4545# while keeping `text_config` / `vision_config` composite.
4646
4747
48- class _Qwen35ConfigCompat :
48+ class Qwen35ConfigCompat :
4949 """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig.
5050
5151 We normalize to `Qwen3NextConfig` (rather than to a Qwen3.5-native
@@ -66,9 +66,9 @@ class _Qwen35ConfigCompat:
6666 @staticmethod
6767 def normalize (config_dict : dict ) -> dict :
6868 """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict."""
69- text_config = _Qwen35ConfigCompat ._extract_text_config (config_dict )
70- text_config = _Qwen35ConfigCompat ._inherit_quantization_config (config_dict , text_config )
71- text_config = _Qwen35ConfigCompat ._flatten_rope (text_config )
69+ text_config = Qwen35ConfigCompat ._extract_text_config (config_dict )
70+ text_config = Qwen35ConfigCompat ._inherit_quantization_config (config_dict , text_config )
71+ text_config = Qwen35ConfigCompat ._flatten_rope (text_config )
7272
7373 # Detect dense vs MoE and set architecture + MoE defaults accordingly
7474 is_moe = "num_experts" in text_config and text_config ["num_experts" ] > 0
@@ -93,7 +93,7 @@ def normalize(config_dict: dict) -> dict:
9393 def _extract_text_config (config_dict : dict ) -> dict :
9494 """Pull nested text_config from VLM checkpoints, or use dict as-is."""
9595 architectures = config_dict .get ("architectures" ) or []
96- if architectures and architectures [0 ] in _Qwen35ConfigCompat ._VLM_ARCHITECTURES :
96+ if architectures and architectures [0 ] in Qwen35ConfigCompat ._VLM_ARCHITECTURES :
9797 text_config = dict (config_dict .get ("text_config" ) or {})
9898 else :
9999 text_config = dict (config_dict )
@@ -116,10 +116,10 @@ def _inherit_quantization_config(config_dict: dict, text_config: dict) -> dict:
116116
117117 quantization_config = dict (config_dict ["quantization_config" ])
118118 if "modules_to_not_convert" in quantization_config :
119- modules = _Qwen35ConfigCompat ._normalize_exclude_modules (
119+ modules = Qwen35ConfigCompat ._normalize_exclude_modules (
120120 quantization_config ["modules_to_not_convert" ]
121121 )
122- modules = _Qwen35ConfigCompat ._add_qkvz_bf16_workaround (text_config , modules )
122+ modules = Qwen35ConfigCompat ._add_qkvz_bf16_workaround (text_config , modules )
123123 quantization_config ["modules_to_not_convert" ] = sorted (set (modules ))
124124 text_config ["quantization_config" ] = quantization_config
125125 return text_config
@@ -209,7 +209,7 @@ def _normalize_qwen35_mrope_config(text_config) -> None:
209209 return
210210 if hasattr (rope_parameters , "to_dict" ):
211211 rope_parameters = rope_parameters .to_dict ()
212- flattened = _Qwen35ConfigCompat ._flatten_rope (
212+ flattened = Qwen35ConfigCompat ._flatten_rope (
213213 {
214214 "rope_parameters" : dict (rope_parameters ),
215215 "rope_scaling" : dict (getattr (text_config , "rope_scaling" , None ) or {}),
@@ -245,9 +245,9 @@ def _normalize_qwen35_quantization_config(model_config) -> None:
245245 return
246246
247247 text_config = getattr (model_config , "text_config" , None )
248- normalized_modules = _Qwen35ConfigCompat ._normalize_exclude_modules (modules )
248+ normalized_modules = Qwen35ConfigCompat ._normalize_exclude_modules (modules )
249249 if text_config is not None :
250- normalized_modules = _Qwen35ConfigCompat ._add_qkvz_bf16_workaround (
250+ normalized_modules = Qwen35ConfigCompat ._add_qkvz_bf16_workaround (
251251 text_config .to_dict (), normalized_modules
252252 )
253253 quantization_config ["modules_to_not_convert" ] = sorted (set (normalized_modules ))
@@ -331,7 +331,7 @@ class Qwen3_5ForCausalLM(Qwen3NextForCausalLM):
331331
332332 Same reuse pattern as Qwen3_5MoeForCausalLM, but for the dense 27B
333333 variant which uses GatedMLP instead of SparseMoeBlock. The config
334- normalizer (_Qwen35ConfigCompat ) sets num_experts=0 so that
334+ normalizer (Qwen35ConfigCompat ) sets num_experts=0 so that
335335 Qwen3NextModel selects GatedMLP for the feed-forward layers.
336336 """
337337
@@ -340,6 +340,7 @@ def __init__(self, model_config):
340340 super ().__init__ (model_config )
341341
342342
343+ # TODO: Add tests for disaggregated support.
343344@support_multimodal_disaggregated
344345@register_vision_encoder (Qwen3VisionModelBase , vlm_base_model = Qwen3VisionModel )
345346@register_auto_model ("Qwen3_5MoeForConditionalGeneration" )
0 commit comments