6666 save_expert_token_count_table ,
6767)
6868from modelopt .torch .export .model_utils import get_language_model_from_vl , is_multimodal_model
69+ from modelopt .torch .quantization ._auto_quantize_cost import EXCLUDED_MODULE_NAME_PATTERNS_KEY
6970from modelopt .torch .quantization .config import _default_disabled_quantizer_cfg , need_calibration
7071from modelopt .torch .quantization .plugins .accelerate import init_quantized_weights
7172from modelopt .torch .quantization .utils import is_quantized
@@ -140,6 +141,35 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
140141mto .enable_huggingface_checkpointing ()
141142
142143
144+ # TODO: To be refactored into config system.
145+ _QWEN36_AUTOQ_DISABLED_LAYERS = (
146+ "*shared_expert_gate*" ,
147+ "*linear_attn.in_proj_a*" ,
148+ "*linear_attn.in_proj_b*" ,
149+ )
150+ _VLM_AUTOQ_DISABLED_LAYERS = ("*visual*" , "*mtp*" , "*vision_tower*" )
151+
152+
153+ def get_auto_quantize_disabled_layers (model ) -> list [str ]:
154+ """Return layer patterns that should be excluded from AutoQuantize search."""
155+ disabled_layers = [
156+ entry ["quantizer_name" ]
157+ for entry in _default_disabled_quantizer_cfg
158+ if "parent_class" not in entry and entry ["quantizer_name" ] != "*lm_head*"
159+ ]
160+ disabled_layers .extend (p for p in _QWEN36_AUTOQ_DISABLED_LAYERS if p not in disabled_layers )
161+ if is_multimodal_model (model ):
162+ disabled_layers .extend (p for p in _VLM_AUTOQ_DISABLED_LAYERS if p not in disabled_layers )
163+ return disabled_layers
164+
165+
166+ def get_auto_quantize_cost_excluded_patterns (model ) -> list [str ]:
167+ """Return layer patterns excluded only from AutoQuantize cost accounting."""
168+ if is_multimodal_model (model ):
169+ return list (_VLM_AUTOQ_DISABLED_LAYERS )
170+ return []
171+
172+
143173def extract_and_prepare_language_model_from_vl (full_model ):
144174 """Extract language model from VL model and disable quantization for non-language components.
145175
@@ -323,6 +353,7 @@ def auto_quantize(
323353 "nvfp4_awq" ,
324354 "nvfp4_mse" ,
325355 "w4a8_awq" ,
356+ "w4a16_nvfp4" ,
326357 "fp8_pb_wo" ,
327358 "w4a8_mxfp4_fp8" ,
328359 "nvfp4_mlp_only" ,
@@ -386,10 +417,14 @@ def forward_step(model, batch):
386417 "effective_bits" : args .auto_quantize_bits ,
387418 "cost_model" : args .auto_quantize_cost_model ,
388419 }
420+ auto_quantize_cost = {}
389421 if args .auto_quantize_active_moe_expert_ratio is not None :
390- auto_quantize_constraints ["cost" ] = {
391- "active_moe_expert_ratio" : args .auto_quantize_active_moe_expert_ratio
392- }
422+ auto_quantize_cost ["active_moe_expert_ratio" ] = args .auto_quantize_active_moe_expert_ratio
423+ cost_excluded_patterns = get_auto_quantize_cost_excluded_patterns (language_model )
424+ if cost_excluded_patterns :
425+ auto_quantize_cost [EXCLUDED_MODULE_NAME_PATTERNS_KEY ] = cost_excluded_patterns
426+ if auto_quantize_cost :
427+ auto_quantize_constraints ["cost" ] = auto_quantize_cost
393428
394429 language_model , _ = mtq .auto_quantize (
395430 language_model ,
@@ -405,12 +440,7 @@ def forward_step(model, batch):
405440 len (calib_dataloader ), max (auto_quantize_score_size // args .batch_size , 1 )
406441 ),
407442 verbose = True ,
408- # Disable all default disabled layers such as lm_head, mlp.gate, router etc.
409- disabled_layers = [
410- entry ["quantizer_name" ]
411- for entry in _default_disabled_quantizer_cfg
412- if "parent_class" not in entry
413- ],
443+ disabled_layers = get_auto_quantize_disabled_layers (language_model ),
414444 method = auto_quantize_method ,
415445 checkpoint = auto_quantize_checkpoint ,
416446 )
@@ -550,12 +580,10 @@ def load_model(args: argparse.Namespace):
550580 : len (args .dataset )
551581 ]
552582
553- # We only quantize the language model for VLMs other than the type supported above.
554- # Recipe mode is the exception: in Qwen3.5/3.6-MoE VLMs, lm_head sits
555- # on the outer CausalLM, not the inner language backbone. A recipe that targets
556- # lm_head must therefore quantize against the full model and explicitly keep visual
557- # and MTP siblings disabled.
558- if args .recipe is None :
583+ # Plain PTQ quantizes only the extracted language model. Recipe and
584+ # AutoQuantize paths keep the outer CausalLM so recipes/search can see
585+ # Qwen3.5/3.6-MoE VLM lm_head.
586+ if args .recipe is None and args .auto_quantize_bits is None :
559587 extracted_lm , extracted_model_type = extract_and_prepare_language_model_from_vl (
560588 full_model
561589 )
@@ -1081,9 +1109,16 @@ def _is_layerwise(obj):
10811109 "Auto quantization needs multiple quantization format."
10821110 )
10831111
1112+ # For VL models, autoquant must walk submodules of the OUTER CausalLM
1113+ # (which carries lm_head and the LM-head forward path) — otherwise
1114+ # lm_head and any sibling-of-language_model modules are silently
1115+ # invisible to the search. ``forward_step`` also needs the outer model
1116+ # to produce ``CausalLMOutputWithPast`` (for ``.loss`` / ``.logits``).
1117+ # Visual tower and MTP siblings are auto-excluded inside
1118+ # ``auto_quantize()`` via *visual* / *mtp* / *vision_tower* patterns.
10841119 auto_quantize (
10851120 args ,
1086- language_model ,
1121+ full_model ,
10871122 calib_dataloader ,
10881123 auto_quantize_method = args .auto_quantize_method ,
10891124 auto_quantize_score_size = args .auto_quantize_score_size ,
0 commit comments