2727from cast_mxfp4_to_nvfp4 import apply_to_model as apply_cast_mxfp4_to_nvfp4
2828from cast_mxfp4_to_nvfp4 import force_weight_quantizers_static
2929from example_utils import (
30+ _get_auto_quantize_cost_excluded_patterns ,
31+ _get_auto_quantize_disabled_layers ,
3032 build_quant_cfg ,
3133 copy_custom_model_files ,
3234 create_vlm_calibration_loop ,
7274 save_expert_token_count_table ,
7375)
7476from modelopt .torch .export .model_utils import get_language_model_from_vl , is_multimodal_model
75- from modelopt .torch .quantization .config import _default_disabled_quantizer_cfg , need_calibration
77+ from modelopt .torch .quantization ._auto_quantize_cost import EXCLUDED_MODULE_NAME_PATTERNS_KEY
78+ from modelopt .torch .quantization .config import need_calibration
7679from modelopt .torch .quantization .plugins .accelerate import init_quantized_weights
7780from modelopt .torch .quantization .utils import is_quantized
7881from modelopt .torch .speculative .eagle .utils import (
@@ -132,6 +135,7 @@ def _kv_cfg_uses_constant_amax(kv_quant_cfg: list[dict[str, Any]]) -> bool:
132135 "nvfp4_awq_lite" ,
133136 "nvfp4_w4a4_weight_mse_fp8_sweep" ,
134137 "w4a8_awq_beta" ,
138+ "w4a16_nvfp4" ,
135139 "fp8_2d_blockwise_weight_only" ,
136140 "w4a8_mxfp4_fp8" ,
137141 "nvfp4_mlp_only" ,
@@ -387,10 +391,14 @@ def forward_step(model, batch):
387391 "effective_bits" : args .auto_quantize_bits ,
388392 "cost_model" : args .auto_quantize_cost_model ,
389393 }
394+ auto_quantize_cost = {}
390395 if args .auto_quantize_active_moe_expert_ratio is not None :
391- auto_quantize_constraints ["cost" ] = {
392- "active_moe_expert_ratio" : args .auto_quantize_active_moe_expert_ratio
393- }
396+ auto_quantize_cost ["active_moe_expert_ratio" ] = args .auto_quantize_active_moe_expert_ratio
397+ cost_excluded_patterns = _get_auto_quantize_cost_excluded_patterns (language_model )
398+ if cost_excluded_patterns :
399+ auto_quantize_cost [EXCLUDED_MODULE_NAME_PATTERNS_KEY ] = cost_excluded_patterns
400+ if auto_quantize_cost :
401+ auto_quantize_constraints ["cost" ] = auto_quantize_cost
394402
395403 language_model , _ = mtq .auto_quantize (
396404 language_model ,
@@ -406,12 +414,7 @@ def forward_step(model, batch):
406414 len (calib_dataloader ), max (auto_quantize_score_size // args .batch_size , 1 )
407415 ),
408416 verbose = True ,
409- # Disable all default disabled layers such as lm_head, mlp.gate, router etc.
410- disabled_layers = [
411- entry ["quantizer_name" ]
412- for entry in _default_disabled_quantizer_cfg
413- if "parent_class" not in entry
414- ],
417+ disabled_layers = _get_auto_quantize_disabled_layers (language_model ),
415418 method = auto_quantize_method ,
416419 checkpoint = auto_quantize_checkpoint ,
417420 )
@@ -487,7 +490,7 @@ def load_model(args: argparse.Namespace):
487490 is_nemotron_vl_model = is_nemotron_vl (full_model )
488491
489492 # Default to image-text calibration for VLM models
490- if is_nemotron_vl_model and not args .calib_with_images :
493+ if is_nemotron_vl_model and not args .calib_with_images and args . auto_quantize_bits is None :
491494 print ("Nemotron VL model detected. Enabling image-text calibration by default." )
492495 args .calib_with_images = True
493496
@@ -539,12 +542,10 @@ def load_model(args: argparse.Namespace):
539542 : len (args .dataset )
540543 ]
541544
542- # We only quantize the language model for VLMs other than the type supported above.
543- # Recipe mode is the exception: in Qwen3.5/3.6-MoE VLMs, lm_head sits
544- # on the outer CausalLM, not the inner language backbone. A recipe that targets
545- # lm_head must therefore quantize against the full model and explicitly keep visual
546- # and MTP siblings disabled.
547- if args .recipe is None :
545+ # Plain PTQ quantizes only the extracted language model. Recipe and
546+ # AutoQuantize paths keep the outer CausalLM so recipes/search can see
547+ # Qwen3.5/3.6-MoE VLM lm_head.
548+ if args .recipe is None and args .auto_quantize_bits is None :
548549 extracted_lm , extracted_model_type = extract_and_prepare_language_model_from_vl (
549550 full_model
550551 )
@@ -1070,9 +1071,16 @@ def _is_layerwise(obj):
10701071 "Auto quantization needs multiple quantization format."
10711072 )
10721073
1074+ # For VL models, autoquant must walk submodules of the OUTER CausalLM
1075+ # (which carries lm_head and the LM-head forward path) — otherwise
1076+ # lm_head and any sibling-of-language_model modules are silently
1077+ # invisible to the search. ``forward_step`` also needs the outer model
1078+ # to produce ``CausalLMOutputWithPast`` (for ``.loss`` / ``.logits``).
1079+ # Visual tower and MTP siblings are auto-excluded inside
1080+ # ``auto_quantize()`` via *visual* / *mtp* / *vision_tower* patterns.
10731081 auto_quantize (
10741082 args ,
1075- language_model ,
1083+ full_model ,
10761084 calib_dataloader ,
10771085 auto_quantize_method = args .auto_quantize_method ,
10781086 auto_quantize_score_size = args .auto_quantize_score_size ,
@@ -1437,6 +1445,8 @@ def parse_args() -> argparse.Namespace:
14371445 args = parser .parse_args ()
14381446 if args .moe_calib_experts_ratio is not None and not (0.0 < args .moe_calib_experts_ratio <= 1.0 ):
14391447 parser .error ("--moe_calib_experts_ratio must be in the range (0.0, 1.0]." )
1448+ if args .auto_quantize_bits is not None and args .calib_with_images :
1449+ parser .error ("--calib_with_images is not supported with --auto_quantize_bits." )
14401450 if args .auto_quantize_active_moe_expert_ratio is not None and not (
14411451 0.0 < args .auto_quantize_active_moe_expert_ratio <= 1.0
14421452 ):
0 commit comments