NVIDIA
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
Lines changed: 28 additions & 19 deletions
diff --git a/examples/llm_ptq/scripts/parser.sh b/examples/llm_ptq/scripts/parser.sh
Lines changed: 6 additions & 4 deletions
diff --git a/modelopt/torch/quantization/_auto_quantize_cost.py b/modelopt/torch/quantization/_auto_quantize_cost.py
Lines changed: 16 additions & 1 deletion
@@ -66,7 +66,7 @@
     save_expert_token_count_table,
 )
 from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
-from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg, need_calibration
+from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
 from modelopt.torch.speculative.eagle.utils import (
@@ -292,6 +292,7 @@ def auto_quantize(
     args: argparse.Namespace,
     language_model: torch.nn.Module,
     calib_dataloader: DataLoader,
+    recipe: ModelOptPTQRecipe | None = None,
     auto_quantize_method="gradient",
     auto_quantize_score_size=128,
     auto_quantize_checkpoint=None,
@@ -323,6 +324,7 @@ def auto_quantize(
             "nvfp4_awq",
             "nvfp4_mse",
             "w4a8_awq",
+            "w4a16_nvfp4",
             "fp8_pb_wo",
             "w4a8_mxfp4_fp8",
             "nvfp4_mlp_only",
@@ -391,6 +393,10 @@ def forward_step(model, batch):
             "active_moe_expert_ratio": args.auto_quantize_active_moe_expert_ratio
         }
 
+    disabled_layers = recipe.quantize.disabled_layers if recipe is not None else None
+    if disabled_layers:
+        print(f"AutoQuantize disabled layers from recipe: {disabled_layers}")
+
     language_model, _ = mtq.auto_quantize(
         language_model,
         constraints=auto_quantize_constraints,
@@ -405,12 +411,7 @@ def forward_step(model, batch):
             len(calib_dataloader), max(auto_quantize_score_size // args.batch_size, 1)
         ),
         verbose=True,
-        # Disable all default disabled layers such as lm_head, mlp.gate, router etc.
-        disabled_layers=[
-            entry["quantizer_name"]
-            for entry in _default_disabled_quantizer_cfg
-            if "parent_class" not in entry
-        ],
+        disabled_layers=disabled_layers,
         method=auto_quantize_method,
         checkpoint=auto_quantize_checkpoint,
     )
@@ -550,12 +551,9 @@ def load_model(args: argparse.Namespace):
                 : len(args.dataset)
             ]
 
-            # We only quantize the language model for VLMs other than the type supported above.
-            # Recipe mode is the exception: in Qwen3.5/3.6-MoE VLMs, lm_head sits
-            # on the outer CausalLM, not the inner language backbone. A recipe that targets
-            # lm_head must therefore quantize against the full model and explicitly keep visual
-            # and MTP siblings disabled.
-            if args.recipe is None:
+            # AutoQuantize walks the outer CausalLM so lm_head is visible to the
+            # search. Visual/MTP siblings are excluded by disabled-layer patterns.
+            if args.auto_quantize_bits is None:
                 extracted_lm, extracted_model_type = extract_and_prepare_language_model_from_vl(
                     full_model
                 )
@@ -994,9 +992,10 @@ def quantize_main(
     default_pad_token,
     device: torch.device,
 ):
-    # Load the recipe up front so we can detect layerwise calibration before batch-size probing.
+    # Load the recipe up front so we can detect layerwise calibration before batch-size probing
+    # and read AutoQuantize search metadata such as disabled_layers.
     recipe = None
-    if args.recipe is not None and not args.auto_quantize_bits:
+    if args.recipe is not None:
         print(f"Use recipe {args.recipe} for quantization")
         recipe = load_recipe(args.recipe)
         if not isinstance(recipe, ModelOptPTQRecipe):
@@ -1081,10 +1080,18 @@ def _is_layerwise(obj):
             "Auto quantization needs multiple quantization format."
         )
 
+        # For VL models, autoquant must walk submodules of the OUTER CausalLM
+        # (which carries lm_head and the LM-head forward path) — otherwise
+        # lm_head and any sibling-of-language_model modules are silently
+        # invisible to the search. ``forward_step`` also needs the outer model
+        # to produce ``CausalLMOutputWithPast`` (for ``.loss`` / ``.logits``).
+        # Visual tower and MTP siblings must be excluded via disabled-layer patterns
+        # (*visual* / *mtp* / *vision_tower*) — NOTE(review): confirm where these are set.
         auto_quantize(
             args,
-            language_model,
+            full_model,
             calib_dataloader,
+            recipe=recipe,
             auto_quantize_method=args.auto_quantize_method,
             auto_quantize_score_size=args.auto_quantize_score_size,
             auto_quantize_checkpoint=args.auto_quantize_checkpoint,
@@ -1209,7 +1216,9 @@ def parse_args() -> argparse.Namespace:
         help=(
             "PTQ recipe YAML file or name without suffix (e.g. general/ptq/fp8_default-kv_fp8_cast, "
             "general/ptq/nvfp4_default-kv_fp8_cast, general/ptq/nvfp4_default-kv_nvfp4_cast). "
-            "When set, --kv_cache_qformat is ignored; the recipe fully determines KV cache config."
+            "For plain PTQ, the recipe fully determines the quantization config and --kv_cache_qformat "
+            "is ignored. For AutoQuantize, --qformat still determines the search formats while the "
+            "recipe may provide search metadata such as quantize.disabled_layers."
         ),
         default=None,
     )
@@ -1299,8 +1308,8 @@ def parse_args() -> argparse.Namespace:
             "Formats ending in '_cast' (fp8_cast, nvfp4_cast) set the amax to FP8 range "
             "without data-driven calibration. "
             "Other formats (fp8, nvfp4, etc.) use data-driven calibration. "
-            "Ignored when --recipe is given: the recipe YAML is authoritative for KV "
-            "cache config (use the *_cast_kv.yaml recipes for the cast variants)."
+            "Ignored for plain PTQ when --recipe is given because the recipe YAML is authoritative "
+            "for KV cache config (use the *_cast_kv.yaml recipes for the cast variants)."
         ),
     )
     parser.add_argument(
 
@@ -103,14 +103,16 @@ parse_options() {
   # Verify required options are provided
   if [ -z "$MODEL_PATH" ] || [ -z "$TASKS" ] || ([ -z "$QFORMAT" ] && [ -z "$RECIPE" ]); then
     echo "Usage: $0 --model=<MODEL_PATH> (--quant=<QFORMAT> | --recipe=<RECIPE>) --tasks=<TASK,...>"
+    echo "       AutoQuant may use both --quant=<QFORMATS> and --recipe=<RECIPE>."
     echo "Optional args: --sparsity=<SPARSITY_FMT> --awq_block_size=<AWQ_BLOCK_SIZE> --calib=<CALIB_SIZE>"
     exit 1
   fi
 
-  # --quant and --recipe are mutually exclusive: --recipe is a full PTQ spec, while
-  # --quant selects a built-in qformat preset. Pick exactly one.
-  if [ -n "$QFORMAT" ] && [ -n "$RECIPE" ]; then
-    echo "Cannot specify both --quant and --recipe; pick one." >&2
+  # For plain PTQ, --quant and --recipe are mutually exclusive: --recipe is a full PTQ spec,
+  # while --quant selects a built-in qformat preset. For AutoQuant, --quant selects the search
+  # candidates and --recipe may provide search metadata such as disabled_layers.
+  if [ -n "$QFORMAT" ] && [ -n "$RECIPE" ] && [ -z "$AUTO_QUANTIZE_BITS" ]; then
+    echo "Cannot specify both --quant and --recipe for plain PTQ; pick one." >&2
     exit 1
   fi
 
 
@@ -90,6 +90,21 @@ def is_routed_moe_module_name(name: str) -> bool:
     return "shared_expert" not in name and _ROUTED_MOE_EXPERT_NAME_RE.search(name) is not None
 
 
+def _get_module_weight_numel(module: nn.Module) -> int:
+    """Return ``weight.numel()``, else the summed ``numel()`` of ``gate_up_proj``/``down_proj``."""
+    weight = getattr(module, "weight", None)
+    if weight is not None:
+        return weight.numel()
+
+    # No usable ``weight``: fused MoE expert containers expose projection tensors
+    # directly, so sum whichever of them are present (0 when neither exists).
+    return sum(
+        param.numel()
+        for attr in ("gate_up_proj", "down_proj")
+        if (param := getattr(module, attr, None)) is not None
+    )
+
+
 class AutoQuantizeCostModel:
     """Base class for AutoQuantize effective-bits cost accounting."""
 
@@ -119,7 +134,7 @@ def total_weight_size(
     ) -> float:
         """Return the cost denominator for the effective-bits constraint."""
         return sum(
-            module.weight.numel() * self.module_cost_weight([name], cost_constraints)
+            _get_module_weight_numel(module) * self.module_cost_weight([name], cost_constraints)
             for name, module in named_modules
             if is_auto_quantize_module(module)
         )