NVIDIA
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
Lines changed: 70 additions & 16 deletions
diff --git a/modelopt/torch/quantization/_auto_quantize_cost.py b/modelopt/torch/quantization/_auto_quantize_cost.py
Lines changed: 54 additions & 4 deletions
@@ -66,6 +66,7 @@
     save_expert_token_count_table,
 )
 from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
+from modelopt.torch.quantization._auto_quantize_cost import EXCLUDED_MODULE_NAME_PATTERNS_KEY
 from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg, need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
@@ -140,6 +141,36 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
 mto.enable_huggingface_checkpointing()
 
 
+# TODO: To be refactored into config system.
+# Extra wildcard patterns that get_auto_quantize_disabled_layers() appends for every model,
+# not only Qwen; the name suggests they target Qwen3.6 module names (TODO confirm).
+_QWEN36_AUTOQ_DISABLED_LAYERS = (
+    "*shared_expert_gate*",
+    "*linear_attn.in_proj_a*",
+    "*linear_attn.in_proj_b*",
+)
+# Wildcard patterns for VLM sibling modules. They are only applied when
+# is_multimodal_model() is true: always to the disabled-layer list, and to cost
+# accounting only with --auto_quantize_cost_exclude_vlm_modules.
+_VLM_AUTOQ_DISABLED_LAYERS = ("*visual*", "*mtp*", "*vision_tower*")
+
+
+def get_auto_quantize_disabled_layers(model) -> list[str]:
+    """Return layer patterns that should be excluded from AutoQuantize search.
+
+    The list starts from the ``quantizer_name`` of every entry in
+    ``_default_disabled_quantizer_cfg`` that has no ``parent_class`` key, except
+    ``*lm_head*``, which is left out of the disabled list.
+    ``_QWEN36_AUTOQ_DISABLED_LAYERS`` is then appended for every model, and
+    ``_VLM_AUTOQ_DISABLED_LAYERS`` only when ``is_multimodal_model(model)`` is true.
+    Patterns already present are not added twice.
+
+    Args:
+        model: Model that will be passed to AutoQuantize; only inspected via
+            ``is_multimodal_model``.
+
+    Returns:
+        A new list of wildcard patterns.
+    """
+    disabled_layers = [
+        entry["quantizer_name"]
+        for entry in _default_disabled_quantizer_cfg
+        if "parent_class" not in entry and entry["quantizer_name"] != "*lm_head*"
+    ]
+    # The membership test in each generator runs lazily against the growing list,
+    # so a pattern that is already present is skipped.
+    disabled_layers.extend(p for p in _QWEN36_AUTOQ_DISABLED_LAYERS if p not in disabled_layers)
+    if is_multimodal_model(model):
+        disabled_layers.extend(p for p in _VLM_AUTOQ_DISABLED_LAYERS if p not in disabled_layers)
+    return disabled_layers
+
+
+def get_auto_quantize_cost_excluded_patterns(args, model) -> list[str]:
+    """Return layer patterns excluded only from AutoQuantize cost accounting.
+
+    Args:
+        args: Parsed CLI namespace; reads ``auto_quantize_cost_exclude_patterns`` and
+            ``auto_quantize_cost_exclude_vlm_modules``.
+        model: Model checked with ``is_multimodal_model`` before adding the VLM patterns.
+
+    Returns:
+        User-supplied patterns followed by ``_VLM_AUTOQ_DISABLED_LAYERS`` (when the flag is
+        set and the model is multimodal), de-duplicated with first-occurrence order kept.
+        The list is empty when nothing applies.
+    """
+    # ``or []`` covers the argparse default of None.
+    excluded_patterns = list(args.auto_quantize_cost_exclude_patterns or [])
+    if args.auto_quantize_cost_exclude_vlm_modules and is_multimodal_model(model):
+        excluded_patterns.extend(_VLM_AUTOQ_DISABLED_LAYERS)
+    # dict.fromkeys drops duplicates while keeping first-seen order.
+    return list(dict.fromkeys(excluded_patterns))
+
+
 def extract_and_prepare_language_model_from_vl(full_model):
     """Extract language model from VL model and disable quantization for non-language components.
 
@@ -323,6 +354,7 @@ def auto_quantize(
             "nvfp4_awq",
             "nvfp4_mse",
             "w4a8_awq",
+            "w4a16_nvfp4",
             "fp8_pb_wo",
             "w4a8_mxfp4_fp8",
             "nvfp4_mlp_only",
@@ -386,10 +418,14 @@ def forward_step(model, batch):
         "effective_bits": args.auto_quantize_bits,
         "cost_model": args.auto_quantize_cost_model,
     }
+    auto_quantize_cost = {}
     if args.auto_quantize_active_moe_expert_ratio is not None:
-        auto_quantize_constraints["cost"] = {
-            "active_moe_expert_ratio": args.auto_quantize_active_moe_expert_ratio
-        }
+        auto_quantize_cost["active_moe_expert_ratio"] = args.auto_quantize_active_moe_expert_ratio
+    cost_excluded_patterns = get_auto_quantize_cost_excluded_patterns(args, language_model)
+    if cost_excluded_patterns:
+        auto_quantize_cost[EXCLUDED_MODULE_NAME_PATTERNS_KEY] = cost_excluded_patterns
+    if auto_quantize_cost:
+        auto_quantize_constraints["cost"] = auto_quantize_cost
 
     language_model, _ = mtq.auto_quantize(
         language_model,
@@ -405,12 +441,7 @@ def forward_step(model, batch):
             len(calib_dataloader), max(auto_quantize_score_size // args.batch_size, 1)
         ),
         verbose=True,
-        # Disable all default disabled layers such as lm_head, mlp.gate, router etc.
-        disabled_layers=[
-            entry["quantizer_name"]
-            for entry in _default_disabled_quantizer_cfg
-            if "parent_class" not in entry
-        ],
+        disabled_layers=get_auto_quantize_disabled_layers(language_model),
         method=auto_quantize_method,
         checkpoint=auto_quantize_checkpoint,
     )
@@ -550,12 +581,10 @@ def load_model(args: argparse.Namespace):
                 : len(args.dataset)
             ]
 
-            # We only quantize the language model for VLMs other than the type supported above.
-            # Recipe mode is the exception: in Qwen3.5/3.6-MoE VLMs, lm_head sits
-            # on the outer CausalLM, not the inner language backbone. A recipe that targets
-            # lm_head must therefore quantize against the full model and explicitly keep visual
-            # and MTP siblings disabled.
-            if args.recipe is None:
+            # Plain PTQ quantizes only the extracted language model. Recipe and
+            # AutoQuantize paths keep the outer CausalLM so recipes/search can see
+            # Qwen3.5/3.6-MoE VLM lm_head.
+            if args.recipe is None and args.auto_quantize_bits is None:
                 extracted_lm, extracted_model_type = extract_and_prepare_language_model_from_vl(
                     full_model
                 )
@@ -1081,9 +1110,16 @@ def _is_layerwise(obj):
             "Auto quantization needs multiple quantization format."
         )
 
+        # For VL models, autoquant must walk submodules of the OUTER CausalLM
+        # (which carries lm_head and the LM-head forward path) — otherwise
+        # lm_head and any sibling-of-language_model modules are silently
+        # invisible to the search. ``forward_step`` also needs the outer model
+        # to produce ``CausalLMOutputWithPast`` (for ``.loss`` / ``.logits``).
+        # Visual tower and MTP siblings are auto-excluded inside
+        # ``auto_quantize()`` via *visual* / *mtp* / *vision_tower* patterns.
         auto_quantize(
             args,
-            language_model,
+            full_model,
             calib_dataloader,
             auto_quantize_method=args.auto_quantize_method,
             auto_quantize_score_size=args.auto_quantize_score_size,
@@ -1423,6 +1459,24 @@ def parse_args() -> argparse.Namespace:
             "routing; use --moe_calib_experts_ratio to control calibration expert coverage."
         ),
     )
+    parser.add_argument(
+        "--auto_quantize_cost_exclude_patterns",
+        nargs="+",
+        default=None,
+        help=(
+            "Wildcard module-name patterns to exclude from AutoQuantize effective-bits cost "
+            "accounting. The matched modules can still be disabled from quantization separately; "
+            "this flag only changes the budget denominator and selected-cost calculation."
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_cost_exclude_vlm_modules",
+        action="store_true",
+        help=(
+            "Exclude VLM sibling modules matching *visual*, *vision_tower*, and *mtp* from "
+            "AutoQuantize effective-bits cost accounting."
+        ),
+    )
     parser.add_argument(
         "--moe_calib_experts_ratio",
         type=float,
 
@@ -15,6 +15,7 @@
 
 """Cost models for AutoQuantize effective-bits accounting."""
 
+import fnmatch
 from collections.abc import Callable, Iterable, Sequence
 from typing import Any, Final
 
@@ -27,6 +28,7 @@
 
 AUTO_QUANTIZE_CONSTRAINT_KEYS: Final = frozenset({"effective_bits", "cost_model", "cost"})
 ACTIVE_MOE_EXPERT_RATIO_KEY: Final = "active_moe_expert_ratio"
+EXCLUDED_MODULE_NAME_PATTERNS_KEY: Final = "excluded_module_name_patterns"
 COST_MODEL_WEIGHT: Final = "weight"
 COST_MODEL_ACTIVE_MOE: Final = "active_moe"
 
@@ -90,11 +92,31 @@ def is_routed_moe_module_name(name: str) -> bool:
     return "shared_expert" not in name and _ROUTED_MOE_EXPERT_NAME_RE.search(name) is not None
 
 
+def _get_module_weight_numel(module: nn.Module) -> int:
+    """Return the parameter count for a module's quantizable weights.
+
+    Standard quantized linear modules have a single ``weight`` parameter. Fused
+    MoE expert containers expose projection tensors directly instead, so both
+    fused projections contribute to AutoQuantize cost accounting.
+
+    Args:
+        module: Module whose weight elements are counted.
+
+    Returns:
+        ``module.weight.numel()`` when ``weight`` is present and not ``None``;
+        otherwise the summed ``numel()`` of whichever of ``gate_up_proj`` and
+        ``down_proj`` exist, which is ``0`` if the module has neither.
+    """
+    weight = getattr(module, "weight", None)
+    if weight is not None:
+        return weight.numel()
+
+    # Fused MoE expert containers expose projection tensors directly instead of
+    # a single ``weight`` parameter.
+    return sum(
+        param.numel()
+        for attr in ("gate_up_proj", "down_proj")
+        if (param := getattr(module, attr, None)) is not None
+    )
+
+
 class AutoQuantizeCostModel:
     """Base class for AutoQuantize effective-bits cost accounting."""
 
     name: str
-    supported_cost_keys: frozenset[str] = frozenset()
+    supported_cost_keys: frozenset[str] = frozenset({EXCLUDED_MODULE_NAME_PATTERNS_KEY})
 
     def normalize_cost_constraints(
         self, model: nn.Module, cost_constraints: dict[str, Any]
@@ -103,12 +125,35 @@ def normalize_cost_constraints(
         unknown_cost_keys = set(cost_constraints) - self.supported_cost_keys
         if unknown_cost_keys:
             raise ValueError(f"Unsupported auto_quantize cost constraints: {unknown_cost_keys}.")
+        excluded_patterns = cost_constraints.get(EXCLUDED_MODULE_NAME_PATTERNS_KEY)
+        if excluded_patterns is None:
+            return cost_constraints
+        if isinstance(excluded_patterns, str):
+            excluded_patterns = [excluded_patterns]
+        if not isinstance(excluded_patterns, Sequence) or not all(
+            isinstance(pattern, str) for pattern in excluded_patterns
+        ):
+            raise ValueError(
+                f"constraints['cost']['{EXCLUDED_MODULE_NAME_PATTERNS_KEY}'] must be a string "
+                "or a sequence of strings."
+            )
+        cost_constraints[EXCLUDED_MODULE_NAME_PATTERNS_KEY] = list(excluded_patterns)
         return cost_constraints
 
     def module_cost_weight(
         self, module_names: Sequence[str], cost_constraints: dict[str, Any]
     ) -> float:
         """Return the cost multiplier for a group of modules."""
+        # A group costs nothing only when it is non-empty and every module name in it
+        # matches at least one excluded pattern; a partially matching group keeps 1.0.
+        # NOTE(review): fnmatch.fnmatch normalizes case via os.path.normcase, so matching
+        # is platform-dependent; fnmatch.fnmatchcase would always be case-sensitive.
+        excluded_patterns = cost_constraints.get(EXCLUDED_MODULE_NAME_PATTERNS_KEY, [])
+        if (
+            module_names
+            and excluded_patterns
+            and all(
+                any(fnmatch.fnmatch(name, pattern) for pattern in excluded_patterns)
+                for name in module_names
+            )
+        ):
+            return 0.0
         return 1.0
 
     def total_weight_size(
@@ -119,7 +164,7 @@ def total_weight_size(
     ) -> float:
         """Return the cost denominator for the effective-bits constraint."""
         return sum(
-            module.weight.numel() * self.module_cost_weight([name], cost_constraints)
+            _get_module_weight_numel(module) * self.module_cost_weight([name], cost_constraints)
             for name, module in named_modules
             if is_auto_quantize_module(module)
         )
@@ -135,7 +180,9 @@ class ActiveMoECostModel(AutoQuantizeCostModel):
     """Scale routed MoE expert weights by the active experts per-token ratio."""
 
     name = COST_MODEL_ACTIVE_MOE
-    supported_cost_keys = frozenset({ACTIVE_MOE_EXPERT_RATIO_KEY})
+    supported_cost_keys = frozenset(
+        {ACTIVE_MOE_EXPERT_RATIO_KEY, EXCLUDED_MODULE_NAME_PATTERNS_KEY}
+    )
 
     def normalize_cost_constraints(
         self, model: nn.Module, cost_constraints: dict[str, Any]
@@ -164,9 +211,12 @@ def normalize_cost_constraints(
     def module_cost_weight(
         self, module_names: Sequence[str], cost_constraints: dict[str, Any]
     ) -> float:
+        """Return 0.0 for excluded groups, the active-expert ratio for routed MoE groups."""
+        # Pattern exclusion decided by the base class takes precedence over the ratio.
+        base_weight = super().module_cost_weight(module_names, cost_constraints)
+        if base_weight == 0.0:
+            return 0.0
+        # One routed MoE expert name is enough to scale the whole group by the ratio.
         if any(is_routed_moe_module_name(n) for n in module_names):
             return cost_constraints[ACTIVE_MOE_EXPERT_RATIO_KEY]
-        return 1.0
+        return base_weight
 
 
 _COST_MODELS: Final = {