NVIDIA
diff --git a/‎examples/llm_ptq/example_utils.py‎
Lines changed: 55 additions & 1 deletion b/‎examples/llm_ptq/example_utils.py‎
Lines changed: 55 additions & 1 deletion
diff --git a/‎examples/llm_ptq/hf_ptq.py‎
Lines changed: 28 additions & 18 deletions b/‎examples/llm_ptq/hf_ptq.py‎
Lines changed: 28 additions & 18 deletions
diff --git a/‎modelopt/torch/quantization/_auto_quantize_cost.py‎
Lines changed: 54 additions & 4 deletions b/‎modelopt/torch/quantization/_auto_quantize_cost.py‎
Lines changed: 54 additions & 4 deletions
@@ -42,6 +42,9 @@
     ProcessorMixin,
 )
 
+from modelopt.torch.export.model_utils import is_multimodal_model
+from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg
+
 try:
     from huggingface_hub import snapshot_download
 except ImportError:
@@ -51,6 +54,58 @@
 
 SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]
 
+# TODO: Refactor into the config system.
+# Wildcard patterns appended to the AutoQuantize disabled-layer list by
+# ``_get_auto_quantize_disabled_layers`` when ``_is_qwen_model`` returns True.
+_QWEN36_AUTOQ_DISABLED_LAYERS = (
+    "*shared_expert_gate*",
+    "*linear_attn.in_proj_a*",
+    "*linear_attn.in_proj_b*",
+)
+# Wildcard patterns used for multimodal models: appended to the disabled-layer
+# list and also returned by ``_get_auto_quantize_cost_excluded_patterns``.
+_VLM_AUTOQ_DISABLED_LAYERS = ("*visual*", "*mtp*", "*vision_tower*")
+
+
+def _is_qwen_model(model) -> bool:
+    """Detect Qwen-family models from class names and config metadata.
+
+    The check is a case-insensitive substring search for ``"qwen"`` over the
+    model class name plus, for the model config and its ``text_config`` /
+    ``language_config`` sub-configs, the config class name, ``model_type``
+    and ``architectures`` entries.
+    """
+
+    def _identifiers(cfg):
+        # Yield every string identifier a single config object exposes.
+        yield type(cfg).__name__
+        kind = getattr(cfg, "model_type", None)
+        if kind is not None:
+            yield str(kind)
+        archs = getattr(cfg, "architectures", ()) or ()
+        # A bare string is treated as a single architecture name.
+        for arch in (archs,) if isinstance(archs, str) else archs:
+            yield str(arch)
+
+    root_cfg = getattr(model, "config", None)
+    nested_cfgs = (getattr(root_cfg, attr, None) for attr in ("text_config", "language_config"))
+    names = [type(model).__name__]
+    for cfg in (root_cfg, *nested_cfgs):
+        if cfg is not None:
+            names.extend(_identifiers(cfg))
+    return any("qwen" in name.lower() for name in names)
+
+
+def _get_auto_quantize_disabled_layers(model) -> list[str]:
+    """Build the quantizer-name patterns AutoQuantize must leave untouched.
+
+    Starts from the default disabled-quantizer entries that carry no
+    ``parent_class`` key, leaving ``*lm_head*`` out of the result, then
+    appends the Qwen and multimodal pattern sets (skipping patterns that are
+    already present) when the model matches.
+    """
+    patterns: list[str] = []
+    for entry in _default_disabled_quantizer_cfg:
+        if "parent_class" in entry:
+            continue
+        quantizer_name = entry["quantizer_name"]
+        if quantizer_name != "*lm_head*":
+            patterns.append(quantizer_name)
+
+    def _append_missing(extra_patterns) -> None:
+        # Add each pattern once, preserving first-seen order.
+        for pattern in extra_patterns:
+            if pattern not in patterns:
+                patterns.append(pattern)
+
+    if _is_qwen_model(model):
+        _append_missing(_QWEN36_AUTOQ_DISABLED_LAYERS)
+    if is_multimodal_model(model):
+        _append_missing(_VLM_AUTOQ_DISABLED_LAYERS)
+    return patterns
+
+
+def _get_auto_quantize_cost_excluded_patterns(model) -> list[str]:
+    """Return module-name patterns to drop from AutoQuantize cost accounting.
+
+    Multimodal models get a fresh list copy of the VLM pattern set; every
+    other model gets an empty list.
+    """
+    return list(_VLM_AUTOQ_DISABLED_LAYERS) if is_multimodal_model(model) else []
+
 
 def run_nemotron_vl_preview(
     full_model,
@@ -133,7 +188,6 @@ def is_nemotron_vl(model_or_config):
     # Try to get config from model, or use directly if it's a config
     if hasattr(model_or_config, "config"):
         config = model_or_config.config
-        from modelopt.torch.export.model_utils import is_multimodal_model
 
         if not is_multimodal_model(model_or_config):
             return False
 
@@ -27,6 +27,8 @@
 from cast_mxfp4_to_nvfp4 import apply_to_model as apply_cast_mxfp4_to_nvfp4
 from cast_mxfp4_to_nvfp4 import force_weight_quantizers_static
 from example_utils import (
+    _get_auto_quantize_cost_excluded_patterns,
+    _get_auto_quantize_disabled_layers,
     build_quant_cfg,
     copy_custom_model_files,
     create_vlm_calibration_loop,
@@ -72,7 +74,8 @@
     save_expert_token_count_table,
 )
 from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
-from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg, need_calibration
+from modelopt.torch.quantization._auto_quantize_cost import EXCLUDED_MODULE_NAME_PATTERNS_KEY
+from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
 from modelopt.torch.speculative.eagle.utils import (
@@ -132,6 +135,7 @@ def _kv_cfg_uses_constant_amax(kv_quant_cfg: list[dict[str, Any]]) -> bool:
         "nvfp4_awq_lite",
         "nvfp4_w4a4_weight_mse_fp8_sweep",
         "w4a8_awq_beta",
+        "w4a16_nvfp4",
         "fp8_2d_blockwise_weight_only",
         "w4a8_mxfp4_fp8",
         "nvfp4_mlp_only",
@@ -387,10 +391,14 @@ def forward_step(model, batch):
         "effective_bits": args.auto_quantize_bits,
         "cost_model": args.auto_quantize_cost_model,
     }
+    auto_quantize_cost = {}
     if args.auto_quantize_active_moe_expert_ratio is not None:
-        auto_quantize_constraints["cost"] = {
-            "active_moe_expert_ratio": args.auto_quantize_active_moe_expert_ratio
-        }
+        auto_quantize_cost["active_moe_expert_ratio"] = args.auto_quantize_active_moe_expert_ratio
+    cost_excluded_patterns = _get_auto_quantize_cost_excluded_patterns(language_model)
+    if cost_excluded_patterns:
+        auto_quantize_cost[EXCLUDED_MODULE_NAME_PATTERNS_KEY] = cost_excluded_patterns
+    if auto_quantize_cost:
+        auto_quantize_constraints["cost"] = auto_quantize_cost
 
     language_model, _ = mtq.auto_quantize(
         language_model,
@@ -406,12 +414,7 @@ def forward_step(model, batch):
             len(calib_dataloader), max(auto_quantize_score_size // args.batch_size, 1)
         ),
         verbose=True,
-        # Disable all default disabled layers such as lm_head, mlp.gate, router etc.
-        disabled_layers=[
-            entry["quantizer_name"]
-            for entry in _default_disabled_quantizer_cfg
-            if "parent_class" not in entry
-        ],
+        disabled_layers=_get_auto_quantize_disabled_layers(language_model),
         method=auto_quantize_method,
         checkpoint=auto_quantize_checkpoint,
     )
@@ -487,7 +490,7 @@ def load_model(args: argparse.Namespace):
     is_nemotron_vl_model = is_nemotron_vl(full_model)
 
     # Default to image-text calibration for VLM models
-    if is_nemotron_vl_model and not args.calib_with_images:
+    if is_nemotron_vl_model and not args.calib_with_images and args.auto_quantize_bits is None:
         print("Nemotron VL model detected. Enabling image-text calibration by default.")
         args.calib_with_images = True
 
@@ -539,12 +542,10 @@ def load_model(args: argparse.Namespace):
                 : len(args.dataset)
             ]
 
-            # We only quantize the language model for VLMs other than the type supported above.
-            # Recipe mode is the exception: in Qwen3.5/3.6-MoE VLMs, lm_head sits
-            # on the outer CausalLM, not the inner language backbone. A recipe that targets
-            # lm_head must therefore quantize against the full model and explicitly keep visual
-            # and MTP siblings disabled.
-            if args.recipe is None:
+            # Plain PTQ quantizes only the extracted language model. Recipe and
+            # AutoQuantize paths keep the outer CausalLM so recipes/search can see
+            # Qwen3.5/3.6-MoE VLM lm_head.
+            if args.recipe is None and args.auto_quantize_bits is None:
                 extracted_lm, extracted_model_type = extract_and_prepare_language_model_from_vl(
                     full_model
                 )
@@ -1070,9 +1071,16 @@ def _is_layerwise(obj):
             "Auto quantization needs multiple quantization format."
         )
 
+        # For VL models, autoquant must walk submodules of the OUTER CausalLM
+        # (which carries lm_head and the LM-head forward path) — otherwise
+        # lm_head and any sibling-of-language_model modules are silently
+        # invisible to the search. ``forward_step`` also needs the outer model
+        # to produce ``CausalLMOutputWithPast`` (for ``.loss`` / ``.logits``).
+        # Visual tower and MTP siblings are auto-excluded inside
+        # ``auto_quantize()`` via *visual* / *mtp* / *vision_tower* patterns.
         auto_quantize(
             args,
-            language_model,
+            full_model,
             calib_dataloader,
             auto_quantize_method=args.auto_quantize_method,
             auto_quantize_score_size=args.auto_quantize_score_size,
@@ -1437,6 +1445,8 @@ def parse_args() -> argparse.Namespace:
     args = parser.parse_args()
     if args.moe_calib_experts_ratio is not None and not (0.0 < args.moe_calib_experts_ratio <= 1.0):
         parser.error("--moe_calib_experts_ratio must be in the range (0.0, 1.0].")
+    if args.auto_quantize_bits is not None and args.calib_with_images:
+        parser.error("--calib_with_images is not supported with --auto_quantize_bits.")
     if args.auto_quantize_active_moe_expert_ratio is not None and not (
         0.0 < args.auto_quantize_active_moe_expert_ratio <= 1.0
     ):
 
@@ -15,6 +15,7 @@
 
 """Cost models for AutoQuantize effective-bits accounting."""
 
+import fnmatch
 from collections.abc import Callable, Iterable, Sequence
 from typing import Any, Final
 
@@ -27,6 +28,7 @@
 
 AUTO_QUANTIZE_CONSTRAINT_KEYS: Final = frozenset({"effective_bits", "cost_model", "cost"})
 ACTIVE_MOE_EXPERT_RATIO_KEY: Final = "active_moe_expert_ratio"
+EXCLUDED_MODULE_NAME_PATTERNS_KEY: Final = "excluded_module_name_patterns"
 COST_MODEL_WEIGHT: Final = "weight"
 COST_MODEL_ACTIVE_MOE: Final = "active_moe"
 
@@ -90,11 +92,31 @@ def is_routed_moe_module_name(name: str) -> bool:
     return "shared_expert" not in name and _ROUTED_MOE_EXPERT_NAME_RE.search(name) is not None
 
 
+def _get_module_weight_numel(module: nn.Module) -> int:
+    """Count the elements of a module's quantizable weight tensors.
+
+    A module with a non-``None`` ``weight`` attribute is sized by that tensor
+    alone. Otherwise the module is treated as a fused MoE expert container and
+    the sizes of whichever of ``gate_up_proj`` / ``down_proj`` it exposes are
+    added together (``0`` when it has neither).
+    """
+    if (weight := getattr(module, "weight", None)) is not None:
+        return weight.numel()
+
+    # No single ``weight``: sum the fused projection tensors that are present.
+    total = 0
+    for proj_name in ("gate_up_proj", "down_proj"):
+        proj = getattr(module, proj_name, None)
+        if proj is not None:
+            total += proj.numel()
+    return total
+
+
 class AutoQuantizeCostModel:
     """Base class for AutoQuantize effective-bits cost accounting."""
 
     name: str
-    supported_cost_keys: frozenset[str] = frozenset()
+    supported_cost_keys: frozenset[str] = frozenset({EXCLUDED_MODULE_NAME_PATTERNS_KEY})
 
     def normalize_cost_constraints(
         self, model: nn.Module, cost_constraints: dict[str, Any]
@@ -103,12 +125,35 @@ def normalize_cost_constraints(
         unknown_cost_keys = set(cost_constraints) - self.supported_cost_keys
         if unknown_cost_keys:
             raise ValueError(f"Unsupported auto_quantize cost constraints: {unknown_cost_keys}.")
+        excluded_patterns = cost_constraints.get(EXCLUDED_MODULE_NAME_PATTERNS_KEY)
+        if excluded_patterns is None:
+            return cost_constraints
+        if isinstance(excluded_patterns, str):
+            excluded_patterns = [excluded_patterns]
+        if not isinstance(excluded_patterns, Sequence) or not all(
+            isinstance(pattern, str) for pattern in excluded_patterns
+        ):
+            raise ValueError(
+                f"constraints['cost']['{EXCLUDED_MODULE_NAME_PATTERNS_KEY}'] must be a string "
+                "or a sequence of strings."
+            )
+        cost_constraints[EXCLUDED_MODULE_NAME_PATTERNS_KEY] = list(excluded_patterns)
         return cost_constraints
 
     def module_cost_weight(
         self, module_names: Sequence[str], cost_constraints: dict[str, Any]
     ) -> float:
         """Return the cost multiplier for a group of modules."""
+        # A non-empty group whose every module name matches at least one
+        # excluded wildcard pattern contributes nothing to the cost (0.0);
+        # a group with any non-matching name keeps the full multiplier.
+        # NOTE(review): ``fnmatch.fnmatch`` normalizes case via
+        # ``os.path.normcase``, so matching is case-insensitive on Windows;
+        # ``fnmatch.fnmatchcase`` would be platform-independent — confirm intent.
+        excluded_patterns = cost_constraints.get(EXCLUDED_MODULE_NAME_PATTERNS_KEY, [])
+        if (
+            module_names
+            and excluded_patterns
+            and all(
+                any(fnmatch.fnmatch(name, pattern) for pattern in excluded_patterns)
+                for name in module_names
+            )
+        ):
+            return 0.0
         return 1.0
 
     def total_weight_size(
@@ -119,7 +164,7 @@ def total_weight_size(
     ) -> float:
         """Return the cost denominator for the effective-bits constraint."""
         return sum(
-            module.weight.numel() * self.module_cost_weight([name], cost_constraints)
+            _get_module_weight_numel(module) * self.module_cost_weight([name], cost_constraints)
             for name, module in named_modules
             if is_auto_quantize_module(module)
         )
@@ -135,7 +180,9 @@ class ActiveMoECostModel(AutoQuantizeCostModel):
     """Scale routed MoE expert weights by the active experts per-token ratio."""
 
     name = COST_MODEL_ACTIVE_MOE
-    supported_cost_keys = frozenset({ACTIVE_MOE_EXPERT_RATIO_KEY})
+    supported_cost_keys = frozenset(
+        {ACTIVE_MOE_EXPERT_RATIO_KEY, EXCLUDED_MODULE_NAME_PATTERNS_KEY}
+    )
 
     def normalize_cost_constraints(
         self, model: nn.Module, cost_constraints: dict[str, Any]
@@ -164,9 +211,12 @@ def normalize_cost_constraints(
     def module_cost_weight(
         self, module_names: Sequence[str], cost_constraints: dict[str, Any]
     ) -> float:
+        # A zero multiplier from the base class (group fully matched by the
+        # excluded patterns) takes precedence over MoE scaling.
+        base_weight = super().module_cost_weight(module_names, cost_constraints)
+        if base_weight == 0.0:
+            return 0.0
+        # Any routed-MoE expert name in the group scales the whole group by
+        # the configured active-expert ratio.
         if any(is_routed_moe_module_name(n) for n in module_names):
             return cost_constraints[ACTIVE_MOE_EXPERT_RATIO_KEY]
-        return 1.0
+        return base_weight
 
 
 _COST_MODELS: Final = {