Commit 2d868d3
Performant layerwise calibration for large models (#1251)
## Summary

Adds **performant layerwise calibration** for quantizing large models (e.g. DeepSeek-R1 671B) that don't fit entirely on GPU. ([Example commands](#example-commands))

1. **Performant calibration for large models** — Each decoder layer is moved from CPU/disk to GPU (accelerate) or unsharded (FSDP2) **only once** and kept on GPU for the entire calibration step. Previously, every calibration batch triggered weight transfer for every layer — O(num_batches) weight movements per layer. Now it is O(1) per layer. This also means you can **increase batch size** since only one layer's weights occupy GPU at a time — e.g. DeepSeek-R1 on a single node (8×80GB) with `batch_size=16` and `gpu_max_mem_percentage=0.5`.
2. **Checkpoint save/resume** — Saves progress after each layer, so jobs that exceed cluster time limits (e.g. 4-hour Slurm windows for 100+ layer MoE models) can resume from the last completed layer.
3. **Rename** — `sequential_calibrate` → `layerwise_calibrate` for clarity.

### Design details

The existing layerwise state machine (skip/run/capture) already processes one layer at a time, but skip-mode layers still kept their parameters in the ModuleList — so frameworks transferred all weights every forward pass. This PR adds:

- **`_SkipLayer`**: replaces fully-calibrated layers with a parameter-free dummy in the ModuleList, so framework hooks have nothing to transfer
- **`persistent_materialization`**: keeps the active layer on GPU for the entire calibration step, avoiding repeated offload/reload cycles

Checkpoint save is per-layer; restore is bulk — quantizer state and weights for layers 0..K-1 are restored once at the end of calibration, keeping the hot path fast.
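As a rough sketch of the `_SkipLayer` idea (a hypothetical minimal version; the helper name `swap_in_skip_layer` is illustrative, not the modelopt API):

```python
import torch.nn as nn


class _SkipLayer(nn.Module):
    """Parameter-free stand-in for a fully calibrated decoder layer.

    Because it holds no parameters, accelerate/FSDP2 hooks have nothing
    to offload or transfer when the ModuleList is traversed.
    """

    def forward(self, *args, **kwargs):
        # Skip-mode layers are never executed during layerwise calibration;
        # the active layer runs from its captured inputs instead.
        raise RuntimeError("_SkipLayer must not be executed during calibration")


def swap_in_skip_layer(layers: nn.ModuleList, idx: int) -> nn.Module:
    """Replace layers[idx] with a parameter-free dummy; return the original for bulk restore."""
    original = layers[idx]
    layers[idx] = _SkipLayer()
    return original
```

After the swap, a framework hook iterating the ModuleList finds zero parameters at the skipped index, so no weight movement is triggered; the returned original is kept aside for the bulk restore at the end of calibration.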
### Example commands

**Qwen3-8B** (NVFP4+GPTQ, single GPU):

```bash
python hf_ptq.py \
    --pyt_ckpt_path Qwen/Qwen3-8B \
    --recipe nvfp4_gptq_sequential.yaml \
    --calib_size 64 \
    --batch_size 16 \
    --dataset cnn_dailymail \
    --export_path outputs/qwen3_8b_nvfp4_gptq_seq \
    --gpu_max_mem_percentage 0.5 \
    --use_seq_device_map \
    --vllm_fakequant_export
```

**DeepSeek-R1** (NVFP4 experts-only + FP8 KV, 8×80GB):

```bash
python hf_ptq.py \
    --model unsloth/DeepSeek-R1-0528-BF16 \
    --recipe ../../modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml \
    --dataset cnn_dailymail \
    --batch_size 16 \
    --calib_size 64 \
    --calib_seq 512 \
    --gpu_max_mem_percentage 0.5 \
    --use_seq_device_map \
    --trust_remote_code \
    --export_path output/DeepSeek-R1-BF16-nvfp4-experts-only-fp8-kv \
    --vllm_fakequant_export
```

### Example: NVFP4+GPTQ layerwise calibration on Qwen3-8B (36 layers, single GPU — 20 GB peak)

**Initial run** (killed after layer 11):

```
Layerwise calibration: Found 36 transformer layers
Calibrating layer 1/36 | capture: [1]
Computing Hessians for 7 linear layers...
GPTQ time: 51.39s
Calibrating layer 2/36 | run: [1] | capture: [2]
Checkpoint: saved layer 0
GPTQ time: 50.06s
Calibrating layer 3/36 | skip: 1 | run: [2] | capture: [3]
Checkpoint: saved layer 1
...
Calibrating layer 12/36 | skip: 10 | run: [11] | capture: [12]
Checkpoint: saved layer 10
<killed>
```

**Resumed run** (picks up from layer 11, finishes all 36):

```
Layerwise calibration: Found 36 transformer layers
Checkpoint: resuming layerwise calibration from layer 11/36
Calibrating layer 12 (resumed)
GPTQ time: 51.45s
Calibrating layer 13/36 | skip: 11 | run: [12] | capture: [13]
Checkpoint: saved layer 11
...
Calibrating layer 36/36 | skip: 34 | run: [35] | capture: [36]
Checkpoint: saved layer 34
GPTQ time: 50.33s
Checkpoint: saved layer 35 (final)
Checkpoint: restored 11 previously calibrated layers
Layerwise calibration completed
Quantized model exported to: outputs/qwen3_8b_nvfp4_gptq_seq
GPU 0: Peak memory usage = 20.42 GB
```

## TODO

- [ ] Update CHANGELOG

## Test plan

- `tests/unit/torch/quantization/test_layerwise_calibrate.py` — unit tests for skip/swap/restore
- `tests/unit/torch/quantization/test_sequential_checkpoint.py` — checkpoint save/resume correctness
- `tests/gpu/torch/quantization/plugins/test_accelerate_gpu.py` — CPU-offloaded layerwise + GPTQ + checkpoint resume
- `tests/gpu/torch/quantization/test_fsdp2.py` — FSDP2 layerwise calibration

### Verified

- [x] Qwen3-8B: layerwise calibration + checkpoint save/restore + fakequantized checkpoint export + vLLM serve
- [x] DeepSeek-R1: checkpoint resume tested
- [x] DeepSeek-R1: fakequantized checkpoint export verified

---------

Signed-off-by: realAsma <akuriparambi@nvidia.com>
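The per-layer save / resume flow shown in the logs above can be sketched with the standard library only (a hypothetical helper for illustration, not the modelopt implementation):

```python
import json
import os


def run_layerwise_calibration(layers, calibrate_layer, ckpt_dir):
    """Sketch of per-layer checkpoint save with resume-from-last-completed-layer.

    Progress is recorded after each layer, so a job killed mid-run (e.g. by a
    Slurm time limit) restarts from the first uncalibrated layer, not from scratch.
    """
    os.makedirs(ckpt_dir, exist_ok=True)
    progress_file = os.path.join(ckpt_dir, "progress.json")

    start = 0
    if os.path.exists(progress_file):
        with open(progress_file) as f:
            start = json.load(f)["last_completed"] + 1
        print(f"Checkpoint: resuming layerwise calibration from layer {start + 1}/{len(layers)}")

    for i in range(start, len(layers)):
        calibrate_layer(layers[i])  # e.g. GPTQ / amax calibration for this layer
        # Persist progress only after the layer fully completes.
        with open(progress_file, "w") as f:
            json.dump({"last_completed": i}, f)
        print(f"Checkpoint: saved layer {i}")
```

In the real implementation the checkpoint also carries quantizer state and weights, restored in bulk at the end; this sketch only shows the resume bookkeeping.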
1 parent dc7ad66 · commit 2d868d3 · 29 files changed: +2467 −582 lines

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
```diff
@@ -15,6 +15,7 @@ Changelog
 - Enable PTQ workflow for the Step3.5-Flash MoE model with NVFP4 W4A4 + FP8 KV cache quantization. See `modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml <https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml>`_ for more details.
 - Add support for vLLM fakequant reload using ModelOpt state for HF models. See `examples/vllm_serve/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/vllm_serve#load-qatptq-model-and-serve-in-vllm-wip>`_ for more details.
 - [Early Testing] Add Claude Code PTQ skill (``.claude/skills/ptq/``) for agent-assisted post-training quantization. The skill guides the agent through environment detection, model support checking, format selection, and execution via the launcher or manual SLURM/Docker/bare GPU paths. Includes handling for unlisted models with custom module patching. This feature is in early testing — use with caution.
+- Add performant layerwise calibration for large models that don't fit on GPU (e.g. DeepSeek-R1, Kimi-K2). See `modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml <https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml>`_ for usage. Layerwise calibration also supports PTQ with intermediate progress saving — useful when long PTQ runs get hit with Slurm timeouts. See `modelopt_recipes/general/ptq/nvfp4_default-none_kv_gptq.yaml <https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt_recipes/general/ptq/nvfp4_default-none_kv_gptq.yaml>`_ for usage.
 
 
 **Backward Breaking Changes**
```

examples/llm_ptq/example_utils.py

Lines changed: 33 additions & 0 deletions
```diff
@@ -15,6 +15,7 @@
 
 import copy
 import glob
+import hashlib
 import inspect
 import json
 import logging
@@ -854,3 +855,35 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod
         print(f"Successfully copied {len(copied_files)} custom model files to {export_path}")
     else:
         print("No custom model files found to copy")
+
+
+def needs_checkpoint_path_update(quant_cfg: dict) -> bool:
+    """Check if quant_cfg has a layerwise_checkpoint_dir that should be auto-resolved to a unique subpath."""
+    algorithm = quant_cfg.get("algorithm")
+    if not isinstance(algorithm, dict):
+        return False
+    return algorithm.get("layerwise_checkpoint_dir") is not None
+
+
+def resolve_checkpoint_dir(quant_cfg: dict, model_path: str) -> dict:
+    """Append a unique ``<model_name>_<config_hash>`` subdirectory to layerwise_checkpoint_dir.
+
+    Allows a single recipe to be reused across models without checkpoint collisions.
+    Must only be called when :func:`needs_checkpoint_path_update` returns True.
+    """
+    algorithm = quant_cfg["algorithm"]
+    base_dir = algorithm["layerwise_checkpoint_dir"]
+
+    name = model_path.rstrip("/")
+    if "/" in name and not os.path.isabs(name):
+        name = name.replace("/", "--")
+    else:
+        name = Path(name).name
+
+    config_hash = hashlib.sha256(json.dumps(quant_cfg, default=str).encode()).hexdigest()[:8]
+
+    quant_cfg = copy.deepcopy(quant_cfg)
+    quant_cfg["algorithm"]["layerwise_checkpoint_dir"] = os.path.join(
+        base_dir, f"{name}_{config_hash}"
+    )
+    return quant_cfg
```

examples/llm_ptq/hf_ptq.py

Lines changed: 14 additions & 3 deletions
```diff
@@ -34,6 +34,8 @@
     is_enc_dec,
     is_nemotron_vl,
     load_mtp_weights,
+    needs_checkpoint_path_update,
+    resolve_checkpoint_dir,
     run_nemotron_vl_preview,
 )
 from torch.utils.data import DataLoader
@@ -91,8 +93,9 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
     for i, entry in enumerate(quant_cfg):
         if entry.get("quantizer_name") != "*[kv]_bmm_quantizer":
             continue
-        assert isinstance(entry.get("cfg", {}), dict)
-        quant_cfg[i] = {**entry, "cfg": {**entry.get("cfg", {}), "use_constant_amax": True}}
+        cfg = entry.get("cfg") or {}
+        assert isinstance(cfg, dict)
+        quant_cfg[i] = {**entry, "cfg": {**cfg, "use_constant_amax": True}}
         break
 
 
@@ -760,7 +763,9 @@ def export_quantized(
     # Load any missing weights from non-standard safetensors (handled in get_model for non-low-memory mode)
     # Store the MTP layer prefixes on the model for later exclusion from quantization
     if args.vllm_fakequant_export:
-        export_hf_vllm_fq_checkpoint(full_model, export_dir=export_path)
+        export_hf_vllm_fq_checkpoint(
+            full_model, export_dir=export_path, inplace_mem_efficient=True
+        )
     else:
         mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(
             full_model, args.pyt_ckpt_path
@@ -1105,6 +1110,12 @@ def quantize_main(
     quant_cfg = copy.deepcopy(quant_cfg)
     _set_kv_cache_constant_amax(quant_cfg["quant_cfg"])
 
+    if needs_checkpoint_path_update(quant_cfg):
+        quant_cfg = resolve_checkpoint_dir(quant_cfg, args.pyt_ckpt_path)
+        print(
+            f"Auto-resolved layerwise_checkpoint_dir: {quant_cfg['algorithm']['layerwise_checkpoint_dir']}"
+        )
+
     if args.qformat in QUANT_CFG_CHOICES:
         mono_quantize(
             args,
```
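The `_set_kv_cache_constant_amax` change in the diff above guards against an entry with an explicit `cfg: None`, which the old `entry.get("cfg", {})` form does not catch, since `dict.get`'s default only applies when the key is absent:

```python
entry = {"quantizer_name": "*[kv]_bmm_quantizer", "cfg": None}

# dict.get's default is used only for a MISSING key, not for an explicit None:
assert entry.get("cfg", {}) is None

# The fixed form normalizes None to an empty dict before merging:
cfg = entry.get("cfg") or {}
assert cfg == {}
patched = {**entry, "cfg": {**cfg, "use_constant_amax": True}}
assert patched["cfg"] == {"use_constant_amax": True}
```

With the old code, `{**None, ...}` would raise a `TypeError` (or the `isinstance` assert would fire) whenever a recipe serialized the quantizer entry with a null `cfg`.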

modelopt/torch/export/plugins/vllm_fakequant_hf.py

Lines changed: 135 additions & 54 deletions
```diff
@@ -24,6 +24,8 @@
 from modelopt.torch.quantization.conversion import quantizer_state
 from modelopt.torch.quantization.nn import QuantModule, TensorQuantizer
 from modelopt.torch.quantization.utils import get_quantizer_state_dict
+from modelopt.torch.quantization.utils.core_utils import enable_weight_access_and_writeback
+from modelopt.torch.quantization.utils.layerwise_calib import LayerActivationCollector
 from modelopt.torch.utils import get_unwrapped_name
 
 __all__ = ["export_hf_vllm_fq_checkpoint"]
@@ -38,9 +40,75 @@ def disable_rotate(quantizer: TensorQuantizer):
     return False
 
 
+def _fakequant_module_weights(
+    module: nn.Module,
+    module_name: str,
+    model: nn.Module,
+    state_dict: dict | None,
+    input_quantizers_folded_pqs: set,
+    fakequant_weights: set,
+    inplace: bool,
+):
+    """Apply fake-quant to a single QuantModule's weights.
+
+    When ``inplace=False``, reads/writes weights from/to ``state_dict``.
+    When ``inplace=True``, modifies the module's weight parameters directly.
+    """
+    if not isinstance(module, QuantModule):
+        return
+    for attr_name, quantizer in module.named_children():
+        if not (
+            attr_name.endswith("weight_quantizer")
+            and isinstance(quantizer, TensorQuantizer)
+            and quantizer.fake_quant
+            and quantizer.is_enabled
+        ):
+            continue
+        weight_name = attr_name.removesuffix("_quantizer")
+        prefix = f"{module_name}." if module_name else ""
+        sd_key = f"{prefix}{weight_name}"
+        assert sd_key not in fakequant_weights, f"Weight {sd_key} has already been fakequantized"
+
+        if inplace:
+            w = getattr(module, weight_name)
+            w_quant = quantizer(w.float()).to(w.dtype)
+        else:
+            assert state_dict is not None
+            if sd_key not in state_dict:
+                continue
+            w = state_dict[sd_key]
+            w_quant = quantizer(w.float()).to(w.dtype)
+
+        # Fold pre_quant_scale: (x*s)@fake_quant(W) = x@(fake_quant(W)*s)
+        # Only valid when input_quantizer does NOT fake-quant activations. If it does
+        # fake_quant(x*s), the non-linearity prevents folding s into W.
+        inp_attr = attr_name.replace("weight_quantizer", "input_quantizer")
+        if hasattr(module, inp_attr):
+            inp_q = getattr(module, inp_attr)
+            if (
+                hasattr(inp_q, "_pre_quant_scale")
+                and inp_q._pre_quant_scale is not None
+                and inp_q._disabled
+            ):
+                scale = inp_q._pre_quant_scale.squeeze().to(device=w_quant.device)
+                w_quant = (w_quant * scale[None, :]).to(w_quant.dtype)
+                inp_q_key = get_unwrapped_name(
+                    f"{module_name}.{inp_attr}" if module_name else inp_attr, model
+                )
+                input_quantizers_folded_pqs.add(inp_q_key)
+
+        if inplace:
+            w.data.copy_(w_quant)
+        else:
+            assert state_dict is not None
+            state_dict[sd_key] = w_quant.cpu()
+        fakequant_weights.add(sd_key)
+
+
 def export_hf_vllm_fq_checkpoint(
     model: nn.Module,
     export_dir: Path | str,
+    inplace_mem_efficient: bool = False,
 ):
     """Export quantized HF weights + ``vllm_fq_modelopt_state.pth`` for vLLM fake-quant reload.
@@ -53,62 +121,66 @@ def export_hf_vllm_fq_checkpoint(
     Args:
         model: In-memory quantized model.
         export_dir: Output dir for HF files and ``vllm_fq_modelopt_state.pth``.
+        inplace_mem_efficient: When True, applies fake-quant inplace one decoder layer at
+            a time using ``enable_weight_access_and_writeback``, avoiding full state
+            dict materialization. This is destructive — model weights are permanently
+            modified and weight quantizers are not re-enabled after export.
     """
     export_dir = Path(export_dir)
     export_dir.mkdir(parents=True, exist_ok=True)
 
     # Step 1: Build the folded HF state dict.
-    # model.state_dict() returns detached copies of all tensors, so model
-    # parameters are never modified. Apply each weight quantizer's fake-quant
-    # to the corresponding weight tensor in the copy.
-    state_dict = model.state_dict()
     fakequant_weights = set()
-    input_quantizers_folded_pqs = (
-        set()
-    )  # keys for input_quantizers where pre_quant_scale was folded
+    input_quantizers_folded_pqs = set()
     with torch.inference_mode():
-        for module_name, module in model.named_modules():
-            if not isinstance(module, QuantModule):
-                continue
-            for attr_name, quantizer in module.named_children():
-                if not (
-                    attr_name.endswith("weight_quantizer")
-                    and isinstance(quantizer, TensorQuantizer)
-                    and quantizer.fake_quant
-                    and quantizer.is_enabled
-                ):
+        if inplace_mem_efficient:
+            # Inplace path: iterate decoder layers, one offload<->onload per layer.
+            decoder_layers = LayerActivationCollector.get_decoder_layers(model)
+            assert decoder_layers is not None, (
+                "inplace_mem_efficient=True requires a model with discoverable decoder layers"
+            )
+            for name, module in model.named_modules():
+                if module not in decoder_layers:
                     continue
-                weight_name = attr_name.removesuffix("_quantizer")
-                prefix = f"{module_name}." if module_name else ""
-                sd_key = f"{prefix}{weight_name}"
-                assert sd_key not in fakequant_weights, (
-                    f"Weight {sd_key} has already been fakequantized"
-                )
-                if sd_key in state_dict:
-                    w = state_dict[sd_key]
-                    w_quant = quantizer(w.float()).to(w.dtype).cpu()
-                    # Fold pre_quant_scale: (x*s)@fake_quant(W) = x@(fake_quant(W)*s)
-                    # Only valid when input_quantizer does NOT fake-quant activations. If it does
-                    # fake_quant(x*s), the non-linearity prevents folding s into W.
-                    inp_attr = attr_name.replace("weight_quantizer", "input_quantizer")
-                    if hasattr(module, inp_attr):
-                        inp_q = getattr(module, inp_attr)
-                        if (
-                            hasattr(inp_q, "_pre_quant_scale")
-                            and inp_q._pre_quant_scale is not None
-                            and inp_q._disabled
-                        ):
-                            scale = inp_q._pre_quant_scale.squeeze().to(device=w_quant.device)
-                            w_quant = (w_quant * scale[None, :]).to(w_quant.dtype)
-                            inp_q_key = get_unwrapped_name(
-                                f"{module_name}.{inp_attr}" if module_name else inp_attr, model
-                            )
-                            input_quantizers_folded_pqs.add(inp_q_key)
-                    state_dict[sd_key] = w_quant
-                    fakequant_weights.add(sd_key)
-
-    # Filter quantizer tensors out for a clean HF checkpoint.
-    clean_sd = {k: v for k, v in state_dict.items() if "quantizer" not in k}
+                with enable_weight_access_and_writeback(module, module):
+                    for sub_name, sub_mod in module.named_modules():
+                        full_name = f"{name}.{sub_name}" if sub_name else name
+                        _fakequant_module_weights(
+                            sub_mod,
+                            full_name,
+                            model,
+                            None,
+                            input_quantizers_folded_pqs,
+                            fakequant_weights,
+                            inplace=True,
+                        )
+            # Meta tensors for offloaded weights (free); offload maps now have
+            # fakequanted values via writeback.
+            state_dict = model.state_dict()
+        else:
+            # Default path: full state_dict copy, fakequant into the copy.
+            state_dict = model.state_dict()
+            for module_name, module in model.named_modules():
+                with enable_weight_access_and_writeback(module, model):
+                    _fakequant_module_weights(
+                        module,
+                        module_name,
+                        model,
+                        state_dict,
+                        input_quantizers_folded_pqs,
+                        fakequant_weights,
+                        inplace=False,
+                    )
+
+    if inplace_mem_efficient:
+        # Let save_pretrained build its own state_dict so offloaded params go through
+        # its module_map / get_state_dict_from_offload path (modeling_utils.py:3967+).
+        # Passing state_dict= bypasses that path and crashes on meta tensors.
+        quantizer_keys = [k for k in state_dict if "quantizer" in k]
+        clean_sd = None
+    else:
+        clean_sd = {k: v for k, v in state_dict.items() if "quantizer" not in k}
+        quantizer_keys = None
 
     # Step 2: Disable weight quantizers, save modelopt state + quantizer state
     # dict, then re-enable. The _disabled=True flag is captured in modelopt_state
@@ -161,9 +233,18 @@ def export_hf_vllm_fq_checkpoint(
     modelopt_state["modelopt_state_weights"] = quantizer_state_dict
     torch.save(modelopt_state, export_dir / "vllm_fq_modelopt_state.pth")
 
-    # Step 3: Save HF weights using the pre-built folded state dict.
-    model.save_pretrained(export_dir, state_dict=clean_sd, save_modelopt_state=False)
-
-    for wq, orig_rotate in wqs_to_restore:
-        wq.enable()
-        wq._rotate = orig_rotate
+    # Step 3: Save HF weights.
+    if inplace_mem_efficient:
+        prev_ignore = getattr(model, "_keys_to_ignore_on_save", None)
+        model._keys_to_ignore_on_save = quantizer_keys
+        try:
+            model.save_pretrained(export_dir, save_modelopt_state=False)
+        finally:
+            model._keys_to_ignore_on_save = prev_ignore
+    else:
+        model.save_pretrained(export_dir, state_dict=clean_sd, save_modelopt_state=False)
+
+    if not inplace_mem_efficient:
+        for wq, orig_rotate in wqs_to_restore:
+            wq.enable()
+            wq._rotate = orig_rotate
```
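The folding identity relied on in `_fakequant_module_weights` above, `(x*s) @ fake_quant(W) = x @ (fake_quant(W)*s)`, valid only when the input quantizer is disabled so no nonlinearity sits between the scale and the matmul, can be checked numerically:

```python
import torch

torch.manual_seed(0)
x = torch.randn(2, 4)    # batch of activations
W = torch.randn(3, 4)    # linear weight, shape [out_features, in_features]
s = torch.rand(4) + 0.5  # per-input-channel pre_quant_scale

# What the runtime would compute with the scale applied to activations:
scaled_input = (x * s) @ W.t()

# Equivalent result with the scale folded into the weight columns instead,
# matching `w_quant * scale[None, :]` in the exporter:
folded_weight = x @ (W * s[None, :]).t()

assert torch.allclose(scaled_input, folded_weight, atol=1e-5)
```

If the input quantizer were active, the runtime would compute `fake_quant(x*s) @ W.t()`, and the non-linearity of `fake_quant` breaks this equality, which is exactly why the exporter folds only when `inp_q._disabled` holds.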

modelopt/torch/quantization/config.py

Lines changed: 24 additions & 4 deletions
```diff
@@ -1217,16 +1217,36 @@ class QuantizeAlgorithmConfig(ModeloptBaseConfig):
         ),
     )
 
-    use_sequential: bool = ModeloptField(
+    layerwise: bool = ModeloptField(
         default=False,
-        title="Enable sequential layer-by-layer calibration.",
+        title="Enable layerwise (layer-by-layer) calibration.",
         description=(
-            "If True, the calibration algorithm is applied sequentially to each decoder block. "
-            "Each layer's inputs are captured via a single forward pass that reflects the "
+            "If True, the calibration algorithm is applied layer by layer. "
+            "Each layer's inputs are captured via a forward pass that reflects the "
             "quantization of all preceding layers, incurring O(N) forward passes for N layers."
         ),
     )
 
+    layerwise_checkpoint_dir: str | None = ModeloptField(
+        default=None,
+        title="Checkpoint directory for layerwise calibration.",
+        description=(
+            "If set together with layerwise=True, per-layer checkpoints are saved to this "
+            "directory during calibration. On restart, calibration resumes from the last "
+            "completed layer."
+        ),
+    )
+
+    @model_validator(mode="after")
+    def validate_layerwise_checkpoint_dir(self):
+        """Raise if layerwise_checkpoint_dir is set but layerwise is False."""
+        if self.layerwise_checkpoint_dir is not None and not self.layerwise:
+            raise ValueError(
+                "layerwise_checkpoint_dir requires layerwise=True. "
+                "Set layerwise=True or remove layerwise_checkpoint_dir."
+            )
+        return self
+
 
 class MaxCalibConfig(QuantizeAlgorithmConfig):
     """The config for max calibration algorithm.
```

0 commit comments