Commit c50c4a7

realAsma and claude committed

Add layerwise calibration for large models

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: realAsma <akuriparambi@nvidia.com>

1 parent 6a25fc2 | commit c50c4a7

6 files changed: 106 additions & 18 deletions

examples/llm_ptq/hf_ptq.py

Lines changed: 12 additions & 2 deletions
@@ -91,8 +91,9 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
     for i, entry in enumerate(quant_cfg):
         if entry.get("quantizer_name") != "*[kv]_bmm_quantizer":
             continue
-        assert isinstance(entry.get("cfg", {}), dict)
-        quant_cfg[i] = {**entry, "cfg": {**entry.get("cfg", {}), "use_constant_amax": True}}
+        cfg = entry.get("cfg") or {}
+        assert isinstance(cfg, dict)
+        quant_cfg[i] = {**entry, "cfg": {**cfg, "use_constant_amax": True}}
         break


@@ -1104,6 +1105,15 @@ def quantize_main(
     quant_cfg = copy.deepcopy(quant_cfg)
     _set_kv_cache_constant_amax(quant_cfg["quant_cfg"])

+    from modelopt.torch.quantization.utils.layerwise_calib import (
+        needs_checkpoint_path_update,
+        resolve_checkpoint_dir,
+    )
+
+    if needs_checkpoint_path_update(quant_cfg):
+        quant_cfg = resolve_checkpoint_dir(quant_cfg, args.pyt_ckpt_path)
+        print(f"Auto-resolved checkpoint_dir: {quant_cfg['algorithm']['checkpoint_dir']}")
+
     if args.qformat in QUANT_CFG_CHOICES:
         mono_quantize(
             args,
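Net effect in quantize_main: a recipe that hard-codes a single checkpoint_dir can be reused across models, because the directory is specialized per model and per config before quantization starts. A minimal usage sketch of the new helpers; the recipe contents, method name, and model id below are illustrative, not taken from a real run:

    from modelopt.torch.quantization.utils.layerwise_calib import (
        needs_checkpoint_path_update,
        resolve_checkpoint_dir,
    )

    quant_cfg = {
        "quant_cfg": {},  # illustrative; a real recipe carries quantizer entries here
        "algorithm": {"method": "max", "checkpoint_dir": "/tmp/layerwise_ckpts"},
    }

    if needs_checkpoint_path_update(quant_cfg):  # True: algorithm is a dict with checkpoint_dir set
        quant_cfg = resolve_checkpoint_dir(quant_cfg, "meta-llama/Llama-3.1-8B")
        print(quant_cfg["algorithm"]["checkpoint_dir"])
        # e.g. /tmp/layerwise_ckpts/meta-llama--Llama-3.1-8B_1a2b3c4d (hash suffix varies)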

modelopt/torch/quantization/mode.py

Lines changed: 1 addition & 0 deletions
@@ -239,6 +239,7 @@ def wrapped_calib_func(

     if func is not None:
         if layerwise:
+            # TODO: add a method guard here; not all calib methods support per-layer invocation
             if forward_loop is None:
                 raise ValueError("forward_loop is required for calibration but got None.")
             # Wrap with layerwise processing
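The TODO flags that the layerwise wrapper currently dispatches any calibration function per layer. One possible shape for such a guard, purely as a hypothetical sketch; the set name, its contents, and the method parameter are illustrative, not ModelOpt API:

    # Hypothetical sketch only; identifiers below are illustrative.
    LAYERWISE_COMPATIBLE_METHODS = {"max", "smoothquant"}  # assumed method names

    def _check_layerwise_supported(method: str) -> None:
        if method not in LAYERWISE_COMPATIBLE_METHODS:
            raise ValueError(
                f"Calibration method {method!r} does not support layerwise invocation."
            )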

modelopt/torch/quantization/model_calib.py

Lines changed: 4 additions & 3 deletions
@@ -28,7 +28,10 @@
 from tqdm import tqdm

 from modelopt.torch.opt.searcher import ForwardLoop
-from modelopt.torch.quantization.utils.layerwise_calib import LayerActivationCollector
+from modelopt.torch.quantization.utils.layerwise_calib import (
+    LayerActivationCollector,
+    _CheckpointState,
+)
 from modelopt.torch.utils import print_rank_0
 from modelopt.torch.utils.distributed import DistributedProcessGroup, ParallelState
 from modelopt.torch.utils.network import bind_forward_method, unpatch_forward_method
@@ -1569,8 +1572,6 @@ def layerwise_calibrate(
     are saved after each layer completes. On restart, calibration resumes from
     the last completed layer.
     """
-    from modelopt.torch.quantization.utils.layerwise_calib import _CheckpointState
-
     checkpoint_dir = calib_kwargs.pop("checkpoint_dir", None)

     if forward_loop is None:

modelopt/torch/quantization/plugins/accelerate.py

Lines changed: 2 additions & 3 deletions
@@ -34,9 +34,8 @@
 def _get_cpu_offload_hook(hook):
     if isinstance(hook, AlignDevicesHook) and hook.offload and hook.weights_map is not None:
         assert len(hook.weights_map) > 0
-        if (
-            isinstance(hook.weights_map, PrefixedDataset)
-            and hook.weights_map.prefix + "weight" not in hook.weights_map.dataset.state_dict
+        if isinstance(hook.weights_map, PrefixedDataset) and not any(
+            k.startswith(hook.weights_map.prefix) for k in hook.weights_map.dataset.state_dict
         ):
             raise NotImplementedError(
                 "This layer could be offloaded to disk. We don't support this yet."

modelopt/torch/quantization/utils/layerwise_calib.py

Lines changed: 66 additions & 1 deletion
@@ -33,6 +33,7 @@
 import torch
 import torch.nn as nn

+from modelopt.torch.utils import distributed as dist
 from modelopt.torch.utils import print_rank_0
 from modelopt.torch.utils.network import (
     bind_forward_method,
@@ -77,6 +78,16 @@ def __init__(self, original: nn.Module):
         object.__setattr__(self, "_original", original)
         self._layerwise_calib = _LayerCalibState(mode="skip")

+    def __getattr__(self, name: str):
+        # Proxy non-special attribute lookups to the original layer so that
+        # parent-model code that accesses layer-level attributes (e.g.,
+        # NemotronH's ``block_type``) still works when the layer is replaced
+        # with a _SkipLayer.
+        try:
+            return super().__getattr__(name)
+        except AttributeError:
+            return getattr(object.__getattribute__(self, "_original"), name)
+
     def forward(self, *args, **kwargs):
         return LayerActivationCollector._zeros_from_meta(
             self._original._layerwise_calib.output_meta
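This override makes the skip wrapper transparent to attribute reads: registered parameters, buffers, and submodules resolve through nn.Module first, and anything else falls through to the wrapped layer. A minimal stand-in demonstrating the lookup order; these classes are illustrative, not the actual _SkipLayer:

    import torch.nn as nn

    class _Proxy(nn.Module):
        def __init__(self, original: nn.Module):
            super().__init__()
            object.__setattr__(self, "_original", original)  # bypass module registration

        def __getattr__(self, name: str):
            try:
                # nn.Module.__getattr__ resolves registered params/buffers/submodules
                return super().__getattr__(name)
            except AttributeError:
                # fall through to the wrapped layer (e.g., a block_type attribute)
                return getattr(object.__getattribute__(self, "_original"), name)

    class Block(nn.Module):
        block_type = "mamba"

    assert _Proxy(Block()).block_type == "mamba"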
@@ -315,7 +326,13 @@ def _log_layer_summary(self, layer_idx: int):
             mode = layer._layerwise_calib.mode
             if mode in ("skip", "run", "capture"):
                 groups.setdefault(mode, []).append(i + 1)
-        parts = [f"{mode}: {groups[mode]}" for mode in ("skip", "run", "capture") if mode in groups]
+
+        parts = []
+        for mode in ("skip", "run", "capture"):
+            if mode not in groups:
+                continue
+            ids = groups[mode]
+            parts.append(f"{mode}: {len(ids)}" if mode == "skip" else f"{mode}: {ids}")
         print_rank_0(f"Calibrating layer {layer_idx + 1}/{n} | {' | '.join(parts)}")

     @torch.no_grad()
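Skipped layers are now reported as a count rather than a full index list, which keeps the log line short for deep models while still enumerating the interesting run/capture layers. A standalone rendering of the same string logic with made-up groups:

    groups = {"skip": list(range(1, 29)), "run": [29], "capture": [30]}
    parts = []
    for mode in ("skip", "run", "capture"):
        if mode not in groups:
            continue
        ids = groups[mode]
        parts.append(f"{mode}: {len(ids)}" if mode == "skip" else f"{mode}: {ids}")
    print(f"Calibrating layer 30/48 | {' | '.join(parts)}")
    # Calibrating layer 30/48 | skip: 28 | run: [29] | capture: [30]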
@@ -489,6 +506,42 @@ def _save_layer(
         _write_manifest(checkpoint_dir, idx, num_layers)


+def needs_checkpoint_path_update(quant_cfg: dict) -> bool:
+    """Check if quant_cfg has a checkpoint_dir that should be auto-resolved to a unique subpath."""
+    algorithm = quant_cfg.get("algorithm")
+    if algorithm is None or isinstance(algorithm, str):
+        return False
+    return algorithm.get("checkpoint_dir") is not None
+
+
+def resolve_checkpoint_dir(quant_cfg: dict, model_path: str) -> dict:
+    """Append a unique ``<model_name>_<config_hash>`` subdirectory to checkpoint_dir.
+
+    Allows a single recipe to be reused across models without checkpoint collisions.
+    Must only be called when :func:`needs_checkpoint_path_update` returns True.
+    """
+    import copy
+    import hashlib
+    from pathlib import Path
+
+    algorithm = quant_cfg["algorithm"]
+    base_dir = algorithm["checkpoint_dir"]
+
+    name = model_path.rstrip("/")
+    if "/" in name and not os.path.isabs(name):
+        name = name.replace("/", "--")
+    else:
+        name = Path(name).name
+
+    config_hash = hashlib.sha256(
+        json.dumps(quant_cfg, sort_keys=True, default=str).encode()
+    ).hexdigest()[:8]
+
+    quant_cfg = copy.deepcopy(quant_cfg)
+    quant_cfg["algorithm"]["checkpoint_dir"] = os.path.join(base_dir, f"{name}_{config_hash}")
+    return quant_cfg
+
+
 def detect_resume_point(checkpoint_dir: str) -> tuple[int, dict] | None:
     """Detect where to resume from an existing checkpoint directory.

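resolve_checkpoint_dir mangles the model path differently for Hugging Face-style ids and local filesystem paths. An isolated illustration of the two branches; the _mangle helper is named here for demonstration only:

    import os
    from pathlib import Path

    def _mangle(model_path: str) -> str:
        name = model_path.rstrip("/")
        if "/" in name and not os.path.isabs(name):
            return name.replace("/", "--")  # HF hub id: keep the org, one path segment
        return Path(name).name              # local path: keep only the leaf directory

    assert _mangle("meta-llama/Llama-3.1-8B") == "meta-llama--Llama-3.1-8B"
    assert _mangle("/data/models/llama-8b/") == "llama-8b"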
@@ -512,9 +565,21 @@ class _CheckpointState:

     Handles both saving per-layer checkpoints during calibration and
     restoring from a previous partial run.
+
+    .. todo::
+        Support distributed checkpoint save/restore for FSDP2:
+        use ``torch.distributed.checkpoint`` (or save only from rank 0 + barrier)
+        and broadcast restored state to all ranks during resume.
     """

     def __init__(self, checkpoint_dir: str, num_layers: int, start_layer: int = 0):
+        if dist.is_initialized() and dist.size() > 1:
+            raise RuntimeError(
+                "Layerwise calibration checkpointing is not supported in "
+                "multi-process distributed jobs (e.g. FSDP2). "
+                "Use single-process calibration or disable checkpointing."
+            )
+
         self.checkpoint_dir = checkpoint_dir
         self.num_layers = num_layers
         self.start_layer = start_layer
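The constructor guard refuses to checkpoint under multi-rank jobs until the FSDP2 TODO above is addressed, since exactly one process must own the checkpoint directory. Roughly the same check expressed directly against torch.distributed; ModelOpt's dist helpers wrap this, so the sketch is an approximation, not the committed code:

    import torch.distributed as torch_dist

    def _assert_single_process() -> None:
        # Mirrors the guard in _CheckpointState.__init__.
        if (
            torch_dist.is_available()
            and torch_dist.is_initialized()
            and torch_dist.get_world_size() > 1
        ):
            raise RuntimeError(
                "Layerwise calibration checkpointing is not supported in "
                "multi-process distributed jobs (e.g. FSDP2)."
            )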

modelopt/torch/utils/dataset_utils.py

Lines changed: 21 additions & 9 deletions
@@ -594,16 +594,28 @@ def _forward_loop(
         dataloader: DataLoader containing the batched input data
         allowed_non_tensor_keys: Set of key names whose values may be non-tensor types
     """
-    with torch.no_grad():
-        is_enc_dec = model_type_is_enc_dec(model)
-        infer_method = model.generate if is_enc_dec else model.forward
-        max_working_batch_size = None  # Initialize max working batch size as None
+    # Disable KV caching during calibration; it is unnecessary overhead and causes
+    # correctness issues with hybrid Mamba/attention models whose cache state is mutated
+    # in-place (e.g., NemotronH).
+    config = getattr(model, "config", None)
+    prev_use_cache = getattr(config, "use_cache", None)
+    if config is not None and prev_use_cache is not None:
+        config.use_cache = False

-        for _, data in enumerate(tqdm(dataloader)):
-            # Process batch and update max working batch size
-            max_working_batch_size = _process_batch(
-                data, infer_method, max_working_batch_size, allowed_non_tensor_keys
-            )
+    try:
+        with torch.no_grad():
+            is_enc_dec = model_type_is_enc_dec(model)
+            infer_method = model.generate if is_enc_dec else model.forward
+            max_working_batch_size = None  # Initialize max working batch size as None
+
+            for _, data in enumerate(tqdm(dataloader)):
+                # Process batch and update max working batch size
+                max_working_batch_size = _process_batch(
+                    data, infer_method, max_working_batch_size, allowed_non_tensor_keys
+                )
+    finally:
+        if config is not None and prev_use_cache is not None:
+            config.use_cache = prev_use_cache


 def create_forward_loop(
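The use_cache handling follows a stash/flip/restore pattern wrapped around the whole loop, so the model config is restored even when a batch raises. The same pattern in isolation; the config and model classes below are stand-ins for a transformers model, not part of the codebase:

    class _Cfg:  # stand-in for a transformers PretrainedConfig
        use_cache = True

    class _Model:  # stand-in model carrying a .config
        config = _Cfg()

    model = _Model()
    prev_use_cache = getattr(getattr(model, "config", None), "use_cache", None)
    if prev_use_cache is not None:
        model.config.use_cache = False
    try:
        pass  # calibration forward passes run here
    finally:
        if prev_use_cache is not None:
            model.config.use_cache = prev_use_cache

    assert model.config.use_cache is True  # restored even on error paths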
