claude feedback

Fridah-nv · Fridah-nv · commit 67a2b94bb73d · 2026-05-21T18:01:05.000Z
Signed-off-by: Frida Hou <201670829+Fridah-nv@users.noreply.github.com>
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
@@ -1755,6 +1755,11 @@ def layerwise_calibrate(
     ckpt = _CheckpointState.from_folder(checkpoint_dir, num_layers)
     start_layer = ckpt.start_layer if ckpt else 0
 
+    if ckpt and start_layer >= num_layers:
+        ckpt.full_restore(transformer_layers, model)
+        print_rank_0("Layerwise calibration completed (restored from checkpoint)")
+        return
+
     input_getter = LayerActivationCollector(model)
     input_getter._patch_all_layers(decoder_layers=transformer_layers)
 
diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
@@ -824,31 +824,6 @@ def _fake_quantize(self, inputs):
                 getattr(self, "_onnx_quantizer_type", None),
                 self._pass_through_bwd,
             )
-        elif (
-            self.block_sizes is not None
-            and self._num_bits == (2, 1)
-            and self.block_sizes.get("scale_bits") == (4, 3)
-        ):
-            # Static NVFP4: plain TensorQuantizer should have been promoted to
-            # NVFP4StaticQuantizer during MSE setup. For per-expert quantizers
-            # in fused MoEs, promotion is gated on `_amax` having been set during
-            # max_calibrate; experts not activated during max_calibrate stay
-            # plain. MSE later sets a per-block `_amax`, so by the time forward
-            # runs again the quantizer has a valid amax — dispatch to the static
-            # NVFP4 fake-quant path here.
-            if amax is not None:
-                outputs = static_blockwise_fp4_fake_quant(
-                    inputs,
-                    amax,
-                    None,  # global_amax — computed internally by the kernel
-                    True,
-                    inputs.dtype,
-                    self._pass_through_bwd,
-                )
-            else:
-                # No amax at all (truly uncalibrated): pass through unchanged so
-                # forward doesn't crash. Should not normally be reachable.
-                outputs = inputs
         elif isinstance(self._num_bits, tuple):
             # Float-point quantization, e.g., FP8
             E, M = self._num_bits  # noqa: N806
@@ -959,9 +934,7 @@ def set_quant_params(axis, block_reshape_size, padding, slices, amax_shape=None)
 
         quant_axis = [i for i in range(len(quantize_axis)) if quantize_axis[i]]
 
-        slices = (
-            None if all(s is None for s in slices) else [s if s else slice(None) for s in slices]
-        )
+        slices = None if all(s is None for s in slices) else [s or slice(None) for s in slices]
 
         if all(p is None for p in paddings):
             paddings = None
@@ -970,7 +943,7 @@ def set_quant_params(axis, block_reshape_size, padding, slices, amax_shape=None)
             for padding in paddings:
                 if not (new_paddings or padding):
                     continue
-                new_paddings.extend(padding if padding else (0, 0))
+                new_paddings.extend(padding or (0, 0))
             paddings = tuple(reversed(new_paddings))
 
         set_quant_params(quant_axis, reshape_size, paddings, slices)
diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py
@@ -945,21 +945,26 @@ def update_quant_cfg_with_kv_cache_quant(
 def promote_nvfp4_static_quantizers(model: nn.Module) -> int:
     """Convert eligible TensorQuantizers to NVFP4StaticQuantizer in-place.
 
-    After max calibration sets per-block amax values, NVFP4 static quantizers
-    need to be promoted so they use the two-level scaling path (global amax +
-    per-block amax) instead of the generic E4M3 path.
+    Promotion is purely a class swap based on the static-NVFP4 *format*; it does
+    not require ``_amax`` to be set. Quantizers without ``_amax`` (e.g. MoE
+    experts that received no calibration tokens) still get promoted so that any
+    later forward — once MSE or bootstrap populates ``_amax`` — dispatches via
+    the subclass's two-level scaling path instead of the parent's generic E4M3.
 
     Returns the number of quantizers converted.
     """
     from modelopt.torch.quantization.nn import NVFP4StaticQuantizer, TensorQuantizer
 
     converted = 0
     for _name, module in list(model.named_modules()):
-        if isinstance(module, TensorQuantizer) and not module._disabled:
-            if module._calibrator is not None and not module._dynamic and hasattr(module, "_amax"):
-                if module.is_nvfp4_static:
-                    initial_amax = module._amax.clone().detach()
-                    global_amax = reduce_amax(initial_amax, axis=None)
-                    NVFP4StaticQuantizer.from_tensor_quantizer(module, global_amax=global_amax)
-                    converted += 1
+        if not isinstance(module, TensorQuantizer) or module._disabled:
+            continue
+        if module._calibrator is None or module._dynamic:
+            continue
+        if not module.is_nvfp4_static or isinstance(module, NVFP4StaticQuantizer):
+            continue
+        amax = getattr(module, "_amax", None)
+        global_amax = reduce_amax(amax.detach(), axis=None) if amax is not None else None
+        NVFP4StaticQuantizer.from_tensor_quantizer(module, global_amax=global_amax)
+        converted += 1
     return converted
diff --git a/modelopt/torch/quantization/utils/layerwise_calib.py b/modelopt/torch/quantization/utils/layerwise_calib.py
@@ -227,22 +227,20 @@ def _patched_forward(self, *args, **kwargs):
                     f"Layer {info.name} is in 'run' mode but has no cached inputs to replay."
                 )
                 real_args, real_kwargs = info.cached_inputs.popleft()
-                if (
-                    real_args
-                    and isinstance(real_args[0], torch.Tensor)
-                    and real_args[0].device.type == "cpu"
-                ):
-                    device = get_module_device(self)
-                    real_args = _move_to_device(real_args, device)
-                    real_kwargs = _move_to_device(real_kwargs, device)
+                # Captured inputs are stored on CPU (see "capture" branch); move
+                # back to the layer's device for replay. `_move_to_device` is a
+                # no-op for tensors already on `device`.
+                device = get_module_device(self)
+                real_args = _move_to_device(real_args, device)
+                real_kwargs = _move_to_device(real_kwargs, device)
                 output = self._original_forward(*real_args, **real_kwargs)
                 info.output_meta = LayerActivationCollector._extract_output_meta(output)
                 return output
 
             if info.mode == "capture":
                 # Offload captured inputs to CPU at append time. For early layers
-                # on a single GPU (e.g. layer 0–2 on GPU 0 with seq_device_map),
-                # accumulating thousands of batches' worth of (bs × seq × hidden)
+                # on a single GPU (e.g. layer 0-2 on GPU 0 with seq_device_map),
+                # accumulating thousands of batches' worth of (bs x seq x hidden)
                 # activations on-device saturates that GPU during the capture loop
                 # and OOMs before _set_layer_states gets a chance to move them.
                 # The "run" branch already handles CPU-resident inputs (see the
@@ -333,11 +331,8 @@ def _set_layer_states(self, layer_idx: int):
                     "was called for every preceding layer in order."
                 )
             prev.mode = "run"
-            cpu = torch.device("cpu")
-            prev.cached_inputs = deque(
-                (_move_to_device(args, cpu), _move_to_device(kwargs, cpu))
-                for args, kwargs in prev.collected_inputs
-            )
+            # Inputs are already CPU-resident at capture time (see _patched_forward).
+            prev.cached_inputs = deque(prev.collected_inputs)
             prev.collected_inputs = []
 
         cur = self._decoder_layers[layer_idx]._layerwise_calib
@@ -534,9 +529,6 @@ def _save_layer(
     torch.save(output_meta, os.path.join(d, "output_meta.pt"))
     if next_inputs is not None:
         torch.save(next_inputs, os.path.join(d, "next_inputs.pt"))
-    amax_state = {k: v for k, v in weights.items() if "_amax" in k}
-    if amax_state:
-        torch.save(amax_state, os.path.join(d, "quantizer_amaxes.pt"))
     _write_manifest(checkpoint_dir, idx, num_layers)
 
 
@@ -635,17 +627,8 @@ def setup_resume(self, layers: nn.ModuleList) -> list | None:
         # Keep on CPU — _patched_forward's run mode moves each entry to device on pop.
         return next_inputs
 
-    def full_restore(
-        self, layers: nn.ModuleList, model: nn.Module, restore_weights: bool = True
-    ) -> None:
-        """Restore weights and quantizer state for layers 0..K-1 after the calibration loop.
-
-        Args:
-            restore_weights: If False, skip reloading ``weights.pt`` and load only the
-                ``_amax`` values (from ``quantizer_amaxes.pt`` or filtered from ``weights.pt``).
-                Set to False for calibration algorithms (max, MSE) that never modify weights
-                to avoid re-reading gigabytes of unchanged expert weights from disk.
-        """
+    def full_restore(self, layers: nn.ModuleList, model: nn.Module) -> None:
+        """Restore weights and quantizer state for layers 0..K-1 after the calibration loop."""
         from modelopt.torch.quantization.config import QuantizeConfig
         from modelopt.torch.quantization.conversion import restore_quantizer_state
         from modelopt.torch.quantization.utils.core_utils import enable_weight_access_and_writeback
@@ -671,31 +654,13 @@ def full_restore(
                     map_location="cpu",
                     weights_only=False,
                 )
+                weights = torch.load(
+                    os.path.join(d, "weights.pt"),
+                    map_location="cpu",
+                    weights_only=False,
+                )
                 restore_quantizer_state(layer, dummy_config, {"quantizer_state": qstate})
-                if restore_weights:
-                    weights = torch.load(
-                        os.path.join(d, "weights.pt"),
-                        map_location="cpu",
-                        weights_only=False,
-                    )
-                    layer.load_state_dict(weights, strict=False, assign=False)
-                else:
-                    # Load only _amax entries — skip gigabytes of unchanged expert weights.
-                    # Use map_location="cpu" to get fresh CPU tensors (no storage_offset).
-                    # _export_fused_experts moves _amax to the weight device on demand.
-                    amax_path = os.path.join(d, "quantizer_amaxes.pt")
-                    if os.path.exists(amax_path):
-                        amaxes = torch.load(amax_path, map_location="cpu", weights_only=False)
-                    else:
-                        # Legacy checkpoint: filter _amax entries from the full weights.pt.
-                        weights = torch.load(
-                            os.path.join(d, "weights.pt"),
-                            map_location="cpu",
-                            weights_only=False,
-                        )
-                        amaxes = {k: v for k, v in weights.items() if "_amax" in k}
-                    if amaxes:
-                        layer.load_state_dict(amaxes, strict=False, assign=True)
+                layer.load_state_dict(weights, strict=False, assign=False)
 
         print_rank_0(f"Checkpoint: restored {self.start_layer} previously calibrated layers")
 
diff --git a/tests/unit/torch/quantization/test_layerwise_calibrate.py b/tests/unit/torch/quantization/test_layerwise_calibrate.py
@@ -25,7 +25,12 @@
 import modelopt.torch.quantization as mtq
 from modelopt.torch.quantization.model_calib import layerwise_calibrate
 from modelopt.torch.quantization.nn import TensorQuantizer
-from modelopt.torch.quantization.utils.layerwise_calib import LayerActivationCollector, _SkipLayer
+from modelopt.torch.quantization.utils.layerwise_calib import (
+    LayerActivationCollector,
+    _CheckpointState,
+    _SkipLayer,
+    detect_resume_point,
+)
 
 
 class _DecoderBlock(nn.Module):
@@ -719,3 +724,98 @@ def test_mtq_quantize_layerwise_raises_for_unsupported_algorithm():
             config,
             forward_loop=lambda m: m(torch.randint(0, 32, (2, 8))),
         )
+
+
+# Checkpoint resume + capture-time CPU offload
+
+
+def test_collected_inputs_are_cpu_at_capture(monkeypatch):
+    """Capture-time CPU offload: collected_inputs must be on CPU even if data starts elsewhere.
+
+    This is the OOM-prevention invariant — without it, accumulating thousands of
+    batches' worth of activations on the layer's compute device saturates GPU
+    memory before the run-mode transition gets a chance to move them.
+    """
+    _register_test_discoverer(monkeypatch)
+    model = _SimpleTwoLayerModel(dim=8)
+    collector = LayerActivationCollector(model)
+
+    def forward_loop(m):
+        m(torch.randn(2, 8))
+
+    collector._patch_all_layers()
+    try:
+        inputs = collector.get_input_activations(model.layers[0], forward_loop)
+    finally:
+        collector._unpatch_all_layers()
+
+    args, _ = inputs[0]
+    assert args[0].device.type == "cpu", "captured tensor must be CPU-resident"
+
+
+def test_detect_resume_point_returns_num_layers_when_complete(tmp_path):
+    """Completed checkpoint reports ``start = num_layers`` (not None)."""
+    ckpt_dir = str(tmp_path / "ckpt")
+    state = _CheckpointState(ckpt_dir, num_layers=3)
+    import os
+
+    os.makedirs(ckpt_dir, exist_ok=True)
+    from modelopt.torch.quantization.utils.layerwise_calib import _write_manifest
+
+    _write_manifest(ckpt_dir, last_completed_layer=2, num_layers=3)
+
+    result = detect_resume_point(ckpt_dir)
+    assert result is not None
+    start, _ = result
+    assert start == state.num_layers == 3
+
+
+def test_layerwise_calibrate_early_returns_on_completed_checkpoint(monkeypatch, tmp_path):
+    """Fully-completed checkpoint must short-circuit calibration: no forward_loop calls."""
+    _register_test_discoverer(monkeypatch)
+    torch.manual_seed(0)
+
+    # Set up a model and run one round of layerwise calibration to write a complete checkpoint.
+    model = _SimpleTransformerModel(n_layers=2, dim=16)
+    calib_data = [torch.randint(0, 32, (2, 8))]
+    ckpt_dir = str(tmp_path / "ckpt")
+
+    config = _int8_layerwise_config(
+        {"method": "max", "layerwise": True, "layerwise_checkpoint_dir": ckpt_dir}
+    )
+    mtq.quantize(model, config, forward_loop=lambda m: [m(b) for b in calib_data])
+
+    # Second invocation against the same dir should never call forward_loop again.
+    fresh = _SimpleTransformerModel(n_layers=2, dim=16)
+    config2 = _int8_layerwise_config(
+        {"method": "max", "layerwise": True, "layerwise_checkpoint_dir": ckpt_dir}
+    )
+
+    call_count = {"n": 0}
+
+    def counting_forward(m):
+        call_count["n"] += 1
+        m(calib_data[0])
+
+    mtq.quantize(fresh, config2, forward_loop=counting_forward)
+    assert call_count["n"] == 0, "completed checkpoint must skip the calibration forward loop"
+
+
+def test_layerwise_calibrate_resumes_from_partial_checkpoint(monkeypatch, tmp_path):
+    """Partial-checkpoint manifest makes ``detect_resume_point`` report the next layer to run."""
+    _register_test_discoverer(monkeypatch)
+
+    # Hand-write a manifest claiming layer 0 of 2 is complete. This test writes
+    # no layer_0000 payload itself, so it never attempts a restore. It only checks
+    # that detect_resume_point returns start=1 (not None) — verifying the
+    # partial-resume branch and the "all done" branch are distinct.
+    import os
+
+    ckpt_dir = str(tmp_path / "ckpt")
+    os.makedirs(ckpt_dir, exist_ok=True)
+    from modelopt.torch.quantization.utils.layerwise_calib import _write_manifest
+
+    _write_manifest(ckpt_dir, last_completed_layer=0, num_layers=2)
+
+    result = detect_resume_point(ckpt_dir)
+    assert result == (1, {"last_completed_layer": 0, "num_layers": 2})
diff --git a/tests/unit/torch/quantization/test_nvfp4_promotion.py b/tests/unit/torch/quantization/test_nvfp4_promotion.py