Skip to content

Commit 43d1888

Browse files
realAsma and claude committed
Address PR review feedback for layerwise calibration
- Add inline security comments for all torch.load(weights_only=False) calls
- Replace bare assert with RuntimeError for unsupported offload hook layout
- Write back buffers (not just parameters) in _writeback_params_to_weights_map
- Add cross-field validator rejecting layerwise_checkpoint_dir without layerwise=True
- Validate num_layers mismatch on checkpoint resume
- Handle integer device ordinals in _get_execution_device_from_hook
- Clean up stale layer artifacts in partial-checkpoint tests
- Guard non-dict algorithm values in needs_checkpoint_path_update
- Add comment explaining dummy output_meta for last layer

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: realAsma <akuriparambi@nvidia.com>
1 parent 6f63f44 commit 43d1888

6 files changed

Lines changed: 63 additions & 18 deletions

File tree

examples/llm_ptq/example_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -860,7 +860,7 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod
860860
def needs_checkpoint_path_update(quant_cfg: dict) -> bool:
861861
"""Check if quant_cfg has a layerwise_checkpoint_dir that should be auto-resolved to a unique subpath."""
862862
algorithm = quant_cfg.get("algorithm")
863-
if algorithm is None or isinstance(algorithm, str):
863+
if not isinstance(algorithm, dict):
864864
return False
865865
return algorithm.get("layerwise_checkpoint_dir") is not None
866866

modelopt/torch/quantization/config.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1237,6 +1237,16 @@ class QuantizeAlgorithmConfig(ModeloptBaseConfig):
12371237
),
12381238
)
12391239

1240+
@model_validator(mode="after")
1241+
def validate_layerwise_checkpoint_dir(self):
1242+
"""Raise if layerwise_checkpoint_dir is set but layerwise is False."""
1243+
if self.layerwise_checkpoint_dir is not None and not self.layerwise:
1244+
raise ValueError(
1245+
"layerwise_checkpoint_dir requires layerwise=True. "
1246+
"Set layerwise=True or remove layerwise_checkpoint_dir."
1247+
)
1248+
return self
1249+
12401250

12411251
class MaxCalibConfig(QuantizeAlgorithmConfig):
12421252
"""The config for max calibration algorithm.

modelopt/torch/quantization/plugins/accelerate.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ def _get_cpu_offload_hook(hook):
5050

5151

5252
def _writeback_params_to_weights_map(module, align_hook):
53-
"""Write all non-meta parameters back to the hook's CPU weights_map."""
54-
for name, param in module.named_parameters():
55-
if param.device.type == "meta":
53+
"""Write all non-meta parameters and buffers back to the hook's CPU weights_map."""
54+
for name, tensor in module.state_dict(keep_vars=True).items():
55+
if tensor.device.type == "meta":
5656
continue
5757
if isinstance(align_hook.weights_map, PrefixedDataset):
5858
key = align_hook.weights_map.prefix + name
@@ -61,7 +61,7 @@ def _writeback_params_to_weights_map(module, align_hook):
6161
w_map = align_hook.weights_map
6262
key = name
6363
if key in w_map:
64-
w_map[key] = param.data.to(w_map[key].device, dtype=w_map[key].dtype)
64+
w_map[key] = tensor.detach().to(w_map[key].device, dtype=w_map[key].dtype)
6565

6666

6767
@contextmanager
@@ -85,14 +85,15 @@ def weight_access_and_writeback_context(module):
8585
# Guard: the sub-module branch below is not reached when the parent has
8686
# an offload hook. Assert that no children also carry offload hooks,
8787
# which would require a combined writeback strategy.
88-
assert not any(
88+
if any(
8989
_get_cpu_offload_hook(mod._hf_hook)
9090
for mod in module.modules()
9191
if mod is not module and hasattr(mod, "_hf_hook")
92-
), (
93-
"Both the module and one of its sub-modules have CPU-offload hooks. "
94-
"weight_access_and_writeback_context does not support this layout yet."
95-
)
92+
):
93+
raise RuntimeError(
94+
"Both the module and one of its sub-modules have CPU-offload hooks. "
95+
"weight_access_and_writeback_context does not support this layout yet."
96+
)
9697
align_hook.pre_forward(module)
9798
align_hook.offload = False
9899
try:

modelopt/torch/quantization/utils/layerwise_calib.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,13 @@ def from_folder(cls, checkpoint_dir: str | None, num_layers: int) -> _Checkpoint
555555
return None
556556
os.makedirs(checkpoint_dir, exist_ok=True)
557557
info = detect_resume_point(checkpoint_dir)
558+
if info is not None:
559+
manifest_num_layers = info[1].get("num_layers")
560+
if manifest_num_layers is not None and manifest_num_layers != num_layers:
561+
raise ValueError(
562+
f"Checkpoint num_layers mismatch: manifest has {manifest_num_layers} "
563+
f"but model has {num_layers}. Use a fresh checkpoint directory."
564+
)
558565
start = info[0] if info else 0
559566
if start > 0:
560567
print_rank_0(
@@ -575,6 +582,7 @@ def setup_resume(self, layers: nn.ModuleList) -> list | None:
575582

576583
for i in range(self.start_layer):
577584
d = _layer_dir(self.checkpoint_dir, i)
585+
# weights_only=False is safe: file is internally generated by _save_layer, not user-supplied
578586
meta = torch.load(
579587
os.path.join(d, "output_meta.pt"), map_location="cpu", weights_only=False
580588
)
@@ -586,6 +594,7 @@ def setup_resume(self, layers: nn.ModuleList) -> list | None:
586594
next_inputs_path = os.path.join(d, "next_inputs.pt")
587595
if not os.path.isfile(next_inputs_path):
588596
raise FileNotFoundError(f"Cannot resume: next_inputs.pt missing for layer {last_ckpt}")
597+
# weights_only=False is safe: file is internally generated by _save_layer, not user-supplied
589598
next_inputs = torch.load(next_inputs_path, map_location="cpu", weights_only=False)
590599
resume_device = get_module_device(layers[self.start_layer])
591600
next_inputs = _move_to_device(next_inputs, resume_device)
@@ -610,14 +619,20 @@ def full_restore(self, layers: nn.ModuleList, model: nn.Module) -> None:
610619
# Restore quantizer state first: may promote TensorQuantizer to
611620
# NVFP4StaticQuantizer, changing module structure that load_state_dict
612621
# expects.
613-
qstate = torch.load(os.path.join(d, "quantizer_state.pt"), map_location=layer_device)
622+
# weights_only=False is safe: file is internally generated by _save_layer, not user-supplied
623+
qstate = torch.load(
624+
os.path.join(d, "quantizer_state.pt"), map_location=layer_device, weights_only=False
625+
)
614626
restore_quantizer_state(layer, dummy_config, {"quantizer_state": qstate})
615627

616628
# Load weights inside the framework's access context so that
617629
# managed-weight frameworks (accelerate CPU offload, FSDP2) sync
618630
# their internal state with the restored parameters.
619631
with enable_weight_access_and_writeback(layer, model, name_to_module):
620-
weights = torch.load(os.path.join(d, "weights.pt"), map_location=layer_device)
632+
# weights_only=False is safe: file is internally generated by _save_layer, not user-supplied
633+
weights = torch.load(
634+
os.path.join(d, "weights.pt"), map_location=layer_device, weights_only=False
635+
)
621636
layer.load_state_dict(weights, strict=False)
622637

623638
print_rank_0(f"Checkpoint: restored {self.start_layer} previously calibrated layers")
@@ -649,6 +664,8 @@ def save(
649664

650665
output_meta = getattr(layer._layerwise_calib, "output_meta", None)
651666
if output_meta is None:
667+
# Placeholder for the last layer: output_meta is never used for skip mode
668+
# since there is no subsequent layer that needs a correctly shaped dummy output.
652669
output_meta = LayerActivationCollector._extract_output_meta(torch.zeros(1))
653670

654671
_save_layer(

modelopt/torch/utils/network.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,12 @@ def _get_execution_device_from_hook(module: nn.Module) -> torch.device | None:
103103

104104
dev = getattr(hook, "execution_device", None)
105105
if dev is not None:
106-
return torch.device(dev)
106+
return torch.device("cuda", dev) if isinstance(dev, int) else torch.device(dev)
107107

108108
for h in getattr(hook, "hooks", ()):
109109
dev = getattr(h, "execution_device", None)
110110
if dev is not None:
111-
return torch.device(dev)
111+
return torch.device("cuda", dev) if isinstance(dev, int) else torch.device(dev)
112112

113113
return None
114114

tests/gpu/torch/quantization/plugins/test_accelerate_gpu.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import copy
1717
import json
1818
import os
19+
import shutil
1920

2021
import pytest
2122
import torch
@@ -29,6 +30,7 @@
2930
enable_weight_access_and_writeback,
3031
is_quantized_linear,
3132
)
33+
from modelopt.torch.quantization.utils.layerwise_calib import _layer_dir
3234

3335

3436
@pytest.mark.parametrize(
@@ -204,10 +206,15 @@ def test_sequential_checkpoint_resume_cpu_offloaded(tmp_path, quant_cfg):
204206
mtq.quantize(model_ref, seq_ckpt_cfg, lambda model: model(inputs))
205207
output_ref = model_ref(inputs)
206208

207-
# Simulate crash after layer 0 by truncating the manifest
209+
# Simulate crash after layer 0 by truncating the manifest and removing later layers
210+
last_completed_layer = 0
208211
manifest_path = os.path.join(ckpt_dir, "manifest.json")
209212
with open(manifest_path, "w") as f:
210-
json.dump({"last_completed_layer": 0, "num_layers": num_layers}, f)
213+
json.dump({"last_completed_layer": last_completed_layer, "num_layers": num_layers}, f)
214+
for i in range(last_completed_layer + 1, num_layers):
215+
d = _layer_dir(ckpt_dir, i)
216+
if os.path.isdir(d):
217+
shutil.rmtree(d)
211218

212219
# Resume from a fresh CPU-offloaded model
213220
with init_empty_weights():
@@ -257,9 +264,14 @@ def _make_multi_offload_model():
257264
output_ref = model_ref(inputs)
258265

259266
# Simulate crash after layer 0
267+
last_completed_layer = 0
260268
manifest_path = os.path.join(ckpt_dir, "manifest.json")
261269
with open(manifest_path, "w") as f:
262-
json.dump({"last_completed_layer": 0, "num_layers": num_layers}, f)
270+
json.dump({"last_completed_layer": last_completed_layer, "num_layers": num_layers}, f)
271+
for i in range(last_completed_layer + 1, num_layers):
272+
d = _layer_dir(ckpt_dir, i)
273+
if os.path.isdir(d):
274+
shutil.rmtree(d)
263275

264276
# Resume from fresh model with same offload layout
265277
model_resumed = _make_multi_offload_model()
@@ -346,9 +358,14 @@ def test_sequential_gptq_checkpoint_resume_cpu_offloaded(tmp_path):
346358
output_ref = model_ref(inputs)
347359

348360
# Simulate crash after layer 0
361+
last_completed_layer = 0
349362
manifest_path = os.path.join(ckpt_dir, "manifest.json")
350363
with open(manifest_path, "w") as f:
351-
json.dump({"last_completed_layer": 0, "num_layers": num_layers}, f)
364+
json.dump({"last_completed_layer": last_completed_layer, "num_layers": num_layers}, f)
365+
for i in range(last_completed_layer + 1, num_layers):
366+
d = _layer_dir(ckpt_dir, i)
367+
if os.path.isdir(d):
368+
shutil.rmtree(d)
352369

353370
# Resume from fresh CPU-offloaded model
354371
with init_empty_weights():

0 commit comments

Comments (0)