Skip to content

Commit 8af3655

Browse files
realAsma authored and claude committed
Add checkpoint save/resume for sequential calibration of large models
Per-layer checkpoints allow sequential calibration to resume from the last completed layer after a crash or preemption. Also extends sequential calibration to work with FSDP2 and accelerate CPU-offloaded models. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: realAsma <akuriparambi@nvidia.com>
1 parent b6c6ec3 commit 8af3655

File tree

19 files changed

+1499
-408
lines changed

19 files changed

+1499
-408
lines changed

modelopt/torch/quantization/config.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1227,6 +1227,16 @@ class QuantizeAlgorithmConfig(ModeloptBaseConfig):
12271227
),
12281228
)
12291229

1230+
checkpoint_dir: str | None = ModeloptField(
1231+
default=None,
1232+
title="Checkpoint directory for sequential calibration.",
1233+
description=(
1234+
"If set together with use_sequential=True, per-layer checkpoints are saved to this "
1235+
"directory during calibration. On restart, calibration resumes from the last "
1236+
"completed layer."
1237+
),
1238+
)
1239+
12301240

12311241
class MaxCalibConfig(QuantizeAlgorithmConfig):
12321242
"""The config for max calibration algorithm.

modelopt/torch/quantization/mode.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ def wrapped_calib_func(
223223
kwargs = config.model_dump()
224224
method = kwargs.pop("method")
225225
sequential = kwargs.pop("use_sequential", False)
226+
checkpoint_dir = kwargs.pop("checkpoint_dir", None)
226227
if method is not None and "awq" in method:
227228
# For backward compatibility
228229
kwargs["algorithm"] = method
@@ -240,14 +241,12 @@ def wrapped_calib_func(
240241
if sequential:
241242
if forward_loop is None:
242243
raise ValueError("forward_loop is required for calibration but got None.")
243-
assert method in ["max", "gptq"], (
244-
f"Sequential calibration currently only supports max and gptq calibration, got {method}"
245-
)
246244
# Wrap with sequential processing
247245
sequential_calibrate(
248246
model,
249247
forward_loop=forward_loop,
250248
calib_func=func,
249+
checkpoint_dir=checkpoint_dir,
251250
**kwargs,
252251
)
253252
else:

modelopt/torch/quantization/model_calib.py

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from tqdm import tqdm
2929

3030
from modelopt.torch.opt.searcher import ForwardLoop
31-
from modelopt.torch.quantization.utils.activation_collector import LayerActivationCollector
31+
from modelopt.torch.quantization.utils.layerwise_calib import LayerActivationCollector
3232
from modelopt.torch.utils import print_rank_0
3333
from modelopt.torch.utils.distributed import DistributedProcessGroup, ParallelState
3434
from modelopt.torch.utils.network import bind_forward_method, unpatch_forward_method
@@ -1563,7 +1563,15 @@ def sequential_calibrate(
15631563
Runs the full model forward per layer but patches decoder layers with a
15641564
skip / run / capture strategy so that inter-layer logic in parent modules
15651565
(e.g. mask construction) executes naturally without model-specific hooks.
1566+
1567+
If ``checkpoint_dir`` is passed (via ``calib_kwargs``), per-layer checkpoints
1568+
are saved after each layer completes. On restart, calibration resumes from
1569+
the last completed layer.
15661570
"""
1571+
from modelopt.torch.quantization.utils.layerwise_calib import _CheckpointState
1572+
1573+
checkpoint_dir = calib_kwargs.pop("checkpoint_dir", None)
1574+
15671575
if forward_loop is None:
15681576
raise ValueError(
15691577
"forward_loop must not be None for sequential calibration. "
@@ -1577,27 +1585,52 @@ def sequential_calibrate(
15771585
"Sequential calibration requires a model with identifiable transformer layers."
15781586
)
15791587

1580-
print_rank_0(f"Sequential calibration: Found {len(transformer_layers)} transformer layers")
1588+
num_layers = len(transformer_layers)
1589+
print_rank_0(f"Sequential calibration: Found {num_layers} transformer layers")
1590+
1591+
ckpt = _CheckpointState.from_folder(checkpoint_dir, num_layers)
1592+
start_layer = ckpt.start_layer if ckpt else 0
15811593

15821594
input_getter = LayerActivationCollector(model)
15831595
input_getter._patch_all_layers(decoder_layers=transformer_layers)
15841596

1597+
resumed_inputs = ckpt.setup_resume(transformer_layers) if ckpt and start_layer > 0 else None
1598+
15851599
try:
1586-
for layer_idx, layer in enumerate(transformer_layers):
1587-
print_rank_0(f"Calibrating layer {layer_idx + 1}/{len(transformer_layers)}")
1588-
layer_inputs = input_getter.get_input_activations(layer, forward_loop)
1600+
# Bootstrap: get first layer's inputs (or use resumed inputs).
1601+
layer_inputs = input_getter.get_first_layer_inputs(
1602+
start_layer, resumed_inputs, forward_loop
1603+
)
15891604

1590-
def _layer_forward_loop(m, _inputs=layer_inputs):
1591-
for args, kwargs_input in _inputs:
1605+
for layer_idx in range(start_layer, num_layers):
1606+
layer = transformer_layers[layer_idx]
1607+
1608+
def _layer_forward_loop(m):
1609+
for args, kwargs_input in layer_inputs:
15921610
m(*args, **kwargs_input)
15931611

15941612
calib_func(layer, _layer_forward_loop, **calib_kwargs)
15951613

1614+
# Run one more forward to get next layer's inputs and set
1615+
# output_meta on the just-calibrated layer (via "run" mode).
1616+
is_last = layer_idx + 1 >= num_layers
1617+
if not is_last:
1618+
next_inputs = input_getter.cache_outputs_for_next_layer_calib(layer, forward_loop)
1619+
else:
1620+
next_inputs = None
1621+
1622+
if ckpt:
1623+
ckpt.save(layer_idx, layer, model, transformer_layers, next_inputs)
1624+
15961625
del layer_inputs
15971626
torch.cuda.empty_cache()
1627+
layer_inputs = next_inputs
15981628
finally:
15991629
input_getter._unpatch_all_layers()
16001630

1631+
if ckpt:
1632+
ckpt.full_restore(transformer_layers, model)
1633+
16011634
print_rank_0("Sequential calibration completed")
16021635

16031636

@@ -1663,8 +1696,10 @@ def gptq(
16631696
handle.cleanup()
16641697

16651698
print_rank_0("Updating weights using GPTQ algorithm...")
1699+
name_to_module = dict(model.named_modules())
16661700
for handle in gptq_handles.values():
1667-
handle.update_weights(block_size, perc_damp)
1701+
with enable_weight_access_and_writeback(handle.module, model, name_to_module):
1702+
handle.update_weights(block_size, perc_damp)
16681703
handle.free()
16691704
del gptq_handles
16701705

modelopt/torch/quantization/plugins/accelerate.py

Lines changed: 64 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333

3434
def _get_cpu_offload_hook(hook):
3535
if isinstance(hook, AlignDevicesHook) and hook.offload and hook.weights_map is not None:
36-
assert "weight" in hook.weights_map
36+
assert len(hook.weights_map) > 0
3737
if (
3838
isinstance(hook.weights_map, PrefixedDataset)
3939
and hook.weights_map.prefix + "weight" not in hook.weights_map.dataset.state_dict
@@ -50,32 +50,79 @@ def _get_cpu_offload_hook(hook):
5050
return None
5151

5252

53+
def _writeback_params_to_weights_map(module, align_hook):
54+
"""Write all non-meta parameters back to the hook's CPU weights_map."""
55+
for name, param in module.named_parameters():
56+
if param.device.type == "meta":
57+
continue
58+
if isinstance(align_hook.weights_map, PrefixedDataset):
59+
key = align_hook.weights_map.prefix + name
60+
w_map = align_hook.weights_map.dataset.state_dict
61+
else:
62+
w_map = align_hook.weights_map
63+
key = name
64+
if key in w_map:
65+
w_map[key] = param.data.to(w_map[key].device, dtype=w_map[key].dtype)
66+
67+
5368
@contextmanager
5469
def weight_access_and_writeback_context(module):
55-
"""Context manager for weight access and writeback for modules managed by accelerate."""
70+
"""Context manager for weight access and writeback for modules managed by accelerate.
71+
72+
Handles two cases:
73+
1. **Single-module**: the module's own ``_hf_hook`` is an offload hook.
74+
2. **Sub-module**: the module's hook is non-offloading, but its children have
75+
offload hooks (common with ``SequentialHook`` on sub-modules placed by
76+
``load_checkpoint_and_dispatch``).
77+
78+
For the sub-module case, ``pre_forward`` is skipped on sub-modules whose weights
79+
are already materialized (not on meta). This allows the context manager to be
80+
used as a pure writeback after weight-modifying algorithms.
81+
"""
5682
assert hasattr(module, "_hf_hook")
5783
align_hook = _get_cpu_offload_hook(module._hf_hook)
5884

5985
if align_hook:
60-
# Accelerate uses AlignDevicesHook to offload weights to CPU/Disk and then reload them in the forward pass
61-
# The CPU/Disk offloaded weights are managed by PrefixDataset and OffloadedWeightsLoader
62-
# See https://github.com/huggingface/accelerate/blame/f48d95c4939b281505a45b3d6e0bf554b65cc1ea/src/accelerate/utils/offload.py#L104-L141
63-
# TODO: Add support for disk-offloaded models if needed (they will be really slow, hence low priority)
64-
65-
# This will load the weights from CPU state_dict and move it to the GPU from meta device
86+
# Guard: the sub-module branch below is not reached when the parent has
87+
# an offload hook. Assert that no children also carry offload hooks,
88+
# which would require a combined writeback strategy.
89+
assert not any(
90+
_get_cpu_offload_hook(mod._hf_hook)
91+
for mod in module.modules()
92+
if mod is not module and hasattr(mod, "_hf_hook")
93+
), (
94+
"Both the module and one of its sub-modules have CPU-offload hooks. "
95+
"weight_access_and_writeback_context does not support this layout yet."
96+
)
6697
align_hook.pre_forward(module)
98+
try:
99+
yield
100+
finally:
101+
_writeback_params_to_weights_map(module, align_hook)
102+
align_hook.post_forward(module, None)
103+
return
104+
105+
materialized: list[tuple[torch.nn.Module, AlignDevicesHook, bool]] = []
106+
for mod in module.modules():
107+
if mod is module or not hasattr(mod, "_hf_hook"):
108+
continue
109+
hook = _get_cpu_offload_hook(mod._hf_hook)
110+
if hook is None:
111+
continue
112+
# Only call pre_forward if weights need materializing; already-materialized
113+
# weights would be overwritten with stale CPU state_dict values.
114+
needs_materialize = any(p.device.type == "meta" for p in mod.parameters())
115+
if needs_materialize:
116+
hook.pre_forward(mod)
117+
materialized.append((mod, hook, needs_materialize))
118+
67119
try:
68120
yield
69121
finally:
70-
if align_hook:
71-
# Update the weight in the CPU state_dict
72-
if isinstance(align_hook.weights_map, PrefixedDataset):
73-
key = align_hook.weights_map.prefix + "weight"
74-
w_map = align_hook.weights_map.dataset.state_dict
75-
else:
76-
key, w_map = "weight", align_hook.weights_map
77-
w_map[key] = module.weight.data.to(w_map[key].device, dtype=w_map[key].dtype)
78-
align_hook.post_forward(module, None)
122+
for mod, hook, was_materialized in materialized:
123+
_writeback_params_to_weights_map(mod, hook)
124+
if was_materialized:
125+
hook.post_forward(mod, None)
79126

80127

81128
@contextmanager

modelopt/torch/quantization/plugins/huggingface.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
from ..nn.modules.quant_linear import _QuantLinear
4040
from ..triton import IS_AVAILABLE as IS_TRITON_AVAILABLE
4141
from ..utils import replace_function, sync_moe_expert_amax
42-
from ..utils.activation_collector import LayerActivationCollector
42+
from ..utils.layerwise_calib import LayerActivationCollector
4343
from .attention import register_attention_for_kv_quant
4444
from .custom import CUSTOM_MODEL_PLUGINS, _ParallelLinear, _QuantFunctionalMixin
4545

modelopt/torch/quantization/utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
# ruff: noqa: F405
1717
"""Quantization utilities."""
1818

19-
from .activation_collector import LayerActivationCollector
2019
from .core_utils import *
20+
from .layerwise_calib import LayerActivationCollector
2121

2222
__all__ = [
2323
"EXPORT_MODE",

0 commit comments

Comments
 (0)