Skip to content

Commit fcdaf65

Browse files
authored
Support decoder block-level sequential calibration (#924)
## What does this PR do? **Type of change:** New feature <!-- Use one of the following: Bug fix, new feature, new example, new tests, documentation. --> **Overview:** Add support for sequential calibration of layers (at decoder level granularity) in ModelOpt. Calibration flow: 1. Get the list of decoder blocks. 2. For the current block, get its input activations (considering weight and activation QDQ from all previous blocks) and call the specified calibration function. Functions added: 1. get_decoder_layers() -> to detect and get the list of blocks to iterate over 2. LayerActivationCollector class -> to get input activations to the layer 3. sequential_calibrate() -> to perform the described calibration flow 4. use_sequential field in QuantizeAlgorithmConfig ## Usage <!-- You can potentially add a usage example below. --> ```python # Sample config NVFP4_DEFAULT_CFG = { "quant_cfg": { "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": True, }, **_default_disabled_quantizer_cfg, }, "algorithm": { "method": "max", "use_sequential": True, }, } ``` Set use_sequential=True in QUANT_CFG's "algorithm" section. ## Testing <!-- Mention how have you tested your change if applicable. --> ## Before your PR is "*Ready for review*" <!-- If you haven't finished some of the above items you can still open `Draft` PR. --> - **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed. - **Is this change backward compatible?**: Yes/No <!--- If No, explain why.
--> - **Did you write any new necessary tests?**: Yes/No - **Did you add or update any necessary documentation?**: Yes/No - **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No <!--- Only for new features, API changes, critical bug fixes or bw breaking changes. --> ## Additional Information <!-- E.g. related issue. --> <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Sequential layer-by-layer calibration: Quantization now supports processing decoder layers sequentially to improve memory efficiency on large models. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com>
1 parent 2905cb0 commit fcdaf65

File tree

6 files changed

+532
-4
lines changed

6 files changed

+532
-4
lines changed

modelopt/torch/quantization/config.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1130,6 +1130,16 @@ class QuantizeAlgorithmConfig(ModeloptBaseConfig):
11301130
),
11311131
)
11321132

1133+
use_sequential: bool = ModeloptField(
1134+
default=False,
1135+
title="Enable sequential layer-by-layer calibration.",
1136+
description=(
1137+
"If True, the calibration algorithm is applied sequentially to each decoder block. "
1138+
"The current approach recomputes a full forward pass per layer to propagate updated activations,"
1139+
"incurring O(N²) cost. Future revisions will add caching to eliminate redundant passes."
1140+
),
1141+
)
1142+
11331143

11341144
class MaxCalibConfig(QuantizeAlgorithmConfig):
11351145
"""The config for max calibration algorithm.

modelopt/torch/quantization/mode.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
local_hessian_calibrate,
6464
max_calibrate,
6565
mse_calibrate,
66+
sequential_calibrate,
6667
smoothquant,
6768
svdquant,
6869
)
@@ -221,6 +222,7 @@ def wrapped_calib_func(
221222
"""
222223
kwargs = config.model_dump()
223224
method = kwargs.pop("method")
225+
sequential = kwargs.pop("use_sequential", False)
224226
if method is not None and "awq" in method:
225227
# For backward compatibility
226228
kwargs["algorithm"] = method
@@ -235,8 +237,22 @@ def wrapped_calib_func(
235237
module._moe_calib_experts_ratio = moe_calib_experts_ratio
236238

237239
if func is not None:
238-
# Call the function with forward_loop as a separate argument
239-
func(model, forward_loop=forward_loop, **kwargs)
240+
if sequential:
241+
if forward_loop is None:
242+
raise ValueError("forward_loop is required for calibration but got None.")
243+
assert method in ["max"], (
244+
f"Sequential calibration currently only supports max calibration, got {method}"
245+
)
246+
# Wrap with sequential processing
247+
sequential_calibrate(
248+
model,
249+
forward_loop=forward_loop,
250+
calib_func=func,
251+
**kwargs,
252+
)
253+
else:
254+
# Direct calibration (existing behavior)
255+
func(model, forward_loop=forward_loop, **kwargs)
240256

241257
# Lets get the latest metadata for the quantizer states
242258
metadata = {}

modelopt/torch/quantization/model_calib.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,14 @@
2828
from tqdm import tqdm
2929

3030
from modelopt.torch.opt.searcher import ForwardLoop
31+
from modelopt.torch.quantization.utils import LayerActivationCollector
3132
from modelopt.torch.utils import print_rank_0
3233
from modelopt.torch.utils.distributed import DistributedProcessGroup, ParallelState
33-
from modelopt.torch.utils.network import bind_forward_method, unpatch_forward_method
34+
from modelopt.torch.utils.network import (
35+
bind_forward_method,
36+
get_decoder_layers,
37+
unpatch_forward_method,
38+
)
3439
from modelopt.torch.utils.perf import get_used_gpu_mem_fraction
3540

3641
from .calib import MseCalibrator, NVFP4MSECalibrator
@@ -49,7 +54,14 @@
4954
weight_attr_names,
5055
)
5156

52-
__all__ = ["awq", "local_hessian_calibrate", "max_calibrate", "smoothquant", "svdquant"]
57+
__all__ = [
58+
"awq",
59+
"local_hessian_calibrate",
60+
"max_calibrate",
61+
"sequential_calibrate",
62+
"smoothquant",
63+
"svdquant",
64+
]
5365

5466

5567
def weight_only_quantize(model: nn.Module):
@@ -1819,3 +1831,40 @@ def hessian_hook(module, input, output):
18191831
torch.cuda.empty_cache()
18201832

18211833
print_rank_0("GPTQ-lite quantization completed successfully")
1834+
1835+
1836+
@torch.no_grad()
def sequential_calibrate(
    model: nn.Module,
    forward_loop: ForwardLoop,
    calib_func: Callable,
    **calib_kwargs,
):
    """Sequential calibration - a sequential layer-by-layer calibration algorithm.

    For each decoder block, the inputs reaching that block are re-captured with a
    full forward pass (so weight/activation QDQ effects of already-calibrated
    blocks are reflected), and ``calib_func`` is then run on that block alone
    with the captured inputs. The repeated full forward passes make this O(N^2)
    in the number of blocks.

    Args:
        model: The model whose decoder blocks are calibrated sequentially.
        forward_loop: Callable that takes ``model`` and runs calibration data through it.
        calib_func: Per-block calibration function invoked as
            ``calib_func(module, forward_loop=..., **calib_kwargs)`` (e.g. ``max_calibrate``).
        **calib_kwargs: Extra keyword arguments forwarded to ``calib_func``.

    Raises:
        ValueError: If ``forward_loop`` is None, or no decoder layers can be detected.
    """
    if forward_loop is None:
        raise ValueError("forward_loop must not be None for sequential calibration.")

    transformer_layers = get_decoder_layers(model)
    if transformer_layers is None:
        # NOTE: fixed a stray apostrophe that was previously in this message.
        raise ValueError(
            "Could not find transformer layers in model. "
            "Sequential calibration requires a model with identifiable transformer layers."
        )

    print_rank_0(f"Sequential calibration: Found {len(transformer_layers)} transformer layers")

    collector = LayerActivationCollector(model)

    for layer in transformer_layers:
        # Re-run the full forward pass so the captured inputs reflect the
        # quantization of all previously calibrated blocks.
        layer_inputs = collector.get_input_activations(layer, forward_loop)

        # Replay the captured (args, kwargs) pairs through this block alone.
        # The inputs are bound via a default argument so each iteration's
        # closure sees its own capture list.
        def _layer_forward_loop(m, _inputs=layer_inputs):
            for args, kwargs_input in _inputs:
                m(*args, **kwargs_input)

        # Pass forward_loop by keyword, matching how calibration functions are
        # invoked elsewhere (func(model, forward_loop=..., **kwargs)).
        calib_func(layer, forward_loop=_layer_forward_loop, **calib_kwargs)

        # Release the captured activations before moving to the next block.
        del layer_inputs
        torch.cuda.empty_cache()

modelopt/torch/quantization/utils.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,13 @@
2929
from torch.distributed.tensor import Replicate
3030

3131
from modelopt.torch.utils import get_unwrapped_name, print_rank_0
32+
from modelopt.torch.utils.network import bind_forward_method, unpatch_forward_method
3233

3334
if TYPE_CHECKING:
3435
from collections.abc import Generator
3536

37+
from modelopt.torch.opt.searcher import ForwardLoop
38+
3639
__all__ = [
3740
"EXPORT_MODE",
3841
"convert_quantization_axis_to_reduce_axis",
@@ -808,3 +811,64 @@ def update_quant_cfg_with_kv_cache_quant(
808811
quant_cfg["algorithm"] = "max"
809812
print_rank_0(f"Updated quant_cfg with KV cache quantization: {quant_cfg}")
810813
return quant_cfg
814+
815+
816+
class _EarlyStopForwardError(Exception):
    """Error to stop the forward pass after collection.

    Raised by a patched layer forward once its inputs have been recorded; caught
    and swallowed by the wrapped top-level model forward so that the remaining
    computation of the batch is skipped while the next batch can still run.
    """
818+
819+
820+
class LayerActivationCollector:
    """Helper class for collecting layer activations during forward passes.

    This class allows for sequential layer calibration by
    patching layers to capture inputs/outputs during forward passes
    """

    def __init__(self, model: nn.Module):
        # Root model whose forward is driven by the caller-provided forward_loop;
        # sub-layer inputs are captured while this model runs.
        self.model = model

    @staticmethod
    def _patch_and_initialize_layer(layer: torch.nn.Module, stop_after_collection: bool = False):
        """Patch a layer to collect inputs during forward passes."""

        def _forward_w_data_collection(self, *args, **kwargs):
            # Note: 'self' refers to the patched layer.
            assert len(args) >= 1, (
                f"Expected at least 1 positional arg, got {len(args)} args and {list(kwargs.keys())} kwargs"
            )
            # Only collect the inputs to the layer
            self.inputs.append((args, kwargs))
            if stop_after_collection:
                raise _EarlyStopForwardError()  # Stop the forward pass after collection

            return self._original_forward(*args, **kwargs)

        # bind_forward_method stashes the layer's original forward under
        # "_original_forward" and installs the collector in its place.
        bind_forward_method(layer, _forward_w_data_collection, "_original_forward")
        layer.inputs = []

    @staticmethod
    def _unpatch_and_cleanup_layer(layer: torch.nn.Module):
        """Restore the layer's original forward and drop any collected inputs."""
        # hasattr guards make this safe to call from a `finally` even if
        # patching never completed.
        if hasattr(layer, "_original_forward"):
            unpatch_forward_method(layer, "_original_forward")
        if hasattr(layer, "inputs"):
            del layer.inputs

    @torch.no_grad()
    def get_input_activations(self, layer: torch.nn.Module, forward_loop: ForwardLoop) -> list:
        """Run ``forward_loop`` and return the (args, kwargs) pairs that reached ``layer``.

        The model's forward is wrapped so that the _EarlyStopForwardError raised by the
        patched layer aborts the rest of each batch's computation (returning None for
        that call) without aborting the loop over batches.
        """
        # Wrap model forward to catch _EarlyStopForward per-batch
        def _early_stop_forward(self, *args, **kwargs):
            try:
                return self._original_forward(*args, **kwargs)
            except _EarlyStopForwardError:
                return None  # Stop propagation but allow next batch

        try:
            bind_forward_method(self.model, _early_stop_forward, "_original_forward")
            self._patch_and_initialize_layer(layer, stop_after_collection=True)
            forward_loop(self.model)
            # Copy before unpatching, since cleanup deletes layer.inputs.
            inputs = layer.inputs.copy()
        finally:
            self._unpatch_and_cleanup_layer(layer)
            unpatch_forward_method(self.model, "_original_forward")

        return inputs

modelopt/torch/utils/network.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -634,3 +634,36 @@ def unpatch_forward_method(module: nn.Module, orig_forward_cache_name: str):
634634
with temporarily_remove_accelerate_hook(module):
635635
setattr(module, "forward", getattr(module, orig_forward_cache_name))
636636
delattr(module, orig_forward_cache_name)
637+
638+
639+
def get_decoder_layers(model: nn.Module, granularity: str = "decoder") -> nn.ModuleList | None:
640+
"""Detect the decoder layers from a model for sequential calibration.
641+
642+
This temporary decoder-layer detection heuristic will be replaced with a more robust solution
643+
that also supports FSDP/DDP models.
644+
"""
645+
if granularity != "decoder":
646+
raise ValueError(f"Unsupported granularity: {granularity}. Only 'decoder' is supported.")
647+
648+
# HuggingFace transformers pattern: model.model.layers
649+
if hasattr(model, "model") and hasattr(model.model, "layers"):
650+
return model.model.layers
651+
652+
# Megatron/MCore pattern: model.decoder.layers
653+
if hasattr(model, "decoder") and hasattr(model.decoder, "layers"):
654+
return model.decoder.layers
655+
656+
# Direct layers attribute (some models)
657+
if hasattr(model, "layers") and isinstance(model.layers, nn.ModuleList):
658+
return model.layers
659+
660+
# GPT-style: model.transformer.h
661+
if hasattr(model, "transformer") and hasattr(model.transformer, "h"):
662+
return model.transformer.h
663+
664+
# Nemotron Super/Nano
665+
if hasattr(model, "backbone") and hasattr(model.backbone, "layers"):
666+
return model.backbone.layers
667+
668+
print("No decoder layers found for model, returning None")
669+
return None

0 commit comments

Comments
 (0)