|
15 | 15 |
|
16 | 16 | """Unit tests for layerwise_calibrate and LayerActivationCollector.""" |
17 | 17 |
|
| 18 | +import copy |
18 | 19 | from collections import deque |
19 | 20 |
|
20 | 21 | import pytest |
21 | 22 | import torch |
22 | 23 | import torch.nn as nn |
23 | 24 |
|
| 25 | +import modelopt.torch.quantization as mtq |
24 | 26 | from modelopt.torch.quantization.model_calib import layerwise_calibrate |
| 27 | +from modelopt.torch.quantization.nn import TensorQuantizer |
25 | 28 | from modelopt.torch.quantization.utils.layerwise_calib import LayerActivationCollector, _SkipLayer |
26 | 29 |
|
27 | 30 |
|
@@ -593,3 +596,131 @@ def forward_loop(m): |
593 | 596 | for i, orig in enumerate(originals): |
594 | 597 | assert model.layers[i] is orig, f"Layer {i} not restored to original after cleanup" |
595 | 598 | assert not hasattr(orig, "_layerwise_calib"), f"Layer {i} still has _layerwise_calib" |
| 599 | + |
| 600 | + |
| 601 | +# --------------------------------------------------------------------------- |
| 602 | +# End-to-end mtq.quantize(..., algorithm={"layerwise": True}) per PTQ algorithm |
| 603 | +# --------------------------------------------------------------------------- |
| 604 | + |
| 605 | + |
def _int8_layerwise_config(algorithm: dict) -> dict:
    """Return the shipped INT8 SmoothQuant config with *algorithm* swapped in.

    Building on a real shipped config guarantees the same include/exclude rules
    production PTQ relies on, so algorithm dispatch matches real usage.
    """
    # Deep-copy first so mutating the result never touches the shared module
    # constant, then overlay the caller's algorithm block.
    return {**copy.deepcopy(mtq.INT8_SMOOTHQUANT_CFG), "algorithm": algorithm}
| 615 | + |
| 616 | + |
def _awq_layerwise_config() -> dict:
    """INT4 weight-only AWQ config sized for the _DecoderBlock test model.

    Returns a deep copy of ``mtq.INT4_AWQ_CFG`` with the weight quantizer's
    AWQ block size shrunk to 8 (so it divides the test model's dim=16 hidden
    size) and the algorithm block set to layerwise awq_lite.
    """
    cfg = copy.deepcopy(mtq.INT4_AWQ_CFG)
    # Resize AWQ block to fit dim=16 hidden. ``quant_cfg`` is a dict keyed by
    # wildcard quantizer-name patterns, so index the pattern directly —
    # iterating the dict yields plain string keys (no ``.get``), and the old
    # ``entry.setdefault("cfg", {})`` buried the override under a nonexistent
    # "cfg" sub-key instead of setting ``block_sizes``.
    # NOTE(review): assumes modelopt's flat quant_cfg schema — confirm against
    # the installed mtq.config version.
    cfg["quant_cfg"]["*weight_quantizer"]["block_sizes"] = {-1: 8, "type": "static"}
    cfg["algorithm"] = {"method": "awq_lite", "alpha_step": 0.5, "layerwise": True}
    return cfg
| 626 | + |
| 627 | + |
def _svdquant_layerwise_config() -> dict:
    """SVDQuant config sized for the _DecoderBlock test model.

    Returns a deep copy of ``mtq.INT4_AWQ_CFG`` with the weight quantizer's
    block size shrunk to 8 (divides the dim=16 hidden size) and the algorithm
    block set to layerwise svdquant.
    """
    cfg = copy.deepcopy(mtq.INT4_AWQ_CFG)
    # ``quant_cfg`` is a dict keyed by wildcard quantizer-name patterns, so
    # index the pattern directly — iterating the dict yields string keys, and
    # the old ``entry.setdefault("cfg", {})`` wrote the override to a
    # nonexistent "cfg" sub-key instead of setting ``block_sizes``.
    # NOTE(review): assumes modelopt's flat quant_cfg schema — confirm against
    # the installed mtq.config version.
    cfg["quant_cfg"]["*weight_quantizer"]["block_sizes"] = {-1: 8, "type": "static"}
    cfg["algorithm"] = {"method": "svdquant", "lowrank": 4, "layerwise": True}
    return cfg
| 636 | + |
| 637 | + |
def test_mtq_quantize_layerwise_e2e_max(monkeypatch):
    """End-to-end: mtq.quantize with layerwise=True produces populated amax values.

    ``max`` is the representative algorithm for the layerwise happy path because
    every other algorithm seeds amax via max_calibrate first — if max works, the
    shared skip/run/capture machinery is sound. Other algorithms are covered by
    the dispatch-only test below to avoid hardware requirements (e.g. gptq needs
    CUDA) or unnecessary duplication.
    """
    _register_test_discoverer(monkeypatch)
    config = _int8_layerwise_config({"method": "max", "layerwise": True})

    torch.manual_seed(0)
    model = _SimpleTransformerModel(n_layers=3, dim=16)
    batches = [torch.randint(0, 32, (2, 8)) for _ in range(2)]

    def run_calibration(net):
        for sample in batches:
            net(sample)

    model = mtq.quantize(model, config, forward_loop=run_calibration)

    # Layerwise machinery must fully unwind: no placeholder layers, no markers.
    for i, layer in enumerate(model.layers):
        assert not isinstance(layer, _SkipLayer), f"layer {i} left as _SkipLayer"
        assert not hasattr(layer, "_layerwise_calib"), f"layer {i} leaked _layerwise_calib"

    # Count enabled TensorQuantizers inside the decoder layers that actually
    # recorded an amax during calibration.
    populated = 0
    for layer in model.layers:
        for module in layer.modules():
            if not isinstance(module, TensorQuantizer):
                continue
            if module.is_enabled and getattr(module, "_amax", None) is not None:
                populated += 1
    assert populated > 0, "no TensorQuantizer in decoder layers had _amax populated"

    # Quantized model must still run a plain forward pass.
    with torch.no_grad():
        model(batches[0])
| 678 | + |
| 679 | + |
@pytest.mark.parametrize(
    "algorithm",
    ["gptq", "awq_lite", "smoothquant", "mse"],
)
def test_mtq_quantize_layerwise_dispatches_for_algorithm(monkeypatch, algorithm):
    """Every layerwise-supporting algorithm must route through layerwise_calibrate.

    Stubs layerwise_calibrate to a spy so the dispatch contract is checked without
    running the algorithm's full calibration — lets ``gptq`` (CUDA-only at runtime)
    and other expensive algorithms participate in CPU unit tests.
    """
    captured: dict = {}

    def fake_layerwise_calibrate(model, forward_loop, calib_func, **kwargs):
        # Record what the mode handed us; never run the real calibration.
        captured["calib_func"] = calib_func
        captured["kwargs"] = kwargs

    monkeypatch.setattr(
        "modelopt.torch.quantization.mode.layerwise_calibrate", fake_layerwise_calibrate
    )

    config = (
        _awq_layerwise_config()
        if algorithm == "awq_lite"
        else _int8_layerwise_config({"method": algorithm, "layerwise": True})
    )

    torch.manual_seed(0)
    model = _SimpleTransformerModel(n_layers=2, dim=16)
    mtq.quantize(
        model,
        config,
        forward_loop=lambda m: m(torch.randint(0, 32, (2, 8))),
    )

    assert "calib_func" in captured, f"{algorithm} did not dispatch through layerwise_calibrate"
    assert callable(captured["calib_func"])
| 714 | + |
| 715 | + |
def test_mtq_quantize_layerwise_raises_for_unsupported_algorithm():
    """Modes with ``_supports_layerwise = False`` must raise a clear ValueError."""
    torch.manual_seed(0)
    model = _SimpleTransformerModel(n_layers=2, dim=16)
    config = _svdquant_layerwise_config()

    def single_batch(m):
        m(torch.randint(0, 32, (2, 8)))

    with pytest.raises(ValueError, match="does not support layerwise=True"):
        mtq.quantize(model, config, forward_loop=single_batch)
0 commit comments