
Commit 1b87855

Modular/Plugin based sequential calib
Signed-off-by: realAsma <akuriparambi@nvidia.com>
1 parent 4e59790 commit 1b87855

7 files changed

Lines changed: 558 additions & 46 deletions


modelopt/torch/quantization/model_calib.py

Lines changed: 14 additions & 15 deletions
@@ -31,11 +31,7 @@
 from modelopt.torch.quantization.utils import LayerActivationCollector
 from modelopt.torch.utils import print_rank_0
 from modelopt.torch.utils.distributed import DistributedProcessGroup, ParallelState
-from modelopt.torch.utils.network import (
-    bind_forward_method,
-    get_decoder_layers,
-    unpatch_forward_method,
-)
+from modelopt.torch.utils.network import bind_forward_method, unpatch_forward_method
 from modelopt.torch.utils.perf import get_used_gpu_mem_fraction

 from .calib import MseCalibrator, NVFP4MSECalibrator

@@ -1841,27 +1837,30 @@ def sequential_calibrate(
     **calib_kwargs,
 ):
     """Sequential calibration - a sequential layer-by-layer calibration algorithm."""
-    transformer_layers = get_decoder_layers(model)
-    if transformer_layers is None:
+    if not LayerActivationCollector.is_supported(model):
         raise ValueError(
-            "Could not find transformer layers in model'. "
+            "Could not find transformer layers in model. "
             "Sequential calibration requires a model with identifiable transformer layers."
         )
+    transformer_layers = LayerActivationCollector.get_decoder_layers(model)
+    assert transformer_layers is not None

     print_rank_0(f"Sequential calibration: Found {len(transformer_layers)} transformer layers")
+    if len(transformer_layers) == 0:
+        return

-    gettr = LayerActivationCollector(model)
+    input_getter = LayerActivationCollector(model)

-    for _, layer in enumerate(transformer_layers):
-        # Get updated input activations to the current layer
-        inputs = gettr.get_input_activations(layer, forward_loop)
+    for layer in transformer_layers:
+        layer_inputs = input_getter.get_input_activations(layer, forward_loop)

         # Define a forward loop for the current layer
-        def _layer_forward_loop(m):
-            for args, kwargs_input in inputs:  # noqa: F821
+        def _layer_forward_loop(m, _inputs=layer_inputs):
+            for args, kwargs_input in _inputs:
                 m(*args, **kwargs_input)

         # Call calibration function
         calib_func(layer, _layer_forward_loop, **calib_kwargs)
-        del inputs
+
+        del layer_inputs
         torch.cuda.empty_cache()
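
The rewritten loop appears intended to bind the cached activations into the per-layer forward loop through a default argument (`_inputs=layer_inputs`) instead of letting the closure look up the enclosing name `inputs`, which the old code deleted at the end of each iteration (hence the `# noqa: F821`). A minimal, self-contained sketch of the difference (toy code, not part of the commit):

# Toy illustration (not ModelOpt code): late-binding closures vs. default-argument binding.
def make_loops_late_binding(batches_per_layer):
    loops = []
    for batches in batches_per_layer:
        # Captures the *name* `batches`; every loop ends up seeing the last value.
        loops.append(lambda m: [m(b) for b in batches])
    return loops


def make_loops_bound_at_definition(batches_per_layer):
    loops = []
    for batches in batches_per_layer:
        # A default argument freezes the current value, like `_inputs=layer_inputs` above.
        loops.append(lambda m, _batches=batches: [m(b) for b in _batches])
    return loops


data = [[1, 2], [10, 20]]
late = make_loops_late_binding(data)
bound = make_loops_bound_at_definition(data)
print(late[0](lambda x: x))   # [10, 20] -- layer 0 silently reuses layer 1's batches
print(bound[0](lambda x: x))  # [1, 2]   -- layer 0 keeps its own batches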

modelopt/torch/quantization/plugins/huggingface.py

Lines changed: 58 additions & 1 deletion
@@ -56,7 +56,7 @@
 else:
     weight_dequant = None

-from ..utils import replace_function
+from ..utils import LayerActivationCollector, replace_function
 from .attention import register_attention_for_kv_quant
 from .custom import CUSTOM_MODEL_PLUGINS, _ParallelLinear, _QuantFunctionalMixin

@@ -1042,6 +1042,55 @@ def _is_supported_hf_model(model):
     return isinstance(model, tuple(supported_models))


+def is_homogenous_hf_model(model: nn.Module) -> bool:
+    decoder_layers = get_homogeneous_hf_decoder_layers(model)
+    if decoder_layers is None or len(decoder_layers) == 0:
+        return False
+    layer_classes = {type(layer) for layer in decoder_layers}
+    return len(layer_classes) == 1
+
+
+def get_homogeneous_hf_decoder_layers(model: nn.Module) -> nn.ModuleList | None:
+    if not _is_supported_hf_model(model):
+        return None
+
+    if hasattr(model, "model") and hasattr(model.model, "layers"):
+        return model.model.layers
+
+    return None
+
+
+def build_hf_homogenous_next_layer_inputs_hook(model: nn.Module):
+    def _extract_hidden_states(layer_output):
+        if isinstance(layer_output, tuple):
+            return layer_output[0]
+        if isinstance(layer_output, dict):
+            if "hidden_states" in layer_output:
+                return layer_output["hidden_states"]
+        return layer_output
+
+    def _build_next_layer_inputs_hook(prev_layer, cached_inputs):
+        next_inputs = []
+        for args, kwargs in cached_inputs:
+            prev_output = prev_layer(*args, **kwargs)
+            hidden_states = _extract_hidden_states(prev_output)
+            if len(args) >= 1:
+                next_args = (hidden_states, *args[1:])
+                next_kwargs = kwargs
+            elif "hidden_states" in kwargs:
+                next_args = args
+                next_kwargs = dict(kwargs)
+                next_kwargs["hidden_states"] = hidden_states
+            else:
+                raise ValueError(
+                    "Unable to build next-layer inputs without hidden_states in args/kwargs."
+                )
+            next_inputs.append((next_args, next_kwargs))
+        return next_inputs
+
+    return _build_next_layer_inputs_hook
+
+
 @contextmanager
 def setup_model_for_gradient_checkpointing(model: nn.Module):
     use_cache = None

@@ -1091,6 +1140,14 @@ def _is_param_grad_enabled_for_auto_quantize(pname, model):
     _is_param_grad_enabled_for_auto_quantize,
 )

+LayerActivationCollector.register_decoder_layer_support(
+    is_homogenous_hf_model, get_homogeneous_hf_decoder_layers
+)
+
+LayerActivationCollector.register_next_layer_input_support(
+    is_homogenous_hf_model, build_hf_homogenous_next_layer_inputs_hook
+)
+
 CUSTOM_MODEL_PLUGINS.update(
     [
         register_falcon_linears_on_the_fly,
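
With these registrations, support for a new model family no longer requires editing a central `get_decoder_layers` helper; a plugin registers a predicate plus a layer discoverer, and optionally a next-layer-inputs builder. A hedged sketch of how a hypothetical non-HF backend could hook in, reusing only the `register_*` APIs added in this commit (the `my_backend` helpers below are made up for illustration):

import torch.nn as nn

from modelopt.torch.quantization.utils import LayerActivationCollector


def is_my_backend_model(model: nn.Module) -> bool:
    # Hypothetical predicate, e.g. a Megatron-style layout exposing model.decoder.layers.
    return hasattr(model, "decoder") and hasattr(model.decoder, "layers")


def get_my_backend_decoder_layers(model: nn.Module) -> nn.ModuleList | None:
    return model.decoder.layers if is_my_backend_model(model) else None


# Registering only decoder-layer discovery is enough for sequential calibration;
# without a next-layer-inputs hook, inputs are re-collected with the full forward loop.
LayerActivationCollector.register_decoder_layer_support(
    is_my_backend_model, get_my_backend_decoder_layers
)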

modelopt/torch/quantization/utils.py

Lines changed: 59 additions & 1 deletion
@@ -824,8 +824,43 @@ class LayerActivationCollector:
     patching layers to capture inputs/outputs during forward passes
     """

+    _next_layer_input_support: list[tuple[Any, Any]] = []
+    _decoder_layer_support: list[tuple[Any, Any]] = []
+
     def __init__(self, model: nn.Module):
         self.model = model
+        self._previous_layer = None
+        self._previous_layer_inputs = None
+
+    @staticmethod
+    def get_decoder_layers(model: nn.Module) -> nn.ModuleList | None:
+        """Return decoder layers supported by sequential calibration."""
+        for is_supported, discoverer in LayerActivationCollector._decoder_layer_support:
+            if not is_supported(model):
+                continue
+            decoder_layers = discoverer(model)
+            if decoder_layers is not None:
+                return decoder_layers
+        return None
+
+    @staticmethod
+    def is_supported(model: nn.Module) -> bool:
+        """Whether the model supports decoder-layer sequential calibration."""
+        return LayerActivationCollector.get_decoder_layers(model) is not None
+
+    @classmethod
+    def register_next_layer_input_support(
+        cls, is_supported: Any, build_next_layer_inputs_hook: Any
+    ):
+        entry = (is_supported, build_next_layer_inputs_hook)
+        if entry not in cls._next_layer_input_support:
+            cls._next_layer_input_support.append(entry)
+
+    @classmethod
+    def register_decoder_layer_support(cls, is_supported: Any, discoverer: Any):
+        entry = (is_supported, discoverer)
+        if entry not in cls._decoder_layer_support:
+            cls._decoder_layer_support.append(entry)

     @staticmethod
     def _patch_and_initialize_layer(layer: torch.nn.Module, stop_after_collection: bool = False):

@@ -851,8 +886,15 @@ def _unpatch_and_cleanup_layer(layer: torch.nn.Module):
         if hasattr(layer, "inputs"):
             del layer.inputs

+    def _resolve_next_layer_inputs_hook(self):
+        for is_supported, build_next_layer_inputs_hook in self._next_layer_input_support:
+            if not is_supported(self.model):
+                continue
+            return build_next_layer_inputs_hook(self.model)
+        return None
+
     @torch.no_grad()
-    def get_input_activations(self, layer: torch.nn.Module, forward_loop: ForwardLoop) -> list:
+    def _collect_input_activations(self, layer: torch.nn.Module, forward_loop: ForwardLoop) -> list:
         # Wrap model forward to catch _EarlyStopForward per-batch
         def _early_stop_forward(self, *args, **kwargs):
             try:

@@ -870,3 +912,19 @@ def _early_stop_forward(self, *args, **kwargs):
         unpatch_forward_method(self.model, "_original_forward")

         return inputs
+
+    @torch.no_grad()
+    def get_input_activations(self, layer: torch.nn.Module, forward_loop: ForwardLoop) -> list:
+        is_first_layer = self._previous_layer is None or self._previous_layer_inputs is None
+        if is_first_layer:
+            inputs = self._collect_input_activations(layer, forward_loop)
+        else:
+            next_layer_inputs_hook = self._resolve_next_layer_inputs_hook()
+            if next_layer_inputs_hook is None:
+                inputs = self._collect_input_activations(layer, forward_loop)
+            else:
+                inputs = next_layer_inputs_hook(self._previous_layer, self._previous_layer_inputs)
+
+        self._previous_layer = layer
+        self._previous_layer_inputs = inputs
+        return inputs
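
The new `get_input_activations` caches the previous layer and its inputs, so after the first layer the collector can derive the next layer's inputs by replaying the previous layer instead of re-running the whole model under the forward loop. A small self-contained sketch of that chaining idea (toy modules, not the ModelOpt implementation):

import torch
import torch.nn as nn

# Toy stand-in for a homogeneous decoder stack.
layers = nn.ModuleList([nn.Linear(4, 4) for _ in range(3)])
calib_batches = [torch.randn(2, 4) for _ in range(2)]

# Inputs to layer 0 are simply the calibration batches
# (in ModelOpt this is what _collect_input_activations captures via forward patching).
cached_inputs = [((x,), {}) for x in calib_batches]

for layer in layers:
    # ... calibrate `layer` here, driving it with `cached_inputs` as its forward loop ...

    # Replay the current layer once to produce the next layer's inputs,
    # mirroring what a registered next-layer-inputs hook does.
    with torch.no_grad():
        cached_inputs = [((layer(*args, **kwargs),), {}) for args, kwargs in cached_inputs]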

modelopt/torch/utils/network.py

Lines changed: 0 additions & 29 deletions
@@ -634,32 +634,3 @@ def unpatch_forward_method(module: nn.Module, orig_forward_cache_name: str):
     with temporarily_remove_accelerate_hook(module):
         setattr(module, "forward", getattr(module, orig_forward_cache_name))
     delattr(module, orig_forward_cache_name)
-
-
-def get_decoder_layers(model: nn.Module, granularity: str = "decoder") -> nn.ModuleList | None:
-    """Detect the decoder layers from a model for sequential calibration."""
-    if granularity != "decoder":
-        raise ValueError(f"Unsupported granularity: {granularity}. Only 'decoder' is supported.")
-
-    # HuggingFace transformers pattern: model.model.layers
-    if hasattr(model, "model") and hasattr(model.model, "layers"):
-        return model.model.layers
-
-    # Megatron/MCore pattern: model.decoder.layers
-    if hasattr(model, "decoder") and hasattr(model.decoder, "layers"):
-        return model.decoder.layers
-
-    # Direct layers attribute (some models)
-    if hasattr(model, "layers") and isinstance(model.layers, nn.ModuleList):
-        return model.layers
-
-    # GPT-style: model.transformer.h
-    if hasattr(model, "transformer") and hasattr(model.transformer, "h"):
-        return model.transformer.h
-
-    # Nemotron Super/Nano
-    if hasattr(model, "backbone") and hasattr(model.backbone, "layers"):
-        return model.backbone.layers
-
-    print("No decoder layers found for model, returning None")
-    return None

tests/unit/torch/quantization/plugins/test_huggingface.py

Lines changed: 27 additions & 0 deletions
@@ -23,13 +23,19 @@
 from _test_utils.torch.misc import set_seed
 from _test_utils.torch.transformers_models import (
     create_tiny_llama_dir,
+    get_tiny_gpt_oss,
     get_tiny_llama,
     get_tiny_qwen3_moe,
     tf_modelopt_state_and_output_tester,
 )

 import modelopt.torch.quantization as mtq
 from modelopt.torch.quantization.nn import QuantLinear, QuantModuleRegistry
+from modelopt.torch.quantization.plugins.huggingface import (
+    get_homogeneous_hf_decoder_layers,
+    is_homogenous_hf_model,
+)
+from modelopt.torch.quantization.utils import LayerActivationCollector

 pytest.importorskip("transformers")

@@ -199,3 +205,24 @@ def test_quantized_transformers_save_restore(tmp_path, model_cls, quant_config):

     model_test = model_cls.from_pretrained(tiny_llama_dir / "modelopt_model")
     tf_modelopt_state_and_output_tester(model_ref, model_test)
+
+
+def test_is_homogenous_hf_model_llama():
+    model = get_tiny_llama()
+    assert is_homogenous_hf_model(model)
+
+
+def test_is_homogenous_hf_model_gpt_oss():
+    model = get_tiny_gpt_oss(num_hidden_layers=1)
+    assert is_homogenous_hf_model(model)
+
+
+def test_hf_decoder_discoverer_registration_path():
+    model = get_tiny_llama()
+    assert any(
+        is_supported is is_homogenous_hf_model and discoverer is get_homogeneous_hf_decoder_layers
+        for is_supported, discoverer in LayerActivationCollector._decoder_layer_support
+    )
+    assert LayerActivationCollector.get_decoder_layers(model) is get_homogeneous_hf_decoder_layers(
+        model
+    )
