Inline local-Hessian activation capture; drop the QuantModule hook API

Fridah-nv · Fridah-nv · commit acc6aa6149ea · 2026-06-05T22:34:04.000Z
Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
@@ -501,10 +501,10 @@ def _make_weight_mse_calibrator(
         )
         if backend is not None and backend_factory is not None:
             if error_func is not None:
-                # Registered backends can't take a custom error_func; skip Hessian refinement.
+                # Registered backend factories don't accept a custom error_func.
                 warnings.warn(
-                    f"local_hessian: backend '{backend}' does not support a custom error "
-                    "function; skipping Hessian-weighted calibration for this quantizer."
+                    f"backend '{backend}' does not support a custom error function; skipping "
+                    "error-function-weighted MSE calibration for this quantizer."
                 )
                 return None
             return backend_factory(initial_amax, axis, quant_func)
@@ -706,6 +706,80 @@ def _warn_local_hessian_fallback(name, weight, weight_quantizer, block_size, war
     _warn_if_block_size_mismatch(weight_quantizer, block_size, name)
 
 
+def _is_quant_fused_experts(module: nn.Module) -> bool:
+    """Whether ``module`` is a converted HF fused-MoE-experts wrapper with per-expert quantizers."""
+    return hasattr(module, "_current_expert_idx") and hasattr(
+        module, "gate_up_proj_weight_quantizers"
+    )
+
+
+def _register_local_hessian_input_hooks(model, name_to_module, capture, block_size, warned):
+    """Register forward hooks feeding each weight's input activations to ``capture``.
+
+    Local-Hessian-specific (kept here rather than as a general ``QuantModule`` API): dense
+    quantized linears hook the layer input; HF fused-MoE experts hook the shared input quantizers,
+    keyed by the active expert (``_current_expert_idx``). Weights without a hook (conv,
+    SequentialQuantizer, non-eager experts) fall back to plain MSE. Returns removable handles.
+    """
+    handles: list = []
+
+    def _make_expert_hook(expert_module, weight_name, quantizers, enabled):
+        def _expert_hook(_input_quantizer, args):
+            if not args:
+                return
+            idx = expert_module._current_expert_idx
+            if idx in enabled:
+                # Read the weight fresh (valid under accelerate/FSDP re-materialization).
+                capture(quantizers[idx], getattr(expert_module, weight_name)[idx], args[0])
+
+        return _expert_hook
+
+    for name, module in name_to_module.items():
+        if is_quantized_linear(module) and isinstance(module.weight_quantizer, TensorQuantizer):
+            with enable_weight_access_and_writeback(module, model, name_to_module):
+                # ``weight`` may be absent (e.g. TE GroupedLinear exposes weight0..N, not weight);
+                # such modules have no single 2-D weight to pair and fall back to plain MSE.
+                weight = getattr(module, "weight", None)
+                if weight is None or weight.dim() != 2 or not module.weight_quantizer.is_enabled:
+                    continue
+                _warn_local_hessian_fallback(
+                    name, weight, module.weight_quantizer, block_size, warned
+                )
+
+            def _dense_hook(linear, args):
+                if args:
+                    capture(linear.weight_quantizer, linear.weight, args[0])
+
+            handles.append(module.register_forward_pre_hook(_dense_hook))
+        elif _is_quant_fused_experts(module):
+            with enable_weight_access_and_writeback(module, model, name_to_module):
+                for weight_name, quantizers_name, input_q_name in (
+                    (
+                        "gate_up_proj",
+                        "gate_up_proj_weight_quantizers",
+                        "gate_up_proj_input_quantizer",
+                    ),
+                    ("down_proj", "down_proj_weight_quantizers", "down_proj_input_quantizer"),
+                ):
+                    weight = getattr(module, weight_name, None)
+                    quantizers = getattr(module, quantizers_name, None)
+                    input_quantizer = getattr(module, input_q_name, None)
+                    if weight is None or quantizers is None or input_quantizer is None:
+                        continue
+                    _warn_local_hessian_fallback(
+                        f"{name}.{weight_name}", weight[0], quantizers[0], block_size, warned
+                    )
+                    # Snapshot which experts are enabled now, before the caching forward silences
+                    # all weight quantizers — so we don't capture (and discard) disabled experts.
+                    enabled = {i for i, q in enumerate(quantizers) if q.is_enabled}
+                    handles.append(
+                        input_quantizer.register_forward_pre_hook(
+                            _make_expert_hook(module, weight_name, quantizers, enabled)
+                        )
+                    )
+    return handles
+
+
 @torch.no_grad()
 def local_hessian_calibrate(
     model: nn.Module,
@@ -767,53 +841,19 @@ def capture(weight_quantizer, weight, input_tensor):
             accumulators[id(weight_quantizer)] = acc
         acc.accumulate(input_local)
 
-    # Phase 2: register capture hooks, disable weight fake-quant (input quantizers left as-is,
-    # matching prior behavior), run one forward to accumulate Hessians. Hooks live only for it.
-    handles: list = []
-    silenced_weight_quantizers: list[TensorQuantizer] = []
+    # Phase 2: capture each weight's input activations during a forward with weight fake-quant
+    # disabled (so H = ΣXᵀX reflects full-precision weights); input quantizers are left as-is.
     warned: set = set()
-    seen_modules: set[int] = set()
-    for name, module in name_to_module.items():
-        if not isinstance(module, QuantModule) or id(module) in seen_modules:
-            continue
-        seen_modules.add(id(module))
-        with enable_weight_access_and_writeback(module, model, name_to_module):
-            captures = module.register_calibration_input_hooks(capture)
-            handles.extend(captures)
-            for weight, weight_quantizer in module.iter_weights_for_calibration():
-                # Silence weight fake-quant (incl. SequentialQuantizer leaves) so the capture
-                # forward uses full-precision weights and downstream Hessians aren't corrupted.
-                leaves = (
-                    list(weight_quantizer)
-                    if isinstance(weight_quantizer, SequentialQuantizer)
-                    else [weight_quantizer]
-                )
-                silenced_weight_quantizers.extend(
-                    q
-                    for q in leaves
-                    if isinstance(q, TensorQuantizer) and q.is_enabled and q._if_quant
-                )
-                # Only TensorQuantizer weights are refined (same as mse_calibrate); other types
-                # (e.g. SequentialQuantizer) are unsupported and left at their max-cal scale.
-                if not isinstance(weight_quantizer, TensorQuantizer):
-                    if weight_quantizer.is_enabled and "unsupported" not in warned:
-                        warned.add("unsupported")
-                        warn_rank_0(
-                            "local_hessian: only TensorQuantizer weights are calibrated; other "
-                            "types (e.g. SequentialQuantizer) stay at their max-calibrated scale."
-                        )
-                    continue
-                if captures:
-                    _warn_local_hessian_fallback(name, weight, weight_quantizer, block_size, warned)
-
-    for weight_quantizer in silenced_weight_quantizers:
-        weight_quantizer.disable_quant()
+    handles = _register_local_hessian_input_hooks(
+        model, name_to_module, capture, block_size, warned
+    )
     print_rank_0("local_hessian: Caching activations and computing local Hessian...")
     try:
-        forward_loop(model)
+        with set_quantizer_by_cfg_context(
+            model, [{"quantizer_name": "*weight_quantizer", "enable": False}]
+        ):
+            forward_loop(model)
     finally:
-        for weight_quantizer in silenced_weight_quantizers:
-            weight_quantizer.enable_quant()
         for handle in handles:
             handle.remove()
 
diff --git a/modelopt/torch/quantization/nn/modules/quant_module.py b/modelopt/torch/quantization/nn/modules/quant_module.py
@@ -17,7 +17,6 @@
 
 import contextlib
 import warnings
-from collections.abc import Callable
 from typing import Any
 
 import torch
@@ -128,17 +127,6 @@ def iter_weights_for_calibration(self):
             weight_quantizer = getattr(self, quantizer_attr_names(weight_name).weight_quantizer)
             yield getattr(self, weight_name), weight_quantizer
 
-    def register_calibration_input_hooks(
-        self, callback: Callable[[TensorQuantizer, torch.Tensor, torch.Tensor], None]
-    ) -> list:
-        """Register forward hooks calling ``callback(weight_quantizer, weight, input)`` per weight.
-
-        Activation-side counterpart to :meth:`iter_weights_for_calibration`, used by
-        activation-aware calibration (e.g. local-Hessian). Returns removable handles; the base
-        default is ``[]`` (no pairing available -> plain weight calibration). Override per module.
-        """
-        return []
-
     def fold_weight(self, keep_attrs: bool = False):
         """Fold the weight for faster eval."""
         # Handle all attributes that end with _weight_quantizer
@@ -259,27 +247,6 @@ def _setup(self):
         self._register_temp_attribute("_enable_weight_quantization", False)
         self._register_dynamic_attribute("weight", self._get_quantized_weight)
 
-    def register_calibration_input_hooks(self, callback):
-        """Pair the weight quantizer with the forward input.
-
-        Only a 2-D weight with an enabled ``TensorQuantizer`` is hooked; conv (4-D) and
-        ``SequentialQuantizer`` weights are unsupported and fall back to plain calibration.
-        """
-        weight = getattr(self, "weight", None)
-        if (
-            weight is None
-            or weight.dim() != 2
-            or not isinstance(self.weight_quantizer, TensorQuantizer)
-            or not self.weight_quantizer.is_enabled
-        ):
-            return []
-
-        def _pre_hook(module, args):
-            if args:
-                callback(module.weight_quantizer, module.weight, args[0])
-
-        return [self.register_forward_pre_hook(_pre_hook)]
-
 
 class _LegacyQuantInputBaseMixin:
     """A mixin to support legacy quantized modules which needs to have an __init__ method."""
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
@@ -918,36 +918,6 @@ def iter_weights_for_calibration(self):
             for idx, q in enumerate(quantizers):
                 yield weight[idx], q
 
-    def register_calibration_input_hooks(self, callback):
-        """Pair each per-expert weight quantizer with its routed input activation.
-
-        Hooks the shared input quantizers, which the eager ``F.linear`` path calls per expert
-        while ``_current_expert_idx`` is set. Batched/grouped kernels never call them, so those
-        experts get no capture (fall back to plain weight calibration).
-        """
-        handles = []
-        for weight_name, quantizers_name, input_quantizer_name in (
-            ("gate_up_proj", "gate_up_proj_weight_quantizers", "gate_up_proj_input_quantizer"),
-            ("down_proj", "down_proj_weight_quantizers", "down_proj_input_quantizer"),
-        ):
-            weight = getattr(self, weight_name, None)
-            quantizers = getattr(self, quantizers_name, None)
-            input_quantizer = getattr(self, input_quantizer_name, None)
-            if weight is None or quantizers is None or input_quantizer is None:
-                continue
-
-            def _pre_hook(_iq, args, _weight_name=weight_name, _quantizers=quantizers):
-                if not args:
-                    return
-                idx = self._current_expert_idx
-                weight_quantizer = _quantizers[idx]
-                if weight_quantizer.is_enabled:
-                    # Read the weight fresh (valid under accelerate/FSDP re-materialization).
-                    callback(weight_quantizer, getattr(self, _weight_name)[idx], args[0])
-
-            handles.append(input_quantizer.register_forward_pre_hook(_pre_hook))
-        return handles
-
     def fold_weight(self, keep_attrs: bool = False):
         """Fold per-expert weight quantizers into the fused 3-D weights.
 
diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py
@@ -658,9 +658,8 @@ def forward_loop(m):
 
         self._cleanup_registry(expert_type)
 
-    def test_local_hessian_per_expert_capture_and_refinement(self):
-        """The plugin's extension point pairs each per-expert weight quantizer with its routed
-        input, and local_hessian uses that to refine every expert's weight amax."""
+    def test_local_hessian_refines_per_expert_weights(self):
+        """local_hessian captures each expert's routed activations and refines its weight amax."""
         model = _TinyMoEModel()
         expert_type = type(model.moe.experts)
         self._cleanup_registry(expert_type)
@@ -685,28 +684,25 @@ def forward_loop(m):
         expert_quantizers = list(experts.gate_up_proj_weight_quantizers) + list(
             experts.down_proj_weight_quantizers
         )
-
-        # Extension point captures per-expert (weight_quantizer, weight_slice, cin).
-        captured = []
-        handles = experts.register_calibration_input_hooks(
-            lambda wq, w, x: captured.append((id(wq), tuple(w.shape), x.shape[-1]))
-        )
-        assert len(handles) == 2  # one pre-hook per shared input quantizer (gate_up, down)
-        with torch.no_grad():
-            model(torch.randn(1, 8, HIDDEN_DIM))
-        for h in handles:
-            h.remove()
-        valid_ids = {id(q) for q in expert_quantizers}
-        shapes = {(2 * INTERMEDIATE_DIM, HIDDEN_DIM), (HIDDEN_DIM, INTERMEDIATE_DIM)}
-        assert captured and all(
-            wq_id in valid_ids and shape in shapes and cin == shape[1]
-            for wq_id, shape, cin in captured
-        )
-
-        # End-to-end: local_hessian refines per-expert weight amax via that capture.
         max_amax = {id(q): q.amax.clone() for q in expert_quantizers if q.amax is not None}
+        # Expected (cout, cin) keyed by quantizer id, to verify each Hessian pairs with its
+        # own expert's weight slice (catches gate_up/down swaps and stale-index mis-pairing).
+        expected_shape = {}
+        for quantizers, weight in (
+            (experts.gate_up_proj_weight_quantizers, experts.gate_up_proj),
+            (experts.down_proj_weight_quantizers, experts.down_proj),
+        ):
+            for i, q in enumerate(quantizers):
+                expected_shape[id(q)] = (weight[i].shape[0], weight[i].shape[1])
+
         local_hessian_calibrate(model, forward_loop, fp8_scale_sweep=False, debug=True)
-        assert any(a.num_samples > 0 for a in model._local_hessian_accumulators.values())
+
+        # Each captured Hessian is keyed to a real per-expert quantizer with the matching weight
+        # shape, spans multiple distinct experts, and the refinement moved at least one amax.
+        routed = {qid: a for qid, a in model._local_hessian_accumulators.items() if a.num_samples}
+        assert len(routed) >= 2, "expected multiple distinct experts to capture Hessians"
+        for qid, acc in routed.items():
+            assert (acc.cout, acc.cin) == expected_shape[qid]
         assert all(q.amax is not None and torch.isfinite(q.amax).all() for q in expert_quantizers)
         assert any(
             id(q) in max_amax and not torch.allclose(q.amax, max_amax[id(q)])
diff --git a/tests/unit/torch/quantization/test_local_hessian.py b/tests/unit/torch/quantization/test_local_hessian.py
@@ -158,36 +158,30 @@ def test_no_forward_loop_is_skipped(self):
         assert all(torch.equal(before[n], a) for n, a in _weight_amaxes(model).items())
 
 
-class TestActivationCaptureExtensionPoint:
-    """The extension point that decouples local-Hessian capture from module type."""
+class TestLocalHessianFallbacks:
+    """Weights local-Hessian can't pair with an input fall back to plain MSE (no Hessian)."""
 
-    def test_dense_captures_and_conv_falls_back(self):
+    def test_conv_weight_falls_back_without_crash(self):
         torch.manual_seed(0)
-        model = SimpleLinear()
-        mtq.quantize(model, INT8_WEIGHT_CFG, forward_loop=_make_forward_loop())
-        captured = []
-        handles = model.net[0].register_calibration_input_hooks(
-            lambda wq, w, x: captured.append((tuple(w.shape), x.shape[-1]))
-        )
-        assert len(handles) == 1
-        with torch.no_grad():
-            model(torch.randn(2, 16))
-        for h in handles:
-            h.remove()
-        assert captured and captured[0] == ((32, 16), 16)  # cin from activation matches weight
-
-        conv = SimpleConv()
-        mtq.quantize(conv, INT8_WEIGHT_CFG, forward_loop=lambda m: m(SimpleConv.get_input()))
-        assert conv.net[0].register_calibration_input_hooks(lambda *a: None) == []  # 4-D weight
-
-    def test_sequential_quantizer_weight_not_hooked(self):
+        model = SimpleConv()  # 4-D conv weights — no single 2-D weight to pair
+        forward_loop = lambda m: m(SimpleConv.get_input())  # noqa: E731
+        mtq.quantize(model, INT8_WEIGHT_CFG, forward_loop=forward_loop)
+        local_hessian_calibrate(model, forward_loop, fp8_scale_sweep=False, debug=True)
+        conv = model.net[0]
+        assert id(conv.weight_quantizer) not in model._local_hessian_accumulators
+        assert conv.weight_quantizer.amax is not None  # still calibrated via plain MSE
+
+    def test_sequential_quantizer_weight_falls_back_without_crash(self):
         torch.manual_seed(0)
         model = SimpleLinear()
         mtq.quantize(model, INT8_WEIGHT_CFG, forward_loop=_make_forward_loop())
         linear = model.net[0]
         linear.weight_quantizer = SequentialQuantizer(TensorQuantizer(), TensorQuantizer())
-        assert linear.register_calibration_input_hooks(lambda *a: None) == []  # unsupported
+        local_hessian_calibrate(model, _make_forward_loop(), fp8_scale_sweep=False, debug=True)
+        assert id(linear.weight_quantizer) not in model._local_hessian_accumulators
+
 
+class TestBlockSizeMismatchWarning:
     def test_block_size_mismatch_warns_only_on_mismatch(self):
         def q(block):
             return TensorQuantizer(