Skip to content

Commit 08a7392

Browse files
committed
integration
Signed-off-by: Shiyang Chen <shiychen@nvidia.com>
1 parent d2c32fc commit 08a7392

2 files changed

Lines changed: 170 additions & 106 deletions

File tree

modelopt/torch/quantization/utils/calib_utils.py

Lines changed: 153 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,42 @@ def update_hessian(input, hessian, n_samples):
7474
return hessian, n_samples
7575

7676

77+
def compute_hessian_inverse(hessian, weight, perc_damp):
    """Compute damped upper-Cholesky inverse Hessian.

    Dead-neuron columns (all-zero in ``weight``) are zeroed in the
    Hessian before inversion, matching the FP-Quant reference:
    https://github.com/IST-DASLab/FP-Quant/blob/d2e3092f968262c4de5fb050e1aef568a280dadd/src/quantization/gptq.py#L200

    Args:
        hessian: Hessian matrix ``[in_features, in_features]``.
        weight: Weight matrix ``[out_features, in_features]`` for dead-neuron detection.
        perc_damp: Percentage of average Hessian diagonal for damping.

    Returns:
        Upper-triangular Cholesky factor of the damped inverse Hessian
        ``[in_features, in_features]``. Falls back to the identity matrix
        when the Hessian is not positive definite.
    """
    damped = hessian.clone()

    # Neutralize dead neurons: zero their rows and columns, then pin their
    # diagonal entries to 1 so the matrix stays invertible.
    dead_mask = weight.eq(0).all(dim=0)
    damped[dead_mask, :] = 0
    damped[:, dead_mask] = 0
    damped[dead_mask, dead_mask] = 1

    # Add a fraction of the mean diagonal to every diagonal entry for
    # numerical stability before the Cholesky factorization.
    dim = damped.shape[0]
    diag_idx = torch.arange(dim, device=damped.device)
    damped[diag_idx, diag_idx] += perc_damp * torch.mean(torch.diag(damped))

    try:
        h_inverse = torch.cholesky_inverse(torch.linalg.cholesky(damped))
        return torch.linalg.cholesky(h_inverse, upper=True)
    except (RuntimeError, torch.linalg.LinAlgError):
        print_rank_0("Warning: Hessian is not positive definite, using identity matrix")
        return torch.eye(dim, device=damped.device, dtype=damped.dtype)
111+
112+
77113
class GPTQHelper:
78114
"""Encapsulates per-module GPTQ state and operations.
79115
@@ -154,38 +190,14 @@ def update_weights(self, block_size, perc_damp):
154190
# ------------------------------------------------------------------
155191

156192
def _prepare_hessian_inverse(self, hessian, perc_damp):
157-
"""Compute damped inverse Hessian and store as ``self.h_inv``.
158-
159-
Dead-neuron columns (all-zero in ``self.weight``) are zeroed in the
160-
Hessian before inversion, matching the FP-Quant reference:
161-
https://github.com/IST-DASLab/FP-Quant/blob/d2e3092f968262c4de5fb050e1aef568a280dadd/src/quantization/gptq.py#L200
162-
"""
193+
"""Compute damped inverse Hessian and store as ``self.h_inv``."""
163194
assert self.weight is not None, "_prepare_hessian_inverse called before update_weights()"
164-
h = hessian.clone()
165-
zero_cols = torch.nonzero(self.weight.eq(0).all(dim=0)).unsqueeze(-1)
166-
167-
h[zero_cols, :] = 0
168-
h[:, zero_cols] = 0
169-
h[zero_cols, zero_cols] = 1
170-
171-
damp = perc_damp * torch.mean(torch.diag(h))
172-
diag_indices = torch.arange(h.shape[0], device=h.device)
173-
h[diag_indices, diag_indices] += damp
174-
175-
try:
176-
h = torch.cholesky_inverse(torch.linalg.cholesky(h))
177-
self.h_inv = torch.linalg.cholesky(h, upper=True)
178-
except (RuntimeError, torch.linalg.LinAlgError):
179-
print_rank_0("Warning: Hessian is not positive definite, using identity matrix")
180-
self.h_inv = torch.eye(h.shape[0], device=h.device, dtype=h.dtype)
195+
self.h_inv = compute_hessian_inverse(hessian, self.weight, perc_damp)
181196

182197
def _blockwise_update(self, block_size):
183198
"""Column-wise GPTQ update using full-matrix QDQ.
184199
185-
For each column, quantizes the full weight matrix via the quantizer and
186-
extracts the quantized column. This is the standard GPTQ approach.
187-
188-
Reads/writes ``self.weight`` and ``self.h_inv`` in-place.
200+
Delegates to :func:`gptq_blockwise_update` with the module's weight quantizer.
189201
"""
190202
assert self.weight is not None and self.h_inv is not None, (
191203
"_blockwise_update called before _prepare_hessian_inverse()"
@@ -199,28 +211,7 @@ def _blockwise_update(self, block_size):
199211
f"GPTQ block_size ({block_size}) must be divisible by the quantizer"
200212
f" group_size ({group_size})"
201213
)
202-
num_cols = self.weight.shape[1]
203-
204-
for block_start in range(0, num_cols, block_size):
205-
block_end = min(block_start + block_size, num_cols)
206-
n_cols_blk = block_end - block_start
207-
h_inv_cho_blk = self.h_inv[block_start:block_end, block_start:block_end]
208-
209-
wblk = self.weight.clone()
210-
errs = torch.zeros_like(wblk[:, block_start:block_end])
211-
212-
for i in range(n_cols_blk):
213-
w_ci = wblk[:, block_start + i]
214-
d = h_inv_cho_blk[i, i]
215-
qdq = quantizer(wblk)
216-
self.weight[:, block_start + i] = qdq[:, block_start + i]
217-
err = (w_ci - qdq[:, block_start + i]) / d
218-
wblk[:, block_start + i : block_end].addr_(err, h_inv_cho_blk[i, i:], alpha=-1)
219-
errs[:, i] = err
220-
221-
self.weight[:, block_end:].addmm_(
222-
errs, self.h_inv[block_start:block_end, block_end:], alpha=-1
223-
)
214+
gptq_blockwise_update(self.weight, self.h_inv, block_size, quantizer)
224215

225216
def _print_mse_error(self, hessian):
226217
"""Log Hessian-weighted relative MSE between ``self.weight`` and original weights."""
@@ -231,6 +222,115 @@ def _print_mse_error(self, hessian):
231222
print_rank_0(f"[{self.name}] Relative MSE error: {mse.item():.2e}{suffix}")
232223

233224

225+
def gptq_blockwise_update(weight, h_inv, block_size, quantize_fn):
    """Column-wise GPTQ update using full-matrix fake quantization.

    For each column, quantizes the full weight matrix via ``quantize_fn`` and
    extracts the quantized column. Error is propagated to remaining columns
    within the block and then to all subsequent columns via the inverse Hessian.

    Args:
        weight: Weight tensor ``[out_features, in_features]``, modified **in-place**
            with fake-quantized values.
        h_inv: Upper-triangular Cholesky factor of the damped inverse Hessian
            ``[in_features, in_features]``.
        block_size: Number of columns to process per GPTQ block.
        quantize_fn: Callable ``(weight) -> qdq_weight`` that fake-quantizes
            the full weight matrix.
    """
    total_cols = weight.shape[1]

    for start in range(0, total_cols, block_size):
        end = min(start + block_size, total_cols)
        h_blk = h_inv[start:end, start:end]

        # Work on a scratch copy so intra-block error feedback does not
        # disturb the columns of ``weight`` already committed.
        scratch = weight.clone()
        block_errs = torch.zeros_like(weight[:, start:end])

        for offset in range(end - start):
            col = start + offset
            pre_quant = scratch[:, col]
            qdq_full = quantize_fn(scratch)
            weight[:, col] = qdq_full[:, col]
            # Quantization error normalized by the Cholesky diagonal.
            col_err = (pre_quant - qdq_full[:, col]) / h_blk[offset, offset]
            # Feed the error into the not-yet-quantized columns of this block.
            scratch[:, col:end] -= torch.outer(col_err, h_blk[offset, offset:])
            block_errs[:, offset] = col_err

        # Propagate the accumulated block error to all trailing columns.
        weight[:, end:].addmm_(block_errs, h_inv[start:end, end:], alpha=-1)
261+
262+
263+
def gptq_blockwise_update_fused_scalar(weight, scales_2d, h_inv, block_size, quant_block_size):
    """Fused GPTQ blockwise update for NVFP4 scalar quantization.

    Uses a fused Triton kernel that combines quantization and per-column
    error propagation into one launch per GPTQ block, avoiding the
    Python-level per-column loop in :func:`gptq_blockwise_update`.

    Args:
        weight: Weight tensor ``[out_features, in_features]``, modified **in-place**
            with fake-quantized values.
        scales_2d: Pre-computed per-block scales ``[out_features, n_scale_blocks]``.
        h_inv: Upper-triangular Cholesky factor of the damped inverse Hessian
            ``[in_features, in_features]``.
        block_size: Number of columns to process per GPTQ block.
        quant_block_size: Number of elements sharing one quantization scale factor.
    """
    from modelopt.torch.quantization.triton.gptq_fused_kernel import gptq_fused_block_scalar

    total_cols = weight.shape[1]
    for start in range(0, total_cols, block_size):
        end = min(start + block_size, total_cols)
        # One kernel launch quantizes the block and returns the per-column
        # error needed for trailing-column propagation.
        quantized_blk, blk_err = gptq_fused_block_scalar(
            weight[:, start:end].clone().contiguous(),
            scales_2d,
            h_inv[start:end, start:end].contiguous(),
            quant_block_size,
            start,
        )
        weight[:, start:end] = quantized_blk
        if end < total_cols:
            weight[:, end:].addmm_(blk_err, h_inv[start:end, end:], alpha=-1)
294+
295+
296+
class FusedScalarGPTQHelper(GPTQHelper):
    """GPTQHelper using the fused Triton kernel for NVFP4 scalar quantization.

    Overrides :meth:`_blockwise_update` to extract pre-computed scales from the
    ``NVFP4StaticQuantizer`` and delegate to :func:`gptq_blockwise_update_fused_scalar`.
    """

    def _blockwise_update(self, block_size):
        """Fused GPTQ using Triton kernel for NVFP4 scalar quantization.

        Raises:
            ValueError: If the quantizer exposes no per-block group size, or
                if ``block_size`` / ``in_features`` is not divisible by it.
        """
        assert self.weight is not None and self.h_inv is not None, (
            "_blockwise_update called before _prepare_hessian_inverse()"
        )
        from modelopt.torch.quantization.triton.fp4_kernel import compute_fp4_scales

        quantizer = self.module.weight_quantizer
        block_sizes = getattr(quantizer, "block_sizes", None)
        quant_block_size = None
        if block_sizes is not None:
            quant_block_size = block_sizes.get(-1) or block_sizes.get(1)

        # The fused scalar kernel needs per-block scales. Without a group size
        # the integer division below would raise an opaque TypeError, so fail
        # loudly here instead.
        if quant_block_size is None:
            raise ValueError(
                "FusedScalarGPTQHelper requires a quantizer with per-block"
                " scaling (block_sizes[-1] or block_sizes[1] must be set)"
            )
        if block_size % quant_block_size != 0:
            raise ValueError(
                f"GPTQ block_size ({block_size}) must be divisible by the quantizer"
                f" group_size ({quant_block_size})"
            )

        out_features, num_cols = self.weight.shape
        # Guard the reshape below with an explicit message rather than letting
        # a shape-mismatch RuntimeError surface from ``reshape``.
        if num_cols % quant_block_size != 0:
            raise ValueError(
                f"in_features ({num_cols}) must be divisible by the quantizer"
                f" group_size ({quant_block_size})"
            )
        n_blocks = num_cols // quant_block_size

        # Pre-compute scales from the calibrated amax (frozen during GPTQ).
        amax = quantizer.amax.reshape(out_features, n_blocks)
        scales_2d = compute_fp4_scales(amax, quantizer.global_amax, quantize_block_scales=True)

        gptq_blockwise_update_fused_scalar(
            self.weight, scales_2d, self.h_inv, block_size, quant_block_size
        )
332+
333+
234334
_GPTQ_HELPER_REGISTRY: dict[str, type[GPTQHelper]] = {}
235335

236336

@@ -242,3 +342,7 @@ def register_gptq_helper(backend: str, factory: type[GPTQHelper]) -> None:
242342
construct ``factory`` instead of the default ``GPTQHelper``.
243343
"""
244344
_GPTQ_HELPER_REGISTRY[backend] = factory
345+
346+
347+
# Built-in registrations: make the fused NVFP4 helper selectable by backend name.
register_gptq_helper("fused_gptq_nvfp4", FusedScalarGPTQHelper)

tests/gpu/torch/quantization/test_gptq.py

Lines changed: 17 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,12 @@
2525
from modelopt.torch.export.unified_export_hf import _export_quantized_weight
2626
from modelopt.torch.quantization.model_calib import gptq
2727
from modelopt.torch.quantization.qtensor.nvfp4_tensor import NVFP4QTensor
28-
from modelopt.torch.quantization.utils.calib_utils import update_hessian
28+
from modelopt.torch.quantization.utils.calib_utils import (
29+
compute_hessian_inverse,
30+
gptq_blockwise_update,
31+
gptq_blockwise_update_fused_scalar,
32+
update_hessian,
33+
)
2934
from modelopt.torch.utils.dataset_utils import create_forward_loop, get_dataset_dataloader
3035

3136
RAND_SEED = 42
@@ -240,21 +245,6 @@ def test_gptq_e2e_flow(quant_cfg):
240245
# ---------------------------------------------------------------------------
241246

242247

243-
# TODO(shiychen): This should be extracted out from production code path
244-
def _compute_h_inv(hessian, weight, percdamp=0.01):
245-
"""Compute damped upper-Cholesky inverse Hessian."""
246-
h = hessian.clone()
247-
zero_cols = torch.nonzero(weight.eq(0).all(dim=0)).unsqueeze(-1)
248-
h[zero_cols, :] = 0
249-
h[:, zero_cols] = 0
250-
h[zero_cols, zero_cols] = 1
251-
damp = percdamp * torch.mean(torch.diag(h))
252-
diag_idx = torch.arange(h.shape[0], device=h.device)
253-
h[diag_idx, diag_idx] += damp
254-
h = torch.cholesky_inverse(torch.linalg.cholesky(h))
255-
return torch.linalg.cholesky(h, upper=True)
256-
257-
258248
def _make_nvfp4_test_data(quant_block_size, out_features, dim):
259249
"""Create weight, h_inv, and scales_2d for NVFP4 GPTQ tests."""
260250
from modelopt.torch.quantization.triton.fp4_kernel import compute_fp4_scales
@@ -268,14 +258,13 @@ def _make_nvfp4_test_data(quant_block_size, out_features, dim):
268258
hessian = torch.zeros(dim, dim, dtype=torch.float32)
269259
hessian, _ = update_hessian(inp, hessian, 0)
270260
hessian = hessian.to("cuda")
271-
h_inv = _compute_h_inv(hessian, weight)
261+
h_inv = compute_hessian_inverse(hessian, weight, perc_damp=0.01)
272262

273263
return weight, scales_2d, h_inv
274264

275265

276-
# TODO(shiychen): This should be extracted out from production code path
277266
def _run_unfused_gptq_nvfp4(weight, scales_2d, h_inv, gptq_block_size, quant_block_size):
278-
"""Unfused NVFP4 GPTQ using the production Triton FP4 kernel per column.
267+
"""Unfused NVFP4 GPTQ using the production blockwise update with Triton FP4 kernel.
279268
280269
Both fused and unfused use the same frozen pre-computed scales so the
281270
test verifies the fused kernel's correctness (not scale computation).
@@ -285,52 +274,23 @@ def _run_unfused_gptq_nvfp4(weight, scales_2d, h_inv, gptq_block_size, quant_blo
285274
out_features, num_cols = weight.shape
286275
n_blocks = num_cols // quant_block_size
287276
w = weight.float().clone()
288-
q = torch.zeros_like(w)
289277
# Recover amax from scales (scales = amax / 6.0, already FP8-quantized)
290278
amax_flat = (scales_2d * 6.0).reshape(out_features * n_blocks)
291279

292-
for i in range(0, num_cols, gptq_block_size):
293-
j_end = min(i + gptq_block_size, num_cols)
294-
e = torch.zeros(out_features, j_end - i, dtype=w.dtype, device=w.device)
295-
296-
for j in range(i, j_end):
297-
# Quantize full weight using production Triton FP4 kernel
298-
w_blocked = w.reshape(out_features * n_blocks, quant_block_size)
299-
qdq = static_blockwise_fp4_fake_quant(
300-
w_blocked,
301-
amax_flat,
302-
quantize_block_scales=False,
303-
).reshape(out_features, num_cols)
304-
q[:, j] = qdq[:, j]
305-
306-
err = (w[:, j] - q[:, j]) / h_inv[j, j]
307-
e[:, j - i] = err
308-
w[:, j:j_end] -= err.unsqueeze(1) * h_inv[j, j:j_end].unsqueeze(0)
280+
def quantize_fn(w_input):
281+
w_blocked = w_input.reshape(out_features * n_blocks, quant_block_size)
282+
return static_blockwise_fp4_fake_quant(
283+
w_blocked, amax_flat, quantize_block_scales=False
284+
).reshape(out_features, num_cols)
309285

310-
if j_end < num_cols:
311-
w[:, j_end:] -= e @ h_inv[i:j_end, j_end:]
312-
313-
return q
286+
gptq_blockwise_update(w, h_inv, gptq_block_size, quantize_fn)
287+
return w
314288

315289

316290
def _run_fused_gptq_nvfp4(weight, scales_2d, h_inv, gptq_block_size, quant_block_size):
    """Fused Triton GPTQ for NVFP4 using the production fused update."""
    # Operate on a float32 copy; the production update mutates it in place.
    work = weight.float().clone()
    gptq_blockwise_update_fused_scalar(
        work, scales_2d, h_inv, gptq_block_size, quant_block_size
    )
    return work
335295

336296

0 commit comments

Comments
 (0)