gptq faster

sugunav14 · sugunav14 · commit 3f2d7c05a489 · 2026-03-20T00:32:41.000Z
Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com>
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
@@ -1545,7 +1545,7 @@ def _print_relative_mse_error(
     delta = q - w
     mse = (delta).mm(h).mul(delta).mean() / (w.mm(h).mul(w).mean() + 1e-6)
     suffix = f", n_hessian_samples: {n_samples}" if n_samples is not None else ""
-    print(f"[{module_name}] Relative MSE error: {mse.item():.2e}{suffix}")
+    print_rank_0(f"[{module_name}] Relative MSE error: {mse.item():.2e}{suffix}")
 
 
 def update_hessian(input, hessian, n_samples):
@@ -1604,7 +1604,7 @@ def prepare_hessian_inverse(h, weight, percdamp):
         h = torch.cholesky_inverse(torch.linalg.cholesky(h))
         h_inv = torch.linalg.cholesky(h, upper=True)
     except (RuntimeError, torch.linalg.LinAlgError):
-        print("Warning: Hessian is not positive definite, using identity matrix")
+        print_rank_0("Warning: Hessian is not positive definite, using identity matrix")
         h_inv = torch.eye(h.shape[0], device=h.device, dtype=h.dtype)
     return h_inv
 
@@ -1706,37 +1706,104 @@ def _column_qdq_tensor(col, col_idx, _s=scalar_scale, _mx=max_bound, _mn=min_bou
     return _column_qdq_tensor, True
 
 
+def _can_use_fused_gptq(quantizer) -> bool:
+    """Check whether the fused Triton GPTQ kernel can be used for *quantizer*."""
+    if not isinstance(quantizer, NVFP4StaticQuantizer):
+        return False
+    if not hasattr(quantizer, "_amax") or quantizer._amax is None:
+        return False
+    from modelopt.torch.quantization.triton import IS_AVAILABLE as _TRITON_OK
+
+    return _TRITON_OK
+
+
 def blockwise_weight_update(module, h, block_size, percdamp, n_samples=None):
     """Update module weights using GPTQ-style blockwise quantization.
 
+    Dispatches to one of three internal paths depending on quantizer type:
+
+    1. **Fused Triton** — for :class:`NVFP4StaticQuantizer` when Triton is
+       available.  Runs the entire column loop in a single GPU kernel per
+       block (~130x faster than the unfused path on Blackwell GPUs).
+    2. **Column-QDQ** — for integer quantizers whose scale geometry allows
+       single-column fake-quant via :func:`_build_column_qdq`.
+    3. **Full-matrix fallback** — calls the quantizer on the full weight matrix
+       each column (slowest, but always correct).
+
     Args:
-        module: Neural network module with weight and weight_quantizer
-        H: Hessian matrix (d x d)
-        block_size: Size of blocks to process at once
-        percdamp: Damping percentage for Hessian diagonal
-        n_samples: Number of Hessian samples for logging (optional)
+        module: Neural network module with ``weight`` and ``weight_quantizer``.
+        h: Hessian matrix of shape ``(d, d)``.
+        block_size: Number of columns processed per block.
+        percdamp: Damping as a fraction of the mean Hessian diagonal.
+        n_samples: Number of Hessian samples (used only for logging).
     """
     weight = module.weight.data.float().clone()
-    _, num_cols = weight.shape
+    num_rows, num_cols = weight.shape
 
-    # Preprocess Hessian: handle dead neurons and add damping
     h_inv = prepare_hessian_inverse(h, weight, percdamp)
 
-    # Try to build fast column-wise qdq (avoids quantizing the full matrix per column)
-    col_qdq_fn, col_qdq_supported = _build_column_qdq(module.weight_quantizer, weight.shape)
+    quantizer = module.weight_quantizer
+    if _can_use_fused_gptq(quantizer):
+        _blockwise_weight_update_fused(weight, h_inv, quantizer, num_rows, num_cols, block_size)
+    else:
+        col_qdq_fn, col_qdq_supported = _build_column_qdq(quantizer, weight.shape)
+        _blockwise_weight_update_unfused(
+            weight, h_inv, quantizer, num_cols, block_size, col_qdq_fn, col_qdq_supported
+        )
+
+    _print_relative_mse_error(weight, module.weight.float(), h, module.name, n_samples)
+    module.weight.data = weight.reshape(module.weight.shape).to(module.weight.data.dtype)
+
+
+def _blockwise_weight_update_fused(weight, h_inv, quantizer, num_rows, num_cols, block_size):
+    """Fused Triton path for NVFP4: one kernel launch per block."""
+    from modelopt.torch.quantization.triton.gptq_fused_kernel import gptq_fused_block
+
+    group_size = quantizer.block_sizes.get(-1, None) or quantizer.block_sizes.get(1, None)
+    num_groups = math.ceil(num_cols / group_size)
+    amax_grouped = quantizer._amax.float().reshape(num_rows, num_groups).contiguous()
+    global_amax = quantizer.global_amax.float()
 
-    # Process weights in blocks
     for block_start in range(0, num_cols, block_size):
         block_end = min(block_start + block_size, num_cols)
-        n_cols = block_end - block_start
+        n_cols_blk = block_end - block_start
+
+        w_block = weight[:, block_start:block_end].clone().contiguous()
+        h_inv_cho_blk = h_inv[block_start:block_end, block_start:block_end].contiguous()
+
+        qw_block, err_block = gptq_fused_block(
+            w_block,
+            amax_grouped,
+            global_amax,
+            h_inv_cho_blk,
+            group_size,
+            block_start,
+            n_cols_blk,
+        )
+
+        weight[:, block_start:block_end] = qw_block
+        if block_end < num_cols:
+            weight[:, block_end:].addmm_(
+                err_block[:, :n_cols_blk],
+                h_inv[block_start:block_end, block_end:],
+                alpha=-1,
+            )
+
+
+def _blockwise_weight_update_unfused(
+    weight, h_inv, quantizer, num_cols, block_size, col_qdq_fn, col_qdq_supported
+):
+    """Column-QDQ or full-matrix fallback for non-NVFP4 quantizers."""
+    for block_start in range(0, num_cols, block_size):
+        block_end = min(block_start + block_size, num_cols)
+        n_cols_blk = block_end - block_start
         h_inv_cho_blk = h_inv[block_start:block_end, block_start:block_end]
 
         if col_qdq_supported:
-            # Fast path: clone only the block columns, quantize only per-column
             wblk = weight[:, block_start:block_end].clone()
             errs = torch.zeros_like(wblk)
 
-            for i in range(n_cols):
+            for i in range(n_cols_blk):
                 w_ci = wblk[:, i]
                 d = h_inv_cho_blk[i, i]
                 qdq_col = col_qdq_fn(w_ci, block_start + i)
@@ -1745,27 +1812,20 @@ def blockwise_weight_update(module, h, block_size, percdamp, n_samples=None):
                 wblk[:, i:].addr_(err, h_inv_cho_blk[i, i:], alpha=-1)
                 errs[:, i] = err
         else:
-            # Fallback: original full-matrix quantization path
             wblk = weight.clone()
             errs = torch.zeros_like(wblk[:, block_start:block_end])
 
-            for i in range(n_cols):
+            for i in range(n_cols_blk):
                 w_ci = wblk[:, block_start + i]
                 d = h_inv_cho_blk[i, i]
-                qdq = module.weight_quantizer(wblk)
+                qdq = quantizer(wblk)
                 weight[:, block_start + i] = qdq[:, block_start + i]
                 err = (w_ci - qdq[:, block_start + i]) / d
                 wblk[:, block_start + i : block_end].addr_(err, h_inv_cho_blk[i, i:], alpha=-1)
                 errs[:, i] = err
 
-        # Propagate errors to remaining weights
         weight[:, block_end:].addmm_(errs, h_inv[block_start:block_end, block_end:], alpha=-1)
 
-    # Print relative mse error
-    _print_relative_mse_error(weight, module.weight.float(), h, module.name, n_samples)
-    # Update module weights
-    module.weight.data = weight.reshape(module.weight.shape).to(module.weight.data.dtype)
-
 
 def gptq_lite(
     model: nn.Module,
diff --git a/modelopt/torch/quantization/triton/gptq_fused_kernel.py b/modelopt/torch/quantization/triton/gptq_fused_kernel.py
@@ -0,0 +1,189 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Fused Triton kernel for the GPTQ blockwise weight-update inner loop.
+
+The standard GPTQ inner loop launches ~10-15 CUDA kernels per column
+(amax lookup, FP4 quantization, error computation, rank-1 update).
+For ``block_size=128`` that is ~1 500 kernel launches per block, each with
+~5-10 us of launch overhead dominating actual compute.
+
+This module fuses the entire inner loop into a **single** Triton kernel per
+block.  Rows are independent and map to Triton programs; columns are processed
+sequentially inside each program so the rank-1 error update is carried forward
+without synchronisation.
+
+Supported quantisation format: **NVFP4 static block quantisation** (two-level
+scaling with per-group amax and a global amax).
+"""
+
+import torch
+import triton
+import triton.language as tl
+
+__all__ = ["gptq_fused_block"]
+
+# -- NVFP4 constants used by the kernel ------------------------------------
+# Maximum representable FP4-E2M1 magnitude: 6.0, the largest entry of the
+# standard E2M1 decode table {0, 0.5, 1, 1.5, 2, 3, 4, 6}.
+_FP4_MAX = 6.0
+# FP8-E4M3 has max representable value 448.
+_FP8_E4M3_MAX = 448.0
+
+
+@triton.jit
+def _gptq_fused_block_kernel(
+    w_ptr,  # [num_rows, BLOCK_SIZE] working weight block (in-place)
+    qw_ptr,  # [num_rows, BLOCK_SIZE] output: quantized weights
+    err_ptr,  # [num_rows, BLOCK_SIZE] output: quantization errors
+    amax_ptr,  # [num_rows, num_groups] per-group amax, row-major
+    global_amax_ptr,  # scalar float32 on device
+    hinv_ptr,  # [BLOCK_SIZE, BLOCK_SIZE] upper Cholesky of H^{-1}
+    num_rows,
+    num_groups,
+    group_size: tl.constexpr,
+    block_start,  # column offset of this block in the full weight matrix
+    n_cols,  # actual columns in this block (may be < BLOCK_SIZE)
+    BLOCK_SIZE: tl.constexpr,
+):
+    """One program per row; sequentially quantizes columns, propagating errors."""
+    row = tl.program_id(0)
+    if row >= num_rows:
+        return
+
+    # Base pointers for this row
+    w_base = w_ptr + row * BLOCK_SIZE
+    qw_base = qw_ptr + row * BLOCK_SIZE
+    err_base = err_ptr + row * BLOCK_SIZE
+    amax_row_base = amax_ptr + row * num_groups
+
+    # Pre-compute global FP8 scale factors (constant across columns)
+    global_amax = tl.load(global_amax_ptr).to(tl.float32)
+    global_scale = global_amax / 6.0  # _FP4_MAX
+    fp8_inv_scale = tl.where(global_scale > 0.0, 1.0 / (448.0 / global_scale), 0.0)
+
+    j_range = tl.arange(0, BLOCK_SIZE)
+
+    for i in range(BLOCK_SIZE):
+        wi = tl.load(w_base + i)
+
+        # -- Compute NVFP4 two-level scale for this column's group -----------
+        col_idx = block_start + i
+        group_idx = col_idx // group_size
+        raw_amax = tl.load(amax_row_base + group_idx).to(tl.float32)
+        raw_scale = raw_amax / 6.0  # _FP4_MAX
+
+        # FP8-quantize the block scale: scale * fp8_scale -> cast E4M3 -> back
+        fp8_scale = tl.where(global_scale > 0.0, 448.0 / global_scale, 1.0)
+        si = (raw_scale * fp8_scale).to(tl.float8e4nv).to(tl.float32) * fp8_inv_scale
+
+        # Guard: replace zero / nan / inf scale with 1.0
+        # NOTE: ``si != si`` is the standard NaN check in Triton (no math.isnan).
+        si_safe = tl.where(
+            (si == 0.0) | (si != si) | (tl.abs(si) == float("inf")),  # noqa: PLR0124
+            1.0,
+            si,
+        )
+
+        # -- FP4-E2M1 fake quantization (nearest-round to 8 levels) ----------
+        abs_scaled = tl.abs(wi) / si_safe
+        q_val = tl.where(
+            abs_scaled <= 0.25,
+            0.0,
+            tl.where(
+                abs_scaled < 0.75,
+                0.5,
+                tl.where(
+                    abs_scaled <= 1.25,
+                    1.0,
+                    tl.where(
+                        abs_scaled < 1.75,
+                        1.5,
+                        tl.where(
+                            abs_scaled <= 2.5,
+                            2.0,
+                            tl.where(abs_scaled < 3.5, 3.0, tl.where(abs_scaled <= 5.0, 4.0, 6.0)),
+                        ),
+                    ),
+                ),
+            ),
+        )
+
+        qi = q_val * si_safe * tl.where(wi >= 0.0, 1.0, -1.0)
+        tl.store(qw_base + i, qi)
+
+        # -- GPTQ error and rank-1 update ------------------------------------
+        di = tl.load(hinv_ptr + i * BLOCK_SIZE + i)
+        err_i = (wi - qi) / di
+        tl.store(err_base + i, err_i)
+
+        j_mask = (j_range > i) & (j_range < n_cols)
+        hinv_row = tl.load(hinv_ptr + i * BLOCK_SIZE + j_range, mask=j_mask, other=0.0)
+        w_rem = tl.load(w_base + j_range, mask=j_mask, other=0.0)
+        w_rem = w_rem - err_i * hinv_row
+        tl.store(w_base + j_range, w_rem, mask=j_mask)
+
+
+def gptq_fused_block(
+    w_block: torch.Tensor,
+    amax_grouped: torch.Tensor,
+    global_amax: torch.Tensor,
+    h_inv_cho_blk: torch.Tensor,
+    group_size: int,
+    block_start: int,
+    n_cols: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Run the GPTQ column loop for one block in a single Triton kernel launch.
+
+    Args:
+        w_block: Working weight block of shape ``[num_rows, block_size]`` (will be cloned).
+        amax_grouped: Per-group amax of shape ``[num_rows, num_groups]``.
+        global_amax: Scalar tensor with the global amax.
+        h_inv_cho_blk: Upper Cholesky factor of H^{-1}, shape ``[block_size, block_size]``.
+        group_size: NVFP4 quantization group size (typically 16).
+        block_start: Column offset of this block in the full weight matrix.
+        n_cols: Actual number of columns in this block (``<= block_size``).
+
+    Returns:
+        Tuple of ``(qw_block, err_block)`` each of shape ``[num_rows, block_size]``.
+    """
+    num_rows, block_size = w_block.shape
+    num_groups = amax_grouped.shape[1]
+
+    w_block = w_block.contiguous()
+    amax_grouped = amax_grouped.contiguous()
+    h_inv_cho_blk = h_inv_cho_blk.contiguous()
+
+    qw_block = torch.empty_like(w_block)
+    err_block = torch.empty_like(w_block)
+
+    grid = (num_rows,)
+    with torch.cuda.device(w_block.device):
+        _gptq_fused_block_kernel[grid](
+            w_block,
+            qw_block,
+            err_block,
+            amax_grouped,
+            global_amax,
+            h_inv_cho_blk,
+            num_rows,
+            num_groups,
+            group_size,
+            block_start,
+            n_cols,
+            BLOCK_SIZE=block_size,
+        )
+
+    return qw_block, err_block
diff --git a/tests/gpu/torch/quantization/test_gptq.py b/tests/gpu/torch/quantization/test_gptq.py