Skip to content

Commit b331ed2

Browse files
committed
add config for fused kernel; qdq inside kernel; even more extraction
Signed-off-by: Shiyang Chen <shiychen@nvidia.com>
1 parent 08a7392 commit b331ed2

7 files changed

Lines changed: 164 additions and 120 deletions

File tree

modelopt/torch/quantization/config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,6 +1549,12 @@ class GPTQCalibConfig(QuantizeAlgorithmConfig):
15491549
description="""The block size for GPTQ weight update, which must be a multiple of the
15501550
group_size used in the quantization.""",
15511551
)
1552+
fused: bool = ModeloptField(
1553+
default=False,
1554+
title="Use fused Triton kernel for GPTQ.",
1555+
description="""When True, use a fused Triton kernel that combines quantization and
1556+
per-column error propagation into one launch per GPTQ block.""",
1557+
)
15521558

15531559

15541560
QuantizeQuantCfgType = list[QuantizerCfgEntry]

modelopt/torch/quantization/model_calib.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1698,6 +1698,7 @@ def gptq(
16981698
forward_loop: ForwardLoop,
16991699
perc_damp: float = 0.01,
17001700
block_size: int = 128,
1701+
fused: bool = False,
17011702
):
17021703
"""GPTQ quantization.
17031704
@@ -1723,6 +1724,7 @@ def gptq(
17231724
forward_loop: Callable that replays calibration inputs through *model*.
17241725
perc_damp: Percentage of avg Hessian diagonal for damping (default: 0.01).
17251726
block_size: Block size for GPTQ weight update.
1727+
fused: If True, use fused Triton kernel for NVFP4 static quantization.
17261728
"""
17271729
total_start = time.time()
17281730

@@ -1745,7 +1747,7 @@ def _make_gptq_handle(name, m):
17451747
cls = GPTQHelper
17461748
else:
17471749
cls = _GPTQ_HELPER_REGISTRY.get(backend, GPTQHelper)
1748-
return cls(m, name, offload_to_cpu=True)
1750+
return cls(m, name, offload_to_cpu=True, fused=fused)
17491751

17501752
gptq_handles = {name: _make_gptq_handle(name, m) for name, m in quantized_layers}
17511753
for handle in gptq_handles.values():

modelopt/torch/quantization/triton/fp4_kernel_hopper.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import triton.language as tl
2525

2626
from .fp4_kernel import _torch_dtype_to_tl
27-
from .nvfp4_quant import fp4_round_magnitude
27+
from .nvfp4_quant import fp4_round_magnitude, fp8_quantize_scale
2828

2929
__all__ = ["fp4_fake_quant_block"]
3030

@@ -80,9 +80,7 @@ def fp4_fake_quant_kernel(
8080

8181
block_max = tl.max(x_abs, axis=2, keep_dims=True)
8282

83-
block_max_scaled = block_max / (6.0 * global_scale_safe)
84-
block_max_scaled = tl.minimum(block_max_scaled, 448.0)
85-
block_max_quant = block_max_scaled.to(tl.float8e4nv).to(tl.float32) * global_scale
83+
block_max_quant = fp8_quantize_scale(block_max, global_scale_safe)
8684
block_max_quant = tl.where(block_max_quant >= 1e-5, block_max_quant, 1.0)
8785

8886
block_max_quant_broadcast = tl.broadcast_to(

modelopt/torch/quantization/triton/gptq_fused_kernel.py

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,30 +15,28 @@
1515

1616
"""Fused Triton kernels for GPTQ blockwise weight-update.
1717
18-
A kernel for scalar (NVFP4) quantization.
19-
Each kernel fuses quantization + per-column GPTQ error propagation into
18+
A kernel for scalar (NVFP4) quantization with inline two-level scale computation.
19+
Fuses scale computation + quantization + per-column GPTQ error propagation into
2020
one launch per GPTQ block, avoiding the Python-level per-column loop.
2121
22-
Architecture (both kernels):
22+
Architecture:
2323
- One Triton program per output row.
2424
- ``w_full [BLOCK_SIZE]`` register tensor holds working weights.
25-
- Per-column error propagation: ``w_full -= err * h_inv_row``.
26-
27-
Scalar kernel (``_gptq_scalar_kernel``):
28-
- Calls ``nvfp4_scalar_quant()`` from ``nvfp4_quant.py`` per column.
25+
- Per-column: calls ``nvfp4_scalar_qdq()`` for FP4 QDQ with inline scale
26+
computation, then propagates error via ``w_full -= err * h_inv_row``.
2927
"""
3028

3129
import torch
3230
import triton
3331
import triton.language as tl
3432

35-
from .nvfp4_quant import nvfp4_scalar_quant
33+
from .nvfp4_quant import nvfp4_scalar_qdq
3634

3735
__all__ = ["gptq_fused_block_scalar"]
3836

3937

4038
# ---------------------------------------------------------------------------
41-
# Scalar kernel — NVFP4 quantization + error propagation
39+
# Scalar kernel — NVFP4 QDQ + error propagation
4240
# ---------------------------------------------------------------------------
4341

4442

@@ -47,10 +45,11 @@ def _gptq_scalar_kernel(
4745
w_ptr,
4846
qw_ptr,
4947
err_ptr,
50-
scales_ptr,
48+
amax_ptr,
49+
global_scale,
5150
hinv_ptr,
5251
num_rows,
53-
n_scale_blocks,
52+
n_amax_blocks,
5453
quant_block_size,
5554
block_start,
5655
BLOCK_SIZE: tl.constexpr,
@@ -62,19 +61,20 @@ def _gptq_scalar_kernel(
6261
w_base = w_ptr + row * BLOCK_SIZE
6362
qw_base = qw_ptr + row * BLOCK_SIZE
6463
err_base = err_ptr + row * BLOCK_SIZE
65-
scales_base = scales_ptr + row * n_scale_blocks
64+
amax_base = amax_ptr + row * n_amax_blocks
6665

6766
j_range = tl.arange(0, BLOCK_SIZE)
6867
w_full = tl.load(w_base + j_range)
6968

7069
for col in range(0, BLOCK_SIZE, 1):
71-
scale = tl.load(scales_base + (block_start + col) // quant_block_size)
70+
block_amax = tl.load(amax_base + (block_start + col) // quant_block_size)
7271

7372
w_scalar = tl.sum(tl.where(j_range == col, w_full, 0.0))
7473
q_scalar = tl.sum(
75-
nvfp4_scalar_quant(
74+
nvfp4_scalar_qdq(
7675
tl.full([1], w_scalar, dtype=tl.float32),
77-
scale,
76+
block_amax,
77+
global_scale,
7878
1,
7979
)
8080
)
@@ -91,16 +91,22 @@ def _gptq_scalar_kernel(
9191

9292
def gptq_fused_block_scalar(
9393
w_block: torch.Tensor,
94-
scales_2d: torch.Tensor,
94+
block_amax: torch.Tensor,
95+
global_scale: float,
9596
h_inv_cho_blk: torch.Tensor,
9697
quant_block_size: int,
9798
block_start: int,
9899
) -> tuple[torch.Tensor, torch.Tensor]:
99100
"""Run scalar GPTQ (NVFP4) column loop for one block in a single Triton kernel launch.
100101
102+
Computes FP8-quantized scales from per-block amax inline via
103+
:func:`nvfp4_scalar_qdq`, then performs NVFP4 fake quantization and
104+
GPTQ error propagation per column.
105+
101106
Args:
102107
w_block: Working weights ``[num_rows, block_size]`` (float32).
103-
scales_2d: Pre-computed scales ``[num_rows, n_scale_blocks]`` (float32).
108+
block_amax: Per-block amax values ``[num_rows, n_amax_blocks]`` (float32).
109+
global_scale: Pre-computed ``global_amax / (6.0 * 448.0)`` (scalar).
104110
h_inv_cho_blk: Block of upper-Cholesky inverse Hessian ``[block_size, block_size]``.
105111
quant_block_size: Number of elements sharing one scale factor.
106112
block_start: Column offset of this block in the full weight matrix.
@@ -117,10 +123,11 @@ def gptq_fused_block_scalar(
117123
w_block.contiguous(),
118124
qw_block,
119125
err_block,
120-
scales_2d.contiguous(),
126+
block_amax.contiguous(),
127+
global_scale,
121128
h_inv_cho_blk.contiguous(),
122129
num_rows,
123-
scales_2d.shape[1],
130+
block_amax.shape[1],
124131
quant_block_size,
125132
block_start,
126133
BLOCK_SIZE=block_size,

modelopt/torch/quantization/triton/nvfp4_quant.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,52 @@ def nvfp4_scalar_quant(
9393
x_rescaled = q_val * scale_safe
9494
x_quant = tl.where(x >= 0, x_rescaled, -x_rescaled)
9595
return x_quant
96+
97+
98+
@triton.jit
99+
def fp8_quantize_scale(block_amax, global_scale):
100+
"""FP8 E4M3 fake-quantize the per-block NVFP4 scale.
101+
102+
Computes ``scale = block_amax / 6.0``, then round-trips it through
103+
FP8 E4M3 using ``global_scale`` for the second-level scaling.
104+
105+
Works with any tensor shape (scalar, 1-D, or higher) since all ops
106+
are element-wise.
107+
108+
Args:
109+
block_amax: Per-block amax value(s).
110+
global_scale: Pre-computed ``global_amax / (6.0 * 448.0)``.
111+
112+
Returns:
113+
FP8-quantized per-block scale(s), same shape as ``block_amax``.
114+
"""
115+
FP8_E4M3_MAX: tl.constexpr = 448.0
116+
scale_in_fp8_range = block_amax / (6.0 * global_scale)
117+
scale_clamped = tl.minimum(scale_in_fp8_range, FP8_E4M3_MAX)
118+
return scale_clamped.to(tl.float8e4nv).to(tl.float32) * global_scale
119+
120+
121+
@triton.jit
122+
def nvfp4_scalar_qdq(
123+
x, # [N] float32, already loaded
124+
block_amax, # float32 scalar: per-block amax
125+
global_scale, # float32 scalar: pre-computed global_amax / (6.0 * 448.0)
126+
N: tl.constexpr,
127+
):
128+
"""NVFP4 scalar fake quantization with inline two-level scale computation.
129+
130+
Computes the per-block FP8-quantized scale from ``block_amax`` via
131+
:func:`fp8_quantize_scale`, then quantizes each element to the nearest
132+
FP4 (E2M1) value.
133+
134+
Args:
135+
x: [N] float32 tensor of values to quantize.
136+
block_amax: Per-block amax (absolute maximum of the block).
137+
global_scale: Pre-computed ``global_amax / (6.0 * 448.0)``.
138+
N: Compile-time number of elements.
139+
140+
Returns:
141+
x_quant: [N] float32, fake-quantized values.
142+
"""
143+
scale = fp8_quantize_scale(block_amax, global_scale)
144+
return nvfp4_scalar_quant(x, scale, N)

modelopt/torch/quantization/utils/calib_utils.py

Lines changed: 32 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -126,10 +126,11 @@ class GPTQHelper:
126126

127127
CACHE_NAME = "_forward_no_gptq_hessian"
128128

129-
def __init__(self, module, name, offload_to_cpu=False):
129+
def __init__(self, module, name, offload_to_cpu=False, fused=False):
130130
"""Initialize GPTQHelper with module state and Hessian storage."""
131131
self.module = module
132132
self.name = name
133+
self.fused = fused
133134
in_features = module.weight.shape[-1]
134135
device = module.weight.device
135136
if device.type == "meta" or (offload_to_cpu and get_used_gpu_mem_fraction(device) > 0.65):
@@ -195,23 +196,35 @@ def _prepare_hessian_inverse(self, hessian, perc_damp):
195196
self.h_inv = compute_hessian_inverse(hessian, self.weight, perc_damp)
196197

197198
def _blockwise_update(self, block_size):
198-
"""Column-wise GPTQ update using full-matrix QDQ.
199+
"""Column-wise GPTQ update.
199200
200-
Delegates to :func:`gptq_blockwise_update` with the module's weight quantizer.
201+
When ``self.fused`` is True and the weight quantizer is an
202+
``NVFP4StaticQuantizer``, uses :func:`gptq_blockwise_update_fused_scalar`
203+
(a fused Triton kernel). Otherwise falls back to
204+
:func:`gptq_blockwise_update` (unfused column-by-column loop).
201205
"""
202206
assert self.weight is not None and self.h_inv is not None, (
203207
"_blockwise_update called before _prepare_hessian_inverse()"
204208
)
205209
quantizer = self.module.weight_quantizer
206-
block_sizes = getattr(quantizer, "block_sizes", None)
207-
if block_sizes is not None:
208-
group_size = block_sizes.get(-1)
209-
if group_size is not None and block_size % group_size != 0:
210+
211+
if self.fused and getattr(quantizer, "_is_nvfp4_static_quantizer", False):
212+
block_sizes = quantizer.block_sizes
213+
quant_block_size = block_sizes.get(-1) or block_sizes.get(1)
214+
if quant_block_size is not None and block_size % quant_block_size != 0:
210215
raise ValueError(
211216
f"GPTQ block_size ({block_size}) must be divisible by the quantizer"
212-
f" group_size ({group_size})"
217+
f" group_size ({quant_block_size})"
213218
)
214-
gptq_blockwise_update(self.weight, self.h_inv, block_size, quantizer)
219+
out_features, num_cols = self.weight.shape
220+
n_blocks = num_cols // quant_block_size
221+
block_amax = quantizer.amax.reshape(out_features, n_blocks).float()
222+
global_scale = quantizer.global_amax.float().item() / (6.0 * 448.0)
223+
gptq_blockwise_update_fused_scalar(
224+
self.weight, block_amax, global_scale, self.h_inv, block_size, quant_block_size
225+
)
226+
else:
227+
gptq_blockwise_update(self.weight, self.h_inv, block_size, quantizer)
215228

216229
def _print_mse_error(self, hessian):
217230
"""Log Hessian-weighted relative MSE between ``self.weight`` and original weights."""
@@ -260,17 +273,20 @@ def gptq_blockwise_update(weight, h_inv, block_size, quantize_fn):
260273
weight[:, block_end:].addmm_(errs, h_inv[block_start:block_end, block_end:], alpha=-1)
261274

262275

263-
def gptq_blockwise_update_fused_scalar(weight, scales_2d, h_inv, block_size, quant_block_size):
276+
def gptq_blockwise_update_fused_scalar(
277+
weight, block_amax, global_scale, h_inv, block_size, quant_block_size
278+
):
264279
"""Fused GPTQ blockwise update for NVFP4 scalar quantization.
265280
266-
Uses a fused Triton kernel that combines quantization and per-column
267-
error propagation into one launch per GPTQ block, avoiding the
268-
Python-level per-column loop in :func:`gptq_blockwise_update`.
281+
Uses a fused Triton kernel that combines scale computation, quantization,
282+
and per-column error propagation into one launch per GPTQ block, avoiding
283+
the Python-level per-column loop in :func:`gptq_blockwise_update`.
269284
270285
Args:
271286
weight: Weight tensor ``[out_features, in_features]``, modified **in-place**
272287
with fake-quantized values.
273-
scales_2d: Pre-computed per-block scales ``[out_features, n_scale_blocks]``.
288+
block_amax: Per-block amax values ``[out_features, n_amax_blocks]``.
289+
global_scale: Pre-computed ``global_amax / (6.0 * 448.0)`` (scalar).
274290
h_inv: Upper-triangular Cholesky factor of the damped inverse Hessian
275291
``[in_features, in_features]``.
276292
block_size: Number of columns to process per GPTQ block.
@@ -283,7 +299,8 @@ def gptq_blockwise_update_fused_scalar(weight, scales_2d, h_inv, block_size, qua
283299
be = min(bs + block_size, num_cols)
284300
qw, err = gptq_fused_block_scalar(
285301
weight[:, bs:be].clone().contiguous(),
286-
scales_2d,
302+
block_amax,
303+
global_scale,
287304
h_inv[bs:be, bs:be].contiguous(),
288305
quant_block_size,
289306
bs,
@@ -293,44 +310,6 @@ def gptq_blockwise_update_fused_scalar(weight, scales_2d, h_inv, block_size, qua
293310
weight[:, be:].addmm_(err, h_inv[bs:be, be:], alpha=-1)
294311

295312

296-
class FusedScalarGPTQHelper(GPTQHelper):
297-
"""GPTQHelper using the fused Triton kernel for NVFP4 scalar quantization.
298-
299-
Overrides :meth:`_blockwise_update` to extract pre-computed scales from the
300-
``NVFP4StaticQuantizer`` and delegate to :func:`gptq_blockwise_update_fused_scalar`.
301-
"""
302-
303-
def _blockwise_update(self, block_size):
304-
"""Fused GPTQ using Triton kernel for NVFP4 scalar quantization."""
305-
assert self.weight is not None and self.h_inv is not None, (
306-
"_blockwise_update called before _prepare_hessian_inverse()"
307-
)
308-
from modelopt.torch.quantization.triton.fp4_kernel import compute_fp4_scales
309-
310-
quantizer = self.module.weight_quantizer
311-
block_sizes = getattr(quantizer, "block_sizes", None)
312-
quant_block_size = None
313-
if block_sizes is not None:
314-
quant_block_size = block_sizes.get(-1) or block_sizes.get(1)
315-
316-
if quant_block_size is not None and block_size % quant_block_size != 0:
317-
raise ValueError(
318-
f"GPTQ block_size ({block_size}) must be divisible by the quantizer"
319-
f" group_size ({quant_block_size})"
320-
)
321-
322-
out_features, num_cols = self.weight.shape
323-
n_blocks = num_cols // quant_block_size
324-
325-
# Pre-compute scales from the calibrated amax (frozen during GPTQ).
326-
amax = quantizer.amax.reshape(out_features, n_blocks)
327-
scales_2d = compute_fp4_scales(amax, quantizer.global_amax, quantize_block_scales=True)
328-
329-
gptq_blockwise_update_fused_scalar(
330-
self.weight, scales_2d, self.h_inv, block_size, quant_block_size
331-
)
332-
333-
334313
_GPTQ_HELPER_REGISTRY: dict[str, type[GPTQHelper]] = {}
335314

336315

@@ -342,7 +321,3 @@ def register_gptq_helper(backend: str, factory: type[GPTQHelper]) -> None:
342321
construct ``factory`` instead of the default ``GPTQHelper``.
343322
"""
344323
_GPTQ_HELPER_REGISTRY[backend] = factory
345-
346-
347-
# Built-in registrations
348-
register_gptq_helper("fused_gptq_nvfp4", FusedScalarGPTQHelper)

0 commit comments

Comments (0)