
Commit a9c8ccf

[Quantization] Address review feedback round 3 on FP8 sweep
Three changes from realAsma's latest review:

- nvfp4_fp8_sweep kernel: use ``scale_safe`` rather than ``scale`` in the per-candidate diff so the divisor and multiplier match. Numerically equivalent on real inputs (the only case where ``scale_safe`` differs from ``scale`` is ``global_amax == 0``, in which case ``w_abs`` is also zero, so the loss is zero either way), but more consistent. A pure-PyTorch sketch of this loss appears below the change summary.
- Extract ``fp8_scale_candidates`` into a triton-free module ``_fp8_scale_candidates.py`` so the calibrator's reference sweep and the Triton kernel wrapper share one definition. This removes the duplicate copy in ``NVFP4MSECalibrator._generate_candidates``.
- Parity test: extend ``test_parity_random_weights`` to cover bf16 and fp16 in addition to fp32 by parametrizing on dtype, so the canonical parity grid (3 seeds × 3 num_blocks) is now exercised on every supported dtype. The smaller ``test_parity_dtypes`` was folded into this, since it was a strict subset.

Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
1 parent 95b8a95 commit a9c8ccf

4 files changed: 48 additions & 43 deletions
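For the first bullet above, a minimal pure-PyTorch illustration of the per-candidate loss. This is a sketch, not the kernel or the test's _run_reference helper; round_to_fp4_magnitude below is a hypothetical, simplified stand-in for fp4_round_magnitude that ignores exact tie-breaking.

import torch

def round_to_fp4_magnitude(x: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in: snap each value to the nearest FP4 (E2M1) magnitude.
    grid = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], device=x.device)
    return grid[(x.unsqueeze(-1) - grid).abs().argmin(dim=-1)]

def candidate_loss(w_abs: torch.Tensor, c: float, global_amax: float) -> torch.Tensor:
    """Per-block squared error for one FP8 scale candidate ``c``."""
    scale = c * global_amax / 6.0
    # global_amax == 0 implies w_abs == 0, so the loss is 0 for every candidate;
    # using scale_safe as both divisor and multiplier keeps the two uses consistent.
    scale_safe = 1.0 if scale == 0.0 else scale
    q_mag = round_to_fp4_magnitude(w_abs / scale_safe)
    diff = w_abs - q_mag * scale_safe
    return (diff * diff).sum(dim=-1)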

modelopt/torch/kernels/quantization/gemm/_fp8_scale_candidates.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Single source of truth for the NVFP4 FP8 scale-candidate set.
+
+ Pure PyTorch, no Triton dependency, so it can be imported from both the kernel
+ wrapper (which is triton-gated) and the reference Python sweep in the
+ :class:`NVFP4MSECalibrator` (which must work without triton too).
+ """
+
+ import torch
+
+
+ def fp8_scale_candidates(device: torch.device | str = "cpu") -> torch.Tensor:
+     """Return the 126 valid finite positive FP8 E4M3 scale candidates / 448."""
+     uint8_values = torch.arange(0, 128, dtype=torch.uint8, device=device)
+     fp8_values = uint8_values.view(torch.float8_e4m3fn).float()
+     valid_mask = torch.isfinite(fp8_values) & (fp8_values > 0)
+     return fp8_values[valid_mask] / 448.0
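A quick sanity-check sketch of the new helper (my example, not part of the commit): of the 128 uint8 bit patterns 0x00 through 0x7F, 0x00 decodes to +0 and 0x7F to NaN in E4M3FN, which is why exactly 126 positive finite candidates remain.

import torch

from modelopt.torch.kernels.quantization.gemm._fp8_scale_candidates import fp8_scale_candidates

cands = fp8_scale_candidates()          # defaults to CPU
assert cands.numel() == 126             # 0x00 (+0) and 0x7F (NaN) are excluded
assert (cands > 0).all() and torch.isfinite(cands).all()
assert cands.max().item() == 1.0        # 448 / 448: the largest finite E4M3 value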

modelopt/torch/kernels/quantization/gemm/nvfp4_fp8_sweep.py

Lines changed: 4 additions & 11 deletions
@@ -32,19 +32,12 @@
  import triton
  import triton.language as tl

+ from ._fp8_scale_candidates import fp8_scale_candidates
  from .nvfp4_quant import fp4_round_magnitude

  __all__ = ["fp8_scale_candidates", "nvfp4_fp8_scale_sweep"]


- def fp8_scale_candidates(device: torch.device | str = "cpu") -> torch.Tensor:
-     """Return the 126 valid finite positive FP8 E4M3 scale candidates / 448."""
-     uint8_values = torch.arange(0, 128, dtype=torch.uint8, device=device)
-     fp8_values = uint8_values.view(torch.float8_e4m3fn).float()
-     valid_mask = torch.isfinite(fp8_values) & (fp8_values > 0)
-     return fp8_values[valid_mask] / 448.0
-
-
  # Selected from a (BLOCKS_PER_PROGRAM, num_warps) sweep on B300:
  # BPP=16,nw=2: 6.06 ms   BPP=32,nw=4: 6.06 ms   BPP=64,nw=8: 5.08 ms
  # The smaller-tile entries cover cases where N_BLOCKS is small enough that BPP=64

@@ -93,11 +86,11 @@ def _fp8_scale_sweep_kernel(
      for k in tl.static_range(NUM_CANDIDATES):
          c = tl.load(candidates_ptr + k).to(tl.float32)
          scale = c * global_amax / 6.0
-         # Avoid divide-by-zero when global_amax == 0; the resulting err == w_abs² is
-         # the same for every candidate, so any best_idx is fine.
+         # Avoid divide-by-zero when global_amax == 0; in that case w_abs is also zero
+         # (global_amax = max|w|), so the loss is zero for every candidate either way.
          scale_safe = tl.where(scale == 0.0, 1.0, scale)
          q_mag = fp4_round_magnitude(w_abs / scale_safe)
-         diff = w_abs - q_mag * scale
+         diff = w_abs - q_mag * scale_safe
          loss = tl.sum(diff * diff, axis=1)  # [BLOCKS_PER_PROGRAM]
          is_better = loss < best_loss
          best_loss = tl.where(is_better, loss, best_loss)

modelopt/torch/quantization/calib/mse.py

Lines changed: 5 additions & 10 deletions
@@ -203,17 +203,12 @@ def _compute_candidate_amax(self, candidates: torch.Tensor) -> torch.Tensor:
          return torch.ones_like(self._initial_amax) * self._global_amax * candidates

      def _generate_candidates(self, device: torch.device) -> torch.Tensor:
-         """Generate 126 valid FP8 E4M3 scale candidates.
+         """Generate the 126 valid FP8 E4M3 scale candidates."""
+         from modelopt.torch.kernels.quantization.gemm._fp8_scale_candidates import (
+             fp8_scale_candidates,
+         )

-         Kept in sync with ``fp8_scale_candidates`` in
-         ``modelopt.torch.kernels.quantization.gemm.nvfp4_fp8_sweep`` — the FP8 E4M3
-         spec is fixed, and the parity test exercises both paths against each other.
-         """
-         uint8_values = torch.arange(0, 128, dtype=torch.uint8, device=device)
-         fp8_values = uint8_values.view(torch.float8_e4m3fn).float()
-         valid_mask = torch.isfinite(fp8_values) & (fp8_values > 0)
-         fp8_values = fp8_values[valid_mask]
-         return fp8_values / 448.0
+         return fp8_scale_candidates(device)

      def _can_use_triton_fast_path(self, x: torch.Tensor) -> bool:
          """Whether the Triton fast path is usable for this ``collect`` input.

tests/gpu/torch/quantization/test_nvfp4_fp8_sweep_kernel.py

Lines changed: 8 additions & 22 deletions
@@ -85,14 +85,17 @@ def _run_triton(x, per_block_amax, global_amax):
  @requires_triton
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
  @pytest.mark.parametrize("seed", [0, 1, 2])
  @pytest.mark.parametrize("num_blocks", [4, 64, 1024])
- def test_parity_random_weights(seed, num_blocks):
-     """Triton sweep must produce the exact same per-block amax as the reference."""
+ def test_parity_random_weights(seed, num_blocks, dtype):
+     """Triton sweep must produce the exact same per-block amax as the reference,
+     across every dtype supported by the NVFP4 quantizer (fp32, fp16, bf16)."""
      torch.manual_seed(seed)
      device = "cuda"
-     x = torch.randn(num_blocks, BLOCK_SIZE, device=device, dtype=torch.float32)
-     per_block_amax = x.abs().amax(dim=-1)
+     x = torch.randn(num_blocks, BLOCK_SIZE, device=device, dtype=dtype)
+     # Promote to fp32 for the per-block amax (matches what max_calibrate produces).
+     per_block_amax = x.float().abs().amax(dim=-1)
      global_amax = per_block_amax.max()

      ref = _run_reference(x, per_block_amax, global_amax)

@@ -102,29 +105,12 @@ def test_parity_random_weights(seed, num_blocks):
      # Both pick from the same 126-element discrete candidate set, so any disagreement
      # would show up as a non-zero diff (not a small float epsilon). Demand exact match.
      assert torch.equal(ref, tri), (
-         f"Triton sweep diverged from reference: max |diff| = "
+         f"Triton sweep diverged from reference (dtype={dtype}): max |diff| = "
          f"{(ref - tri).abs().max().item():.3e}, "
          f"differing blocks = {(ref != tri).sum().item()} / {num_blocks}"
      )


- @requires_triton
- @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
- def test_parity_dtypes(dtype):
-     """Sweep must agree across the dtypes supported by the NVFP4 quantizer."""
-     torch.manual_seed(42)
-     device = "cuda"
-     num_blocks = 256
-     x = torch.randn(num_blocks, BLOCK_SIZE, device=device, dtype=dtype)
-     # Promote to fp32 for the per-block amax (matches what max_calibrate produces).
-     per_block_amax = x.float().abs().amax(dim=-1)
-     global_amax = per_block_amax.max()
-
-     ref = _run_reference(x, per_block_amax, global_amax)
-     tri = _run_triton(x, per_block_amax, global_amax)
-     assert torch.equal(ref, tri)
-
-
  @requires_triton
  def test_quantized_output_matches():
      """Round-tripping x through the chosen amax should give the same fake-quant result."""
