@@ -936,15 +936,25 @@ def _get_rotation_matrix(device: torch.device) -> torch.Tensor:
936936 return _rotation_matrices [device ]
937937
938938
939- @register_kernel ("bitsandbytes::cutlass_fused_quantize_nvfp4" , "cuda" )
940- def _ (
939+ def _fused_quantize_nvfp4_impl (
941940 A : torch .Tensor ,
942941 tensor_scale : float ,
942+ packed_out : Optional [torch .Tensor ] = None ,
943+ scales_out : Optional [torch .Tensor ] = None ,
944+ global_scale_buf : Optional [torch .Tensor ] = None ,
943945) -> tuple [torch .Tensor , torch .Tensor , torch .Tensor ]:
944- """CUTLASS-based fused quantize with randomized Hadamard rotation.
945-
946- The CUTLASS kernel requires M to be a multiple of 128. We pad here
947- and trim the output to maintain a transparent API.
946+ """Core CUTLASS fused quantize implementation.
947+
948+ When output buffers are provided, no allocations occur — safe for CUDA
949+ graph capture. When None, buffers are allocated (convenient but not
950+ graph-safe).
951+
952+ Args:
953+ A: BF16 input, numel must be divisible by 16.
954+ tensor_scale: Global tensor scale.
955+ packed_out: Pre-allocated uint8 output (padded_M * 8 bytes). None to allocate.
956+ scales_out: Pre-allocated uint8 scales (padded_M bytes). None to allocate.
957+ global_scale_buf: Pre-allocated float32 scalar buffer. None to allocate.
948958 """
949959 A = A .contiguous ()
950960 n = A .numel ()
@@ -954,10 +964,8 @@ def _(
954964 lambda : f"CUTLASS fused quantize requires bfloat16, got { A .dtype } " ,
955965 )
956966
957- # Reshape to 2D: (M, K) where K is the last dimension
958- # The fused quantize GEMM treats each group of 16 elements as one "row"
959- K = 16 # NVFP4 group size = GEMM K dimension
960- N = 16 # B matrix is 16x16
967+ K = 16
968+ N = 16
961969 orig_M = n // K
962970 padded_M = ((orig_M + 127 ) // 128 ) * 128
963971
@@ -970,26 +978,26 @@ def _(
970978 else :
971979 A_flat = A
972980
973- # Compute global_scale = 1/tensor_scale (QuTLASS convention)
974- global_scale = torch .tensor (
975- [1.0 / tensor_scale if tensor_scale > 0 else 0.0 ],
976- dtype = torch .float32 ,
977- device = A .device ,
978- )
979-
980- # Allocate output buffers (padded size)
981- packed_padded = torch .zeros (padded_M * K // 2 , dtype = torch .uint8 , device = A .device )
981+ # Use pre-allocated buffers or allocate new ones
982+ if global_scale_buf is not None :
983+ global_scale_buf .fill_ (1.0 / tensor_scale if tensor_scale > 0 else 0.0 )
984+ global_scale = global_scale_buf
985+ else :
986+ global_scale = torch .tensor (
987+ [1.0 / tensor_scale if tensor_scale > 0 else 0.0 ],
988+ dtype = torch .float32 ,
989+ device = A .device ,
990+ )
982991
983- # Scale output: one E4M3 scale per 16-element block = padded_M scales
984- # QuTLASS outputs as (padded_M, 1) but we flatten
985- scales_padded = torch .zeros (padded_M , dtype = torch .uint8 , device = A .device )
992+ packed_padded = (
993+ packed_out if packed_out is not None else torch .zeros (padded_M * K // 2 , dtype = torch .uint8 , device = A .device )
994+ )
995+ scales_padded = scales_out if scales_out is not None else torch .zeros (padded_M , dtype = torch .uint8 , device = A .device )
986996
987- # Get the cached randomized Hadamard rotation matrix for this device
988997 B = _get_rotation_matrix (A .device )
989998
990999 with _cuda_device_of (A ):
991- fn = lib .cfused_quantize_nvfp4_absmax
992- fn (
1000+ lib .cfused_quantize_nvfp4_absmax (
9931001 get_ptr (A_flat ),
9941002 get_ptr (B ),
9951003 get_ptr (packed_padded ),
@@ -1001,14 +1009,22 @@ def _(
10011009 _get_tensor_stream (A ),
10021010 )
10031011
1004- # Trim to original size
10051012 packed = packed_padded [: orig_M * K // 2 ] if padded_M != orig_M else packed_padded
10061013 block_scales = scales_padded [:orig_M ] if padded_M != orig_M else scales_padded
10071014
10081015 ts_out = torch .tensor ([tensor_scale ], dtype = torch .float32 , device = A .device )
10091016 return packed , block_scales , ts_out
10101017
10111018
@register_kernel("bitsandbytes::cutlass_fused_quantize_nvfp4", "cuda")
def _(
    A: torch.Tensor,
    tensor_scale: float,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """CUTLASS fused quantize with randomized Hadamard rotation.

    Thin op-registration shim: delegates to ``_fused_quantize_nvfp4_impl``
    in allocating mode (no pre-allocated output buffers supplied), which is
    the convenient — but not CUDA-graph-safe — entry point.
    """
    packed, block_scales, ts_out = _fused_quantize_nvfp4_impl(A, tensor_scale)
    return packed, block_scales, ts_out
1026+
1027+
10121028# Scale reordering for CUTLASS block-scaled GEMM
10131029@register_kernel ("bitsandbytes::scale_to_blocked" , "cuda" )
10141030def _ (scales : torch .Tensor , H : int , W : int ) -> torch .Tensor :
@@ -1038,8 +1054,7 @@ def _(scales: torch.Tensor, H: int, W: int) -> torch.Tensor:
10381054# quantization time by scale_to_blocked). Tensor scales are folded into
10391055# the CUTLASS epilogue alpha. Output is BF16, converted to FP32 for
10401056# API compatibility.
1041- @register_kernel ("bitsandbytes::gemm_nvfp4" , "cuda" )
1042- def _ (
1057+ def _gemm_nvfp4_impl (
10431058 A_packed : torch .Tensor ,
10441059 B_packed : torch .Tensor ,
10451060 A_scales : torch .Tensor ,
@@ -1049,12 +1064,27 @@ def _(
10491064 M : int ,
10501065 N : int ,
10511066 K : int ,
1067+ D_out : Optional [torch .Tensor ] = None ,
1068+ alpha_buf : Optional [torch .Tensor ] = None ,
10521069) -> torch .Tensor :
1070+ """Core NVFP4 GEMM implementation.
1071+
1072+ When D_out and alpha_buf are provided, no allocations occur — safe for
1073+ CUDA graph capture. When None, buffers are allocated.
1074+
1075+ Args:
1076+ D_out: Pre-allocated BF16 output (M, N). None to allocate.
1077+ alpha_buf: Pre-allocated float32 scalar buffer. None to allocate.
1078+ """
10531079 with _cuda_device_of (A_packed ):
1054- # A_scales and B_scales are already in CUTLASS block-scaled layout
1055- # (pre-computed at quantization time by scale_to_blocked)
1056- alpha = torch .tensor ([A_tensor_scale * B_tensor_scale ], dtype = torch .float32 , device = A_packed .device )
1057- D_out = torch .empty (M , N , dtype = torch .bfloat16 , device = A_packed .device )
1080+ if alpha_buf is not None :
1081+ alpha_buf .fill_ (A_tensor_scale * B_tensor_scale )
1082+ alpha = alpha_buf
1083+ else :
1084+ alpha = torch .tensor ([A_tensor_scale * B_tensor_scale ], dtype = torch .float32 , device = A_packed .device )
1085+
1086+ if D_out is None :
1087+ D_out = torch .empty (M , N , dtype = torch .bfloat16 , device = A_packed .device )
10581088
10591089 lib .cgemm_nvfp4_cutlass (
10601090 get_ptr (A_packed ),
@@ -1070,3 +1100,29 @@ def _(
10701100 )
10711101
10721102 return D_out .float ()
1103+
1104+
@register_kernel("bitsandbytes::gemm_nvfp4", "cuda")
def _(
    A_packed: torch.Tensor,
    B_packed: torch.Tensor,
    A_scales: torch.Tensor,
    B_scales: torch.Tensor,
    A_tensor_scale: float,
    B_tensor_scale: float,
    M: int,
    N: int,
    K: int,
) -> torch.Tensor:
    """NVFP4 GEMM: A @ B^T with block-scaled FP4 inputs.

    Op-registration shim over ``_gemm_nvfp4_impl`` with no pre-allocated
    output/alpha buffers, i.e. the allocating (non-graph-capture) path.
    """
    return _gemm_nvfp4_impl(
        A_packed=A_packed,
        B_packed=B_packed,
        A_scales=A_scales,
        B_scales=B_scales,
        A_tensor_scale=A_tensor_scale,
        B_tensor_scale=B_tensor_scale,
        M=M,
        N=N,
        K=K,
    )
0 commit comments