Skip to content

Commit be06982

Browse files
TimDettmers and claude committed
Add quantize_nvfp4_raw() with device-side global_scale, no host sync
New variant accepts global_scale as a GPU tensor (1/abs_max) instead of a host float, avoiding the .item() GPU→CPU sync. Returns raw (packed, scales) without swizzling or QuantState — caller uses scale_to_blocked_batched. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent cec86d7 commit be06982

File tree

3 files changed

+95
-0
lines changed

3 files changed

+95
-0
lines changed

bitsandbytes/_ops.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,26 @@ def _(
470470
return packed, block_scales, ts_out
471471

472472

473+
# Device-side quantize variant: global_scale is a device tensor (no .item() sync).
# Returns (packed, block_scales) — row-major scales without swizzling.
torch.library.define(
    "bitsandbytes::cutlass_fused_quantize_nvfp4_raw",
    "(Tensor A, Tensor global_scale_dev) -> (Tensor, Tensor)",
)


@register_fake("bitsandbytes::cutlass_fused_quantize_nvfp4_raw")
def _(
    A: torch.Tensor,
    global_scale_dev: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Fake (meta) implementation: only output shapes/dtypes matter here.

    Two FP4 values pack into one output byte; one uint8 scale covers each
    16-element block, so the outputs have numel/2 and numel/16 entries.
    """
    n = A.numel()
    torch._check(n % 16 == 0, lambda: f"NVFP4 requires numel divisible by 16, got {n}")
    packed, block_scales = (
        torch.empty(size, dtype=torch.uint8, device=A.device)
        for size in (n // 2, n // 16)
    )
    return packed, block_scales
491+
492+
473493
# Scale reordering for CUTLASS block-scaled GEMM
474494
torch.library.define(
475495
"bitsandbytes::scale_to_blocked",

bitsandbytes/backends/cuda/ops.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -937,6 +937,54 @@ def _(
937937
return _fused_quantize_nvfp4_impl(A, tensor_scale)
938938

939939

940+
@register_kernel("bitsandbytes::cutlass_fused_quantize_nvfp4_raw", "cuda")
def _(
    A: torch.Tensor,
    global_scale_dev: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Device-side quantize: global_scale is a pre-computed device tensor.

    Returns (packed_data, block_scales_rowmajor) — no swizzling, no QuantState.
    The global_scale_dev tensor should contain 1.0/tensor_scale as a float32
    scalar on the GPU (0-dim or 1-element tensor).
    """
    A = A.contiguous()
    n = A.numel()
    torch._check(n % 16 == 0, lambda: f"NVFP4 requires numel divisible by 16, got {n}")
    torch._check(
        A.dtype == torch.bfloat16,
        lambda: f"CUTLASS fused quantize requires bfloat16, got {A.dtype}",
    )

    K = 16
    orig_M = n // K
    # Rows of zero-padding needed to reach the next multiple of 128.
    pad_rows = -orig_M % 128
    padded_M = orig_M + pad_rows

    A_flat = A
    if pad_rows:
        # Zero-pad trailing rows so the kernel always sees a 128-row multiple.
        A_2d = torch.nn.functional.pad(A.view(orig_M, K), (0, 0, 0, pad_rows))
        A_flat = A_2d.reshape(-1)

    # Zero-initialized so padded tail entries stay deterministic.
    packed_padded = torch.zeros(padded_M * K // 2, dtype=torch.uint8, device=A.device)
    scales_padded = torch.zeros(padded_M, dtype=torch.uint8, device=A.device)

    _fused_quantize_nvfp4_raw(
        A_flat,
        _get_rotation_matrix(A.device),
        packed_padded,
        scales_padded,
        global_scale_dev.to(dtype=torch.float32).contiguous(),
        padded_M,
    )

    # Drop the padded tail before returning (views into the padded buffers).
    if pad_rows:
        return packed_padded[: orig_M * K // 2], scales_padded[:orig_M]
    return packed_padded, scales_padded
986+
987+
940988
# Scale reordering for CUTLASS block-scaled GEMM
941989
@register_kernel("bitsandbytes::scale_to_blocked", "cuda")
942990
def _(scales: torch.Tensor, H: int, W: int) -> torch.Tensor:

bitsandbytes/functional.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,6 +1204,33 @@ def quantize_nvfp4(
12041204
return packed, state
12051205

12061206

1207+
def quantize_nvfp4_raw(
    A: torch.Tensor,
    global_scale_dev: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize to NVFP4 with a pre-computed device-side global scale.

    Unlike quantize_nvfp4(), this variant:
    - Takes global_scale as a device tensor (1/abs_max), no .item() sync
    - Skips scale_to_blocked (caller uses scale_to_blocked_batched instead)
    - Returns raw (packed_data, block_scales_rowmajor) without QuantState

    Args:
        A: Input tensor (bfloat16). Must have numel divisible by 16.
        global_scale_dev: Device tensor containing 1.0/tensor_scale (float32).

    Returns:
        Tuple of (packed_data [uint8], block_scales_rowmajor [uint8]).
    """
    # Flatten up front; the op operates on the 1-D element stream.
    flat = A.reshape(-1).contiguous()
    if flat.dtype != torch.bfloat16:
        flat = flat.to(torch.bfloat16)
    return torch.ops.bitsandbytes.cutlass_fused_quantize_nvfp4_raw(flat, global_scale_dev)
1232+
1233+
12071234
def dequantize_nvfp4(
12081235
packed_data: torch.Tensor,
12091236
quant_state: NVFP4QuantState,

0 commit comments

Comments
 (0)