Skip to content

Commit d1f3d75

Browse files
TimDettmers and claude committed
Add out parameter to dequantize_kbit for CUDA graph compatibility
Factor dequant into _dequantize_kbit_impl that accepts a pre-allocated output tensor. Add dequantize_kbit_ in-place op variant following the existing pattern (dequantize_4bit.out, gemv_4bit.out). The public API dequantize_kbit() now accepts an optional out parameter — if provided, the kernel writes into it directly instead of allocating, which is required for CUDA graph replay. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f95a7f2 commit d1f3d75

File tree

5 files changed

+189
-8
lines changed

5 files changed

+189
-8
lines changed

bitsandbytes/_ops.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,3 +475,30 @@ def _(
475475
)
476476
num_blocks = -(n // -32)
477477
return torch.empty(num_blocks * 32, device=packed.device, dtype=dtype)
478+
479+
480+
torch.library.define(
481+
"bitsandbytes::dequantize_kbit_",
482+
"(Tensor packed, Tensor codebook, Tensor absmax, int k, int n, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)",
483+
)
484+
485+
486+
@register_fake("bitsandbytes::dequantize_kbit_")
487+
def _(
488+
packed: torch.Tensor,
489+
codebook: torch.Tensor,
490+
absmax: torch.Tensor,
491+
k: int,
492+
n: int,
493+
dtype: torch.dtype,
494+
out: torch.Tensor,
495+
) -> torch.Tensor:
496+
torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
497+
torch._check(
498+
absmax.dtype in (torch.float32, torch.uint8),
499+
lambda: f"absmax must be float32 or uint8 (E4M4), got {absmax.dtype}",
500+
)
501+
num_blocks = -(n // -32)
502+
torch._check(out.numel() >= num_blocks * 32, lambda: f"out must have at least {num_blocks * 32} elements")
503+
torch._check(out.dtype == dtype, lambda: f"out dtype {out.dtype} must match requested dtype {dtype}")
504+
return out

bitsandbytes/backends/cuda/ops.py

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -810,15 +810,15 @@ def _(A: torch.Tensor, codebook: torch.Tensor, k: int) -> tuple[torch.Tensor, to
810810
}
811811

812812

813-
@register_kernel("bitsandbytes::dequantize_kbit", "cuda")
814-
def _(
813+
def _dequantize_kbit_impl(
815814
packed: torch.Tensor,
816815
codebook: torch.Tensor,
817816
absmax: torch.Tensor,
818817
k: int,
819818
n: int,
820819
dtype: torch.dtype,
821-
) -> torch.Tensor:
820+
out: torch.Tensor,
821+
) -> None:
822822
torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
823823
torch._check(
824824
dtype in _KBIT_DTYPE_SUFFIX,
@@ -836,9 +836,6 @@ def _(
836836

837837
absmax = encode_absmax_e4m4(absmax)
838838

839-
num_blocks = -(n // -32)
840-
out = torch.empty(num_blocks * 32, device=packed.device, dtype=dtype)
841-
842839
tname = _KBIT_DTYPE_SUFFIX[dtype]
843840
aname = _KBIT_ABSMAX_SUFFIX[absmax.dtype]
844841

@@ -853,4 +850,31 @@ def _(
853850
_get_tensor_stream(packed),
854851
)
855852

853+
854+
@register_kernel("bitsandbytes::dequantize_kbit", "cuda")
def _(
    packed: torch.Tensor,
    codebook: torch.Tensor,
    absmax: torch.Tensor,
    k: int,
    n: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Out-of-place CUDA k-bit dequantize.

    Allocates a fresh output buffer padded to whole 32-element blocks,
    then delegates the actual kernel launch to ``_dequantize_kbit_impl``.
    """
    # ceil(n / 32) blocks of 32 elements each.
    padded_len = 32 * ((n + 31) // 32)
    out = torch.empty(padded_len, device=packed.device, dtype=dtype)
    _dequantize_kbit_impl(packed, codebook, absmax, k, n, dtype, out)
    return out
867+
868+
869+
@register_kernel("bitsandbytes::dequantize_kbit_", "cuda")
def _(
    packed: torch.Tensor,
    codebook: torch.Tensor,
    absmax: torch.Tensor,
    k: int,
    n: int,
    dtype: torch.dtype,
    out: torch.Tensor,
) -> torch.Tensor:
    """In-place CUDA k-bit dequantize.

    Writes directly into the caller-provided ``out`` buffer instead of
    allocating — this keeps the op capturable by CUDA graphs — and
    returns that same buffer.
    """
    _dequantize_kbit_impl(packed, codebook, absmax, k, n, dtype, out)
    return out

bitsandbytes/functional.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1179,6 +1179,7 @@ def dequantize_kbit(
11791179
k: int,
11801180
n: int,
11811181
dtype: torch.dtype = torch.float16,
1182+
out: Optional[Tensor] = None,
11821183
) -> Tensor:
11831184
"""Dequantize a k-bit blockwise quantized tensor.
11841185
@@ -1190,12 +1191,25 @@ def dequantize_kbit(
11901191
k: Bit width (2, 3, 4, or 5).
11911192
n: Number of original elements.
11921193
dtype: Output dtype. Defaults to float16.
1194+
out: Optional pre-allocated output tensor for CUDA graph compatibility.
1195+
Must have at least ceil(n/32)*32 elements and matching dtype.
11931196
11941197
Returns:
11951198
Dequantized tensor of shape (n,) with the given dtype.
11961199
"""
1197-
out = torch.ops.bitsandbytes.dequantize_kbit(packed, codebook, absmax, k, n, dtype)
1198-
return out[:n]
1200+
num_blocks = -(n // -32)
1201+
padded_n = num_blocks * 32
1202+
1203+
if out is not None:
1204+
if out.numel() < padded_n:
1205+
raise ValueError(f"out tensor has {out.numel()} elements, need at least {padded_n}")
1206+
if out.dtype != dtype:
1207+
raise ValueError(f"out dtype {out.dtype} does not match requested dtype {dtype}")
1208+
torch.ops.bitsandbytes.dequantize_kbit_(packed, codebook, absmax, k, n, dtype, out)
1209+
return out[:n]
1210+
1211+
result = torch.ops.bitsandbytes.dequantize_kbit(packed, codebook, absmax, k, n, dtype)
1212+
return result[:n]
11991213

12001214

12011215
@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)

spec.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Spec: Add `out` parameter to kbit dequantize for CUDA graph compatibility
2+
3+
## Problem
4+
5+
`dequantize_kbit` allocates a fresh output tensor on every call. This breaks
6+
CUDA graph capture, which requires kernels to write to the same memory address
7+
on every replay. The dequant is on the inference hot path and needs graph support.
8+
9+
## Changes
10+
11+
### 1. CUDA backend (`bitsandbytes/backends/cuda/ops.py`)
12+
13+
Factor the kernel call into `_dequantize_kbit_impl(packed, codebook, absmax, k, n, dtype, out)`:
14+
- Accepts a pre-allocated `out` tensor
15+
- Validates `out` shape, dtype, device
16+
- Calls the C kernel writing into `out`
17+
18+
The existing `dequantize_kbit` registered kernel allocates `out` then calls `_impl`.
19+
20+
### 2. torch op definition (`bitsandbytes/_ops.py`)
21+
22+
Add a second op `bitsandbytes::dequantize_kbit_` (in-place variant with trailing
23+
underscore, matching existing pattern for `dequantize_4bit`):
24+
- Signature: `(Tensor packed, Tensor codebook, Tensor absmax, int k, int n, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)`
25+
- Fake implementation validates shapes, returns `out`
26+
27+
### 3. Public API (`bitsandbytes/functional.py`)
28+
29+
Add optional `out` parameter to `dequantize_kbit()`:
30+
- `out: Optional[Tensor] = None`
31+
- If provided, validate shape/dtype/device, pass to impl
32+
- If None, allocate as before
33+
34+
### 4. Tests
35+
36+
Add test cases in `tests/test_kbit_quantization.py`:
37+
- Dequant with pre-allocated `out` tensor matches normal dequant
38+
- `out` tensor with wrong shape raises error
39+
- `out` tensor with wrong dtype raises error
40+
41+
## Files touched
42+
43+
- `bitsandbytes/backends/cuda/ops.py`
44+
- `bitsandbytes/_ops.py`
45+
- `bitsandbytes/functional.py`
46+
- `tests/test_kbit_quantization.py`
47+
48+
## Not in scope
49+
50+
- `quantize_kbit` out parameter (runs once at model load, not on hot path)

tests/test_kbit_quantization.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1398,3 +1398,69 @@ def test_storage_reduction(self):
13981398
# uint8 should use 4x less storage (ignoring padding)
13991399
assert absmax_e4.element_size() == 1
14001400
assert absmax_f32.element_size() == 4
1401+
1402+
1403+
class TestDequantizeKbitOut:
    """Tests for dequantize_kbit with a pre-allocated out tensor (CUDA graph compatibility).

    NOTE(review): quantize_kbit's return value is unpacked here as
    (packed, absmax, cb) but passed positionally to dequantize_kbit,
    whose underlying op schema reads (packed, codebook, absmax, ...).
    Either the unpacking names or the argument order looks swapped —
    confirm against quantize_kbit's actual return order.
    """

    @pytest.mark.parametrize("k", [2, 3, 4, 5])
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
    def test_out_matches_normal(self, k, dtype):
        """Dequant into a pre-allocated buffer must match the allocating path."""
        from bitsandbytes.functional import dequantize_kbit, quantize_kbit

        n = 1024
        src = torch.randn(n, dtype=dtype, device="cuda")
        packed, absmax, cb = quantize_kbit(src, k=k, absmax_format="e4m4")

        reference = dequantize_kbit(packed, absmax, cb, k=k, n=n, dtype=dtype)

        padded_len = 32 * ((n + 31) // 32)
        buf = torch.empty(padded_len, device="cuda", dtype=dtype)
        got = dequantize_kbit(packed, absmax, cb, k=k, n=n, dtype=dtype, out=buf)

        assert got.shape == reference.shape
        assert torch.equal(got, reference)
        # The result must be a view of the caller's buffer, not a fresh allocation.
        assert got.data_ptr() == buf.data_ptr()

    def test_out_reuse_same_buffer(self):
        """Two calls through the same out buffer must agree byte-for-byte."""
        from bitsandbytes.functional import dequantize_kbit, quantize_kbit

        n = 512
        src = torch.randn(n, dtype=torch.float16, device="cuda")
        packed, absmax, cb = quantize_kbit(src, k=4, absmax_format="e4m4")

        padded_len = 32 * ((n + 31) // 32)
        buf = torch.empty(padded_len, device="cuda", dtype=torch.float16)

        first_run = dequantize_kbit(packed, absmax, cb, k=4, n=n, dtype=torch.float16, out=buf)
        second_run = dequantize_kbit(packed, absmax, cb, k=4, n=n, dtype=torch.float16, out=buf)

        assert torch.equal(first_run, second_run)
        assert first_run.data_ptr() == second_run.data_ptr()

    def test_out_wrong_dtype_raises(self):
        """A dtype mismatch between out and the requested dtype raises ValueError."""
        from bitsandbytes.functional import dequantize_kbit, quantize_kbit

        n = 256
        src = torch.randn(n, dtype=torch.float16, device="cuda")
        packed, absmax, cb = quantize_kbit(src, k=4, absmax_format="e4m4")

        buf = torch.empty(256, device="cuda", dtype=torch.float32)
        with pytest.raises(ValueError, match="does not match"):
            dequantize_kbit(packed, absmax, cb, k=4, n=n, dtype=torch.float16, out=buf)

    def test_out_too_small_raises(self):
        """An undersized out buffer raises ValueError."""
        from bitsandbytes.functional import dequantize_kbit, quantize_kbit

        n = 256
        src = torch.randn(n, dtype=torch.float16, device="cuda")
        packed, absmax, cb = quantize_kbit(src, k=4, absmax_format="e4m4")

        buf = torch.empty(128, device="cuda", dtype=torch.float16)
        with pytest.raises(ValueError, match="need at least"):
            dequantize_kbit(packed, absmax, cb, k=4, n=n, dtype=torch.float16, out=buf)

0 commit comments

Comments
 (0)