Skip to content

Commit 9d11e85

Browse files
TimDettmers and claude
committed
Add out parameter to kbit_scalar_gemv_tiled for CUDA graph compat
Adds kbit_scalar_gemv_tiled_ op that writes to a pre-allocated output buffer, eliminating the allocate+copy in the kbit_linear dispatch path. The CUDA kernel already accepted an output pointer — this just wires it through the torch.library op layer. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8eadb50 commit 9d11e85

File tree

3 files changed

+67
-5
lines changed

3 files changed

+67
-5
lines changed

bitsandbytes/_ops.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,3 +783,31 @@ def _(
783783
torch._check(A.dtype in (torch.float16, torch.bfloat16), lambda: f"A must be fp16 or bf16, got {A.dtype}")
784784
M = A.shape[0]
785785
return torch.empty(M, N, device=A.device, dtype=A.dtype)
786+
787+
788+
# K-bit scalar GEMV tiled with pre-allocated output (CUDA graph compatible)
789+
790+
torch.library.define(
791+
"bitsandbytes::kbit_scalar_gemv_tiled_",
792+
"(Tensor A, Tensor B_packed_tiled, Tensor B_absmax_tiled, Tensor codebook, int K_dim, int N, int k, "
793+
"Tensor(a!) out) -> Tensor(a!)",
794+
)
795+
796+
797+
@register_fake("bitsandbytes::kbit_scalar_gemv_tiled_")
798+
def _(
799+
A: torch.Tensor,
800+
B_packed_tiled: torch.Tensor,
801+
B_absmax_tiled: torch.Tensor,
802+
codebook: torch.Tensor,
803+
K_dim: int,
804+
N: int,
805+
k: int,
806+
out: torch.Tensor,
807+
) -> torch.Tensor:
808+
torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
809+
torch._check(A.dim() == 2 and A.shape[1] == K_dim, lambda: "A must be [M, K_dim]")
810+
torch._check(A.shape[0] <= 4, lambda: f"kbit_scalar_gemv_tiled_ supports M<=4, got {A.shape[0]}")
811+
torch._check(A.dtype in (torch.float16, torch.bfloat16), lambda: f"A must be fp16 or bf16, got {A.dtype}")
812+
torch._check(out.dtype == A.dtype, lambda: f"out dtype {out.dtype} must match A dtype {A.dtype}")
813+
return out

bitsandbytes/backends/cuda/ops.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1332,3 +1332,39 @@ def _(
13321332
ct.c_int(N),
13331333
)
13341334
return out
1335+
1336+
1337+
@register_kernel("bitsandbytes::kbit_scalar_gemv_tiled_", "cuda")
def _(
    A: torch.Tensor,
    B_packed_tiled: torch.Tensor,
    B_absmax_tiled: torch.Tensor,
    codebook: torch.Tensor,
    K_dim: int,
    N: int,
    k: int,
    out: torch.Tensor,
) -> torch.Tensor:
    """CUDA dispatch for the in-place tiled scalar GEMV.

    Writes the M x N result directly into the caller-provided ``out`` buffer
    (no allocation, so the op is CUDA-graph capturable) and returns it.
    The kernel name is selected from A's dtype, B_absmax_tiled's dtype, and
    the bit width ``k``.
    """
    torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
    torch._check(
        A.dtype in (torch.float16, torch.bfloat16),
        lambda: f"kbit_scalar_gemv_tiled_ supports float16 and bfloat16, got {A.dtype}",
    )

    M = A.shape[0]
    # New safety checks: the C kernel receives a bare pointer and writes M*N
    # elements unconditionally, so a wrong buffer here is memory corruption.
    torch._check(M <= 4, lambda: f"kbit_scalar_gemv_tiled_ supports M<=4, got {M}")
    torch._check(out.dtype == A.dtype, lambda: f"out dtype {out.dtype} must match A dtype {A.dtype}")
    torch._check(
        out.shape == (M, N) and out.is_contiguous(),
        lambda: f"out must be a contiguous ({M}, {N}) tensor, got shape {tuple(out.shape)}",
    )

    dtype_suffix = "fp16" if A.dtype == torch.float16 else "bf16"
    abs_suffix = "_fp16abs" if B_absmax_tiled.dtype == torch.float16 else ""

    with _cuda_device_of(A):
        # e.g. ckbit_scalar_gemv_tiled_fp16_fp16abs_k4
        fn = getattr(lib, f"ckbit_scalar_gemv_tiled_{dtype_suffix}{abs_suffix}_k{k}")
        fn(
            get_ptr(A),
            get_ptr(B_packed_tiled),
            get_ptr(B_absmax_tiled),
            get_ptr(codebook),
            get_ptr(out),
            ct.c_int(M),
            ct.c_int(K_dim),
            ct.c_int(N),
        )
    return out

bitsandbytes/functional.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,11 +1299,9 @@ def kbit_linear(
12991299
if M <= 4:
13001300
# Scalar GEMV: tiled layout, one column per block
13011301
if out is not None:
1302-
# scalar GEMV doesn't have an out variant for tiled yet,
1303-
# so compute into temp and copy
1304-
result = torch.ops.bitsandbytes.kbit_scalar_gemv_tiled(A, B_packed, B_absmax, codebook, K_dim, N, k)
1305-
out[:M, :N].copy_(result)
1306-
return out[:M]
1302+
return torch.ops.bitsandbytes.kbit_scalar_gemv_tiled_(
1303+
A, B_packed, B_absmax, codebook, K_dim, N, k, out[:M]
1304+
)
13071305
return torch.ops.bitsandbytes.kbit_scalar_gemv_tiled(A, B_packed, B_absmax, codebook, K_dim, N, k)
13081306

13091307
if M <= 16:

0 commit comments

Comments
 (0)