Skip to content

Commit 24406d2

Browse files
TimDettmers and Claude
committed
Add Stage 6 production kernel with bf16 support (139 tests pass)
New production kernel (kbit_gemm_prod) templates on scalar_t to support both fp16 and bf16 activation/output types. Uses the same split-K architecture as Stage 5 with type-dispatched MMA instructions: - fp16: mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 - bf16: mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 Helper structs (ScalarOps, pack_two, mma_m16n8k16) abstract type-specific operations. 8 kernel variants instantiated (4 K values x 2 dtypes). fp16 path matches Stage 5 split-K output bit-for-bit. bf16 path matches Python reference within tolerance for all K values. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent fdcec9c commit 24406d2

File tree

5 files changed

+611
-0
lines changed

5 files changed

+611
-0
lines changed

bitsandbytes/_ops.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,3 +573,29 @@ def _(
573573
torch._check(A.dim() == 2 and A.shape[1] == K_dim, lambda: "A must be [M, K_dim]")
574574
M = A.shape[0]
575575
return torch.empty(M, N, device=A.device, dtype=A.dtype)
576+
577+
578+
# K-bit fused dequant + GEMM (production, Stage 6: fp16 + bf16)
torch.library.define(
    "bitsandbytes::kbit_gemm_prod",
    "(Tensor A, Tensor B_packed, Tensor B_absmax, Tensor codebook, int K_dim, int N, int k, int k_chunks) -> Tensor",
)


@register_fake("bitsandbytes::kbit_gemm_prod")
def _(
    A: torch.Tensor,
    B_packed: torch.Tensor,
    B_absmax: torch.Tensor,
    codebook: torch.Tensor,
    K_dim: int,
    N: int,
    k: int,
    k_chunks: int,
) -> torch.Tensor:
    """Fake (meta) implementation used for shape inference and tracing.

    Validates the bit width, A's layout, and A's dtype, then returns an
    uninitialized [M, N] tensor matching A's device and dtype.
    """
    torch._check(2 <= k <= 5, lambda: f"k must be 2-5, got {k}")
    torch._check(A.dim() == 2 and A.shape[1] == K_dim, lambda: "A must be [M, K_dim]")
    torch._check(A.dtype in (torch.float16, torch.bfloat16), lambda: f"A must be fp16 or bf16, got {A.dtype}")
    return torch.empty(A.shape[0], N, device=A.device, dtype=A.dtype)

bitsandbytes/backends/cuda/ops.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,3 +1021,61 @@ def _(
10211021
)
10221022

10231023
return C
1024+
1025+
1026+
@register_kernel("bitsandbytes::kbit_gemm_prod", "cuda")
def _(
    A: torch.Tensor,
    B_packed: torch.Tensor,
    B_absmax: torch.Tensor,
    codebook: torch.Tensor,
    K_dim: int,
    N: int,
    k: int,
    k_chunks: int,
) -> torch.Tensor:
    """CUDA implementation of the Stage 6 production k-bit fused dequant + GEMM.

    Dispatches to one of the compiled kernel variants selected by A's dtype
    (fp16/bf16) and the bit width ``k`` (2-5), computing ``C = A @ dequant(B)``
    over 16x128 output tiles. When ``k_chunks > 1`` a split-K path is used:
    partial products go into an fp32 workspace and per-tile counters are
    allocated (presumably so the kernel can detect the last finished chunk
    per tile — confirm against the CUDA source).

    Args:
        A: [M, K_dim] activation matrix, float16 or bfloat16.
        B_packed: int32 tensor holding the k-bit packed weights.
        B_absmax: uint8 (E4M4) per-block absmax scales.
        codebook: float32 dequantization codebook.
        K_dim: inner (reduction) dimension.
        N: number of output columns; must be a multiple of the 128-wide tile.
        k: quantization bit width, 2-5.
        k_chunks: number of split-K chunks, >= 1.

    Returns:
        C: [M, N] tensor with A's dtype on A's device.
    """
    torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
    # Consistency with the fake (meta) registration: validate A's shape here
    # too, so eager CUDA calls fail with the same clear message instead of
    # handing a mismatched K_dim to the raw kernel.
    torch._check(A.dim() == 2 and A.shape[1] == K_dim, lambda: "A must be [M, K_dim]")
    torch._check(
        A.dtype in (torch.float16, torch.bfloat16),
        lambda: f"kbit_gemm_prod supports float16 and bfloat16, got {A.dtype}",
    )
    torch._check(B_packed.dtype == torch.int32, lambda: f"B_packed must be int32, got {B_packed.dtype}")
    torch._check(B_absmax.dtype == torch.uint8, lambda: f"B_absmax must be uint8 (E4M4), got {B_absmax.dtype}")
    torch._check(codebook.dtype == torch.float32, lambda: f"codebook must be float32, got {codebook.dtype}")
    torch._check(N % 128 == 0, lambda: f"N ({N}) must be divisible by 128")
    torch._check(k_chunks >= 1, lambda: f"k_chunks must be >= 1, got {k_chunks}")

    M = A.shape[0]
    C = torch.empty(M, N, device=A.device, dtype=A.dtype)

    # Output tile geometry; must match the TILE_M/TILE_N baked into the kernel.
    TILE_M = 16
    TILE_N = 128
    m_tiles = (M + TILE_M - 1) // TILE_M
    n_tiles = N // TILE_N

    if k_chunks > 1:
        # Split-K: fp32 accumulation workspace and one counter per output
        # tile. Both must start zeroed for the reduction to be correct.
        C_workspace = torch.zeros(M, N, device=A.device, dtype=torch.float32)
        tile_counters = torch.zeros(m_tiles * n_tiles, device=A.device, dtype=torch.int32)
    else:
        # Single-chunk path: the kernel writes C directly; pass empty
        # placeholders so the launcher still receives valid pointers.
        C_workspace = torch.empty(0, device=A.device, dtype=torch.float32)
        tile_counters = torch.empty(0, device=A.device, dtype=torch.int32)

    dtype_suffix = "fp16" if A.dtype == torch.float16 else "bf16"

    with _cuda_device_of(A):
        # One exported symbol per (dtype, k) pair, e.g. ckbit_gemm_prod_fp16_k4.
        fn = getattr(lib, f"ckbit_gemm_prod_{dtype_suffix}_k{k}")
        fn(
            get_ptr(A),
            get_ptr(B_packed),
            get_ptr(B_absmax),
            get_ptr(codebook),
            get_ptr(C),
            get_ptr(C_workspace),
            get_ptr(tile_counters),
            ct.c_int(M),
            ct.c_int(K_dim),
            ct.c_int(N),
            ct.c_int(k_chunks),
        )

    return C

0 commit comments

Comments
 (0)