
Commit bff83e6

TimDettmers and claude committed
Add Stage 2 repack kernel, Stage 3 minimal GEMM kernel (76 tests pass)
Stage 2: CUDA repack kernel transforms flat bit-plane packed data into GEMM-tiled layout. Bit-exact match with Python reference for all K values (2,3,4,5) and matrix sizes.

Stage 3: Minimal fused kbit dequant + GEMM kernel using m16n8k16 tensor core MMA instructions with fp32 accumulation. Synchronous shared memory loads, 1 block per output tile, no pipeline. Validates tiled addressing, bit-plane extraction, codebook lookup via __shfl_sync, MMA fragment assembly, and output write.

Key fix: A-fragment register ordering for m16n8k16 must be {row_lo/k_lo, row_hi/k_lo, row_lo/k_hi, row_hi/k_hi}, NOT the naive {row_lo/k_lo, row_lo/k_hi, row_hi/k_lo, row_hi/k_hi}. This follows from the Turing decomposition into two m16n8k8 operations where a[0],a[1] handle k_lo and a[2],a[3] handle k_hi.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
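The register ordering called out above matches the PTX m16n8k16 A-fragment layout. The short Python sketch below is an illustration only (the helper name is made up, not part of this commit); it enumerates, for one lane, which (row, k) elements of the 16x16 A tile land in each of the four fragment registers:

# Illustration only (hypothetical helper, not part of this commit): per-lane
# (row, k) coordinates covered by each A-fragment register of mma m16n8k16,
# following the ordering described above:
#   a[0]=row_lo/k_lo, a[1]=row_hi/k_lo, a[2]=row_lo/k_hi, a[3]=row_hi/k_hi
def a_fragment_coords(lane: int) -> list[list[tuple[int, int]]]:
    group = lane // 4            # 8 groups of 4 lanes
    tig = lane % 4               # thread index within the group
    coords = []
    for reg in range(4):         # a[0]..a[3], each holding two fp16 values
        row = group + (8 if reg in (1, 3) else 0)    # a[1], a[3] cover row_hi
        k0 = 2 * tig + (8 if reg in (2, 3) else 0)   # a[2], a[3] cover k_hi
        coords.append([(row, k0), (row, k0 + 1)])
    return coords

# Example: lane 0 holds (0,0),(0,1) in a[0]; (8,0),(8,1) in a[1];
# (0,8),(0,9) in a[2]; and (8,8),(8,9) in a[3].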
1 parent f95a7f2 commit bff83e6

5 files changed

Lines changed: 1440 additions & 0 deletions


bitsandbytes/_ops.py

Lines changed: 49 additions & 0 deletions
@@ -475,3 +475,52 @@ def _(
    )
    num_blocks = -(n // -32)
    return torch.empty(num_blocks * 32, device=packed.device, dtype=dtype)


# K-bit repack: flat bit-plane layout -> GEMM-tiled layout
torch.library.define(
    "bitsandbytes::repack_kbit",
    "(Tensor packed_flat, Tensor absmax_flat, int K_dim, int N, int k) -> (Tensor, Tensor)",
)


@register_fake("bitsandbytes::repack_kbit")
def _(packed_flat: torch.Tensor, absmax_flat: torch.Tensor, K_dim: int, N: int, k: int) -> tuple[torch.Tensor, torch.Tensor]:
    torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
    TILE_K, TILE_N, BLOCKSIZE = 64, 128, 32
    torch._check(N % TILE_N == 0, lambda: f"N ({N}) must be divisible by {TILE_N}")
    torch._check(K_dim % BLOCKSIZE == 0, lambda: f"K_dim ({K_dim}) must be divisible by {BLOCKSIZE}")
    K_dim_padded = ((K_dim + TILE_K - 1) // TILE_K) * TILE_K
    k_tiles = K_dim_padded // TILE_K
    n_tiles = N // TILE_N
    k_blocks_per_tile = TILE_K // BLOCKSIZE
    total_words = k_tiles * n_tiles * TILE_N * k_blocks_per_tile * k
    total_absmax = k_tiles * n_tiles * TILE_N * k_blocks_per_tile
    packed_tiled = torch.empty(total_words, device=packed_flat.device, dtype=torch.int32)
    absmax_tiled = torch.empty(total_absmax, device=packed_flat.device, dtype=torch.uint8)
    return packed_tiled, absmax_tiled


# K-bit fused dequant + GEMM: C[M,N] = A[M,K_dim] * W_kbit^T
torch.library.define(
    "bitsandbytes::kbit_gemm",
    "(Tensor A, Tensor B_packed, Tensor B_absmax, Tensor codebook, int K_dim, int N, int k) -> Tensor",
)


@register_fake("bitsandbytes::kbit_gemm")
def _(
    A: torch.Tensor,
    B_packed: torch.Tensor,
    B_absmax: torch.Tensor,
    codebook: torch.Tensor,
    K_dim: int,
    N: int,
    k: int,
) -> torch.Tensor:
    torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
    torch._check(A.dim() == 2 and A.shape[1] == K_dim, lambda: "A must be [M, K_dim]")
    M = A.shape[0]
    return torch.empty(M, N, device=A.device, dtype=A.dtype)
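As a quick sanity check on the sizing math in the fake registration above, here is a worked example with illustrative values (not part of the diff); the tiled buffer comes out to exactly k bits per weight plus one absmax per 32-weight block:

# Worked example of the buffer-size formulas above (illustrative values only).
K_dim, N, k = 4096, 4096, 4
TILE_K, TILE_N, BLOCKSIZE = 64, 128, 32

k_tiles = K_dim // TILE_K                 # 64 (no padding needed here)
n_tiles = N // TILE_N                     # 32
k_blocks_per_tile = TILE_K // BLOCKSIZE   # 2

total_words = k_tiles * n_tiles * TILE_N * k_blocks_per_tile * k   # 2_097_152 int32 words
total_absmax = k_tiles * n_tiles * TILE_N * k_blocks_per_tile      # 524_288 absmax entries

assert total_words * 32 == K_dim * N * k        # k bits per weight
assert total_absmax * BLOCKSIZE == K_dim * N    # one absmax per 32-weight block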

bitsandbytes/backends/cuda/ops.py

Lines changed: 77 additions & 0 deletions
@@ -854,3 +854,80 @@ def _(
        )

    return out


@register_kernel("bitsandbytes::repack_kbit", "cuda")
def _(
    packed_flat: torch.Tensor,
    absmax_flat: torch.Tensor,
    K_dim: int,
    N: int,
    k: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
    torch._check(packed_flat.dtype == torch.int32, lambda: f"packed_flat must be int32, got {packed_flat.dtype}")
    torch._check(absmax_flat.dtype == torch.float32, lambda: f"absmax_flat must be float32, got {absmax_flat.dtype}")

    TILE_K, TILE_N, BLOCKSIZE = 64, 128, 32
    torch._check(N % TILE_N == 0, lambda: f"N ({N}) must be divisible by {TILE_N}")
    torch._check(K_dim % BLOCKSIZE == 0, lambda: f"K_dim ({K_dim}) must be divisible by {BLOCKSIZE}")

    K_dim_padded = ((K_dim + TILE_K - 1) // TILE_K) * TILE_K
    k_tiles = K_dim_padded // TILE_K
    n_tiles = N // TILE_N
    k_blocks_per_tile = TILE_K // BLOCKSIZE
    total_words = k_tiles * n_tiles * TILE_N * k_blocks_per_tile * k
    total_absmax = k_tiles * n_tiles * TILE_N * k_blocks_per_tile

    # Zero-fill for padding regions (when K_dim is not multiple of TILE_K)
    packed_tiled = torch.zeros(total_words, device=packed_flat.device, dtype=torch.int32)
    absmax_tiled = torch.zeros(total_absmax, device=packed_flat.device, dtype=torch.uint8)

    with _cuda_device_of(packed_flat):
        fn = getattr(lib, f"crepack_kbit_k{k}")
        fn(
            get_ptr(packed_flat),
            get_ptr(absmax_flat),
            get_ptr(packed_tiled),
            get_ptr(absmax_tiled),
            ct.c_int(K_dim),
            ct.c_int(N),
        )

    return packed_tiled, absmax_tiled


@register_kernel("bitsandbytes::kbit_gemm", "cuda")
def _(
    A: torch.Tensor,
    B_packed: torch.Tensor,
    B_absmax: torch.Tensor,
    codebook: torch.Tensor,
    K_dim: int,
    N: int,
    k: int,
) -> torch.Tensor:
    torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
    torch._check(A.dtype == torch.float16, lambda: f"kbit_gemm currently supports float16 only, got {A.dtype}")
    torch._check(B_packed.dtype == torch.int32, lambda: f"B_packed must be int32, got {B_packed.dtype}")
    torch._check(B_absmax.dtype == torch.uint8, lambda: f"B_absmax must be uint8 (E4M4), got {B_absmax.dtype}")
    torch._check(codebook.dtype == torch.float32, lambda: f"codebook must be float32, got {codebook.dtype}")
    torch._check(N % 128 == 0, lambda: f"N ({N}) must be divisible by 128")

    M = A.shape[0]
    C = torch.empty(M, N, device=A.device, dtype=torch.float16)

    with _cuda_device_of(A):
        fn = getattr(lib, f"ckbit_gemm_fp16_k{k}")
        fn(
            get_ptr(A),
            get_ptr(B_packed),
            get_ptr(B_absmax),
            get_ptr(codebook),
            get_ptr(C),
            ct.c_int(M),
            ct.c_int(K_dim),
            ct.c_int(N),
        )

    return C
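A minimal sketch of the intended call sequence through these two ops, shapes only. The dummy inputs are stand-ins: the k-bit quantize step that would produce packed_flat, absmax_flat, and the codebook is not part of this diff, the flat sizing (k int32 words and one fp32 absmax per 32-weight block) is inferred from the repack math above, and a CUDA build that ships these kernels is assumed.

import torch

M, K_dim, N, k = 16, 4096, 4096, 4
BLOCKSIZE = 32
n_blocks = (K_dim * N) // BLOCKSIZE

# Dummy flat bit-plane data (a real quantize step would fill these).
packed_flat = torch.zeros(n_blocks * k, dtype=torch.int32, device="cuda")
absmax_flat = torch.ones(n_blocks, dtype=torch.float32, device="cuda")

packed_tiled, absmax_tiled = torch.ops.bitsandbytes.repack_kbit(
    packed_flat, absmax_flat, K_dim, N, k
)

A = torch.randn(M, K_dim, dtype=torch.float16, device="cuda")
codebook = torch.randn(2**k, dtype=torch.float32, device="cuda")  # assumed 2**k entries

C = torch.ops.bitsandbytes.kbit_gemm(
    A, packed_tiled, absmax_tiled, codebook, K_dim, N, k
)
assert C.shape == (M, N) and C.dtype == torch.float16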
