Skip to content

Commit 72a374c

Browse files
TimDettmers and claude committed
Add tiled scalar GEMV v2 with shared memory + split-K
New kernel kbit_scalar_gemv_tiled_v2 that cooperatively loads full tiles into double-buffered shared memory via cp.async, then each thread reads its column from shared memory. 128 threads per block, each handling one column within the TILE_N=128 N-tile. Split-K for SM occupancy: grid = n_tiles * k_splits with atomicAdd to float32 workspace. Includes fix for empty-split bug where trailing k-splits with no work would prevent the last-block output conversion. Registered as bitsandbytes::kbit_scalar_gemv_v2_ with explicit workspace and tile_counters parameters for CUDA graph compatibility. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d834367 commit 72a374c

File tree

4 files changed

+474
-0
lines changed

4 files changed

+474
-0
lines changed

bitsandbytes/_ops.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -811,3 +811,37 @@ def _(
811811
torch._check(A.dtype in (torch.float16, torch.bfloat16), lambda: f"A must be fp16 or bf16, got {A.dtype}")
812812
torch._check(out.dtype == A.dtype, lambda: f"out dtype {out.dtype} must match A dtype {A.dtype}")
813813
return out
814+
815+
816+
# K-bit scalar GEMV v2: tiled with shared memory + split-K (CUDA graph compatible)
# The workspace and tile_counters are explicit tensor parameters (rather than
# allocated inside the op) so the op can be captured into a CUDA graph.
torch.library.define(
    "bitsandbytes::kbit_scalar_gemv_v2_",
    "(Tensor A, Tensor B_packed_tiled, Tensor B_absmax_tiled, Tensor codebook, int K_dim, int N, int k, "
    "Tensor(a!) out, Tensor C_workspace, Tensor tile_counters) -> Tensor(a!)",
)


@register_fake("bitsandbytes::kbit_scalar_gemv_v2_")
def _(
    A: torch.Tensor,
    B_packed_tiled: torch.Tensor,
    B_absmax_tiled: torch.Tensor,
    codebook: torch.Tensor,
    K_dim: int,
    N: int,
    k: int,
    out: torch.Tensor,
    C_workspace: torch.Tensor,
    tile_counters: torch.Tensor,
) -> torch.Tensor:
    """Fake (meta) implementation: validates argument metadata only.

    Checks mirror the CUDA kernel's preconditions: 2 <= k <= 5 bits,
    A is a [M, K_dim] fp16/bf16 matrix with M <= 4, out matches A's dtype,
    the float32 split-K accumulation workspace, and the int32 per-tile
    completion counters. Returns `out` unchanged (it is mutated in-place
    by the real kernel).
    """
    torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
    torch._check(A.dim() == 2 and A.shape[1] == K_dim, lambda: "A must be [M, K_dim]")
    torch._check(A.shape[0] <= 4, lambda: f"kbit_scalar_gemv_v2_ supports M<=4, got {A.shape[0]}")
    torch._check(A.dtype in (torch.float16, torch.bfloat16), lambda: f"A must be fp16 or bf16, got {A.dtype}")
    torch._check(out.dtype == A.dtype, lambda: f"out dtype {out.dtype} must match A dtype {A.dtype}")
    torch._check(C_workspace.dtype == torch.float32, lambda: f"C_workspace must be float32, got {C_workspace.dtype}")
    torch._check(
        tile_counters.dtype == torch.int32, lambda: f"tile_counters must be int32, got {tile_counters.dtype}"
    )
    return out

bitsandbytes/backends/cuda/ops.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,3 +1375,48 @@ def _(
13751375
_get_tensor_stream(A),
13761376
)
13771377
return out
1378+
1379+
1380+
@register_kernel("bitsandbytes::kbit_scalar_gemv_v2_", "cuda")
def _(
    A: torch.Tensor,
    B_packed_tiled: torch.Tensor,
    B_absmax_tiled: torch.Tensor,
    codebook: torch.Tensor,
    K_dim: int,
    N: int,
    k: int,
    out: torch.Tensor,
    C_workspace: torch.Tensor,
    tile_counters: torch.Tensor,
) -> torch.Tensor:
    """CUDA implementation of the tiled scalar GEMV v2 (shared memory + split-K).

    Dispatches to the `ckbit_scalar_gemv_v2_{fp16|bf16}[_fp16abs]_k{k}` C entry
    point selected by A's dtype, B_absmax_tiled's dtype, and the bit width k.
    Mutates `out` in-place and returns it. `C_workspace` (float32) and
    `tile_counters` (int32) are zeroed here on every call because the kernel
    accumulates via atomicAdd; passing them in explicitly keeps the op CUDA
    graph compatible.
    """
    torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
    torch._check(
        A.dtype in (torch.float16, torch.bfloat16),
        lambda: f"kbit_scalar_gemv_v2_ supports float16 and bfloat16, got {A.dtype}",
    )

    M = A.shape[0]
    # The native launcher clamps its M template parameter to 4; rows beyond
    # that would be silently dropped, so reject M > 4 here (mirrors the fake impl).
    torch._check(M <= 4, lambda: f"kbit_scalar_gemv_v2_ supports M<=4, got {M}")
    dtype_suffix = "fp16" if A.dtype == torch.float16 else "bf16"
    abs_suffix = "_fp16abs" if B_absmax_tiled.dtype == torch.float16 else ""

    # Zero workspace and counters (required by atomicAdd accumulation)
    C_workspace.zero_()
    tile_counters.zero_()

    with _cuda_device_of(A):
        fn = getattr(lib, f"ckbit_scalar_gemv_v2_{dtype_suffix}{abs_suffix}_k{k}")
        fn(
            get_ptr(A),
            get_ptr(B_packed_tiled),
            get_ptr(B_absmax_tiled),
            get_ptr(codebook),
            get_ptr(out),
            get_ptr(C_workspace),
            get_ptr(tile_counters),
            ct.c_int(M),
            ct.c_int(K_dim),
            ct.c_int(N),
            _get_tensor_stream(A),
        )
    return out

csrc/ops.cu

Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2162,6 +2162,278 @@ void kbitScalarGemvTiled(
21622162
#undef LAUNCH_SCALAR_GEMV_TILED
21632163
}
21642164

2165+
// ---- Tiled Scalar GEMV v2 ----
// Cooperative tile loading into shared memory with split-K for occupancy.
// Grid = n_tiles * k_splits, Block = 128 threads (4 warps).
// Each thread handles one column within an N-tile.
// Double-buffered cp.async pipeline for B + absmax tiles.
// A loaded directly from global memory (L1 broadcast across columns).
//
// Preconditions:
//   - C_workspace and tile_counters must be zeroed before launch when
//     k_splits > 1 (the Python wrapper zeroes them on every call).
//   - n_tiles = N / TILE_N truncates, so N is presumably a multiple of
//     TILE_N = 128 — columns beyond n_tiles*TILE_N would never be computed.
//     TODO(review): confirm at call sites.
//   - A rows are read with 16-byte int4 loads, so A must be 16-byte aligned
//     and K_dim a multiple of the 32-element quantization block — TODO confirm.

template <int K_BITS, int M_VAL, typename scalar_t = half, typename ABSMAX_T = unsigned char>
__global__ void __launch_bounds__(128, 8) kbit_scalar_gemv_tiled_v2(
    const scalar_t* __restrict__ A,
    const unsigned int* __restrict__ B_packed,
    const ABSMAX_T* __restrict__ B_absmax,
    const float* __restrict__ codebook,
    scalar_t* __restrict__ C,
    float* __restrict__ C_workspace,
    int* __restrict__ tile_counters,
    const int M, const int K_dim, const int N, const int k_splits
) {
    constexpr int BS = 32; // quantization block size
    constexpr int TILE_K = 64;
    constexpr int TILE_N = 128;
    constexpr int BLOCK_DIM = 128; // threads per block
    constexpr int NUM_WARPS = 4;   // (currently unused)
    constexpr int M_MAX = 4;       // (currently unused)
    constexpr int KB_PER_TILE = TILE_K / BS; // 2 quantization blocks per K-tile
    // Per-column packed words: K_BITS bit-plane words per quantization block.
    constexpr int B_COL_WORDS = KB_PER_TILE * K_BITS;
    constexpr int B_STAGE_WORDS = TILE_N * B_COL_WORDS;
    constexpr int B_STAGE_BYTES = B_STAGE_WORDS * (int)sizeof(unsigned int);
    constexpr int ABS_STAGE_ELEMS = TILE_N * KB_PER_TILE;
    constexpr int ABS_STAGE_BYTES = ABS_STAGE_ELEMS * (int)sizeof(ABSMAX_T);
    // Round absmax region up to 16 bytes so each stage stays int4-aligned.
    constexpr int ABS_STAGE_ALIGNED = (ABS_STAGE_BYTES + 15) & ~15;
    constexpr int STAGE_BYTES = B_STAGE_BYTES + ABS_STAGE_ALIGNED;

    const int n_tiles = N / TILE_N;
    const int k_tiles = (K_dim + TILE_K - 1) / TILE_K;
    const int tiles_per_split = (k_tiles + k_splits - 1) / k_splits;

    // Work item: which N-tile and K-split
    const int work_id = blockIdx.x;
    const int n_tile = work_id / k_splits;
    const int ks_id = work_id % k_splits;
    const int n_base = n_tile * TILE_N;

    const int kt_start = ks_id * tiles_per_split;
    const int kt_end = min(kt_start + tiles_per_split, k_tiles);
    // Defensive: blocks with no K-tiles exit before any barrier (uniform per
    // block). The launcher recomputes k_splits so this should never trigger —
    // an exiting block would skip the tile_counters increment below and the
    // last-block output conversion would never run.
    if (kt_start >= k_tiles) return;

    // This thread's column within the tile
    const int col_in_tile = threadIdx.x; // 0..127
    const int col = n_base + col_in_tile;

    const int warp_id = threadIdx.x / 32; // (currently unused)
    const int lane_id = threadIdx.x % 32;

    // Codebook in registers (shuffle-based lookup): lane i of every warp holds
    // codebook[i]; decoded indices are looked up with __shfl_sync below.
    // Requires K_BITS <= 5 so all 2^K_BITS entries fit in one warp.
    float cb = (lane_id < (1 << K_BITS)) ? codebook[lane_id] : 0.0f;

    // Double-buffered shared memory: two stages of [B tile | absmax tile].
    extern __shared__ char smem[];
    auto sh_b = [&](int stage) -> unsigned int* {
        return reinterpret_cast<unsigned int*>(smem + stage * STAGE_BYTES);
    };
    auto sh_abs = [&](int stage) -> ABSMAX_T* {
        return reinterpret_cast<ABSMAX_T*>(smem + stage * STAGE_BYTES + B_STAGE_BYTES);
    };

    // Per-row accumulators (kept in float for precision).
    float acc[M_VAL];
#pragma unroll
    for (int m = 0; m < M_VAL; m++) acc[m] = 0.0f;

    // Fetch tile: cooperative cp.async loading of B + absmax into `stage`.
    auto fetch_tile = [&](int stage, int kt) {
        const int tile_idx = kt * n_tiles + n_tile; // K-major tile ordering

        // B tile via cp.async (all 128 threads cooperatively load 16B chunks)
        const int b_global_base = tile_idx * B_STAGE_WORDS;
        constexpr int B_INT4S = B_STAGE_BYTES / 16;
        const int4* b_src = reinterpret_cast<const int4*>(B_packed + b_global_base);
        int4* b_dst = reinterpret_cast<int4*>(sh_b(stage));
        for (int i = threadIdx.x; i < B_INT4S; i += BLOCK_DIM)
            cp_async_cg_16(&b_dst[i], &b_src[i]);

        // Absmax via cp.async
        const int abs_global_base = tile_idx * ABS_STAGE_ELEMS;
        constexpr int ABS_INT4S = (ABS_STAGE_BYTES + 15) / 16;
        const int4* abs_src = reinterpret_cast<const int4*>(B_absmax + abs_global_base);
        int4* abs_dst = reinterpret_cast<int4*>(sh_abs(stage));
        for (int i = threadIdx.x; i < ABS_INT4S; i += BLOCK_DIM)
            cp_async_cg_16(&abs_dst[i], &abs_src[i]);
    };

    // Compute tile: each thread decodes its own column from shared memory and
    // FMAs against the A rows (read directly from global memory).
    auto compute_tile = [&](int stage, int kt) {
        unsigned int* b_ptr = sh_b(stage);
        ABSMAX_T* abs_ptr = sh_abs(stage);
        const int k_base = kt * TILE_K;

        // Process KB_PER_TILE (=2) K-blocks within this tile
#pragma unroll
        for (int kb = 0; kb < KB_PER_TILE; kb++) {
            const int block_k_base = k_base + kb * BS;
            // Guard the partial last K-tile (K_dim need not be a TILE_K multiple).
            if (block_k_base >= K_dim) continue;

            // Read K_BITS bit-plane words from shared memory for this column,
            // using the widest aligned load available for the common widths.
            int b_addr = col_in_tile * B_COL_WORDS + kb * K_BITS;
            unsigned int planes[K_BITS];
            if constexpr (K_BITS == 2) {
                uint2 pv = *reinterpret_cast<const uint2*>(&b_ptr[b_addr]);
                planes[0] = pv.x; planes[1] = pv.y;
            } else if constexpr (K_BITS == 4) {
                int4 pv = *reinterpret_cast<const int4*>(&b_ptr[b_addr]);
                planes[0] = (unsigned int)pv.x; planes[1] = (unsigned int)pv.y;
                planes[2] = (unsigned int)pv.z; planes[3] = (unsigned int)pv.w;
            } else {
#pragma unroll
                for (int b = 0; b < K_BITS; b++)
                    planes[b] = b_ptr[b_addr + b];
            }

            // Load absmax scale for this (column, K-block) from shared memory.
            float amax = load_absmax(abs_ptr, col_in_tile * KB_PER_TILE + kb);

            // Dequant-once loop: decode each weight once, FMA across M rows.
            // Each `sub` covers 8 of the 32 elements in the quantization block.
#pragma unroll
            for (int sub = 0; sub < 4; sub++) {
                // Load A for all M rows (int4 = 8 fp16/bf16 values)
                int4 av[M_VAL];
#pragma unroll
                for (int m = 0; m < M_VAL; m++)
                    av[m] = *reinterpret_cast<const int4*>(&A[m * K_dim + block_k_base + sub * 8]);

                // Dequant each element once, then FMA across M rows
#pragma unroll
                for (int j = 0; j < 8; j++) {
                    // Reassemble the K_BITS-bit code from the bit planes.
                    int idx = 0;
#pragma unroll
                    for (int b = 0; b < K_BITS; b++)
                        idx |= ((planes[b] >> (sub * 8 + j)) & 1) << b;
                    // Codebook lookup via warp shuffle (lane idx holds entry idx).
                    float w = __shfl_sync(0xFFFFFFFF, cb, idx) * amax;

#pragma unroll
                    for (int m = 0; m < M_VAL; m++) {
                        const scalar_t* ap = reinterpret_cast<const scalar_t*>(&av[m]);
                        acc[m] += w * ScalarOps<scalar_t>::to_float(ap[j]);
                    }
                }
            }
        }
    };

    // Pipeline: double-buffered cp.async. While computing stage `cur`, the
    // next tile is in flight into the other stage; cp_async_wait<1> leaves
    // exactly that newest group outstanding.
    fetch_tile(0, kt_start);
    cp_async_fence();

    for (int kt = kt_start; kt < kt_end; kt++) {
        int cur = (kt - kt_start) % 2;
        if (kt + 1 < kt_end) {
            fetch_tile((kt + 1 - kt_start) % 2, kt + 1);
            cp_async_fence();
            cp_async_wait<1>();
        } else {
            cp_async_wait<0>();
        }
        __syncthreads(); // fetched stage visible to all threads
        compute_tile(cur, kt);
        __syncthreads(); // stage fully consumed before it is overwritten
    }

    // Write output
    if (k_splits == 1) {
        // Direct write — this block owns the full K reduction
#pragma unroll
        for (int m = 0; m < M_VAL; m++) {
            if (m < M && col < N)
                C[m * N + col] = ScalarOps<scalar_t>::from_float(acc[m]);
        }
    } else {
        // Partial K — atomicAdd to workspace
#pragma unroll
        for (int m = 0; m < M_VAL; m++) {
            if (m < M && col < N)
                atomicAdd(&C_workspace[m * N + col], acc[m]);
        }

        // Publish the workspace writes before signaling completion.
        __threadfence();

        // Last-arriving split converts workspace to output
        __shared__ int is_last;
        if (threadIdx.x == 0) {
            int done = atomicAdd(&tile_counters[n_tile], 1);
            is_last = (done == k_splits - 1) ? 1 : 0;
        }
        __syncthreads(); // broadcast is_last to the whole block

        if (is_last) {
            // All splits have contributed; convert float32 partials to scalar_t.
            for (int i = threadIdx.x; i < M_VAL * TILE_N; i += BLOCK_DIM) {
                int m = i / TILE_N;
                int c = n_base + i % TILE_N;
                if (m < M && c < N)
                    C[m * N + c] = ScalarOps<scalar_t>::from_float(C_workspace[m * N + c]);
            }
        }
    }
}
2370+
2371+
// ---- Tiled GEMV v2 launcher ----
// Computes the launch geometry (grid, dynamic shared memory, k_splits) for
// kbit_scalar_gemv_tiled_v2 and launches it on `stream`. The tile/stage
// constants here must mirror the ones inside the kernel exactly, since they
// size the double-buffered shared-memory stages.
template <int K, int MV, typename scalar_t, typename ABSMAX_T>
static void kbitScalarGemvTiledV2Launch(
    const scalar_t* A, const unsigned int* B_packed, const ABSMAX_T* B_absmax,
    const float* codebook, scalar_t* C, float* C_workspace, int* tile_counters,
    int M, int K_dim, int N, int num_sms, cudaStream_t stream
) {
    constexpr int TILE_N = 128;
    constexpr int TILE_K = 64;
    constexpr int BLOCK_DIM = 128;
    constexpr int BS = 32; // quantization block size
    constexpr int KB_PER_TILE = TILE_K / BS;
    constexpr int B_COL_WORDS = KB_PER_TILE * K;
    constexpr int B_STAGE_BYTES = TILE_N * B_COL_WORDS * (int)sizeof(unsigned int);
    constexpr int ABS_STAGE_BYTES = TILE_N * KB_PER_TILE * (int)sizeof(ABSMAX_T);
    constexpr int ABS_STAGE_ALIGNED = (ABS_STAGE_BYTES + 15) & ~15; // keep stages int4-aligned
    constexpr int STAGE_BYTES = B_STAGE_BYTES + ABS_STAGE_ALIGNED;

    int n_tiles = N / TILE_N; // truncates — assumes N is a multiple of TILE_N
    int k_tiles = (K_dim + TILE_K - 1) / TILE_K;

    // Choose k_splits to achieve ~4 blocks per SM.
    // Recompute from tiles_per_split to guarantee no empty splits
    // (empty splits would skip the tile_counters atomicAdd, breaking the last-block check).
    int target_blocks = num_sms * 4;
    int k_splits = max(1, (target_blocks + n_tiles - 1) / n_tiles);
    k_splits = min(k_splits, k_tiles); // at most one split per K-tile
    int tiles_per_split = (k_tiles + k_splits - 1) / k_splits;
    k_splits = (k_tiles + tiles_per_split - 1) / tiles_per_split; // no empty splits

    int grid_size = n_tiles * k_splits;
    int smem_size = 2 * STAGE_BYTES; // two stages: double-buffered pipeline

    kbit_scalar_gemv_tiled_v2<K, MV, scalar_t, ABSMAX_T>
        <<<grid_size, BLOCK_DIM, smem_size, stream>>>(
            A, B_packed, B_absmax, codebook, C, C_workspace, tile_counters,
            M, K_dim, N, k_splits
        );
    CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
2411+
2412+
// Public entry point: selects the M_VAL template parameter from the runtime M
// (clamped to 4) and queries the device SM count for the split-K heuristic.
// Matches the file's convention of checking CUDA API return codes via
// CUDA_CHECK_RETURN (previously cudaGetDevice/cudaDeviceGetAttribute were
// unchecked, so a failure would launch with garbage num_sms).
template <int K, typename scalar_t, typename ABSMAX_T>
void kbitScalarGemvTiledV2(
    const scalar_t* A, const unsigned int* B_packed, const ABSMAX_T* B_absmax,
    const float* codebook, scalar_t* C, float* C_workspace, int* tile_counters,
    int M, int K_dim, int N, cudaStream_t stream
) {
    int dev;
    CUDA_CHECK_RETURN(cudaGetDevice(&dev));
    int num_sms;
    CUDA_CHECK_RETURN(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev));

#define LAUNCH_GEMV_V2(MV) \
    kbitScalarGemvTiledV2Launch<K, MV, scalar_t, ABSMAX_T>( \
        A, B_packed, B_absmax, codebook, C, C_workspace, tile_counters, \
        M, K_dim, N, num_sms, stream)

    // Dispatch on M so the kernel unrolls exactly M accumulators/loads.
    if (M <= 1) { LAUNCH_GEMV_V2(1); }
    else if (M <= 2) { LAUNCH_GEMV_V2(2); }
    else if (M <= 3) { LAUNCH_GEMV_V2(3); }
    else { LAUNCH_GEMV_V2(4); }

#undef LAUNCH_GEMV_V2
}
2436+
21652437
// ---- Debug: Simple MMA test kernel ----
21662438
// Takes fp16 A[16,16] and fp16 B[16,8] (B stored row-major), outputs fp32 C[16,8].
21672439
__global__ void test_mma_kernel(const half* __restrict__ A, const half* __restrict__ B, float* __restrict__ C) {
@@ -2439,3 +2711,31 @@ INSTANTIATE_KBIT_SCALAR_GEMV_TILED_FP16(2)
24392711
INSTANTIATE_KBIT_SCALAR_GEMV_TILED_FP16(3)
24402712
INSTANTIATE_KBIT_SCALAR_GEMV_TILED_FP16(4)
24412713
INSTANTIATE_KBIT_SCALAR_GEMV_TILED_FP16(5)
2714+
// Scalar GEMV v2 (tiled with shared memory) instantiations — uint8 E4M4 absmax
// One half and one __nv_bfloat16 activation instantiation per bit width K.
// The exported C symbols in pythonInterface are expected to forward to these.
#define INSTANTIATE_KBIT_SCALAR_GEMV_V2_U8(K) \
    template void kbitScalarGemvTiledV2<K, half, unsigned char>( \
        const half*, const unsigned int*, const unsigned char*, const float*, half*, float*, int*, \
        int, int, int, cudaStream_t \
    ); \
    template void kbitScalarGemvTiledV2<K, __nv_bfloat16, unsigned char>( \
        const __nv_bfloat16*, const unsigned int*, const unsigned char*, const float*, __nv_bfloat16*, float*, int*, \
        int, int, int, cudaStream_t \
    );
// Bit widths 2-5, matching the k range validated on the Python side.
INSTANTIATE_KBIT_SCALAR_GEMV_V2_U8(2)
INSTANTIATE_KBIT_SCALAR_GEMV_V2_U8(3)
INSTANTIATE_KBIT_SCALAR_GEMV_V2_U8(4)
INSTANTIATE_KBIT_SCALAR_GEMV_V2_U8(5)
// fp16 absmax variants (ABSMAX_T = half) — selected via the "_fp16abs" suffix.
#define INSTANTIATE_KBIT_SCALAR_GEMV_V2_FP16(K) \
    template void kbitScalarGemvTiledV2<K, half, half>( \
        const half*, const unsigned int*, const half*, const float*, half*, float*, int*, \
        int, int, int, cudaStream_t \
    ); \
    template void kbitScalarGemvTiledV2<K, __nv_bfloat16, half>( \
        const __nv_bfloat16*, const unsigned int*, const half*, const float*, __nv_bfloat16*, float*, int*, \
        int, int, int, cudaStream_t \
    );
INSTANTIATE_KBIT_SCALAR_GEMV_V2_FP16(2)
INSTANTIATE_KBIT_SCALAR_GEMV_V2_FP16(3)
INSTANTIATE_KBIT_SCALAR_GEMV_V2_FP16(4)
INSTANTIATE_KBIT_SCALAR_GEMV_V2_FP16(5)

0 commit comments

Comments
 (0)