Skip to content

Commit f08f614

Browse files
TimDettmers and claude committed
feat: Add VQ tiled dequantize kernel and vq_linear dispatch
- kDequantize_VQ_tiled: reads tiled VQ layout, writes flat [N,K] output - Full registration chain for tiled dequant (ops, bindings, Python) - vq_linear() dispatch: M≤4 → vq_scalar_gemv_tiled, M>4 → dequant+cuBLAS - vq_linear_workspace() for CUDA graph compatibility - End-to-end pipeline verified: quantize→repack→vq_linear→correct output for all (p={2,4}, K={64,2048,5120}, N={128,512,5120}, M={1,4,8,32}) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent cf2f64a commit f08f614

File tree

5 files changed

+327
-0
lines changed

5 files changed

+327
-0
lines changed

bitsandbytes/_ops.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,50 @@ def _(
617617
return out
618618

619619

620+
# VQ tiled dequantize: reads tiled VQ layout, writes flat [N, K_dim] output
torch.library.define(
    "bitsandbytes::dequantize_vq_tiled",
    "(Tensor packed_tiled, Tensor codebook, Tensor absmax_tiled, int p, int K_dim, int N, ScalarType dtype) -> Tensor",
)


# Fake (meta) implementation: allocates an output of the right size/dtype/device
# without touching real data, so the op can be traced (e.g. under FakeTensor).
@register_fake("bitsandbytes::dequantize_vq_tiled")
def _(
    packed_tiled: torch.Tensor,
    codebook: torch.Tensor,
    absmax_tiled: torch.Tensor,
    p: int,  # VQ dimension; only 2 and 4 are accepted
    K_dim: int,  # reduction dimension of the weight matrix
    N: int,  # output dimension of the weight matrix
    dtype: torch.dtype,
) -> torch.Tensor:
    torch._check(p in (2, 4), lambda: f"p must be 2 or 4, got {p}")
    # Flat [N * K_dim] buffer; callers view it as [N, K_dim].
    return torch.empty(N * K_dim, device=packed_tiled.device, dtype=dtype)


# In-place variant: same schema plus a caller-provided `out` buffer, marked
# Tensor(a!) so the dispatcher knows it is mutated and aliased in the return.
torch.library.define(
    "bitsandbytes::dequantize_vq_tiled_",
    "(Tensor packed_tiled, Tensor codebook, Tensor absmax_tiled, int p, int K_dim, int N, ScalarType dtype, "
    "Tensor(a!) out) -> Tensor(a!)",
)


@register_fake("bitsandbytes::dequantize_vq_tiled_")
def _(
    packed_tiled: torch.Tensor,
    codebook: torch.Tensor,
    absmax_tiled: torch.Tensor,
    p: int,
    K_dim: int,
    N: int,
    dtype: torch.dtype,
    out: torch.Tensor,
) -> torch.Tensor:
    torch._check(p in (2, 4), lambda: f"p must be 2 or 4, got {p}")
    # In-place op: the result is the caller's buffer, returned unchanged.
    return out
662+
663+
620664
# VQ scalar GEMV: byte-indexed codebook lookup GEMV for M=1-4
621665

622666
torch.library.define(

bitsandbytes/backends/cuda/ops.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1059,6 +1059,76 @@ def _(
10591059
return out
10601060

10611061

1062+
def _dequantize_vq_tiled_impl(
    packed_tiled: torch.Tensor,
    codebook: torch.Tensor,
    absmax_tiled: torch.Tensor,
    p: int,
    K_dim: int,
    N: int,
    dtype: torch.dtype,
    out: torch.Tensor,
) -> None:
    """Launch the native tiled-VQ dequantize kernel, writing into ``out``.

    The C symbol is picked by output dtype (fp16/bf16), absmax dtype
    (uint8/float32) and VQ dimension ``p``.
    """
    torch._check(codebook.dtype == torch.float16, lambda: f"codebook must be float16, got {codebook.dtype}")
    torch._check(p in (2, 4), lambda: f"p must be 2 or 4, got {p}")

    # Output dtype -> kernel-name suffix.
    dtype_suffix = {torch.float16: "fp16", torch.bfloat16: "bf16"}
    tname = dtype_suffix.get(dtype)
    if tname is None:
        raise ValueError(f"dequantize_vq_tiled only supports float16/bfloat16, got {dtype}")

    # Absmax dtype -> kernel-name suffix.
    absmax_suffix = {torch.uint8: "u8abs", torch.float32: "fp32abs"}
    aname = absmax_suffix.get(absmax_tiled.dtype)
    if aname is None:
        raise ValueError(f"absmax must be uint8 or float32, got {absmax_tiled.dtype}")

    with _cuda_device_of(packed_tiled):
        kernel = getattr(lib, f"cdequantize_vq_tiled_{tname}_{aname}_p{p}")
        kernel(
            get_ptr(packed_tiled),
            get_ptr(codebook),
            get_ptr(absmax_tiled),
            get_ptr(out),
            ct.c_int(K_dim),
            ct.c_int(N),
            _get_tensor_stream(packed_tiled),
        )
1100+
1101+
1102+
@register_kernel("bitsandbytes::dequantize_vq_tiled", "cuda")
def _(
    packed_tiled: torch.Tensor,
    codebook: torch.Tensor,
    absmax_tiled: torch.Tensor,
    p: int,
    K_dim: int,
    N: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    # Allocate the flat [N * K_dim] destination, then dequantize into it.
    result = torch.empty(N * K_dim, dtype=dtype, device=packed_tiled.device)
    _dequantize_vq_tiled_impl(packed_tiled, codebook, absmax_tiled, p, K_dim, N, dtype, result)
    return result
1115+
1116+
1117+
@register_kernel("bitsandbytes::dequantize_vq_tiled_", "cuda")
def _(
    packed_tiled: torch.Tensor,
    codebook: torch.Tensor,
    absmax_tiled: torch.Tensor,
    p: int,
    K_dim: int,
    N: int,
    dtype: torch.dtype,
    out: torch.Tensor,
) -> torch.Tensor:
    # In-place flavor: fill the caller-provided buffer and hand it back.
    _dequantize_vq_tiled_impl(packed_tiled, codebook, absmax_tiled, p, K_dim, N, dtype, out)
    return out
1130+
1131+
10621132
def _vq_scalar_gemv_impl(
10631133
A: torch.Tensor,
10641134
B_packed: torch.Tensor,

bitsandbytes/functional.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1545,6 +1545,90 @@ def kbit_linear_workspace(M: int, K_dim: int, N: int, dtype: torch.dtype, device
15451545
}
15461546

15471547

1548+
def vq_linear(
    A: Tensor,
    B_packed: Tensor,
    B_absmax: Tensor,
    codebook: Tensor,
    p: int,
    K_dim: int,
    N: int,
    out: Optional[Tensor] = None,
    workspace: Optional[dict] = None,
) -> Tensor:
    """Compute C = A @ B^T where B is VQ codebook-quantized (tiled layout).

    Kernel selection depends on the batch dimension M = A.shape[0]:

    * M <= 4: tiled scalar-GEMV kernel (shared-memory codebook lookup).
    * M > 4:  dequantize the tiled weights to A's dtype, then cuBLAS matmul.

    All paths consume the tiled B layout produced by repack_vq.

    Args:
        A: Activations [M, K_dim], fp16 or bf16.
        B_packed: Tiled VQ packed weights (from repack_vq).
        B_absmax: Tiled per-block absmax values (from repack_vq).
        codebook: fp16 codebook tensor [256, p].
        p: VQ dimension (2 or 4).
        K_dim: Reduction dimension of the weight matrix.
        N: Output dimension of the weight matrix.
        out: Optional pre-allocated [M, N] output (CUDA graph compatibility).
        workspace: Optional dict of pre-allocated buffers; key 'dequant_buf'
            holds an fp16/bf16 [N * K_dim] buffer for the dequant+matmul path.

    Returns:
        [M, N] tensor with the same dtype as A.
    """
    M = A.shape[0]

    if M <= 4:
        # Small-batch path: fused GEMV straight from the tiled layout.
        if out is None:
            return torch.ops.bitsandbytes.vq_scalar_gemv_tiled(A, B_packed, B_absmax, codebook, K_dim, N, p)
        return torch.ops.bitsandbytes.vq_scalar_gemv_tiled_(
            A, B_packed, B_absmax, codebook, K_dim, N, p, out[:M]
        )

    # Large-batch path: materialize W = dequant(B) as [N, K_dim], then matmul.
    if workspace is not None and "dequant_buf" in workspace:
        buf = workspace["dequant_buf"]
        torch.ops.bitsandbytes.dequantize_vq_tiled_(
            B_packed, codebook, B_absmax, p, K_dim, N, A.dtype, buf
        )
    else:
        buf = torch.ops.bitsandbytes.dequantize_vq_tiled(B_packed, codebook, B_absmax, p, K_dim, N, A.dtype)
    W = buf[: N * K_dim].view(N, K_dim)

    if out is None:
        return torch.mm(A, W.t())
    torch.mm(A, W.t(), out=out[:M])
    return out[:M]
1608+
1609+
1610+
def vq_linear_workspace(M: int, K_dim: int, N: int, p: int, dtype: torch.dtype, device: torch.device) -> dict:
1611+
"""Pre-allocate workspace buffers for vq_linear (CUDA graph compatibility).
1612+
1613+
Args:
1614+
M: Maximum batch size (must be >= actual M at runtime).
1615+
K_dim: Reduction dimension.
1616+
N: Output dimension.
1617+
p: VQ dimension (2 or 4).
1618+
dtype: Activation dtype (fp16 or bf16).
1619+
device: CUDA device.
1620+
1621+
Returns:
1622+
Dict with 'dequant_buf' tensor.
1623+
"""
1624+
n_total = N * K_dim
1625+
num_blocks = -(n_total // -32)
1626+
1627+
return {
1628+
"dequant_buf": torch.empty(num_blocks * 32, device=device, dtype=dtype),
1629+
}
1630+
1631+
15481632
def kbit_expert_linear(
15491633
A_concat: Tensor,
15501634
B_packed_all: Tensor,

csrc/ops.cu

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -980,6 +980,65 @@ __global__ void kDequantize_VQ(
980980
}
981981

982982

983+
// ---- VQ tiled dequantize kernel ----
984+
// Reads from tiled VQ layout (from repack_vq output), writes flat [N, K_dim] row-major.
985+
986+
template <int P_VAL, typename T, typename ABSMAX_T>
987+
__global__ void kDequantize_VQ_tiled(
988+
const unsigned int* __restrict__ packed_tiled,
989+
const half* __restrict__ codebook,
990+
const ABSMAX_T* __restrict__ absmax_tiled,
991+
T* __restrict__ out,
992+
const int K_dim, const int N
993+
) {
994+
constexpr int BS = 32;
995+
constexpr int TILE_K = 64;
996+
constexpr int TILE_N = 128;
997+
constexpr int KB_PER_TILE = TILE_K / BS;
998+
constexpr int WORDS_PER_BLOCK = BS / (P_VAL * 4);
999+
constexpr int WORDS_PER_TILE = TILE_N * KB_PER_TILE * WORDS_PER_BLOCK;
1000+
constexpr int ABS_PER_TILE = TILE_N * KB_PER_TILE;
1001+
constexpr int GROUPS_PER_BLOCK = BS / P_VAL;
1002+
1003+
const int num_k_blocks = K_dim / BS;
1004+
const int n_tiles = N / TILE_N;
1005+
1006+
// Each thread handles one element in the [N, K_dim] output
1007+
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1008+
const int total = N * K_dim;
1009+
if (idx >= total)
1010+
return;
1011+
1012+
const int n_idx = idx / K_dim;
1013+
const int k_idx = idx % K_dim;
1014+
const int k_block = k_idx / BS;
1015+
const int elem_in_block = k_idx % BS;
1016+
1017+
// Tiled addressing
1018+
const int k_tile = k_block / KB_PER_TILE;
1019+
const int kb = k_block % KB_PER_TILE;
1020+
const int n_tile = n_idx / TILE_N;
1021+
const int col_in_tile = n_idx % TILE_N;
1022+
const int tile_base = k_tile * n_tiles + n_tile;
1023+
1024+
// Load absmax
1025+
const int abs_idx = tile_base * ABS_PER_TILE + col_in_tile * KB_PER_TILE + kb;
1026+
float amax = load_absmax(absmax_tiled, abs_idx);
1027+
1028+
// Find the byte index for this element
1029+
const int group = elem_in_block / P_VAL;
1030+
const int component = elem_in_block % P_VAL;
1031+
const int word_in_block = group / 4;
1032+
const int byte_in_word = group % 4;
1033+
1034+
const int word_idx = tile_base * WORDS_PER_TILE + (col_in_tile * KB_PER_TILE + kb) * WORDS_PER_BLOCK + word_in_block;
1035+
unsigned char byte_idx = (packed_tiled[word_idx] >> (byte_in_word * 8)) & 0xFF;
1036+
1037+
// Codebook lookup
1038+
float val = __half2float(codebook[byte_idx * P_VAL + component]) * amax;
1039+
out[idx] = (T)val;
1040+
}
1041+
9831042
// ---- Launch wrappers ----
9841043

9851044
#define KBIT_WARPS_PER_BLOCK 8
@@ -1115,6 +1174,19 @@ void dequantize_vq(
11151174
CUDA_CHECK_RETURN(cudaPeekAtLastError());
11161175
}
11171176

1177+
template <int P_VAL, typename T, typename ABSMAX_T>
1178+
void dequantize_vq_tiled(
1179+
const unsigned int* packed_tiled, const half* codebook, const ABSMAX_T* absmax_tiled,
1180+
T* out, int K_dim, int N, cudaStream_t stream
1181+
) {
1182+
int total = N * K_dim;
1183+
int threads = 256;
1184+
int blocks = (total + threads - 1) / threads;
1185+
kDequantize_VQ_tiled<P_VAL, T, ABSMAX_T>
1186+
<<<blocks, threads, 0, stream>>>(packed_tiled, codebook, absmax_tiled, out, K_dim, N);
1187+
CUDA_CHECK_RETURN(cudaPeekAtLastError());
1188+
}
1189+
11181190
// ---- Stage 2: Repack kernel (flat bit-plane -> GEMM-tiled layout) ----
11191191

11201192
// Tile sizes matching the GEMM kernel design (compile-time constants).
@@ -3737,6 +3809,23 @@ INSTANTIATE_VQ_QUANT(4)
37373809
INSTANTIATE_VQ_DEQUANT(2)
37383810
INSTANTIATE_VQ_DEQUANT(4)
37393811

3812+
// dequantize_vq_tiled: P_VAL × T × ABSMAX_T
// Explicit instantiations of the launcher template so its symbols exist for the
// plain wrappers in pythonInterface.cpp: every combination of output dtype
// (half / __nv_bfloat16) and absmax dtype (unsigned char / float), for p = 2, 4.
// NOTE: no comments inside the macro body — a `//` before a `\` continuation
// would swallow the splice.
#define INSTANTIATE_VQ_DEQUANT_TILED(P) \
    template void dequantize_vq_tiled<P, half, unsigned char>( \
        const unsigned int*, const half*, const unsigned char*, half*, int, int, cudaStream_t \
    ); \
    template void dequantize_vq_tiled<P, __nv_bfloat16, unsigned char>( \
        const unsigned int*, const half*, const unsigned char*, __nv_bfloat16*, int, int, cudaStream_t \
    ); \
    template void dequantize_vq_tiled<P, half, float>( \
        const unsigned int*, const half*, const float*, half*, int, int, cudaStream_t \
    ); \
    template void dequantize_vq_tiled<P, __nv_bfloat16, float>( \
        const unsigned int*, const half*, const float*, __nv_bfloat16*, int, int, cudaStream_t \
    );
INSTANTIATE_VQ_DEQUANT_TILED(2)
INSTANTIATE_VQ_DEQUANT_TILED(4)
3828+
37403829
// vq_scalar_gemv: P_VAL × scalar_t × ABSMAX_T (flat + tiled)
37413830
#define INSTANTIATE_VQ_SCALAR_GEMV_U8(P) \
37423831
template void vqScalarGemv<P, half, unsigned char>( \

csrc/pythonInterface.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,28 @@ MAKE_VQ_DEQUANT(fp16, half, fp32abs, float, 4)
552552
MAKE_VQ_DEQUANT(bf16, __nv_bfloat16, fp32abs, float, 2)
553553
MAKE_VQ_DEQUANT(bf16, __nv_bfloat16, fp32abs, float, 4)
554554

555+
// Forward declaration of VQ tiled dequant launcher (template defined and
// explicitly instantiated in csrc/ops.cu).
template <int P_VAL, typename T, typename ABSMAX_T>
void dequantize_vq_tiled(const unsigned int*, const half*, const ABSMAX_T*, T*, int, int, cudaStream_t);

// Unmangled VQ tiled dequant wrappers: one plain (non-template) function per
// (output dtype, absmax dtype, p) combination, binding the template arguments.
#define MAKE_VQ_DEQUANT_TILED(tname, T, aname, ABSMAX_T, P) \
    void dequantize_vq_tiled_##tname##_##aname##_p##P( \
        const unsigned int* packed_tiled, const half* codebook, const ABSMAX_T* absmax_tiled, T* out, int K_dim, \
        int N, cudaStream_t stream \
    ) { \
        dequantize_vq_tiled<P, T, ABSMAX_T>(packed_tiled, codebook, absmax_tiled, out, K_dim, N, stream); \
    }

MAKE_VQ_DEQUANT_TILED(fp16, half, u8abs, unsigned char, 2)
MAKE_VQ_DEQUANT_TILED(fp16, half, u8abs, unsigned char, 4)
MAKE_VQ_DEQUANT_TILED(bf16, __nv_bfloat16, u8abs, unsigned char, 2)
MAKE_VQ_DEQUANT_TILED(bf16, __nv_bfloat16, u8abs, unsigned char, 4)
MAKE_VQ_DEQUANT_TILED(fp16, half, fp32abs, float, 2)
MAKE_VQ_DEQUANT_TILED(fp16, half, fp32abs, float, 4)
MAKE_VQ_DEQUANT_TILED(bf16, __nv_bfloat16, fp32abs, float, 2)
MAKE_VQ_DEQUANT_TILED(bf16, __nv_bfloat16, fp32abs, float, 4)
576+
555577
// Forward declaration of repack launcher
556578
template <int K>
557579
void repackKbit(const unsigned int*, const unsigned char*, unsigned int*, unsigned char*, int, int, cudaStream_t);
@@ -1646,6 +1668,24 @@ MAKE_CVQ_DEQUANT(fp16, half, fp32abs, float, 4)
16461668
MAKE_CVQ_DEQUANT(bf16, __nv_bfloat16, fp32abs, float, 2)
16471669
MAKE_CVQ_DEQUANT(bf16, __nv_bfloat16, fp32abs, float, 4)
16481670

1671+
// VQ tiled dequant extern C wrappers: the `c`-prefixed symbols that the Python
// side resolves by name (getattr(lib, f"cdequantize_vq_tiled_{tname}_{aname}_p{p}")
// in the CUDA backend). Each one forwards to the unmangled wrapper above.
#define MAKE_CVQ_DEQUANT_TILED(tname, T, aname, ABSMAX_T, P) \
    void cdequantize_vq_tiled_##tname##_##aname##_p##P( \
        const unsigned int* packed_tiled, const half* codebook, const ABSMAX_T* absmax_tiled, T* out, int K_dim, \
        int N, cudaStream_t stream \
    ) { \
        dequantize_vq_tiled_##tname##_##aname##_p##P(packed_tiled, codebook, absmax_tiled, out, K_dim, N, stream); \
    }

MAKE_CVQ_DEQUANT_TILED(fp16, half, u8abs, unsigned char, 2)
MAKE_CVQ_DEQUANT_TILED(fp16, half, u8abs, unsigned char, 4)
MAKE_CVQ_DEQUANT_TILED(bf16, __nv_bfloat16, u8abs, unsigned char, 2)
MAKE_CVQ_DEQUANT_TILED(bf16, __nv_bfloat16, u8abs, unsigned char, 4)
MAKE_CVQ_DEQUANT_TILED(fp16, half, fp32abs, float, 2)
MAKE_CVQ_DEQUANT_TILED(fp16, half, fp32abs, float, 4)
MAKE_CVQ_DEQUANT_TILED(bf16, __nv_bfloat16, fp32abs, float, 2)
MAKE_CVQ_DEQUANT_TILED(bf16, __nv_bfloat16, fp32abs, float, 4)
1688+
16491689
// VQ scalar GEMV extern C wrappers (flat layout)
16501690
#define MAKE_CVQ_SCALAR_GEMV(P) \
16511691
void cvq_scalar_gemv_fp16_p##P( \

0 commit comments

Comments (0)