Skip to content

Commit 24cf7d1

Browse files
TimDettmers and claude committed
feat: Update vq_linear dispatch to use MMA kernel for M=5-16
- Route M<=4 to scalar GEMV, M=5-16 to vq_gemm_prod, M>16 to dequant+cuBLAS (matching kbit_linear dispatch pattern)
- Update vq_linear_workspace to include C_workspace and tile_counters
- Un-skip MMA test stubs, replace with actual vq_gemm_prod tests
- All 100 VQ tests pass (50 scalar GEMV + 50 dispatch/MMA)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 79ac4dc commit 24cf7d1

File tree

2 files changed

+53
-6
lines changed

2 files changed

+53
-6
lines changed

bitsandbytes/functional.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1560,7 +1560,8 @@ def vq_linear(
15601560
15611561
Routes to the optimal kernel based on M (batch dimension):
15621562
- M <= 4: scalar GEMV (tiled layout, shmem codebook lookup)
1563-
- M > 4: dequantize to fp16/bf16 + cuBLAS matmul
1563+
- M <= 16: fused dequant + MMA (tiled layout, tensor core)
1564+
- M > 16: dequantize to fp16/bf16 + cuBLAS matmul
15641565
15651566
All paths read tiled B layout (from repack_vq output).
15661567
@@ -1574,6 +1575,8 @@ def vq_linear(
15741575
N: Output dimension of weight matrix.
15751576
out: Optional pre-allocated output [M, N] for CUDA graph compat.
15761577
workspace: Optional dict with pre-allocated buffers:
1578+
'C_workspace': float32 [M, N] for MMA accumulation
1579+
'tile_counters': int32 [m_tiles * n_tiles] for persistent kernel
15771580
'dequant_buf': fp16/bf16 [N * K_dim] for dequant+matmul path
15781581
15791582
Returns:
@@ -1590,7 +1593,18 @@ def vq_linear(
15901593
)
15911594
return torch.ops.bitsandbytes.vq_scalar_gemv_tiled(A, B_packed, B_absmax, codebook, K_dim, N, p)
15921595

1593-
# M > 4: dequantize tiled VQ to dense + cuBLAS matmul
1596+
if M <= 16:
1597+
# Fused dequant + MMA: tiled layout, tensor core path
1598+
k_chunks = 1 # auto-selected internally by the kernel
1599+
if out is not None and workspace is not None:
1600+
C_workspace = workspace["C_workspace"]
1601+
tile_counters = workspace["tile_counters"]
1602+
return torch.ops.bitsandbytes.vq_gemm_prod_(
1603+
A, B_packed, B_absmax, codebook, K_dim, N, p, k_chunks, out, C_workspace, tile_counters
1604+
)
1605+
return torch.ops.bitsandbytes.vq_gemm_prod(A, B_packed, B_absmax, codebook, K_dim, N, p, k_chunks)
1606+
1607+
# M > 16: dequantize tiled VQ to dense + cuBLAS matmul
15941608
if workspace is not None and "dequant_buf" in workspace:
15951609
dequant_buf = workspace["dequant_buf"]
15961610
torch.ops.bitsandbytes.dequantize_vq_tiled_(
@@ -1619,12 +1633,17 @@ def vq_linear_workspace(M: int, K_dim: int, N: int, p: int, dtype: torch.dtype,
16191633
device: CUDA device.
16201634
16211635
Returns:
1622-
Dict with 'dequant_buf' tensor.
1636+
Dict with 'C_workspace', 'tile_counters', 'dequant_buf' tensors.
16231637
"""
1638+
TILE_M, TILE_N = 16, 64 # worst-case tile sizes for counter allocation
1639+
m_tiles = (M + TILE_M - 1) // TILE_M
1640+
n_tiles = N // TILE_N
16241641
n_total = N * K_dim
16251642
num_blocks = -(n_total // -32)
16261643

16271644
return {
1645+
"C_workspace": torch.zeros(M, N, device=device, dtype=torch.float32),
1646+
"tile_counters": torch.zeros(m_tiles * n_tiles, device=device, dtype=torch.int32),
16281647
"dequant_buf": torch.empty(num_blocks * 32, device=device, dtype=dtype),
16291648
}
16301649

tests/test_kbit_gemm.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1203,7 +1203,35 @@ def test_vq_linear_preallocated_output(self, p):
12031203

12041204
@pytest.mark.parametrize("p", [2, 4])
12051205
@pytest.mark.parametrize("M", [5, 8, 16, 32])
1206-
@pytest.mark.skip(reason="Task 5 (VQ MMA kernel) not yet implemented")
12071206
def test_vq_mma_kernel(self, p, M):
1208-
"""VQ MMA kernel correctness (placeholder for Task 5)."""
1209-
pass
1207+
"""VQ MMA kernel (vq_gemm_prod) correctness."""
1208+
from bitsandbytes.functional import create_vq_codebook, quantize_vq, repack_vq
1209+
1210+
K_dim, N = 512, 256
1211+
torch.manual_seed(42)
1212+
1213+
W = torch.randn(N, K_dim)
1214+
codebook = create_vq_codebook(p, device="cuda")
1215+
W_gpu = W.half().cuda()
1216+
packed_flat, absmax_flat, _ = quantize_vq(W_gpu, p=p, codebook=codebook)
1217+
packed_tiled, absmax_tiled = repack_vq(packed_flat, absmax_flat, K_dim, N, p=p)
1218+
1219+
A = torch.randn(M, K_dim, dtype=torch.float16, device="cuda")
1220+
1221+
C = torch.ops.bitsandbytes.vq_gemm_prod(
1222+
A, packed_tiled, absmax_tiled, codebook, K_dim, N, p, 1,
1223+
)
1224+
1225+
# Reference
1226+
from bitsandbytes.functional import dequantize_vq
1227+
1228+
W_deq = dequantize_vq(packed_flat, absmax_flat, codebook, p=p, n=N * K_dim)
1229+
W_deq = W_deq.reshape(N, K_dim)
1230+
C_ref = (A.float() @ W_deq.float().T).to(A.dtype)
1231+
1232+
diff = (C.float() - C_ref.float()).abs()
1233+
scale = C_ref.float().abs().clamp(min=1.0)
1234+
rel_err = (diff / scale).max().item()
1235+
assert rel_err < 0.10, (
1236+
f"p={p}, M={M}: vq_gemm_prod mismatch. Max rel err: {rel_err:.6f}"
1237+
)

0 commit comments

Comments (0)