feat: Add Hadamard rotation kernel for kbit outlier spreading

TimDettmers · claude · TimDettmers · commit 22148c7cae40 · 2026-02-23T05:57:48.000-05:00
Templated Walsh-Hadamard transform kernel for FP16/BF16, operating on
contiguous blocks of 32/64/128/256 elements. One warp per rotation block
using butterfly decomposition: in-register stages for stride&gt;=32, shuffle
stages for stride&lt;32. Normalization by 1/sqrt(block_size).

In-place operation, CUDA graph safe (no runtime API calls in hot path).
Registered as torch.ops.bitsandbytes.hadamard_rotate_ with Python helper
in functional.py.

Co-Authored-By: Claude Opus 4.6 &lt;noreply@anthropic.com&gt;
diff --git a/bitsandbytes/_ops.py b/bitsandbytes/_ops.py
@@ -584,6 +584,27 @@ def _(
     return packed_tiled, absmax_tiled
 
 
+# Hadamard rotation (in-place, for kbit quantization outlier spreading)
+
+torch.library.define(
+    "bitsandbytes::hadamard_rotate_",
+    "(Tensor(a!) data, int block_size) -> Tensor(a!)",
+)
+
+
+@register_fake("bitsandbytes::hadamard_rotate_")
+def _(data: torch.Tensor, block_size: int) -> torch.Tensor:
+    torch._check(
+        block_size in (32, 64, 128, 256),
+        lambda: f"block_size must be 32, 64, 128, or 256, got {block_size}",
+    )
+    torch._check(
+        data.dtype in (torch.float16, torch.bfloat16),
+        lambda: f"hadamard_rotate only supports float16/bfloat16, got {data.dtype}",
+    )
+    return data
+
+
 # K-bit fused dequant + GEMM (production: fp16 + bf16)
 
 torch.library.define(
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
@@ -1000,6 +1000,30 @@ def _(
     return packed_tiled, absmax_tiled
 
 
+@register_kernel("bitsandbytes::hadamard_rotate_", "cuda")
+def _(data: torch.Tensor, block_size: int) -> torch.Tensor:
+    torch._check(
+        block_size in (32, 64, 128, 256),
+        lambda: f"block_size must be 32, 64, 128, or 256, got {block_size}",
+    )
+    torch._check(
+        data.dtype in (torch.float16, torch.bfloat16),
+        lambda: f"hadamard_rotate only supports float16/bfloat16, got {data.dtype}",
+    )
+
+    tname = _KBIT_DTYPE_SUFFIX[data.dtype]
+    with _cuda_device_of(data):
+        fn = getattr(lib, f"chadamard_rotate_{tname}")
+        fn(
+            get_ptr(data),
+            ct.c_int(data.numel()),
+            ct.c_int(block_size),
+            _get_tensor_stream(data),
+        )
+
+    return data
+
+
 def _kbit_gemm_prod_check(A, B_packed, B_absmax, codebook, N, k, k_chunks):
     torch._check(k >= 2 and k <= 5, lambda: f"k must be 2-5, got {k}")
     torch._check(
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
@@ -1135,6 +1135,25 @@ def decode_absmax_e4m4(encoded: Tensor, bias: int = 11) -> Tensor:
     return result
 
 
+def hadamard_rotate(data: Tensor, block_size: int = 32) -> Tensor:
+    """Apply in-place Walsh-Hadamard rotation to contiguous blocks.
+
+    Spreads outliers across quantization blocks, improving kbit accuracy.
+    Since H is orthogonal, rotating both weights and activations preserves
+    the GEMM result: H(A) @ H(B)^T = A @ B^T.
+
+    Args:
+        data: Input tensor (float16 or bfloat16). Modified in-place.
+        block_size: Rotation block size (32, 64, 128, or 256).
+
+    Returns:
+        The input tensor, rotated in-place.
+    """
+    data_flat = data.contiguous().view(-1)
+    torch.ops.bitsandbytes.hadamard_rotate_(data_flat, block_size)
+    return data
+
+
 def quantize_kbit(
     A: Tensor,
     k: int = 4,
diff --git a/csrc/ops.cu b/csrc/ops.cu
@@ -1011,6 +1011,107 @@ void repackKbit(
     CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }
 
+// ===========================================================================
+// Hadamard rotation kernel (in-place, blocksize-templated)
+//
+// Applies a Walsh-Hadamard transform to contiguous blocks of BLOCK_SIZE
+// elements. Used to spread outliers before kbit quantization.
+// Since H is orthogonal, rotating both weights and activations preserves
+// the GEMM result: H(A) @ H(B)^T = A @ B^T.
+//
+// One warp per rotation block:
+//   BLOCK_SIZE=32:  1 elem/thread, 5 shuffle stages
+//   BLOCK_SIZE=64:  2 elem/thread, 1 register + 5 shuffle stages
+//   BLOCK_SIZE=128: 4 elem/thread, 2 register + 5 shuffle stages
+//   BLOCK_SIZE=256: 8 elem/thread, 3 register + 5 shuffle stages
+// ===========================================================================
+
+template <int BLOCK_SIZE, typename T>
+__global__ void kHadamardRotate(T* __restrict__ data, const int n) {
+    constexpr int ELEMS_PER_THREAD = BLOCK_SIZE / 32;
+    static_assert(BLOCK_SIZE >= 32 && (BLOCK_SIZE & (BLOCK_SIZE - 1)) == 0,
+                  "BLOCK_SIZE must be a power of 2 >= 32");
+
+    const int warp_idx = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+    const int lane_id = threadIdx.x % 32;
+    const int block_start = warp_idx * BLOCK_SIZE;
+
+    if (block_start >= n)
+        return;
+
+    // Load ELEMS_PER_THREAD elements per thread.
+    // Thread t holds elements at global positions: block_start + t, t+32, t+64, ...
+    float vals[ELEMS_PER_THREAD];
+#pragma unroll
+    for (int j = 0; j < ELEMS_PER_THREAD; j++) {
+        int idx = block_start + lane_id + j * 32;
+        vals[j] = (idx < n) ? (float)data[idx] : 0.0f;
+    }
+
+    // In-register butterfly stages (strides >= 32).
+    // Stride S in global space corresponds to element index s = S/32.
+    // Element j pairs with element j ^ s (both in the same thread).
+#pragma unroll
+    for (int s = ELEMS_PER_THREAD / 2; s >= 1; s >>= 1) {
+#pragma unroll
+        for (int j = 0; j < ELEMS_PER_THREAD; j++) {
+            int partner = j ^ s;
+            if (partner > j) {
+                float a = vals[j], b = vals[partner];
+                vals[j] = a + b;
+                vals[partner] = a - b;
+            }
+        }
+    }
+
+    // Shuffle butterfly stages (strides 16, 8, 4, 2, 1).
+    // Each stage exchanges values between lanes within the warp.
+#pragma unroll
+    for (int s = 16; s >= 1; s >>= 1) {
+#pragma unroll
+        for (int j = 0; j < ELEMS_PER_THREAD; j++) {
+            float other = __shfl_xor_sync(0xFFFFFFFF, vals[j], s);
+            vals[j] = (lane_id & s) ? (other - vals[j]) : (vals[j] + other);
+        }
+    }
+
+    // Normalize by 1/sqrt(BLOCK_SIZE).
+    const float norm = rsqrtf((float)BLOCK_SIZE);
+#pragma unroll
+    for (int j = 0; j < ELEMS_PER_THREAD; j++)
+        vals[j] *= norm;
+
+    // Store back.
+#pragma unroll
+    for (int j = 0; j < ELEMS_PER_THREAD; j++) {
+        int idx = block_start + lane_id + j * 32;
+        if (idx < n)
+            data[idx] = (T)vals[j];
+    }
+}
+
+// ---- Hadamard rotation launch wrapper ----
+
+template <int BLOCK_SIZE, typename T>
+void hadamardRotate(T* data, int n, cudaStream_t stream) {
+    const int num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    const int num_cuda_blocks = (num_blocks + KBIT_WARPS_PER_BLOCK - 1) / KBIT_WARPS_PER_BLOCK;
+    kHadamardRotate<BLOCK_SIZE, T><<<num_cuda_blocks, KBIT_THREADS_PER_BLOCK, 0, stream>>>(data, n);
+    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+}
+
+// Explicit instantiations: 4 block sizes x 2 dtypes
+#define INSTANTIATE_HADAMARD(BS) \
+    template void hadamardRotate<BS, half>(half*, int, cudaStream_t); \
+    template void hadamardRotate<BS, __nv_bfloat16>(__nv_bfloat16*, int, cudaStream_t);
+
+INSTANTIATE_HADAMARD(32)
+INSTANTIATE_HADAMARD(64)
+INSTANTIATE_HADAMARD(128)
+INSTANTIATE_HADAMARD(256)
+
+#undef INSTANTIATE_HADAMARD
+
 // Datacenter GPU detection: Hopper (sm_90) and Blackwell datacenter (sm_100).
 // NOTE: sm_120 (RTX 5090, Blackwell consumer) lacks TMA/wgmma — must NOT match.
 #if defined(__CUDA_ARCH__)
diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
@@ -796,6 +796,26 @@ MAKE_KBIT_SCALAR_GEMV_V2_FP16ABS(5)
 // Debug MMA test
 void testMMA(const half*, const half*, float*);
 
+// Forward declarations of hadamard rotation template
+template <int BLOCK_SIZE, typename T>
+void hadamardRotate(T* data, int n, cudaStream_t stream);
+
+// Unmangled hadamard rotation wrappers (dispatch block_size at runtime)
+#define MAKE_HADAMARD_ROTATE(tname, T)                                                                                 \
+    void hadamard_rotate_##tname(T* data, int n, int block_size, cudaStream_t stream) {                                \
+        switch (block_size) {                                                                                          \
+        case 32: hadamardRotate<32, T>(data, n, stream); break;                                                        \
+        case 64: hadamardRotate<64, T>(data, n, stream); break;                                                        \
+        case 128: hadamardRotate<128, T>(data, n, stream); break;                                                      \
+        case 256: hadamardRotate<256, T>(data, n, stream); break;                                                      \
+        }                                                                                                              \
+    }
+
+MAKE_HADAMARD_ROTATE(fp16, half)
+MAKE_HADAMARD_ROTATE(bf16, __nv_bfloat16)
+
+#undef MAKE_HADAMARD_ROTATE
+
 #endif // BUILD_CUDA || BUILD_HIP (kbit unmangled)
 
 extern "C" {
@@ -1664,5 +1684,14 @@ MAKE_CKBIT_SCALAR_GEMV_V2_FP16ABS(3)
 MAKE_CKBIT_SCALAR_GEMV_V2_FP16ABS(4)
 MAKE_CKBIT_SCALAR_GEMV_V2_FP16ABS(5)
 
+// Hadamard rotation extern C wrappers
+void chadamard_rotate_fp16(half* data, int n, int block_size, cudaStream_t stream) {
+    hadamard_rotate_fp16(data, n, block_size, stream);
+}
+
+void chadamard_rotate_bf16(__nv_bfloat16* data, int n, int block_size, cudaStream_t stream) {
+    hadamard_rotate_bf16(data, n, block_size, stream);
+}
+
 #endif
 }