Skip to content

Commit 3a2cf58

Browse files
TimDettmers and claude
committed
feat: Add optional random sign flips to Hadamard rotation
Support randomized Hadamard transform R = H*D where D is a diagonal sign matrix. The sign vector (block_size/32 uint32 words as a bitmask) is applied element-wise before the butterfly stages. Since R is orthogonal (D^2=I), rotating both weights and activations with the same signs preserves the GEMM result. Random sign flips improve outlier destruction vs plain Hadamard by breaking deterministic alignment patterns. Generate signs once per model with torch.randint(0, 2**32, (block_size//32,), dtype=torch.int32). Passing signs=None preserves the previous behavior (plain Hadamard). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 22148c7 commit 3a2cf58

File tree

5 files changed

+64
-28
lines changed

5 files changed

+64
-28
lines changed

bitsandbytes/_ops.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -588,12 +588,12 @@ def _(
588588

589589
torch.library.define(
590590
"bitsandbytes::hadamard_rotate_",
591-
"(Tensor(a!) data, int block_size) -> Tensor(a!)",
591+
"(Tensor(a!) data, int block_size, Tensor? signs) -> Tensor(a!)",
592592
)
593593

594594

595595
@register_fake("bitsandbytes::hadamard_rotate_")
596-
def _(data: torch.Tensor, block_size: int) -> torch.Tensor:
596+
def _(data: torch.Tensor, block_size: int, signs: Optional[torch.Tensor]) -> torch.Tensor:
597597
torch._check(
598598
block_size in (32, 64, 128, 256),
599599
lambda: f"block_size must be 32, 64, 128, or 256, got {block_size}",
@@ -602,6 +602,15 @@ def _(data: torch.Tensor, block_size: int) -> torch.Tensor:
602602
data.dtype in (torch.float16, torch.bfloat16),
603603
lambda: f"hadamard_rotate only supports float16/bfloat16, got {data.dtype}",
604604
)
605+
if signs is not None:
606+
torch._check(
607+
signs.dtype == torch.int32,
608+
lambda: f"signs must be int32, got {signs.dtype}",
609+
)
610+
torch._check(
611+
signs.numel() == block_size // 32,
612+
lambda: f"signs must have {block_size // 32} elements for block_size={block_size}, got {signs.numel()}",
613+
)
605614
return data
606615

607616

bitsandbytes/backends/cuda/ops.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1001,7 +1001,7 @@ def _(
10011001

10021002

10031003
@register_kernel("bitsandbytes::hadamard_rotate_", "cuda")
1004-
def _(data: torch.Tensor, block_size: int) -> torch.Tensor:
1004+
def _(data: torch.Tensor, block_size: int, signs: Optional[torch.Tensor]) -> torch.Tensor:
10051005
torch._check(
10061006
block_size in (32, 64, 128, 256),
10071007
lambda: f"block_size must be 32, 64, 128, or 256, got {block_size}",
@@ -1012,12 +1012,14 @@ def _(data: torch.Tensor, block_size: int) -> torch.Tensor:
10121012
)
10131013

10141014
tname = _KBIT_DTYPE_SUFFIX[data.dtype]
1015+
signs_ptr = get_ptr(signs) if signs is not None else None
10151016
with _cuda_device_of(data):
10161017
fn = getattr(lib, f"chadamard_rotate_{tname}")
10171018
fn(
10181019
get_ptr(data),
10191020
ct.c_int(data.numel()),
10201021
ct.c_int(block_size),
1022+
signs_ptr,
10211023
_get_tensor_stream(data),
10221024
)
10231025

bitsandbytes/functional.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,22 +1135,30 @@ def decode_absmax_e4m4(encoded: Tensor, bias: int = 11) -> Tensor:
11351135
return result
11361136

11371137

1138-
def hadamard_rotate(data: Tensor, block_size: int = 32) -> Tensor:
1139-
"""Apply in-place Walsh-Hadamard rotation to contiguous blocks.
1138+
def hadamard_rotate(
1139+
data: Tensor,
1140+
block_size: int = 32,
1141+
signs: Optional[Tensor] = None,
1142+
) -> Tensor:
1143+
"""Apply in-place randomized Walsh-Hadamard rotation (H*D) to contiguous blocks.
11401144
11411145
Spreads outliers across quantization blocks, improving kbit accuracy.
1142-
Since H is orthogonal, rotating both weights and activations preserves
1143-
the GEMM result: H(A) @ H(B)^T = A @ B^T.
1146+
Since H*D is orthogonal, rotating both weights and activations with the
1147+
same signs preserves the GEMM result: (H*D)(A) @ (H*D)(B)^T = A @ B^T.
11441148
11451149
Args:
11461150
data: Input tensor (float16 or bfloat16). Modified in-place.
11471151
block_size: Rotation block size (32, 64, 128, or 256).
1152+
signs: Optional int32 tensor of block_size//32 words. Each bit controls
1153+
the sign flip for one element within the block. If None, no sign
1154+
flips are applied (plain Hadamard). Generate once per model with
1155+
``torch.randint(0, 2**32, (block_size // 32,), dtype=torch.int32)``.
11481156
11491157
Returns:
11501158
The input tensor, rotated in-place.
11511159
"""
11521160
data_flat = data.contiguous().view(-1)
1153-
torch.ops.bitsandbytes.hadamard_rotate_(data_flat, block_size)
1161+
torch.ops.bitsandbytes.hadamard_rotate_(data_flat, block_size, signs)
11541162
return data
11551163

11561164

csrc/ops.cu

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,20 +1014,25 @@ void repackKbit(
10141014
// ===========================================================================
10151015
// Hadamard rotation kernel (in-place, blocksize-templated)
10161016
//
1017-
// Applies a Walsh-Hadamard transform to contiguous blocks of BLOCK_SIZE
1018-
// elements. Used to spread outliers before kbit quantization.
1019-
// Since H is orthogonal, rotating both weights and activations preserves
1020-
// the GEMM result: H(A) @ H(B)^T = A @ B^T.
1017+
// Applies a randomized Walsh-Hadamard transform (H*D) to contiguous blocks
1018+
// of BLOCK_SIZE elements. D is a diagonal sign-flip matrix (optional).
1019+
// Used to spread outliers before kbit quantization.
1020+
// Since H*D is orthogonal, rotating both weights and activations preserves
1021+
// the GEMM result: (H*D)(A) @ (H*D)(B)^T = A @ B^T.
10211022
//
10221023
// One warp per rotation block:
10231024
// BLOCK_SIZE=32: 1 elem/thread, 5 shuffle stages
10241025
// BLOCK_SIZE=64: 2 elem/thread, 1 register + 5 shuffle stages
10251026
// BLOCK_SIZE=128: 4 elem/thread, 2 register + 5 shuffle stages
10261027
// BLOCK_SIZE=256: 8 elem/thread, 3 register + 5 shuffle stages
1028+
//
1029+
// signs: optional bitmask of BLOCK_SIZE/32 uint32 words. If non-null, bit i
1030+
// set means element i is negated before the Hadamard butterfly. Same sign
1031+
// vector is applied to every block.
10271032
// ===========================================================================
10281033

10291034
template <int BLOCK_SIZE, typename T>
1030-
__global__ void kHadamardRotate(T* __restrict__ data, const int n) {
1035+
__global__ void kHadamardRotate(T* __restrict__ data, const int n, const unsigned int* __restrict__ signs) {
10311036
constexpr int ELEMS_PER_THREAD = BLOCK_SIZE / 32;
10321037
static_assert(BLOCK_SIZE >= 32 && (BLOCK_SIZE & (BLOCK_SIZE - 1)) == 0,
10331038
"BLOCK_SIZE must be a power of 2 >= 32");
@@ -1048,6 +1053,16 @@ __global__ void kHadamardRotate(T* __restrict__ data, const int n) {
10481053
vals[j] = (idx < n) ? (float)data[idx] : 0.0f;
10491054
}
10501055

1056+
// Apply random sign flips (D matrix) before butterfly.
1057+
// Element at position lane_id + j*32 uses word j, bit lane_id.
1058+
if (signs != nullptr) {
1059+
#pragma unroll
1060+
for (int j = 0; j < ELEMS_PER_THREAD; j++) {
1061+
if (signs[j] & (1u << lane_id))
1062+
vals[j] = -vals[j];
1063+
}
1064+
}
1065+
10511066
// In-register butterfly stages (strides >= 32).
10521067
// Stride S in global space corresponds to element index s = S/32.
10531068
// Element j pairs with element j ^ s (both in the same thread).
@@ -1093,17 +1108,17 @@ __global__ void kHadamardRotate(T* __restrict__ data, const int n) {
10931108
// ---- Hadamard rotation launch wrapper ----
10941109

10951110
template <int BLOCK_SIZE, typename T>
1096-
void hadamardRotate(T* data, int n, cudaStream_t stream) {
1111+
void hadamardRotate(T* data, int n, const unsigned int* signs, cudaStream_t stream) {
10971112
const int num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
10981113
const int num_cuda_blocks = (num_blocks + KBIT_WARPS_PER_BLOCK - 1) / KBIT_WARPS_PER_BLOCK;
1099-
kHadamardRotate<BLOCK_SIZE, T><<<num_cuda_blocks, KBIT_THREADS_PER_BLOCK, 0, stream>>>(data, n);
1114+
kHadamardRotate<BLOCK_SIZE, T><<<num_cuda_blocks, KBIT_THREADS_PER_BLOCK, 0, stream>>>(data, n, signs);
11001115
CUDA_CHECK_RETURN(cudaPeekAtLastError());
11011116
}
11021117

11031118
// Explicit instantiations: 4 block sizes x 2 dtypes
1104-
#define INSTANTIATE_HADAMARD(BS) \
1105-
template void hadamardRotate<BS, half>(half*, int, cudaStream_t); \
1106-
template void hadamardRotate<BS, __nv_bfloat16>(__nv_bfloat16*, int, cudaStream_t);
1119+
#define INSTANTIATE_HADAMARD(BS) \
1120+
template void hadamardRotate<BS, half>(half*, int, const unsigned int*, cudaStream_t); \
1121+
template void hadamardRotate<BS, __nv_bfloat16>(__nv_bfloat16*, int, const unsigned int*, cudaStream_t);
11071122

11081123
INSTANTIATE_HADAMARD(32)
11091124
INSTANTIATE_HADAMARD(64)

csrc/pythonInterface.cpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -798,16 +798,16 @@ void testMMA(const half*, const half*, float*);
798798

799799
// Forward declarations of hadamard rotation template
800800
template <int BLOCK_SIZE, typename T>
801-
void hadamardRotate(T* data, int n, cudaStream_t stream);
801+
void hadamardRotate(T* data, int n, const unsigned int* signs, cudaStream_t stream);
802802

803803
// Unmangled hadamard rotation wrappers (dispatch block_size at runtime)
804804
#define MAKE_HADAMARD_ROTATE(tname, T) \
805-
void hadamard_rotate_##tname(T* data, int n, int block_size, cudaStream_t stream) { \
805+
void hadamard_rotate_##tname(T* data, int n, int block_size, const unsigned int* signs, cudaStream_t stream) { \
806806
switch (block_size) { \
807-
case 32: hadamardRotate<32, T>(data, n, stream); break; \
808-
case 64: hadamardRotate<64, T>(data, n, stream); break; \
809-
case 128: hadamardRotate<128, T>(data, n, stream); break; \
810-
case 256: hadamardRotate<256, T>(data, n, stream); break; \
807+
case 32: hadamardRotate<32, T>(data, n, signs, stream); break; \
808+
case 64: hadamardRotate<64, T>(data, n, signs, stream); break; \
809+
case 128: hadamardRotate<128, T>(data, n, signs, stream); break; \
810+
case 256: hadamardRotate<256, T>(data, n, signs, stream); break; \
811811
} \
812812
}
813813

@@ -1685,12 +1685,14 @@ MAKE_CKBIT_SCALAR_GEMV_V2_FP16ABS(4)
16851685
MAKE_CKBIT_SCALAR_GEMV_V2_FP16ABS(5)
16861686

16871687
// Hadamard rotation extern C wrappers
1688-
void chadamard_rotate_fp16(half* data, int n, int block_size, cudaStream_t stream) {
1689-
hadamard_rotate_fp16(data, n, block_size, stream);
1688+
void chadamard_rotate_fp16(half* data, int n, int block_size, const unsigned int* signs, cudaStream_t stream) {
1689+
hadamard_rotate_fp16(data, n, block_size, signs, stream);
16901690
}
16911691

1692-
void chadamard_rotate_bf16(__nv_bfloat16* data, int n, int block_size, cudaStream_t stream) {
1693-
hadamard_rotate_bf16(data, n, block_size, stream);
1692+
void chadamard_rotate_bf16(
1693+
__nv_bfloat16* data, int n, int block_size, const unsigned int* signs, cudaStream_t stream
1694+
) {
1695+
hadamard_rotate_bf16(data, n, block_size, signs, stream);
16941696
}
16951697

16961698
#endif

0 commit comments

Comments (0)