Skip to content

Commit fcfca9f

Browse files
TimDettmers and claude committed
refactor: Remove random sign flips from Hadamard rotation
Simplify the Hadamard rotation API by removing the optional signs parameter. Plain Walsh-Hadamard is sufficient for outlier spreading and keeps the interface minimal. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 28fa6c2 commit fcfca9f

File tree

6 files changed

+29
-124
lines changed

6 files changed

+29
-124
lines changed

bitsandbytes/_ops.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -588,12 +588,12 @@ def _(
588588

589589
torch.library.define(
590590
"bitsandbytes::hadamard_rotate_",
591-
"(Tensor(a!) data, int block_size, Tensor? signs) -> Tensor(a!)",
591+
"(Tensor(a!) data, int block_size) -> Tensor(a!)",
592592
)
593593

594594

595595
@register_fake("bitsandbytes::hadamard_rotate_")
596-
def _(data: torch.Tensor, block_size: int, signs: Optional[torch.Tensor]) -> torch.Tensor:
596+
def _(data: torch.Tensor, block_size: int) -> torch.Tensor:
597597
torch._check(
598598
block_size in (32, 64, 128, 256),
599599
lambda: f"block_size must be 32, 64, 128, or 256, got {block_size}",
@@ -602,15 +602,6 @@ def _(data: torch.Tensor, block_size: int, signs: Optional[torch.Tensor]) -> tor
602602
data.dtype in (torch.float16, torch.bfloat16),
603603
lambda: f"hadamard_rotate only supports float16/bfloat16, got {data.dtype}",
604604
)
605-
if signs is not None:
606-
torch._check(
607-
signs.dtype == torch.int32,
608-
lambda: f"signs must be int32, got {signs.dtype}",
609-
)
610-
torch._check(
611-
signs.numel() == block_size // 32,
612-
lambda: f"signs must have {block_size // 32} elements for block_size={block_size}, got {signs.numel()}",
613-
)
614605
return data
615606

616607

bitsandbytes/backends/cuda/ops.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,7 +1001,7 @@ def _(
10011001

10021002

10031003
@register_kernel("bitsandbytes::hadamard_rotate_", "cuda")
1004-
def _(data: torch.Tensor, block_size: int, signs: Optional[torch.Tensor]) -> torch.Tensor:
1004+
def _(data: torch.Tensor, block_size: int) -> torch.Tensor:
10051005
torch._check(
10061006
block_size in (32, 64, 128, 256),
10071007
lambda: f"block_size must be 32, 64, 128, or 256, got {block_size}",
@@ -1012,14 +1012,12 @@ def _(data: torch.Tensor, block_size: int, signs: Optional[torch.Tensor]) -> tor
10121012
)
10131013

10141014
tname = _KBIT_DTYPE_SUFFIX[data.dtype]
1015-
signs_ptr = get_ptr(signs) if signs is not None else None
10161015
with _cuda_device_of(data):
10171016
fn = getattr(lib, f"chadamard_rotate_{tname}")
10181017
fn(
10191018
get_ptr(data),
10201019
ct.c_int(data.numel()),
10211020
ct.c_int(block_size),
1022-
signs_ptr,
10231021
_get_tensor_stream(data),
10241022
)
10251023

bitsandbytes/functional.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,30 +1135,22 @@ def decode_absmax_e4m4(encoded: Tensor, bias: int = 11) -> Tensor:
11351135
return result
11361136

11371137

1138-
def hadamard_rotate(
1139-
data: Tensor,
1140-
block_size: int = 32,
1141-
signs: Optional[Tensor] = None,
1142-
) -> Tensor:
1143-
"""Apply in-place randomized Walsh-Hadamard rotation (H*D) to contiguous blocks.
1138+
def hadamard_rotate(data: Tensor, block_size: int = 32) -> Tensor:
1139+
"""Apply in-place Walsh-Hadamard rotation to contiguous blocks.
11441140
11451141
Spreads outliers across quantization blocks, improving kbit accuracy.
1146-
Since H*D is orthogonal, rotating both weights and activations with the
1147-
same signs preserves the GEMM result: (H*D)(A) @ (H*D)(B)^T = A @ B^T.
1142+
Since H is orthogonal, rotating both weights and activations preserves
1143+
the GEMM result: H(A) @ H(B)^T = A @ B^T.
11481144
11491145
Args:
11501146
data: Input tensor (float16 or bfloat16). Modified in-place.
11511147
block_size: Rotation block size (32, 64, 128, or 256).
1152-
signs: Optional int32 tensor of block_size//32 words. Each bit controls
1153-
the sign flip for one element within the block. If None, no sign
1154-
flips are applied (plain Hadamard). Generate once per model with
1155-
``torch.randint(0, 2**32, (block_size // 32,), dtype=torch.int32)``.
11561148
11571149
Returns:
11581150
The input tensor, rotated in-place.
11591151
"""
11601152
data_flat = data.contiguous().view(-1)
1161-
torch.ops.bitsandbytes.hadamard_rotate_(data_flat, block_size, signs)
1153+
torch.ops.bitsandbytes.hadamard_rotate_(data_flat, block_size)
11621154
return data
11631155

11641156

csrc/ops.cu

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,25 +1015,19 @@ void repackKbit(
10151015
// ===========================================================================
10161016
// Hadamard rotation kernel (in-place, blocksize-templated)
10171017
//
1018-
// Applies a randomized Walsh-Hadamard transform (H*D) to contiguous blocks
1019-
// of BLOCK_SIZE elements. D is a diagonal sign-flip matrix (optional).
1020-
// Used to spread outliers before kbit quantization.
1021-
// Since H*D is orthogonal, rotating both weights and activations preserves
1022-
// the GEMM result: (H*D)(A) @ (H*D)(B)^T = A @ B^T.
1018+
// Applies a Walsh-Hadamard transform to contiguous blocks of BLOCK_SIZE
1019+
// elements. Used to spread outliers before kbit quantization.
1020+
// Since H is orthogonal, rotating both weights and activations preserves
1021+
// the GEMM result: H(A) @ H(B)^T = A @ B^T.
10231022
//
10241023
// One warp per rotation block:
10251024
// BLOCK_SIZE=32: 1 elem/thread, 5 shuffle stages
10261025
// BLOCK_SIZE=64: 2 elem/thread, 1 register + 5 shuffle stages
10271026
// BLOCK_SIZE=128: 4 elem/thread, 2 register + 5 shuffle stages
10281027
// BLOCK_SIZE=256: 8 elem/thread, 3 register + 5 shuffle stages
1029-
//
1030-
// signs: optional bitmask of BLOCK_SIZE/32 uint32 words. If non-null, bit i
1031-
// set means element i is negated before the Hadamard butterfly. Same sign
1032-
// vector is applied to every block.
10331028
// ===========================================================================
10341029

1035-
template <int BLOCK_SIZE, typename T>
1036-
__global__ void kHadamardRotate(T* __restrict__ data, const int n, const unsigned int* __restrict__ signs) {
1030+
template <int BLOCK_SIZE, typename T> __global__ void kHadamardRotate(T* __restrict__ data, const int n) {
10371031
constexpr int ELEMS_PER_THREAD = BLOCK_SIZE / 32;
10381032
static_assert(BLOCK_SIZE >= 32 && (BLOCK_SIZE & (BLOCK_SIZE - 1)) == 0, "BLOCK_SIZE must be a power of 2 >= 32");
10391033

@@ -1053,16 +1047,6 @@ __global__ void kHadamardRotate(T* __restrict__ data, const int n, const unsigne
10531047
vals[j] = (idx < n) ? (float)data[idx] : 0.0f;
10541048
}
10551049

1056-
// Apply random sign flips (D matrix) before butterfly.
1057-
// Element at position lane_id + j*32 uses word j, bit lane_id.
1058-
if (signs != nullptr) {
1059-
#pragma unroll
1060-
for (int j = 0; j < ELEMS_PER_THREAD; j++) {
1061-
if (signs[j] & (1u << lane_id))
1062-
vals[j] = -vals[j];
1063-
}
1064-
}
1065-
10661050
// In-register butterfly stages (strides >= 32).
10671051
// Stride S in global space corresponds to element index s = S/32.
10681052
// Element j pairs with element j ^ s (both in the same thread).
@@ -1107,18 +1091,17 @@ __global__ void kHadamardRotate(T* __restrict__ data, const int n, const unsigne
11071091

11081092
// ---- Hadamard rotation launch wrapper ----
11091093

1110-
template <int BLOCK_SIZE, typename T>
1111-
void hadamardRotate(T* data, int n, const unsigned int* signs, cudaStream_t stream) {
1094+
template <int BLOCK_SIZE, typename T> void hadamardRotate(T* data, int n, cudaStream_t stream) {
11121095
const int num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
11131096
const int num_cuda_blocks = (num_blocks + KBIT_WARPS_PER_BLOCK - 1) / KBIT_WARPS_PER_BLOCK;
1114-
kHadamardRotate<BLOCK_SIZE, T><<<num_cuda_blocks, KBIT_THREADS_PER_BLOCK, 0, stream>>>(data, n, signs);
1097+
kHadamardRotate<BLOCK_SIZE, T><<<num_cuda_blocks, KBIT_THREADS_PER_BLOCK, 0, stream>>>(data, n);
11151098
CUDA_CHECK_RETURN(cudaPeekAtLastError());
11161099
}
11171100

11181101
// Explicit instantiations: 4 block sizes x 2 dtypes
11191102
#define INSTANTIATE_HADAMARD(BS) \
1120-
template void hadamardRotate<BS, half>(half*, int, const unsigned int*, cudaStream_t); \
1121-
template void hadamardRotate<BS, __nv_bfloat16>(__nv_bfloat16*, int, const unsigned int*, cudaStream_t);
1103+
template void hadamardRotate<BS, half>(half*, int, cudaStream_t); \
1104+
template void hadamardRotate<BS, __nv_bfloat16>(__nv_bfloat16*, int, cudaStream_t);
11221105

11231106
INSTANTIATE_HADAMARD(32)
11241107
INSTANTIATE_HADAMARD(64)

csrc/pythonInterface.cpp

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -800,24 +800,23 @@ MAKE_KBIT_SCALAR_GEMV_V2_FP16ABS(5)
800800
void testMMA(const half*, const half*, float*);
801801

802802
// Forward declarations of hadamard rotation template
803-
template <int BLOCK_SIZE, typename T>
804-
void hadamardRotate(T* data, int n, const unsigned int* signs, cudaStream_t stream);
803+
template <int BLOCK_SIZE, typename T> void hadamardRotate(T* data, int n, cudaStream_t stream);
805804

806805
// Unmangled hadamard rotation wrappers (dispatch block_size at runtime)
807806
#define MAKE_HADAMARD_ROTATE(tname, T) \
808-
void hadamard_rotate_##tname(T* data, int n, int block_size, const unsigned int* signs, cudaStream_t stream) { \
807+
void hadamard_rotate_##tname(T* data, int n, int block_size, cudaStream_t stream) { \
809808
switch (block_size) { \
810809
case 32: \
811-
hadamardRotate<32, T>(data, n, signs, stream); \
810+
hadamardRotate<32, T>(data, n, stream); \
812811
break; \
813812
case 64: \
814-
hadamardRotate<64, T>(data, n, signs, stream); \
813+
hadamardRotate<64, T>(data, n, stream); \
815814
break; \
816815
case 128: \
817-
hadamardRotate<128, T>(data, n, signs, stream); \
816+
hadamardRotate<128, T>(data, n, stream); \
818817
break; \
819818
case 256: \
820-
hadamardRotate<256, T>(data, n, signs, stream); \
819+
hadamardRotate<256, T>(data, n, stream); \
821820
break; \
822821
} \
823822
}
@@ -1699,12 +1698,12 @@ MAKE_CKBIT_SCALAR_GEMV_V2_FP16ABS(4)
16991698
MAKE_CKBIT_SCALAR_GEMV_V2_FP16ABS(5)
17001699

17011700
// Hadamard rotation extern C wrappers
1702-
void chadamard_rotate_fp16(half* data, int n, int block_size, const unsigned int* signs, cudaStream_t stream) {
1703-
hadamard_rotate_fp16(data, n, block_size, signs, stream);
1701+
void chadamard_rotate_fp16(half* data, int n, int block_size, cudaStream_t stream) {
1702+
hadamard_rotate_fp16(data, n, block_size, stream);
17041703
}
17051704

1706-
void chadamard_rotate_bf16(__nv_bfloat16* data, int n, int block_size, const unsigned int* signs, cudaStream_t stream) {
1707-
hadamard_rotate_bf16(data, n, block_size, signs, stream);
1705+
void chadamard_rotate_bf16(__nv_bfloat16* data, int n, int block_size, cudaStream_t stream) {
1706+
hadamard_rotate_bf16(data, n, block_size, stream);
17081707
}
17091708

17101709
#endif

tests/test_hadamard.py

Lines changed: 2 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111

1212
class TestOrthogonality:
13-
"""H(H(x)) ≈ x for plain Hadamard (no signs)."""
13+
"""H(H(x)) ≈ x Hadamard is its own inverse (involutory)."""
1414

1515
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
1616
@pytest.mark.parametrize("dtype", DTYPES)
@@ -34,41 +34,12 @@ def test_double_apply_large(self, block_size, dtype):
3434
torch.testing.assert_close(x, x_orig, atol=atol, rtol=atol)
3535

3636

37-
class TestSignedOrthogonality:
38-
"""Randomized Hadamard: R=H*D is orthogonal (R^T*R=I)."""
39-
40-
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
41-
@pytest.mark.parametrize("dtype", DTYPES)
42-
def test_signed_inverse(self, block_size, dtype):
43-
"""Verify inv(H*D) = D*H: forward then inverse recovers original."""
44-
signs = torch.randint(0, 2**31, (block_size // 32,), dtype=torch.int32, device="cuda")
45-
x = torch.randn(1024, dtype=dtype, device="cuda")
46-
x_orig = x.clone()
47-
48-
# Forward: H*D*x
49-
hadamard_rotate(x, block_size=block_size, signs=signs)
50-
51-
# Inverse: D*H*x' = first apply H (no signs), then sign flip
52-
hadamard_rotate(x, block_size=block_size) # H
53-
# Apply D (sign flip)
54-
x_flat = x.view(-1)
55-
for j in range(block_size // 32):
56-
word = signs[j].item()
57-
for bit in range(32):
58-
if word & (1 << bit):
59-
pos = j * 32 + bit
60-
x_flat[pos::block_size] *= -1
61-
62-
atol = 1e-2 if dtype == torch.bfloat16 else 1e-3
63-
torch.testing.assert_close(x, x_orig, atol=atol, rtol=atol)
64-
65-
6637
class TestGEMMEquivalence:
6738
"""H(A) @ H(B)^T ≈ A @ B^T (within quantization tolerance)."""
6839

6940
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
7041
@pytest.mark.parametrize("dtype", DTYPES)
71-
def test_gemm_plain(self, block_size, dtype):
42+
def test_gemm(self, block_size, dtype):
7243
M, K, N = 4, 256, 8
7344
A = torch.randn(M, K, dtype=dtype, device="cuda")
7445
B = torch.randn(N, K, dtype=dtype, device="cuda")
@@ -83,25 +54,6 @@ def test_gemm_plain(self, block_size, dtype):
8354
atol = 0.1 if dtype == torch.bfloat16 else 0.05
8455
torch.testing.assert_close(result, ref, atol=atol, rtol=0.05)
8556

86-
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
87-
@pytest.mark.parametrize("dtype", DTYPES)
88-
def test_gemm_signed(self, block_size, dtype):
89-
"""GEMM equivalence with random sign flips."""
90-
M, K, N = 4, 256, 8
91-
signs = torch.randint(0, 2**31, (block_size // 32,), dtype=torch.int32, device="cuda")
92-
A = torch.randn(M, K, dtype=dtype, device="cuda")
93-
B = torch.randn(N, K, dtype=dtype, device="cuda")
94-
ref = A.float() @ B.float().T
95-
96-
A_rot = A.clone()
97-
B_rot = B.clone()
98-
hadamard_rotate(A_rot, block_size=block_size, signs=signs)
99-
hadamard_rotate(B_rot, block_size=block_size, signs=signs)
100-
result = A_rot.float() @ B_rot.float().T
101-
102-
atol = 0.1 if dtype == torch.bfloat16 else 0.05
103-
torch.testing.assert_close(result, ref, atol=atol, rtol=0.05)
104-
10557
def test_gemm_qwen3_shapes(self):
10658
"""GEMM equivalence on Qwen3-Coder-Next 70B shapes."""
10759
shapes = [
@@ -194,16 +146,6 @@ def test_deterministic(self, block_size, dtype):
194146
hadamard_rotate(b, block_size=block_size)
195147
torch.testing.assert_close(a, b, atol=0, rtol=0)
196148

197-
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
198-
def test_deterministic_signed(self, block_size):
199-
signs = torch.randint(0, 2**31, (block_size // 32,), dtype=torch.int32, device="cuda")
200-
x = torch.randn(1024, dtype=torch.float16, device="cuda")
201-
a = x.clone()
202-
b = x.clone()
203-
hadamard_rotate(a, block_size=block_size, signs=signs)
204-
hadamard_rotate(b, block_size=block_size, signs=signs)
205-
torch.testing.assert_close(a, b, atol=0, rtol=0)
206-
207149

208150
class TestNormPreservation:
209151
"""Hadamard rotation preserves L2 norm (orthogonal transform)."""

0 commit comments

Comments (0)