@@ -320,6 +320,110 @@ __global__ void kDequantizeNVFP4(
320320 output[element_idx + 1 ] = (T)val1;
321321}
322322
// ============================================================================
// Block-diagonal Hadamard rotation kernel (Had16)
// Applies a 16x16 normalized Hadamard transform to each consecutive
// 16-element chunk of `data`, in place, via the Fast Walsh-Hadamard
// Transform (FWHT): 4 butterfly stages (stride 8, 4, 2, 1), then
// normalization by 1/sqrt(16) = 0.25 so the transform is orthonormal.
// Works on FP16/BF16/FP32 tensors; arithmetic is done in float.
//
// Launch requirements:
//   - blockDim.x must be a multiple of 32 so every 16-element Hadamard
//     group lies entirely inside one warp (shuffles never cross warps).
//   - n should be a multiple of 16; lanes past n participate in the
//     shuffles with 0.0f but never store.
// requires SM70+ (*_sync warp shuffles)
// ============================================================================
template <typename T>
__global__ void kHadamardRotate16(
    T* __restrict__ data,
    const int n
) {
    // One thread per element; 16 consecutive lanes form one Hadamard block.
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // NOTE: no early return. Every lane of the warp must reach each
    // __shfl_xor_sync below, because the participation mask names the full
    // warp; letting tail lanes exit first is undefined behavior on Volta+.
    // Out-of-range lanes carry 0.0f and skip the final store instead.
    const bool in_bounds = (tid < n);
    float val = in_bounds ? (float)data[tid] : 0.0f;

    // Fast Walsh-Hadamard Transform: 4 butterfly stages within the warp.
    // (tid & stride) selects the subtract leg of each butterfly.
#pragma unroll
    for (int stride = 8; stride >= 1; stride >>= 1) {
        float other = __shfl_xor_sync(0xFFFFFFFF, val, stride);
        val = (tid & stride) ? (other - val) : (val + other);
    }

    // Normalize by 1/sqrt(16) = 0.25 to keep the transform orthonormal.
    val *= 0.25f;

    if (in_bounds)
        data[tid] = (T)val;
}
358+
// ============================================================================
// Fused Hadamard rotation + NVFP4 quantization kernel
// Applies the 16-point FWHT rotation (see kHadamardRotate16) and two-level
// NVFP4 quantization (per-16-element E4M3 block scale + E2M1 element codes)
// in a single pass over the input.
//
// Outputs:
//   output       — packed FP4 codes, 2 per byte (even lane = low nibble,
//                  odd lane = high nibble), n/2 bytes
//   block_scales — one E4M3 scale per 16-element block, n/16 bytes
//
// Launch requirements:
//   - blockDim.x must be a multiple of 32 (warp shuffles must not cross a
//     warp boundary).
//   - n should be a multiple of 16; lanes past n participate in the
//     shuffles with 0.0f but never store.
// requires SM70+ (*_sync warp shuffles)
// ============================================================================
template <typename T>
__global__ void kFusedHadamardQuantizeNVFP4(
    const T* __restrict__ input,
    unsigned char* __restrict__ output,        // Packed FP4: n/2 bytes
    unsigned char* __restrict__ block_scales,  // E4M3 scales: n/16 bytes
    const float tensor_scale,
    const int n
) {
    // One thread per element for the transform; pairs of threads then pack
    // two 4-bit codes into one output byte.
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // NOTE: no early return. Every lane of the warp must reach each
    // __shfl_xor_sync below, because the participation mask names the full
    // warp; letting tail lanes exit first is undefined behavior on Volta+.
    // Out-of-range lanes carry 0.0f (which also leaves the block absmax
    // unaffected) and skip all stores.
    const bool in_bounds = (tid < n);
    float val = in_bounds ? (float)input[tid] : 0.0f;

    // Hadamard rotation: FWHT, 4 butterfly stages within the warp.
#pragma unroll
    for (int stride = 8; stride >= 1; stride >>= 1) {
        float other = __shfl_xor_sync(0xFFFFFFFF, val, stride);
        val = (tid & stride) ? (other - val) : (val + other);
    }
    val *= 0.25f;  // 1/sqrt(16): keep the transform orthonormal

    // First-level scaling: divide by the global tensor scale (guard /0).
    float inv_tensor_scale = (tensor_scale > 0.0f) ? (1.0f / tensor_scale) : 0.0f;
    float scaled_val = val * inv_tensor_scale;

    // Block absmax over the 16 lanes of this Hadamard block.
    float local_max = fabsf(scaled_val);
#pragma unroll
    for (int offset = 8; offset >= 1; offset >>= 1) {
        float other = __shfl_xor_sync(0xFFFFFFFF, local_max, offset);
        local_max = fmaxf(local_max, other);
    }

    // Second-level scale: absmax / 6.0 (6.0 is the largest E2M1 magnitude),
    // round-tripped through E4M3 so encode and decode agree exactly.
    float block_scale_f32 = local_max / 6.0f;
    unsigned char block_scale_e4m3 = dFloatToE4M3(block_scale_f32);
    float block_scale_deq = dE4M3ToFloat(block_scale_e4m3);
    float inv_block_scale = (block_scale_deq > 0.0f) ? (1.0f / block_scale_deq) : 0.0f;

    // First lane of each 16-lane group owns the block-scale store.
    if (in_bounds && (tid & 15) == 0)
        block_scales[tid / 16] = block_scale_e4m3;

    // Quantize to E2M1.
    unsigned char q = dQuantizeNVFP4(scaled_val * inv_block_scale);

    // Pack pairs: fetch the partner lane's code, even lane writes the byte
    // (self in the low nibble, odd partner in the high nibble).
    unsigned char partner_q = __shfl_xor_sync(0xFFFFFFFF, q, 1);
    if (in_bounds && (tid & 1) == 0)
        output[tid / 2] = ((partner_q & 0x0F) << 4) | (q & 0x0F);
}
426+
323427__device__ unsigned char dQuantizeNF4 (float x) {
324428
325429 // the values for this tree was generated by test_normal_map_tree
@@ -2792,6 +2896,25 @@ template __global__ void kDequantizeNVFP4<float>(
27922896 const float tensor_scale, float * __restrict__ output, const int n
27932897);
27942898
// Explicit instantiations for the Hadamard rotation and fused
// Hadamard + NVFP4 quantization kernels (FP16 / BF16 / FP32), emitted
// through helper macros in the style used elsewhere in this file.
#define MAKE_kHadamardRotate16(gtype) \
    template __global__ void kHadamardRotate16<gtype>(gtype* __restrict__ data, const int n);

MAKE_kHadamardRotate16(half)
MAKE_kHadamardRotate16(__nv_bfloat16)
MAKE_kHadamardRotate16(float)

#define MAKE_kFusedHadamardQuantizeNVFP4(gtype) \
    template __global__ void kFusedHadamardQuantizeNVFP4<gtype>( \
        const gtype* __restrict__ input, unsigned char* __restrict__ output, \
        unsigned char* __restrict__ block_scales, const float tensor_scale, const int n \
    );

MAKE_kFusedHadamardQuantizeNVFP4(half)
MAKE_kFusedHadamardQuantizeNVFP4(__nv_bfloat16)
MAKE_kFusedHadamardQuantizeNVFP4(float)
2917+
27952918#define MAKE_OptimizerStatic8bit2StateBlockwise (oname, gtype, block_size, num_per_thread ) \
27962919 template __global__ void kOptimizerStatic8bit2StateBlockwise <gtype, oname, block_size, num_per_thread>( \
27972920 gtype * p, gtype* __restrict__ const g, unsigned char * state1, unsigned char * state2, const float beta1, \
0 commit comments