Skip to content

Commit 4d8db39

Browse files
TimDettmers and claude
committed
feat: Add NVFP4 (E2M1) quantize/dequantize CUDA kernels
Implements two-level block-scaled NVFP4 quantization: - E2M1 quantize/dequantize device functions with decision-tree and LUT - E4M3 float conversion helpers for block scale factors - kQuantizeNVFP4: FP16/BF16/FP32 -> packed FP4 + E4M3 block scales - kDequantizeNVFP4: packed FP4 + scales -> FP16/BF16/FP32 - Host launchers, template instantiations, extern C symbols - NVFP4=3 added to DataType_t enum Block size fixed at 16 (hardware requirement). Two-level scaling: FP32 tensor_scale + unsigned E4M3 per-block scale. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 0ee29e5 commit 4d8db39

File tree

6 files changed

+389
-0
lines changed

6 files changed

+389
-0
lines changed

csrc/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ typedef enum DataType_t {
44
General8bit = 0,
55
FP4 = 1,
66
NF4 = 2,
7+
NVFP4 = 3,
78
} DataType_t;

csrc/kernels.cu

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,205 @@ __device__ unsigned char dQuantizeFP4(float x) {
121121

122122
__device__ __forceinline__ float dDequantizeNF4(unsigned char val) { return nf4_dequantization_lut[val & 0x0F]; }
123123

124+
// ============================================================================
// NVFP4 (E2M1) device functions
// E2M1 format: 1 sign + 2 exponent (bias=1) + 1 mantissa
// Representable magnitudes: {0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0}
// ============================================================================

// E2M1 dequantization LUT - maps 3-bit unsigned magnitude code to float
__device__ static float nvfp4_dequant_lut[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};

// Dequantize a 4-bit E2M1 code to float.
// Bit layout: [sign(1) | exponent(2) | mantissa(1)] — bit 3 carries the sign,
// bits 0-2 index the magnitude LUT above.
__device__ __forceinline__ float dDequantizeNVFP4(unsigned char val) {
    const float magnitude = nvfp4_dequant_lut[val & 0x07];
    return (val & 0x08) ? -magnitude : magnitude;
}
141+
142+
// Quantize a float to a 4-bit E2M1 code using round-to-nearest.
// Input should be pre-scaled so that the representable range [-6, 6] is
// appropriate. The magnitude code equals the number of decision boundaries
// (midpoints between adjacent representable values) strictly below |x|,
// which reproduces the original descending decision tree exactly.
__device__ unsigned char dQuantizeNVFP4(float x) {
    // Midpoints between adjacent E2M1 magnitudes {0, 0.5, 1, 1.5, 2, 3, 4, 6}
    const float midpoints[7] = {0.25f, 0.75f, 1.25f, 1.75f, 2.5f, 3.5f, 5.0f};

    const unsigned char sign_bit = (x < 0.0f) ? 0x08 : 0x00;
    const float ax = fabsf(x);

    unsigned char code = 0;
#pragma unroll
    for (int i = 0; i < 7; ++i)
        code += (ax > midpoints[i]) ? 1 : 0;

    return (unsigned char)(code | sign_bit);
}
169+
170+
// Convert positive float to unsigned E4M3 (8-bit: 4 exponent bits, bias=7, 3 mantissa bits)
// Range: [0, 448]. Used for NVFP4 block scale factors.
// Rounds to nearest; values at or above the max normal clamp to 0x7E so the
// NaN encoding (exp field 15, mant 7) is never produced.
__device__ unsigned char dFloatToE4M3(float x) {
    if (x <= 0.0f) return 0;
    if (x >= 448.0f) return 0x7E; // Max normal (exp field=15, mant=6). exp=15 mant=7 is NaN.

    unsigned int bits = __float_as_uint(x);
    int fp32_exp = ((bits >> 23) & 0xFF) - 127; // Unbiased FP32 exponent
    int e4m3_exp = fp32_exp + 7;                // E4M3 bias is 7

    if (e4m3_exp <= 0) {
        // Subnormal in E4M3: value = mantissa/8 * 2^(-6) = mantissa / 512
        int mant = __float2int_rn(x * 512.0f); // 512 = 8 * 2^6
        if (mant <= 0) return 0;
        // BUGFIX: rounding can carry into the smallest normal. Inputs just
        // below 2^-6 round to mant == 8, which is exactly 2^-6 and must be
        // encoded as exp=1, mant=0 (0x08); the previous clamp to 7 returned
        // 7/512 and under-rounded those inputs.
        if (mant >= 8) return 0x08;
        return (unsigned char)mant;
    }

    // Normal: extract top 3 mantissa bits with round-to-nearest
    unsigned int fp32_mant = bits & 0x7FFFFF;
    unsigned int mant_3bit = (fp32_mant + (1 << 19)) >> 20;

    // Mantissa rounding may carry into the exponent
    if (mant_3bit >= 8) {
        mant_3bit = 0;
        e4m3_exp++;
    }

    if (e4m3_exp > 15) return 0x7E;
    if (e4m3_exp == 15 && mant_3bit >= 7) return 0x7E; // Clamp, don't produce NaN

    return (unsigned char)((e4m3_exp << 3) | mant_3bit);
}
202+
203+
// Convert unsigned E4M3 byte to float.
// Exp field 0 covers both zero (mant 0) and subnormals, so no separate
// zero check is needed.
__device__ float dE4M3ToFloat(unsigned char val) {
    const int exp_field = (val >> 3) & 0x0F;
    const int mant_field = val & 0x07;

    // Subnormal (and zero): value = mant/8 * 2^(1-7) = mant / 512
    if (exp_field == 0)
        return (float)mant_field * (1.0f / 512.0f);

    // Normal: value = (1 + mant/8) * 2^(exp-7)
    return (1.0f + 0.125f * (float)mant_field) * exp2f((float)(exp_field - 7));
}
218+
219+
// ============================================================================
// NVFP4 quantization kernel
// Two-level scaling: FP32 tensor_scale + E4M3 block_scale (per 16 elements)
// Input: T* tensor, float tensor_scale (precomputed)
// Output: packed uint8 (2 values per byte), uint8 block_scales (E4M3)
//
// Launch contract: 1D grid; blockDim.x must be a multiple of 8 so each
// aligned 8-thread group owns one 16-element quantization block; at least
// ceil(n/2) threads in total.
// ============================================================================
template <typename T>
__global__ void kQuantizeNVFP4(
    const T* __restrict__ input,
    unsigned char* __restrict__ output,       // Packed FP4: ceil(n/2) bytes
    unsigned char* __restrict__ block_scales, // E4M3 scales: ceil(n/16) bytes
    const float tensor_scale,
    const int n
) {
    // Each thread handles 2 consecutive elements (packs into 1 byte);
    // 8 threads per 16-element quantization block.
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int element_idx = tid * 2;
    const bool active = (element_idx < n);

    // BUGFIX: no early return here. Every lane of the warp must reach the
    // __shfl_xor_sync below, because the full 0xFFFFFFFF participant mask is
    // only valid when all named lanes execute the intrinsic. The previous
    // version returned early for out-of-range threads, which is undefined
    // behavior for tail warps where element_idx >= n on some lanes.

    const float inv_tensor_scale = (tensor_scale > 0.0f) ? (1.0f / tensor_scale) : 0.0f;

    // Load 2 elements, divide by tensor_scale; out-of-range lanes contribute 0
    float val0 = active ? (float)input[element_idx] * inv_tensor_scale : 0.0f;
    float val1 = (element_idx + 1 < n) ? (float)input[element_idx + 1] * inv_tensor_scale : 0.0f;

    // Compute per-thread absmax
    float local_max = fmaxf(fabsf(val0), fabsf(val1));

    // Warp-shuffle reduction within each 8-thread quantization block.
    // Threads 0-7 handle block 0, 8-15 handle block 1, etc.
    // XOR offsets 4, 2, 1 stay within each aligned 8-thread group.
#pragma unroll
    for (int offset = 4; offset >= 1; offset >>= 1) {
        float other = __shfl_xor_sync(0xFFFFFFFF, local_max, offset);
        local_max = fmaxf(local_max, other);
    }

    // Compute E4M3 block scale: block_absmax / 6.0 (E2M1 max magnitude)
    float block_scale_f32 = local_max / 6.0f;
    unsigned char block_scale_e4m3 = dFloatToE4M3(block_scale_f32);
    float block_scale_deq = dE4M3ToFloat(block_scale_e4m3);

    // Avoid division by zero for all-zero blocks
    float inv_block_scale = (block_scale_deq > 0.0f) ? (1.0f / block_scale_deq) : 0.0f;

    // First thread of each 8-thread group stores the block scale. Lane 0 of a
    // group has the smallest element_idx of the group, so if it is inactive
    // the whole group is out of range and no scale slot exists for it.
    const int lane_in_block = threadIdx.x & 7;
    if (active && lane_in_block == 0)
        block_scales[element_idx / 16] = block_scale_e4m3;

    // Safe to exit now: no further warp-collective operations follow.
    if (!active) return;

    // Quantize values to E2M1 and pack:
    // low nibble = first element, high nibble = second element
    unsigned char q0 = dQuantizeNVFP4(val0 * inv_block_scale);
    unsigned char q1 = dQuantizeNVFP4(val1 * inv_block_scale);
    output[element_idx / 2] = (unsigned char)(((q1 & 0x0F) << 4) | (q0 & 0x0F));
}
283+
284+
// ============================================================================
// NVFP4 dequantization kernel
// Reverses the two-level scaling: unpacks FP4, multiplies by block_scale * tensor_scale
// One thread per packed byte (2 output elements).
// ============================================================================
template <typename T>
__global__ void kDequantizeNVFP4(
    const unsigned char* __restrict__ input,        // Packed FP4: ceil(n/2) bytes
    const unsigned char* __restrict__ block_scales, // E4M3 scales: ceil(n/16) bytes
    const float tensor_scale,
    T* __restrict__ output,
    const int n
) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int element_idx = 2 * tid;
    if (element_idx >= n)
        return;

    // Combined per-element scale: E4M3 block scale times the tensor scale
    const float scale = dE4M3ToFloat(block_scales[element_idx / 16]) * tensor_scale;

    // Unpack the two 4-bit codes (low nibble = first element) and dequantize.
    // The first store needs no extra bounds check: element_idx < n holds here.
    const unsigned char packed = input[element_idx / 2];
    output[element_idx] = (T)(dDequantizeNVFP4(packed & 0x0F) * scale);
    if (element_idx + 1 < n)
        output[element_idx + 1] = (T)(dDequantizeNVFP4((packed >> 4) & 0x0F) * scale);
}
322+
124323
__device__ unsigned char dQuantizeNF4(float x) {
125324

126325
// the values for this tree was generated by test_normal_map_tree
@@ -2567,6 +2766,32 @@ template __global__ void kDequantizeBlockwise<__nv_bfloat16, 512, 64, 8, NF4>(
25672766
float* code, unsigned char* A, float* absmax, __nv_bfloat16* out, const int blocksize, const int n
25682767
);
25692768

2769+
// NVFP4 kernel template instantiations
// Explicit instantiations for the three element types the host launchers in
// ops.cu are compiled against (half, bfloat16, float).
template __global__ void kQuantizeNVFP4<half>(
    const half* __restrict__ input, unsigned char* __restrict__ output,
    unsigned char* __restrict__ block_scales, const float tensor_scale, const int n
);
template __global__ void kQuantizeNVFP4<__nv_bfloat16>(
    const __nv_bfloat16* __restrict__ input, unsigned char* __restrict__ output,
    unsigned char* __restrict__ block_scales, const float tensor_scale, const int n
);
template __global__ void kQuantizeNVFP4<float>(
    const float* __restrict__ input, unsigned char* __restrict__ output,
    unsigned char* __restrict__ block_scales, const float tensor_scale, const int n
);
template __global__ void kDequantizeNVFP4<half>(
    const unsigned char* __restrict__ input, const unsigned char* __restrict__ block_scales,
    const float tensor_scale, half* __restrict__ output, const int n
);
template __global__ void kDequantizeNVFP4<__nv_bfloat16>(
    const unsigned char* __restrict__ input, const unsigned char* __restrict__ block_scales,
    const float tensor_scale, __nv_bfloat16* __restrict__ output, const int n
);
template __global__ void kDequantizeNVFP4<float>(
    const unsigned char* __restrict__ input, const unsigned char* __restrict__ block_scales,
    const float tensor_scale, float* __restrict__ output, const int n
);
2794+
25702795
#define MAKE_OptimizerStatic8bit2StateBlockwise(oname, gtype, block_size, num_per_thread) \
25712796
template __global__ void kOptimizerStatic8bit2StateBlockwise<gtype, oname, block_size, num_per_thread>( \
25722797
gtype * p, gtype* __restrict__ const g, unsigned char* state1, unsigned char* state2, const float beta1, \

csrc/kernels.cuh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,17 @@ template <typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH, int DATA_TYPE
2626
__global__ void
2727
kDequantizeBlockwise(float* code, unsigned char* A, float* absmax, T* out, const int blocksize, const int n);
2828

29+
// NVFP4 (E2M1) two-level block-scaled quantization (block size 16, per the
// commit description). input holds n elements; output receives ceil(n/2)
// packed FP4 bytes; block_scales receives ceil(n/16) unsigned E4M3 bytes.
// tensor_scale is the precomputed first-level FP32 scale.
template <typename T>
__global__ void kQuantizeNVFP4(
    const T* __restrict__ input, unsigned char* __restrict__ output,
    unsigned char* __restrict__ block_scales, const float tensor_scale, const int n
);
// Inverse of kQuantizeNVFP4: unpacks the FP4 codes and applies
// block_scale * tensor_scale to reconstruct n values of type T.
template <typename T>
__global__ void kDequantizeNVFP4(
    const unsigned char* __restrict__ input, const unsigned char* __restrict__ block_scales,
    const float tensor_scale, T* __restrict__ output, const int n
);
39+
2940
template <typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
3041
__global__ void kPreconditionOptimizer32bit2State(
3142
T* g, T* p, float* state1, float* state2, float* unorm, const float beta1, const float beta2, const float eps,

csrc/ops.cu

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,67 @@ void dequantizeBlockwise(
8181
CUDA_CHECK_RETURN(cudaPeekAtLastError());
8282
}
8383

84+
// ============================================================================
// NVFP4 quantize/dequantize host-side launchers
// ============================================================================

// Launch kQuantizeNVFP4: quantizes n elements of input into ceil(n/2) packed
// FP4 bytes plus ceil(n/16) E4M3 block scales.
// NOTE(review): launches on the default stream, unlike dequantizeNVFP4 which
// takes an explicit cudaStream_t — consider adding a stream parameter for API
// symmetry and async overlap.
template <typename T>
void quantizeNVFP4(
    const T* input, unsigned char* output, unsigned char* block_scales,
    float tensor_scale, const int n
) {
    // Guard: n == 0 would compute num_blocks == 0, and a zero-block launch
    // fails with cudaErrorInvalidConfiguration.
    if (n <= 0) return;

    // Each thread handles 2 elements, so we need ceil(n/2) threads
    const int threads_per_block = 256; // multiple of 8, as the kernel's shuffle groups require
    const int num_threads = (n + 1) / 2;
    const int num_blocks = (num_threads + threads_per_block - 1) / threads_per_block;

    kQuantizeNVFP4<T><<<num_blocks, threads_per_block>>>(
        input, output, block_scales, tensor_scale, n
    );
    CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
103+
104+
// Launch kDequantizeNVFP4 on the given stream: reconstructs n elements of
// type T from ceil(n/2) packed FP4 bytes and ceil(n/16) E4M3 block scales.
template <typename T>
void dequantizeNVFP4(
    const unsigned char* input, const unsigned char* block_scales,
    float tensor_scale, T* output, const int n, cudaStream_t stream
) {
    // Guard: n == 0 would compute num_blocks == 0, and a zero-block launch
    // fails with cudaErrorInvalidConfiguration.
    if (n <= 0) return;

    // Each thread handles 2 elements, so we need ceil(n/2) threads
    const int threads_per_block = 256;
    const int num_threads = (n + 1) / 2;
    const int num_blocks = (num_threads + threads_per_block - 1) / threads_per_block;

    kDequantizeNVFP4<T><<<num_blocks, threads_per_block, 0, stream>>>(
        input, block_scales, tensor_scale, output, n
    );
    CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
118+
119+
// NVFP4 template instantiations
// Explicit instantiations of the host launchers for half, bfloat16 and float,
// matching the kernel instantiations in kernels.cu.
template void quantizeNVFP4<half>(
    const half* input, unsigned char* output, unsigned char* block_scales,
    float tensor_scale, const int n
);
template void quantizeNVFP4<__nv_bfloat16>(
    const __nv_bfloat16* input, unsigned char* output, unsigned char* block_scales,
    float tensor_scale, const int n
);
template void quantizeNVFP4<float>(
    const float* input, unsigned char* output, unsigned char* block_scales,
    float tensor_scale, const int n
);
template void dequantizeNVFP4<half>(
    const unsigned char* input, const unsigned char* block_scales,
    float tensor_scale, half* output, const int n, cudaStream_t stream
);
template void dequantizeNVFP4<__nv_bfloat16>(
    const unsigned char* input, const unsigned char* block_scales,
    float tensor_scale, __nv_bfloat16* output, const int n, cudaStream_t stream
);
template void dequantizeNVFP4<float>(
    const unsigned char* input, const unsigned char* block_scales,
    float tensor_scale, float* output, const int n, cudaStream_t stream
);
144+
84145
template <typename T, int OPTIMIZER>
85146
void optimizer32bit(
86147
T* g, T* p, float* state1, float* state2, float* unorm, float max_unorm, float param_norm, const float beta1,

csrc/ops.cuh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,17 @@ void dequantizeBlockwise(
120120
float* code, unsigned char* A, float* absmax, T* out, int block_size, const int n, cudaStream_t stream
121121
);
122122

123+
// NVFP4 host launchers (defined in ops.cu).
// quantizeNVFP4: input (n elements of T) -> output (ceil(n/2) packed FP4
// bytes) + block_scales (ceil(n/16) unsigned E4M3 bytes); runs on the
// default stream.
template <typename T>
void quantizeNVFP4(
    const T* input, unsigned char* output, unsigned char* block_scales,
    float tensor_scale, const int n
);
// dequantizeNVFP4: inverse transform, launched on the supplied stream.
template <typename T>
void dequantizeNVFP4(
    const unsigned char* input, const unsigned char* block_scales,
    float tensor_scale, T* output, const int n, cudaStream_t stream
);
133+
123134
template <typename T, int OPTIMIZER>
124135
void optimizer32bit(
125136
T* g, T* p, float* state1, float* state2, float* unorm, float max_unorm, float param_norm, float beta1, float beta2,

0 commit comments

Comments
 (0)