bitsandbytes-foundation
diff --git a/‎.clang-format‎
Lines changed: 3 additions & 0 deletions b/‎.clang-format‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎csrc/cuda/blockwise_quantization.cu‎
Lines changed: 345 additions & 0 deletions b/‎csrc/cuda/blockwise_quantization.cu‎
Lines changed: 345 additions & 0 deletions
diff --git a/‎csrc/cuda/blockwise_quantization.cuh‎
Lines changed: 4 additions & 0 deletions b/‎csrc/cuda/blockwise_quantization.cuh‎
Lines changed: 4 additions & 0 deletions
@@ -21,6 +21,9 @@ StatementMacros:
   - 'MAKE_optimizerStatic8bit2State'
   - 'MAKE_OptimizerStatic8bit1StateBlockwise'
   - 'MAKE_OptimizerStatic8bit2StateBlockwise'
+  - 'MAKE_optimizerStatic8bit'
+  - 'MAKE_optimizerStatic8bitBlockwise'
+  - 'MAKE_optimizer32bit'
   - 'MAKE_kQuantizeBlockwise'
   - 'MAKE_BLOCKWISE8'
   - 'MAKE_ELEMENTWISE_FUNC'
 
@@ -24,7 +24,7 @@ endif()
 
 # Define included source files
 set(CPP_FILES csrc/cpu_ops.cpp csrc/pythonInterface.cpp)
-set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
+set(CUDA_FILES csrc/ops.cu csrc/kernels.cu csrc/cuda/blockwise_quantization.cu csrc/cuda/int8.cu csrc/cuda/optimizers.cu)
 set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
 set(MPS_FILES csrc/mps_ops.mm)
 set(METAL_FILES csrc/mps_kernels.metal)
@@ -312,7 +312,9 @@ if(BUILD_CUDA)
     set_target_properties(bitsandbytes
         PROPERTIES
             CUDA_SEPARABLE_COMPILATION ON
+            # The target property is INTERPROCEDURAL_OPTIMIZATION. CMAKE_INTERPROCEDURAL_OPTIMIZATION is the
+            # variable that only seeds that property at target creation, so using it as a property name here
+            # would be silently ignored.
+            # NOTE(review): enabling IPO for CUDA needs toolchain/CMake support - confirm on the CI matrix.
+            INTERPROCEDURAL_OPTIMIZATION TRUE
     )
+
 endif()
 if(BUILD_HIP)
     if(NOT DEFINED ENV{ROCM_PATH})
 
@@ -0,0 +1,345 @@
+#include "common.cuh"
+#include "ops.cuh" // For CUDA_CHECK_RETURN, some typedefs
+#include <cub/cub.cuh>
+#include <cuda_fp16.h>
+
+// Duplicated from kernels.cu.
+// TODO: move to a shared header such as common.cuh or a dedicated cub_utils.cuh.
+#if CCCL_VERSION >= 2008002
+#include <cuda/std/functional>
+#define CUB_REDUCTIONOP_MAX                                                                                            \
+    cuda::maximum<> {}
+#else
+#define CUB_REDUCTIONOP_MAX cub::Max()
+#endif
+
+// TODO: duplicated from kernels.cu; move to a shared header.
+#define NUM 4
+#define NUM_BLOCK 4096
+
+// Helper duplicated from kernels.cu. TODO: consider moving it to a shared header,
+// since the deprecated optimizers need it as well.
+// Maps `x` to an index into the 256-entry table `smem_code` using a fixed-depth binary search.
+//
+//   smem_code  table of 256 floats (indices 0..255 are read); assumed to be sorted ascending - TODO confirm
+//   rand       random number, only read when STOCHASTIC != 0
+//   x          value to quantize
+//
+// STOCHASTIC == 0: returns the final search position or its bracketing neighbour, decided by the midpoint
+//                  between the two table values.
+// STOCHASTIC != 0: returns the bracketing neighbour when `rand` is >= the distance from `x` to that neighbour
+//                  divided by the gap between the two table values, otherwise the final search position.
+template <int STOCHASTIC> __device__ unsigned char dQuantize(float* smem_code, const float rand, float x) {
+    int mid = 127;
+    int hi_idx = 255;
+    int lo_idx = 0;
+
+    float lo_val = -1.0f;
+    float hi_val = 1.0f;
+
+    float mid_val = smem_code[mid];
+    // Seven halving steps: 64, 32, 16, 8, 4, 2, 1.
+    for (int step = 64; step > 0; step >>= 1) {
+        if (x > mid_val) {
+            lo_idx = mid;
+            lo_val = mid_val;
+            mid += step;
+        } else {
+            hi_idx = mid;
+            hi_val = mid_val;
+            mid -= step;
+        }
+        mid_val = smem_code[mid];
+    }
+
+    // If a bound was never tightened during the search, take its value from the table ends.
+    if (hi_idx == 255)
+        hi_val = smem_code[hi_idx];
+    if (lo_idx == 0)
+        lo_val = smem_code[lo_idx];
+
+    const bool above = x > mid_val;
+
+    if (STOCHASTIC) {
+        if (above) {
+            const float remaining = fabsf(hi_val - x);
+            const float gap = hi_val - mid_val;
+            return (rand >= remaining / gap) ? hi_idx : mid;
+        }
+        const float remaining = fabsf(lo_val - x);
+        const float gap = mid_val - lo_val;
+        return (rand >= remaining / gap) ? lo_idx : mid;
+    }
+
+    if (above) {
+        const float halfway = (hi_val + mid_val) * 0.5f;
+        return (x > halfway) ? hi_idx : mid;
+    }
+    const float halfway = (lo_val + mid_val) * 0.5f;
+    return (x < halfway) ? lo_idx : mid;
+}
+
+// helper. maybe move elsewhere TODO
+// Quantizes `x` to a 4-bit code: bit 3 is the sign, bits 0..2 select the magnitude.
+__device__ unsigned char dQuantizeFP4(float x) {
+    // FP4 with a bias of 3; the first bit is the sign. Magnitude codes (including subnormals):
+    //   0b000 = 0
+    //   0b001 = 0.0625
+    //   0b110 = 2
+    //   0b111 = 3
+    //   0b100 = 4
+    //   0b101 = 6
+    //   0b010 = 8
+    //   0b011 = 12
+    //
+    // The lookup is a hard-coded binary search. The pivots are divided by 12 (the FP4 absmax)
+    // because the input is assumed to lie in [-1.0, 1.0].
+    //
+    // Be careful when editing the thresholds: an extra zero is easy to add and hard to notice.
+
+    const int sign = x < 0 ? 0b1000 : 0b0000;
+    x = fabsf(x);
+
+    if (x > 0.29166667f) {
+        if (x > 0.583333f) {
+            return ((x > 0.8333333f) ? 0b0011 : 0b0010) + sign;
+        }
+        return ((x > 0.4166667f) ? 0b101 : 0b100) + sign;
+    }
+
+    if (x > 0.0859375f) {
+        return ((x > 0.20833333f) ? 0b0111 : 0b0110) + sign;
+    }
+
+    return ((x > 0.00260417f) ? 0b0001 : 0b0000) + sign;
+}
+
+// helper. maybe move elsewhere TODO
+// Quantizes `x` to one of the 16 NF4 codes (0b0000 .. 0b1111) with a hard-coded decision tree.
+__device__ unsigned char dQuantizeNF4(float x) {
+    // The thresholds of this tree were generated by test_normal_map_tree
+    // in tests/test_functional.py.
+
+    if (x > 0.03979014977812767f) {
+        // Codes 0b1000 .. 0b1111.
+        if (x > 0.3893125355243683f) {
+            if (x > 0.6427869200706482f) {
+                return (x > 0.8614784181118011f) ? 0b1111 : 0b1110;
+            }
+            return (x > 0.5016634166240692f) ? 0b1101 : 0b1100;
+        }
+        if (x > 0.2035212516784668f) {
+            return (x > 0.2920137718319893f) ? 0b1011 : 0b1010;
+        }
+        return (x > 0.1202552504837513f) ? 0b1001 : 0b1000;
+    }
+
+    if (x > -0.33967943489551544f) {
+        // Codes 0b0100 .. 0b0111.
+        if (x > -0.13791173323988914f) {
+            return (x > -0.045525018125772476f) ? 0b0111 : 0b0110;
+        }
+        return (x > -0.23460740596055984f) ? 0b0101 : 0b0100;
+    }
+
+    if (x > -0.6106329262256622f) {
+        // Codes 0b0010 .. 0b0011.
+        return (x > -0.4599952697753906f) ? 0b0011 : 0b0010;
+    }
+
+    // Codes 0b0000 .. 0b0001.
+    return (x > -0.8480964004993439f) ? 0b0001 : 0b0000;
+}
+
+// Blockwise quantization kernel.
+//
+// For every chunk of BLOCK_SIZE input elements the kernel computes the chunk's maximum absolute value,
+// writes it to `absmax`, multiplies the inputs by its reciprocal and quantizes the result according to
+// DATA_TYPE (General8bit via the `code` table, or FP4 / NF4 packed two codes per output byte).
+//
+// Launch layout: the CUB collectives are declared for BLOCK_SIZE / NUM_PER_TH threads per block, with
+// NUM_PER_TH items held by each thread; the host wrapper in this file launches exactly that many threads.
+//
+//   code         256-entry table; copied to shared memory only when DATA_TYPE == General8bit
+//   A            input values, `n` elements
+//   absmax       output, one float per BLOCK_SIZE chunk (indexed by i / BLOCK_SIZE)
+//   out          quantized output; indexed by i for 8-bit and by i / 2 when DATA_TYPE > 0
+//   rand         random numbers, read only when STOCHASTIC != 0
+//   rand_offset  offset added to the index into `rand`
+//   n            number of input elements
+template <typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TYPE>
+//__launch_bounds__(TH, 4)
+__global__ void kQuantizeBlockwise(
+    float* code, T* __restrict__ const A, float* absmax, unsigned char* out, float* __restrict__ const rand,
+    const int rand_offset, const int n
+) {
+    // This can overflow, so we clamp to INT32_MAX. We won't have more elements than this.
+    const int n_full = min(gridDim.x * BLOCK_SIZE, INT32_MAX);
+
+    // First element handled by this CUDA block.
+    const int base_idx = blockIdx.x * BLOCK_SIZE;
+    int valid_items = 0;
+
+    // Per-thread registers: inputs, random numbers and quantized outputs.
+    // When DATA_TYPE > 0 two codes share one byte, so only half as many output bytes are needed.
+    T vals[NUM_PER_TH];
+    float rand_vals[NUM_PER_TH];
+    unsigned char qvals[(DATA_TYPE > 0) ? NUM_PER_TH / 2 : NUM_PER_TH];
+
+    float local_abs_max = 0.0f;
+    int local_rand_idx = 0;
+
+    typedef cub::BlockLoad<T, BLOCK_SIZE / NUM_PER_TH, NUM_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef cub::BlockStore<
+        unsigned char, BLOCK_SIZE / NUM_PER_TH, (DATA_TYPE > 0) ? NUM_PER_TH / 2 : NUM_PER_TH,
+        cub::BLOCK_STORE_WARP_TRANSPOSE>
+        StoreChar;
+    typedef cub::BlockReduce<float, BLOCK_SIZE / NUM_PER_TH> BlockReduce;
+    typedef cub::BlockLoad<float, BLOCK_SIZE / NUM_PER_TH, NUM_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+
+    // Each collective gets its own shared-memory scratch space.
+    __shared__ typename LoadT::TempStorage loadt;
+    __shared__ typename LoadFloat::TempStorage loadf;
+    __shared__ typename StoreChar::TempStorage storec;
+    __shared__ typename BlockReduce::TempStorage reduce;
+    __shared__ float smem_code[256];
+    __shared__ float smem_absmax_value[1];
+
+    // Stage the 8-bit code table in shared memory; the __syncthreads() at the top of the loop below
+    // is the barrier that makes it visible before dQuantize reads it.
+    if (DATA_TYPE == General8bit)
+        for (int i = threadIdx.x; i < 256; i += blockDim.x)
+            smem_code[i] = code[i];
+
+    // The stride equals the clamped bound (gridDim.x * BLOCK_SIZE), so with the launch configuration
+    // used by the host wrapper each block is expected to run this body once.
+    for (int64_t i = base_idx; i < n_full; i += gridDim.x * BLOCK_SIZE) {
+        // The last chunk may be partial.
+        valid_items = min(BLOCK_SIZE, static_cast<int>(n - i));
+        local_abs_max = -FLT_MAX;
+
+        __syncthreads();
+        // Out-of-range slots are filled with 0 so they cannot affect the absolute maximum.
+        LoadT(loadt).Load(&(A[i]), vals, valid_items, (T)0.0f);
+
+        // 1. compute local max
+        // 2. broadcast local max
+        // 3. normalize inputs and quantize
+
+#pragma unroll NUM_PER_TH
+        for (int j = 0; j < NUM_PER_TH; j++)
+            local_abs_max = fmaxf(local_abs_max, fabsf((float)vals[j]));
+
+        // NOTE(review): the third argument is documented by CUB as the number of valid *threads*, while
+        // valid_items counts *items* - confirm this is intended (padding with 0 keeps the max unaffected).
+        local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, CUB_REDUCTIONOP_MAX, valid_items);
+
+        // Thread 0 publishes the reciprocal for the whole block and stores the chunk's absmax.
+        if (threadIdx.x == 0) {
+            smem_absmax_value[0] = 1.0f / local_abs_max;
+            absmax[i / BLOCK_SIZE] = local_abs_max;
+        }
+        __syncthreads();
+
+        // From here on local_abs_max holds 1 / absmax, i.e. the normalization factor.
+        local_abs_max = smem_absmax_value[0];
+
+        if (STOCHASTIC) {
+            // The index is wrapped with % (1024 - 4).
+            // NOTE(review): every thread passes its own base pointer to a block-wide load - confirm intended.
+            local_rand_idx = ((blockIdx.x * NUM_BLOCK) + (threadIdx.x * NUM) + rand_offset) % (1024 - 4);
+            LoadFloat(loadf).Load(&rand[local_rand_idx], rand_vals, BLOCK_SIZE, 0);
+        }
+
+        switch (DATA_TYPE) {
+        case General8bit:
+            // One output byte per input element, looked up in the shared-memory code table.
+#pragma unroll NUM_PER_TH
+            for (int j = 0; j < NUM_PER_TH; j++) {
+                if (!STOCHASTIC)
+                    qvals[j] = dQuantize<0>(smem_code, 0.0f, ((float)vals[j]) * local_abs_max);
+                else
+                    qvals[j] = dQuantize<1>(smem_code, rand_vals[j], ((float)vals[j]) * local_abs_max);
+            }
+            break;
+        case FP4:
+            // Two 4-bit codes per byte: element 2j in the high nibble, element 2j + 1 in the low nibble.
+#pragma unroll NUM_PER_TH
+            for (int j = 0; j < NUM_PER_TH / 2; j++) {
+                qvals[j] = dQuantizeFP4(((float)vals[2 * j]) * local_abs_max) << 4;
+                qvals[j] |= dQuantizeFP4(((float)vals[2 * j + 1]) * local_abs_max);
+            }
+            break;
+        case NF4:
+            // Same nibble packing as FP4, using the NF4 decision tree.
+#pragma unroll NUM_PER_TH
+            for (int j = 0; j < NUM_PER_TH / 2; j++) {
+                qvals[j] = dQuantizeNF4(((float)vals[2 * j]) * local_abs_max) << 4;
+                qvals[j] |= dQuantizeNF4(((float)vals[2 * j + 1]) * local_abs_max);
+            }
+            break;
+        }
+
+        __syncthreads();
+        // For the packed formats the byte count is rounded up so an odd trailing element is still written.
+        StoreChar(storec).Store(
+            &(out[(DATA_TYPE > 0) ? i / 2 : i]), qvals, (DATA_TYPE > 0) ? (valid_items + 1) / 2 : valid_items
+        );
+    }
+}
+
+//// host code
+
+// Host launcher for kQuantizeBlockwise.
+//
+// Splits the `n` input elements into ceil(n / blocksize) chunks and launches one CUDA block per chunk on
+// the default stream, then checks for a launch error.
+//
+//   code         256-entry code table, forwarded to the kernel
+//   A            input values, `n` elements
+//   absmax       output, one float per chunk
+//   out          quantized output
+//   rand         random numbers, forwarded to the kernel
+//   rand_offset  offset into `rand`, forwarded to the kernel
+//   blocksize    elements per chunk; supported values are 4096, 2048, 1024, 512, 256, 128 and 64.
+//                Any other value launches nothing.
+//   n            number of input elements
+//
+// Note: the STOCHASTIC template argument is only forwarded for blocksize == 4096; every other blocksize
+// instantiates the kernel with STOCHASTIC = 0.
+template <typename T, int STOCHASTIC, int DATA_TYPE>
+void quantizeBlockwise(
+    float* code, T* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
+) {
+    // Nothing to quantize. Returning early avoids launching a grid with zero blocks, which is an invalid
+    // launch configuration, and avoids dividing by a zero blocksize below.
+    if (n <= 0 || blocksize <= 0)
+        return;
+
+    // Ceiling division, written this way so that it cannot overflow for n close to INT_MAX.
+    int num_blocks = n / blocksize;
+    num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
+
+    // Threads per block = blocksize / items-per-thread (4 for the large chunks, 2 for the small ones).
+    if (blocksize == 4096)
+        kQuantizeBlockwise<T, 4096, 4, STOCHASTIC, DATA_TYPE>
+            <<<num_blocks, 1024>>>(code, A, absmax, out, rand, rand_offset, n);
+    else if (blocksize == 2048)
+        kQuantizeBlockwise<T, 2048, 4, 0, DATA_TYPE><<<num_blocks, 512>>>(code, A, absmax, out, rand, rand_offset, n);
+    else if (blocksize == 1024)
+        kQuantizeBlockwise<T, 1024, 4, 0, DATA_TYPE><<<num_blocks, 256>>>(code, A, absmax, out, rand, rand_offset, n);
+    else if (blocksize == 512)
+        kQuantizeBlockwise<T, 512, 2, 0, DATA_TYPE><<<num_blocks, 256>>>(code, A, absmax, out, rand, rand_offset, n);
+    else if (blocksize == 256)
+        kQuantizeBlockwise<T, 256, 2, 0, DATA_TYPE><<<num_blocks, 128>>>(code, A, absmax, out, rand, rand_offset, n);
+    else if (blocksize == 128)
+        kQuantizeBlockwise<T, 128, 2, 0, DATA_TYPE><<<num_blocks, 64>>>(code, A, absmax, out, rand, rand_offset, n);
+    else if (blocksize == 64)
+        kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE><<<num_blocks, 32>>>(code, A, absmax, out, rand, rand_offset, n);
+
+    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+}
+
+// Explicit instantiations of the quantizeBlockwise launcher that host code links against.
+// The template is defined in this file, so only the (T, STOCHASTIC, DATA_TYPE) combinations listed
+// here are available to other translation units. STOCHASTIC = 1 is instantiated for General8bit only.
+// TODO: consider exposing a C API here instead.
+
+// T = half
+template void quantizeBlockwise<half, 1, General8bit>(
+    float* code, half* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
+);
+template void quantizeBlockwise<half, 0, General8bit>(
+    float* code, half* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
+);
+template void quantizeBlockwise<half, 0, FP4>(
+    float* code, half* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
+);
+template void quantizeBlockwise<half, 0, NF4>(
+    float* code, half* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
+);
+// T = float
+template void quantizeBlockwise<float, 1, General8bit>(
+    float* code, float* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
+);
+template void quantizeBlockwise<float, 0, General8bit>(
+    float* code, float* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
+);
+template void quantizeBlockwise<float, 0, FP4>(
+    float* code, float* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
+);
+template void quantizeBlockwise<float, 0, NF4>(
+    float* code, float* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
+);
+// T = __nv_bfloat16
+template void quantizeBlockwise<__nv_bfloat16, 1, General8bit>(
+    float* code, __nv_bfloat16* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize,
+    const int n
+);
+template void quantizeBlockwise<__nv_bfloat16, 0, General8bit>(
+    float* code, __nv_bfloat16* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize,
+    const int n
+);
+template void quantizeBlockwise<__nv_bfloat16, 0, FP4>(
+    float* code, __nv_bfloat16* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize,
+    const int n
+);
+template void quantizeBlockwise<__nv_bfloat16, 0, NF4>(
+    float* code, __nv_bfloat16* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize,
+    const int n
+);
@@ -0,0 +1,4 @@
+// Declaration of the blockwise quantization launcher; the definition and its explicit instantiations
+// live in blockwise_quantization.cu.
+//
+//   T          input element type
+//   STOCHASTIC non-zero requests stochastic rounding
+//   DATA_TYPE  output format selector (General8bit, FP4 or NF4 in the definition)
+//
+// Quantizes the `n` elements of `A` in chunks of `blocksize`, writing the codes to `out` and one
+// absolute maximum per chunk to `absmax`. `code`, `rand` and `rand_offset` are forwarded to the kernel.
+template <typename T, int STOCHASTIC, int DATA_TYPE>
+void quantizeBlockwise(
+    float* code, T* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
+);