From e213850e3b86f7158ca5e8830204030d52f2cb69 Mon Sep 17 00:00:00 2001
From: SvenHepkema <47423359+SvenHepkema@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:35:09 +0200
Subject: [PATCH 1/3] Added Work in Progress implementation of ALP on GPU

---
 gpu-kernels/alp-test-kernels-global.cuh | 125 ++++++++
 gpu-kernels/alp.cuh                     | 405 ++++++++++++++++++++++++
 gpu-kernels/fls.cuh                     | 128 ++++++++
 3 files changed, 658 insertions(+)
 create mode 100644 gpu-kernels/alp-test-kernels-global.cuh
 create mode 100644 gpu-kernels/alp.cuh
 create mode 100644 gpu-kernels/fls.cuh

diff --git a/gpu-kernels/alp-test-kernels-global.cuh b/gpu-kernels/alp-test-kernels-global.cuh
new file mode 100644
index 0000000..7604384
--- /dev/null
+++ b/gpu-kernels/alp-test-kernels-global.cuh
@@ -0,0 +1,125 @@
+#include "alp.cuh"
+
+#include "../common/utils.hpp"
+
+#ifndef ALP_TEST_KERNELS_GLOBAL_CUH
+#define ALP_TEST_KERNELS_GLOBAL_CUH
+
+namespace alp {
+namespace kernels {
+namespace global {
+namespace test {
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES>
+__global__ void decode_complete_alp_vector(T *out, AlpColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<T>();
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int16_t vector_index = threadIdx.x / N_LANES;
+  const int32_t block_index = blockIdx.x;
+
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+
+  out += (block_index * n_vectors_per_block + vector_index) *
+         consts::VALUES_PER_VECTOR;
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    unalp<UINT_T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+          UnpackingType::VectorArray>(out, data, block_index, lane, i);
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
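+
+// Illustrative launch sketch (assumes `d_out`, `col` and `n_vecs` are
+// placeholders for device-resident buffers and that the ALP constants have
+// already been loaded; see constant_memory::load_alp_constants in alp.cuh):
+//
+//   constexpr unsigned N_VECS = 1, N_VALS = 1;
+//   const uint32_t n_lanes = utils::get_n_lanes<float>();
+//   decode_complete_alp_vector<float, N_VECS, N_VALS>
+//       <<<n_vecs / N_VECS, n_lanes * N_VECS>>>(d_out, col);
+//   cudaDeviceSynchronize();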
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES>
+__global__ void decode_alp_vector_into_lane(T *out, AlpColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<T>();
+  constexpr uint32_t N_VALUES = UNPACK_N_VALUES * UNPACK_N_VECTORS;
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int32_t block_index = blockIdx.x;
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+  const int16_t vector_index =
+      block_index * n_vectors_per_block + (threadIdx.x / N_LANES);
+
+  T registers[N_VALUES];
+  out += vector_index * consts::VALUES_PER_VECTOR;
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    unalp<UINT_T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+          UnpackingType::LaneArray>(registers, data, block_index, lane, i);
+
+    for (int j = 0; j < UNPACK_N_VALUES; j++) {
+      out[lane + j * N_LANES] = registers[j];
+    }
+
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES>
+__global__ void decode_alp_vector_with_state(T *out, AlpColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<T>();
+  constexpr uint32_t N_VALUES = UNPACK_N_VALUES * UNPACK_N_VECTORS;
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int32_t block_index = blockIdx.x;
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+  const int16_t vector_index =
+      block_index * n_vectors_per_block + (threadIdx.x / N_LANES);
+
+  T registers[N_VALUES];
+  out += vector_index * consts::VALUES_PER_VECTOR;
+
+  auto iterator =
+      Unpacker<UINT_T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+               UnpackingType::LaneArray>(vector_index, lane, data);
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    iterator.unpack_next_into(registers);
+
+    for (int j = 0; j < UNPACK_N_VALUES; j++) {
+      out[lane + j * N_LANES] = registers[j];
+    }
+
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES>
+__global__ void decode_complete_alprd_vector(T *out, AlpRdColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<T>();
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int16_t vector_index = threadIdx.x / N_LANES;
+  const int32_t block_index = blockIdx.x;
+
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+
+  out += (block_index * n_vectors_per_block + vector_index) *
+         consts::VALUES_PER_VECTOR;
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    unalprd<UINT_T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+            UnpackingType::VectorArray>(out, data, block_index, lane, i);
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
+
+} // namespace test
+} // namespace global
+} // namespace kernels
+} // namespace alp
+
+#endif // ALP_TEST_KERNELS_GLOBAL_CUH
diff --git a/gpu-kernels/alp.cuh b/gpu-kernels/alp.cuh
new file mode 100644
index 0000000..f6e370d
--- /dev/null
+++ b/gpu-kernels/alp.cuh
@@ -0,0 +1,405 @@
+#include <cstdint>
+#include <cuda_runtime.h>
+#include <type_traits>
+
+#include "../alp/constants.hpp"
+#include "../common/utils.hpp"
+#include "../gpu-fls/fls.cuh"
+#include "src/alp/config.hpp"
+
+#ifndef ALP_CUH
+#define ALP_CUH
+
+template <typename T> struct AlpColumn {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+
+  UINT_T *ffor_array;
+  UINT_T *ffor_bases;
+  uint8_t *bit_widths;
+  uint8_t *exponents;
+  uint8_t *factors;
+
+  T *exceptions;
+  uint16_t *positions;
+  uint16_t *counts;
+};
+
+template <typename T> struct AlpRdColumn {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+
+  uint16_t *left_ffor_array;
+  uint16_t *left_ffor_bases;
+  uint8_t *left_bit_widths;
+
+  UINT_T *right_ffor_array;
+  UINT_T *right_ffor_bases;
+  uint8_t *right_bit_widths;
+
+  uint16_t *left_parts_dicts;
+
+  uint16_t *exceptions;
+  uint16_t *positions;
+  uint16_t *counts;
+};
+
+namespace constant_memory {
+constexpr int32_t F_FACT_ARR_COUNT = 10;
+constexpr int32_t F_FRAC_ARR_COUNT = 11;
+__constant__ int32_t F_FACT_ARRAY[F_FACT_ARR_COUNT];
+__constant__ float F_FRAC_ARRAY[F_FRAC_ARR_COUNT];
+
+constexpr int32_t D_FACT_ARR_COUNT = 19;
+constexpr int32_t D_FRAC_ARR_COUNT = 21;
+__constant__ int64_t D_FACT_ARRAY[D_FACT_ARR_COUNT];
+__constant__ double D_FRAC_ARRAY[D_FRAC_ARR_COUNT];
+
+template <typename T> __host__ void load_alp_constants() {
+  cudaMemcpyToSymbol(F_FACT_ARRAY, alp::Constants<float>::FACT_ARR,
+                     F_FACT_ARR_COUNT * sizeof(int32_t));
+  cudaMemcpyToSymbol(F_FRAC_ARRAY, alp::Constants<float>::FRAC_ARR,
+                     F_FRAC_ARR_COUNT * sizeof(float));
+
+  cudaMemcpyToSymbol(D_FACT_ARRAY, alp::Constants<double>::FACT_ARR,
+                     D_FACT_ARR_COUNT * sizeof(int64_t));
+  cudaMemcpyToSymbol(D_FRAC_ARRAY, alp::Constants<double>::FRAC_ARR,
+                     D_FRAC_ARR_COUNT * sizeof(double));
+}
+
+template <typename T> __device__ __forceinline__ T *get_frac_arr();
+template <> __device__ __forceinline__ float *get_frac_arr<float>() {
+  return F_FRAC_ARRAY;
+}
+template <> __device__ __forceinline__ double *get_frac_arr<double>() {
+  return D_FRAC_ARRAY;
+}
+
+template <typename T> __device__ __forceinline__ T *get_fact_arr();
+template <> __device__ __forceinline__ int32_t *get_fact_arr<int32_t>() {
+  return F_FACT_ARRAY;
+}
+template <> __device__ __forceinline__ int64_t *get_fact_arr<int64_t>() {
+  return D_FACT_ARRAY;
+}
+} // namespace constant_memory
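+
+// Illustrative host-side setup: the decoders below read the factor and
+// fraction tables from constant memory, so they must be copied to the device
+// once before any decode kernel is launched, e.g.:
+//
+//   constant_memory::load_alp_constants<double>();
+//   // ... launch decode kernels ...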
+
+// WARNING
+// WARNING
+// TODO WARNING IS IT NOT FASTER TO PASS THESE ARGUMENTS IN FULL WIDTH?
+
+// SO uint8_t -> uint32_t (if it gets multiplied with 32) This saves a cast
+// in each kernel, and we do not care how big parameters are, as they are
+// passed via const
+// INFO Hypothesis: not stalling on arithmetic, so it does not matter in
+// execution time. Check # executed instructions tho.
+// WARNING
+// WARNING
+template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
+__device__ void unalp(T_out *__restrict out, const AlpColumn<T_out> column,
+                      const uint16_t vector_index, const uint16_t lane,
+                      const uint16_t start_index) {
+  static_assert((std::is_same<T_in, uint32_t>::value &&
+                 std::is_same<T_out, float>::value) ||
+                    (std::is_same<T_in, uint64_t>::value &&
+                     std::is_same<T_out, double>::value),
+                "Wrong type arguments");
+  using INT_T = typename utils::same_width_int<T_out>::type;
+  using UINT_T = typename utils::same_width_uint<T_out>::type;
+
+  T_in *in = column.ffor_array + consts::VALUES_PER_VECTOR * vector_index;
+  uint16_t value_bit_width = column.bit_widths[vector_index];
+  UINT_T base = column.ffor_bases[vector_index];
+  INT_T factor =
+      constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
+  T_out frac10 = constant_memory::get_frac_arr<
+      T_out>()[column.exponents[vector_index]]; // WARNING TODO implement a
+                                                // compile time switch to grab
+                                                // float array
+  auto lambda = [base, factor, frac10](const T_in value) -> T_out {
+    return static_cast<T_out>(static_cast<INT_T>(
+               (value + base) * static_cast<T_in>(factor))) *
+           frac10;
+  };
+
+  unpack_vector<T_in, T_out, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+                unpacking_type>(in, out, lane, value_bit_width, start_index,
+                                lambda);
+
+  // Patch exceptions
+  constexpr auto N_LANES = utils::get_n_lanes<T_in>();
+  auto exceptions_count = column.counts[vector_index];
+
+  auto vec_exceptions =
+      column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
+  auto vec_exceptions_positions =
+      column.positions + consts::VALUES_PER_VECTOR * vector_index;
+
+  const int first_pos = start_index * N_LANES + lane;
+  const int last_pos = first_pos + N_LANES * (UNPACK_N_VALUES - 1);
+  if (unpacking_type == UnpackingType::VectorArray) {
+    for (int i{lane}; i < exceptions_count; i += N_LANES) {
+      // WARNING Currently assumes that you are decoding an entire vector
+      // TODO Implement an if (position > startindex && position < (start_index
+      // + UNPACK_N_VALUES * n_lanes) {...}
+      auto position = vec_exceptions_positions[i];
+      out[position] = vec_exceptions[i];
+    }
+  } else if (unpacking_type == UnpackingType::LaneArray) {
+    for (int i{0}; i < exceptions_count; i++) {
+      auto position = vec_exceptions_positions[i];
+      auto exception = vec_exceptions[i];
+      if (position >= first_pos) {
+        if (position <= last_pos && position % N_LANES == lane) {
+          out[(position - first_pos) / N_LANES] = exception;
+        }
+        if (position + 1 > last_pos) {
+          return;
+        }
+      }
+    }
+  }
+}
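+
+// Worked example of the decode step above (illustrative numbers): with
+// exponent e = 2 and factor f = 1, frac10 = 0.01 and factor = 10, so a
+// bit-unpacked value v with FFOR base b decodes to
+//
+//   double(int64_t((v + b) * 10)) * 0.01
+//
+// i.e. the stored integer digits are scaled back to 10^-2 precision.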
+
+template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
+__device__ void
+unalp_with_scanner(T_out *__restrict out, const AlpColumn<T_out> column,
+                   const uint16_t vector_index, const uint16_t lane,
+                   const uint16_t start_index) {
+  static_assert((std::is_same<T_in, uint32_t>::value &&
+                 std::is_same<T_out, float>::value) ||
+                    (std::is_same<T_in, uint64_t>::value &&
+                     std::is_same<T_out, double>::value),
+                "Wrong type arguments");
+  using INT_T = typename utils::same_width_int<T_out>::type;
+  using UINT_T = typename utils::same_width_uint<T_out>::type;
+
+  T_in *in = column.ffor_array + consts::VALUES_PER_VECTOR * vector_index;
+  uint16_t value_bit_width = column.bit_widths[vector_index];
+  UINT_T base = column.ffor_bases[vector_index];
+  INT_T factor =
+      constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
+  T_out frac10 = constant_memory::get_frac_arr<
+      T_out>()[column.exponents[vector_index]]; // WARNING TODO implement a
+                                                // compile time switch to grab
+                                                // float array
+  auto lambda = [base, factor, frac10](const T_in value) -> T_out {
+    return static_cast<T_out>(static_cast<INT_T>(
+               (value + base) * static_cast<T_in>(factor))) *
+           frac10;
+  };
+
+  unpack_vector<T_in, T_out, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+                unpacking_type>(in, out, lane, value_bit_width, start_index,
+                                lambda);
+
+  // Patch exceptions
+  constexpr auto N_LANES = utils::get_n_lanes<T_in>();
+  auto exceptions_count = column.counts[vector_index];
+
+  auto vec_exceptions =
+      column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
+  auto vec_exceptions_positions =
+      column.positions + consts::VALUES_PER_VECTOR * vector_index;
+
+  const int first_pos = start_index * N_LANES + lane;
+  const int last_pos = first_pos + N_LANES * (UNPACK_N_VALUES - 1);
+  if (unpacking_type == UnpackingType::VectorArray) {
+    for (int i{lane}; i < exceptions_count; i += N_LANES) {
+      // WARNING Currently assumes that you are decoding an entire vector
+      // TODO Implement an if (position > startindex && position < (start_index
+      // + UNPACK_N_VALUES * n_lanes) {...}
+      auto position = vec_exceptions_positions[i];
+      out[position] = vec_exceptions[i];
+    }
+  } else if (unpacking_type == UnpackingType::LaneArray) {
+    constexpr int32_t SCANNER_SIZE = 1;
+    uint16_t scanner[SCANNER_SIZE];
+
+    for (int i{0}; i < exceptions_count; i += SCANNER_SIZE) {
+
+      for (int j{0}; j < SCANNER_SIZE && j + i < exceptions_count; ++j) {
+        scanner[j] = vec_exceptions_positions[j + i];
+      }
+
+      for (int j{0}; j < SCANNER_SIZE && j + i < exceptions_count; ++j) {
+        auto position = scanner[j];
+        if (position >= first_pos) {
+          if (position <= last_pos && position % N_LANES == lane) {
+            out[(position - first_pos) / N_LANES] = vec_exceptions[j + i];
+          }
+          if (position + 1 > last_pos) {
+            return;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
+
+struct Unpacker {
+  const int16_t vector_index;
+  const uint16_t lane;
+  const AlpColumn<T_out> column;
+  int32_t start_index = 0;
+  int32_t exception_index = 0;
+
+  __device__ Unpacker(const uint16_t vector_index, const uint16_t lane,
+                      const AlpColumn<T_out> column)
+      : vector_index(vector_index), lane(lane), column(column) {}
+
+  __device__ void unpack_next_into(T_out *__restrict out) {
+    static_assert((std::is_same<T_in, uint32_t>::value &&
+                   std::is_same<T_out, float>::value) ||
+                      (std::is_same<T_in, uint64_t>::value &&
+                       std::is_same<T_out, double>::value),
+                  "Wrong type arguments");
+    using INT_T = typename utils::same_width_int<T_out>::type;
+    using UINT_T = typename utils::same_width_uint<T_out>::type;
+
+    T_in *in = column.ffor_array + consts::VALUES_PER_VECTOR * vector_index;
+    uint16_t value_bit_width = column.bit_widths[vector_index];
+    UINT_T base = column.ffor_bases[vector_index];
+    INT_T factor =
+        constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
+    T_out frac10 = constant_memory::get_frac_arr<
+        T_out>()[column.exponents[vector_index]];
+    auto lambda = [base, factor, frac10](const T_in value) -> T_out {
+      return static_cast<T_out>(static_cast<INT_T>(
+                 (value + base) * static_cast<T_in>(factor))) *
+             frac10;
+    };
+
+    unpack_vector<T_in, T_out, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+                  unpacking_type>(in, out, lane, value_bit_width, start_index,
+                                  lambda);
+
+    // Patch exceptions
+    constexpr auto N_LANES = utils::get_n_lanes<T_in>();
+    auto exceptions_count = column.counts[vector_index];
+
+    auto vec_exceptions =
+        column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
+    auto vec_exceptions_positions =
+        column.positions + consts::VALUES_PER_VECTOR * vector_index;
+
+    const int first_pos = start_index * N_LANES + lane;
+    const int last_pos = first_pos + N_LANES * (UNPACK_N_VALUES - 1);
+    start_index += UNPACK_N_VALUES;
+    if (unpacking_type == UnpackingType::VectorArray) {
+      for (int i{lane}; i < exceptions_count; i += N_LANES) {
+        auto position = vec_exceptions_positions[i];
+        out[position] = vec_exceptions[i];
+      }
+    } else if (unpacking_type == UnpackingType::LaneArray) {
+      for (; exception_index < exceptions_count; exception_index++) {
+        auto position = vec_exceptions_positions[exception_index];
+        auto exception = vec_exceptions[exception_index];
+        if (position >= first_pos) {
+          if (position <= last_pos && position % N_LANES == lane) {
+            out[(position - first_pos) / N_LANES] = exception;
+          }
+          if (position + 1 > last_pos) {
+            return;
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
+__device__ void unalprd(T_out *__restrict out, const AlpRdColumn<T_out> column,
+                        const uint16_t vector_index, const uint16_t lane,
+                        const uint16_t start_index) {
+  static_assert((std::is_same<T_in, uint32_t>::value &&
+                 std::is_same<T_out, float>::value) ||
+                    (std::is_same<T_in, uint64_t>::value &&
+                     std::is_same<T_out, double>::value),
+                "Wrong type arguments");
+  using INT_T = typename utils::same_width_int<T_out>::type;
+  using UINT_T = typename utils::same_width_uint<T_out>::type;
+
+  constexpr int32_t N_LANES = utils::get_n_lanes<T_out>();
+  constexpr int32_t VALUES_PER_LANE = utils::get_values_per_lane<T_out>();
+
+  UINT_T *out_ut = reinterpret_cast<UINT_T *>(out);
+
+  // Loading left parts dict
+  // TODO Let the threads collaborate to load this data
+  const uint16_t *left_parts_dicts_p =
+      column.left_parts_dicts +
+      vector_index * alp::config::MAX_RD_DICTIONARY_SIZE;
+  uint16_t left_parts_dict[alp::config::MAX_RD_DICTIONARY_SIZE];
+  for (size_t j{0}; j < alp::config::MAX_RD_DICTIONARY_SIZE; j++) {
+    left_parts_dict[j] = left_parts_dicts_p[j];
+  }
+
+  // Unfforring Arrays
+  // INFO The paper says something about fusing, consider using a custom lambda
+  // TODO Do thread local array instead of shared
+  __shared__ uint16_t left_array[consts::VALUES_PER_VECTOR];
+  const uint16_t *left_ffor_array =
+      column.left_ffor_array + vector_index * consts::VALUES_PER_VECTOR;
+  const uint16_t left_bitwidth = column.left_bit_widths[vector_index];
+  const uint16_t left_base = column.left_ffor_bases[vector_index];
+
+  // WARNING This mapping from alprd lane to uint16_t lane should be independent
+  // of block sizes
+  for (int i{lane}; i < utils::get_n_lanes<uint16_t>(); i += N_LANES) {
+    undict_vector<uint16_t, uint16_t, 1,
+                  utils::get_values_per_lane<uint16_t>(),
+                  UnpackingType::VectorArray>(
+        left_ffor_array, left_array, i, left_bitwidth, 0, &left_base,
+        left_parts_dict);
+  }
+
+  // TODO Do thread local array instead of shared
+  __shared__ UINT_T right_array[consts::VALUES_PER_VECTOR];
+  const UINT_T *right_ffor_array =
+      column.right_ffor_array + vector_index * consts::VALUES_PER_VECTOR;
+  const uint16_t right_bitwidth = column.right_bit_widths[vector_index];
+  const UINT_T right_base = column.right_ffor_bases[vector_index];
+  unffor_vector<UINT_T, 1, VALUES_PER_LANE, UnpackingType::VectorArray>(
+      right_ffor_array, right_array, lane, right_bitwidth, 0, &right_base);
+
+  // Decoding
+#pragma unroll
+  for (int i{lane}; i < consts::VALUES_PER_VECTOR; i += N_LANES) {
+    // WARNING THIS SHOULD NOT WRITE TO GLOBAL
+    // TODO write to thread local, and then patch all thread local values
+    // INFO THIS IS ALSO an issue in the normal ALP kernel
+    out_ut[i] =
+        (static_cast<UINT_T>(left_array[i]) << right_bitwidth) | right_array[i];
+  }
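+
+  // Bit-layout sketch (illustrative): ALP-RD splits each 64-bit pattern into
+  //
+  //   | left part (dict-encoded, high bits) | right part (right_bitwidth bits) |
+  //
+  // so a value is reassembled as left << right_bitwidth | right; e.g. with
+  // right_bitwidth = 48, the left part holds the top 16 bits.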
+
+  // Patching exceptions
+  const uint16_t exceptions_count = column.counts[vector_index];
+  const uint16_t *vec_exceptions =
+      column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
+  const uint16_t *vec_exceptions_positions =
+      column.positions + consts::VALUES_PER_VECTOR * vector_index;
+
+  if (unpacking_type == UnpackingType::VectorArray) {
+    for (int i{lane}; i < exceptions_count; i += N_LANES) {
+      const auto position = vec_exceptions_positions[i];
+      const UINT_T right = right_array[position];
+      const uint16_t left = vec_exceptions[i];
+      out_ut[position] = (static_cast<UINT_T>(left) << right_bitwidth) | right;
+    }
+  } else if (unpacking_type == UnpackingType::LaneArray) {
+    for (int i{0}; i < exceptions_count; ++i) {
+      const auto position = vec_exceptions_positions[i];
+
+      if (position % N_LANES == lane) {
+        const UINT_T right = right_array[position];
+        const uint16_t left = vec_exceptions[i];
+        out_ut[position / N_LANES] =
+            (static_cast<UINT_T>(left) << right_bitwidth) | right;
+      }
+    }
+  }
+}
+
+#endif // ALP_CUH
diff --git a/gpu-kernels/fls.cuh b/gpu-kernels/fls.cuh
new file mode 100644
index 0000000..17893d8
--- /dev/null
+++ b/gpu-kernels/fls.cuh
@@ -0,0 +1,128 @@
+#include <cstdint>
+#include <cstdio>
+#include <cuda_runtime.h>
+#include <type_traits>
+
+#include "../common/utils.hpp"
+
+#ifndef FLS_CUH
+#define FLS_CUH
+
+enum UnpackingType { LaneArray, VectorArray };
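+
+// Illustrative contrast of the two output layouts: with N_LANES = 32 and
+// UNPACK_N_VALUES = 2, the thread on lane 5 writes
+//
+//   VectorArray: out[5], out[37]  (value i lands at index lane + i * N_LANES)
+//   LaneArray:   out[0], out[1]   (a thread-private array of the lane's values)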
+
+template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type,
+          typename lambda_T>
+__device__ void unpack_vector(const T_in *__restrict in, T_out *__restrict out,
+                              const uint16_t lane,
+                              const uint16_t value_bit_width,
+                              const uint16_t start_index, lambda_T lambda) {
+  static_assert(std::is_unsigned<T_in>::value,
+                "Packing function only supports unsigned types. Cast signed "
+                "arrays to unsigned equivalent.");
+  constexpr uint8_t LANE_BIT_WIDTH = utils::get_lane_bitwidth<T_in>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<T_in>();
+  uint16_t preceding_bits = (start_index * value_bit_width);
+  uint16_t buffer_offset = preceding_bits % LANE_BIT_WIDTH;
+  uint16_t n_input_line = preceding_bits / LANE_BIT_WIDTH;
+  T_in value_mask = utils::set_first_n_bits<T_in>(value_bit_width);
+
+  T_in line_buffer[UNPACK_N_VECTORS];
+  T_in buffer_offset_mask;
+
+  int32_t encoded_vector_offset =
+      utils::get_compressed_vector_size<T_in>(value_bit_width);
+
+  in += lane;
+
+#pragma unroll
+  for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+    line_buffer[v] = *(in + n_input_line * N_LANES + v * encoded_vector_offset);
+  }
+  out += unpacking_type == UnpackingType::VectorArray ? lane : 0;
+  n_input_line++;
+
+  T_in value[UNPACK_N_VECTORS];
+
+#pragma unroll
+  for (int i = 0; i < UNPACK_N_VALUES; ++i) {
+    bool line_buffer_is_empty = buffer_offset == LANE_BIT_WIDTH;
+    if (line_buffer_is_empty) {
+#pragma unroll
+      for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+        line_buffer[v] =
+            *(in + n_input_line * N_LANES + v * encoded_vector_offset);
+      }
+      ++n_input_line;
+      buffer_offset -= LANE_BIT_WIDTH;
+    }
+
+#pragma unroll
+    for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+      value[v] =
+          (line_buffer[v] & (value_mask << buffer_offset)) >> buffer_offset;
+    }
+    buffer_offset += value_bit_width;
+
+    bool value_continues_on_next_line = buffer_offset > LANE_BIT_WIDTH;
+    if (value_continues_on_next_line) {
+#pragma unroll
+      for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+        line_buffer[v] =
+            *(in + n_input_line * N_LANES + v * encoded_vector_offset);
+      }
+      ++n_input_line;
+      buffer_offset -= LANE_BIT_WIDTH;
+
+      buffer_offset_mask =
+          (T_in{1} << static_cast<T_in>(buffer_offset)) - T_in{1};
+#pragma unroll
+      for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+        value[v] |= (line_buffer[v] & buffer_offset_mask)
+                    << (value_bit_width - buffer_offset);
+      }
+    }
+
+#pragma unroll
+    for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+      *(out + v * UNPACK_N_VALUES) = lambda(value[v]);
+    }
+    out += unpacking_type == UnpackingType::VectorArray ? N_LANES : 1;
+  }
+}
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES,
+          UnpackingType unpacking_type>
+__device__ void
+bitunpack_vector(const T *__restrict in, T *__restrict out, const uint16_t lane,
+                 const uint16_t value_bit_width, const uint16_t start_index) {
+  auto lambda = [=](const T value) -> T { return value; };
+  unpack_vector<T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES, unpacking_type>(
+      in, out, lane, value_bit_width, start_index, lambda);
+}
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES,
+          UnpackingType unpacking_type>
+__device__ void
+unffor_vector(const T *__restrict in, T *__restrict out, const uint16_t lane,
+              const uint16_t value_bit_width, const uint16_t start_index,
+              const T *__restrict a_base_p) {
+  T base = *a_base_p;
+  auto lambda = [base](const T value) -> T { return value + base; };
+  unpack_vector<T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES, unpacking_type>(
+      in, out, lane, value_bit_width, start_index, lambda);
+}
+
+template <typename T, typename T_dict, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
+__device__ void
+undict_vector(const T *__restrict in, T *__restrict out, const uint16_t lane,
+              const uint16_t value_bit_width, const uint16_t start_index,
+              const T *__restrict a_base_p, const T_dict *__restrict dict) {
+  T base = *a_base_p;
+  auto lambda = [base, dict](const T value) -> T { return dict[value + base]; };
+  unpack_vector<T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES, unpacking_type>(
+      in, out, lane, value_bit_width, start_index, lambda);
+}
+
+#endif // FLS_CUH
From 477278382147ace35b1c8d0ebe84b885bcd2a947 Mon Sep 17 00:00:00 2001
From: SvenHepkema <47423359+SvenHepkema@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:41:14 +0200
Subject: [PATCH 2/3] Added more files and fixed references

---
 gpu-kernels/alp-test-kernels-global.cuh |  2 +-
 gpu-kernels/alp.cuh                     |  8 +--
 gpu-kernels/consts.hpp                  | 28 ++++++++
 gpu-kernels/fls.cuh                     |  2 +-
 gpu-kernels/gpu-utils.cuh               | 91 +++++++++++++++++++++++
 gpu-kernels/utils.hpp                   | 62 +++++++++++++++++
 6 files changed, 187 insertions(+), 6 deletions(-)
 create mode 100644 gpu-kernels/consts.hpp
 create mode 100644 gpu-kernels/gpu-utils.cuh
 create mode 100644 gpu-kernels/utils.hpp

diff --git a/gpu-kernels/alp-test-kernels-global.cuh b/gpu-kernels/alp-test-kernels-global.cuh
index 7604384..d80d7d4 100644
--- a/gpu-kernels/alp-test-kernels-global.cuh
+++ b/gpu-kernels/alp-test-kernels-global.cuh
@@ -1,6 +1,6 @@
 #include "alp.cuh"
 
-#include "../common/utils.hpp"
+#include "utils.hpp"
 
 #ifndef ALP_TEST_KERNELS_GLOBAL_CUH
 #define ALP_TEST_KERNELS_GLOBAL_CUH
diff --git a/gpu-kernels/alp.cuh b/gpu-kernels/alp.cuh
index f6e370d..dc5bccc 100644
--- a/gpu-kernels/alp.cuh
+++ b/gpu-kernels/alp.cuh
@@ -2,10 +2,10 @@
 #include <cuda_runtime.h>
 #include <type_traits>
 
-#include "../alp/constants.hpp"
-#include "../common/utils.hpp"
-#include "../gpu-fls/fls.cuh"
-#include "src/alp/config.hpp"
+#include "../alp/include/constants.hpp"
+#include "utils.hpp"
+#include "fls.cuh"
+#include "../alp/include/config.hpp"
 
 #ifndef ALP_CUH
 #define ALP_CUH
diff --git a/gpu-kernels/consts.hpp b/gpu-kernels/consts.hpp
new file mode 100644
index 0000000..04e081c
--- /dev/null
+++ b/gpu-kernels/consts.hpp
@@ -0,0 +1,28 @@
+#include <cstdint>
+
+#ifndef CONSTS_H
+#define CONSTS_H
+
+namespace consts {
+
+constexpr int32_t REGISTER_WIDTH = 1024;
+constexpr int32_t VALUES_PER_VECTOR = 1024;
+constexpr int32_t THREADS_PER_WARP = 32;
+
+template <typename T>
+struct as {
+  static inline constexpr T MAGIC_NUMBER = 0;
+};
+
+template <>
+struct as<float> {
+  static inline constexpr float MAGIC_NUMBER = 0.3214f;
+};
+
+template <>
+struct as<double> {
+  static inline constexpr double MAGIC_NUMBER = 0.3214;
+};
+} // namespace consts
+
+#endif // CONSTS_H
diff --git a/gpu-kernels/fls.cuh b/gpu-kernels/fls.cuh
index 17893d8..b18af8b 100644
--- a/gpu-kernels/fls.cuh
+++ b/gpu-kernels/fls.cuh
@@ -3,7 +3,7 @@
 #include <cuda_runtime.h>
 #include <type_traits>
 
-#include "../common/utils.hpp"
+#include "utils.hpp"
 
 #ifndef FLS_CUH
 #define FLS_CUH
diff --git a/gpu-kernels/gpu-utils.cuh b/gpu-kernels/gpu-utils.cuh
new file mode 100644
index 0000000..e20300d
--- /dev/null
+++ b/gpu-kernels/gpu-utils.cuh
@@ -0,0 +1,91 @@
+#include <__clang_cuda_runtime_wrapper.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+#ifndef GPU_UTILS_H
+#define GPU_UTILS_H
+
+#define CUDA_SAFE_CALL(call)                                                   \
+  do {                                                                         \
+    cudaError_t err = call;                                                    \
+    if (cudaSuccess != err) {                                                  \
+      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.", __FILE__,    \
+              __LINE__, cudaGetErrorString(err));                              \
+      exit(EXIT_FAILURE);                                                      \
+    }                                                                          \
+  } while (0)
+
+template <typename T> void free_device_pointer(T *&device_ptr) {
+  if (device_ptr != nullptr) {
+    CUDA_SAFE_CALL(cudaFree(device_ptr));
+  }
+  device_ptr = nullptr;
+}
+
+template <typename T> class GPUArray {
+private:
+  size_t count;
+  T *device_ptr = nullptr;
+
+  void allocate() {
+    CUDA_SAFE_CALL(
+        cudaMalloc(reinterpret_cast<void **>(&device_ptr), count * sizeof(T)));
+  }
+
+public:
+  GPUArray(const size_t a_count) {
+    count = a_count;
+    allocate();
+  }
+
+  GPUArray(const size_t a_count, const T *host_p) {
+    count = a_count;
+    allocate();
+    CUDA_SAFE_CALL(cudaMemcpy(device_ptr, host_p, count * sizeof(T),
+                              cudaMemcpyHostToDevice));
+  }
+
+  // Copy constructor deleted
+  GPUArray(const GPUArray &) = delete;
+  // Copy assignment operator deleted
+  GPUArray &operator=(const GPUArray &) = delete;
+
+  // Move constructor (must also take over the element count)
+  GPUArray(GPUArray &&other) noexcept
+      : count(other.count), device_ptr(other.device_ptr) {
+    other.device_ptr = nullptr;
+  }
+
+  // Move assignment operator
+  GPUArray &operator=(GPUArray &&other) noexcept {
+    if (this != &other) {
+      free_device_pointer(device_ptr);
+      count = other.count;
+      device_ptr = other.device_ptr;
+      other.device_ptr = nullptr;
+    }
+    return *this;
+  }
+
+  ~GPUArray() {
+    free_device_pointer(device_ptr);
+  }
+
+  void copy_to_host(T *host_p) {
+    CUDA_SAFE_CALL(cudaMemcpy(host_p, device_ptr, count * sizeof(T),
+                              cudaMemcpyDeviceToHost));
+  }
+
+  T *get() { return device_ptr; }
+
+  T *release() {
+    auto temp = device_ptr;
+    device_ptr = nullptr;
+    return temp;
+  }
+};
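+
+// Illustrative usage: RAII round-trip of a host buffer through device memory.
+//
+//   std::vector<float> host(1024, 1.0f);
+//   GPUArray<float> device(host.size(), host.data());
+//   // ... launch kernels on device.get() ...
+//   device.copy_to_host(host.data());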
+
+#endif // GPU_UTILS_H
diff --git a/gpu-kernels/utils.hpp b/gpu-kernels/utils.hpp
new file mode 100644
index 0000000..9a3bd76
--- /dev/null
+++ b/gpu-kernels/utils.hpp
@@ -0,0 +1,62 @@
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "consts.hpp"
+
+#ifndef FASTLANES_UTILS_H
+#define FASTLANES_UTILS_H
+
+namespace utils { // internal functions
+
+template <typename T> constexpr int32_t sizeof_in_bits() {
+  return sizeof(T) * 8;
+}
+
+template <typename T> constexpr T set_first_n_bits(const int32_t count) {
+  return (count < sizeof_in_bits<T>()
+              ? static_cast<T>((T{1} << int32_t{count}) - T{1})
+              : static_cast<T>(~T{0}));
+}
+
+template <typename T> constexpr int32_t get_lane_bitwidth() {
+  return sizeof_in_bits<T>();
+}
+
+template <typename T> constexpr int32_t get_n_lanes() {
+  return consts::REGISTER_WIDTH / get_lane_bitwidth<T>();
+}
+
+template <typename T> constexpr int32_t get_values_per_lane() {
+  return consts::VALUES_PER_VECTOR / get_n_lanes<T>();
+}
+
+template <typename T>
+constexpr int32_t get_compressed_vector_size(int32_t value_bit_width) {
+  return (consts::VALUES_PER_VECTOR * value_bit_width) / sizeof_in_bits<T>();
+}
+
+constexpr size_t get_n_vecs_from_size(const size_t size) {
+  return (size + consts::VALUES_PER_VECTOR - 1) / consts::VALUES_PER_VECTOR;
+}
+
+template <typename T>
+struct same_width_int {
+  using type = typename std::conditional<
+      sizeof(T) == 4, int32_t,
+      typename std::conditional<sizeof(T) == 8, int64_t,
+                                typename std::remove_cv<T>::type>::type>::type;
+};
+
+template <typename T>
+struct same_width_uint {
+  using type = typename std::conditional<
+      sizeof(T) == 4, uint32_t,
+      typename std::conditional<sizeof(T) == 8, uint64_t,
+                                typename std::remove_cv<T>::type>::type>::type;
+};
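+
+// Illustrative: these aliases pick the integer type matching a float type's
+// width, e.g. same_width_uint<double>::type is uint64_t; set_first_n_bits<
+// uint32_t>(3) yields 0b111, the value mask used by unpack_vector in fls.cuh.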
+
+} // namespace utils
+
+#endif // FASTLANES_UTILS_H
From 673e1ca35db5f9491e7b611de5a61d3f0f9dfafb Mon Sep 17 00:00:00 2001
From: SvenHepkema <47423359+SvenHepkema@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:47:35 +0200
Subject: [PATCH 3/3] Removed some outdated comments and a redundant kernel

---
 gpu-kernels/alp.cuh | 100 +-------------------------------------------
 1 file changed, 2 insertions(+), 98 deletions(-)

diff --git a/gpu-kernels/alp.cuh b/gpu-kernels/alp.cuh
index dc5bccc..8cc1748 100644
--- a/gpu-kernels/alp.cuh
+++ b/gpu-kernels/alp.cuh
@@ -82,17 +82,6 @@ template <> __device__ __forceinline__ int64_t *get_fact_arr<int64_t>() {
 }
 } // namespace constant_memory
 
-// WARNING
-// WARNING
-// TODO WARNING IS IT NOT FASTER TO PASS THESE ARGUMENTS IN FULL WIDTH?
-
-// SO uint8_t -> uint32_t (if it gets multiplied with 32) This saves a cast
-// in each kernel, and we do not care how big parameters are, as they are
-// passed via const
-// INFO Hypothesis: not stalling on arithmetic, so it does not matter in
-// execution time. Check # executed instructions tho.
-// WARNING
-// WARNING
 template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
           unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
 __device__ void unalp(T_out *__restrict out, const AlpColumn<T_out> column,
@@ -112,9 +101,8 @@ __device__ void unalp(T_out *__restrict out, const AlpColumn<T_out> column,
   INT_T factor =
       constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
   T_out frac10 = constant_memory::get_frac_arr<
-      T_out>()[column.exponents[vector_index]]; // WARNING TODO implement a
-                                                // compile time switch to grab
-                                                // float array
+      T_out>()[column.exponents[vector_index]];
+
   auto lambda = [base, factor, frac10](const T_in value) -> T_out {
     return static_cast<T_out>(static_cast<INT_T>(
                (value + base) * static_cast<T_in>(factor))) *
@@ -138,8 +126,6 @@ __device__ void unalp(T_out *__restrict out, const AlpColumn<T_out> column,
   if (unpacking_type == UnpackingType::VectorArray) {
     for (int i{lane}; i < exceptions_count; i += N_LANES) {
       // WARNING Currently assumes that you are decoding an entire vector
-      // TODO Implement an if (position > startindex && position < (start_index
-      // + UNPACK_N_VALUES * n_lanes) {...}
      auto position = vec_exceptions_positions[i];
       out[position] = vec_exceptions[i];
     }
@@ -161,83 +147,6 @@ __device__ void unalp(T_out *__restrict out, const AlpColumn<T_out> column,
 
 template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
           unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
-__device__ void
-unalp_with_scanner(T_out *__restrict out, const AlpColumn<T_out> column,
-                   const uint16_t vector_index, const uint16_t lane,
-                   const uint16_t start_index) {
-  static_assert((std::is_same<T_in, uint32_t>::value &&
-                 std::is_same<T_out, float>::value) ||
-                    (std::is_same<T_in, uint64_t>::value &&
-                     std::is_same<T_out, double>::value),
-                "Wrong type arguments");
-  using INT_T = typename utils::same_width_int<T_out>::type;
-  using UINT_T = typename utils::same_width_uint<T_out>::type;
-
-  T_in *in = column.ffor_array + consts::VALUES_PER_VECTOR * vector_index;
-  uint16_t value_bit_width = column.bit_widths[vector_index];
-  UINT_T base = column.ffor_bases[vector_index];
-  INT_T factor =
-      constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
-  T_out frac10 = constant_memory::get_frac_arr<
-      T_out>()[column.exponents[vector_index]]; // WARNING TODO implement a
-                                                // compile time switch to grab
-                                                // float array
-  auto lambda = [base, factor, frac10](const T_in value) -> T_out {
-    return static_cast<T_out>(static_cast<INT_T>(
-               (value + base) * static_cast<T_in>(factor))) *
-           frac10;
-  };
-
-  unpack_vector<T_in, T_out, UNPACK_N_VECTORS, UNPACK_N_VALUES,
-                unpacking_type>(in, out, lane, value_bit_width, start_index,
-                                lambda);
-
-  // Patch exceptions
-  constexpr auto N_LANES = utils::get_n_lanes<T_in>();
-  auto exceptions_count = column.counts[vector_index];
-
-  auto vec_exceptions =
-      column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
-  auto vec_exceptions_positions =
-      column.positions + consts::VALUES_PER_VECTOR * vector_index;
-
-  const int first_pos = start_index * N_LANES + lane;
-  const int last_pos = first_pos + N_LANES * (UNPACK_N_VALUES - 1);
-  if (unpacking_type == UnpackingType::VectorArray) {
-    for (int i{lane}; i < exceptions_count; i += N_LANES) {
-      // WARNING Currently assumes that you are decoding an entire vector
-      // TODO Implement an if (position > startindex && position < (start_index
-      // + UNPACK_N_VALUES * n_lanes) {...}
-      auto position = vec_exceptions_positions[i];
-      out[position] = vec_exceptions[i];
-    }
-  } else if (unpacking_type == UnpackingType::LaneArray) {
-    constexpr int32_t SCANNER_SIZE = 1;
-    uint16_t scanner[SCANNER_SIZE];
-
-    for (int i{0}; i < exceptions_count; i += SCANNER_SIZE) {
-
-      for (int j{0}; j < SCANNER_SIZE && j + i < exceptions_count; ++j) {
-        scanner[j] = vec_exceptions_positions[j + i];
-      }
-
-      for (int j{0}; j < SCANNER_SIZE && j + i < exceptions_count; ++j) {
-        auto position = scanner[j];
-        if (position >= first_pos) {
-          if (position <= last_pos && position % N_LANES == lane) {
-            out[(position - first_pos) / N_LANES] = vec_exceptions[j + i];
-          }
-          if (position + 1 > last_pos) {
-            return;
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
-          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
-
 struct Unpacker {
   const int16_t vector_index;
   const uint16_t lane;
@@ -328,7 +237,6 @@ __device__ void unalprd(T_out *__restrict out, const AlpRdColumn<T_out> column,
   UINT_T *out_ut = reinterpret_cast<UINT_T *>(out);
 
   // Loading left parts dict
-  // TODO Let the threads collaborate to load this data
   const uint16_t *left_parts_dicts_p =
       column.left_parts_dicts +
       vector_index * alp::config::MAX_RD_DICTIONARY_SIZE;
@@ -338,7 +246,6 @@ __device__ void unalprd(T_out *__restrict out, const AlpRdColumn<T_out> column,
   }
 
   // Unfforring Arrays
-  // INFO The paper says something about fusing, consider using a custom lambda
   // TODO Do thread local array instead of shared
   __shared__ uint16_t left_array[consts::VALUES_PER_VECTOR];
   const uint16_t *left_ffor_array =
@@ -367,9 +274,6 @@ __device__ void unalprd(T_out *__restrict out, const AlpRdColumn<T_out> column,
   // Decoding
 #pragma unroll
   for (int i{lane}; i < consts::VALUES_PER_VECTOR; i += N_LANES) {
-    // WARNING THIS SHOULD NOT WRITE TO GLOBAL
-    // TODO write to thread local, and then patch all thread local values
-    // INFO THIS IS ALSO an issue in the normal ALP kernel
     out_ut[i] =
         (static_cast<UINT_T>(left_array[i]) << right_bitwidth) | right_array[i];
   }