diff --git a/gpu-kernels/alp-test-kernels-global.cuh b/gpu-kernels/alp-test-kernels-global.cuh
new file mode 100644
index 0000000..d80d7d4
--- /dev/null
+++ b/gpu-kernels/alp-test-kernels-global.cuh
@@ -0,0 +1,125 @@
+// Test kernels that decode complete ALP / ALP-RD compressed vectors to
+// global memory. Launch layout: one thread block handles UNPACK_N_VECTORS
+// vectors; threadIdx.x decomposes into (vector_index, lane).
+//
+// NOTE(review): every template parameter/argument list in this patch was
+// destroyed by angle-bracket stripping. The lists below are reconstructed
+// from usage and MUST be verified against the original sources.
+#include "alp.cuh"
+
+#include "utils.hpp"
+
+#ifndef ALP_TEST_KERNELS_GLOBAL_CUH
+#define ALP_TEST_KERNELS_GLOBAL_CUH
+
+namespace alp {
+namespace kernels {
+namespace global {
+namespace test {
+
+// Decode whole vectors straight into `out` (VectorArray layout: each lane
+// writes its interleaved positions, advancing UNPACK_N_VALUES * N_LANES
+// output elements per unpack step).
+template <typename T, int UNPACK_N_VECTORS, int UNPACK_N_VALUES>
+__global__ void decode_complete_alp_vector(T *out, AlpColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  // Kept from the original; presumably fed the stripped template arguments.
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<UINT_T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<UINT_T>();
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<UINT_T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int16_t vector_index = threadIdx.x / N_LANES;
+  const int32_t block_index = blockIdx.x;
+
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+
+  out += (block_index * n_vectors_per_block + vector_index) *
+         consts::VALUES_PER_VECTOR;
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    // NOTE(review): `block_index` is passed as the vector index (the
+    // unpacker strides over the extra vectors itself) — confirm the
+    // per-vector metadata indexing when UNPACK_N_VECTORS > 1.
+    unalp<UnpackingType::VectorArray, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+          UINT_T, T>(out, data, block_index, lane, i);
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
+
+// Decode into a per-thread register array (LaneArray layout), then scatter
+// the registers into the vector positions of `out` by hand.
+template <typename T, int UNPACK_N_VECTORS, int UNPACK_N_VALUES>
+__global__ void decode_alp_vector_into_lane(T *out, AlpColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<UINT_T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<UINT_T>();
+  constexpr uint32_t N_VALUES = UNPACK_N_VALUES * UNPACK_N_VECTORS;
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<UINT_T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int32_t block_index = blockIdx.x;
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+  const int16_t vector_index =
+      block_index * n_vectors_per_block + (threadIdx.x / N_LANES);
+
+  T registers[N_VALUES];
+  out += vector_index * consts::VALUES_PER_VECTOR;
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    unalp<UnpackingType::LaneArray, UNPACK_N_VECTORS, UNPACK_N_VALUES, UINT_T,
+          T>(registers, data, block_index, lane, i);
+
+    // FIX: the inner loop previously reused `i`, shadowing the outer loop
+    // counter; renamed to `j` (behavior unchanged — block scoping saved us).
+    for (int j = 0; j < UNPACK_N_VALUES; j++) {
+      out[lane + j * N_LANES] = registers[j];
+    }
+
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
+
+// Same as decode_alp_vector_into_lane, but uses the stateful Unpacker, which
+// carries the bit position and exception cursor across iterations.
+template <typename T, int UNPACK_N_VECTORS, int UNPACK_N_VALUES>
+__global__ void decode_alp_vector_with_state(T *out, AlpColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<UINT_T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<UINT_T>();
+  constexpr uint32_t N_VALUES = UNPACK_N_VALUES * UNPACK_N_VECTORS;
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<UINT_T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int32_t block_index = blockIdx.x;
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+  const int16_t vector_index =
+      block_index * n_vectors_per_block + (threadIdx.x / N_LANES);
+
+  T registers[N_VALUES];
+  out += vector_index * consts::VALUES_PER_VECTOR;
+
+  // NOTE(review): unlike the unalp-based kernels (which pass block_index),
+  // the iterator receives the global vector_index — confirm intentional.
+  auto iterator = Unpacker<UnpackingType::LaneArray, UNPACK_N_VECTORS,
+                           UNPACK_N_VALUES, UINT_T, T>(vector_index, lane,
+                                                       data);
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    iterator.unpack_next_into(registers);
+
+    // FIX: inner counter renamed from the shadowing `i` to `j`.
+    for (int j = 0; j < UNPACK_N_VALUES; j++) {
+      out[lane + j * N_LANES] = registers[j];
+    }
+
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
+
+// Decode whole ALP-RD vectors directly into `out` (VectorArray layout).
+template <typename T, int UNPACK_N_VECTORS, int UNPACK_N_VALUES>
+__global__ void decode_complete_alprd_vector(T *out, AlpRdColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<UINT_T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<UINT_T>();
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<UINT_T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int16_t vector_index = threadIdx.x / N_LANES;
+  const int32_t block_index = blockIdx.x;
+
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+
+  out += (block_index * n_vectors_per_block + vector_index) *
+         consts::VALUES_PER_VECTOR;
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    unalprd<UnpackingType::VectorArray, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+            UINT_T, T>(out, data, block_index, lane, i);
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
+
+} // namespace test
+} // namespace global
+} // namespace kernels
+} // namespace alp
+
+#endif // ALP_TEST_KERNELS_GLOBAL_CUH
diff --git a/gpu-kernels/alp.cuh b/gpu-kernels/alp.cuh
new file mode 100644
index 0000000..8cc1748
--- /dev/null
+++ b/gpu-kernels/alp.cuh
@@ -0,0 +1,309 @@
+// ALP / ALP-RD compressed-column device structures and decoding routines.
+//
+// NOTE(review): this patch was mangled by angle-bracket stripping — every
+// #include <name> and every template parameter/argument list was lost. All
+// such lists below are reconstructed from usage and need verification
+// against the original sources.
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "../alp/include/constants.hpp"
+#include "utils.hpp"
+#include "fls.cuh"
+#include "../alp/include/config.hpp"
+
+#ifndef ALP_CUH
+#define ALP_CUH
+
+// Device view of an ALP-encoded column: FFOR-packed digits plus per-vector
+// metadata; exceptions/positions use one VALUES_PER_VECTOR-sized slot per
+// vector.
+template <typename T> struct AlpColumn {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+
+  UINT_T *ffor_array;
+  UINT_T *ffor_bases;
+  uint8_t *bit_widths;
+  uint8_t *exponents;
+  uint8_t *factors;
+
+  T *exceptions;
+  uint16_t *positions;
+  uint16_t *counts;
+};
+
+// Device view of an ALP-RD-encoded column: dictionary-coded left parts plus
+// FFOR-packed right parts, with per-vector dictionaries and exceptions.
+template <typename T> struct AlpRdColumn {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+
+  uint16_t *left_ffor_array;
+  uint16_t *left_ffor_bases;
+  uint8_t *left_bit_widths;
+
+  UINT_T *right_ffor_array;
+  UINT_T *right_ffor_bases;
+  uint8_t *right_bit_widths;
+
+  uint16_t *left_parts_dicts;
+
+  uint16_t *exceptions;
+  uint16_t *positions;
+  uint16_t *counts;
+};
+
+namespace constant_memory {
+constexpr int32_t F_FACT_ARR_COUNT = 10;
+constexpr int32_t F_FRAC_ARR_COUNT = 11;
+__constant__ int32_t F_FACT_ARRAY[F_FACT_ARR_COUNT];
+__constant__ float F_FRAC_ARRAY[F_FRAC_ARR_COUNT];
+
+constexpr int32_t D_FACT_ARR_COUNT = 19;
+constexpr int32_t D_FRAC_ARR_COUNT = 21;
+__constant__ int64_t D_FACT_ARRAY[D_FACT_ARR_COUNT];
+__constant__ double D_FRAC_ARRAY[D_FRAC_ARR_COUNT];
+
+// Copies the host-side ALP factor/fraction tables into constant memory.
+// Both the float and the double tables are always loaded; T is kept only
+// for source compatibility with the original template.
+template <typename T> __host__ void load_alp_constants() {
+  cudaMemcpyToSymbol(F_FACT_ARRAY, alp::Constants<float>::FACT_ARR,
+                     F_FACT_ARR_COUNT * sizeof(int32_t));
+  cudaMemcpyToSymbol(F_FRAC_ARRAY, alp::Constants<float>::FRAC_ARR,
+                     F_FRAC_ARR_COUNT * sizeof(float));
+
+  cudaMemcpyToSymbol(D_FACT_ARRAY, alp::Constants<double>::FACT_ARR,
+                     D_FACT_ARR_COUNT * sizeof(int64_t));
+  cudaMemcpyToSymbol(D_FRAC_ARRAY, alp::Constants<double>::FRAC_ARR,
+                     D_FRAC_ARR_COUNT * sizeof(double));
+}
+
+// Fraction (10^-e) table selected by the floating-point type.
+template <typename T> __device__ __forceinline__ T *get_frac_arr();
+template <> __device__ __forceinline__ float *get_frac_arr<float>() {
+  return F_FRAC_ARRAY;
+}
+template <> __device__ __forceinline__ double *get_frac_arr<double>() {
+  return D_FRAC_ARRAY;
+}
+
+// Factor (10^f) table selected by the same-width signed integer type.
+template <typename T> __device__ __forceinline__ T *get_fact_arr();
+template <> __device__ __forceinline__ int32_t *get_fact_arr<int32_t>() {
+  return F_FACT_ARRAY;
+}
+template <> __device__ __forceinline__ int64_t *get_fact_arr<int64_t>() {
+  return D_FACT_ARRAY;
+}
+} // namespace constant_memory
+
+// Decode UNPACK_N_VALUES values per lane of one ALP vector, starting at
+// `start_index` values into the lane, and patch the listed exceptions.
+// T_in is the packed (unsigned) word type, T_out the decoded float type.
+template <UnpackingType unpacking_type, int UNPACK_N_VECTORS,
+          int UNPACK_N_VALUES, typename T_in, typename T_out>
+__device__ void unalp(T_out *__restrict out, const AlpColumn<T_out> column,
+                      const uint16_t vector_index, const uint16_t lane,
+                      const uint16_t start_index) {
+  static_assert((std::is_same<T_in, uint32_t>::value &&
+                 std::is_same<T_out, float>::value) ||
+                    (std::is_same<T_in, uint64_t>::value &&
+                     std::is_same<T_out, double>::value),
+                "Wrong type arguments");
+  using INT_T = typename utils::same_width_int<T_out>::type;
+  // FIX: the original aliased UINT_T to same_width_int (a signed type);
+  // ffor_bases is declared with same_width_uint, so use that here as well.
+  using UINT_T = typename utils::same_width_uint<T_out>::type;
+
+  T_in *in = column.ffor_array + consts::VALUES_PER_VECTOR * vector_index;
+  uint16_t value_bit_width = column.bit_widths[vector_index];
+  UINT_T base = column.ffor_bases[vector_index];
+  INT_T factor =
+      constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
+  T_out frac10 = constant_memory::get_frac_arr<
+      T_out>()[column.exponents[vector_index]];
+
+  // ALP decode: digits = (packed + base) * factor in modular unsigned
+  // arithmetic, reinterpreted as signed, then scaled by 10^-e.
+  auto lambda = [base, factor, frac10](const T_in value) -> T_out {
+    return static_cast<T_out>(static_cast<INT_T>(
+               (value + base) * static_cast<T_in>(factor))) *
+           frac10;
+  };
+
+  unpack_vector<T_in, T_out, unpacking_type, UNPACK_N_VECTORS,
+                UNPACK_N_VALUES>(in, out, lane, value_bit_width, start_index,
+                                 lambda);
+
+  // Patch exceptions
+  constexpr auto N_LANES = utils::get_n_lanes<T_in>();
+  auto exceptions_count = column.counts[vector_index];
+
+  auto vec_exceptions =
+      column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
+  auto vec_exceptions_positions =
+      column.positions + consts::VALUES_PER_VECTOR * vector_index;
+
+  const int first_pos = start_index * N_LANES + lane;
+  const int last_pos = first_pos + N_LANES * (UNPACK_N_VALUES - 1);
+  if (unpacking_type == UnpackingType::VectorArray) {
+    for (int i{lane}; i < exceptions_count; i += N_LANES) {
+      // WARNING Currently assumes that you are decoding an entire vector
+      auto position = vec_exceptions_positions[i];
+      out[position] = vec_exceptions[i];
+    }
+  } else if (unpacking_type == UnpackingType::LaneArray) {
+    // Positions are sorted; scan until past this lane's [first, last] range.
+    for (int i{0}; i < exceptions_count; i++) {
+      auto position = vec_exceptions_positions[i];
+      auto exception = vec_exceptions[i];
+      if (position >= first_pos) {
+        if (position <= last_pos && position % N_LANES == lane) {
+          out[(position - first_pos) / N_LANES] = exception;
+        }
+        if (position + 1 > last_pos) {
+          return;
+        }
+      }
+    }
+  }
+}
+
+// Stateful lane unpacker: remembers the value offset (start_index) and the
+// exception cursor between successive unpack_next_into() calls, so a kernel
+// can iterate through a vector without rescanning the exception list.
+template <UnpackingType unpacking_type, int UNPACK_N_VECTORS,
+          int UNPACK_N_VALUES, typename T_in, typename T_out>
+struct Unpacker {
+  const int16_t vector_index;
+  const uint16_t lane;
+  const AlpColumn<T_out> column;
+  int32_t start_index = 0;     // next value offset within the lane
+  int32_t exception_index = 0; // next unconsumed exception entry
+
+  __device__ Unpacker(const uint16_t vector_index, const uint16_t lane,
+                      const AlpColumn<T_out> column)
+      : vector_index(vector_index), lane(lane), column(column) {}
+
+  // Decode the next UNPACK_N_VALUES values of this lane into `out`.
+  __device__ void unpack_next_into(T_out *__restrict out) {
+    static_assert((std::is_same<T_in, uint32_t>::value &&
+                   std::is_same<T_out, float>::value) ||
+                      (std::is_same<T_in, uint64_t>::value &&
+                       std::is_same<T_out, double>::value),
+                  "Wrong type arguments");
+    using INT_T = typename utils::same_width_int<T_out>::type;
+    // FIX: same signed/unsigned alias mix-up as in unalp (see above).
+    using UINT_T = typename utils::same_width_uint<T_out>::type;
+
+    T_in *in = column.ffor_array + consts::VALUES_PER_VECTOR * vector_index;
+    uint16_t value_bit_width = column.bit_widths[vector_index];
+    UINT_T base = column.ffor_bases[vector_index];
+    INT_T factor =
+        constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
+    T_out frac10 = constant_memory::get_frac_arr<
+        T_out>()[column.exponents[vector_index]];
+    auto lambda = [base, factor, frac10](const T_in value) -> T_out {
+      return static_cast<T_out>(static_cast<INT_T>(
+                 (value + base) * static_cast<T_in>(factor))) *
+             frac10;
+    };
+
+    unpack_vector<T_in, T_out, unpacking_type, UNPACK_N_VECTORS,
+                  UNPACK_N_VALUES>(in, out, lane, value_bit_width,
+                                   start_index, lambda);
+
+    // Patch exceptions
+    constexpr auto N_LANES = utils::get_n_lanes<T_in>();
+    auto exceptions_count = column.counts[vector_index];
+
+    auto vec_exceptions =
+        column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
+    auto vec_exceptions_positions =
+        column.positions + consts::VALUES_PER_VECTOR * vector_index;
+
+    const int first_pos = start_index * N_LANES + lane;
+    const int last_pos = first_pos + N_LANES * (UNPACK_N_VALUES - 1);
+    start_index += UNPACK_N_VALUES;
+    if (unpacking_type == UnpackingType::VectorArray) {
+      for (int i{lane}; i < exceptions_count; i += N_LANES) {
+        auto position = vec_exceptions_positions[i];
+        out[position] = vec_exceptions[i];
+      }
+    } else if (unpacking_type == UnpackingType::LaneArray) {
+      // Resume from the persistent cursor; positions are sorted.
+      for (; exception_index < exceptions_count; exception_index++) {
+        auto position = vec_exceptions_positions[exception_index];
+        auto exception = vec_exceptions[exception_index];
+        if (position >= first_pos) {
+          if (position <= last_pos && position % N_LANES == lane) {
+            out[(position - first_pos) / N_LANES] = exception;
+          }
+          if (position + 1 > last_pos) {
+            return;
+          }
+        }
+      }
+    }
+  }
+};
+
+// Decode one ALP-RD vector: the dictionary-coded left halves and the
+// FFOR-packed right halves are staged in shared buffers and glued together.
+// `start_index` is currently unused — the whole vector is always decoded.
+template <UnpackingType unpacking_type, int UNPACK_N_VECTORS,
+          int UNPACK_N_VALUES, typename T_in, typename T_out>
+__device__ void unalprd(T_out *__restrict out, const AlpRdColumn<T_out> column,
+                        const uint16_t vector_index, const uint16_t lane,
+                        const uint16_t start_index) {
+  static_assert((std::is_same<T_in, uint32_t>::value &&
+                 std::is_same<T_out, float>::value) ||
+                    (std::is_same<T_in, uint64_t>::value &&
+                     std::is_same<T_out, double>::value),
+                "Wrong type arguments");
+  using INT_T = typename utils::same_width_int<T_out>::type;
+  using UINT_T = typename utils::same_width_uint<T_out>::type;
+
+  constexpr int32_t N_LANES = utils::get_n_lanes<T_in>();
+  constexpr int32_t VALUES_PER_LANE = utils::get_values_per_lane<T_in>();
+
+  UINT_T *out_ut = reinterpret_cast<UINT_T *>(out);
+
+  // Loading left parts dict
+  const uint16_t *left_parts_dicts_p =
+      column.left_parts_dicts +
+      vector_index * alp::config::MAX_RD_DICTIONARY_SIZE;
+  uint16_t left_parts_dict[alp::config::MAX_RD_DICTIONARY_SIZE];
+  for (size_t j{0}; j < alp::config::MAX_RD_DICTIONARY_SIZE; j++) {
+    left_parts_dict[j] = left_parts_dicts_p[j];
+  }
+
+  // Unfforring Arrays
+  // TODO Do thread local array instead of shared
+  // NOTE(review): these __shared__ buffers are shared by ALL threads in the
+  // block (including threads on other vector_index values), and there is no
+  // __syncthreads() before the exception loop below reads positions written
+  // by other threads — verify the launch configuration makes this safe.
+  __shared__ uint16_t left_array[consts::VALUES_PER_VECTOR];
+  const uint16_t *left_ffor_array =
+      column.left_ffor_array + vector_index * consts::VALUES_PER_VECTOR;
+  const uint16_t left_bitwidth = column.left_bit_widths[vector_index];
+  const uint16_t left_base = column.left_ffor_bases[vector_index];
+
+  // WARNING This mapping from alprd lane to uint16_t lane should be independent
+  // of block sizes
+  for (int i{lane}; i < utils::get_n_lanes<uint16_t>(); i += N_LANES) {
+    undict_vector<uint16_t, UnpackingType::VectorArray, 1,
+                  utils::get_values_per_lane<uint16_t>()>(
+        left_ffor_array, left_array, i, left_bitwidth, 0, &left_base,
+        left_parts_dict);
+  }
+
+  // TODO Do thread local array instead of shared
+  __shared__ UINT_T right_array[consts::VALUES_PER_VECTOR];
+  const UINT_T *right_ffor_array =
+      column.right_ffor_array + vector_index * consts::VALUES_PER_VECTOR;
+  const uint16_t right_bitwidth = column.right_bit_widths[vector_index];
+  const UINT_T right_base = column.right_ffor_bases[vector_index];
+  unffor_vector<UINT_T, UnpackingType::VectorArray, 1, VALUES_PER_LANE>(
+      right_ffor_array, right_array, lane, right_bitwidth, 0, &right_base);
+
+  // Decoding: glue left (high) and right (low) bit halves back together.
+#pragma unroll
+  for (int i{lane}; i < consts::VALUES_PER_VECTOR; i += N_LANES) {
+    out_ut[i] =
+        (static_cast<UINT_T>(left_array[i]) << right_bitwidth) |
+        right_array[i];
+  }
+
+  // Patching exceptions: exceptional left parts replace the dictionary code.
+  const uint16_t exceptions_count = column.counts[vector_index];
+  const uint16_t *vec_exceptions =
+      column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
+  const uint16_t *vec_exceptions_positions =
+      column.positions + consts::VALUES_PER_VECTOR * vector_index;
+
+  if (unpacking_type == UnpackingType::VectorArray) {
+    for (int i{lane}; i < exceptions_count; i += N_LANES) {
+      const auto position = vec_exceptions_positions[i];
+      const UINT_T right = right_array[position];
+      const uint16_t left = vec_exceptions[i];
+      out_ut[position] =
+          (static_cast<UINT_T>(left) << right_bitwidth) | right;
+    }
+  } else if (unpacking_type == UnpackingType::LaneArray) {
+    for (int i{0}; i < exceptions_count; ++i) {
+      const auto position = vec_exceptions_positions[i];
+
+      if (position % N_LANES == lane) {
+        const UINT_T right = right_array[position];
+        const uint16_t left = vec_exceptions[i];
+        out_ut[position / N_LANES] =
+            (static_cast<UINT_T>(left) << right_bitwidth) | right;
+      }
+    }
+  }
+}
+
+#endif // ALP_CUH
diff --git
a/gpu-kernels/consts.hpp b/gpu-kernels/consts.hpp
new file mode 100644
index 0000000..04e081c
--- /dev/null
+++ b/gpu-kernels/consts.hpp
@@ -0,0 +1,28 @@
+// Shared compile-time constants for the GPU kernels.
+// NOTE(review): the original #include names were stripped from this patch;
+// reconstructed from usage.
+#include <cstdint>
+
+#ifndef CONSTS_H
+#define CONSTS_H
+
+namespace consts {
+
+constexpr int32_t REGISTER_WIDTH = 1024;   // FastLanes virtual register width (bits)
+constexpr int32_t VALUES_PER_VECTOR = 1024;
+constexpr int32_t THREADS_PER_WARP = 32;
+
+// Per-type magic test value; the primary template yields 0 for unknown types.
+template <typename T>
+struct as {
+  static inline constexpr T MAGIC_NUMBER = 0;
+};
+
+template <>
+struct as<float> {
+  static inline constexpr float MAGIC_NUMBER = 0.3214f;
+};
+
+template <>
+struct as<double> {
+  static inline constexpr double MAGIC_NUMBER = 0.3214;
+};
+} // namespace consts
+
+#endif // CONSTS_H
diff --git a/gpu-kernels/fls.cuh b/gpu-kernels/fls.cuh
new file mode 100644
index 0000000..b18af8b
--- /dev/null
+++ b/gpu-kernels/fls.cuh
@@ -0,0 +1,128 @@
+// FastLanes-style device bit-unpacking primitives.
+// NOTE(review): include names and template parameter/argument lists were
+// stripped from this patch; reconstructed from usage — verify against the
+// original sources.
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <type_traits>
+
+#include "utils.hpp"
+
+#ifndef FLS_CUH
+#define FLS_CUH
+
+// Output layout of unpack_vector:
+//  - LaneArray:   `out` is a small per-thread array; values stored contiguously.
+//  - VectorArray: `out` points at the whole vector; each lane writes its
+//    interleaved positions (stride N_LANES).
+enum UnpackingType { LaneArray, VectorArray };
+
+// Unpack UNPACK_N_VALUES bit-packed values per lane (for UNPACK_N_VECTORS
+// consecutive vectors), applying `lambda` to each raw value before storing.
+// `start_index` is the value offset within the lane where unpacking begins.
+template <typename T_in, typename T_out, UnpackingType unpacking_type,
+          int UNPACK_N_VECTORS, int UNPACK_N_VALUES, typename lambda_T>
+__device__ void unpack_vector(const T_in *__restrict in, T_out *__restrict out,
+                              const uint16_t lane,
+                              const uint16_t value_bit_width,
+                              const uint16_t start_index, lambda_T lambda) {
+  static_assert(std::is_unsigned<T_in>::value,
+                "Packing function only supports unsigned types. Cast signed "
+                "arrays to unsigned equivalent.");
+  constexpr uint8_t LANE_BIT_WIDTH = utils::get_lane_bitwidth<T_in>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<T_in>();
+  uint16_t preceding_bits = (start_index * value_bit_width);
+  uint16_t buffer_offset = preceding_bits % LANE_BIT_WIDTH;   // bits already consumed in line
+  uint16_t n_input_line = preceding_bits / LANE_BIT_WIDTH;    // packed line to read next
+  T_in value_mask = utils::set_first_n_bits<T_in>(value_bit_width);
+
+  T_in line_buffer[UNPACK_N_VECTORS];
+  T_in buffer_offset_mask;
+
+  int32_t encoded_vector_offset =
+      utils::get_compressed_vector_size<T_in>(value_bit_width);
+
+  in += lane;
+
+#pragma unroll
+  for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+    line_buffer[v] = *(in + n_input_line * N_LANES + v * encoded_vector_offset);
+  }
+  out += unpacking_type == UnpackingType::VectorArray ? lane : 0;
+  n_input_line++;
+
+  T_in value[UNPACK_N_VECTORS];
+
+#pragma unroll
+  for (int i = 0; i < UNPACK_N_VALUES; ++i) {
+    bool line_buffer_is_empty = buffer_offset == LANE_BIT_WIDTH;
+    if (line_buffer_is_empty) {
+#pragma unroll
+      for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+        line_buffer[v] =
+            *(in + n_input_line * N_LANES + v * encoded_vector_offset);
+      }
+      ++n_input_line;
+      buffer_offset -= LANE_BIT_WIDTH;
+    }
+
+#pragma unroll
+    for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+      value[v] =
+          (line_buffer[v] & (value_mask << buffer_offset)) >> buffer_offset;
+    }
+    buffer_offset += value_bit_width;
+
+    // A value may straddle two packed lines; fetch the next line and OR in
+    // the remaining high bits.
+    bool value_continues_on_next_line = buffer_offset > LANE_BIT_WIDTH;
+    if (value_continues_on_next_line) {
+#pragma unroll
+      for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+        line_buffer[v] =
+            *(in + n_input_line * N_LANES + v * encoded_vector_offset);
+      }
+      ++n_input_line;
+      buffer_offset -= LANE_BIT_WIDTH;
+
+      buffer_offset_mask =
+          (T_in{1} << static_cast<T_in>(buffer_offset)) - T_in{1};
+#pragma unroll
+      for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+        value[v] |= (line_buffer[v] & buffer_offset_mask)
+                    << (value_bit_width - buffer_offset);
+      }
+    }
+
+#pragma unroll
+    for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+      // NOTE(review): per-vector output stride is UNPACK_N_VALUES in both
+      // layouts — confirm this is intended for VectorArray as well.
+      *(out + v * UNPACK_N_VALUES) = lambda(value[v]);
+    }
+    out += unpacking_type == UnpackingType::VectorArray ? N_LANES : 1;
+  }
+}
+
+// Plain bit-unpacking (identity transform).
+template <typename T, UnpackingType unpacking_type, int UNPACK_N_VECTORS,
+          int UNPACK_N_VALUES>
+__device__ void
+bitunpack_vector(const T *__restrict in, T *__restrict out, const uint16_t lane,
+                 const uint16_t value_bit_width, const uint16_t start_index) {
+  auto lambda = [=](const T value) -> T { return value; };
+  unpack_vector<T, T, unpacking_type, UNPACK_N_VECTORS, UNPACK_N_VALUES>(
+      in, out, lane, value_bit_width, start_index, lambda);
+}
+
+// FFOR decoding: unpack and add the frame-of-reference base.
+template <typename T, UnpackingType unpacking_type, int UNPACK_N_VECTORS,
+          int UNPACK_N_VALUES>
+__device__ void
+unffor_vector(const T *__restrict in, T *__restrict out, const uint16_t lane,
+              const uint16_t value_bit_width, const uint16_t start_index,
+              const T *__restrict a_base_p) {
+  T base = *a_base_p;
+  auto lambda = [base](const T value) -> T { return value + base; };
+  unpack_vector<T, T, unpacking_type, UNPACK_N_VECTORS, UNPACK_N_VALUES>(
+      in, out, lane, value_bit_width, start_index, lambda);
+}
+
+// Dictionary decoding: unpack, rebase, and look the code up in `dict`.
+template <typename T, UnpackingType unpacking_type, int UNPACK_N_VECTORS,
+          int UNPACK_N_VALUES, typename T_dict>
+__device__ void
+undict_vector(const T *__restrict in, T *__restrict out, const uint16_t lane,
+              const uint16_t value_bit_width, const uint16_t start_index,
+              const T *__restrict a_base_p, const T_dict *__restrict dict) {
+  T base = *a_base_p;
+  auto lambda = [base, dict](const T value) -> T { return dict[value + base]; };
+  unpack_vector<T, T, unpacking_type, UNPACK_N_VECTORS, UNPACK_N_VALUES>(
+      in, out, lane, value_bit_width, start_index, lambda);
+}
+
+#endif // FLS_CUH
diff --git a/gpu-kernels/gpu-utils.cuh b/gpu-kernels/gpu-utils.cuh
new file mode 100644
index 0000000..e20300d
--- /dev/null
+++ b/gpu-kernels/gpu-utils.cuh
@@ -0,0 +1,91 @@
+// Host-side CUDA helpers: error checking and an owning device-array wrapper.
+// FIX(review): the original directly included <__clang_cuda_runtime_wrapper.h>,
+// a clang-internal header that must not be included by user code; replaced
+// with the public CUDA runtime header. The other stripped include names are
+// reconstructed from usage.
+#include <cuda_runtime.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <utility>
+
+#ifndef GPU_UTILS_H
+#define GPU_UTILS_H
+
+// Abort with file/line context on any failing CUDA runtime call.
+#define CUDA_SAFE_CALL(call)                                                   \
+  do {                                                                         \
+    cudaError_t err = call;                                                    \
+    if (cudaSuccess != err) {                                                  \
+      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.", __FILE__,    \
+              __LINE__, cudaGetErrorString(err));                              \
+      exit(EXIT_FAILURE);                                                      \
+    }                                                                          \
+  } while (0)
+
+// Frees a device allocation (if any) and nulls the pointer.
+template <typename T> void free_device_pointer(T *&device_ptr) {
+  if (device_ptr != nullptr) {
+    CUDA_SAFE_CALL(cudaFree(device_ptr));
+  }
+  device_ptr = nullptr;
+}
+
+// Move-only RAII owner of a device allocation of `count` elements of T.
+template <typename T> class GPUArray {
+private:
+  size_t count;
+  T *device_ptr = nullptr;
+
+  void allocate() {
+    CUDA_SAFE_CALL(
+        cudaMalloc(reinterpret_cast<void **>(&device_ptr), count * sizeof(T)));
+  }
+
+public:
+  GPUArray(const size_t a_count) {
+    count = a_count;
+    allocate();
+  }
+
+  // Allocate and copy `count` elements from host memory.
+  GPUArray(const size_t a_count, const T *host_p) {
+    count = a_count;
+    allocate();
+    CUDA_SAFE_CALL(cudaMemcpy(device_ptr, host_p, count * sizeof(T),
+                              cudaMemcpyHostToDevice));
+  }
+
+  // Copy constructor
+  GPUArray(const GPUArray &) = delete;
+  // Assignment operator deleted
+  GPUArray &operator=(const GPUArray &) = delete;
+
+  // Move constructor
+  // FIX: also transfer `count`; the original left it uninitialized in the
+  // moved-to object, breaking copy_to_host() after a move.
+  GPUArray(GPUArray &&other) noexcept
+      : count(other.count), device_ptr(other.device_ptr) {
+    other.device_ptr = nullptr;
+    other.count = 0;
+  }
+
+  // Move assignment operator (FIX: transfer `count` as well).
+  GPUArray &operator=(GPUArray &&other) noexcept {
+    if (this != &other) {
+      free_device_pointer(device_ptr);
+      count = other.count;
+      device_ptr = other.device_ptr;
+      other.device_ptr = nullptr;
+      other.count = 0;
+    }
+    return *this;
+  }
+
+  ~GPUArray() {
+    free_device_pointer(device_ptr);
+  }
+
+  // Blocking device-to-host copy of all `count` elements.
+  void copy_to_host(T *host_p) {
+    CUDA_SAFE_CALL(cudaMemcpy(host_p, device_ptr, count * sizeof(T),
+                              cudaMemcpyDeviceToHost));
+  }
+
+  T *get() { return device_ptr; }
+
+  // Releases ownership; caller becomes responsible for cudaFree.
+  T *release() {
+    auto temp = device_ptr;
+    device_ptr = nullptr;
+    return temp;
+  }
+};
+
+#endif // GPU_UTILS_H
diff --git a/gpu-kernels/utils.hpp b/gpu-kernels/utils.hpp
new file mode 100644
index 0000000..9a3bd76
--- /dev/null
+++ b/gpu-kernels/utils.hpp
@@ -0,0 +1,62 @@
+// FastLanes layout helpers (lane counts, masks, same-width type mapping).
+// NOTE(review): include names and template parameter lists were stripped
+// from this patch; reconstructed from usage.
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "consts.hpp"
+
+#ifndef FASTLANES_UTILS_H
+#define FASTLANES_UTILS_H
+
+namespace utils { // internal functions
+
+// Width of T in bits.
+template <typename T> constexpr int32_t sizeof_in_bits() {
+  return sizeof(T) * 8;
+}
+
+// Mask with the lowest `count` bits set; saturates to all-ones when `count`
+// covers the whole word (avoids UB of shifting by the full width).
+template <typename T> constexpr T set_first_n_bits(const int32_t count) {
+  return (count < sizeof_in_bits<T>()
+              ? static_cast<T>((T{1} << int32_t{count}) - T{1})
+              : static_cast<T>(~T{0}));
+}
+
+template <typename T> constexpr int32_t get_lane_bitwidth() {
+  return sizeof_in_bits<T>();
+}
+
+// Number of FastLanes lanes of type T in one virtual register.
+template <typename T> constexpr int32_t get_n_lanes() {
+  return consts::REGISTER_WIDTH / get_lane_bitwidth<T>();
+}
+
+template <typename T> constexpr int32_t get_values_per_lane() {
+  return consts::VALUES_PER_VECTOR / get_n_lanes<T>();
+}
+
+// Size (in T words) of one bit-packed vector at the given value bit width.
+template <typename T>
+constexpr int32_t get_compressed_vector_size(int32_t value_bit_width) {
+  return (consts::VALUES_PER_VECTOR * value_bit_width) / sizeof_in_bits<T>();
+}
+
+constexpr size_t get_n_vecs_from_size(const size_t size) {
+  return (size + consts::VALUES_PER_VECTOR - 1) / consts::VALUES_PER_VECTOR;
+}
+
+// Signed integer type with the same width as T (2/4/8 bytes).
+// NOTE(review): the nested-conditional bodies below were garbled in the
+// patch; reconstructed from the residue — verify against the original.
+template <typename T>
+struct same_width_int {
+  using type = typename std::conditional<
+      sizeof(T) == 2, int16_t,
+      typename std::conditional<
+          sizeof(T) == 4, int32_t,
+          typename std::conditional<sizeof(T) == 8, int64_t,
+                                    void>::type>::type>::type;
+};
+
+// Unsigned integer type with the same width as T (2/4/8 bytes).
+template <typename T>
+struct same_width_uint {
+  using type = typename std::conditional<
+      sizeof(T) == 2, uint16_t,
+      typename std::conditional<
+          sizeof(T) == 4, uint32_t,
+          typename std::conditional<sizeof(T) == 8, uint64_t,
+                                    void>::type>::type>::type;
+};
+
+} // namespace utils
+
+#endif // FASTLANES_UTILS_H