From e213850e3b86f7158ca5e8830204030d52f2cb69 Mon Sep 17 00:00:00 2001
From: SvenHepkema <47423359+SvenHepkema@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:35:09 +0200
Subject: [PATCH 1/3] Added Work in Progress implementation of ALP on GPU

---
 gpu-kernels/alp-test-kernels-global.cuh | 125 ++++++++
 gpu-kernels/alp.cuh                     | 405 ++++++++++++++++++++++++
 gpu-kernels/fls.cuh                     | 128 ++++++++
 3 files changed, 658 insertions(+)
 create mode 100644 gpu-kernels/alp-test-kernels-global.cuh
 create mode 100644 gpu-kernels/alp.cuh
 create mode 100644 gpu-kernels/fls.cuh

diff --git a/gpu-kernels/alp-test-kernels-global.cuh b/gpu-kernels/alp-test-kernels-global.cuh
new file mode 100644
index 0000000..7604384
--- /dev/null
+++ b/gpu-kernels/alp-test-kernels-global.cuh
@@ -0,0 +1,125 @@
+#include "alp.cuh"
+
+#include "../common/utils.hpp"
+
+#ifndef ALP_TEST_KERNELS_GLOBAL_CUH
+#define ALP_TEST_KERNELS_GLOBAL_CUH
+
+namespace alp {
+namespace kernels {
+namespace global {
+namespace test {
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES>
+__global__ void decode_complete_alp_vector(T *out, AlpColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<T>();
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int16_t vector_index = threadIdx.x / N_LANES;
+  const int32_t block_index = blockIdx.x;
+
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+
+  out += (block_index * n_vectors_per_block + vector_index) *
+         consts::VALUES_PER_VECTOR;
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    unalp<UINT_T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+          UnpackingType::VectorArray>(out, data, block_index, lane, i);
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
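+
+// Illustrative launch sketch (assumes `d_out`, `col` and `n_vecs` are
+// placeholders for device-resident buffers and that the ALP constants have
+// already been loaded; see constant_memory::load_alp_constants in alp.cuh):
+//
+//   constexpr unsigned N_VECS = 1, N_VALS = 1;
+//   const uint32_t n_lanes = utils::get_n_lanes<float>();
+//   decode_complete_alp_vector<float, N_VECS, N_VALS>
+//       <<<n_vecs / N_VECS, n_lanes * N_VECS>>>(d_out, col);
+//   cudaDeviceSynchronize();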
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES>
+__global__ void decode_alp_vector_into_lane(T *out, AlpColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<T>();
+  constexpr uint32_t N_VALUES = UNPACK_N_VALUES * UNPACK_N_VECTORS;
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int32_t block_index = blockIdx.x;
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+  const int16_t vector_index =
+      block_index * n_vectors_per_block + (threadIdx.x / N_LANES);
+
+  T registers[N_VALUES];
+  out += vector_index * consts::VALUES_PER_VECTOR;
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    unalp<UINT_T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+          UnpackingType::LaneArray>(registers, data, block_index, lane, i);
+
+    for (int j = 0; j < UNPACK_N_VALUES; j++) {
+      out[lane + j * N_LANES] = registers[j];
+    }
+
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES>
+__global__ void decode_alp_vector_with_state(T *out, AlpColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<T>();
+  constexpr uint32_t N_VALUES = UNPACK_N_VALUES * UNPACK_N_VECTORS;
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int32_t block_index = blockIdx.x;
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+  const int16_t vector_index =
+      block_index * n_vectors_per_block + (threadIdx.x / N_LANES);
+
+  T registers[N_VALUES];
+  out += vector_index * consts::VALUES_PER_VECTOR;
+
+  auto iterator =
+      Unpacker<UINT_T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+               UnpackingType::LaneArray>(vector_index, lane, data);
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    iterator.unpack_next_into(registers);
+
+    for (int j = 0; j < UNPACK_N_VALUES; j++) {
+      out[lane + j * N_LANES] = registers[j];
+    }
+
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES>
+__global__ void decode_complete_alprd_vector(T *out, AlpRdColumn<T> data) {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+  constexpr uint8_t LANE_BIT_WIDTH = utils::sizeof_in_bits<T>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<T>();
+  constexpr uint32_t N_VALUES_IN_LANE = utils::get_values_per_lane<T>();
+
+  const int16_t lane = threadIdx.x % N_LANES;
+  const int16_t vector_index = threadIdx.x / N_LANES;
+  const int32_t block_index = blockIdx.x;
+
+  constexpr int32_t n_vectors_per_block = UNPACK_N_VECTORS;
+
+  out += (block_index * n_vectors_per_block + vector_index) *
+         consts::VALUES_PER_VECTOR;
+
+  for (int i = 0; i < N_VALUES_IN_LANE; i += UNPACK_N_VALUES) {
+    unalprd<UINT_T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+            UnpackingType::VectorArray>(out, data, block_index, lane, i);
+    out += UNPACK_N_VALUES * N_LANES;
+  }
+}
+
+} // namespace test
+} // namespace global
+} // namespace kernels
+} // namespace alp
+
+#endif // ALP_TEST_KERNELS_GLOBAL_CUH
diff --git a/gpu-kernels/alp.cuh b/gpu-kernels/alp.cuh
new file mode 100644
index 0000000..f6e370d
--- /dev/null
+++ b/gpu-kernels/alp.cuh
@@ -0,0 +1,405 @@
+#include <cstdint>
+#include <cuda_runtime.h>
+#include <type_traits>
+
+#include "../alp/constants.hpp"
+#include "../common/utils.hpp"
+#include "../gpu-fls/fls.cuh"
+#include "src/alp/config.hpp"
+
+#ifndef ALP_CUH
+#define ALP_CUH
+
+template <typename T> struct AlpColumn {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+
+  UINT_T *ffor_array;
+  UINT_T *ffor_bases;
+  uint8_t *bit_widths;
+  uint8_t *exponents;
+  uint8_t *factors;
+
+  T *exceptions;
+  uint16_t *positions;
+  uint16_t *counts;
+};
+
+template <typename T> struct AlpRdColumn {
+  using UINT_T = typename utils::same_width_uint<T>::type;
+
+  uint16_t *left_ffor_array;
+  uint16_t *left_ffor_bases;
+  uint8_t *left_bit_widths;
+
+  UINT_T *right_ffor_array;
+  UINT_T *right_ffor_bases;
+  uint8_t *right_bit_widths;
+
+  uint16_t *left_parts_dicts;
+
+  uint16_t *exceptions;
+  uint16_t *positions;
+  uint16_t *counts;
+};
+
+namespace constant_memory {
+constexpr int32_t F_FACT_ARR_COUNT = 10;
+constexpr int32_t F_FRAC_ARR_COUNT = 11;
+__constant__ int32_t F_FACT_ARRAY[F_FACT_ARR_COUNT];
+__constant__ float F_FRAC_ARRAY[F_FRAC_ARR_COUNT];
+
+constexpr int32_t D_FACT_ARR_COUNT = 19;
+constexpr int32_t D_FRAC_ARR_COUNT = 21;
+__constant__ int64_t D_FACT_ARRAY[D_FACT_ARR_COUNT];
+__constant__ double D_FRAC_ARRAY[D_FRAC_ARR_COUNT];
+
+template <typename T> __host__ void load_alp_constants() {
+  cudaMemcpyToSymbol(F_FACT_ARRAY, alp::Constants<float>::FACT_ARR,
+                     F_FACT_ARR_COUNT * sizeof(int32_t));
+  cudaMemcpyToSymbol(F_FRAC_ARRAY, alp::Constants<float>::FRAC_ARR,
+                     F_FRAC_ARR_COUNT * sizeof(float));
+
+  cudaMemcpyToSymbol(D_FACT_ARRAY, alp::Constants<double>::FACT_ARR,
+                     D_FACT_ARR_COUNT * sizeof(int64_t));
+  cudaMemcpyToSymbol(D_FRAC_ARRAY, alp::Constants<double>::FRAC_ARR,
+                     D_FRAC_ARR_COUNT * sizeof(double));
+}
+
+template <typename T> __device__ __forceinline__ T *get_frac_arr();
+template <> __device__ __forceinline__ float *get_frac_arr<float>() {
+  return F_FRAC_ARRAY;
+}
+template <> __device__ __forceinline__ double *get_frac_arr<double>() {
+  return D_FRAC_ARRAY;
+}
+
+template <typename T> __device__ __forceinline__ T *get_fact_arr();
+template <> __device__ __forceinline__ int32_t *get_fact_arr<int32_t>() {
+  return F_FACT_ARRAY;
+}
+template <> __device__ __forceinline__ int64_t *get_fact_arr<int64_t>() {
+  return D_FACT_ARRAY;
+}
+} // namespace constant_memory
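+
+// Illustrative host-side setup: the decoders below read the factor and
+// fraction tables from constant memory, so they must be copied to the device
+// once before any decode kernel is launched, e.g.:
+//
+//   constant_memory::load_alp_constants<double>();
+//   // ... launch decode kernels ...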
+
+// WARNING
+// WARNING
+// TODO WARNING IS IT NOT FASTER TO PASS THESE ARGUMENTS IN FULL WIDTH?
+
+// SO uint8_t -> uint32_t (if it gets multiplied with 32) This saves a cast
+// in each kernel, and we do not care how big parameters are, as they are
+// passed via const
+// INFO Hypothesis: not stalling on arithmetic, so it does not matter in
+// execution time. Check # executed instructions tho.
+// WARNING
+// WARNING
+template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
+__device__ void unalp(T_out *__restrict out, const AlpColumn<T_out> column,
+                      const uint16_t vector_index, const uint16_t lane,
+                      const uint16_t start_index) {
+  static_assert((std::is_same<T_in, uint32_t>::value &&
+                 std::is_same<T_out, float>::value) ||
+                    (std::is_same<T_in, uint64_t>::value &&
+                     std::is_same<T_out, double>::value),
+                "Wrong type arguments");
+  using INT_T = typename utils::same_width_int<T_out>::type;
+  using UINT_T = typename utils::same_width_uint<T_out>::type;
+
+  T_in *in = column.ffor_array + consts::VALUES_PER_VECTOR * vector_index;
+  uint16_t value_bit_width = column.bit_widths[vector_index];
+  UINT_T base = column.ffor_bases[vector_index];
+  INT_T factor =
+      constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
+  T_out frac10 = constant_memory::get_frac_arr<
+      T_out>()[column.exponents[vector_index]]; // WARNING TODO implement a
+                                                // compile time switch to grab
+                                                // float array
+  auto lambda = [base, factor, frac10](const T_in value) -> T_out {
+    return static_cast<T_out>(static_cast<INT_T>(
+               (value + base) * static_cast<T_in>(factor))) *
+           frac10;
+  };
+
+  unpack_vector<T_in, T_out, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+                unpacking_type>(in, out, lane, value_bit_width, start_index,
+                                lambda);
+
+  // Patch exceptions
+  constexpr auto N_LANES = utils::get_n_lanes<T_in>();
+  auto exceptions_count = column.counts[vector_index];
+
+  auto vec_exceptions =
+      column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
+  auto vec_exceptions_positions =
+      column.positions + consts::VALUES_PER_VECTOR * vector_index;
+
+  const int first_pos = start_index * N_LANES + lane;
+  const int last_pos = first_pos + N_LANES * (UNPACK_N_VALUES - 1);
+  if (unpacking_type == UnpackingType::VectorArray) {
+    for (int i{lane}; i < exceptions_count; i += N_LANES) {
+      // WARNING Currently assumes that you are decoding an entire vector
+      // TODO Implement an if (position > startindex && position < (start_index
+      // + UNPACK_N_VALUES * n_lanes) {...}
+      auto position = vec_exceptions_positions[i];
+      out[position] = vec_exceptions[i];
+    }
+  } else if (unpacking_type == UnpackingType::LaneArray) {
+    for (int i{0}; i < exceptions_count; i++) {
+      auto position = vec_exceptions_positions[i];
+      auto exception = vec_exceptions[i];
+      if (position >= first_pos) {
+        if (position <= last_pos && position % N_LANES == lane) {
+          out[(position - first_pos) / N_LANES] = exception;
+        }
+        if (position + 1 > last_pos) {
+          return;
+        }
+      }
+    }
+  }
+}
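+
+// Worked example of the decode step above (illustrative numbers): with
+// exponent e = 2 and factor f = 1, frac10 = 0.01 and factor = 10, so a
+// bit-unpacked value v with FFOR base b decodes to
+//
+//   double(int64_t((v + b) * 10)) * 0.01
+//
+// i.e. the stored integer digits are scaled back to 10^-2 precision.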
+
+template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
+__device__ void
+unalp_with_scanner(T_out *__restrict out, const AlpColumn<T_out> column,
+                   const uint16_t vector_index, const uint16_t lane,
+                   const uint16_t start_index) {
+  static_assert((std::is_same<T_in, uint32_t>::value &&
+                 std::is_same<T_out, float>::value) ||
+                    (std::is_same<T_in, uint64_t>::value &&
+                     std::is_same<T_out, double>::value),
+                "Wrong type arguments");
+  using INT_T = typename utils::same_width_int<T_out>::type;
+  using UINT_T = typename utils::same_width_uint<T_out>::type;
+
+  T_in *in = column.ffor_array + consts::VALUES_PER_VECTOR * vector_index;
+  uint16_t value_bit_width = column.bit_widths[vector_index];
+  UINT_T base = column.ffor_bases[vector_index];
+  INT_T factor =
+      constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
+  T_out frac10 = constant_memory::get_frac_arr<
+      T_out>()[column.exponents[vector_index]]; // WARNING TODO implement a
+                                                // compile time switch to grab
+                                                // float array
+  auto lambda = [base, factor, frac10](const T_in value) -> T_out {
+    return static_cast<T_out>(static_cast<INT_T>(
+               (value + base) * static_cast<T_in>(factor))) *
+           frac10;
+  };
+
+  unpack_vector<T_in, T_out, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+                unpacking_type>(in, out, lane, value_bit_width, start_index,
+                                lambda);
+
+  // Patch exceptions
+  constexpr auto N_LANES = utils::get_n_lanes<T_in>();
+  auto exceptions_count = column.counts[vector_index];
+
+  auto vec_exceptions =
+      column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
+  auto vec_exceptions_positions =
+      column.positions + consts::VALUES_PER_VECTOR * vector_index;
+
+  const int first_pos = start_index * N_LANES + lane;
+  const int last_pos = first_pos + N_LANES * (UNPACK_N_VALUES - 1);
+  if (unpacking_type == UnpackingType::VectorArray) {
+    for (int i{lane}; i < exceptions_count; i += N_LANES) {
+      // WARNING Currently assumes that you are decoding an entire vector
+      // TODO Implement an if (position > startindex && position < (start_index
+      // + UNPACK_N_VALUES * n_lanes) {...}
+      auto position = vec_exceptions_positions[i];
+      out[position] = vec_exceptions[i];
+    }
+  } else if (unpacking_type == UnpackingType::LaneArray) {
+    constexpr int32_t SCANNER_SIZE = 1;
+    uint16_t scanner[SCANNER_SIZE];
+
+    for (int i{0}; i < exceptions_count; i += SCANNER_SIZE) {
+
+      for (int j{0}; j < SCANNER_SIZE && j + i < exceptions_count; ++j) {
+        scanner[j] = vec_exceptions_positions[j + i];
+      }
+
+      for (int j{0}; j < SCANNER_SIZE && j + i < exceptions_count; ++j) {
+        auto position = scanner[j];
+        if (position >= first_pos) {
+          if (position <= last_pos && position % N_LANES == lane) {
+            out[(position - first_pos) / N_LANES] = vec_exceptions[j + i];
+          }
+          if (position + 1 > last_pos) {
+            return;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
+
+struct Unpacker {
+  const int16_t vector_index;
+  const uint16_t lane;
+  const AlpColumn<T_out> column;
+  int32_t start_index = 0;
+  int32_t exception_index = 0;
+
+  __device__ Unpacker(const uint16_t vector_index, const uint16_t lane,
+                      const AlpColumn<T_out> column)
+      : vector_index(vector_index), lane(lane), column(column) {}
+
+  __device__ void unpack_next_into(T_out *__restrict out) {
+    static_assert((std::is_same<T_in, uint32_t>::value &&
+                   std::is_same<T_out, float>::value) ||
+                      (std::is_same<T_in, uint64_t>::value &&
+                       std::is_same<T_out, double>::value),
+                  "Wrong type arguments");
+    using INT_T = typename utils::same_width_int<T_out>::type;
+    using UINT_T = typename utils::same_width_uint<T_out>::type;
+
+    T_in *in = column.ffor_array + consts::VALUES_PER_VECTOR * vector_index;
+    uint16_t value_bit_width = column.bit_widths[vector_index];
+    UINT_T base = column.ffor_bases[vector_index];
+    INT_T factor =
+        constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
+    T_out frac10 = constant_memory::get_frac_arr<
+        T_out>()[column.exponents[vector_index]];
+    auto lambda = [base, factor, frac10](const T_in value) -> T_out {
+      return static_cast<T_out>(static_cast<INT_T>(
+                 (value + base) * static_cast<T_in>(factor))) *
+             frac10;
+    };
+
+    unpack_vector<T_in, T_out, UNPACK_N_VECTORS, UNPACK_N_VALUES,
+                  unpacking_type>(in, out, lane, value_bit_width, start_index,
+                                  lambda);
+
+    // Patch exceptions
+    constexpr auto N_LANES = utils::get_n_lanes<T_in>();
+    auto exceptions_count = column.counts[vector_index];
+
+    auto vec_exceptions =
+        column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
+    auto vec_exceptions_positions =
+        column.positions + consts::VALUES_PER_VECTOR * vector_index;
+
+    const int first_pos = start_index * N_LANES + lane;
+    const int last_pos = first_pos + N_LANES * (UNPACK_N_VALUES - 1);
+    start_index += UNPACK_N_VALUES;
+    if (unpacking_type == UnpackingType::VectorArray) {
+      for (int i{lane}; i < exceptions_count; i += N_LANES) {
+        auto position = vec_exceptions_positions[i];
+        out[position] = vec_exceptions[i];
+      }
+    } else if (unpacking_type == UnpackingType::LaneArray) {
+      for (; exception_index < exceptions_count; exception_index++) {
+        auto position = vec_exceptions_positions[exception_index];
+        auto exception = vec_exceptions[exception_index];
+        if (position >= first_pos) {
+          if (position <= last_pos && position % N_LANES == lane) {
+            out[(position - first_pos) / N_LANES] = exception;
+          }
+          if (position + 1 > last_pos) {
+            return;
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
+__device__ void unalprd(T_out *__restrict out, const AlpRdColumn<T_out> column,
+                        const uint16_t vector_index, const uint16_t lane,
+                        const uint16_t start_index) {
+  static_assert((std::is_same<T_in, uint32_t>::value &&
+                 std::is_same<T_out, float>::value) ||
+                    (std::is_same<T_in, uint64_t>::value &&
+                     std::is_same<T_out, double>::value),
+                "Wrong type arguments");
+  using INT_T = typename utils::same_width_int<T_out>::type;
+  using UINT_T = typename utils::same_width_uint<T_out>::type;
+
+  constexpr int32_t N_LANES = utils::get_n_lanes<T_out>();
+  constexpr int32_t VALUES_PER_LANE = utils::get_values_per_lane<T_out>();
+
+  UINT_T *out_ut = reinterpret_cast<UINT_T *>(out);
+
+  // Loading left parts dict
+  // TODO Let the threads collaborate to load this data
+  const uint16_t *left_parts_dicts_p =
+      column.left_parts_dicts +
+      vector_index * alp::config::MAX_RD_DICTIONARY_SIZE;
+  uint16_t left_parts_dict[alp::config::MAX_RD_DICTIONARY_SIZE];
+  for (size_t j{0}; j < alp::config::MAX_RD_DICTIONARY_SIZE; j++) {
+    left_parts_dict[j] = left_parts_dicts_p[j];
+  }
+
+  // Unfforring Arrays
+  // INFO The paper says something about fusing, consider using a custom lambda
+  // TODO Do thread local array instead of shared
+  __shared__ uint16_t left_array[consts::VALUES_PER_VECTOR];
+  const uint16_t *left_ffor_array =
+      column.left_ffor_array + vector_index * consts::VALUES_PER_VECTOR;
+  const uint16_t left_bitwidth = column.left_bit_widths[vector_index];
+  const uint16_t left_base = column.left_ffor_bases[vector_index];
+
+  // WARNING This mapping from alprd lane to uint16_t lane should be independent
+  // of block sizes
+  for (int i{lane}; i < utils::get_n_lanes<uint16_t>(); i += N_LANES) {
+    undict_vector<uint16_t, uint16_t, 1,
+                  utils::get_values_per_lane<uint16_t>(),
+                  UnpackingType::VectorArray>(
+        left_ffor_array, left_array, i, left_bitwidth, 0, &left_base,
+        left_parts_dict);
+  }
+
+  // TODO Do thread local array instead of shared
+  __shared__ UINT_T right_array[consts::VALUES_PER_VECTOR];
+  const UINT_T *right_ffor_array =
+      column.right_ffor_array + vector_index * consts::VALUES_PER_VECTOR;
+  const uint16_t right_bitwidth = column.right_bit_widths[vector_index];
+  const UINT_T right_base = column.right_ffor_bases[vector_index];
+  unffor_vector<UINT_T, 1, VALUES_PER_LANE, UnpackingType::VectorArray>(
+      right_ffor_array, right_array, lane, right_bitwidth, 0, &right_base);
+
+  // Decoding
+#pragma unroll
+  for (int i{lane}; i < consts::VALUES_PER_VECTOR; i += N_LANES) {
+    // WARNING THIS SHOULD NOT WRITE TO GLOBAL
+    // TODO write to thread local, and then patch all thread local values
+    // INFO THIS IS ALSO an issue in the normal ALP kernel
+    out_ut[i] =
+        (static_cast<UINT_T>(left_array[i]) << right_bitwidth) | right_array[i];
+  }
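+
+  // Bit-layout sketch (illustrative): ALP-RD splits each 64-bit pattern into
+  //
+  //   | left part (dict-encoded, high bits) | right part (right_bitwidth bits) |
+  //
+  // so a value is reassembled as left << right_bitwidth | right; e.g. with
+  // right_bitwidth = 48, the left part holds the top 16 bits.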
+
+  // Patching exceptions
+  const uint16_t exceptions_count = column.counts[vector_index];
+  const uint16_t *vec_exceptions =
+      column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
+  const uint16_t *vec_exceptions_positions =
+      column.positions + consts::VALUES_PER_VECTOR * vector_index;
+
+  if (unpacking_type == UnpackingType::VectorArray) {
+    for (int i{lane}; i < exceptions_count; i += N_LANES) {
+      const auto position = vec_exceptions_positions[i];
+      const UINT_T right = right_array[position];
+      const uint16_t left = vec_exceptions[i];
+      out_ut[position] = (static_cast<UINT_T>(left) << right_bitwidth) | right;
+    }
+  } else if (unpacking_type == UnpackingType::LaneArray) {
+    for (int i{0}; i < exceptions_count; ++i) {
+      const auto position = vec_exceptions_positions[i];
+
+      if (position % N_LANES == lane) {
+        const UINT_T right = right_array[position];
+        const uint16_t left = vec_exceptions[i];
+        out_ut[position / N_LANES] =
+            (static_cast<UINT_T>(left) << right_bitwidth) | right;
+      }
+    }
+  }
+}
+
+#endif // ALP_CUH
diff --git a/gpu-kernels/fls.cuh b/gpu-kernels/fls.cuh
new file mode 100644
index 0000000..17893d8
--- /dev/null
+++ b/gpu-kernels/fls.cuh
@@ -0,0 +1,128 @@
+#include <cstdint>
+#include <cstdio>
+#include <cuda_runtime.h>
+#include <type_traits>
+
+#include "../common/utils.hpp"
+
+#ifndef FLS_CUH
+#define FLS_CUH
+
+enum UnpackingType { LaneArray, VectorArray };
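+
+// Illustrative contrast of the two output layouts: with N_LANES = 32 and
+// UNPACK_N_VALUES = 2, the thread on lane 5 writes
+//
+//   VectorArray: out[5], out[37]  (value i lands at index lane + i * N_LANES)
+//   LaneArray:   out[0], out[1]   (a thread-private array of the lane's values)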
+
+template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type,
+          typename lambda_T>
+__device__ void unpack_vector(const T_in *__restrict in, T_out *__restrict out,
+                              const uint16_t lane,
+                              const uint16_t value_bit_width,
+                              const uint16_t start_index, lambda_T lambda) {
+  static_assert(std::is_unsigned<T_in>::value,
+                "Packing function only supports unsigned types. Cast signed "
+                "arrays to unsigned equivalent.");
+  constexpr uint8_t LANE_BIT_WIDTH = utils::get_lane_bitwidth<T_in>();
+  constexpr uint32_t N_LANES = utils::get_n_lanes<T_in>();
+  uint16_t preceding_bits = (start_index * value_bit_width);
+  uint16_t buffer_offset = preceding_bits % LANE_BIT_WIDTH;
+  uint16_t n_input_line = preceding_bits / LANE_BIT_WIDTH;
+  T_in value_mask = utils::set_first_n_bits<T_in>(value_bit_width);
+
+  T_in line_buffer[UNPACK_N_VECTORS];
+  T_in buffer_offset_mask;
+
+  int32_t encoded_vector_offset =
+      utils::get_compressed_vector_size<T_in>(value_bit_width);
+
+  in += lane;
+
+#pragma unroll
+  for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+    line_buffer[v] = *(in + n_input_line * N_LANES + v * encoded_vector_offset);
+  }
+  out += unpacking_type == UnpackingType::VectorArray ? lane : 0;
+  n_input_line++;
+
+  T_in value[UNPACK_N_VECTORS];
+
+#pragma unroll
+  for (int i = 0; i < UNPACK_N_VALUES; ++i) {
+    bool line_buffer_is_empty = buffer_offset == LANE_BIT_WIDTH;
+    if (line_buffer_is_empty) {
+#pragma unroll
+      for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+        line_buffer[v] =
+            *(in + n_input_line * N_LANES + v * encoded_vector_offset);
+      }
+      ++n_input_line;
+      buffer_offset -= LANE_BIT_WIDTH;
+    }
+
+#pragma unroll
+    for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+      value[v] =
+          (line_buffer[v] & (value_mask << buffer_offset)) >> buffer_offset;
+    }
+    buffer_offset += value_bit_width;
+
+    bool value_continues_on_next_line = buffer_offset > LANE_BIT_WIDTH;
+    if (value_continues_on_next_line) {
+#pragma unroll
+      for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+        line_buffer[v] =
+            *(in + n_input_line * N_LANES + v * encoded_vector_offset);
+      }
+      ++n_input_line;
+      buffer_offset -= LANE_BIT_WIDTH;
+
+      buffer_offset_mask =
+          (T_in{1} << static_cast<T_in>(buffer_offset)) - T_in{1};
+#pragma unroll
+      for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+        value[v] |= (line_buffer[v] & buffer_offset_mask)
+                    << (value_bit_width - buffer_offset);
+      }
+    }
+
+#pragma unroll
+    for (int v = 0; v < UNPACK_N_VECTORS; ++v) {
+      *(out + v * UNPACK_N_VALUES) = lambda(value[v]);
+    }
+    out += unpacking_type == UnpackingType::VectorArray ? N_LANES : 1;
+  }
+}
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES,
+          UnpackingType unpacking_type>
+__device__ void
+bitunpack_vector(const T *__restrict in, T *__restrict out, const uint16_t lane,
+                 const uint16_t value_bit_width, const uint16_t start_index) {
+  auto lambda = [=](const T value) -> T { return value; };
+  unpack_vector<T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES, unpacking_type>(
+      in, out, lane, value_bit_width, start_index, lambda);
+}
+
+template <typename T, unsigned UNPACK_N_VECTORS, unsigned UNPACK_N_VALUES,
+          UnpackingType unpacking_type>
+__device__ void
+unffor_vector(const T *__restrict in, T *__restrict out, const uint16_t lane,
+              const uint16_t value_bit_width, const uint16_t start_index,
+              const T *__restrict a_base_p) {
+  T base = *a_base_p;
+  auto lambda = [base](const T value) -> T { return value + base; };
+  unpack_vector<T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES, unpacking_type>(
+      in, out, lane, value_bit_width, start_index, lambda);
+}
+
+template <typename T, typename T_dict, unsigned UNPACK_N_VECTORS,
+          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
+__device__ void
+undict_vector(const T *__restrict in, T *__restrict out, const uint16_t lane,
+              const uint16_t value_bit_width, const uint16_t start_index,
+              const T *__restrict a_base_p, const T_dict *__restrict dict) {
+  T base = *a_base_p;
+  auto lambda = [base, dict](const T value) -> T { return dict[value + base]; };
+  unpack_vector<T, T, UNPACK_N_VECTORS, UNPACK_N_VALUES, unpacking_type>(
+      in, out, lane, value_bit_width, start_index, lambda);
+}
+
+#endif // FLS_CUH
From 477278382147ace35b1c8d0ebe84b885bcd2a947 Mon Sep 17 00:00:00 2001
From: SvenHepkema <47423359+SvenHepkema@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:41:14 +0200
Subject: [PATCH 2/3] Added more files and fixed references

---
 gpu-kernels/alp-test-kernels-global.cuh |  2 +-
 gpu-kernels/alp.cuh                     |  8 +--
 gpu-kernels/consts.hpp                  | 28 ++++++++
 gpu-kernels/fls.cuh                     |  2 +-
 gpu-kernels/gpu-utils.cuh               | 91 +++++++++++++++++++++++
 gpu-kernels/utils.hpp                   | 62 +++++++++++++++++
 6 files changed, 187 insertions(+), 6 deletions(-)
 create mode 100644 gpu-kernels/consts.hpp
 create mode 100644 gpu-kernels/gpu-utils.cuh
 create mode 100644 gpu-kernels/utils.hpp

diff --git a/gpu-kernels/alp-test-kernels-global.cuh b/gpu-kernels/alp-test-kernels-global.cuh
index 7604384..d80d7d4 100644
--- a/gpu-kernels/alp-test-kernels-global.cuh
+++ b/gpu-kernels/alp-test-kernels-global.cuh
@@ -1,6 +1,6 @@
 #include "alp.cuh"
 
-#include "../common/utils.hpp"
+#include "utils.hpp"
 
 #ifndef ALP_TEST_KERNELS_GLOBAL_CUH
 #define ALP_TEST_KERNELS_GLOBAL_CUH
diff --git a/gpu-kernels/alp.cuh b/gpu-kernels/alp.cuh
index f6e370d..dc5bccc 100644
--- a/gpu-kernels/alp.cuh
+++ b/gpu-kernels/alp.cuh
@@ -2,10 +2,10 @@
 #include <cuda_runtime.h>
 #include <type_traits>
 
-#include "../alp/constants.hpp"
-#include "../common/utils.hpp"
-#include "../gpu-fls/fls.cuh"
-#include "src/alp/config.hpp"
+#include "../alp/include/constants.hpp"
+#include "utils.hpp"
+#include "fls.cuh"
+#include "../alp/include/config.hpp"
 
 #ifndef ALP_CUH
 #define ALP_CUH
diff --git a/gpu-kernels/consts.hpp b/gpu-kernels/consts.hpp
new file mode 100644
index 0000000..04e081c
--- /dev/null
+++ b/gpu-kernels/consts.hpp
@@ -0,0 +1,28 @@
+#include <cstdint>
+
+#ifndef CONSTS_H
+#define CONSTS_H
+
+namespace consts {
+
+constexpr int32_t REGISTER_WIDTH = 1024;
+constexpr int32_t VALUES_PER_VECTOR = 1024;
+constexpr int32_t THREADS_PER_WARP = 32;
+
+template <typename T>
+struct as {
+  static inline constexpr T MAGIC_NUMBER = 0;
+};
+
+template <>
+struct as<float> {
+  static inline constexpr float MAGIC_NUMBER = 0.3214f;
+};
+
+template <>
+struct as<double> {
+  static inline constexpr double MAGIC_NUMBER = 0.3214;
+};
+} // namespace consts
+
+#endif // CONSTS_H
diff --git a/gpu-kernels/fls.cuh b/gpu-kernels/fls.cuh
index 17893d8..b18af8b 100644
--- a/gpu-kernels/fls.cuh
+++ b/gpu-kernels/fls.cuh
@@ -3,7 +3,7 @@
 #include <cuda_runtime.h>
 #include <type_traits>
 
-#include "../common/utils.hpp"
+#include "utils.hpp"
 
 #ifndef FLS_CUH
 #define FLS_CUH
diff --git a/gpu-kernels/gpu-utils.cuh b/gpu-kernels/gpu-utils.cuh
new file mode 100644
index 0000000..e20300d
--- /dev/null
+++ b/gpu-kernels/gpu-utils.cuh
@@ -0,0 +1,91 @@
+#include <__clang_cuda_runtime_wrapper.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+#ifndef GPU_UTILS_H
+#define GPU_UTILS_H
+
+#define CUDA_SAFE_CALL(call)                                                   \
+  do {                                                                         \
+    cudaError_t err = call;                                                    \
+    if (cudaSuccess != err) {                                                  \
+      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.", __FILE__,    \
+              __LINE__, cudaGetErrorString(err));                              \
+      exit(EXIT_FAILURE);                                                      \
+    }                                                                          \
+  } while (0)
+
+template <typename T> void free_device_pointer(T *&device_ptr) {
+  if (device_ptr != nullptr) {
+    CUDA_SAFE_CALL(cudaFree(device_ptr));
+  }
+  device_ptr = nullptr;
+}
+
+template <typename T> class GPUArray {
+private:
+  size_t count;
+  T *device_ptr = nullptr;
+
+  void allocate() {
+    CUDA_SAFE_CALL(
+        cudaMalloc(reinterpret_cast<void **>(&device_ptr), count * sizeof(T)));
+  }
+
+public:
+  GPUArray(const size_t a_count) {
+    count = a_count;
+    allocate();
+  }
+
+  GPUArray(const size_t a_count, const T *host_p) {
+    count = a_count;
+    allocate();
+    CUDA_SAFE_CALL(cudaMemcpy(device_ptr, host_p, count * sizeof(T),
+                              cudaMemcpyHostToDevice));
+  }
+
+  // Copy constructor deleted
+  GPUArray(const GPUArray &) = delete;
+  // Copy assignment operator deleted
+  GPUArray &operator=(const GPUArray &) = delete;
+
+  // Move constructor (must also take over the element count)
+  GPUArray(GPUArray &&other) noexcept
+      : count(other.count), device_ptr(other.device_ptr) {
+    other.device_ptr = nullptr;
+  }
+
+  // Move assignment operator
+  GPUArray &operator=(GPUArray &&other) noexcept {
+    if (this != &other) {
+      free_device_pointer(device_ptr);
+      count = other.count;
+      device_ptr = other.device_ptr;
+      other.device_ptr = nullptr;
+    }
+    return *this;
+  }
+
+  ~GPUArray() {
+    free_device_pointer(device_ptr);
+  }
+
+  void copy_to_host(T *host_p) {
+    CUDA_SAFE_CALL(cudaMemcpy(host_p, device_ptr, count * sizeof(T),
+                              cudaMemcpyDeviceToHost));
+  }
+
+  T *get() { return device_ptr; }
+
+  T *release() {
+    auto temp = device_ptr;
+    device_ptr = nullptr;
+    return temp;
+  }
+};
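+
+// Illustrative usage: RAII round-trip of a host buffer through device memory.
+//
+//   std::vector<float> host(1024, 1.0f);
+//   GPUArray<float> device(host.size(), host.data());
+//   // ... launch kernels on device.get() ...
+//   device.copy_to_host(host.data());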
+
+#endif // GPU_UTILS_H
diff --git a/gpu-kernels/utils.hpp b/gpu-kernels/utils.hpp
new file mode 100644
index 0000000..9a3bd76
--- /dev/null
+++ b/gpu-kernels/utils.hpp
@@ -0,0 +1,62 @@
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "consts.hpp"
+
+#ifndef FASTLANES_UTILS_H
+#define FASTLANES_UTILS_H
+
+namespace utils { // internal functions
+
+template <typename T> constexpr int32_t sizeof_in_bits() {
+  return sizeof(T) * 8;
+}
+
+template <typename T> constexpr T set_first_n_bits(const int32_t count) {
+  return (count < sizeof_in_bits<T>()
+              ? static_cast<T>((T{1} << int32_t{count}) - T{1})
+              : static_cast<T>(~T{0}));
+}
+
+template <typename T> constexpr int32_t get_lane_bitwidth() {
+  return sizeof_in_bits<T>();
+}
+
+template <typename T> constexpr int32_t get_n_lanes() {
+  return consts::REGISTER_WIDTH / get_lane_bitwidth<T>();
+}
+
+template <typename T> constexpr int32_t get_values_per_lane() {
+  return consts::VALUES_PER_VECTOR / get_n_lanes<T>();
+}
+
+template <typename T>
+constexpr int32_t get_compressed_vector_size(int32_t value_bit_width) {
+  return (consts::VALUES_PER_VECTOR * value_bit_width) / sizeof_in_bits<T>();
+}
+
+constexpr size_t get_n_vecs_from_size(const size_t size) {
+  return (size + consts::VALUES_PER_VECTOR - 1) / consts::VALUES_PER_VECTOR;
+}
+
+template <typename T>
+struct same_width_int {
+  using type = typename std::conditional<
+      sizeof(T) == 4, int32_t,
+      typename std::conditional<sizeof(T) == 8, int64_t,
+                                typename std::remove_cv<T>::type>::type>::type;
+};
+
+template <typename T>
+struct same_width_uint {
+  using type = typename std::conditional<
+      sizeof(T) == 4, uint32_t,
+      typename std::conditional<sizeof(T) == 8, uint64_t,
+                                typename std::remove_cv<T>::type>::type>::type;
+};
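+
+// Illustrative: these aliases pick the integer type matching a float type's
+// width, e.g. same_width_uint<double>::type is uint64_t; set_first_n_bits<
+// uint32_t>(3) yields 0b111, the value mask used by unpack_vector in fls.cuh.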
+
+} // namespace utils
+
+#endif // FASTLANES_UTILS_H
From 673e1ca35db5f9491e7b611de5a61d3f0f9dfafb Mon Sep 17 00:00:00 2001
From: SvenHepkema <47423359+SvenHepkema@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:47:35 +0200
Subject: [PATCH 3/3] Removed some outdated comments and a redundant kernel

---
 gpu-kernels/alp.cuh | 100 +-------------------------------------------
 1 file changed, 2 insertions(+), 98 deletions(-)

diff --git a/gpu-kernels/alp.cuh b/gpu-kernels/alp.cuh
index dc5bccc..8cc1748 100644
--- a/gpu-kernels/alp.cuh
+++ b/gpu-kernels/alp.cuh
@@ -82,17 +82,6 @@ template <> __device__ __forceinline__ int64_t *get_fact_arr<int64_t>() {
 }
 } // namespace constant_memory
 
-// WARNING
-// WARNING
-// TODO WARNING IS IT NOT FASTER TO PASS THESE ARGUMENTS IN FULL WIDTH?
-
-// SO uint8_t -> uint32_t (if it gets multiplied with 32) This saves a cast
-// in each kernel, and we do not care how big parameters are, as they are
-// passed via const
-// INFO Hypothesis: not stalling on arithmetic, so it does not matter in
-// execution time. Check # executed instructions tho.
-// WARNING
-// WARNING
 template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
           unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
 __device__ void unalp(T_out *__restrict out, const AlpColumn<T_out> column,
@@ -112,9 +101,8 @@ __device__ void unalp(T_out *__restrict out, const AlpColumn<T_out> column,
   INT_T factor =
       constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
   T_out frac10 = constant_memory::get_frac_arr<
-      T_out>()[column.exponents[vector_index]]; // WARNING TODO implement a
-                                                // compile time switch to grab
-                                                // float array
+      T_out>()[column.exponents[vector_index]];
+
   auto lambda = [base, factor, frac10](const T_in value) -> T_out {
     return static_cast<T_out>(static_cast<INT_T>(
                (value + base) * static_cast<T_in>(factor))) *
@@ -138,8 +126,6 @@ __device__ void unalp(T_out *__restrict out, const AlpColumn<T_out> column,
   if (unpacking_type == UnpackingType::VectorArray) {
     for (int i{lane}; i < exceptions_count; i += N_LANES) {
       // WARNING Currently assumes that you are decoding an entire vector
-      // TODO Implement an if (position > startindex && position < (start_index
-      // + UNPACK_N_VALUES * n_lanes) {...}
      auto position = vec_exceptions_positions[i];
       out[position] = vec_exceptions[i];
     }
@@ -161,83 +147,6 @@ __device__ void unalp(T_out *__restrict out, const AlpColumn<T_out> column,
 
 template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
           unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
-__device__ void
-unalp_with_scanner(T_out *__restrict out, const AlpColumn<T_out> column,
-                   const uint16_t vector_index, const uint16_t lane,
-                   const uint16_t start_index) {
-  static_assert((std::is_same<T_in, uint32_t>::value &&
-                 std::is_same<T_out, float>::value) ||
-                    (std::is_same<T_in, uint64_t>::value &&
-                     std::is_same<T_out, double>::value),
-                "Wrong type arguments");
-  using INT_T = typename utils::same_width_int<T_out>::type;
-  using UINT_T = typename utils::same_width_uint<T_out>::type;
-
-  T_in *in = column.ffor_array + consts::VALUES_PER_VECTOR * vector_index;
-  uint16_t value_bit_width = column.bit_widths[vector_index];
-  UINT_T base = column.ffor_bases[vector_index];
-  INT_T factor =
-      constant_memory::get_fact_arr<INT_T>()[column.factors[vector_index]];
-  T_out frac10 = constant_memory::get_frac_arr<
-      T_out>()[column.exponents[vector_index]]; // WARNING TODO implement a
-                                                // compile time switch to grab
-                                                // float array
-  auto lambda = [base, factor, frac10](const T_in value) -> T_out {
-    return static_cast<T_out>(static_cast<INT_T>(
-               (value + base) * static_cast<T_in>(factor))) *
-           frac10;
-  };
-
-  unpack_vector<T_in, T_out, UNPACK_N_VECTORS, UNPACK_N_VALUES,
-                unpacking_type>(in, out, lane, value_bit_width, start_index,
-                                lambda);
-
-  // Patch exceptions
-  constexpr auto N_LANES = utils::get_n_lanes<T_in>();
-  auto exceptions_count = column.counts[vector_index];
-
-  auto vec_exceptions =
-      column.exceptions + consts::VALUES_PER_VECTOR * vector_index;
-  auto vec_exceptions_positions =
-      column.positions + consts::VALUES_PER_VECTOR * vector_index;
-
-  const int first_pos = start_index * N_LANES + lane;
-  const int last_pos = first_pos + N_LANES * (UNPACK_N_VALUES - 1);
-  if (unpacking_type == UnpackingType::VectorArray) {
-    for (int i{lane}; i < exceptions_count; i += N_LANES) {
-      // WARNING Currently assumes that you are decoding an entire vector
-      // TODO Implement an if (position > startindex && position < (start_index
-      // + UNPACK_N_VALUES * n_lanes) {...}
-      auto position = vec_exceptions_positions[i];
-      out[position] = vec_exceptions[i];
-    }
-  } else if (unpacking_type == UnpackingType::LaneArray) {
-    constexpr int32_t SCANNER_SIZE = 1;
-    uint16_t scanner[SCANNER_SIZE];
-
-    for (int i{0}; i < exceptions_count; i += SCANNER_SIZE) {
-
-      for (int j{0}; j < SCANNER_SIZE && j + i < exceptions_count; ++j) {
-        scanner[j] = vec_exceptions_positions[j + i];
-      }
-
-      for (int j{0}; j < SCANNER_SIZE && j + i < exceptions_count; ++j) {
-        auto position = scanner[j];
-        if (position >= first_pos) {
-          if (position <= last_pos && position % N_LANES == lane) {
-            out[(position - first_pos) / N_LANES] = vec_exceptions[j + i];
-          }
-          if (position + 1 > last_pos) {
-            return;
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T_in, typename T_out, unsigned UNPACK_N_VECTORS,
-          unsigned UNPACK_N_VALUES, UnpackingType unpacking_type>
-
 struct Unpacker {
   const int16_t vector_index;
   const uint16_t lane;
@@ -328,7 +237,6 @@ __device__ void unalprd(T_out *__restrict out, const AlpRdColumn<T_out> column,
   UINT_T *out_ut = reinterpret_cast<UINT_T *>(out);
 
   // Loading left parts dict
-  // TODO Let the threads collaborate to load this data
   const uint16_t *left_parts_dicts_p =
       column.left_parts_dicts +
       vector_index * alp::config::MAX_RD_DICTIONARY_SIZE;
@@ -338,7 +246,6 @@ __device__ void unalprd(T_out *__restrict out, const AlpRdColumn<T_out> column,
   }
 
   // Unfforring Arrays
-  // INFO The paper says something about fusing, consider using a custom lambda
   // TODO Do thread local array instead of shared
   __shared__ uint16_t left_array[consts::VALUES_PER_VECTOR];
   const uint16_t *left_ffor_array =
@@ -367,9 +274,6 @@ __device__ void unalprd(T_out *__restrict out, const AlpRdColumn<T_out> column,
   // Decoding
 #pragma unroll
   for (int i{lane}; i < consts::VALUES_PER_VECTOR; i += N_LANES) {
-    // WARNING THIS SHOULD NOT WRITE TO GLOBAL
-    // TODO write to thread local, and then patch all thread local values
-    // INFO THIS IS ALSO an issue in the normal ALP kernel
     out_ut[i] =
         (static_cast<UINT_T>(left_array[i]) << right_bitwidth) | right_array[i];
   }