Skip to content

Commit 30042ee

Browse files
authored
data-parallel patched ALP standalone kernel (#7576)
## Summary Follow-up to #7440. This changes ALP execution on CUDA. Previously, we'd execute two kernel passes: one to perform ALP decoding to global memory, and a second to apply patches. Similar to the prior work on bit unpacking, this PR pushes patching into the decoding kernel. We assign a FastLanes 1024-element block to each warp (32 threads), and then perform decoding and patching in a single kernel pass. ## Testing Unit tests were added to check for simple and edge cases (multi-chunk, mix of chunks with/without patches). Signed-off-by: Andrew Duffy <andrew@a10y.dev>
1 parent 9e261cc commit 30042ee

4 files changed

Lines changed: 390 additions & 98 deletions

File tree

vortex-cuda/kernels/src/alp.cu

Lines changed: 84 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,92 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
#include "scalar_kernel.cuh"
5-
6-
// ALP (Adaptive Lossless floating-Point) decode operation.
7-
// Converts integers to floats by multiplying by precomputed exponent factors.
8-
// Formula: decoded = (float)encoded * f * e
9-
// Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
10-
template <typename EncodedT, typename FloatT>
11-
struct AlpOp {
12-
FloatT f; // F10[exponents.f] - power of 10
13-
FloatT e; // IF10[exponents.e] - inverse power of 10
14-
15-
__device__ inline FloatT operator()(EncodedT value) const {
16-
return static_cast<FloatT>(value) * f * e;
4+
#include "patches.cuh"
5+
6+
// ALP (Adaptive Lossless floating-Point) decode: out[i] = (FloatT)in[i] * f * e.
//
// Each block processes one 1024-element chunk cooperatively and applies patches
// into shared memory before writing to global memory, mirroring the strategy
// used by bit_unpack. f = F10[exponents.f], e = IF10[exponents.e].
//
// The cast from EncT to FloatT must preserve ALP's lossless contract: f32 is
// only encoded as i32, and f64 is only encoded as i64. The i64 → double cast
// is lossless for all values ALP can produce.
//
// Launch contract (see GENERATE_ALP_KERNEL): blockDim.x == 32 (one warp),
// one block per 1024-element chunk.
//
// Parameters:
//   in         - base of this block's chunk (full_in + blockIdx.x * 1024)
//   out        - base of this block's output chunk; always written in full
//                1024-element units (see Step 3 for why that is in bounds)
//   f, e       - precomputed decode factors; decoded = (FloatT)encoded * f * e
//   array_len  - total element count of the whole array, not just this chunk
//   thread_idx - lane id within the single warp, 0..31
//   patches    - exception values spliced over the decoded chunk in Step 2
template <typename EncT, typename FloatT>
__device__ void alp_device(const EncT *__restrict in,
                           FloatT *__restrict out,
                           FloatT f,
                           FloatT e,
                           uint64_t array_len,
                           int thread_idx,
                           GPUPatches &patches) {
    constexpr int ThreadCount = 32;
    // ThreadCount == 32 (one warp) is baked into this kernel:
    // - __syncwarp() below is only sufficient because all threads live in one warp.
    // - per_thread must evenly divide 1024 so the unrolled loops cover the chunk.
    static_assert(ThreadCount == 32, "alp kernel requires exactly one warp per block");
    static_assert(1024 % ThreadCount == 0, "ThreadCount must evenly divide 1024");
    __shared__ FloatT shared_out[1024];

    constexpr int per_thread = 1024 / ThreadCount;
    // 64-bit on purpose: blockIdx.x * 1024 would wrap in 32-bit arithmetic once
    // the array exceeds 2^32 elements, and array_len itself is 64-bit.
    uint64_t chunk_base = static_cast<uint64_t>(blockIdx.x) * 1024;

    // Step 1: decode the chunk into shared memory. The tail block is bounds-checked;
    // all interior blocks take the fast path with no per-element branch.
    if (chunk_base + 1024 <= array_len) {
#pragma unroll
        for (int i = 0; i < per_thread; i++) {
            // idx = i * 32 + lane: adjacent lanes touch adjacent elements, so each
            // warp-wide load/store of `in`/`shared_out` is contiguous.
            int idx = i * ThreadCount + thread_idx;
            shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
        }
    } else {
#pragma unroll
        for (int i = 0; i < per_thread; i++) {
            int idx = i * ThreadCount + thread_idx;
            uint64_t global_idx = chunk_base + static_cast<uint64_t>(idx);
            if (global_idx < array_len) {
                shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
            } else {
                // Zero-fill past the end so Step 3 never copies uninitialized
                // shared memory to global memory.
                shared_out[idx] = FloatT {};
            }
        }
    }
    // Make all decoded values visible across the warp before patching.
    __syncwarp();

    // Step 2: apply patches in parallel across the warp.
    // NOTE(review): relies on cursor.next() returning a patch with index == 1024
    // as the end-of-patches sentinel — confirm against PatchesCursor in patches.cuh.
    PatchesCursor<FloatT> cursor(patches, blockIdx.x, thread_idx, static_cast<uint32_t>(ThreadCount));
    auto patch = cursor.next();
    while (patch.index != 1024) {
        shared_out[patch.index] = patch.value;
        patch = cursor.next();
    }
    __syncwarp();

    // Step 3: coalesced write-out of the full 1024-element chunk. The caller
    // allocates `full_out` rounded up to a multiple of 1024, so every block
    // writes entirely within bounds. Positions in `[array_len, rounded_len)`
    // of the tail chunk hold don't-care values; the caller slices them off.
#pragma unroll
    for (int i = 0; i < per_thread; i++) {
        int idx = i * ThreadCount + thread_idx;
        out[idx] = shared_out[idx];
    }
}
75+
76+
// Generates one extern "C" kernel entry point, alp_<enc>_<float>_32t, per
// (encoded, float) type pair. Launch contract: blockDim.x == 32 (one warp per
// block) and one block per 1024-element FastLanes chunk; each block decodes
// and patches its chunk via alp_device.
//
// The chunk offset is computed in 64 bits before the pointer arithmetic:
// `blockIdx.x * 1024` evaluates in 32-bit unsigned arithmetic and would wrap
// for blockIdx.x >= 2^22 (arrays of 2^32+ elements), even though `array_len`
// itself is 64-bit. This matches the 64-bit `chunk_base` used inside
// alp_device.
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncT, FloatT)                                    \
    extern "C" __global__ void alp_##enc_suffix##_##float_suffix##_32t(const EncT *__restrict full_in, \
                                                                       FloatT *__restrict full_out,    \
                                                                       FloatT f,                       \
                                                                       FloatT e,                       \
                                                                       uint64_t array_len,             \
                                                                       GPUPatches patches) {           \
        int thread_idx = threadIdx.x;                                                                  \
        uint64_t chunk_base = static_cast<uint64_t>(blockIdx.x) * 1024;                                \
        auto in = full_in + chunk_base;                                                                \
        auto out = full_out + chunk_base;                                                              \
        alp_device<EncT, FloatT>(in, out, f, e, array_len, thread_idx, patches);                       \
    }
3488

// The only ALPInt bindings produced by the encoder are (f32, i32) and (f64, i64).
// i64 → double is lossless; i32 → float is lossless for all values ALP emits.
// Each invocation below expands GENERATE_ALP_KERNEL (defined above) into an
// extern "C" __global__ entry point named alp_<enc>_<float>_32t.
GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
GENERATE_ALP_KERNEL(i64, f64, int64_t, double)

0 commit comments

Comments
 (0)