Skip to content

Commit 9dc91b1

Browse files
committed
data-parallel patched ALP standalone kernel
Signed-off-by: Andrew Duffy <andrew@a10y.dev>
1 parent be2a14b commit 9dc91b1

4 files changed

Lines changed: 267 additions & 101 deletions

File tree

vortex-cuda/kernels/src/alp.cu

Lines changed: 68 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,75 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
#include "scalar_kernel.cuh"
5-
6-
// ALP (Adaptive Lossless floating-Point) decode operation.
7-
// Converts integers to floats by multiplying by precomputed exponent factors.
8-
// Formula: decoded = (float)encoded * f * e
9-
// Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
10-
template <typename EncodedT, typename FloatT>
11-
struct AlpOp {
12-
FloatT f; // F10[exponents.f] - power of 10
13-
FloatT e; // IF10[exponents.e] - inverse power of 10
14-
15-
__device__ inline FloatT operator()(EncodedT value) const {
16-
return static_cast<FloatT>(value) * f * e;
4+
#include "patches.cuh"
5+
6+
// ALP (Adaptive Lossless floating-Point) decode: out[i] = (FloatT)in[i] * f * e.
7+
//
8+
// Each block processes one 1024-element chunk cooperatively and applies patches
9+
// into shared memory before writing to global memory, mirroring the strategy
10+
// used by bit_unpack. f = F10[exponents.f], e = IF10[exponents.e].
11+
template <typename EncT, typename FloatT, int ThreadCount>
12+
__device__ void _alp_device(const EncT *__restrict in, FloatT *__restrict out, FloatT f,
13+
FloatT e, uint64_t array_len, int thread_idx, GPUPatches &patches) {
14+
__shared__ FloatT shared_out[1024];
15+
16+
constexpr int per_thread = 1024 / ThreadCount;
17+
uint64_t chunk_base = static_cast<uint64_t>(blockIdx.x) * 1024;
18+
19+
// Step 1: decode the chunk into shared memory. The tail block is bounds-checked;
20+
// all interior blocks take the fast path with no per-element branch.
21+
if (chunk_base + 1024 <= array_len) {
22+
#pragma unroll
23+
for (int i = 0; i < per_thread; i++) {
24+
int idx = i * ThreadCount + thread_idx;
25+
shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
26+
}
27+
} else {
28+
#pragma unroll
29+
for (int i = 0; i < per_thread; i++) {
30+
int idx = i * ThreadCount + thread_idx;
31+
uint64_t global_idx = chunk_base + static_cast<uint64_t>(idx);
32+
if (global_idx < array_len) {
33+
shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
34+
} else {
35+
shared_out[idx] = FloatT{};
36+
}
37+
}
38+
}
39+
__syncwarp();
40+
41+
// Step 2: apply patches in parallel across the warp.
42+
PatchesCursor<FloatT> cursor(patches, blockIdx.x, thread_idx, ThreadCount);
43+
auto patch = cursor.next();
44+
while (patch.index != 1024) {
45+
shared_out[patch.index] = patch.value;
46+
patch = cursor.next();
47+
}
48+
__syncwarp();
49+
50+
// Step 3: coalesced write-out. Slop past `array_len` in the tail chunk is
51+
// overwritten harmlessly; the caller slices the final buffer to `array_len`.
52+
#pragma unroll
53+
for (int i = 0; i < per_thread; i++) {
54+
int idx = i * ThreadCount + thread_idx;
55+
out[idx] = shared_out[idx];
1756
}
18-
};
19-
20-
// Macro to generate ALP kernel for each type combination.
21-
// Input is integer (encoded), output is float (decoded).
22-
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \
23-
extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded, \
24-
FloatType *__restrict decoded, \
25-
FloatType f, \
26-
FloatType e, \
27-
uint64_t array_len) { \
28-
scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType> {f, e}); \
57+
}
58+
59+
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncT, FloatT, THREAD_COUNT) \
60+
extern "C" __global__ void alp_##enc_suffix##_##float_suffix##_##THREAD_COUNT##t( \
61+
const EncT *__restrict full_in, FloatT *__restrict full_out, FloatT f, FloatT e, \
62+
uint64_t array_len, GPUPatches patches) { \
63+
int thread_idx = threadIdx.x; \
64+
auto in = full_in + (blockIdx.x * 1024); \
65+
auto out = full_out + (blockIdx.x * 1024); \
66+
_alp_device<EncT, FloatT, THREAD_COUNT>(in, out, f, e, array_len, thread_idx, patches); \
2967
}
3068

31-
// f32 variants (ALP for f32 encodes as i32 or i64)
32-
GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
33-
GENERATE_ALP_KERNEL(i64, f32, int64_t, float)
69+
// f32 decoded from i32 or i64 encoded — 32 threads per block (32 elements each).
70+
GENERATE_ALP_KERNEL(i32, f32, int32_t, float, 32)
71+
GENERATE_ALP_KERNEL(i64, f32, int64_t, float, 32)
3472

35-
// f64 variants (ALP for f64 encodes as i64)
36-
GENERATE_ALP_KERNEL(i64, f64, int64_t, double)
73+
// f64 decoded from i64 encoded — 16 threads per block (64 elements each) to match
74+
// the lane count bit_unpack uses for 64-bit output widths.
75+
GENERATE_ALP_KERNEL(i64, f64, int64_t, double, 16)

vortex-cuda/src/kernel/encodings/alp.rs

Lines changed: 138 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,18 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
use std::fmt::Debug;
5-
use std::sync::Arc;
65

76
use async_trait::async_trait;
87
use cudarc::driver::DeviceRepr;
8+
use cudarc::driver::LaunchConfig;
99
use cudarc::driver::PushKernelArg;
1010
use tracing::instrument;
1111
use vortex::array::ArrayRef;
1212
use vortex::array::Canonical;
1313
use vortex::array::arrays::PrimitiveArray;
1414
use vortex::array::arrays::primitive::PrimitiveDataParts;
1515
use vortex::array::buffer::BufferHandle;
16-
use vortex::array::match_each_unsigned_integer_ptype;
16+
use vortex::array::buffer::DeviceBufferExt;
1717
use vortex::dtype::NativePType;
1818
use vortex::encodings::alp::ALP;
1919
use vortex::encodings::alp::ALPArray;
@@ -30,7 +30,8 @@ use crate::CudaDeviceBuffer;
3030
use crate::executor::CudaArrayExt;
3131
use crate::executor::CudaExecute;
3232
use crate::executor::CudaExecutionCtx;
33-
use crate::kernel::patches::execute_patches;
33+
use crate::kernel::patches::build_gpu_patches;
34+
use crate::kernel::patches::types::load_patches;
3435

3536
/// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression.
3637
#[derive(Debug)]
@@ -54,6 +55,13 @@ impl CudaExecute for ALPExecutor {
5455
}
5556
}
5657

58+
/// Thread count per block, matching the strategy used by `bit_unpack`:
59+
/// 16 threads (64 elements each) for 64-bit output widths, otherwise 32.
60+
const fn alp_thread_count<A>() -> u32 {
61+
if size_of::<A>() == 8 { 16 } else { 32 }
62+
}
63+
64+
#[instrument(skip_all)]
5765
async fn decode_alp<A>(array: ALPArray, ctx: &mut CudaExecutionCtx) -> VortexResult<Canonical>
5866
where
5967
A: ALPFloat + NativePType + DeviceRepr + Send + Sync + 'static,
@@ -67,50 +75,69 @@ where
6775
let f: A = A::F10[exponents.f as usize];
6876
let e: A = A::IF10[exponents.e as usize];
6977

70-
// Execute child and copy to device
78+
// Execute child and copy to device.
7179
let canonical = array.encoded().clone().execute_cuda(ctx).await?;
7280
let primitive = canonical.into_primitive();
7381
let PrimitiveDataParts {
7482
buffer, validity, ..
7583
} = primitive.into_data_parts();
7684

7785
let device_input = ctx.ensure_on_device(buffer).await?;
78-
79-
// Get CUDA view of input
8086
let input_view = device_input.cuda_view::<A::ALPInt>()?;
8187

82-
// Allocate output buffer
83-
let output_slice = ctx.device_alloc::<A>(array_len)?;
88+
// Allocate output rounded up to a full chunk: the fused kernel writes a
89+
// whole 1024-element chunk per block, and we slice off any padding below.
90+
let output_slice = ctx.device_alloc::<A>(array_len.next_multiple_of(1024))?;
8491
let output_buf = CudaDeviceBuffer::new(output_slice);
8592
let output_view = output_buf.as_view::<A>();
8693

87-
let array_len_u64 = array_len as u64;
88-
89-
// Load kernel function
90-
let kernel_ptypes = [A::ALPInt::PTYPE, A::PTYPE];
91-
let cuda_function = ctx.load_function("alp", &kernel_ptypes)?;
94+
// Patch validity does not need to be scattered: the ALP encoder strips null
95+
// positions from the exception list, so patches only exist at valid
96+
// positions. load_patches additionally rejects patches without
97+
// chunk_offsets (required by the fused kernel's PatchesCursor).
98+
let device_patches = if let Some(patches) = array.patches() {
99+
Some(load_patches(&patches, ctx).await?)
100+
} else {
101+
None
102+
};
103+
let patches_arg = build_gpu_patches(device_patches.as_ref())?;
104+
105+
// Load the kernel: alp_{enc}_{float}_{threads}t
106+
let thread_count = alp_thread_count::<A>();
107+
let thread_suffix = format!("{thread_count}t");
108+
let enc_suffix = A::ALPInt::PTYPE.to_string();
109+
let float_suffix = A::PTYPE.to_string();
110+
let cuda_function = ctx.load_function_with_suffixes(
111+
"alp",
112+
&[
113+
enc_suffix.as_str(),
114+
float_suffix.as_str(),
115+
thread_suffix.as_str(),
116+
],
117+
)?;
118+
119+
let num_blocks = u32::try_from(array_len.div_ceil(1024))?;
120+
let config = LaunchConfig {
121+
grid_dim: (num_blocks, 1, 1),
122+
block_dim: (thread_count, 1, 1),
123+
shared_mem_bytes: 0,
124+
};
92125

93-
ctx.launch_kernel(&cuda_function, array_len, |args| {
126+
let array_len_u64 = array_len as u64;
127+
ctx.launch_kernel_config(&cuda_function, config, array_len, |args| {
94128
args.arg(&input_view)
95129
.arg(&output_view)
96130
.arg(&f)
97131
.arg(&e)
98-
.arg(&array_len_u64);
132+
.arg(&array_len_u64)
133+
.arg(&patches_arg);
99134
})?;
100135

101-
// Check if there are any patches to decode here. Patch validity does not
102-
// need to be scattered: the ALP encoder strips null positions from the
103-
// exception list, so patches only exist at valid positions. execute_patches
104-
// additionally guards against nullable patch values at runtime.
105-
let output_buf = if let Some(patches) = array.patches() {
106-
match_each_unsigned_integer_ptype!(patches.indices_ptype()?, |I| {
107-
execute_patches::<A, I>(patches.clone(), output_buf, ctx).await?
108-
})
109-
} else {
110-
output_buf
111-
};
136+
// Synchronize so the device patches buffers remain alive for the kernel.
137+
ctx.synchronize_stream()?;
138+
drop(device_patches);
112139

113-
let output_handle = BufferHandle::new_device(Arc::new(output_buf));
140+
let output_handle = BufferHandle::new_device(output_buf.slice_typed::<A>(0..array_len));
114141
Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
115142
output_handle,
116143
A::PTYPE,
@@ -257,4 +284,88 @@ mod tests {
257284
assert_arrays_eq!(cpu_result, gpu_result);
258285
Ok(())
259286
}
287+
288+
/// Multi-chunk ALP (> 1024 elements) with patches scattered across chunks.
289+
/// Exercises the fused kernel's per-block patches cursor math when more
290+
/// than one block is launched.
291+
#[crate::test]
292+
async fn test_cuda_alp_multi_chunk_with_patches() -> VortexResult<()> {
293+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
294+
.vortex_expect("failed to create execution context");
295+
296+
// 3072 values (3 chunks). Inject exceptions (values ALP can't encode
297+
// losslessly) at a handful of positions spread across chunks.
298+
let mut values: Vec<f32> = Vec::with_capacity(3072);
299+
for i in 0u32..3072 {
300+
if matches!(i, 0 | 100 | 1023 | 1024 | 2000 | 3071) {
301+
values.push(1.0_f32 / 7.0 + i as f32);
302+
} else {
303+
values.push(i as f32);
304+
}
305+
}
306+
let prim = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
307+
let alp_array = alp_encode(
308+
prim.as_view(),
309+
None,
310+
&mut LEGACY_SESSION.create_execution_ctx(),
311+
)?;
312+
assert!(
313+
alp_array.patches().is_some(),
314+
"expected patches from ALP exceptions"
315+
);
316+
317+
let cpu_result = crate::canonicalize_cpu(alp_array.clone())?.into_array();
318+
319+
let gpu_result = alp_array
320+
.into_array()
321+
.execute_cuda(&mut cuda_ctx)
322+
.await?
323+
.into_host()
324+
.await?
325+
.into_array();
326+
327+
assert_arrays_eq!(cpu_result, gpu_result);
328+
Ok(())
329+
}
330+
331+
/// Tail-chunk bounds check: an array whose length is not a multiple of
332+
/// 1024 forces the kernel's tail-block path to bounds-check its decode
333+
/// loop. Includes a patch in the tail.
334+
#[crate::test]
335+
async fn test_cuda_alp_partial_tail_chunk() -> VortexResult<()> {
336+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
337+
.vortex_expect("failed to create execution context");
338+
339+
let mut values: Vec<f64> = Vec::with_capacity(1500);
340+
for i in 0u32..1500 {
341+
if i == 1400 {
342+
values.push(1.0_f64 / 3.0);
343+
} else {
344+
values.push(i as f64);
345+
}
346+
}
347+
let prim = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
348+
let alp_array = alp_encode(
349+
prim.as_view(),
350+
None,
351+
&mut LEGACY_SESSION.create_execution_ctx(),
352+
)?;
353+
assert!(
354+
alp_array.patches().is_some(),
355+
"expected patches from ALP exceptions"
356+
);
357+
358+
let cpu_result = crate::canonicalize_cpu(alp_array.clone())?.into_array();
359+
360+
let gpu_result = alp_array
361+
.into_array()
362+
.execute_cuda(&mut cuda_ctx)
363+
.await?
364+
.into_host()
365+
.await?
366+
.into_array();
367+
368+
assert_arrays_eq!(cpu_result, gpu_result);
369+
Ok(())
370+
}
260371
}

0 commit comments

Comments
 (0)