Skip to content

Commit d036fc1

Browse files
authored
feat(cuda): support f64 in alp dyn dispatch (#7666)
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 77e9bf0 commit d036fc1

14 files changed

Lines changed: 330 additions & 105 deletions

File tree

vortex-cuda/benches/dict_cuda.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
mod common;
1010

11+
use std::fmt::Debug;
1112
use std::mem::size_of;
1213
use std::sync::Arc;
1314
use std::sync::atomic::Ordering;
@@ -48,7 +49,7 @@ fn make_dict_array_typed<V, C>(len: usize, dict_size: usize) -> DictArray
4849
where
4950
V: NativePType + From<u32>,
5051
C: NativePType + TryFrom<usize>,
51-
<C as TryFrom<usize>>::Error: std::fmt::Debug,
52+
<C as TryFrom<usize>>::Error: Debug,
5253
{
5354
// Dictionary values
5455
let values: Vec<V> = (0..dict_size)
@@ -71,7 +72,7 @@ fn benchmark_dict_typed<V, C>(c: &mut Criterion, config: &DictBenchConfig)
7172
where
7273
V: NativePType + DeviceRepr + From<u32>,
7374
C: NativePType + DeviceRepr + TryFrom<usize>,
74-
<C as TryFrom<usize>>::Error: std::fmt::Debug,
75+
<C as TryFrom<usize>>::Error: Debug,
7576
{
7677
let mut group = c.benchmark_group("dict_cuda");
7778

vortex-cuda/benches/dynamic_dispatch_cuda.rs

Lines changed: 93 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#![expect(clippy::cast_possible_truncation)]
66
#![expect(clippy::expect_used)]
77

8+
use std::marker::PhantomData;
89
use std::mem::size_of;
910
use std::sync::Arc;
1011
use std::time::Duration;
@@ -14,19 +15,22 @@ use criterion::Criterion;
1415
use criterion::Throughput;
1516
use cudarc::driver::CudaSlice;
1617
use cudarc::driver::DevicePtr;
18+
use cudarc::driver::DeviceRepr;
1719
use cudarc::driver::LaunchConfig;
1820
use cudarc::driver::PushKernelArg;
1921
use cudarc::driver::sys::CUevent_flags;
2022
use futures::executor::block_on;
23+
use vortex::array::ArrayRef;
2124
use vortex::array::IntoArray;
2225
use vortex::array::LEGACY_SESSION;
2326
use vortex::array::VortexSessionExecute;
2427
use vortex::array::arrays::DictArray;
2528
use vortex::array::arrays::PrimitiveArray;
29+
use vortex::array::buffer;
2630
use vortex::array::scalar::Scalar;
2731
use vortex::array::validity::Validity::NonNullable;
2832
use vortex::buffer::Buffer;
29-
use vortex::dtype::PType;
33+
use vortex::dtype::NativePType;
3034
use vortex::encodings::alp::ALP;
3135
use vortex::encodings::alp::ALPArrayExt;
3236
use vortex::encodings::alp::ALPArraySlotsExt;
@@ -59,16 +63,16 @@ const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M"), (100_000_000, "100M"
5963
/// This deliberately does not use `CudaDispatchPlan::execute` because the
6064
/// benchmark pre-allocates the output buffer and device plan once, then reuses
6165
/// them across iterations.
62-
fn run_timed(
66+
fn run_timed<T: DeviceRepr + NativePType>(
6367
cuda_ctx: &mut CudaExecutionCtx,
6468
array_len: usize,
6569
output_buf: &CudaDeviceBuffer,
6670
device_plan: &Arc<CudaSlice<u8>>,
6771
shared_mem_bytes: u32,
6872
) -> VortexResult<Duration> {
69-
let cuda_function = cuda_ctx.load_function("dynamic_dispatch", &[PType::U32])?;
73+
let cuda_function = cuda_ctx.load_function("dynamic_dispatch", &[T::PTYPE])?;
7074
let array_len_u64 = array_len as u64;
71-
let output_view = output_buf.as_view::<u32>();
75+
let output_view = output_buf.as_view::<T>();
7276
let (output_ptr, record_output) = output_view.device_ptr(cuda_ctx.stream());
7377
let (plan_ptr, record_plan) = device_plan.device_ptr(cuda_ctx.stream());
7478

@@ -115,17 +119,21 @@ fn run_timed(
115119
}
116120

117121
/// Benchmark runner: builds a dynamic plan and launches the kernel.
118-
struct BenchRunner {
122+
///
123+
/// `T` is the unsigned integer type matching the output element width
124+
/// (e.g. `u32` for f32/i32/u32, `u64` for f64/i64/u64).
125+
struct BenchRunner<T> {
119126
_plan: CudaDispatchPlan,
120127
smem_bytes: u32,
121128
len: usize,
122129
device_plan: Arc<CudaSlice<u8>>,
123130
output_buf: CudaDeviceBuffer,
124-
_plan_buffers: Vec<vortex::array::buffer::BufferHandle>,
131+
_plan_buffers: Vec<buffer::BufferHandle>,
132+
_phantom: PhantomData<T>,
125133
}
126134

127-
impl BenchRunner {
128-
fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &mut CudaExecutionCtx) -> Self {
135+
impl<T: DeviceRepr + NativePType> BenchRunner<T> {
136+
fn new(array: &ArrayRef, len: usize, cuda_ctx: &mut CudaExecutionCtx) -> Self {
129137
let plan = match DispatchPlan::new(array, CudaDispatchMode::DynDispatchOnly)
130138
.vortex_expect("build_dyn_dispatch_plan")
131139
{
@@ -153,16 +161,17 @@ impl BenchRunner {
153161
device_plan,
154162
output_buf: CudaDeviceBuffer::new(
155163
cuda_ctx
156-
.device_alloc::<u32>(len.next_multiple_of(1024))
164+
.device_alloc::<T>(len.next_multiple_of(1024))
157165
.expect("alloc output"),
158166
),
159167
_plan_buffers: device_buffers,
168+
_phantom: PhantomData,
160169
}
161170
}
162171

163172
fn run(&self, cuda_ctx: &mut CudaExecutionCtx) -> Duration {
164173
cuda_ctx.stream().synchronize().unwrap();
165-
run_timed(
174+
run_timed::<T>(
166175
cuda_ctx,
167176
self.len,
168177
&self.output_buf,
@@ -205,7 +214,7 @@ fn bench_for_bitpacked(c: &mut Criterion) {
205214
let mut cuda_ctx =
206215
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
207216

208-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
217+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
209218

210219
b.iter_custom(|iters| {
211220
let mut total_time = Duration::ZERO;
@@ -250,7 +259,7 @@ fn bench_dict_bp_codes(c: &mut Criterion) {
250259
let mut cuda_ctx =
251260
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
252261

253-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
262+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
254263

255264
b.iter_custom(|iters| {
256265
let mut total_time = Duration::ZERO;
@@ -294,7 +303,72 @@ fn bench_runend(c: &mut Criterion) {
294303
let mut cuda_ctx =
295304
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
296305

297-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
306+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
307+
308+
b.iter_custom(|iters| {
309+
let mut total_time = Duration::ZERO;
310+
for _ in 0..iters {
311+
total_time += bench_runner.run(&mut cuda_ctx);
312+
}
313+
total_time
314+
});
315+
},
316+
);
317+
}
318+
319+
group.finish();
320+
}
321+
322+
// ---------------------------------------------------------------------------
323+
// Benchmark: ALP(FoR(BitPacked)) — f64
324+
// ---------------------------------------------------------------------------
325+
fn bench_alp_for_bitpacked_f64(c: &mut Criterion) {
326+
let mut ctx = LEGACY_SESSION.create_execution_ctx();
327+
let mut group = c.benchmark_group("alp_for_bp_6bw_f64");
328+
329+
let exponents = Exponents { e: 2, f: 0 };
330+
let bit_width: u8 = 6;
331+
332+
for (len, len_str) in BENCH_ARGS {
333+
group.throughput(Throughput::Bytes((len * size_of::<f64>()) as u64));
334+
335+
// Generate f64 values that ALP-encode without patches.
336+
let floats: Vec<f64> = (0..*len)
337+
.map(|i| <f64 as ALPFloat>::decode_single(10 + (i as i64 % 64), exponents))
338+
.collect();
339+
let float_prim = PrimitiveArray::new(Buffer::from(floats), NonNullable);
340+
341+
// Encode: ALP → FoR → BitPacked
342+
let alp =
343+
alp_encode(float_prim.as_view(), Some(exponents), &mut ctx).vortex_expect("alp_encode");
344+
assert!(alp.patches().is_none());
345+
let for_arr = FoRData::encode(
346+
alp.encoded()
347+
.clone()
348+
.execute::<PrimitiveArray>(&mut ctx)
349+
.vortex_expect("to primitive"),
350+
)
351+
.vortex_expect("for encode");
352+
let bp = BitPackedData::encode(for_arr.encoded(), bit_width, &mut ctx)
353+
.vortex_expect("bitpack encode");
354+
355+
let tree = ALP::new(
356+
FoR::try_new(bp.into_array(), for_arr.reference_scalar().clone())
357+
.vortex_expect("for_new")
358+
.into_array(),
359+
exponents,
360+
None,
361+
);
362+
let array = tree.into_array();
363+
364+
group.bench_with_input(
365+
BenchmarkId::new("dynamic_dispatch_f64", len_str),
366+
len,
367+
|b, &n| {
368+
let mut cuda_ctx =
369+
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
370+
371+
let bench_runner = BenchRunner::<u64>::new(&array, n, &mut cuda_ctx);
298372

299373
b.iter_custom(|iters| {
300374
let mut total_time = Duration::ZERO;
@@ -348,7 +422,7 @@ fn bench_dict_bp_codes_bp_for_values(c: &mut Criterion) {
348422
let mut cuda_ctx =
349423
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
350424

351-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
425+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
352426

353427
b.iter_custom(|iters| {
354428
let mut total_time = Duration::ZERO;
@@ -413,7 +487,7 @@ fn bench_alp_for_bitpacked(c: &mut Criterion) {
413487
let mut cuda_ctx =
414488
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
415489

416-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
490+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
417491

418492
b.iter_custom(|iters| {
419493
let mut total_time = Duration::ZERO;
@@ -460,7 +534,7 @@ fn bench_dict_bp_u8_codes_u32_values(c: &mut Criterion) {
460534
let mut cuda_ctx =
461535
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
462536

463-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
537+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
464538

465539
b.iter_custom(|iters| {
466540
let mut total_time = Duration::ZERO;
@@ -503,7 +577,7 @@ fn bench_dict_bp_u16_codes_u32_values(c: &mut Criterion) {
503577
let mut cuda_ctx =
504578
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
505579

506-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
580+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
507581

508582
b.iter_custom(|iters| {
509583
let mut total_time = Duration::ZERO;
@@ -546,7 +620,7 @@ fn bench_dict_bp_u32_codes_u32_values(c: &mut Criterion) {
546620
let mut cuda_ctx =
547621
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
548622

549-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
623+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
550624

551625
b.iter_custom(|iters| {
552626
let mut total_time = Duration::ZERO;
@@ -568,6 +642,7 @@ fn benchmark_dynamic_dispatch(c: &mut Criterion) {
568642
bench_runend(c);
569643
bench_dict_bp_codes_bp_for_values(c);
570644
bench_alp_for_bitpacked(c);
645+
bench_alp_for_bitpacked_f64(c);
571646
bench_dict_bp_u8_codes_u32_values(c);
572647
bench_dict_bp_u16_codes_u32_values(c);
573648
bench_dict_bp_u32_codes_u32_values(c);

vortex-cuda/benches/filter_cuda.rs

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use cudarc::driver::CudaSlice;
1818
use cudarc::driver::CudaView;
1919
use cudarc::driver::DevicePtr;
2020
use cudarc::driver::DevicePtrMut;
21+
use cudarc::driver::DeviceRepr;
2122
use cudarc::driver::sys::CUevent_flags;
2223
use futures::executor::block_on;
2324
use vortex::error::VortexExpect;
@@ -64,7 +65,7 @@ fn make_bitmask(len: usize, selectivity: f64) -> (Vec<u8>, usize) {
6465

6566
/// Runs the CUB filter kernel and returns elapsed GPU time.
6667
#[expect(clippy::too_many_arguments)]
67-
async fn run_filter_timed<T: CubFilterable + cudarc::driver::DeviceRepr>(
68+
async fn run_filter_timed<T: CubFilterable + DeviceRepr>(
6869
d_input: CudaView<'_, T>,
6970
d_bitmask: CudaView<'_, u8>,
7071
d_output: &mut CudaSlice<T>,
@@ -132,14 +133,7 @@ async fn run_filter_timed<T: CubFilterable + cudarc::driver::DeviceRepr>(
132133
/// Benchmark filter for a specific type.
133134
fn benchmark_filter_type<T>(c: &mut Criterion, type_name: &str)
134135
where
135-
T: CubFilterable
136-
+ cudarc::driver::DeviceRepr
137-
+ From<u8>
138-
+ Debug
139-
+ Clone
140-
+ Send
141-
+ Sync
142-
+ 'static,
136+
T: CubFilterable + DeviceRepr + From<u8> + Debug + Clone + Send + Sync + 'static,
143137
{
144138
let mut group = c.benchmark_group(format!("filter_cuda_{type_name}"));
145139

vortex-cuda/benches/runend_cuda.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use criterion::Criterion;
1818
use criterion::Throughput;
1919
use cudarc::driver::DeviceRepr;
2020
use futures::executor::block_on;
21+
use vortex::array::ExecutionCtx;
2122
use vortex::array::IntoArray;
2223
use vortex::array::arrays::PrimitiveArray;
2324
use vortex::array::validity::Validity;
@@ -37,7 +38,7 @@ use crate::common::TimedLaunchStrategy;
3738
fn make_runend_array_typed<T>(
3839
output_len: usize,
3940
avg_run_len: usize,
40-
ctx: &mut vortex::array::ExecutionCtx,
41+
ctx: &mut ExecutionCtx,
4142
) -> RunEndArray
4243
where
4344
T: NativePType + From<u8>,

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,11 +140,30 @@ scalar_op(T *values, const struct ScalarOp &op, char *__restrict smem, uint64_t
140140
break;
141141
}
142142
case ScalarOp::ALP: {
143-
const float f = op.params.alp.f, e = op.params.alp.e;
143+
if constexpr (sizeof(T) == 4) {
144+
// The plan builder stores f32 F10/IF10 table entries as f64
145+
// in AlpParams. The round-trip f32→f64→f32 is exact per the
146+
// C++ standard: [conv.fpprom] guarantees the widening is
147+
// value-preserving, and [conv.double] guarantees the narrowing
148+
// recovers the original value when it is exactly representable
149+
// in the destination type (which it is, having originated as f32).
150+
const float f = static_cast<float>(op.params.alp.f);
151+
const float e = static_cast<float>(op.params.alp.e);
144152
#pragma unroll
145-
for (uint32_t i = 0; i < N; ++i) {
146-
float r = static_cast<float>(static_cast<int32_t>(values[i])) * f * e;
147-
values[i] = static_cast<T>(__float_as_uint(r));
153+
for (uint32_t i = 0; i < N; ++i) {
154+
float r = static_cast<float>(static_cast<int32_t>(values[i])) * f * e;
155+
values[i] = static_cast<T>(__float_as_uint(r));
156+
}
157+
} else if constexpr (sizeof(T) == 8) {
158+
const double f = op.params.alp.f, e = op.params.alp.e;
159+
#pragma unroll
160+
for (uint32_t i = 0; i < N; ++i) {
161+
double r = static_cast<double>(static_cast<int64_t>(values[i])) * f * e;
162+
// __double_as_longlong reinterprets f64 bits as int64, and
163+
// static_cast to T (uint64_t) preserves the bit pattern
164+
// under C++20's two's complement guarantee.
165+
values[i] = static_cast<T>(__double_as_longlong(r));
166+
}
148167
}
149168
// Apply ALP patches: override positions whose float value couldn't
150169
// be reconstructed through the ALP encode/decode cycle.

vortex-cuda/kernels/src/dynamic_dispatch.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
///
2121
/// Each source op and scalar op may produce a different PType than its input.
2222
/// For example, DICT transforms codes (e.g. u8) into values (e.g. f32), and
23-
/// ALP transforms encoded integers (i32) into floats (f32).
23+
/// ALP transforms encoded integers into floats (e.g. i32 → f32, i64 → f64).
2424
///
2525
/// `PTypeTag` is a compact enum that identifies the primitive type at each
2626
/// point in the pipeline. The kernel uses it to dispatch typed memory
@@ -171,7 +171,7 @@ struct SourceOp {
171171
/// Each scalar op declares its `output_ptype` — the PType of the values it
172172
/// produces. Most ops preserve the input type (FOR, ZIGZAG), but some
173173
/// change it:
174-
/// - ALP: encoded int → float (e.g. i32 → f32)
174+
/// - ALP: encoded int → float (e.g. i32 → f32, i64 → f64)
175175
/// - DICT: codes type → values type (e.g. u8 → u32)
176176
///
177177
/// The plan builder uses `output_ptype` to determine the element width
@@ -183,8 +183,8 @@ union ScalarParams {
183183
} frame_of_ref;
184184

185185
struct AlpParams {
186-
float f;
187-
float e;
186+
double f;
187+
double e;
188188
uint64_t patches_ptr; // device pointer to GPUPatches struct (0 = none)
189189
} alp;
190190

0 commit comments

Comments (0)