55#![ expect( clippy:: cast_possible_truncation) ]
66#![ expect( clippy:: expect_used) ]
77
8+ use std:: marker:: PhantomData ;
89use std:: mem:: size_of;
910use std:: sync:: Arc ;
1011use std:: time:: Duration ;
@@ -14,19 +15,22 @@ use criterion::Criterion;
1415use criterion:: Throughput ;
1516use cudarc:: driver:: CudaSlice ;
1617use cudarc:: driver:: DevicePtr ;
18+ use cudarc:: driver:: DeviceRepr ;
1719use cudarc:: driver:: LaunchConfig ;
1820use cudarc:: driver:: PushKernelArg ;
1921use cudarc:: driver:: sys:: CUevent_flags ;
2022use futures:: executor:: block_on;
23+ use vortex:: array:: ArrayRef ;
2124use vortex:: array:: IntoArray ;
2225use vortex:: array:: LEGACY_SESSION ;
2326use vortex:: array:: VortexSessionExecute ;
2427use vortex:: array:: arrays:: DictArray ;
2528use vortex:: array:: arrays:: PrimitiveArray ;
29+ use vortex:: array:: buffer;
2630use vortex:: array:: scalar:: Scalar ;
2731use vortex:: array:: validity:: Validity :: NonNullable ;
2832use vortex:: buffer:: Buffer ;
29- use vortex:: dtype:: PType ;
33+ use vortex:: dtype:: NativePType ;
3034use vortex:: encodings:: alp:: ALP ;
3135use vortex:: encodings:: alp:: ALPArrayExt ;
3236use vortex:: encodings:: alp:: ALPArraySlotsExt ;
@@ -59,16 +63,16 @@ const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M"), (100_000_000, "100M"
5963/// This deliberately does not use `CudaDispatchPlan::execute` because the
6064/// benchmark pre-allocates the output buffer and device plan once, then reuses
6165/// them across iterations.
62- fn run_timed (
66+ fn run_timed < T : DeviceRepr + NativePType > (
6367 cuda_ctx : & mut CudaExecutionCtx ,
6468 array_len : usize ,
6569 output_buf : & CudaDeviceBuffer ,
6670 device_plan : & Arc < CudaSlice < u8 > > ,
6771 shared_mem_bytes : u32 ,
6872) -> VortexResult < Duration > {
69- let cuda_function = cuda_ctx. load_function ( "dynamic_dispatch" , & [ PType :: U32 ] ) ?;
73+ let cuda_function = cuda_ctx. load_function ( "dynamic_dispatch" , & [ T :: PTYPE ] ) ?;
7074 let array_len_u64 = array_len as u64 ;
71- let output_view = output_buf. as_view :: < u32 > ( ) ;
75+ let output_view = output_buf. as_view :: < T > ( ) ;
7276 let ( output_ptr, record_output) = output_view. device_ptr ( cuda_ctx. stream ( ) ) ;
7377 let ( plan_ptr, record_plan) = device_plan. device_ptr ( cuda_ctx. stream ( ) ) ;
7478
@@ -115,17 +119,21 @@ fn run_timed(
115119}
116120
117121/// Benchmark runner: builds a dynamic plan and launches the kernel.
118- struct BenchRunner {
122+ ///
123+ /// `T` is the unsigned integer type matching the output element width
124+ /// (e.g. `u32` for f32/i32/u32, `u64` for f64/i64/u64).
125+ struct BenchRunner < T > {
119126 _plan : CudaDispatchPlan ,
120127 smem_bytes : u32 ,
121128 len : usize ,
122129 device_plan : Arc < CudaSlice < u8 > > ,
123130 output_buf : CudaDeviceBuffer ,
124- _plan_buffers : Vec < vortex:: array:: buffer:: BufferHandle > ,
131+ _plan_buffers : Vec < buffer:: BufferHandle > ,
132+ _phantom : PhantomData < T > ,
125133}
126134
127- impl BenchRunner {
128- fn new ( array : & vortex :: array :: ArrayRef , len : usize , cuda_ctx : & mut CudaExecutionCtx ) -> Self {
135+ impl < T : DeviceRepr + NativePType > BenchRunner < T > {
136+ fn new ( array : & ArrayRef , len : usize , cuda_ctx : & mut CudaExecutionCtx ) -> Self {
129137 let plan = match DispatchPlan :: new ( array, CudaDispatchMode :: DynDispatchOnly )
130138 . vortex_expect ( "build_dyn_dispatch_plan" )
131139 {
@@ -153,16 +161,17 @@ impl BenchRunner {
153161 device_plan,
154162 output_buf : CudaDeviceBuffer :: new (
155163 cuda_ctx
156- . device_alloc :: < u32 > ( len. next_multiple_of ( 1024 ) )
164+ . device_alloc :: < T > ( len. next_multiple_of ( 1024 ) )
157165 . expect ( "alloc output" ) ,
158166 ) ,
159167 _plan_buffers : device_buffers,
168+ _phantom : PhantomData ,
160169 }
161170 }
162171
163172 fn run ( & self , cuda_ctx : & mut CudaExecutionCtx ) -> Duration {
164173 cuda_ctx. stream ( ) . synchronize ( ) . unwrap ( ) ;
165- run_timed (
174+ run_timed :: < T > (
166175 cuda_ctx,
167176 self . len ,
168177 & self . output_buf ,
@@ -205,7 +214,7 @@ fn bench_for_bitpacked(c: &mut Criterion) {
205214 let mut cuda_ctx =
206215 CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) . vortex_expect ( "ctx" ) ;
207216
208- let bench_runner = BenchRunner :: new ( & array, n, & mut cuda_ctx) ;
217+ let bench_runner = BenchRunner :: < u32 > :: new ( & array, n, & mut cuda_ctx) ;
209218
210219 b. iter_custom ( |iters| {
211220 let mut total_time = Duration :: ZERO ;
@@ -250,7 +259,7 @@ fn bench_dict_bp_codes(c: &mut Criterion) {
250259 let mut cuda_ctx =
251260 CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) . vortex_expect ( "ctx" ) ;
252261
253- let bench_runner = BenchRunner :: new ( & array, n, & mut cuda_ctx) ;
262+ let bench_runner = BenchRunner :: < u32 > :: new ( & array, n, & mut cuda_ctx) ;
254263
255264 b. iter_custom ( |iters| {
256265 let mut total_time = Duration :: ZERO ;
@@ -294,7 +303,72 @@ fn bench_runend(c: &mut Criterion) {
294303 let mut cuda_ctx =
295304 CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) . vortex_expect ( "ctx" ) ;
296305
297- let bench_runner = BenchRunner :: new ( & array, n, & mut cuda_ctx) ;
306+ let bench_runner = BenchRunner :: < u32 > :: new ( & array, n, & mut cuda_ctx) ;
307+
308+ b. iter_custom ( |iters| {
309+ let mut total_time = Duration :: ZERO ;
310+ for _ in 0 ..iters {
311+ total_time += bench_runner. run ( & mut cuda_ctx) ;
312+ }
313+ total_time
314+ } ) ;
315+ } ,
316+ ) ;
317+ }
318+
319+ group. finish ( ) ;
320+ }
321+
322+ // ---------------------------------------------------------------------------
323+ // Benchmark: ALP(FoR(BitPacked)) — f64
324+ // ---------------------------------------------------------------------------
325+ fn bench_alp_for_bitpacked_f64 ( c : & mut Criterion ) {
326+ let mut ctx = LEGACY_SESSION . create_execution_ctx ( ) ;
327+ let mut group = c. benchmark_group ( "alp_for_bp_6bw_f64" ) ;
328+
329+ let exponents = Exponents { e : 2 , f : 0 } ;
330+ let bit_width: u8 = 6 ;
331+
332+ for ( len, len_str) in BENCH_ARGS {
333+ group. throughput ( Throughput :: Bytes ( ( len * size_of :: < f64 > ( ) ) as u64 ) ) ;
334+
335+ // Generate f64 values that ALP-encode without patches.
336+ let floats: Vec < f64 > = ( 0 ..* len)
337+ . map ( |i| <f64 as ALPFloat >:: decode_single ( 10 + ( i as i64 % 64 ) , exponents) )
338+ . collect ( ) ;
339+ let float_prim = PrimitiveArray :: new ( Buffer :: from ( floats) , NonNullable ) ;
340+
341+ // Encode: ALP → FoR → BitPacked
342+ let alp =
343+ alp_encode ( float_prim. as_view ( ) , Some ( exponents) , & mut ctx) . vortex_expect ( "alp_encode" ) ;
344+ assert ! ( alp. patches( ) . is_none( ) ) ;
345+ let for_arr = FoRData :: encode (
346+ alp. encoded ( )
347+ . clone ( )
348+ . execute :: < PrimitiveArray > ( & mut ctx)
349+ . vortex_expect ( "to primitive" ) ,
350+ )
351+ . vortex_expect ( "for encode" ) ;
352+ let bp = BitPackedData :: encode ( for_arr. encoded ( ) , bit_width, & mut ctx)
353+ . vortex_expect ( "bitpack encode" ) ;
354+
355+ let tree = ALP :: new (
356+ FoR :: try_new ( bp. into_array ( ) , for_arr. reference_scalar ( ) . clone ( ) )
357+ . vortex_expect ( "for_new" )
358+ . into_array ( ) ,
359+ exponents,
360+ None ,
361+ ) ;
362+ let array = tree. into_array ( ) ;
363+
364+ group. bench_with_input (
365+ BenchmarkId :: new ( "dynamic_dispatch_f64" , len_str) ,
366+ len,
367+ |b, & n| {
368+ let mut cuda_ctx =
369+ CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) . vortex_expect ( "ctx" ) ;
370+
371+ let bench_runner = BenchRunner :: < u64 > :: new ( & array, n, & mut cuda_ctx) ;
298372
299373 b. iter_custom ( |iters| {
300374 let mut total_time = Duration :: ZERO ;
@@ -348,7 +422,7 @@ fn bench_dict_bp_codes_bp_for_values(c: &mut Criterion) {
348422 let mut cuda_ctx =
349423 CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) . vortex_expect ( "ctx" ) ;
350424
351- let bench_runner = BenchRunner :: new ( & array, n, & mut cuda_ctx) ;
425+ let bench_runner = BenchRunner :: < u32 > :: new ( & array, n, & mut cuda_ctx) ;
352426
353427 b. iter_custom ( |iters| {
354428 let mut total_time = Duration :: ZERO ;
@@ -413,7 +487,7 @@ fn bench_alp_for_bitpacked(c: &mut Criterion) {
413487 let mut cuda_ctx =
414488 CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) . vortex_expect ( "ctx" ) ;
415489
416- let bench_runner = BenchRunner :: new ( & array, n, & mut cuda_ctx) ;
490+ let bench_runner = BenchRunner :: < u32 > :: new ( & array, n, & mut cuda_ctx) ;
417491
418492 b. iter_custom ( |iters| {
419493 let mut total_time = Duration :: ZERO ;
@@ -460,7 +534,7 @@ fn bench_dict_bp_u8_codes_u32_values(c: &mut Criterion) {
460534 let mut cuda_ctx =
461535 CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) . vortex_expect ( "ctx" ) ;
462536
463- let bench_runner = BenchRunner :: new ( & array, n, & mut cuda_ctx) ;
537+ let bench_runner = BenchRunner :: < u32 > :: new ( & array, n, & mut cuda_ctx) ;
464538
465539 b. iter_custom ( |iters| {
466540 let mut total_time = Duration :: ZERO ;
@@ -503,7 +577,7 @@ fn bench_dict_bp_u16_codes_u32_values(c: &mut Criterion) {
503577 let mut cuda_ctx =
504578 CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) . vortex_expect ( "ctx" ) ;
505579
506- let bench_runner = BenchRunner :: new ( & array, n, & mut cuda_ctx) ;
580+ let bench_runner = BenchRunner :: < u32 > :: new ( & array, n, & mut cuda_ctx) ;
507581
508582 b. iter_custom ( |iters| {
509583 let mut total_time = Duration :: ZERO ;
@@ -546,7 +620,7 @@ fn bench_dict_bp_u32_codes_u32_values(c: &mut Criterion) {
546620 let mut cuda_ctx =
547621 CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) . vortex_expect ( "ctx" ) ;
548622
549- let bench_runner = BenchRunner :: new ( & array, n, & mut cuda_ctx) ;
623+ let bench_runner = BenchRunner :: < u32 > :: new ( & array, n, & mut cuda_ctx) ;
550624
551625 b. iter_custom ( |iters| {
552626 let mut total_time = Duration :: ZERO ;
@@ -568,6 +642,7 @@ fn benchmark_dynamic_dispatch(c: &mut Criterion) {
568642 bench_runend ( c) ;
569643 bench_dict_bp_codes_bp_for_values ( c) ;
570644 bench_alp_for_bitpacked ( c) ;
645+ bench_alp_for_bitpacked_f64 ( c) ;
571646 bench_dict_bp_u8_codes_u32_values ( c) ;
572647 bench_dict_bp_u16_codes_u32_values ( c) ;
573648 bench_dict_bp_u32_codes_u32_values ( c) ;
0 commit comments