11// SPDX-License-Identifier: Apache-2.0
22// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+ #![ expect( clippy:: cast_possible_truncation) ]
34
45//! CUDA benchmarks for Arrow validity bitmap repacking.
56
@@ -9,15 +10,27 @@ mod timed_launch_strategy;
910use std:: sync:: Arc ;
1011use std:: sync:: atomic:: Ordering ;
1112use std:: time:: Duration ;
13+ use std:: time:: Instant ;
1214
1315use criterion:: BenchmarkId ;
1416use criterion:: Criterion ;
1517use criterion:: Throughput ;
1618use futures:: executor:: block_on;
19+ use vortex:: array:: IntoArray ;
20+ use vortex:: array:: arrays:: BoolArray ;
21+ use vortex:: array:: arrays:: PrimitiveArray ;
1722use vortex:: array:: buffer:: BufferHandle ;
23+ use vortex:: array:: validity:: Validity ;
1824use vortex:: buffer:: BitBuffer ;
25+ use vortex:: buffer:: Buffer ;
26+ use vortex:: dtype:: PType ;
1927use vortex:: error:: VortexExpect ;
28+ use vortex:: error:: VortexResult ;
29+ use vortex:: session:: VortexSession ;
30+ use vortex_cuda:: CudaExecutionCtx ;
2031use vortex_cuda:: CudaSession ;
32+ use vortex_cuda:: arrow:: ArrowDeviceArray ;
33+ use vortex_cuda:: arrow:: DeviceArrayExt ;
2134use vortex_cuda:: arrow:: test_harness;
2235use vortex_cuda_macros:: cuda_available;
2336use vortex_cuda_macros:: cuda_not_available;
@@ -26,30 +39,130 @@ use crate::timed_launch_strategy::TimedLaunchStrategy;
2639
2740const INPUT_OFFSET : usize = 5 ;
2841const ARROW_OFFSET : usize = 3 ;
42+ const EXPORT_BENCH_SIZES : & [ ( usize , & str ) ] = & [ ( 100_000_000 , "100M" ) ] ;
43+
/// Returns the number of bytes needed to hold `len` validity bits that begin
/// `bit_offset` bits into a bitmap, i.e. `ceil((bit_offset + len) / 8)`.
///
/// The whole-byte and leftover-bit parts are summed separately so that
/// `bit_offset + len` is never formed directly; that sum could wrap silently
/// in a release build (which is how benchmarks are compiled).
fn validity_bitmap_byte_len(len: usize, bit_offset: usize) -> usize {
    let whole_bytes = len / 8 + bit_offset / 8;
    // At most 7 + 7 leftover bits, which round up to 0, 1 or 2 extra bytes.
    let leftover_bits = len % 8 + bit_offset % 8;
    whole_bytes + leftover_bits.div_ceil(8)
}
47+
48+ unsafe fn release_arrow_device_array ( array : & mut ArrowDeviceArray ) {
49+ unsafe {
50+ if let Some ( release) = array. array . release {
51+ release ( & raw mut array. array ) ;
52+ }
53+ }
54+ }
55+
56+ async fn device_validity_buffer (
57+ len : usize ,
58+ validity_offset : usize ,
59+ ctx : & mut CudaExecutionCtx ,
60+ ) -> VortexResult < ( usize , BufferHandle ) > {
61+ let validity_bits = BitBuffer :: collect_bool ( len + validity_offset, |idx| idx % 3 != 0 )
62+ . slice ( validity_offset..validity_offset + len) ;
63+ let ( validity_offset, _, validity_buffer) = validity_bits. into_inner ( ) ;
64+ Ok ( (
65+ validity_offset,
66+ ctx. ensure_on_device ( BufferHandle :: new_host ( validity_buffer) )
67+ . await ?,
68+ ) )
69+ }
70+
71+ async fn primitive_with_device_bool_validity (
72+ len : usize ,
73+ validity_offset : usize ,
74+ ctx : & mut CudaExecutionCtx ,
75+ ) -> VortexResult < vortex:: array:: ArrayRef > {
76+ let values = Buffer :: < i32 > :: from_iter ( ( 0 ..len) . map ( |idx| idx as i32 ) ) ;
77+ let values = ctx
78+ . ensure_on_device ( BufferHandle :: new_host ( values. into_byte_buffer ( ) ) )
79+ . await ?;
80+
81+ let ( validity_offset, validity_buffer) =
82+ device_validity_buffer ( len, validity_offset, ctx) . await ?;
83+ let validity =
84+ BoolArray :: new_handle ( validity_buffer, validity_offset, len, Validity :: NonNullable )
85+ . into_array ( ) ;
86+
87+ Ok (
88+ PrimitiveArray :: from_buffer_handle ( values, PType :: I32 , Validity :: Array ( validity) )
89+ . into_array ( ) ,
90+ )
91+ }
92+
93+ fn benchmark_arrow_validity_export ( c : & mut Criterion ) {
94+ let mut group = c. benchmark_group ( "cuda" ) ;
95+
96+ for & ( len, len_label) in EXPORT_BENCH_SIZES {
97+ for ( case, validity_offset) in
98+ [ ( "device_bitmap" , 0 ) , ( "device_bitmap_repack" , INPUT_OFFSET ) ]
99+ {
100+ group. throughput ( Throughput :: Bytes (
101+ validity_bitmap_byte_len ( len, validity_offset) as u64 ,
102+ ) ) ;
103+ group. bench_with_input (
104+ BenchmarkId :: new ( format ! ( "cuda/arrow_validity/export/{case}" ) , len_label) ,
105+ & len,
106+ |b, & len| {
107+ b. iter_custom ( |iters| {
108+ let mut cuda_ctx =
109+ CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) )
110+ . vortex_expect ( "failed to create execution context" ) ;
111+ let array = block_on ( primitive_with_device_bool_validity (
112+ len,
113+ validity_offset,
114+ & mut cuda_ctx,
115+ ) )
116+ . vortex_expect ( "failed to create primitive fixture" ) ;
117+
118+ let mut exported_arrays = Vec :: with_capacity (
119+ usize:: try_from ( iters)
120+ . vortex_expect ( "iteration count does not fit usize" ) ,
121+ ) ;
122+
123+ let start = Instant :: now ( ) ;
124+ for _ in 0 ..iters {
125+ exported_arrays. push (
126+ block_on ( array. clone ( ) . export_device_array ( & mut cuda_ctx) )
127+ . vortex_expect ( "failed to export device array" ) ,
128+ ) ;
129+ }
130+ let elapsed = start. elapsed ( ) ;
131+
132+ for exported in & mut exported_arrays {
133+ unsafe { release_arrow_device_array ( exported) } ;
134+ }
135+
136+ elapsed
137+ } ) ;
138+ } ,
139+ ) ;
140+ }
141+ }
142+
143+ group. finish ( ) ;
144+ }
29145
30146fn benchmark_arrow_validity_repack ( c : & mut Criterion ) {
31147 let mut group = c. benchmark_group ( "cuda" ) ;
32148
33149 for & ( len, len_label) in bench_config:: BENCH_SIZES {
34- group. throughput ( Throughput :: Elements ( len as u64 ) ) ;
150+ group. throughput ( Throughput :: Bytes (
151+ validity_bitmap_byte_len ( len, INPUT_OFFSET ) as u64 ,
152+ ) ) ;
35153 group. bench_with_input (
36154 BenchmarkId :: new ( "cuda/arrow_validity/repack" , len_label) ,
37155 & len,
38156 |b, & len| {
39157 b. iter_custom ( |iters| {
40158 let timed = TimedLaunchStrategy :: default ( ) ;
41159 let timer = timed. timer ( ) ;
42-
43- let mut cuda_ctx =
44- CudaSession :: create_execution_ctx ( & vortex_cuda:: cuda_session ( ) )
45- . vortex_expect ( "failed to create execution context" )
46- . with_launch_strategy ( Arc :: new ( timed) ) ;
47- let source = BitBuffer :: collect_bool ( len + INPUT_OFFSET , |idx| idx % 3 != 0 ) ;
48- let sliced = source. slice ( INPUT_OFFSET ..INPUT_OFFSET + len) ;
49- let ( input_offset, _, input_buffer) = sliced. into_inner ( ) ;
50- let input_buffer =
51- block_on ( cuda_ctx. ensure_on_device ( BufferHandle :: new_host ( input_buffer) ) )
52- . vortex_expect ( "failed to copy validity input to device" ) ;
160+ let mut cuda_ctx = CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) )
161+ . vortex_expect ( "failed to create execution context" )
162+ . with_launch_strategy ( Arc :: new ( timed) ) ;
163+ let ( input_offset, input_buffer) =
164+ block_on ( device_validity_buffer ( len, INPUT_OFFSET , & mut cuda_ctx) )
165+ . vortex_expect ( "failed to create validity fixture" ) ;
53166
54167 for _ in 0 ..iters {
55168 let output = test_harness:: repack_arrow_validity_buffer (
@@ -72,10 +185,51 @@ fn benchmark_arrow_validity_repack(c: &mut Criterion) {
72185 group. finish ( ) ;
73186}
74187
188+ fn benchmark_arrow_validity_count_nulls ( c : & mut Criterion ) {
189+ let mut group = c. benchmark_group ( "cuda" ) ;
190+
191+ for & ( len, len_label) in bench_config:: BENCH_SIZES {
192+ group. throughput ( Throughput :: Bytes (
193+ validity_bitmap_byte_len ( len, ARROW_OFFSET ) as u64 ,
194+ ) ) ;
195+ group. bench_with_input (
196+ BenchmarkId :: new ( "cuda/arrow_validity/count_nulls" , len_label) ,
197+ & len,
198+ |b, & len| {
199+ b. iter_custom ( |iters| {
200+ let timed = TimedLaunchStrategy :: default ( ) ;
201+ let timer = timed. timer ( ) ;
202+ let mut cuda_ctx = CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) )
203+ . vortex_expect ( "failed to create execution context" )
204+ . with_launch_strategy ( Arc :: new ( timed) ) ;
205+ let ( _, input_buffer) =
206+ block_on ( device_validity_buffer ( len, ARROW_OFFSET , & mut cuda_ctx) )
207+ . vortex_expect ( "failed to create validity fixture" ) ;
208+
209+ for _ in 0 ..iters {
210+ let null_count = test_harness:: count_arrow_validity_nulls (
211+ & input_buffer,
212+ len,
213+ ARROW_OFFSET ,
214+ & mut cuda_ctx,
215+ )
216+ . vortex_expect ( "failed to count Arrow validity nulls" ) ;
217+ std:: hint:: black_box ( null_count) ;
218+ }
219+
220+ Duration :: from_nanos ( timer. load ( Ordering :: Relaxed ) )
221+ } ) ;
222+ } ,
223+ ) ;
224+ }
225+
226+ group. finish ( ) ;
227+ }
228+
75229criterion:: criterion_group! {
76230 name = benches;
77231 config = bench_config:: cuda_bench_config( ) ;
78- targets = benchmark_arrow_validity_repack
232+ targets = benchmark_arrow_validity_repack, benchmark_arrow_validity_count_nulls , benchmark_arrow_validity_export
79233}
80234
81235#[ cuda_available]
0 commit comments