Skip to content

Commit cb82828

Browse files
0ax1 and claude authored
perf[gpu]: export arrow device validity on the gpu (#8440)
Move canonicalization of the validity buffer from the CPU to the GPU when exporting an Arrow device array. As part of that, this change adds a null-count kernel, because cuDF requires the count: it does not accept `-1` (unknown null count) for passed-in Arrow device arrays. --------- Signed-off-by: Alexander Droste <alexander.droste@protonmail.com> Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 60d165d commit cb82828

5 files changed

Lines changed: 456 additions & 99 deletions

File tree

vortex-cuda/benches/arrow_validity_cuda.rs

Lines changed: 167 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
#![expect(clippy::cast_possible_truncation)]
34

45
//! CUDA benchmarks for Arrow validity bitmap repacking.
56
@@ -9,15 +10,27 @@ mod timed_launch_strategy;
910
use std::sync::Arc;
1011
use std::sync::atomic::Ordering;
1112
use std::time::Duration;
13+
use std::time::Instant;
1214

1315
use criterion::BenchmarkId;
1416
use criterion::Criterion;
1517
use criterion::Throughput;
1618
use futures::executor::block_on;
19+
use vortex::array::IntoArray;
20+
use vortex::array::arrays::BoolArray;
21+
use vortex::array::arrays::PrimitiveArray;
1722
use vortex::array::buffer::BufferHandle;
23+
use vortex::array::validity::Validity;
1824
use vortex::buffer::BitBuffer;
25+
use vortex::buffer::Buffer;
26+
use vortex::dtype::PType;
1927
use vortex::error::VortexExpect;
28+
use vortex::error::VortexResult;
29+
use vortex::session::VortexSession;
30+
use vortex_cuda::CudaExecutionCtx;
2031
use vortex_cuda::CudaSession;
32+
use vortex_cuda::arrow::ArrowDeviceArray;
33+
use vortex_cuda::arrow::DeviceArrayExt;
2134
use vortex_cuda::arrow::test_harness;
2235
use vortex_cuda_macros::cuda_available;
2336
use vortex_cuda_macros::cuda_not_available;
@@ -26,30 +39,130 @@ use crate::timed_launch_strategy::TimedLaunchStrategy;
2639

2740
const INPUT_OFFSET: usize = 5;
2841
const ARROW_OFFSET: usize = 3;
42+
const EXPORT_BENCH_SIZES: &[(usize, &str)] = &[(100_000_000, "100M")];
43+
44+
/// Returns how many bytes are needed to hold `len` validity bits that begin
/// `bit_offset` bits into the bitmap, rounding up to a whole byte.
fn validity_bitmap_byte_len(len: usize, bit_offset: usize) -> usize {
    const BITS_PER_BYTE: usize = 8;

    // The leading offset bits occupy space in the buffer as well, so they are
    // counted together with the payload bits.
    let total_bits = bit_offset + len;
    total_bits.div_ceil(BITS_PER_BYTE)
}
47+
48+
unsafe fn release_arrow_device_array(array: &mut ArrowDeviceArray) {
49+
unsafe {
50+
if let Some(release) = array.array.release {
51+
release(&raw mut array.array);
52+
}
53+
}
54+
}
55+
56+
async fn device_validity_buffer(
57+
len: usize,
58+
validity_offset: usize,
59+
ctx: &mut CudaExecutionCtx,
60+
) -> VortexResult<(usize, BufferHandle)> {
61+
let validity_bits = BitBuffer::collect_bool(len + validity_offset, |idx| idx % 3 != 0)
62+
.slice(validity_offset..validity_offset + len);
63+
let (validity_offset, _, validity_buffer) = validity_bits.into_inner();
64+
Ok((
65+
validity_offset,
66+
ctx.ensure_on_device(BufferHandle::new_host(validity_buffer))
67+
.await?,
68+
))
69+
}
70+
71+
async fn primitive_with_device_bool_validity(
72+
len: usize,
73+
validity_offset: usize,
74+
ctx: &mut CudaExecutionCtx,
75+
) -> VortexResult<vortex::array::ArrayRef> {
76+
let values = Buffer::<i32>::from_iter((0..len).map(|idx| idx as i32));
77+
let values = ctx
78+
.ensure_on_device(BufferHandle::new_host(values.into_byte_buffer()))
79+
.await?;
80+
81+
let (validity_offset, validity_buffer) =
82+
device_validity_buffer(len, validity_offset, ctx).await?;
83+
let validity =
84+
BoolArray::new_handle(validity_buffer, validity_offset, len, Validity::NonNullable)
85+
.into_array();
86+
87+
Ok(
88+
PrimitiveArray::from_buffer_handle(values, PType::I32, Validity::Array(validity))
89+
.into_array(),
90+
)
91+
}
92+
93+
fn benchmark_arrow_validity_export(c: &mut Criterion) {
94+
let mut group = c.benchmark_group("cuda");
95+
96+
for &(len, len_label) in EXPORT_BENCH_SIZES {
97+
for (case, validity_offset) in
98+
[("device_bitmap", 0), ("device_bitmap_repack", INPUT_OFFSET)]
99+
{
100+
group.throughput(Throughput::Bytes(
101+
validity_bitmap_byte_len(len, validity_offset) as u64,
102+
));
103+
group.bench_with_input(
104+
BenchmarkId::new(format!("cuda/arrow_validity/export/{case}"), len_label),
105+
&len,
106+
|b, &len| {
107+
b.iter_custom(|iters| {
108+
let mut cuda_ctx =
109+
CudaSession::create_execution_ctx(&VortexSession::empty())
110+
.vortex_expect("failed to create execution context");
111+
let array = block_on(primitive_with_device_bool_validity(
112+
len,
113+
validity_offset,
114+
&mut cuda_ctx,
115+
))
116+
.vortex_expect("failed to create primitive fixture");
117+
118+
let mut exported_arrays = Vec::with_capacity(
119+
usize::try_from(iters)
120+
.vortex_expect("iteration count does not fit usize"),
121+
);
122+
123+
let start = Instant::now();
124+
for _ in 0..iters {
125+
exported_arrays.push(
126+
block_on(array.clone().export_device_array(&mut cuda_ctx))
127+
.vortex_expect("failed to export device array"),
128+
);
129+
}
130+
let elapsed = start.elapsed();
131+
132+
for exported in &mut exported_arrays {
133+
unsafe { release_arrow_device_array(exported) };
134+
}
135+
136+
elapsed
137+
});
138+
},
139+
);
140+
}
141+
}
142+
143+
group.finish();
144+
}
29145

30146
fn benchmark_arrow_validity_repack(c: &mut Criterion) {
31147
let mut group = c.benchmark_group("cuda");
32148

33149
for &(len, len_label) in bench_config::BENCH_SIZES {
34-
group.throughput(Throughput::Elements(len as u64));
150+
group.throughput(Throughput::Bytes(
151+
validity_bitmap_byte_len(len, INPUT_OFFSET) as u64,
152+
));
35153
group.bench_with_input(
36154
BenchmarkId::new("cuda/arrow_validity/repack", len_label),
37155
&len,
38156
|b, &len| {
39157
b.iter_custom(|iters| {
40158
let timed = TimedLaunchStrategy::default();
41159
let timer = timed.timer();
42-
43-
let mut cuda_ctx =
44-
CudaSession::create_execution_ctx(&vortex_cuda::cuda_session())
45-
.vortex_expect("failed to create execution context")
46-
.with_launch_strategy(Arc::new(timed));
47-
let source = BitBuffer::collect_bool(len + INPUT_OFFSET, |idx| idx % 3 != 0);
48-
let sliced = source.slice(INPUT_OFFSET..INPUT_OFFSET + len);
49-
let (input_offset, _, input_buffer) = sliced.into_inner();
50-
let input_buffer =
51-
block_on(cuda_ctx.ensure_on_device(BufferHandle::new_host(input_buffer)))
52-
.vortex_expect("failed to copy validity input to device");
160+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
161+
.vortex_expect("failed to create execution context")
162+
.with_launch_strategy(Arc::new(timed));
163+
let (input_offset, input_buffer) =
164+
block_on(device_validity_buffer(len, INPUT_OFFSET, &mut cuda_ctx))
165+
.vortex_expect("failed to create validity fixture");
53166

54167
for _ in 0..iters {
55168
let output = test_harness::repack_arrow_validity_buffer(
@@ -72,10 +185,51 @@ fn benchmark_arrow_validity_repack(c: &mut Criterion) {
72185
group.finish();
73186
}
74187

188+
fn benchmark_arrow_validity_count_nulls(c: &mut Criterion) {
189+
let mut group = c.benchmark_group("cuda");
190+
191+
for &(len, len_label) in bench_config::BENCH_SIZES {
192+
group.throughput(Throughput::Bytes(
193+
validity_bitmap_byte_len(len, ARROW_OFFSET) as u64,
194+
));
195+
group.bench_with_input(
196+
BenchmarkId::new("cuda/arrow_validity/count_nulls", len_label),
197+
&len,
198+
|b, &len| {
199+
b.iter_custom(|iters| {
200+
let timed = TimedLaunchStrategy::default();
201+
let timer = timed.timer();
202+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
203+
.vortex_expect("failed to create execution context")
204+
.with_launch_strategy(Arc::new(timed));
205+
let (_, input_buffer) =
206+
block_on(device_validity_buffer(len, ARROW_OFFSET, &mut cuda_ctx))
207+
.vortex_expect("failed to create validity fixture");
208+
209+
for _ in 0..iters {
210+
let null_count = test_harness::count_arrow_validity_nulls(
211+
&input_buffer,
212+
len,
213+
ARROW_OFFSET,
214+
&mut cuda_ctx,
215+
)
216+
.vortex_expect("failed to count Arrow validity nulls");
217+
std::hint::black_box(null_count);
218+
}
219+
220+
Duration::from_nanos(timer.load(Ordering::Relaxed))
221+
});
222+
},
223+
);
224+
}
225+
226+
group.finish();
227+
}
228+
75229
criterion::criterion_group! {
76230
name = benches;
77231
config = bench_config::cuda_bench_config();
78-
targets = benchmark_arrow_validity_repack
232+
targets = benchmark_arrow_validity_repack, benchmark_arrow_validity_count_nulls, benchmark_arrow_validity_export
79233
}
80234

81235
#[cuda_available]

0 commit comments

Comments
 (0)