Skip to content

Commit a5ebab2

Browse files
committed
bench: CUDA sync load-to-device benchmark
This benchmark provides the baseline for GPU data loading, exercising our synchronous logic. In this naive case, the data is loaded from file to RAM, and then sync copied to the device. Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 5e5572b commit a5ebab2

2 files changed

Lines changed: 86 additions & 0 deletions

File tree

vortex-cuda/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,7 @@ harness = false
9494
[[bench]]
9595
name = "throughput_cuda"
9696
harness = false
97+
98+
[[bench]]
99+
name = "load_to_device_cuda"
100+
harness = false
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
#![expect(clippy::expect_used)]
5+
6+
mod bench_config;
7+
// Unused here but suppresses dead_code warning for the shared module.
8+
const _: &[(usize, &str)] = bench_config::BENCH_SIZES;
9+
10+
use std::time::Duration;
11+
use std::time::Instant;
12+
13+
use criterion::BenchmarkId;
14+
use criterion::Criterion;
15+
use criterion::Throughput;
16+
use vortex::array::buffer::BufferHandle;
17+
use vortex::buffer::ByteBuffer;
18+
use vortex::error::VortexExpect;
19+
use vortex::session::VortexSession;
20+
use vortex_cuda::CudaSession;
21+
use vortex_cuda_macros::cuda_available;
22+
use vortex_cuda_macros::cuda_not_available;
23+
24+
/// Buffer sizes exercised by the load-to-device benchmark, each paired with the
/// label used as the benchmark parameter name. The first element is a byte count
/// (it is reported to Criterion via `Throughput::Bytes`).
const LOAD_SIZES: &[(usize, &str)] = &[
25+
(16 * 1024 * 1024, "16MiB"),
26+
(64 * 1024 * 1024, "64MiB"),
27+
(256 * 1024 * 1024, "256MiB"),
28+
(1024 * 1024 * 1024, "1GiB"),
29+
];
30+
31+
/// Benchmarks a synchronous host-to-device load for every entry in `LOAD_SIZES`.
///
/// For each size, a benchmark is registered in the `"cuda"` group with throughput
/// reported in bytes. Timing is done manually through `iter_custom`: per iteration
/// a fresh host buffer is built, then only the `ensure_on_device_sync` call, the
/// on-device assertion and the explicit stream synchronization are measured.
///
/// Failures while creating the execution context, copying, or synchronizing are
/// unwrapped with `vortex_expect` / `expect`.
fn benchmark_load_to_device(c: &mut Criterion) {
32+
let mut group = c.benchmark_group("cuda");
33+
34+
for &(size, size_name) in LOAD_SIZES {
35+
group.throughput(Throughput::Bytes(size as u64));
36+
37+
group.bench_with_input(
38+
BenchmarkId::new("cuda/load_to_device/ensure_on_device_sync", size_name),
39+
&size,
40+
|b, &size| {
41+
let session = VortexSession::empty();
42+
let cuda_ctx =
43+
CudaSession::create_execution_ctx(&session).vortex_expect("cuda ctx");
44+
45+
b.iter_custom(|iters| {
46+
let mut total = Duration::ZERO;
47+
for _ in 0..iters {
48+
// The host buffer is created before the timer starts, so its allocation
// and fill are not part of the measured duration.
let source = BufferHandle::new_host(ByteBuffer::from(vec![0xA5; size]));
49+
let start = Instant::now();
50+
let handle = cuda_ctx
51+
.ensure_on_device_sync(source)
52+
.vortex_expect("ensure_on_device_sync");
53+
assert!(handle.is_on_device());
54+
// Keep the explicit sync here to ensure that we measure a sync copy. In
55+
// case the default buffer allocation strategy in the future changes to use
56+
// `cuMemHostAlloc`, the htod copy would change to being async, making the
57+
// function return immediately.
58+
cuda_ctx.stream().synchronize().expect("synchronize stream");
59+
// `handle` is still alive here; it is dropped at the end of the iteration,
// after the elapsed time has been recorded.
total += start.elapsed();
60+
}
61+
total
62+
});
63+
64+
drop(cuda_ctx);
65+
},
66+
);
67+
}
68+
69+
group.finish();
70+
}
71+
72+
// Declares the Criterion group `benches`, which runs `benchmark_load_to_device`
// with the configuration returned by the shared `bench_config::cuda_bench_config()`.
criterion::criterion_group! {
73+
name = benches;
74+
config = bench_config::cuda_bench_config();
75+
targets = benchmark_load_to_device
76+
}
77+
78+
// Benchmark entry point: `criterion_main!` generates `main` for the `benches` group.
// NOTE(review): presumably the `cuda_available` / `cuda_not_available` attributes from
// `vortex_cuda_macros` keep exactly one of the two items below depending on whether
// CUDA is detected at build time, leaving an empty `main` otherwise — confirm.
#[cuda_available]
79+
criterion::criterion_main!(benches);
80+
81+
#[cuda_not_available]
82+
fn main() {}

0 commit comments

Comments
 (0)