|
| 1 | +// SPDX-License-Identifier: Apache-2.0 |
| 2 | +// SPDX-FileCopyrightText: Copyright the Vortex contributors |
| 3 | + |
| 4 | +#![expect(clippy::expect_used)] |
| 5 | + |
| 6 | +mod bench_config; |
| 7 | +// Unused here but suppresses dead_code warning for the shared module. |
| 8 | +const _: &[(usize, &str)] = bench_config::BENCH_SIZES; |
| 9 | + |
| 10 | +use std::time::Duration; |
| 11 | +use std::time::Instant; |
| 12 | + |
| 13 | +use criterion::BenchmarkId; |
| 14 | +use criterion::Criterion; |
| 15 | +use criterion::Throughput; |
| 16 | +use vortex::array::buffer::BufferHandle; |
| 17 | +use vortex::buffer::ByteBuffer; |
| 18 | +use vortex::error::VortexExpect; |
| 19 | +use vortex::session::VortexSession; |
| 20 | +use vortex_cuda::CudaSession; |
| 21 | +use vortex_cuda_macros::cuda_available; |
| 22 | +use vortex_cuda_macros::cuda_not_available; |
| 23 | + |
/// Host buffer sizes exercised by the load benchmark.
///
/// Each entry is `(size in bytes, label)`; the label is used as the benchmark id parameter.
const LOAD_SIZES: &[(usize, &str)] = &[
    (16 << 20, "16MiB"),
    (64 << 20, "64MiB"),
    (256 << 20, "256MiB"),
    (1 << 30, "1GiB"),
];
| 30 | + |
| 31 | +fn benchmark_load_to_device(c: &mut Criterion) { |
| 32 | + let mut group = c.benchmark_group("cuda"); |
| 33 | + |
| 34 | + for &(size, size_name) in LOAD_SIZES { |
| 35 | + group.throughput(Throughput::Bytes(size as u64)); |
| 36 | + |
| 37 | + group.bench_with_input( |
| 38 | + BenchmarkId::new("cuda/load_to_device/ensure_on_device_sync", size_name), |
| 39 | + &size, |
| 40 | + |b, &size| { |
| 41 | + let session = VortexSession::empty(); |
| 42 | + let cuda_ctx = |
| 43 | + CudaSession::create_execution_ctx(&session).vortex_expect("cuda ctx"); |
| 44 | + |
| 45 | + b.iter_custom(|iters| { |
| 46 | + let mut total = Duration::ZERO; |
| 47 | + for _ in 0..iters { |
| 48 | + let source = BufferHandle::new_host(ByteBuffer::from(vec![0xA5; size])); |
| 49 | + let start = Instant::now(); |
| 50 | + let handle = cuda_ctx |
| 51 | + .ensure_on_device_sync(source) |
| 52 | + .vortex_expect("ensure_on_device_sync"); |
| 53 | + assert!(handle.is_on_device()); |
| 54 | + // Keep the explcit sync here to ensure that we measure a sync copy. In |
| 55 | + // case the default buffer allocation strategy in the future changes to use |
| 56 | + // `cuMemHostAlloc`, the htod copy would change to being async, making the |
| 57 | + // function return immediately. |
| 58 | + cuda_ctx.stream().synchronize().expect("synchronize stream"); |
| 59 | + total += start.elapsed(); |
| 60 | + } |
| 61 | + total |
| 62 | + }); |
| 63 | + |
| 64 | + drop(cuda_ctx); |
| 65 | + }, |
| 66 | + ); |
| 67 | + } |
| 68 | + |
| 69 | + group.finish(); |
| 70 | +} |
| 71 | + |
| 72 | +criterion::criterion_group! { |
| 73 | + name = benches; |
| 74 | + config = bench_config::cuda_bench_config(); |
| 75 | + targets = benchmark_load_to_device |
| 76 | +} |
| 77 | + |
| 78 | +#[cuda_available] |
| 79 | +criterion::criterion_main!(benches); |
| 80 | + |
| 81 | +#[cuda_not_available] |
| 82 | +fn main() {} |
0 commit comments