Skip to content

Commit b647a21

Browse files
authored
ci[gpu]: run cuda micro-benchmarks with codspeed (#7696)
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent d2d79f0 commit b647a21

13 files changed

Lines changed: 115 additions & 62 deletions

File tree

.github/workflows/codspeed.yml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,43 @@ jobs:
6565
run: bash scripts/bench-taskset.sh cargo codspeed run
6666
token: ${{ secrets.CODSPEED_TOKEN }}
6767
mode: "simulation"
68+
69+
bench-codspeed-cuda:
70+
if: github.repository == 'vortex-data/vortex'
71+
strategy:
72+
matrix:
73+
include:
74+
- { shard: 1, name: "Bitpacked", benches: "bitpacked_cuda" }
75+
- { shard: 2, name: "Dynamic dispatch", benches: "dynamic_dispatch_cuda" }
76+
- { shard: 3, name: "Standalone kernels", benches: "alp_cuda date_time_parts_cuda dict_cuda for_cuda runend_cuda throughput_cuda" }
77+
- { shard: 4, name: "NVIDIA kernels", benches: "filter_cuda zstd_cuda" }
78+
name: "Benchmark with Codspeed (CUDA Shard #${{ matrix.shard }} - ${{ matrix.name }})"
79+
timeout-minutes: 30
80+
runs-on: runs-on=${{ github.run_id }}/family=g5/image=ubuntu24-gpu-x64/tag=bench-codspeed-cuda-${{ matrix.shard }}
81+
steps:
82+
- uses: runs-on/action@v2
83+
with:
84+
sccache: s3
85+
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
86+
- uses: ./.github/actions/setup-rust
87+
with:
88+
repo-token: ${{ secrets.GITHUB_TOKEN }}
89+
- name: Display NVIDIA SMI details
90+
run: |
91+
nvidia-smi
92+
nvidia-smi -L
93+
nvidia-smi -q -d Memory
94+
- name: Install Codspeed
95+
uses: taiki-e/cache-cargo-install-action@66c9585ef5ca780ee69399975a5e911f47905995
96+
with:
97+
tool: cargo-codspeed
98+
- name: Build benchmarks
99+
run: cargo codspeed build -m walltime -p vortex-cuda --profile bench
100+
- name: Run benchmarks
101+
uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2
102+
env:
103+
CARGO_MANIFEST_DIR: ${{ github.workspace }}/vortex-cuda
104+
with:
105+
run: cargo codspeed run $(printf -- '--bench %s ' ${{ matrix.benches }})
106+
token: ${{ secrets.CODSPEED_TOKEN }}
107+
mode: "walltime"

vortex-cuda/benches/alp_cuda.rs

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
#![expect(clippy::unwrap_used)]
77
#![expect(clippy::cast_possible_truncation)]
88

9-
mod common;
9+
mod bench_config;
10+
mod timed_launch_strategy;
1011

1112
use std::mem::size_of;
1213
use std::sync::Arc;
@@ -37,7 +38,7 @@ use vortex_cuda::executor::CudaArrayExt;
3738
use vortex_cuda_macros::cuda_available;
3839
use vortex_cuda_macros::cuda_not_available;
3940

40-
use crate::common::TimedLaunchStrategy;
41+
use crate::timed_launch_strategy::TimedLaunchStrategy;
4142

4243
const N_ROWS: usize = 100_000_000;
4344

@@ -133,11 +134,7 @@ fn benchmark_alp_decode(c: &mut Criterion) {
133134

134135
criterion::criterion_group! {
135136
name = benches;
136-
config = Criterion::default().without_plots()
137-
.sample_size(10)
138-
.warm_up_time(Duration::from_nanos(1))
139-
.measurement_time(Duration::from_nanos(1))
140-
.nresamples(10);
137+
config = bench_config::cuda_bench_config();
141138
targets = benchmark_alp_decode
142139
}
143140

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use std::time::Duration;
5+
6+
use criterion::Criterion;
7+
8+
/// Returns a [`Criterion`] configuration tuned for CUDA benchmarks.
9+
///
10+
/// All benchmarks use `iter_custom` with precise CUDA event timing.
11+
/// criterion's iteration planner estimates `iters` from **wall time** during
12+
/// warmup, which includes GPU context setup and memory copies — not just
13+
/// the kernel. Setting `measurement_time = 1ns` forces `iters = 1` so
14+
/// each sample is exactly one `iter_custom` call returning GPU-timed duration.
15+
/// Stability comes from taking `sample_size` independent samples (one launch each)
16+
/// rather than many iterations per sample.
17+
///
18+
/// `warm_up_time` runs at least one full iteration before sampling, giving
19+
/// the GPU a chance to reach steady state (clock boost, cache warming).
20+
/// If a single launch exceeds the warm-up budget, criterion still completes
21+
/// it before moving on.
22+
pub(super) fn cuda_bench_config() -> Criterion {
23+
// Number of independent kernel launches.
24+
let sample_size = 10;
25+
26+
Criterion::default()
27+
.without_plots()
28+
.sample_size(sample_size)
29+
// The 1 ns budget is nominal: criterion always finishes the in-flight
30+
// iteration even if the budget is exceeded, so at least one full launch
31+
// runs as warm-up — enough to JIT-compile kernels and warm GPU caches.
32+
.warm_up_time(Duration::from_nanos(1))
33+
// Forces `iters = 1`: criterion's planner estimates iteration cost
34+
// from wall time (which includes GPU context setup), not the
35+
// GPU-timed duration returned by `iter_custom`. A real
36+
// measurement_time would cause wildly inflated iteration counts.
37+
.measurement_time(Duration::from_nanos(1))
38+
}

vortex-cuda/benches/bitpacked_cuda.rs

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
#![expect(clippy::unwrap_used)]
77
#![expect(clippy::cast_possible_truncation)]
88

9-
mod common;
9+
mod bench_config;
10+
mod timed_launch_strategy;
1011

1112
use std::mem::size_of;
1213
use std::ops::Add;
@@ -37,7 +38,7 @@ use vortex_cuda::executor::CudaArrayExt;
3738
use vortex_cuda_macros::cuda_available;
3839
use vortex_cuda_macros::cuda_not_available;
3940

40-
use crate::common::TimedLaunchStrategy;
41+
use crate::timed_launch_strategy::TimedLaunchStrategy;
4142

4243
const N_ROWS: usize = 100_000_000;
4344

@@ -199,11 +200,7 @@ fn benchmark_bitunpack_with_patches(c: &mut Criterion) {
199200

200201
criterion::criterion_group! {
201202
name = benches;
202-
config = Criterion::default().without_plots()
203-
.sample_size(10)
204-
.warm_up_time(Duration::from_nanos(1))
205-
.measurement_time(Duration::from_nanos(1))
206-
.nresamples(10);
203+
config = bench_config::cuda_bench_config();
207204
targets = benchmark_bitunpack, benchmark_bitunpack_with_patches
208205
}
209206

vortex-cuda/benches/date_time_parts_cuda.rs

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
#![expect(clippy::unwrap_used)]
77
#![expect(clippy::cast_possible_truncation)]
88

9-
mod common;
9+
mod bench_config;
10+
mod timed_launch_strategy;
1011

1112
use std::mem::size_of;
1213
use std::sync::Arc;
@@ -36,7 +37,7 @@ use vortex_cuda::executor::CudaArrayExt;
3637
use vortex_cuda_macros::cuda_available;
3738
use vortex_cuda_macros::cuda_not_available;
3839

39-
use crate::common::TimedLaunchStrategy;
40+
use crate::timed_launch_strategy::TimedLaunchStrategy;
4041

4142
fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArray {
4243
let days: Vec<i16> = (0..len).map(|i| (i / 1000) as i16).collect();
@@ -89,11 +90,7 @@ fn benchmark_datetimeparts(c: &mut Criterion) {
8990

9091
criterion::criterion_group! {
9192
name = benches;
92-
config = Criterion::default().without_plots()
93-
.sample_size(10)
94-
.warm_up_time(Duration::from_nanos(1))
95-
.measurement_time(Duration::from_nanos(1))
96-
.nresamples(10);
93+
config = bench_config::cuda_bench_config();
9794
targets = benchmark_datetimeparts
9895
}
9996

vortex-cuda/benches/dict_cuda.rs

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
#![expect(clippy::unwrap_used)]
77
#![expect(clippy::cast_possible_truncation)]
88

9-
mod common;
9+
mod bench_config;
10+
mod timed_launch_strategy;
1011

1112
use std::fmt::Debug;
1213
use std::mem::size_of;
@@ -33,7 +34,7 @@ use vortex_cuda::executor::CudaArrayExt;
3334
use vortex_cuda_macros::cuda_available;
3435
use vortex_cuda_macros::cuda_not_available;
3536

36-
use crate::common::TimedLaunchStrategy;
37+
use crate::timed_launch_strategy::TimedLaunchStrategy;
3738

3839
const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")];
3940

@@ -160,11 +161,7 @@ fn benchmark_dict(c: &mut Criterion) {
160161

161162
criterion::criterion_group! {
162163
name = benches;
163-
config = Criterion::default().without_plots()
164-
.sample_size(10)
165-
.warm_up_time(Duration::from_nanos(1))
166-
.measurement_time(Duration::from_nanos(1))
167-
.nresamples(10);
164+
config = bench_config::cuda_bench_config();
168165
targets = benchmark_dict
169166
}
170167

vortex-cuda/benches/dynamic_dispatch_cuda.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
#![expect(clippy::cast_possible_truncation)]
66
#![expect(clippy::expect_used)]
77

8+
mod bench_config;
9+
810
use std::marker::PhantomData;
911
use std::mem::size_of;
1012
use std::sync::Arc;
@@ -650,11 +652,7 @@ fn benchmark_dynamic_dispatch(c: &mut Criterion) {
650652

651653
criterion::criterion_group! {
652654
name = benches;
653-
config = Criterion::default().without_plots()
654-
.sample_size(10)
655-
.warm_up_time(Duration::from_nanos(1))
656-
.measurement_time(Duration::from_nanos(1))
657-
.nresamples(10);
655+
config = bench_config::cuda_bench_config();
658656
targets = benchmark_dynamic_dispatch
659657
}
660658

vortex-cuda/benches/filter_cuda.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
#![expect(clippy::unwrap_used)]
77
#![expect(clippy::cast_possible_truncation)]
88

9+
mod bench_config;
10+
911
use std::ffi::c_void;
1012
use std::fmt::Debug;
1113
use std::mem::size_of;
@@ -226,11 +228,7 @@ fn benchmark_filter(c: &mut Criterion) {
226228

227229
criterion::criterion_group! {
228230
name = benches;
229-
config = Criterion::default().without_plots()
230-
.sample_size(10)
231-
.warm_up_time(Duration::from_nanos(1))
232-
.measurement_time(Duration::from_nanos(1))
233-
.nresamples(10);
231+
config = bench_config::cuda_bench_config();
234232
targets = benchmark_filter
235233
}
236234

vortex-cuda/benches/for_cuda.rs

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
#![expect(clippy::unwrap_used)]
77
#![expect(clippy::cast_possible_truncation)]
88

9-
mod common;
9+
mod bench_config;
10+
mod timed_launch_strategy;
1011

1112
use std::mem::size_of;
1213
use std::ops::Add;
@@ -39,7 +40,7 @@ use vortex_cuda::executor::CudaArrayExt;
3940
use vortex_cuda_macros::cuda_available;
4041
use vortex_cuda_macros::cuda_not_available;
4142

42-
use crate::common::TimedLaunchStrategy;
43+
use crate::timed_launch_strategy::TimedLaunchStrategy;
4344

4445
const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")];
4546
const REFERENCE_VALUE: u8 = 10;
@@ -166,11 +167,7 @@ fn benchmark_ffor(c: &mut Criterion) {
166167

167168
criterion::criterion_group! {
168169
name = benches;
169-
config = Criterion::default().without_plots()
170-
.sample_size(10)
171-
.warm_up_time(Duration::from_nanos(1))
172-
.measurement_time(Duration::from_nanos(1))
173-
.nresamples(10);
170+
config = bench_config::cuda_bench_config();
174171
targets = benchmark_for, benchmark_ffor
175172
}
176173

vortex-cuda/benches/runend_cuda.rs

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
#![expect(clippy::unwrap_used)]
77
#![expect(clippy::cast_possible_truncation)]
88

9-
mod common;
9+
mod bench_config;
10+
mod timed_launch_strategy;
1011

1112
use std::mem::size_of;
1213
use std::sync::Arc;
@@ -32,7 +33,7 @@ use vortex_cuda::executor::CudaArrayExt;
3233
use vortex_cuda_macros::cuda_available;
3334
use vortex_cuda_macros::cuda_not_available;
3435

35-
use crate::common::TimedLaunchStrategy;
36+
use crate::timed_launch_strategy::TimedLaunchStrategy;
3637

3738
/// Creates a run-end encoded array with the specified output length and average run length.
3839
fn make_runend_array_typed<T>(
@@ -117,11 +118,7 @@ fn benchmark_runend(c: &mut Criterion) {
117118

118119
criterion::criterion_group! {
119120
name = benches;
120-
config = Criterion::default().without_plots()
121-
.sample_size(10)
122-
.warm_up_time(Duration::from_nanos(1))
123-
.measurement_time(Duration::from_nanos(1))
124-
.nresamples(10);
121+
config = bench_config::cuda_bench_config();
125122
targets = benchmark_runend
126123
}
127124

0 commit comments

Comments
 (0)