|
| 1 | +// SPDX-License-Identifier: Apache-2.0 |
| 2 | +// SPDX-FileCopyrightText: Copyright the Vortex contributors |
| 3 | + |
| 4 | +use std::time::Duration; |
| 5 | + |
| 6 | +use criterion::Criterion; |
| 7 | + |
| 8 | +/// Returns a [`Criterion`] configuration tuned for CUDA benchmarks. |
| 9 | +/// |
| 10 | +/// All benchmarks use `iter_custom` with precise CUDA event timing. |
| 11 | +/// criterion's iteration planner estimates `iters` from **wall time** during |
| 12 | +/// warmup, which includes GPU context setup and memory copies — not just |
| 13 | +/// the kernel. Setting `measurement_time = 1ns` forces `iters = 1` so |
| 14 | +/// each sample is exactly one `iter_custom` call returning GPU-timed duration. |
| 15 | +/// Stability comes from a high `sample_size` (many independent launches) |
| 16 | +/// rather than many iterations per sample. |
| 17 | +/// |
| 18 | +/// `warm_up_time` runs at least one full iteration before sampling, giving |
| 19 | +/// the GPU a chance to reach steady state (clock boost, cache warming). |
| 20 | +/// If a single launch exceeds the warm-up budget, criterion still completes |
| 21 | +/// it before moving on. |
| 22 | +pub(super) fn cuda_bench_config() -> Criterion { |
| 23 | + // Number of independent kernel launches. |
| 24 | + let sample_size = 10; |
| 25 | + |
| 26 | + Criterion::default() |
| 27 | + .without_plots() |
| 28 | + .sample_size(sample_size) |
| 29 | + // One ns is enough to JIT-compile kernels and warm GPU caches. |
| 30 | + // Criterion always finishes the in-flight iteration even if this |
| 31 | + // budget is exceeded. |
| 32 | + .warm_up_time(Duration::from_nanos(1)) |
| 33 | + // Forces `iters = 1`: criterion's planner estimates iteration cost |
| 34 | + // from wall time (which includes GPU context setup), not the |
| 35 | + // GPU-timed duration returned by `iter_custom`. A real |
| 36 | + // measurement_time would cause wildly inflated iteration counts. |
| 37 | + .measurement_time(Duration::from_nanos(1)) |
| 38 | +} |
0 commit comments