Skip to content

Commit a2fd579

Browse files
committed
Merge branch 'main' into fix-pixi-run-test
2 parents d9668c6 + 6d4e82d commit a2fd579

44 files changed

Lines changed: 1392 additions & 262 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*.h binary
66
*.hpp binary
77
# Exception: headers we own
8+
benchmarks/cuda_bindings/benchmarks/cpp/*.hpp -binary text diff
89
cuda_bindings/cuda/bindings/_bindings/*.h -binary text diff
910
cuda_bindings/cuda/bindings/_lib/*.h -binary text diff
1011
cuda_core/cuda/core/_cpp/*.h -binary text diff

.spdx-ignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,7 @@ cuda_bindings/examples/*
1010

1111
# Vendored
1212
cuda_core/cuda/core/_include/dlpack.h
13+
cuda_core/cuda/core/_include/aoti_shim.h
14+
cuda_core/cuda/core/_include/aoti_shim.def
1315

1416
qa/ctk-next.drawio.svg

benchmarks/cuda_bindings/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,14 @@ To run the benchmarks combine the environment and task:
4747
```bash
4848
# Run the Python benchmarks in the wheel environment
4949
pixi run -e wheel bench
50+
pixi run -e wheel bench --min-time 0.1
5051

5152
# Run the Python benchmarks in the source environment
5253
pixi run -e source bench
5354

5455
# Run the C++ benchmarks
5556
pixi run -e wheel bench-cpp
57+
pixi run -e wheel bench-cpp --min-time 0.1
5658
```
5759

5860
Both runners automatically save results to JSON files in the benchmarks

benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ int main(int argc, char** argv) {
4545
check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed");
4646

4747
bench::BenchmarkSuite suite(options);
48+
// Drain the persistent stream after calibration so event_record (which
49+
// enqueues onto the stream) and event_synchronize start from a known state.
50+
suite.set_post_calibrate([&]() {
51+
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
52+
});
4853

4954
// --- event_create_destroy ---
5055
{

benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,12 @@ int main(int argc, char** argv) {
238238
void* struct_params[] = {&struct_2048B};
239239

240240
bench::BenchmarkSuite suite(options);
241+
// After calibration, drain the persistent stream so the first measured
242+
// sample does not start on a backlogged stream. Calibration for enqueue-
243+
// style ops (kernel launches) may queue many thousands of operations.
244+
suite.set_post_calibrate([&]() {
245+
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
246+
});
241247

242248
suite.run("launch.launch_empty_kernel", [&]() {
243249
check_cu(cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr),

benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ int main(int argc, char** argv) {
5252
uint8_t host_dst[COPY_SIZE] = {};
5353

5454
bench::BenchmarkSuite suite(options);
55+
// Drain the persistent stream after calibration so async benchmarks
56+
// (mem_alloc_async_free_async) don't start measurement on a backlogged stream.
57+
suite.set_post_calibrate([&]() {
58+
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
59+
});
5560

5661
// --- mem_alloc_free ---
5762
{

benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ int main(int argc, char** argv) {
3838
check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed");
3939

4040
bench::BenchmarkSuite suite(options);
41+
// Drain the persistent stream after calibration for completeness.
42+
// stream_create_destroy uses a local stream, but stream_query/synchronize
43+
// observe the persistent one.
44+
suite.set_post_calibrate([&]() {
45+
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
46+
});
4147

4248
// --- stream_create_destroy ---
4349
{

benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp

Lines changed: 170 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@
66

77
#include <chrono>
88
#include <cmath>
9+
#include <algorithm>
910
#include <cstdint>
1011
#include <cstdlib>
1112
#include <ctime>
1213
#include <fstream>
14+
#include <functional>
1315
#include <iomanip>
1416
#include <iostream>
1517
#include <string>
@@ -22,6 +24,12 @@ struct Options {
2224
std::uint64_t warmups = 5;
2325
std::uint64_t values = 20;
2426
std::uint64_t runs = 20;
27+
double min_time_sec = 0.0;
28+
// Safety cap for the calibration doubling loop. Set high enough that even
29+
// sub-nanosecond ops can reach typical --min-time targets (e.g. 100ms).
30+
// A warning is printed if calibration hits this cap before reaching min-time.
31+
std::uint64_t max_loops = 100000000;
32+
std::uint64_t calibrate_rounds = 3;
2533
std::string output_path;
2634
std::string benchmark_name;
2735
};
@@ -46,6 +54,18 @@ inline Options parse_args(int argc, char** argv) {
4654
options.warmups = std::strtoull(argv[++i], nullptr, 10);
4755
continue;
4856
}
57+
if (arg == "--min-time" && i + 1 < argc) {
58+
options.min_time_sec = std::strtod(argv[++i], nullptr);
59+
continue;
60+
}
61+
if (arg == "--max-loops" && i + 1 < argc) {
62+
options.max_loops = std::strtoull(argv[++i], nullptr, 10);
63+
continue;
64+
}
65+
if (arg == "--calibrate-rounds" && i + 1 < argc) {
66+
options.calibrate_rounds = std::strtoull(argv[++i], nullptr, 10);
67+
continue;
68+
}
4969
if (arg == "--values" && i + 1 < argc) {
5070
options.values = std::strtoull(argv[++i], nullptr, 10);
5171
continue;
@@ -68,6 +88,9 @@ inline Options parse_args(int argc, char** argv) {
6888
<< " --warmups N Warmup values per run (default: 5)\n"
6989
<< " --values N Timed values per run (default: 20)\n"
7090
<< " --runs N Number of runs (default: 20)\n"
91+
<< " --min-time S Calibrate loops to reach S seconds per value\n"
92+
<< " --max-loops N Safety cap for calibration loop count (default: 100000000)\n"
93+
<< " --calibrate-rounds N Calibration passes (default: 3)\n"
7194
<< " -o, --output F Write pyperf-compatible JSON to file\n"
7295
<< " --name S Benchmark name (overrides default)\n";
7396
std::exit(0);
@@ -93,6 +116,82 @@ inline std::string iso_now() {
93116
return std::string(buf);
94117
}
95118

119+
// Calibrate loop count to hit a minimum wall time per value.
120+
// Returns the chosen loop count. If `capped_out` is non-null, it is set to
121+
// true when calibration reached `max_loops` before hitting `min_time_sec`
122+
// (meaning --min-time was NOT actually satisfied by the calibration).
123+
template <typename Fn>
124+
std::uint64_t calibrate_loops(
125+
const Options& options,
126+
Fn&& fn,
127+
const std::function<void()>& post_calibrate = {},
128+
bool* capped_out = nullptr,
129+
double* last_elapsed_out = nullptr
130+
) {
131+
if (options.min_time_sec <= 0.0) {
132+
if (capped_out) *capped_out = false;
133+
if (last_elapsed_out) *last_elapsed_out = 0.0;
134+
return options.loops;
135+
}
136+
137+
// Allow callers (e.g. the explicit-loop overload) to request a minimum
138+
// starting loop count via options.loops.
139+
const std::uint64_t start_loops = std::max<std::uint64_t>(1, options.loops);
140+
const std::uint64_t max_loops = std::max<std::uint64_t>(start_loops, options.max_loops);
141+
const std::uint64_t rounds = std::max<std::uint64_t>(1, options.calibrate_rounds);
142+
143+
// Track the round that produced the best (largest) loop count so the
144+
// returned loop count, capped flag, and last-elapsed time all describe
145+
// the same round.
146+
std::uint64_t best_loops = 0;
147+
bool best_capped = false;
148+
double best_elapsed = 0.0;
149+
150+
for (std::uint64_t round = 0; round < rounds; ++round) {
151+
std::uint64_t loops = start_loops;
152+
bool round_capped = false;
153+
double elapsed = 0.0;
154+
155+
while (true) {
156+
const auto t0 = std::chrono::steady_clock::now();
157+
for (std::uint64_t i = 0; i < loops; ++i) {
158+
fn();
159+
}
160+
const auto t1 = std::chrono::steady_clock::now();
161+
elapsed = std::chrono::duration<double>(t1 - t0).count();
162+
163+
// Drain any state left behind by this probe (e.g. queued async
164+
// work on a persistent stream) before the next probe, the next
165+
// round, or the first measured warmup/value runs.
166+
if (post_calibrate) {
167+
post_calibrate();
168+
}
169+
if (elapsed >= options.min_time_sec) {
170+
break;
171+
}
172+
if (loops >= max_loops) {
173+
round_capped = true;
174+
break;
175+
}
176+
if (loops > max_loops / 2) {
177+
loops = max_loops;
178+
} else {
179+
loops *= 2;
180+
}
181+
}
182+
183+
if (loops >= best_loops) {
184+
best_loops = loops;
185+
best_capped = round_capped;
186+
best_elapsed = elapsed;
187+
}
188+
}
189+
190+
if (capped_out) *capped_out = best_capped;
191+
if (last_elapsed_out) *last_elapsed_out = best_elapsed;
192+
return best_loops;
193+
}
194+
96195
// Run a benchmark function. The function signature is: void fn() — one call = one operation.
97196
// The harness calls fn() in a tight loop `loops` times per value.
98197
template <typename Fn>
@@ -235,22 +334,57 @@ class BenchmarkSuite {
235334
public:
236335
explicit BenchmarkSuite(Options options) : options_(std::move(options)) {}
237336

337+
// Post-calibration hook. If set, invoked between calibration probes so
338+
// async benchmarks can drain state left behind by each probe before the
339+
// next one runs. The final probe leaves the benchmark in a drained state
340+
// before the first measured warmup/value. Can be overridden per-call via
341+
// the `post_calibrate` parameter on `run()`.
342+
void set_post_calibrate(std::function<void()> hook) {
343+
post_calibrate_ = std::move(hook);
344+
}
345+
238346
// Run a benchmark and record it. The name is used as the benchmark ID.
347+
// If --min-time is set, loop count is auto-calibrated. `post_calibrate`,
348+
// if provided, runs between calibration probes to reset async state.
239349
template <typename Fn>
240-
void run(const std::string& name, Fn&& fn) {
241-
auto results = run_benchmark(options_, std::forward<Fn>(fn));
350+
void run(
351+
const std::string& name,
352+
Fn&& fn,
353+
std::function<void()> post_calibrate = {}
354+
) {
355+
std::uint64_t loops = options_.loops;
356+
Options custom = options_;
357+
if (options_.min_time_sec > 0.0) {
358+
loops = calibrate_and_warn(name, options_, fn, select_post_calibrate(post_calibrate));
359+
custom.loops = loops;
360+
}
361+
auto results = run_benchmark(custom, std::forward<Fn>(fn));
242362
print_summary(name, results);
243-
entries_.push_back({name, options_.loops, std::move(results)});
363+
entries_.push_back({name, loops, std::move(results)});
244364
}
245365

246-
// Run a benchmark with a custom loop count (for slow operations like compilation).
366+
// Run a benchmark with a custom loop count (used as a floor for fast ops
367+
// or a fixed count for slow ops like compilation). When --min-time is set,
368+
// calibration still runs but starts from `loops_override` as the minimum.
247369
template <typename Fn>
248-
void run(const std::string& name, std::uint64_t loops_override, Fn&& fn) {
370+
void run(
371+
const std::string& name,
372+
std::uint64_t loops_override,
373+
Fn&& fn,
374+
std::function<void()> post_calibrate = {}
375+
) {
376+
std::uint64_t loops = loops_override;
249377
Options custom = options_;
250378
custom.loops = loops_override;
379+
if (options_.min_time_sec > 0.0) {
380+
Options calib_opts = options_;
381+
calib_opts.loops = loops_override; // floor
382+
loops = calibrate_and_warn(name, calib_opts, fn, select_post_calibrate(post_calibrate));
383+
custom.loops = loops;
384+
}
251385
auto results = run_benchmark(custom, std::forward<Fn>(fn));
252386
print_summary(name, results);
253-
entries_.push_back({name, loops_override, std::move(results)});
387+
entries_.push_back({name, loops, std::move(results)});
254388
}
255389

256390
// Write all collected benchmarks to the output file (if -o was given).
@@ -263,6 +397,36 @@ class BenchmarkSuite {
263397
private:
264398
Options options_;
265399
std::vector<BenchmarkEntry> entries_;
400+
std::function<void()> post_calibrate_;
401+
402+
std::function<void()> select_post_calibrate(const std::function<void()>& per_call) const {
403+
if (per_call) {
404+
return per_call;
405+
}
406+
return post_calibrate_;
407+
}
408+
409+
template <typename Fn>
410+
std::uint64_t calibrate_and_warn(
411+
const std::string& name,
412+
const Options& calib_opts,
413+
Fn&& fn,
414+
const std::function<void()>& post_calibrate
415+
) const {
416+
bool capped = false;
417+
double last_elapsed = 0.0;
418+
std::uint64_t loops = calibrate_loops(
419+
calib_opts, std::forward<Fn>(fn), post_calibrate, &capped, &last_elapsed
420+
);
421+
if (capped) {
422+
std::cerr << "WARNING: " << name
423+
<< ": calibration hit --max-loops (" << calib_opts.max_loops
424+
<< ") before reaching --min-time (" << calib_opts.min_time_sec
425+
<< "s). Last sample: " << last_elapsed
426+
<< "s. Raise --max-loops to satisfy --min-time for this benchmark.\n";
427+
}
428+
return loops;
429+
}
266430

267431
static void write_multi_pyperf_json(
268432
const std::string& output_path,

0 commit comments

Comments
 (0)