Skip to content

Commit 0cfea1d

Browse files
committed
Add Launch benchmarks
1 parent 79f930d commit 0cfea1d

File tree

5 files changed

+375
-2
lines changed

5 files changed

+375
-2
lines changed

cuda_bindings/benchmarks/.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,5 @@ __pycache__/
1212
# Override root .gitignore *.cpp rule (which targets Cython-generated files)
1313
!benchmarks/cpp/*.cpp
1414

15-
1615
results-python.json
1716
results-cpp.json
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import ctypes
6+
import time
7+
8+
from runner.runtime import alloc_persistent, compile_and_load, ensure_context
9+
10+
from cuda.bindings import driver as cuda
11+
12+
ensure_context()

# Kernel source compiled at import time.  The REP* macros expand to a
# comma-separated list of pointer parameters so that small_kernel_16_args
# ends up with 16 int* arguments (1 + 1 + 2 + 4 + 8), used to benchmark
# argument-packing overhead as the parameter count grows.
KERNEL_SOURCE = """\
extern "C" __global__ void empty_kernel() { return; }
extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; }

#define ITEM_PARAM(x, T) T x
#define REP1(x, T) , ITEM_PARAM(x, T)
#define REP2(x, T) REP1(x##0, T) REP1(x##1, T)
#define REP4(x, T) REP2(x##0, T) REP2(x##1, T)
#define REP8(x, T) REP4(x##0, T) REP4(x##1, T)
#define REP16(x, T) REP8(x##0, T) REP8(x##1, T)

extern "C" __global__
void small_kernel_16_args(
ITEM_PARAM(F, int*)
REP1(A, int*)
REP2(A, int*)
REP4(A, int*)
REP8(A, int*))
{ *F = 0; }
"""

MODULE = compile_and_load(KERNEL_SOURCE)


def _check(err):
    """Raise on a failed driver-API call.

    The error codes returned alongside each handle were previously
    discarded, so a failed lookup produced an invalid handle and confusing
    errors later; fail loudly at import time instead.
    """
    if err != cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"CUDA driver call failed: {err}")


# Get kernel handles (checked: benchmarking an invalid handle is useless).
_err, EMPTY_KERNEL = cuda.cuModuleGetFunction(MODULE, b"empty_kernel")
_check(_err)
_err, SMALL_KERNEL = cuda.cuModuleGetFunction(MODULE, b"small_kernel")
_check(_err)
_err, KERNEL_16_ARGS = cuda.cuModuleGetFunction(MODULE, b"small_kernel_16_args")
_check(_err)

# Create a non-blocking stream so benchmark launches do not synchronize
# with the legacy default stream.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
_check(_err)

# Allocate device memory for kernel arguments (persistent for the whole
# benchmark run; freed by the runner, not here).
FLOAT_PTR = alloc_persistent(ctypes.sizeof(ctypes.c_float))
INT_PTRS = [alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16)]

# Pre-pack ctypes params for the pre-packed benchmark: each PACKED_16 slot
# holds the host address of a c_void_p containing a device pointer — the
# void** layout cuLaunchKernel expects.  _val_ps must stay alive at module
# scope or PACKED_16 would hold dangling addresses.
_val_ps = [ctypes.c_void_p(int(p)) for p in INT_PTRS]
PACKED_16 = (ctypes.c_void_p * 16)()
for _i in range(16):
    PACKED_16[_i] = ctypes.addressof(_val_ps[_i])
55+
56+
57+
def bench_launch_empty_kernel(loops: int) -> float:
    """Time `loops` back-to-back launches of a zero-argument kernel.

    Returns total wall-clock seconds spent issuing the launches.  The
    stream is never synchronized, so only launch/API overhead is measured.
    """
    launch = cuda.cuLaunchKernel
    kernel, stream = EMPTY_KERNEL, STREAM

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, 0, 0)
    return time.perf_counter() - start
66+
67+
68+
def bench_launch_small_kernel(loops: int) -> float:
    """Time `loops` launches of a one-argument kernel.

    The (args, arg_types) tuple is constructed inside the timed loop, as
    a caller would on each launch, so per-launch argument handling is part
    of the measurement.  No stream synchronization: launch overhead only.
    """
    launch = cuda.cuLaunchKernel
    kernel, stream = SMALL_KERNEL, STREAM
    arg_values = (FLOAT_PTR,)
    arg_types = (None,)

    start = time.perf_counter()
    for _ in range(loops):
        launch(
            kernel, 1, 1, 1, 1, 1, 1, 0, stream, (arg_values, arg_types), 0
        )
    return time.perf_counter() - start
81+
82+
83+
def bench_launch_16_args(loops: int) -> float:
    """Time `loops` launches of the 16-pointer-argument kernel, letting the
    binding pack all 16 arguments on every launch.

    No stream synchronization: launch/packing overhead only.
    """
    launch = cuda.cuLaunchKernel
    kernel, stream = KERNEL_16_ARGS, STREAM
    arg_values = tuple(INT_PTRS)
    arg_types = (None,) * 16

    start = time.perf_counter()
    for _ in range(loops):
        launch(
            kernel, 1, 1, 1, 1, 1, 1, 0, stream, (arg_values, arg_types), 0
        )
    return time.perf_counter() - start
96+
97+
98+
def bench_launch_16_args_pre_packed(loops: int) -> float:
    """Time `loops` launches of the 16-argument kernel using the module-level
    pre-packed ctypes array (PACKED_16), bypassing per-launch argument
    packing in the binding.

    No stream synchronization: launch overhead only.
    """
    launch = cuda.cuLaunchKernel
    kernel, stream, packed = KERNEL_16_ARGS, STREAM, PACKED_16

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed, 0)
    return time.perf_counter() - start
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#include <cuda.h>
6+
#include <nvrtc.h>
7+
8+
#include "bench_support.hpp"
9+
10+
#include <cstdlib>
11+
#include <cstring>
12+
#include <iostream>
13+
#include <string>
14+
#include <vector>
15+
16+
17+
// Abort the process with `message` when a CUDA driver-API call fails.
static void check_cu(CUresult status, const char* message) {
    if (status == CUDA_SUCCESS) {
        return;
    }
    const char* error_name = nullptr;
    cuGetErrorName(status, &error_name);  // may leave error_name null for bogus codes
    std::cerr << message << ": " << (error_name != nullptr ? error_name : "unknown") << '\n';
    std::exit(1);
}
25+
26+
// Abort the process with `message` when an NVRTC call fails.
static void check_nvrtc(nvrtcResult status, const char* message) {
    if (status == NVRTC_SUCCESS) {
        return;
    }
    std::cerr << message << ": " << nvrtcGetErrorString(status) << '\n';
    std::exit(1);
}
32+
33+
// Compile `source` with NVRTC for the compute capability of `device` and
// load the resulting CUBIN as a driver-API module.  Exits the process on
// any compilation or loading failure, printing the NVRTC log when the
// compile itself fails.
static CUmodule compile_and_load(const char* source, CUdevice device) {
    int major = 0, minor = 0;
    check_cu(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device),
             "cuDeviceGetAttribute failed");
    check_cu(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device),
             "cuDeviceGetAttribute failed");

    nvrtcProgram prog;
    check_nvrtc(nvrtcCreateProgram(&prog, source, "benchmark_kernel.cu", 0, nullptr, nullptr),
                "nvrtcCreateProgram failed");

    // e.g. "--gpu-architecture=sm_90" for compute capability 9.0.
    std::string arch = "--gpu-architecture=sm_" + std::to_string(major) + std::to_string(minor);
    const char* opts[] = {"--fmad=false", arch.c_str()};
    nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, opts);

    // Print log on failure.  The extra '\0' element guards against an empty
    // log: streaming log.data() (null for a zero-sized vector) into
    // std::cerr would be undefined behavior.  Log-retrieval calls are now
    // checked, and the program handle is released before exiting.
    if (compile_result != NVRTC_SUCCESS) {
        size_t log_size = 0;
        check_nvrtc(nvrtcGetProgramLogSize(prog, &log_size), "nvrtcGetProgramLogSize failed");
        std::vector<char> log(log_size + 1, '\0');
        check_nvrtc(nvrtcGetProgramLog(prog, log.data()), "nvrtcGetProgramLog failed");
        std::cerr << "NVRTC compile failed:\n" << log.data() << '\n';
        nvrtcDestroyProgram(&prog);
        std::exit(1);
    }

    size_t cubin_size = 0;
    check_nvrtc(nvrtcGetCUBINSize(prog, &cubin_size), "nvrtcGetCUBINSize failed");
    std::vector<char> cubin(cubin_size);
    check_nvrtc(nvrtcGetCUBIN(prog, cubin.data()), "nvrtcGetCUBIN failed");
    check_nvrtc(nvrtcDestroyProgram(&prog), "nvrtcDestroyProgram failed");

    CUmodule module;
    check_cu(cuModuleLoadData(&module, cubin.data()), "cuModuleLoadData failed");
    return module;
}
68+
69+
70+
static const char* KERNEL_SOURCE = R"(
71+
extern "C" __global__ void empty_kernel() { return; }
72+
extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; }
73+
74+
#define ITEM_PARAM(x, T) T x
75+
#define REP1(x, T) , ITEM_PARAM(x, T)
76+
#define REP2(x, T) REP1(x##0, T) REP1(x##1, T)
77+
#define REP4(x, T) REP2(x##0, T) REP2(x##1, T)
78+
#define REP8(x, T) REP4(x##0, T) REP4(x##1, T)
79+
#define REP16(x, T) REP8(x##0, T) REP8(x##1, T)
80+
81+
extern "C" __global__
82+
void small_kernel_16_args(
83+
ITEM_PARAM(F, int*)
84+
REP1(A, int*)
85+
REP2(A, int*)
86+
REP4(A, int*)
87+
REP8(A, int*))
88+
{ *F = 0; }
89+
)";
90+
91+
92+
// Entry point: compiles the benchmark kernels, allocates their device-side
// arguments, runs the four launch benchmarks through bench::BenchmarkSuite,
// then tears everything down and writes the results.
int main(int argc, char** argv) {
    bench::Options options = bench::parse_args(argc, argv);

    // Setup
    check_cu(cuInit(0), "cuInit failed");

    CUdevice device;
    check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed");

    CUcontext ctx;
    // NOTE(review): the 4-argument cuCtxCreate overload taking
    // CUctxCreateParams is a recent driver-API addition — confirm it is
    // available in the project's minimum supported CUDA toolkit.
    CUctxCreateParams ctxParams = {};
    check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed");

    CUmodule module = compile_and_load(KERNEL_SOURCE, device);

    CUfunction empty_kernel, small_kernel, kernel_16_args;
    check_cu(cuModuleGetFunction(&empty_kernel, module, "empty_kernel"), "GetFunction failed");
    check_cu(cuModuleGetFunction(&small_kernel, module, "small_kernel"), "GetFunction failed");
    check_cu(cuModuleGetFunction(&kernel_16_args, module, "small_kernel_16_args"), "GetFunction failed");

    // Non-blocking stream so launches do not synchronize with the legacy
    // default stream.
    CUstream stream;
    check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed");

    // Allocate device memory for arguments
    CUdeviceptr float_ptr;
    check_cu(cuMemAlloc(&float_ptr, sizeof(float)), "cuMemAlloc failed");

    CUdeviceptr int_ptrs[16];
    for (int i = 0; i < 16; ++i) {
        check_cu(cuMemAlloc(&int_ptrs[i], sizeof(int)), "cuMemAlloc failed");
    }

    // Pre-pack kernel params for the pre-packed benchmark: each slot holds
    // the host address of a CUdeviceptr, the void** layout cuLaunchKernel
    // expects.  int_ptrs outlives every launch below.
    void* packed_16[16];
    for (int i = 0; i < 16; ++i) {
        packed_16[i] = &int_ptrs[i];
    }

    bench::BenchmarkSuite suite(options);

    // --- launch_empty_kernel: launch overhead with no kernel parameters ---
    {
        suite.run("launch.launch_empty_kernel", [&]() {
            check_cu(
                cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr),
                "cuLaunchKernel failed"
            );
        });
    }

    // --- launch_small_kernel: one pointer argument ---
    {
        void* params[] = {&float_ptr};
        suite.run("launch.launch_small_kernel", [&]() {
            check_cu(
                cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr),
                "cuLaunchKernel failed"
            );
        });
    }

    // --- launch_16_args: sixteen pointer arguments ---
    {
        suite.run("launch.launch_16_args", [&]() {
            check_cu(
                cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
                "cuLaunchKernel failed"
            );
        });
    }

    // --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) ---
    // In C++ the params are always pre-packed, so this is identical to launch_16_args.
    // We include it for naming parity with the Python benchmark.
    {
        suite.run("launch.launch_16_args_pre_packed", [&]() {
            check_cu(
                cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
                "cuLaunchKernel failed"
            );
        });
    }

    // Cleanup (cuMemFree/cuModuleUnload/cuCtxDestroy after all benchmark
    // launches have been issued on `stream`).
    for (int i = 0; i < 16; ++i) {
        check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed");
    }
    check_cu(cuMemFree(float_ptr), "cuMemFree failed");
    check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed");
    check_cu(cuModuleUnload(module), "cuModuleUnload failed");
    check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");

    // Persist results after all CUDA resources are released.
    suite.write();

    return 0;
}

cuda_bindings/benchmarks/pixi.lock

Lines changed: 30 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)