Skip to content

Commit 7a9a248

Browse files
authored
Add more cuda.bindings latency benchmarks (#1856)
1 parent 6a162a2 commit 7a9a248

21 files changed

+1640
-51
lines changed

cuda_bindings/benchmarks/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ __pycache__/
1111

1212
# Override root .gitignore *.cpp rule (which targets Cython-generated files)
1313
!benchmarks/cpp/*.cpp
14+
15+
results-python.json
16+
results-cpp.json

cuda_bindings/benchmarks/README.md

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,17 @@
1-
# cuda.bindings Benchmarks
1+
# cuda.bindings benchmarks
2+
3+
These benchmarks are intended to measure the latency overhead of calling CUDA
4+
Driver APIs through cuda.bindings, relative to a similar C++ baseline.
5+
6+
The goal is to benchmark how much overhead the Python layer adds to calling
7+
CUDA APIs and what operations are not in our target of less than 1us of overhead.
8+
9+
Each Python benchmark has a C++ counterpart, which is used to compare the
10+
operations. We try to make each implementation perform small operations
11+
and do as nearly the same work as possible, run under similar conditions.
12+
13+
These are **not** throughput benchmarks to measure the overall performance
14+
of kernels and applications.
215

316
## Usage
417

@@ -32,26 +45,30 @@ sudo $(pixi run -e wheel -- which python) -m pyperf system tune
3245
To run the benchmarks combine the environment and task:
3346

3447
```bash
35-
3648
# Run the Python benchmarks in the wheel environment
3749
pixi run -e wheel bench
3850

3951
# Run the Python benchmarks in the source environment
4052
pixi run -e source bench
4153

42-
# Run the C++ benchmarks (environment is irrelavant here)
54+
# Run the C++ benchmarks
4355
pixi run -e wheel bench-cpp
4456
```
4557

46-
## pyperf JSON
58+
Both runners automatically save results to JSON files in the benchmarks
59+
directory: `results-python.json` and `results-cpp.json`.
4760

48-
The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
49-
The results are written to a JSON file in the format expected by pyperf.
61+
## Output JSON and analysis
5062

51-
The C++ benchmarks also generate a valid JSON file, in the same format.
63+
The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
64+
Both Python and C++ results are saved in pyperf-compatible JSON format,
65+
which can be analyzed with pyperf commands:
5266

53-
```
54-
pixi run -e wheel bench-cpp -0 cpp.json
67+
```bash
68+
# Show results and statistics
69+
pixi run -e wheel -- python -m pyperf stats results-python.json
70+
pixi run -e wheel -- python -m pyperf stats results-cpp.json
5571

56-
pixi run -e wheel pyperf stats cpp.json
72+
# Compare C++ vs Python results
73+
pixi run -e wheel -- python -m pyperf compare_to results-cpp.json results-python.json
5774
```
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Latency benchmarks for CUDA context and device-query driver APIs."""

import time

from runner.runtime import assert_drv, ensure_context

from cuda.bindings import driver as cuda

# Module-level fixtures shared by every benchmark below.
CTX = ensure_context()

# Check the driver error code instead of silently discarding it, matching the
# assert_drv convention used by the kernel-launch benchmark module.
_err, DEVICE = cuda.cuDeviceGet(0)
assert_drv(_err)
ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR


def bench_ctx_get_current(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuCtxGetCurrent."""
    # Hoist the attribute lookup out of the timed loop so only the call
    # latency itself is measured.
    _cuCtxGetCurrent = cuda.cuCtxGetCurrent

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuCtxGetCurrent()
    return time.perf_counter() - t0


def bench_ctx_set_current(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuCtxSetCurrent."""
    _cuCtxSetCurrent = cuda.cuCtxSetCurrent
    _ctx = CTX

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuCtxSetCurrent(_ctx)
    return time.perf_counter() - t0


def bench_ctx_get_device(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuCtxGetDevice."""
    _cuCtxGetDevice = cuda.cuCtxGetDevice

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuCtxGetDevice()
    return time.perf_counter() - t0


def bench_device_get(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuDeviceGet(0)."""
    _cuDeviceGet = cuda.cuDeviceGet

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuDeviceGet(0)
    return time.perf_counter() - t0


def bench_device_get_attribute(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuDeviceGetAttribute."""
    _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute
    _attr = ATTRIBUTE
    _dev = DEVICE

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuDeviceGetAttribute(_attr, _dev)
    return time.perf_counter() - t0
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Latency benchmarks for CUDA event driver APIs."""

import time

from runner.runtime import assert_drv, ensure_context

from cuda.bindings import driver as cuda

ensure_context()

# Fail fast if fixture creation fails, instead of benchmarking against garbage
# handles; matches the assert_drv convention of the kernel-launch module.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
assert_drv(_err)
_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value)
assert_drv(_err)

# Warm-up: record the event once and drain the stream so the query/synchronize
# benchmarks below observe an already-completed event.
cuda.cuEventRecord(EVENT, STREAM)
cuda.cuStreamSynchronize(STREAM)

EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value


def bench_event_create_destroy(loops: int) -> float:
    """Return the elapsed seconds for `loops` create/destroy event pairs."""
    # Hoist attribute lookups out of the timed loop.
    _cuEventCreate = cuda.cuEventCreate
    _cuEventDestroy = cuda.cuEventDestroy
    _flags = EVENT_FLAGS

    t0 = time.perf_counter()
    for _ in range(loops):
        _, e = _cuEventCreate(_flags)
        _cuEventDestroy(e)
    return time.perf_counter() - t0


def bench_event_record(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuEventRecord."""
    _cuEventRecord = cuda.cuEventRecord
    _event = EVENT
    _stream = STREAM

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuEventRecord(_event, _stream)
    return time.perf_counter() - t0


def bench_event_query(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuEventQuery."""
    _cuEventQuery = cuda.cuEventQuery
    _event = EVENT

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuEventQuery(_event)
    return time.perf_counter() - t0


def bench_event_synchronize(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuEventSynchronize."""
    _cuEventSynchronize = cuda.cuEventSynchronize
    _event = EVENT

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuEventSynchronize(_event)
    return time.perf_counter() - t0
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Latency benchmarks for cuLaunchKernel with varying argument counts."""

import ctypes
import time

from runner.runtime import alloc_persistent, assert_drv, compile_and_load

from cuda.bindings import driver as cuda

# Compile kernels lazily so benchmark discovery does not need NVRTC.
KERNEL_SOURCE = """\
extern "C" __global__ void empty_kernel() { return; }
extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; }

#define ITEM_PARAM(x, T) T x
#define REP1(x, T) , ITEM_PARAM(x, T)
#define REP2(x, T) REP1(x##0, T) REP1(x##1, T)
#define REP4(x, T) REP2(x##0, T) REP2(x##1, T)
#define REP8(x, T) REP4(x##0, T) REP4(x##1, T)
#define REP16(x, T) REP8(x##0, T) REP8(x##1, T)

extern "C" __global__
void small_kernel_16_args(
ITEM_PARAM(F, int*)
REP1(A, int*)
REP2(A, int*)
REP4(A, int*)
REP8(A, int*))
{ *F = 0; }
"""

# Lazily populated module state, filled in by _ensure_launch_state().
MODULE = None
EMPTY_KERNEL = None
SMALL_KERNEL = None
KERNEL_16_ARGS = None
STREAM = None
FLOAT_PTR = None
INT_PTRS = None
# Holds the c_void_p argument holders so the raw addresses stored in
# PACKED_16 stay valid (ctypes.addressof does not keep a reference).
_VAL_PS = None
PACKED_16 = None


def _ensure_launch_state() -> None:
    """Compile the kernels and build the launch fixtures on first use."""
    global MODULE, EMPTY_KERNEL, SMALL_KERNEL, KERNEL_16_ARGS, STREAM
    global FLOAT_PTR, INT_PTRS, _VAL_PS, PACKED_16

    if EMPTY_KERNEL is not None:
        return  # already initialized

    mod = compile_and_load(KERNEL_SOURCE)

    status, fn_empty = cuda.cuModuleGetFunction(mod, b"empty_kernel")
    assert_drv(status)
    status, fn_small = cuda.cuModuleGetFunction(mod, b"small_kernel")
    assert_drv(status)
    status, fn_many_args = cuda.cuModuleGetFunction(mod, b"small_kernel_16_args")
    assert_drv(status)

    status, work_stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
    assert_drv(status)

    scratch_float = alloc_persistent(ctypes.sizeof(ctypes.c_float))
    scratch_ints = tuple(alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16))

    # Pre-pack the 16-pointer kernelParams array: each slot points at the
    # c_void_p holder that stores one device pointer value.
    holders = [ctypes.c_void_p(int(ptr)) for ptr in scratch_ints]
    packed = (ctypes.c_void_p * 16)()
    for slot, holder in enumerate(holders):
        packed[slot] = ctypes.addressof(holder)

    MODULE = mod
    EMPTY_KERNEL = fn_empty
    SMALL_KERNEL = fn_small
    KERNEL_16_ARGS = fn_many_args
    STREAM = work_stream
    FLOAT_PTR = scratch_float
    INT_PTRS = scratch_ints
    _VAL_PS = holders
    PACKED_16 = packed


def bench_launch_empty_kernel(loops: int) -> float:
    """Return the elapsed seconds for `loops` launches of an argument-less kernel."""
    _ensure_launch_state()
    # Bind everything to locals so the timed loop pays no lookup cost.
    launch = cuda.cuLaunchKernel
    kernel = EMPTY_KERNEL
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, 0, 0)
    return time.perf_counter() - start


def bench_launch_small_kernel(loops: int) -> float:
    """Return the elapsed seconds for `loops` launches of a one-argument kernel."""
    _ensure_launch_state()
    launch = cuda.cuLaunchKernel
    kernel = SMALL_KERNEL
    stream = STREAM
    arg_values = (FLOAT_PTR,)
    arg_types = (None,)

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, (arg_values, arg_types), 0)
    return time.perf_counter() - start


def bench_launch_16_args(loops: int) -> float:
    """Return the elapsed seconds for `loops` launches with 16 per-call-packed args."""
    _ensure_launch_state()
    launch = cuda.cuLaunchKernel
    kernel = KERNEL_16_ARGS
    stream = STREAM
    arg_values = INT_PTRS
    arg_types = (None,) * 16

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, (arg_values, arg_types), 0)
    return time.perf_counter() - start


def bench_launch_16_args_pre_packed(loops: int) -> float:
    """Return the elapsed seconds for `loops` launches reusing a pre-packed arg array."""
    _ensure_launch_state()
    launch = cuda.cuLaunchKernel
    kernel = KERNEL_16_ARGS
    stream = STREAM
    packed_args = PACKED_16

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_args, 0)
    return time.perf_counter() - start
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Latency benchmarks for CUDA stream driver APIs."""

import time

from runner.runtime import assert_drv, ensure_context

from cuda.bindings import driver as cuda

ensure_context()

# Check the driver error code instead of silently discarding it, matching the
# assert_drv convention used by the kernel-launch benchmark module.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
assert_drv(_err)


def bench_stream_create_destroy(loops: int) -> float:
    """Return the elapsed seconds for `loops` create/destroy stream pairs."""
    # Hoist attribute lookups out of the timed loop.
    _cuStreamCreate = cuda.cuStreamCreate
    _cuStreamDestroy = cuda.cuStreamDestroy
    _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value

    t0 = time.perf_counter()
    for _ in range(loops):
        _, s = _cuStreamCreate(_flags)
        _cuStreamDestroy(s)
    return time.perf_counter() - t0


def bench_stream_query(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuStreamQuery."""
    _cuStreamQuery = cuda.cuStreamQuery
    _stream = STREAM

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuStreamQuery(_stream)
    return time.perf_counter() - t0


def bench_stream_synchronize(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuStreamSynchronize."""
    _cuStreamSynchronize = cuda.cuStreamSynchronize
    _stream = STREAM

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuStreamSynchronize(_stream)
    return time.perf_counter() - t0

0 commit comments

Comments
 (0)