Skip to content

Commit 7953daa

Browse files
authored
Merge branch 'main' into nvml-renaming
2 parents ba0c510 + 355fcaa commit 7953daa

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+343
-132
lines changed

.coveragerc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2-
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
2+
# SPDX-License-Identifier: Apache-2.0
33

44
[paths]
55
source =

.github/workflows/test-wheel-linux.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ jobs:
274274
if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
275275
run: |
276276
pip install pyperf
277-
pushd cuda_bindings/benchmarks
277+
pushd benchmarks/cuda_bindings
278278
python run_pyperf.py --fast --min-time 1
279279
popd
280280

.spdx-ignore

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,6 @@ LICENSE
88
requirements*.txt
99
cuda_bindings/examples/*
1010

11-
# Will be moved in (see https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4252968149)
12-
cuda_bindings/benchmarks/*
13-
1411
# Vendored
1512
cuda_core/cuda/core/_include/dlpack.h
1613

benchmarks/cuda_bindings/AGENTS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# cuda.bindings benchmarks
2+
3+
Read the README.md in this directory for more details about the benchmarks.
4+
5+
When generating code, verify that the code is correct based on the source for cuda-bindings
6+
that can be found in ../../cuda_bindings
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ See: https://pyperf.readthedocs.io/en/latest/system.html#system
3737
pixi run -e wheel -- python -m pyperf system show
3838

3939
# Apply tuning (may require root)
40-
sudo $(pixi run -e wheel -- which python) -m pyperf system tune
40+
$(pixi run -e wheel -- which python) -m pyperf system tune
4141
```
4242

4343
### Running benchmarks

cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py renamed to benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,48 +15,48 @@
1515

1616

1717
def bench_ctx_get_current(loops: int) -> float:
18-
_cuCtxGetCurrent = cuda.cuCtxGetCurrent
18+
_fn = cuda.cuCtxGetCurrent
1919

2020
t0 = time.perf_counter()
2121
for _ in range(loops):
22-
_cuCtxGetCurrent()
22+
_fn()
2323
return time.perf_counter() - t0
2424

2525

2626
def bench_ctx_set_current(loops: int) -> float:
27-
_cuCtxSetCurrent = cuda.cuCtxSetCurrent
27+
_fn = cuda.cuCtxSetCurrent
2828
_ctx = CTX
2929

3030
t0 = time.perf_counter()
3131
for _ in range(loops):
32-
_cuCtxSetCurrent(_ctx)
32+
_fn(_ctx)
3333
return time.perf_counter() - t0
3434

3535

3636
def bench_ctx_get_device(loops: int) -> float:
37-
_cuCtxGetDevice = cuda.cuCtxGetDevice
37+
_fn = cuda.cuCtxGetDevice
3838

3939
t0 = time.perf_counter()
4040
for _ in range(loops):
41-
_cuCtxGetDevice()
41+
_fn()
4242
return time.perf_counter() - t0
4343

4444

4545
def bench_device_get(loops: int) -> float:
46-
_cuDeviceGet = cuda.cuDeviceGet
46+
_fn = cuda.cuDeviceGet
4747

4848
t0 = time.perf_counter()
4949
for _ in range(loops):
50-
_cuDeviceGet(0)
50+
_fn(0)
5151
return time.perf_counter() - t0
5252

5353

5454
def bench_device_get_attribute(loops: int) -> float:
55-
_cuDeviceGetAttribute = cuda.cuDeviceGetAttribute
55+
_fn = cuda.cuDeviceGetAttribute
5656
_attr = ATTRIBUTE
5757
_dev = DEVICE
5858

5959
t0 = time.perf_counter()
6060
for _ in range(loops):
61-
_cuDeviceGetAttribute(_attr, _dev)
61+
_fn(_attr, _dev)
6262
return time.perf_counter() - t0

cuda_bindings/benchmarks/benchmarks/bench_event.py renamed to benchmarks/cuda_bindings/benchmarks/bench_event.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,43 +20,43 @@
2020

2121

2222
def bench_event_create_destroy(loops: int) -> float:
23-
_cuEventCreate = cuda.cuEventCreate
24-
_cuEventDestroy = cuda.cuEventDestroy
23+
_create = cuda.cuEventCreate
24+
_destroy = cuda.cuEventDestroy
2525
_flags = EVENT_FLAGS
2626

2727
t0 = time.perf_counter()
2828
for _ in range(loops):
29-
_, e = _cuEventCreate(_flags)
30-
_cuEventDestroy(e)
29+
_, e = _create(_flags)
30+
_destroy(e)
3131
return time.perf_counter() - t0
3232

3333

3434
def bench_event_record(loops: int) -> float:
35-
_cuEventRecord = cuda.cuEventRecord
35+
_fn = cuda.cuEventRecord
3636
_event = EVENT
3737
_stream = STREAM
3838

3939
t0 = time.perf_counter()
4040
for _ in range(loops):
41-
_cuEventRecord(_event, _stream)
41+
_fn(_event, _stream)
4242
return time.perf_counter() - t0
4343

4444

4545
def bench_event_query(loops: int) -> float:
46-
_cuEventQuery = cuda.cuEventQuery
46+
_fn = cuda.cuEventQuery
4747
_event = EVENT
4848

4949
t0 = time.perf_counter()
5050
for _ in range(loops):
51-
_cuEventQuery(_event)
51+
_fn(_event)
5252
return time.perf_counter() - t0
5353

5454

5555
def bench_event_synchronize(loops: int) -> float:
56-
_cuEventSynchronize = cuda.cuEventSynchronize
56+
_fn = cuda.cuEventSynchronize
5757
_event = EVENT
5858

5959
t0 = time.perf_counter()
6060
for _ in range(loops):
61-
_cuEventSynchronize(_event)
61+
_fn(_event)
6262
return time.perf_counter() - t0

cuda_bindings/benchmarks/benchmarks/bench_launch.py renamed to benchmarks/cuda_bindings/benchmarks/bench_launch.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,52 +82,52 @@ def _ensure_launch_state() -> None:
8282

8383
def bench_launch_empty_kernel(loops: int) -> float:
8484
_ensure_launch_state()
85-
_cuLaunchKernel = cuda.cuLaunchKernel
85+
_fn = cuda.cuLaunchKernel
8686
_kernel = EMPTY_KERNEL
8787
_stream = STREAM
8888

8989
t0 = time.perf_counter()
9090
for _ in range(loops):
91-
_cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0)
91+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0)
9292
return time.perf_counter() - t0
9393

9494

9595
def bench_launch_small_kernel(loops: int) -> float:
9696
_ensure_launch_state()
97-
_cuLaunchKernel = cuda.cuLaunchKernel
97+
_fn = cuda.cuLaunchKernel
9898
_kernel = SMALL_KERNEL
9999
_stream = STREAM
100100
_args = (FLOAT_PTR,)
101101
_arg_types = (None,)
102102

103103
t0 = time.perf_counter()
104104
for _ in range(loops):
105-
_cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
105+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
106106
return time.perf_counter() - t0
107107

108108

109109
def bench_launch_16_args(loops: int) -> float:
110110
_ensure_launch_state()
111-
_cuLaunchKernel = cuda.cuLaunchKernel
111+
_fn = cuda.cuLaunchKernel
112112
_kernel = KERNEL_16_ARGS
113113
_stream = STREAM
114114
_args = INT_PTRS
115115
_arg_types = (None,) * 16
116116

117117
t0 = time.perf_counter()
118118
for _ in range(loops):
119-
_cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
119+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
120120
return time.perf_counter() - t0
121121

122122

123123
def bench_launch_16_args_pre_packed(loops: int) -> float:
124124
_ensure_launch_state()
125-
_cuLaunchKernel = cuda.cuLaunchKernel
125+
_fn = cuda.cuLaunchKernel
126126
_kernel = KERNEL_16_ARGS
127127
_stream = STREAM
128128
_packed = PACKED_16
129129

130130
t0 = time.perf_counter()
131131
for _ in range(loops):
132-
_cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
132+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
133133
return time.perf_counter() - t0
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
import numpy as np
8+
from runner.runtime import alloc_persistent, ensure_context
9+
10+
from cuda.bindings import driver as cuda
11+
12+
ensure_context()
13+
14+
# Allocation size for alloc/free benchmarks
15+
ALLOC_SIZE = 1024
16+
17+
# Small transfer size (8 bytes) to measure call overhead, not bandwidth
18+
COPY_SIZE = 8
19+
20+
# Pre-allocate device memory and host buffers for memcpy benchmarks
21+
DST_DPTR = alloc_persistent(COPY_SIZE)
22+
SRC_DPTR = alloc_persistent(COPY_SIZE)
23+
HOST_SRC = np.zeros(COPY_SIZE, dtype=np.uint8)
24+
HOST_DST = np.zeros(COPY_SIZE, dtype=np.uint8)
25+
26+
# Stream for async operations
27+
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
28+
29+
30+
def bench_mem_alloc_free(loops: int) -> float:
31+
_alloc = cuda.cuMemAlloc
32+
_free = cuda.cuMemFree
33+
_size = ALLOC_SIZE
34+
35+
t0 = time.perf_counter()
36+
for _ in range(loops):
37+
_, ptr = _alloc(_size)
38+
_free(ptr)
39+
return time.perf_counter() - t0
40+
41+
42+
def bench_mem_alloc_async_free_async(loops: int) -> float:
43+
_alloc = cuda.cuMemAllocAsync
44+
_free = cuda.cuMemFreeAsync
45+
_size = ALLOC_SIZE
46+
_stream = STREAM
47+
48+
t0 = time.perf_counter()
49+
for _ in range(loops):
50+
_, ptr = _alloc(_size, _stream)
51+
_free(ptr, _stream)
52+
return time.perf_counter() - t0
53+
54+
55+
def bench_memcpy_htod(loops: int) -> float:
56+
_fn = cuda.cuMemcpyHtoD
57+
_dst = DST_DPTR
58+
_src = HOST_SRC
59+
_size = COPY_SIZE
60+
61+
t0 = time.perf_counter()
62+
for _ in range(loops):
63+
_fn(_dst, _src, _size)
64+
return time.perf_counter() - t0
65+
66+
67+
def bench_memcpy_dtoh(loops: int) -> float:
68+
_fn = cuda.cuMemcpyDtoH
69+
_dst = HOST_DST
70+
_src = SRC_DPTR
71+
_size = COPY_SIZE
72+
73+
t0 = time.perf_counter()
74+
for _ in range(loops):
75+
_fn(_dst, _src, _size)
76+
return time.perf_counter() - t0
77+
78+
79+
def bench_memcpy_dtod(loops: int) -> float:
80+
_fn = cuda.cuMemcpyDtoD
81+
_dst = DST_DPTR
82+
_src = SRC_DPTR
83+
_size = COPY_SIZE
84+
85+
t0 = time.perf_counter()
86+
for _ in range(loops):
87+
_fn(_dst, _src, _size)
88+
return time.perf_counter() - t0

0 commit comments

Comments
 (0)