Skip to content

Commit 7a9a248

Browse files
authored
Add more cuda.bindings latency benchmarks (#1856)
1 parent 6a162a2 commit 7a9a248

21 files changed

+1640
-51
lines changed

cuda_bindings/benchmarks/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ __pycache__/
1111

1212
# Override root .gitignore *.cpp rule (which targets Cython-generated files)
1313
!benchmarks/cpp/*.cpp
14+
15+
results-python.json
16+
results-cpp.json

cuda_bindings/benchmarks/README.md

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,17 @@
1-
# cuda.bindings Benchmarks
1+
# cuda.bindings benchmarks
2+
3+
These benchmarks are intended to measure the latency overhead of calling CUDA
4+
Driver APIs through cuda.bindings, relative to a similar C++ baseline.
5+
6+
The goal is to benchmark how much overhead the Python layer adds to calling
7+
CUDA APIs and what operations are not in our target of less than 1us of overhead.
8+
9+
Each Python benchmark has a C++ counterpart, which is used to compare the
10+
operations. We try to make each implementation perform small operations
11+
and do as nearly the same work as possible, run under similar conditions.
12+
13+
These are **not** throughput benchmarks to measure the overall performance
14+
of kernels and applications.
215

316
## Usage
417

@@ -32,26 +45,30 @@ sudo $(pixi run -e wheel -- which python) -m pyperf system tune
3245
To run the benchmarks combine the environment and task:
3346

3447
```bash
35-
3648
# Run the Python benchmarks in the wheel environment
3749
pixi run -e wheel bench
3850

3951
# Run the Python benchmarks in the source environment
4052
pixi run -e source bench
4153

42-
# Run the C++ benchmarks (environment is irrelavant here)
54+
# Run the C++ benchmarks
4355
pixi run -e wheel bench-cpp
4456
```
4557

46-
## pyperf JSON
58+
Both runners automatically save results to JSON files in the benchmarks
59+
directory: `results-python.json` and `results-cpp.json`.
4760

48-
The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
49-
The results are written to a JSON file in the format expected by pyperf.
61+
## Output JSON and analysis
5062

51-
The C++ benchmarks also generate a valid JSON file, in the same format.
63+
The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
64+
Both Python and C++ results are saved in pyperf-compatible JSON format,
65+
which can be analyzed with pyperf commands:
5266

53-
```
54-
pixi run -e wheel bench-cpp -0 cpp.json
67+
```bash
68+
# Show results and statistics
69+
pixi run -e wheel -- python -m pyperf stats results-python.json
70+
pixi run -e wheel -- python -m pyperf stats results-cpp.json
5571

56-
pixi run -e wheel pyperf stats cpp.json
72+
# Compare C++ vs Python results
73+
pixi run -e wheel -- python -m pyperf compare_to results-cpp.json results-python.json
5774
```
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Latency benchmarks for CUDA context and device-query driver APIs."""

import time

from runner.runtime import assert_drv, ensure_context

from cuda.bindings import driver as cuda

# Module-level fixtures shared by every benchmark below.
CTX = ensure_context()

# Check the driver error code instead of silently discarding it, matching the
# assert_drv convention used by the kernel-launch benchmark module.
_err, DEVICE = cuda.cuDeviceGet(0)
assert_drv(_err)
ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR


def bench_ctx_get_current(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuCtxGetCurrent."""
    # Hoist the attribute lookup out of the timed loop so only the call
    # latency itself is measured.
    _cuCtxGetCurrent = cuda.cuCtxGetCurrent

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuCtxGetCurrent()
    return time.perf_counter() - t0


def bench_ctx_set_current(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuCtxSetCurrent."""
    _cuCtxSetCurrent = cuda.cuCtxSetCurrent
    _ctx = CTX

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuCtxSetCurrent(_ctx)
    return time.perf_counter() - t0


def bench_ctx_get_device(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuCtxGetDevice."""
    _cuCtxGetDevice = cuda.cuCtxGetDevice

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuCtxGetDevice()
    return time.perf_counter() - t0


def bench_device_get(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuDeviceGet(0)."""
    _cuDeviceGet = cuda.cuDeviceGet

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuDeviceGet(0)
    return time.perf_counter() - t0


def bench_device_get_attribute(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuDeviceGetAttribute."""
    _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute
    _attr = ATTRIBUTE
    _dev = DEVICE

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuDeviceGetAttribute(_attr, _dev)
    return time.perf_counter() - t0
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Latency benchmarks for CUDA event driver APIs."""

import time

from runner.runtime import assert_drv, ensure_context

from cuda.bindings import driver as cuda

ensure_context()

# Fail fast if fixture creation fails, instead of benchmarking against garbage
# handles; matches the assert_drv convention of the kernel-launch module.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
assert_drv(_err)
_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value)
assert_drv(_err)

# Warm-up: record the event once and drain the stream so the query/synchronize
# benchmarks below observe an already-completed event.
cuda.cuEventRecord(EVENT, STREAM)
cuda.cuStreamSynchronize(STREAM)

EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value


def bench_event_create_destroy(loops: int) -> float:
    """Return the elapsed seconds for `loops` create/destroy event pairs."""
    # Hoist attribute lookups out of the timed loop.
    _cuEventCreate = cuda.cuEventCreate
    _cuEventDestroy = cuda.cuEventDestroy
    _flags = EVENT_FLAGS

    t0 = time.perf_counter()
    for _ in range(loops):
        _, e = _cuEventCreate(_flags)
        _cuEventDestroy(e)
    return time.perf_counter() - t0


def bench_event_record(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuEventRecord."""
    _cuEventRecord = cuda.cuEventRecord
    _event = EVENT
    _stream = STREAM

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuEventRecord(_event, _stream)
    return time.perf_counter() - t0


def bench_event_query(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuEventQuery."""
    _cuEventQuery = cuda.cuEventQuery
    _event = EVENT

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuEventQuery(_event)
    return time.perf_counter() - t0


def bench_event_synchronize(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuEventSynchronize."""
    _cuEventSynchronize = cuda.cuEventSynchronize
    _event = EVENT

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuEventSynchronize(_event)
    return time.perf_counter() - t0
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Latency benchmarks for cuLaunchKernel with varying argument counts."""

import ctypes
import time

from runner.runtime import alloc_persistent, assert_drv, compile_and_load

from cuda.bindings import driver as cuda

# Compile kernels lazily so benchmark discovery does not need NVRTC.
KERNEL_SOURCE = """\
extern "C" __global__ void empty_kernel() { return; }
extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; }

#define ITEM_PARAM(x, T) T x
#define REP1(x, T) , ITEM_PARAM(x, T)
#define REP2(x, T) REP1(x##0, T) REP1(x##1, T)
#define REP4(x, T) REP2(x##0, T) REP2(x##1, T)
#define REP8(x, T) REP4(x##0, T) REP4(x##1, T)
#define REP16(x, T) REP8(x##0, T) REP8(x##1, T)

extern "C" __global__
void small_kernel_16_args(
ITEM_PARAM(F, int*)
REP1(A, int*)
REP2(A, int*)
REP4(A, int*)
REP8(A, int*))
{ *F = 0; }
"""

# Lazily populated module state, filled in by _ensure_launch_state().
MODULE = None
EMPTY_KERNEL = None
SMALL_KERNEL = None
KERNEL_16_ARGS = None
STREAM = None
FLOAT_PTR = None
INT_PTRS = None
# Holds the c_void_p argument holders so the raw addresses stored in
# PACKED_16 stay valid (ctypes.addressof does not keep a reference).
_VAL_PS = None
PACKED_16 = None


def _ensure_launch_state() -> None:
    """Compile the kernels and build the launch fixtures on first use."""
    global MODULE, EMPTY_KERNEL, SMALL_KERNEL, KERNEL_16_ARGS, STREAM
    global FLOAT_PTR, INT_PTRS, _VAL_PS, PACKED_16

    if EMPTY_KERNEL is not None:
        return  # already initialized

    mod = compile_and_load(KERNEL_SOURCE)

    status, fn_empty = cuda.cuModuleGetFunction(mod, b"empty_kernel")
    assert_drv(status)
    status, fn_small = cuda.cuModuleGetFunction(mod, b"small_kernel")
    assert_drv(status)
    status, fn_many_args = cuda.cuModuleGetFunction(mod, b"small_kernel_16_args")
    assert_drv(status)

    status, work_stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
    assert_drv(status)

    scratch_float = alloc_persistent(ctypes.sizeof(ctypes.c_float))
    scratch_ints = tuple(alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16))

    # Pre-pack the 16-pointer kernelParams array: each slot points at the
    # c_void_p holder that stores one device pointer value.
    holders = [ctypes.c_void_p(int(ptr)) for ptr in scratch_ints]
    packed = (ctypes.c_void_p * 16)()
    for slot, holder in enumerate(holders):
        packed[slot] = ctypes.addressof(holder)

    MODULE = mod
    EMPTY_KERNEL = fn_empty
    SMALL_KERNEL = fn_small
    KERNEL_16_ARGS = fn_many_args
    STREAM = work_stream
    FLOAT_PTR = scratch_float
    INT_PTRS = scratch_ints
    _VAL_PS = holders
    PACKED_16 = packed


def bench_launch_empty_kernel(loops: int) -> float:
    """Return the elapsed seconds for `loops` launches of an argument-less kernel."""
    _ensure_launch_state()
    # Bind everything to locals so the timed loop pays no lookup cost.
    launch = cuda.cuLaunchKernel
    kernel = EMPTY_KERNEL
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, 0, 0)
    return time.perf_counter() - start


def bench_launch_small_kernel(loops: int) -> float:
    """Return the elapsed seconds for `loops` launches of a one-argument kernel."""
    _ensure_launch_state()
    launch = cuda.cuLaunchKernel
    kernel = SMALL_KERNEL
    stream = STREAM
    arg_values = (FLOAT_PTR,)
    arg_types = (None,)

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, (arg_values, arg_types), 0)
    return time.perf_counter() - start


def bench_launch_16_args(loops: int) -> float:
    """Return the elapsed seconds for `loops` launches with 16 per-call-packed args."""
    _ensure_launch_state()
    launch = cuda.cuLaunchKernel
    kernel = KERNEL_16_ARGS
    stream = STREAM
    arg_values = INT_PTRS
    arg_types = (None,) * 16

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, (arg_values, arg_types), 0)
    return time.perf_counter() - start


def bench_launch_16_args_pre_packed(loops: int) -> float:
    """Return the elapsed seconds for `loops` launches reusing a pre-packed arg array."""
    _ensure_launch_state()
    launch = cuda.cuLaunchKernel
    kernel = KERNEL_16_ARGS
    stream = STREAM
    packed_args = PACKED_16

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_args, 0)
    return time.perf_counter() - start
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Latency benchmarks for CUDA stream driver APIs."""

import time

from runner.runtime import assert_drv, ensure_context

from cuda.bindings import driver as cuda

ensure_context()

# Check the driver error code instead of silently discarding it, matching the
# assert_drv convention used by the kernel-launch benchmark module.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
assert_drv(_err)


def bench_stream_create_destroy(loops: int) -> float:
    """Return the elapsed seconds for `loops` create/destroy stream pairs."""
    # Hoist attribute lookups out of the timed loop.
    _cuStreamCreate = cuda.cuStreamCreate
    _cuStreamDestroy = cuda.cuStreamDestroy
    _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value

    t0 = time.perf_counter()
    for _ in range(loops):
        _, s = _cuStreamCreate(_flags)
        _cuStreamDestroy(s)
    return time.perf_counter() - t0


def bench_stream_query(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuStreamQuery."""
    _cuStreamQuery = cuda.cuStreamQuery
    _stream = STREAM

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuStreamQuery(_stream)
    return time.perf_counter() - t0


def bench_stream_synchronize(loops: int) -> float:
    """Return the elapsed seconds for `loops` calls to cuStreamSynchronize."""
    _cuStreamSynchronize = cuda.cuStreamSynchronize
    _stream = STREAM

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuStreamSynchronize(_stream)
    return time.perf_counter() - t0

0 commit comments

Comments
 (0)