NVIDIA
diff --git a/‎.coveragerc‎
Lines changed: 1 addition & 1 deletion b/‎.coveragerc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/test-wheel-linux.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/test-wheel-linux.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.spdx-ignore‎
Lines changed: 0 additions & 3 deletions b/‎.spdx-ignore‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎cuda_bindings/benchmarks/.gitignore‎ ‎benchmarks/cuda_bindings/.gitignore‎cuda_bindings/benchmarks/.gitignore renamed to benchmarks/cuda_bindings/.gitignore b/‎cuda_bindings/benchmarks/.gitignore‎ ‎benchmarks/cuda_bindings/.gitignore‎cuda_bindings/benchmarks/.gitignore renamed to benchmarks/cuda_bindings/.gitignore
diff --git a/‎benchmarks/cuda_bindings/AGENTS.md‎
Lines changed: 6 additions & 0 deletions b/‎benchmarks/cuda_bindings/AGENTS.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎cuda_bindings/benchmarks/README.md‎ ‎benchmarks/cuda_bindings/README.md‎cuda_bindings/benchmarks/README.md renamed to benchmarks/cuda_bindings/README.md
Lines changed: 1 addition & 1 deletion b/‎cuda_bindings/benchmarks/README.md‎ ‎benchmarks/cuda_bindings/README.md‎cuda_bindings/benchmarks/README.md renamed to benchmarks/cuda_bindings/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎…enchmarks/benchmarks/bench_ctx_device.py‎ ‎…_bindings/benchmarks/bench_ctx_device.py‎cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py renamed to benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py
Lines changed: 10 additions & 10 deletions b/‎…enchmarks/benchmarks/bench_ctx_device.py‎ ‎…_bindings/benchmarks/bench_ctx_device.py‎cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py renamed to benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py
Lines changed: 10 additions & 10 deletions
diff --git a/‎…ngs/benchmarks/benchmarks/bench_event.py‎ ‎…/cuda_bindings/benchmarks/bench_event.py‎cuda_bindings/benchmarks/benchmarks/bench_event.py renamed to benchmarks/cuda_bindings/benchmarks/bench_event.py
Lines changed: 10 additions & 10 deletions b/‎…ngs/benchmarks/benchmarks/bench_event.py‎ ‎…/cuda_bindings/benchmarks/bench_event.py‎cuda_bindings/benchmarks/benchmarks/bench_event.py renamed to benchmarks/cuda_bindings/benchmarks/bench_event.py
Lines changed: 10 additions & 10 deletions
diff --git a/‎…gs/benchmarks/benchmarks/bench_launch.py‎ ‎…cuda_bindings/benchmarks/bench_launch.py‎cuda_bindings/benchmarks/benchmarks/bench_launch.py renamed to benchmarks/cuda_bindings/benchmarks/bench_launch.py
Lines changed: 8 additions & 8 deletions b/‎…gs/benchmarks/benchmarks/bench_launch.py‎ ‎…cuda_bindings/benchmarks/bench_launch.py‎cuda_bindings/benchmarks/benchmarks/bench_launch.py renamed to benchmarks/cuda_bindings/benchmarks/bench_launch.py
Lines changed: 8 additions & 8 deletions
diff --git a/‎benchmarks/cuda_bindings/benchmarks/bench_memory.py‎
Lines changed: 88 additions & 0 deletions b/‎benchmarks/cuda_bindings/benchmarks/bench_memory.py‎
Lines changed: 88 additions & 0 deletions
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 [paths]
 source =
 
@@ -274,7 +274,7 @@ jobs:
         if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
         run: |
           pip install pyperf
-          pushd cuda_bindings/benchmarks
+          pushd benchmarks/cuda_bindings
           python run_pyperf.py --fast --min-time 1
           popd
 
 
@@ -8,9 +8,6 @@ LICENSE
 requirements*.txt
 cuda_bindings/examples/*
 
-# Will be moved in (see https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4252968149)
-cuda_bindings/benchmarks/*
-
 # Vendored
 cuda_core/cuda/core/_include/dlpack.h
 
 
@@ -0,0 +1,6 @@
+# cuda.bindings benchmarks
+
+Read the README.md in this directory for more details about the benchmarks.
+
+When generating code, verify that the code is correct based on the source for cuda-bindings
+that can be found in ../../cuda_bindings
@@ -37,7 +37,7 @@ See: https://pyperf.readthedocs.io/en/latest/system.html#system
 pixi run -e wheel -- python -m pyperf system show
 
 # Apply tuning (may require root)
-sudo $(pixi run -e wheel -- which python) -m pyperf system tune
+$(pixi run -e wheel -- which python) -m pyperf system tune
 ```
 
 ### Running benchmarks
 
@@ -15,48 +15,48 @@
 
 
 def bench_ctx_get_current(loops: int) -> float:
-    _cuCtxGetCurrent = cuda.cuCtxGetCurrent
+    _fn = cuda.cuCtxGetCurrent
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuCtxGetCurrent()
+        _fn()
     return time.perf_counter() - t0
 
 
 def bench_ctx_set_current(loops: int) -> float:
-    _cuCtxSetCurrent = cuda.cuCtxSetCurrent
+    _fn = cuda.cuCtxSetCurrent
     _ctx = CTX
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuCtxSetCurrent(_ctx)
+        _fn(_ctx)
     return time.perf_counter() - t0
 
 
 def bench_ctx_get_device(loops: int) -> float:
-    _cuCtxGetDevice = cuda.cuCtxGetDevice
+    _fn = cuda.cuCtxGetDevice
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuCtxGetDevice()
+        _fn()
     return time.perf_counter() - t0
 
 
 def bench_device_get(loops: int) -> float:
-    _cuDeviceGet = cuda.cuDeviceGet
+    _fn = cuda.cuDeviceGet
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuDeviceGet(0)
+        _fn(0)
     return time.perf_counter() - t0
 
 
 def bench_device_get_attribute(loops: int) -> float:
-    _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute
+    _fn = cuda.cuDeviceGetAttribute
     _attr = ATTRIBUTE
     _dev = DEVICE
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuDeviceGetAttribute(_attr, _dev)
+        _fn(_attr, _dev)
     return time.perf_counter() - t0
@@ -20,43 +20,43 @@
 
 
 def bench_event_create_destroy(loops: int) -> float:
-    _cuEventCreate = cuda.cuEventCreate
-    _cuEventDestroy = cuda.cuEventDestroy
+    _create = cuda.cuEventCreate
+    _destroy = cuda.cuEventDestroy
     _flags = EVENT_FLAGS
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _, e = _cuEventCreate(_flags)
-        _cuEventDestroy(e)
+        _, e = _create(_flags)
+        _destroy(e)
     return time.perf_counter() - t0
 
 
 def bench_event_record(loops: int) -> float:
-    _cuEventRecord = cuda.cuEventRecord
+    _fn = cuda.cuEventRecord
     _event = EVENT
     _stream = STREAM
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuEventRecord(_event, _stream)
+        _fn(_event, _stream)
     return time.perf_counter() - t0
 
 
 def bench_event_query(loops: int) -> float:
-    _cuEventQuery = cuda.cuEventQuery
+    _fn = cuda.cuEventQuery
     _event = EVENT
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuEventQuery(_event)
+        _fn(_event)
     return time.perf_counter() - t0
 
 
 def bench_event_synchronize(loops: int) -> float:
-    _cuEventSynchronize = cuda.cuEventSynchronize
+    _fn = cuda.cuEventSynchronize
     _event = EVENT
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuEventSynchronize(_event)
+        _fn(_event)
     return time.perf_counter() - t0
@@ -82,52 +82,52 @@ def _ensure_launch_state() -> None:
 
 def bench_launch_empty_kernel(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = EMPTY_KERNEL
     _stream = STREAM
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0)
     return time.perf_counter() - t0
 
 
 def bench_launch_small_kernel(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = SMALL_KERNEL
     _stream = STREAM
     _args = (FLOAT_PTR,)
     _arg_types = (None,)
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
     return time.perf_counter() - t0
 
 
 def bench_launch_16_args(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = KERNEL_16_ARGS
     _stream = STREAM
     _args = INT_PTRS
     _arg_types = (None,) * 16
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
     return time.perf_counter() - t0
 
 
 def bench_launch_16_args_pre_packed(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = KERNEL_16_ARGS
     _stream = STREAM
     _packed = PACKED_16
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
     return time.perf_counter() - t0
@@ -0,0 +1,88 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import time
+
+import numpy as np
+from runner.runtime import alloc_persistent, ensure_context
+
+from cuda.bindings import driver as cuda
+
+ensure_context()
+
+# Allocation size for alloc/free benchmarks
+ALLOC_SIZE = 1024
+
+# Small transfer size (8 bytes) to measure call overhead, not bandwidth
+COPY_SIZE = 8
+
+# Pre-allocate device memory and host buffers for memcpy benchmarks
+DST_DPTR = alloc_persistent(COPY_SIZE)
+SRC_DPTR = alloc_persistent(COPY_SIZE)
+HOST_SRC = np.zeros(COPY_SIZE, dtype=np.uint8)
+HOST_DST = np.zeros(COPY_SIZE, dtype=np.uint8)
+
+# Stream for async operations
+_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
+
+
+def bench_mem_alloc_free(loops: int) -> float:
+    _alloc = cuda.cuMemAlloc
+    _free = cuda.cuMemFree
+    _size = ALLOC_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _, ptr = _alloc(_size)
+        _free(ptr)
+    return time.perf_counter() - t0
+
+
+def bench_mem_alloc_async_free_async(loops: int) -> float:
+    _alloc = cuda.cuMemAllocAsync
+    _free = cuda.cuMemFreeAsync
+    _size = ALLOC_SIZE
+    _stream = STREAM
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _, ptr = _alloc(_size, _stream)
+        _free(ptr, _stream)
+    return time.perf_counter() - t0
+
+
+def bench_memcpy_htod(loops: int) -> float:
+    _fn = cuda.cuMemcpyHtoD
+    _dst = DST_DPTR
+    _src = HOST_SRC
+    _size = COPY_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _fn(_dst, _src, _size)
+    return time.perf_counter() - t0
+
+
+def bench_memcpy_dtoh(loops: int) -> float:
+    _fn = cuda.cuMemcpyDtoH
+    _dst = HOST_DST
+    _src = SRC_DPTR
+    _size = COPY_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _fn(_dst, _src, _size)
+    return time.perf_counter() - t0
+
+
+def bench_memcpy_dtod(loops: int) -> float:
+    _fn = cuda.cuMemcpyDtoD
+    _dst = DST_DPTR
+    _src = SRC_DPTR
+    _size = COPY_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _fn(_dst, _src, _size)
+    return time.perf_counter() - t0