Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions benchmarks/cuda_bindings/AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# cuda.bindings benchmarks

Read the README.md in this directory for more details about the benchmarks.

When generating code, verify that the code is correct against the cuda-bindings
source, which can be found in ../../cuda_bindings.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ See: https://pyperf.readthedocs.io/en/latest/system.html#system
pixi run -e wheel -- python -m pyperf system show

# Apply tuning (may require root)
sudo $(pixi run -e wheel -- which python) -m pyperf system tune
$(pixi run -e wheel -- which python) -m pyperf system tune
```

### Running benchmarks
Expand Down
90 changes: 90 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/bench_memory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import ctypes
import time

import numpy as np

from runner.runtime import alloc_persistent, ensure_context

from cuda.bindings import driver as cuda

ensure_context()

# Allocation size for alloc/free benchmarks.
ALLOC_SIZE = 1024

# Small transfer size (8 bytes) to measure call overhead, not bandwidth.
COPY_SIZE = 8

# Pre-allocate device memory and host buffers once so the memcpy benchmarks
# measure only the copy-call overhead, not allocation.
DST_DPTR = alloc_persistent(COPY_SIZE)
SRC_DPTR = alloc_persistent(COPY_SIZE)
HOST_SRC = np.zeros(COPY_SIZE, dtype=np.uint8)
HOST_DST = np.zeros(COPY_SIZE, dtype=np.uint8)

# Stream for async operations. Fail fast on creation error: silently ignoring
# it would make every async benchmark below operate on an invalid handle.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
if _err != cuda.CUresult.CUDA_SUCCESS:
    raise RuntimeError(f"cuStreamCreate failed: {_err}")


def bench_mem_alloc_free(loops: int) -> float:
    """Time `loops` synchronous cuMemAlloc/cuMemFree pairs; return elapsed seconds."""
    # Bind attributes to locals before the timed region to avoid
    # attribute-lookup overhead inside the loop.
    alloc = cuda.cuMemAlloc
    free = cuda.cuMemFree
    nbytes = ALLOC_SIZE

    start = time.perf_counter()
    for _ in range(loops):
        _, dptr = alloc(nbytes)
        free(dptr)
    return time.perf_counter() - start


def bench_mem_alloc_async_free_async(loops: int) -> float:
    """Time `loops` stream-ordered cuMemAllocAsync/cuMemFreeAsync pairs; return elapsed seconds."""
    # Locals bound outside the timed region to keep the loop tight.
    alloc_async = cuda.cuMemAllocAsync
    free_async = cuda.cuMemFreeAsync
    nbytes = ALLOC_SIZE
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        _, dptr = alloc_async(nbytes, stream)
        free_async(dptr, stream)
    return time.perf_counter() - start


def bench_memcpy_htod(loops: int) -> float:
    """Time `loops` host-to-device copies of COPY_SIZE bytes; return elapsed seconds."""
    copy = cuda.cuMemcpyHtoD
    dst, src, nbytes = DST_DPTR, HOST_SRC, COPY_SIZE

    start = time.perf_counter()
    for _ in range(loops):
        copy(dst, src, nbytes)
    return time.perf_counter() - start


def bench_memcpy_dtoh(loops: int) -> float:
    """Time `loops` device-to-host copies of COPY_SIZE bytes; return elapsed seconds."""
    copy = cuda.cuMemcpyDtoH
    dst, src, nbytes = HOST_DST, SRC_DPTR, COPY_SIZE

    start = time.perf_counter()
    for _ in range(loops):
        copy(dst, src, nbytes)
    return time.perf_counter() - start


def bench_memcpy_dtod(loops: int) -> float:
    """Time `loops` device-to-device copies of COPY_SIZE bytes; return elapsed seconds."""
    copy = cuda.cuMemcpyDtoD
    dst, src, nbytes = DST_DPTR, SRC_DPTR, COPY_SIZE

    start = time.perf_counter()
    for _ in range(loops):
        copy(dst, src, nbytes)
    return time.perf_counter() - start
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ add_driver_benchmark(bench_pointer_attributes)
add_driver_benchmark(bench_ctx_device)
add_driver_benchmark(bench_stream)
add_driver_benchmark(bench_event)
add_driver_benchmark(bench_memory)

# NVRTC benchmarks (require nvrtc for kernel compilation)
if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,39 +168,6 @@ int main(int argc, char** argv) {
});
}

// --- launch_small_kernel ---
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These were duplicated before. Cleaning up.

{
void* params[] = {&float_ptr};
suite.run("launch.launch_small_kernel", [&]() {
check_cu(
cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr),
"cuLaunchKernel failed"
);
});
}

// --- launch_16_args ---
{
suite.run("launch.launch_16_args", [&]() {
check_cu(
cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
"cuLaunchKernel failed"
);
});
}

// --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) ---
// In C++ the params are always pre-packed, so this is identical to launch_16_args.
// We include it for naming parity with the Python benchmark.
{
suite.run("launch.launch_16_args_pre_packed", [&]() {
check_cu(
cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
"cuLaunchKernel failed"
);
});
}

// Cleanup
for (int i = 0; i < 16; ++i) {
check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed");
Expand Down
106 changes: 106 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

#include <cuda.h>

#include "bench_support.hpp"

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>


// Exit the process with a readable driver error name when a CUDA call fails.
static void check_cu(CUresult status, const char* message) {
    if (status == CUDA_SUCCESS) {
        return;
    }
    const char* name = nullptr;
    cuGetErrorName(status, &name);
    std::cerr << message << ": " << (name != nullptr ? name : "unknown") << '\n';
    std::exit(1);
}


// Allocation size (bytes) for the alloc/free benchmarks.
static constexpr size_t ALLOC_SIZE = 1024;
// Small transfer size (8 bytes) so the memcpy benchmarks measure call
// overhead rather than bandwidth.
static constexpr size_t COPY_SIZE = 8;


// Driver-API memory benchmarks: sync and stream-ordered alloc/free, plus
// small H2D/D2H/D2D copies. Mirrors the Python bench_memory.py benchmarks.
int main(int argc, char** argv) {
    bench::Options options = bench::parse_args(argc, argv);

    // Setup: driver init, primary device, fresh context, non-blocking stream.
    check_cu(cuInit(0), "cuInit failed");

    CUdevice device;
    check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed");

    CUcontext ctx;
    CUctxCreateParams ctxParams = {};
    check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed");

    CUstream stream;
    check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed");

    // Pre-allocate device memory for memcpy benchmarks so the timed region
    // contains only the copy call itself.
    CUdeviceptr dst_dptr, src_dptr;
    check_cu(cuMemAlloc(&dst_dptr, COPY_SIZE), "cuMemAlloc failed");
    check_cu(cuMemAlloc(&src_dptr, COPY_SIZE), "cuMemAlloc failed");

    // Host buffers for memcpy (zero-initialized, COPY_SIZE bytes each).
    uint8_t host_src[COPY_SIZE] = {};
    uint8_t host_dst[COPY_SIZE] = {};

    bench::BenchmarkSuite suite(options);

    // --- mem_alloc_free: synchronous allocate + free per iteration ---
    {
        CUdeviceptr ptr;
        suite.run("memory.mem_alloc_free", [&]() {
            check_cu(cuMemAlloc(&ptr, ALLOC_SIZE), "cuMemAlloc failed");
            check_cu(cuMemFree(ptr), "cuMemFree failed");
        });
    }

    // --- mem_alloc_async_free_async: stream-ordered allocate + free ---
    {
        CUdeviceptr ptr;
        suite.run("memory.mem_alloc_async_free_async", [&]() {
            check_cu(cuMemAllocAsync(&ptr, ALLOC_SIZE, stream), "cuMemAllocAsync failed");
            check_cu(cuMemFreeAsync(ptr, stream), "cuMemFreeAsync failed");
        });
    }

    // Drain any pending stream-ordered frees before the memcpy benchmarks.
    check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed");

    // --- memcpy_htod: host -> device, COPY_SIZE bytes ---
    {
        suite.run("memory.memcpy_htod", [&]() {
            check_cu(cuMemcpyHtoD(dst_dptr, host_src, COPY_SIZE), "cuMemcpyHtoD failed");
        });
    }

    // --- memcpy_dtoh: device -> host, COPY_SIZE bytes ---
    {
        suite.run("memory.memcpy_dtoh", [&]() {
            check_cu(cuMemcpyDtoH(host_dst, src_dptr, COPY_SIZE), "cuMemcpyDtoH failed");
        });
    }

    // --- memcpy_dtod: device -> device, COPY_SIZE bytes ---
    {
        suite.run("memory.memcpy_dtod", [&]() {
            check_cu(cuMemcpyDtoD(dst_dptr, src_dptr, COPY_SIZE), "cuMemcpyDtoD failed");
        });
    }

    // Cleanup: release device buffers, stream, and context before reporting.
    check_cu(cuMemFree(dst_dptr), "cuMemFree failed");
    check_cu(cuMemFree(src_dptr), "cuMemFree failed");
    check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed");
    check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");

    suite.write();

    return 0;
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ pre-commit = "*"
cuda-bindings = "==13.1.0"

[feature.bindings-source.dependencies]
cuda-bindings = { path = ".." }
cuda-bindings = { path = "../../cuda_bindings" }

[environments]
wheel = { features = ["cu13", "cu13-pinned", "bench", "cpp-bench", "dev", "bindings-wheel"] }
Expand Down
Loading