Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions benchmarks/cuda_bindings/AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# cuda.bindings benchmarks

Read the README.md in this directory for more details about the benchmarks.

When generating code, verify that the code is correct against the cuda-bindings
source, which can be found in ../../cuda_bindings.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ See: https://pyperf.readthedocs.io/en/latest/system.html#system
pixi run -e wheel -- python -m pyperf system show

# Apply tuning (may require root)
sudo $(pixi run -e wheel -- which python) -m pyperf system tune
$(pixi run -e wheel -- which python) -m pyperf system tune
```

### Running benchmarks
Expand Down
90 changes: 90 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/bench_memory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import ctypes
import time

import numpy as np

from runner.runtime import alloc_persistent, ensure_context

from cuda.bindings import driver as cuda

ensure_context()

# Allocation size for alloc/free benchmarks.
ALLOC_SIZE = 1024

# Small transfer size (8 bytes) to measure call overhead, not bandwidth.
COPY_SIZE = 8

# Pre-allocate device memory and host buffers once so the memcpy benchmarks
# measure only the copy-call overhead, not allocation.
DST_DPTR = alloc_persistent(COPY_SIZE)
SRC_DPTR = alloc_persistent(COPY_SIZE)
HOST_SRC = np.zeros(COPY_SIZE, dtype=np.uint8)
HOST_DST = np.zeros(COPY_SIZE, dtype=np.uint8)

# Stream for async operations. Fail fast on creation error: silently ignoring
# it would make every async benchmark below operate on an invalid handle.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
if _err != cuda.CUresult.CUDA_SUCCESS:
    raise RuntimeError(f"cuStreamCreate failed: {_err}")


def bench_mem_alloc_free(loops: int) -> float:
    """Time `loops` synchronous cuMemAlloc/cuMemFree pairs; return elapsed seconds."""
    # Bind attributes to locals before the timed region to avoid
    # attribute-lookup overhead inside the loop.
    alloc = cuda.cuMemAlloc
    free = cuda.cuMemFree
    nbytes = ALLOC_SIZE

    start = time.perf_counter()
    for _ in range(loops):
        _, dptr = alloc(nbytes)
        free(dptr)
    return time.perf_counter() - start


def bench_mem_alloc_async_free_async(loops: int) -> float:
    """Time `loops` stream-ordered cuMemAllocAsync/cuMemFreeAsync pairs; return elapsed seconds."""
    # Locals bound outside the timed region to keep the loop tight.
    alloc_async = cuda.cuMemAllocAsync
    free_async = cuda.cuMemFreeAsync
    nbytes = ALLOC_SIZE
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        _, dptr = alloc_async(nbytes, stream)
        free_async(dptr, stream)
    return time.perf_counter() - start


def bench_memcpy_htod(loops: int) -> float:
    """Time `loops` host-to-device copies of COPY_SIZE bytes; return elapsed seconds."""
    copy = cuda.cuMemcpyHtoD
    dst, src, nbytes = DST_DPTR, HOST_SRC, COPY_SIZE

    start = time.perf_counter()
    for _ in range(loops):
        copy(dst, src, nbytes)
    return time.perf_counter() - start


def bench_memcpy_dtoh(loops: int) -> float:
    """Time `loops` device-to-host copies of COPY_SIZE bytes; return elapsed seconds."""
    copy = cuda.cuMemcpyDtoH
    dst, src, nbytes = HOST_DST, SRC_DPTR, COPY_SIZE

    start = time.perf_counter()
    for _ in range(loops):
        copy(dst, src, nbytes)
    return time.perf_counter() - start


def bench_memcpy_dtod(loops: int) -> float:
    """Time `loops` device-to-device copies of COPY_SIZE bytes; return elapsed seconds."""
    copy = cuda.cuMemcpyDtoD
    dst, src, nbytes = DST_DPTR, SRC_DPTR, COPY_SIZE

    start = time.perf_counter()
    for _ in range(loops):
        copy(dst, src, nbytes)
    return time.perf_counter() - start
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ add_driver_benchmark(bench_pointer_attributes)
add_driver_benchmark(bench_ctx_device)
add_driver_benchmark(bench_stream)
add_driver_benchmark(bench_event)
add_driver_benchmark(bench_memory)

# NVRTC benchmarks (require nvrtc for kernel compilation)
if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,39 +168,6 @@ int main(int argc, char** argv) {
});
}

// --- launch_small_kernel ---
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These were duplicated before. Cleaning up.

{
void* params[] = {&float_ptr};
suite.run("launch.launch_small_kernel", [&]() {
check_cu(
cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr),
"cuLaunchKernel failed"
);
});
}

// --- launch_16_args ---
{
suite.run("launch.launch_16_args", [&]() {
check_cu(
cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
"cuLaunchKernel failed"
);
});
}

// --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) ---
// In C++ the params are always pre-packed, so this is identical to launch_16_args.
// We include it for naming parity with the Python benchmark.
{
suite.run("launch.launch_16_args_pre_packed", [&]() {
check_cu(
cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
"cuLaunchKernel failed"
);
});
}

// Cleanup
for (int i = 0; i < 16; ++i) {
check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed");
Expand Down
106 changes: 106 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

#include <cuda.h>

#include "bench_support.hpp"

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>


// Exit the process with a readable driver error name when a CUDA call fails.
static void check_cu(CUresult status, const char* message) {
    if (status == CUDA_SUCCESS) {
        return;
    }
    const char* name = nullptr;
    cuGetErrorName(status, &name);
    std::cerr << message << ": " << (name != nullptr ? name : "unknown") << '\n';
    std::exit(1);
}


// Allocation size (bytes) for the alloc/free benchmarks.
static constexpr size_t ALLOC_SIZE = 1024;
// Small transfer size (8 bytes) so the memcpy benchmarks measure call
// overhead rather than bandwidth.
static constexpr size_t COPY_SIZE = 8;


// Driver-API memory benchmarks: sync and stream-ordered alloc/free, plus
// small H2D/D2H/D2D copies. Mirrors the Python bench_memory.py benchmarks.
int main(int argc, char** argv) {
    bench::Options options = bench::parse_args(argc, argv);

    // Setup: driver init, primary device, fresh context, non-blocking stream.
    check_cu(cuInit(0), "cuInit failed");

    CUdevice device;
    check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed");

    CUcontext ctx;
    CUctxCreateParams ctxParams = {};
    check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed");

    CUstream stream;
    check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed");

    // Pre-allocate device memory for memcpy benchmarks so the timed region
    // contains only the copy call itself.
    CUdeviceptr dst_dptr, src_dptr;
    check_cu(cuMemAlloc(&dst_dptr, COPY_SIZE), "cuMemAlloc failed");
    check_cu(cuMemAlloc(&src_dptr, COPY_SIZE), "cuMemAlloc failed");

    // Host buffers for memcpy (zero-initialized, COPY_SIZE bytes each).
    uint8_t host_src[COPY_SIZE] = {};
    uint8_t host_dst[COPY_SIZE] = {};

    bench::BenchmarkSuite suite(options);

    // --- mem_alloc_free: synchronous allocate + free per iteration ---
    {
        CUdeviceptr ptr;
        suite.run("memory.mem_alloc_free", [&]() {
            check_cu(cuMemAlloc(&ptr, ALLOC_SIZE), "cuMemAlloc failed");
            check_cu(cuMemFree(ptr), "cuMemFree failed");
        });
    }

    // --- mem_alloc_async_free_async: stream-ordered allocate + free ---
    {
        CUdeviceptr ptr;
        suite.run("memory.mem_alloc_async_free_async", [&]() {
            check_cu(cuMemAllocAsync(&ptr, ALLOC_SIZE, stream), "cuMemAllocAsync failed");
            check_cu(cuMemFreeAsync(ptr, stream), "cuMemFreeAsync failed");
        });
    }

    // Drain any pending stream-ordered frees before the memcpy benchmarks.
    check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed");

    // --- memcpy_htod: host -> device, COPY_SIZE bytes ---
    {
        suite.run("memory.memcpy_htod", [&]() {
            check_cu(cuMemcpyHtoD(dst_dptr, host_src, COPY_SIZE), "cuMemcpyHtoD failed");
        });
    }

    // --- memcpy_dtoh: device -> host, COPY_SIZE bytes ---
    {
        suite.run("memory.memcpy_dtoh", [&]() {
            check_cu(cuMemcpyDtoH(host_dst, src_dptr, COPY_SIZE), "cuMemcpyDtoH failed");
        });
    }

    // --- memcpy_dtod: device -> device, COPY_SIZE bytes ---
    {
        suite.run("memory.memcpy_dtod", [&]() {
            check_cu(cuMemcpyDtoD(dst_dptr, src_dptr, COPY_SIZE), "cuMemcpyDtoD failed");
        });
    }

    // Cleanup: release device buffers, stream, and context before reporting.
    check_cu(cuMemFree(dst_dptr), "cuMemFree failed");
    check_cu(cuMemFree(src_dptr), "cuMemFree failed");
    check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed");
    check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");

    suite.write();

    return 0;
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ pre-commit = "*"
cuda-bindings = "==13.1.0"

[feature.bindings-source.dependencies]
cuda-bindings = { path = ".." }
cuda-bindings = { path = "../../cuda_bindings" }

[environments]
wheel = { features = ["cu13", "cu13-pinned", "bench", "cpp-bench", "dev", "bindings-wheel"] }
Expand Down
Loading