Skip to content

Commit f09b4f2

Browse files
committed
Add bench_event and bench_stream and compare script for a summary table
1 parent b371a1d commit f09b4f2

File tree

5 files changed

+273
-1
lines changed

5 files changed

+273
-1
lines changed

cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88

99
from cuda.bindings import driver as cuda
1010

11-
ensure_context()
11+
CTX = ensure_context()
12+
13+
_, DEVICE = cuda.cuDeviceGet(0)
14+
ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
1215

1316

1417
def bench_ctx_get_current(loops: int) -> float:
@@ -18,3 +21,43 @@ def bench_ctx_get_current(loops: int) -> float:
1821
for _ in range(loops):
1922
_cuCtxGetCurrent()
2023
return time.perf_counter() - t0
24+
25+
26+
def bench_ctx_set_current(loops: int) -> float:
    """Time ``loops`` consecutive ``cuCtxSetCurrent`` calls.

    Every call is given the module-level ``CTX``. The return value is the
    elapsed ``time.perf_counter()`` time, in seconds, for the whole loop.
    """
    # Bind to locals before starting the clock so the loop body does no
    # global or attribute lookups.
    set_current = cuda.cuCtxSetCurrent
    ctx = CTX

    start = time.perf_counter()
    for _ in range(loops):
        set_current(ctx)
    return time.perf_counter() - start
34+
35+
36+
def bench_ctx_get_device(loops: int) -> float:
    """Time ``loops`` consecutive ``cuCtxGetDevice`` calls.

    Returns the elapsed ``time.perf_counter()`` time, in seconds, for the
    whole loop. The results of the individual calls are discarded.
    """
    # Local alias: keeps the attribute lookup outside the timed region.
    get_device = cuda.cuCtxGetDevice

    start = time.perf_counter()
    for _ in range(loops):
        get_device()
    return time.perf_counter() - start
43+
44+
45+
def bench_device_get(loops: int) -> float:
    """Time ``loops`` consecutive ``cuDeviceGet(0)`` calls.

    Returns the elapsed ``time.perf_counter()`` time, in seconds, for the
    whole loop. The results of the individual calls are discarded.
    """
    # Local alias: keeps the attribute lookup outside the timed region.
    device_get = cuda.cuDeviceGet

    start = time.perf_counter()
    for _ in range(loops):
        device_get(0)
    return time.perf_counter() - start
52+
53+
54+
def bench_device_get_attribute(loops: int) -> float:
    """Time ``loops`` consecutive ``cuDeviceGetAttribute`` calls.

    Each call queries the module-level ``ATTRIBUTE`` on the module-level
    ``DEVICE``. Returns the elapsed ``time.perf_counter()`` time, in seconds,
    for the whole loop.
    """
    # Bind everything the loop touches to locals before starting the clock.
    get_attribute = cuda.cuDeviceGetAttribute
    attribute = ATTRIBUTE
    device = DEVICE

    start = time.perf_counter()
    for _ in range(loops):
        get_attribute(attribute, device)
    return time.perf_counter() - start
63+
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
from runner.runtime import ensure_context
8+
9+
from cuda.bindings import driver as cuda
10+
11+
ensure_context()
12+
13+
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
14+
_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value)
15+
16+
cuda.cuEventRecord(EVENT, STREAM)
17+
cuda.cuStreamSynchronize(STREAM)
18+
19+
EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value
20+
21+
22+
def bench_event_create_destroy(loops: int) -> float:
    """Time ``loops`` create/destroy pairs of CUDA events.

    Each iteration calls ``cuEventCreate`` with the module-level
    ``EVENT_FLAGS`` and immediately passes the new event to
    ``cuEventDestroy``. Returns the elapsed ``time.perf_counter()`` time, in
    seconds, for the whole loop.
    """
    # Bind to locals before starting the clock so the loop body does no
    # global or attribute lookups.
    create = cuda.cuEventCreate
    destroy = cuda.cuEventDestroy
    flags = EVENT_FLAGS

    start = time.perf_counter()
    for _ in range(loops):
        # The first element of the returned pair is not inspected.
        _status, event = create(flags)
        destroy(event)
    return time.perf_counter() - start
32+
33+
34+
def bench_event_record(loops: int) -> float:
    """Time ``loops`` consecutive ``cuEventRecord`` calls.

    Every call records the module-level ``EVENT`` on the module-level
    ``STREAM``. Returns the elapsed ``time.perf_counter()`` time, in seconds,
    for the whole loop.
    """
    # Bind everything the loop touches to locals before starting the clock.
    record = cuda.cuEventRecord
    event = EVENT
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        record(event, stream)
    return time.perf_counter() - start
43+
44+
45+
def bench_event_query(loops: int) -> float:
    """Time ``loops`` consecutive ``cuEventQuery`` calls on the module-level ``EVENT``.

    Returns the elapsed ``time.perf_counter()`` time, in seconds, for the
    whole loop. The results of the individual calls are discarded.
    """
    # Bind to locals before starting the clock.
    query = cuda.cuEventQuery
    event = EVENT

    start = time.perf_counter()
    for _ in range(loops):
        query(event)
    return time.perf_counter() - start
53+
54+
55+
def bench_event_synchronize(loops: int) -> float:
    """Time ``loops`` consecutive ``cuEventSynchronize`` calls on the module-level ``EVENT``.

    Returns the elapsed ``time.perf_counter()`` time, in seconds, for the
    whole loop. The results of the individual calls are discarded.
    """
    # Bind to locals before starting the clock.
    synchronize = cuda.cuEventSynchronize
    event = EVENT

    start = time.perf_counter()
    for _ in range(loops):
        synchronize(event)
    return time.perf_counter() - start
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
from runner.runtime import ensure_context
8+
9+
from cuda.bindings import driver as cuda
10+
11+
ensure_context()
12+
13+
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
14+
15+
16+
def bench_stream_create_destroy(loops: int) -> float:
    """Time ``loops`` create/destroy pairs of CUDA streams.

    Each iteration calls ``cuStreamCreate`` with the
    ``CU_STREAM_NON_BLOCKING`` flag value and immediately passes the new
    stream to ``cuStreamDestroy``. Returns the elapsed
    ``time.perf_counter()`` time, in seconds, for the whole loop.
    """
    # Resolve the functions and the flag value before starting the clock so
    # the loop body does no global or attribute lookups.
    create = cuda.cuStreamCreate
    destroy = cuda.cuStreamDestroy
    flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value

    start = time.perf_counter()
    for _ in range(loops):
        # The first element of the returned pair is not inspected.
        _status, stream = create(flags)
        destroy(stream)
    return time.perf_counter() - start
26+
27+
28+
def bench_stream_query(loops: int) -> float:
    """Time ``loops`` consecutive ``cuStreamQuery`` calls on the module-level ``STREAM``.

    Returns the elapsed ``time.perf_counter()`` time, in seconds, for the
    whole loop. The results of the individual calls are discarded.
    """
    # Bind to locals before starting the clock.
    query = cuda.cuStreamQuery
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        query(stream)
    return time.perf_counter() - start
36+
37+
38+
def bench_stream_synchronize(loops: int) -> float:
    """Time ``loops`` consecutive ``cuStreamSynchronize`` calls on the module-level ``STREAM``.

    Returns the elapsed ``time.perf_counter()`` time, in seconds, for the
    whole loop. The results of the individual calls are discarded.
    """
    # Bind to locals before starting the clock.
    synchronize = cuda.cuStreamSynchronize
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        synchronize(stream)
    return time.perf_counter() - start
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""Compare Python and C++ benchmark results in a summary table."""
6+
7+
import argparse
8+
import json
9+
import statistics
10+
import sys
11+
from pathlib import Path
12+
13+
PROJECT_ROOT = Path(__file__).resolve().parent
14+
DEFAULT_PYTHON = PROJECT_ROOT / "results-python.json"
15+
DEFAULT_CPP = PROJECT_ROOT / "results-cpp.json"
16+
17+
18+
def load_benchmarks(path: Path) -> dict[str, list[float]]:
    """Load a pyperf JSON file and return ``{name: [values]}``.

    For each entry under the top-level ``"benchmarks"`` key:

    * the name is taken from the entry's own ``metadata``; if that has no
      non-empty ``name``, the first run whose ``metadata`` has one supplies it;
    * the ``values`` lists of all runs are concatenated in file order (runs
      without a ``values`` key contribute nothing).

    Entries that end up with no name or no values are left out. If two
    entries share a name, the later one replaces the earlier.

    Args:
        path: Location of the JSON results file.

    Returns:
        Mapping from benchmark name to its collected values.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly rather than relying on the platform's locale
    # encoding, so non-ASCII benchmark names load the same everywhere.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    results: dict[str, list[float]] = {}
    for bench in data.get("benchmarks", []):
        runs = bench.get("runs", [])

        name = bench.get("metadata", {}).get("name", "")
        if not name:
            # Fall back to the first run that carries a name.
            for run in runs:
                name = run.get("metadata", {}).get("name", "")
                if name:
                    break

        values = [value for run in runs for value in run.get("values", [])]
        if name and values:
            results[name] = values
    return results
38+
39+
40+
def fmt_ns(seconds: float) -> str:
    """Format a duration given in seconds for display.

    Durations of at least 1000 ns are shown in microseconds with two
    decimals (``"2.50 us"``); anything smaller is shown as whole nanoseconds
    (``"500 ns"``).
    """
    nanos = seconds * 1e9
    use_micros = nanos >= 1000
    return f"{nanos / 1000:.2f} us" if use_micros else f"{nanos:.0f} ns"
45+
46+
47+
def main() -> None:
    """Print a summary table of mean benchmark timings.

    Reads the Python results file (``--python``) and, if it exists, the C++
    results file (``--cpp``). With C++ results available the table has C++,
    Python and overhead (Python mean minus C++ mean, in ns) columns;
    otherwise only the Python column is printed. A benchmark missing from one
    side shows ``-`` in that column and in the overhead column.

    Exits with status 1 (message on stderr) when the Python results file is
    missing or contains no benchmarks.
    """
    parser = argparse.ArgumentParser(description="Compare Python vs C++ benchmark results")
    parser.add_argument(
        "--python",
        type=Path,
        default=DEFAULT_PYTHON,
        help=f"Python results JSON (default: {DEFAULT_PYTHON.name})",
    )
    parser.add_argument(
        "--cpp",
        type=Path,
        default=DEFAULT_CPP,
        help=f"C++ results JSON (default: {DEFAULT_CPP.name})",
    )
    args = parser.parse_args()

    if not args.python.exists():
        print(f"Python results not found: {args.python}", file=sys.stderr)
        print("Run: pixi run -e wheel bench", file=sys.stderr)
        sys.exit(1)

    py_benchmarks = load_benchmarks(args.python)
    # C++ results are optional: a missing file just drops the comparison columns.
    cpp_benchmarks = load_benchmarks(args.cpp) if args.cpp.exists() else {}

    if not py_benchmarks:
        print("No benchmarks found in Python results.", file=sys.stderr)
        sys.exit(1)

    # Column widths (all_names is non-empty because py_benchmarks is).
    all_names = sorted(set(py_benchmarks) | set(cpp_benchmarks))
    name_width = max(len("Benchmark"), *(len(n) for n in all_names))

    # Header
    show_cpp = bool(cpp_benchmarks)
    if show_cpp:
        header = f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'Python (mean)':>14} {'Overhead':>10}"
    else:
        header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}"
    sep = "-" * len(header)
    print(sep)
    print(header)
    print(sep)

    for name in all_names:
        py_vals = py_benchmarks.get(name)
        cpp_vals = cpp_benchmarks.get(name)

        # Compute each mean once; None marks "no data for this side".
        py_mean = statistics.mean(py_vals) if py_vals else None
        cpp_mean = statistics.mean(cpp_vals) if cpp_vals else None

        py_str = fmt_ns(py_mean) if py_mean is not None else "-"
        cpp_str = fmt_ns(cpp_mean) if cpp_mean is not None else "-"

        if py_mean is not None and cpp_mean is not None:
            overhead_ns = (py_mean - cpp_mean) * 1e9
            # The "+" format flag emits the correct sign; a hard-coded "+"
            # prefix would render a negative overhead as "+-12 ns".
            overhead_str = f"{overhead_ns:+.0f} ns"
        else:
            overhead_str = "-"

        if show_cpp:
            print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}")
        else:
            print(f"{name:<{name_width}} {py_str:>14}")

    print(sep)


if __name__ == "__main__":
    main()

cuda_bindings/benchmarks/pixi.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ cmake = "*"
2929
ninja = "*"
3030
cxx-compiler = "*"
3131
cuda-cudart-dev = "*"
32+
cuda-nvrtc-dev = "*"
3233

3334
[feature.cpp-bench.target.linux-64.dependencies]
3435
cuda-crt-dev_linux-64 = "*"
@@ -79,5 +80,8 @@ depends-on = [{ task = "bench-cpp-configure" }]
7980
cmd = ["python", "$PIXI_PROJECT_ROOT/run_cpp.py"]
8081
depends-on = [{ task = "bench-cpp-build" }]
8182

83+
[target.linux.tasks.bench-compare]
84+
cmd = ["python", "$PIXI_PROJECT_ROOT/compare.py"]
85+
8286
[target.linux.tasks.lint]
8387
cmd = ["pre-commit", "run", "--all-files"]

0 commit comments

Comments
 (0)