Skip to content

Commit 8869a26

Browse files
danielfrgmdboomrwgk
authored
Benchmarks: cuda.core (#2005)
* cuda.core benchmarks * cuda.core benchmarks * cuda.core benchmarks * cuda.core benchmarks * Remove benchmark plan mentions --------- Co-authored-by: Michael Droettboom <mdboom@gmail.com> Co-authored-by: Ralf W. Grosse-Kunstleve <rwgkio@gmail.com>
1 parent 99e1991 commit 8869a26

15 files changed

Lines changed: 1830 additions & 25 deletions

File tree

benchmarks/cuda_bindings/runner/main.py

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -16,30 +16,30 @@
1616
PROJECT_ROOT = Path(__file__).resolve().parent.parent
1717
BENCH_DIR = PROJECT_ROOT / "benchmarks"
1818
DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json"
19+
DEFAULT_MODULE_NAME_PREFIX = "cuda_bindings_bench"
1920
# Env var used to propagate the --benchmark filter from the parent to pyperf
2021
# worker subprocesses. pyperf reconstructs worker argv from scratch and drops
2122
# custom flags like --benchmark, so without this the worker would register the
2223
# full bench list and pyperf would run the wrong bench by task index.
23-
BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER"
24+
DEFAULT_BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER"
2425

25-
PYPERF_INHERITED_ENV_VARS = (
26+
BASE_PYPERF_INHERITED_ENV_VARS = (
2627
"CUDA_HOME",
2728
"CUDA_PATH",
2829
"CUDA_VISIBLE_DEVICES",
2930
"LD_LIBRARY_PATH",
3031
"NVIDIA_VISIBLE_DEVICES",
31-
BENCH_FILTER_ENV_VAR,
3232
)
3333
_MODULE_CACHE: dict[Path, ModuleType] = {}
3434

3535

36-
def load_module(module_path: Path) -> ModuleType:
36+
def load_module(module_path: Path, module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX) -> ModuleType:
3737
module_path = module_path.resolve()
3838
cached_module = _MODULE_CACHE.get(module_path)
3939
if cached_module is not None:
4040
return cached_module
4141

42-
module_name = f"cuda_bindings_bench_{module_path.stem}"
42+
module_name = f"{module_name_prefix}_{module_path.stem}"
4343
spec = importlib.util.spec_from_file_location(module_name, module_path)
4444
if spec is None or spec.loader is None:
4545
raise RuntimeError(f"Failed to load benchmark module: {module_path}")
@@ -64,13 +64,17 @@ def _discover_module_functions(module_path: Path) -> list[str]:
6464
]
6565

6666

67-
def _lazy_benchmark(module_path: Path, function_name: str) -> Callable[[int], float]:
67+
def _lazy_benchmark(
68+
module_path: Path,
69+
function_name: str,
70+
module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
71+
) -> Callable[[int], float]:
6872
loaded_function: Callable[[int], float] | None = None
6973

7074
def run(loops: int) -> float:
7175
nonlocal loaded_function
7276
if loaded_function is None:
73-
module = load_module(module_path)
77+
module = load_module(module_path, module_name_prefix=module_name_prefix)
7478
loaded_function = getattr(module, function_name)
7579
return loaded_function(loops)
7680

@@ -86,6 +90,7 @@ def run(loops: int) -> float:
8690
def _collect_skipped_benchmarks(
8791
bench_ids: list[str],
8892
registry: dict[str, Callable[[int], float]],
93+
module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
8994
) -> set[str]:
9095
"""Return bench IDs that the owning module has marked as unsupported.
9196
@@ -106,29 +111,37 @@ def _collect_skipped_benchmarks(
106111
continue
107112
module = loaded_modules.get(module_path)
108113
if module is None:
109-
module = load_module(module_path)
114+
module = load_module(module_path, module_name_prefix=module_name_prefix)
110115
loaded_modules[module_path] = module
111116
module_skip = getattr(module, "SKIPPED_BENCHMARKS", None)
112117
if module_skip and function_name in module_skip:
113118
skipped.add(bench_id)
114119
return skipped
115120

116121

117-
def discover_benchmarks() -> dict[str, Callable[[int], float]]:
122+
def discover_benchmarks(
    bench_dir: Path | None = None,
    module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
) -> dict[str, Callable[[int], float]]:
    """Build the registry of discovered ``bench_*`` functions.

    Every ``bench_*.py`` file directly inside ``bench_dir`` is visited in
    sorted path order. Each function name reported by
    ``_discover_module_functions`` is registered under its benchmark ID and
    wrapped with ``_lazy_benchmark``, which defers ``load_module`` until the
    benchmark is first run.

    Each bench_ function must have the signature
    ``bench_*(loops: int) -> float``: it calls the operation ``loops`` times
    and returns the total elapsed time in seconds (using time.perf_counter).

    Args:
        bench_dir: Directory to scan. ``None`` means "use the module-level
            ``BENCH_DIR`` as it is at call time".
        module_name_prefix: Prefix forwarded to ``_lazy_benchmark`` (and from
            there to ``load_module``) for naming the imported modules.

    Returns:
        Mapping of benchmark ID to the lazy callable for that benchmark.

    Raises:
        ValueError: If two discovered functions map to the same benchmark ID.
    """
    # The fallback is resolved here rather than in the signature: Python binds
    # default arguments at def-time, so a literal ``BENCH_DIR`` default would
    # ignore tests (and embedders) that monkeypatch the module attribute later.
    search_dir = BENCH_DIR if bench_dir is None else bench_dir

    discovered: dict[str, Callable[[int], float]] = {}
    for source_file in sorted(search_dir.glob("bench_*.py")):
        stem = source_file.stem
        for func_name in _discover_module_functions(source_file):
            bench_id = benchmark_id(stem, func_name)
            if bench_id in discovered:
                raise ValueError(f"Duplicate benchmark ID discovered: {bench_id}")
            discovered[bench_id] = _lazy_benchmark(
                source_file,
                func_name,
                module_name_prefix=module_name_prefix,
            )
    return discovered
133146

134147

@@ -152,7 +165,10 @@ def _split_env_vars(arg_value: str) -> list[str]:
152165
return [env_var for env_var in arg_value.split(",") if env_var]
153166

154167

155-
def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
168+
def ensure_pyperf_worker_env(
169+
argv: list[str],
170+
extra_env_vars: tuple[str, ...] = (DEFAULT_BENCH_FILTER_ENV_VAR,),
171+
) -> list[str]:
156172
if "--copy-env" in argv:
157173
return list(argv)
158174

@@ -175,7 +191,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
175191
if skip_next:
176192
raise ValueError("Missing value for --inherit-environ")
177193

178-
for env_var in PYPERF_INHERITED_ENV_VARS:
194+
for env_var in (*BASE_PYPERF_INHERITED_ENV_VARS, *extra_env_vars):
179195
if env_var in os.environ:
180196
inherited_env.append(env_var)
181197

@@ -190,7 +206,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
190206
return cleaned
191207

192208

193-
def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
209+
def parse_args(argv: list[str], default_output: Path = DEFAULT_OUTPUT) -> tuple[argparse.Namespace, list[str]]:
194210
parser = argparse.ArgumentParser(add_help=False)
195211
parser.add_argument(
196212
"--benchmark",
@@ -207,19 +223,25 @@ def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
207223
"-o",
208224
"--output",
209225
type=Path,
210-
default=DEFAULT_OUTPUT,
211-
help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})",
226+
default=default_output,
227+
help=f"JSON output file path (default: {default_output.name})",
212228
)
213229
parsed, remaining = parser.parse_known_args(argv)
214230
return parsed, remaining
215231

216232

217-
def main() -> None:
218-
parsed, remaining_argv = parse_args(sys.argv[1:])
233+
def main(
234+
*,
235+
bench_dir: Path = BENCH_DIR,
236+
default_output: Path = DEFAULT_OUTPUT,
237+
module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
238+
bench_filter_env_var: str = DEFAULT_BENCH_FILTER_ENV_VAR,
239+
) -> None:
240+
parsed, remaining_argv = parse_args(sys.argv[1:], default_output=default_output)
219241

220-
registry = discover_benchmarks()
242+
registry = discover_benchmarks(bench_dir=bench_dir, module_name_prefix=module_name_prefix)
221243
if not registry:
222-
raise RuntimeError(f"No benchmark functions found in {BENCH_DIR}")
244+
raise RuntimeError(f"No benchmark functions found in {bench_dir}")
223245

224246
if parsed.list:
225247
for bench_id in sorted(registry):
@@ -231,7 +253,7 @@ def main() -> None:
231253
# the wrong bench. pyperf drops unknown CLI flags when spawning workers,
232254
# so fall back to an env var carrying the filter.
233255
requested = list(parsed.benchmark)
234-
env_filter = os.environ.get(BENCH_FILTER_ENV_VAR, "")
256+
env_filter = os.environ.get(bench_filter_env_var, "")
235257
if not requested and env_filter:
236258
requested = [bid for bid in env_filter.split(",") if bid]
237259

@@ -243,21 +265,21 @@ def main() -> None:
243265
raise ValueError(f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}")
244266
benchmark_ids = requested
245267
# Propagate to any pyperf worker we're about to spawn.
246-
os.environ[BENCH_FILTER_ENV_VAR] = ",".join(benchmark_ids)
268+
os.environ[bench_filter_env_var] = ",".join(benchmark_ids)
247269
else:
248270
benchmark_ids = sorted(registry)
249271

250272
# Strip any --output args to avoid conflicts with our output handling.
251273
output_path = parsed.output.resolve()
252274
remaining_argv = strip_pyperf_output_args(remaining_argv)
253-
remaining_argv = ensure_pyperf_worker_env(remaining_argv)
275+
remaining_argv = ensure_pyperf_worker_env(remaining_argv, extra_env_vars=(bench_filter_env_var,))
254276
is_worker = "--worker" in remaining_argv
255277

256278
# Drop benchmarks that the owning module has marked as unavailable on
257279
# this driver/device. Without this step a single unsupported bench
258280
# (e.g. TMA on a pre-Hopper GPU) would abort the whole pyperf run,
259281
# since pyperf treats a raised exception as a fatal worker failure.
260-
skipped = _collect_skipped_benchmarks(benchmark_ids, registry)
282+
skipped = _collect_skipped_benchmarks(benchmark_ids, registry, module_name_prefix=module_name_prefix)
261283
if skipped and not is_worker:
262284
for bench_id in sorted(skipped):
263285
print(f"Skipping {bench_id}: unsupported on this driver/device", file=sys.stderr)

benchmarks/cuda_bindings/tests/test_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def test_discover_benchmarks_is_lazy(monkeypatch, tmp_path):
135135
def test_ensure_pyperf_worker_env_preserves_existing_args(monkeypatch):
136136
runner_main = load_runner_main(monkeypatch)
137137

138-
for env_var in runner_main.PYPERF_INHERITED_ENV_VARS:
138+
for env_var in runner_main.BASE_PYPERF_INHERITED_ENV_VARS:
139139
monkeypatch.delenv(env_var, raising=False)
140140
monkeypatch.setenv("CUDA_PATH", "/opt/cuda")
141141
monkeypatch.setenv("LD_LIBRARY_PATH", "/opt/cuda/lib64")

benchmarks/cuda_core/.gitignore

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Build artifacts
2+
.build/
3+
__pycache__/
4+
5+
# Benchmark results
6+
*.json
7+
.benchmarks/
8+
9+
# Pixi environments
10+
.pixi/
11+
12+
# Override root .gitignore *.cpp rule (which targets Cython-generated files)
13+
!benchmarks/cpp/*.cpp
14+
15+
results-python.json
16+
results-cpp.json

benchmarks/cuda_core/AGENTS.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# cuda.core benchmarks
2+
3+
Read the README.md in this directory for more details about the benchmarks.
4+
5+
When generating code, verify that the code is correct based on the source for cuda-core
6+
that can be found in ../../cuda_core.
7+
8+
This suite shares the pyperf runner with `../cuda_bindings/` via a sys.path
9+
insert in `run_pyperf.py`. The per-suite setup (`runtime.py`, the `benchmarks/`
10+
module files) lives here. Benchmark IDs are kept identical to the cuda.bindings
11+
suite so `compare.py` can diff them directly.

benchmarks/cuda_core/README.md

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# cuda.core benchmarks
2+
3+
These benchmarks measure the latency overhead of the `cuda.core` public API
4+
on top of `cuda.bindings`. Every benchmark ID here has a 1:1 counterpart in
5+
`../cuda_bindings/benchmarks/` so a `compare.py` run produces a side-by-side
6+
"bindings vs core" overhead table for every operation.
7+
8+
This suite is **not** a throughput benchmark and does not test kernel
9+
performance — it measures Python-side call overhead only. No C++ baseline
10+
is built or run for `cuda.core`: the comparative baseline is the
11+
`cuda.bindings` Python results file at
12+
`../cuda_bindings/results-python.json`.
13+
14+
The pyperf runner (`runner/main.py`) is shared with the cuda.bindings
15+
suite via a `sys.path` insert in `run_pyperf.py`; only the per-suite
16+
`runtime.py` and `benchmarks/*.py` live here.
17+
18+
## Usage
19+
20+
Requires pixi.
21+
22+
Environments:
23+
24+
- `wheel`: Installs released `cuda-core` from conda-forge.
25+
- `source`: Installs `cuda-core` and `cuda-bindings` from the in-tree
26+
sources, so local changes are exercised.
27+
28+
Tasks:
29+
30+
- `bench`: Runs the full suite.
31+
- `bench-smoke-test`: Runs each bench with `--debug-single-value` for
32+
a quick smoke check (not meaningful for timing).
33+
- `bench-compare`: Prints a side-by-side table against
34+
`../cuda_bindings/results-python.json`.
35+
36+
### System tuning
37+
38+
For more stable results on Linux, tune the system before running.
39+
See: https://pyperf.readthedocs.io/en/latest/system.html#system
40+
41+
```bash
42+
pixi run -e wheel -- python -m pyperf system show
43+
$(pixi run -e wheel -- which python) -m pyperf system tune
44+
```
45+
46+
### Running benchmarks
47+
48+
```bash
49+
# Wheel env
50+
pixi run -e wheel bench
51+
pixi run -e wheel bench --min-time 0.1
52+
53+
# Source env (picks up local cuda.core / cuda.bindings changes)
54+
pixi run -e source bench
55+
56+
# Side-by-side comparison vs cuda.bindings
57+
pixi run -e wheel bench-compare
58+
```
59+
60+
Results are saved to `results-python.json` in this directory. Compare
61+
against the cuda.bindings baseline by running that suite's `bench` task
62+
first so `../cuda_bindings/results-python.json` exists.
63+
64+
## Output JSON and analysis
65+
66+
The suite uses [pyperf](https://pyperf.readthedocs.io/en/latest/). The
67+
output JSON is pyperf-compatible:
68+
69+
```bash
70+
pixi run -e wheel -- python -m pyperf stats results-python.json
71+
pixi run -e wheel -- python -m pyperf compare_to \
72+
../cuda_bindings/results-python.json results-python.json
73+
```
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
from runtime import ensure_device
8+
9+
from cuda.core import Device
10+
11+
DEV = ensure_device()
12+
13+
14+
def bench_ctx_get_current(loops: int) -> float:
    """Time ``loops`` no-argument ``Device()`` calls.

    Returns the total elapsed time in seconds, measured with
    ``time.perf_counter``.
    """
    # Device() with no args returns the TLS-cached "current" device.
    # The loop calls through a local alias instead of the module global.
    get_current = Device

    started = time.perf_counter()
    for _ in range(loops):
        get_current()
    finished = time.perf_counter()
    return finished - started
22+
23+
24+
def bench_ctx_set_current(loops: int) -> float:
    """Time ``loops`` calls to ``DEV.set_current()``.

    ``DEV`` is the module-level device obtained from ``ensure_device()``.
    Returns the total elapsed time in seconds, measured with
    ``time.perf_counter``.
    """
    # Resolve the bound method once, outside the timed region.
    set_current = DEV.set_current

    started = time.perf_counter()
    for _ in range(loops):
        set_current()
    finished = time.perf_counter()
    return finished - started
31+
32+
33+
def bench_device_get(loops: int) -> float:
    """Time ``loops`` calls to ``Device(0)``.

    Returns the total elapsed time in seconds, measured with
    ``time.perf_counter``.
    """
    # Device(id) hits the same TLS cache after the first construction.
    # The loop calls through a local alias instead of the module global.
    make_device = Device

    started = time.perf_counter()
    for _ in range(loops):
        make_device(0)
    finished = time.perf_counter()
    return finished - started
41+
42+
43+
def bench_device_get_attribute(loops: int) -> float:
    """Time ``loops`` reads of ``DEV.properties.compute_capability_major``.

    Returns the total elapsed time in seconds, measured with
    ``time.perf_counter``.
    """
    # Counterpart of the cuda.bindings bench that queries
    # CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR. cuda.core caches this
    # attribute in DeviceProperties, so every iteration past the first is a
    # dict lookup rather than a driver call. The bench therefore measures the
    # user-visible cost of the public API, which is legitimately faster than
    # cuda.bindings here.
    device_props = DEV.properties

    started = time.perf_counter()
    for _ in range(loops):
        # Bare attribute access is the operation being timed.
        device_props.compute_capability_major  # noqa: B018
    finished = time.perf_counter()
    return finished - started

0 commit comments

Comments
 (0)