NVIDIA
diff --git a/‎benchmarks/cuda_bindings/runner/main.py‎
Lines changed: 46 additions & 24 deletions b/‎benchmarks/cuda_bindings/runner/main.py‎
Lines changed: 46 additions & 24 deletions
diff --git a/‎benchmarks/cuda_bindings/tests/test_runner.py‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/cuda_bindings/tests/test_runner.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/cuda_core/.gitignore‎
Lines changed: 16 additions & 0 deletions b/‎benchmarks/cuda_core/.gitignore‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎benchmarks/cuda_core/AGENTS.md‎
Lines changed: 11 additions & 0 deletions b/‎benchmarks/cuda_core/AGENTS.md‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎benchmarks/cuda_core/README.md‎
Lines changed: 73 additions & 0 deletions b/‎benchmarks/cuda_core/README.md‎
Lines changed: 73 additions & 0 deletions
diff --git a/‎benchmarks/cuda_core/benchmarks/bench_ctx_device.py‎
Lines changed: 54 additions & 0 deletions b/‎benchmarks/cuda_core/benchmarks/bench_ctx_device.py‎
Lines changed: 54 additions & 0 deletions
@@ -16,30 +16,30 @@
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
 BENCH_DIR = PROJECT_ROOT / "benchmarks"
 DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json"
+DEFAULT_MODULE_NAME_PREFIX = "cuda_bindings_bench"
 # Env var used to propagate the --benchmark filter from the parent to pyperf
 # worker subprocesses. pyperf reconstructs worker argv from scratch and drops
 # custom flags like --benchmark, so without this the worker would register the
 # full bench list and pyperf would run the wrong bench by task index.
-BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER"
+DEFAULT_BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER"
 
-PYPERF_INHERITED_ENV_VARS = (
+BASE_PYPERF_INHERITED_ENV_VARS = (
     "CUDA_HOME",
     "CUDA_PATH",
     "CUDA_VISIBLE_DEVICES",
     "LD_LIBRARY_PATH",
     "NVIDIA_VISIBLE_DEVICES",
-    BENCH_FILTER_ENV_VAR,
 )
 _MODULE_CACHE: dict[Path, ModuleType] = {}
 
 
-def load_module(module_path: Path) -> ModuleType:
+def load_module(module_path: Path, module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX) -> ModuleType:
     module_path = module_path.resolve()
     cached_module = _MODULE_CACHE.get(module_path)
     if cached_module is not None:
         return cached_module
 
-    module_name = f"cuda_bindings_bench_{module_path.stem}"
+    module_name = f"{module_name_prefix}_{module_path.stem}"
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     if spec is None or spec.loader is None:
         raise RuntimeError(f"Failed to load benchmark module: {module_path}")
@@ -64,13 +64,17 @@ def _discover_module_functions(module_path: Path) -> list[str]:
     ]
 
 
-def _lazy_benchmark(module_path: Path, function_name: str) -> Callable[[int], float]:
+def _lazy_benchmark(
+    module_path: Path,
+    function_name: str,
+    module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
+) -> Callable[[int], float]:
     loaded_function: Callable[[int], float] | None = None
 
     def run(loops: int) -> float:
         nonlocal loaded_function
         if loaded_function is None:
-            module = load_module(module_path)
+            module = load_module(module_path, module_name_prefix=module_name_prefix)
             loaded_function = getattr(module, function_name)
         return loaded_function(loops)
 
@@ -86,6 +90,7 @@ def run(loops: int) -> float:
 def _collect_skipped_benchmarks(
     bench_ids: list[str],
     registry: dict[str, Callable[[int], float]],
+    module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
 ) -> set[str]:
     """Return bench IDs that the owning module has marked as unsupported.
 
@@ -106,29 +111,37 @@ def _collect_skipped_benchmarks(
             continue
         module = loaded_modules.get(module_path)
         if module is None:
-            module = load_module(module_path)
+            module = load_module(module_path, module_name_prefix=module_name_prefix)
             loaded_modules[module_path] = module
         module_skip = getattr(module, "SKIPPED_BENCHMARKS", None)
         if module_skip and function_name in module_skip:
             skipped.add(bench_id)
     return skipped
 
 
-def discover_benchmarks() -> dict[str, Callable[[int], float]]:
+def discover_benchmarks(
+    bench_dir: Path | None = None,
+    module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
+) -> dict[str, Callable[[int], float]]:
     """Discover bench_ functions.
 
     Each bench_ function must have the signature: bench_*(loops: int) -> float
     where it calls the operation `loops` times and returns the total elapsed
     time in seconds (using time.perf_counter).
     """
+    # Resolve the default inside the call so tests (and embedders) can
+    # monkeypatch ``BENCH_DIR`` at the module level — Python binds default
+    # args at def-time, so a literal default would ignore later patches.
+    if bench_dir is None:
+        bench_dir = BENCH_DIR
     registry: dict[str, Callable[[int], float]] = {}
-    for module_path in sorted(BENCH_DIR.glob("bench_*.py")):
+    for module_path in sorted(bench_dir.glob("bench_*.py")):
         module_name = module_path.stem
         for function_name in _discover_module_functions(module_path):
             bench_id = benchmark_id(module_name, function_name)
             if bench_id in registry:
                 raise ValueError(f"Duplicate benchmark ID discovered: {bench_id}")
-            registry[bench_id] = _lazy_benchmark(module_path, function_name)
+            registry[bench_id] = _lazy_benchmark(module_path, function_name, module_name_prefix=module_name_prefix)
     return registry
 
 
@@ -152,7 +165,10 @@ def _split_env_vars(arg_value: str) -> list[str]:
     return [env_var for env_var in arg_value.split(",") if env_var]
 
 
-def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
+def ensure_pyperf_worker_env(
+    argv: list[str],
+    extra_env_vars: tuple[str, ...] = (DEFAULT_BENCH_FILTER_ENV_VAR,),
+) -> list[str]:
     if "--copy-env" in argv:
         return list(argv)
 
@@ -175,7 +191,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
     if skip_next:
         raise ValueError("Missing value for --inherit-environ")
 
-    for env_var in PYPERF_INHERITED_ENV_VARS:
+    for env_var in (*BASE_PYPERF_INHERITED_ENV_VARS, *extra_env_vars):
         if env_var in os.environ:
             inherited_env.append(env_var)
 
@@ -190,7 +206,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
     return cleaned
 
 
-def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
+def parse_args(argv: list[str], default_output: Path = DEFAULT_OUTPUT) -> tuple[argparse.Namespace, list[str]]:
     parser = argparse.ArgumentParser(add_help=False)
     parser.add_argument(
         "--benchmark",
@@ -207,19 +223,25 @@ def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
         "-o",
         "--output",
         type=Path,
-        default=DEFAULT_OUTPUT,
-        help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})",
+        default=default_output,
+        help=f"JSON output file path (default: {default_output.name})",
     )
     parsed, remaining = parser.parse_known_args(argv)
     return parsed, remaining
 
 
-def main() -> None:
-    parsed, remaining_argv = parse_args(sys.argv[1:])
+def main(
+    *,
+    bench_dir: Path = BENCH_DIR,
+    default_output: Path = DEFAULT_OUTPUT,
+    module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
+    bench_filter_env_var: str = DEFAULT_BENCH_FILTER_ENV_VAR,
+) -> None:
+    parsed, remaining_argv = parse_args(sys.argv[1:], default_output=default_output)
 
-    registry = discover_benchmarks()
+    registry = discover_benchmarks(bench_dir=bench_dir, module_name_prefix=module_name_prefix)
     if not registry:
-        raise RuntimeError(f"No benchmark functions found in {BENCH_DIR}")
+        raise RuntimeError(f"No benchmark functions found in {bench_dir}")
 
     if parsed.list:
         for bench_id in sorted(registry):
@@ -231,7 +253,7 @@ def main() -> None:
     # the wrong bench. pyperf drops unknown CLI flags when spawning workers,
     # so fall back to an env var carrying the filter.
     requested = list(parsed.benchmark)
-    env_filter = os.environ.get(BENCH_FILTER_ENV_VAR, "")
+    env_filter = os.environ.get(bench_filter_env_var, "")
     if not requested and env_filter:
         requested = [bid for bid in env_filter.split(",") if bid]
 
@@ -243,21 +265,21 @@ def main() -> None:
             raise ValueError(f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}")
         benchmark_ids = requested
         # Propagate to any pyperf worker we're about to spawn.
-        os.environ[BENCH_FILTER_ENV_VAR] = ",".join(benchmark_ids)
+        os.environ[bench_filter_env_var] = ",".join(benchmark_ids)
     else:
         benchmark_ids = sorted(registry)
 
     # Strip any --output args to avoid conflicts with our output handling.
     output_path = parsed.output.resolve()
     remaining_argv = strip_pyperf_output_args(remaining_argv)
-    remaining_argv = ensure_pyperf_worker_env(remaining_argv)
+    remaining_argv = ensure_pyperf_worker_env(remaining_argv, extra_env_vars=(bench_filter_env_var,))
     is_worker = "--worker" in remaining_argv
 
     # Drop benchmarks that the owning module has marked as unavailable on
     # this driver/device. Without this step a single unsupported bench
     # (e.g. TMA on a pre-Hopper GPU) would abort the whole pyperf run,
     # since pyperf treats a raised exception as a fatal worker failure.
-    skipped = _collect_skipped_benchmarks(benchmark_ids, registry)
+    skipped = _collect_skipped_benchmarks(benchmark_ids, registry, module_name_prefix=module_name_prefix)
     if skipped and not is_worker:
         for bench_id in sorted(skipped):
             print(f"Skipping {bench_id}: unsupported on this driver/device", file=sys.stderr)
 
@@ -135,7 +135,7 @@ def test_discover_benchmarks_is_lazy(monkeypatch, tmp_path):
 def test_ensure_pyperf_worker_env_preserves_existing_args(monkeypatch):
     runner_main = load_runner_main(monkeypatch)
 
-    for env_var in runner_main.PYPERF_INHERITED_ENV_VARS:
+    for env_var in runner_main.BASE_PYPERF_INHERITED_ENV_VARS:
         monkeypatch.delenv(env_var, raising=False)
     monkeypatch.setenv("CUDA_PATH", "/opt/cuda")
     monkeypatch.setenv("LD_LIBRARY_PATH", "/opt/cuda/lib64")
 
@@ -0,0 +1,16 @@
+# Build artifacts
+.build/
+__pycache__/
+
+# Benchmark results
+*.json
+.benchmarks/
+
+# Pixi environments
+.pixi/
+
+# Override root .gitignore *.cpp rule (which targets Cython-generated files)
+!benchmarks/cpp/*.cpp
+
+results-python.json
+results-cpp.json
@@ -0,0 +1,11 @@
+# cuda.core benchmarks
+
+Read the README.md in this directory for more details about the benchmarks.
+
+When generating code, verify that the code is correct against the cuda-core source,
+which can be found in ../../cuda_core.
+
+This suite shares the pyperf runner with `../cuda_bindings/` via a sys.path
+insert in `run_pyperf.py`. The per-suite setup (`runtime.py`, the `benchmarks/`
+module files) lives here. Benchmark IDs are kept identical to the cuda.bindings
+suite so `compare.py` can diff them directly.
@@ -0,0 +1,73 @@
+# cuda.core benchmarks
+
+These benchmarks measure the latency overhead of the `cuda.core` public API
+on top of `cuda.bindings`. Every benchmark ID here has a 1:1 counterpart in
+`../cuda_bindings/benchmarks/` so a `compare.py` run produces a side-by-side
+"bindings vs core" overhead table for every operation.
+
+This suite is **not** a throughput benchmark and does not test kernel
+performance — it measures Python-side call overhead only. No C++ baseline
+is built or run for `cuda.core`: the comparative baseline is the
+`cuda.bindings` Python results file at
+`../cuda_bindings/results-python.json`.
+
+The pyperf runner (`runner/main.py`) is shared with the cuda.bindings
+suite via a `sys.path` insert in `run_pyperf.py`; only the per-suite
+`runtime.py` and `benchmarks/*.py` live here.
+
+## Usage
+
+Requires pixi.
+
+Environments:
+
+- `wheel`: Installs the released `cuda-core` from conda-forge.
+- `source`: Installs `cuda-core` and `cuda-bindings` from the in-tree
+  sources, so local changes are exercised.
+
+Tasks:
+
+- `bench`: Runs the full suite.
+- `bench-smoke-test`: Runs each bench with `--debug-single-value` for
+  a quick smoke check (not meaningful for timing).
+- `bench-compare`: Prints a side-by-side table against
+  `../cuda_bindings/results-python.json`.
+
+### System tuning
+
+For more stable results on Linux, tune the system before running.
+See: https://pyperf.readthedocs.io/en/latest/system.html#system
+
+```bash
+pixi run -e wheel -- python -m pyperf system show
+$(pixi run -e wheel -- which python) -m pyperf system tune
+```
+
+### Running benchmarks
+
+```bash
+# Wheel env
+pixi run -e wheel bench
+pixi run -e wheel bench --min-time 0.1
+
+# Source env (picks up local cuda.core / cuda.bindings changes)
+pixi run -e source bench
+
+# Side-by-side comparison vs cuda.bindings
+pixi run -e wheel bench-compare
+```
+
+Results are saved to `results-python.json` in this directory. To compare
+against the cuda.bindings baseline, run that suite's `bench` task first so
+that `../cuda_bindings/results-python.json` exists.
+
+## Output JSON and analysis
+
+The suite uses [pyperf](https://pyperf.readthedocs.io/en/latest/). The
+output JSON is pyperf-compatible:
+
+```bash
+pixi run -e wheel -- python -m pyperf stats results-python.json
+pixi run -e wheel -- python -m pyperf compare_to \
+    ../cuda_bindings/results-python.json results-python.json
+```
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import time
+
+from runtime import ensure_device
+
+from cuda.core import Device
+
+DEV = ensure_device()
+
+
+def bench_ctx_get_current(loops: int) -> float:  # returns total elapsed seconds for `loops` Device() calls
+    # Device() with no args returns the TLS-cached "current" device.
+    _fn = Device  # bind to a local before timing so the loop does not repeat the global lookup
+
+    t0 = time.perf_counter()  # timed region starts here; only the loop below is measured
+    for _ in range(loops):
+        _fn()  # result is discarded on purpose: only the call cost matters
+    return time.perf_counter() - t0
+
+
+def bench_ctx_set_current(loops: int) -> float:  # returns total elapsed seconds for `loops` set_current() calls
+    _fn = DEV.set_current  # resolve the bound method once, outside the timed region
+
+    t0 = time.perf_counter()  # timed region starts here; only the loop below is measured
+    for _ in range(loops):
+        _fn()  # repeatedly re-selects the module-level DEV obtained from ensure_device()
+    return time.perf_counter() - t0
+
+
+def bench_device_get(loops: int) -> float:  # returns total elapsed seconds for `loops` Device(0) calls
+    # Device(id) hits the same TLS cache after the first construction.
+    _fn = Device  # bind to a local before timing so the loop does not repeat the global lookup
+
+    t0 = time.perf_counter()  # timed region starts here; only the loop below is measured
+    for _ in range(loops):
+        _fn(0)  # device ordinal is hard-coded to 0; NOTE(review): presumably the same device as DEV, confirm in runtime.py
+    return time.perf_counter() - t0
+
+
+def bench_device_get_attribute(loops: int) -> float:  # returns total elapsed seconds for `loops` attribute reads
+    # Matches the cuda.bindings bench's CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
+    # call. cuda.core caches this attribute in DeviceProperties, so every
+    # iteration past the first is a dict lookup rather than a driver call
+    # — the bench therefore measures the user-visible cost of the public
+    # API, which is legitimately faster than cuda.bindings here.
+    _props = DEV.properties  # fetched once, outside the timed region, so only the attribute read is measured
+
+    t0 = time.perf_counter()  # timed region starts here; only the loop below is measured
+    for _ in range(loops):
+        _props.compute_capability_major  # noqa: B018
+    return time.perf_counter() - t0