fix(bindings): keep benchmark discovery from loading NVRTC

rwgk · danielfrg · commit 12dc9dbe2bb8 · 2026-04-14T10:32:44.000-05:00
The Python benchmark smoke test imported `bench_launch` during discovery, which compiled the kernels before any `pyperf` worker had started. This made the local-CTK Linux lanes fail when NVRTC was not yet visible to that process. Discover benchmark IDs without importing GPU modules, defer launch setup until a launch benchmark actually runs, and preserve CUDA-related environment variables for worker processes.

Made-with: Cursor
diff --git a/cuda_bindings/benchmarks/benchmarks/bench_launch.py b/cuda_bindings/benchmarks/benchmarks/bench_launch.py
@@ -5,13 +5,11 @@
 import ctypes
 import time
 
-from runner.runtime import alloc_persistent, compile_and_load, ensure_context
+from runner.runtime import alloc_persistent, assert_drv, compile_and_load
 
 from cuda.bindings import driver as cuda
 
-ensure_context()
-
-# Compile kernels
+# Compile kernels lazily so benchmark discovery does not need NVRTC.
 KERNEL_SOURCE = """\
 extern "C" __global__ void empty_kernel() { return; }
 extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; }
@@ -33,28 +31,57 @@
 { *F = 0; }
 """
 
-MODULE = compile_and_load(KERNEL_SOURCE)
+MODULE = None
+EMPTY_KERNEL = None
+SMALL_KERNEL = None
+KERNEL_16_ARGS = None
+STREAM = None
+FLOAT_PTR = None
+INT_PTRS = None
+_VAL_PS = None
+PACKED_16 = None
+
+
+def _ensure_launch_state() -> None:
+    global MODULE, EMPTY_KERNEL, SMALL_KERNEL, KERNEL_16_ARGS, STREAM
+    global FLOAT_PTR, INT_PTRS, _VAL_PS, PACKED_16
+
+    if EMPTY_KERNEL is not None:
+        return
+
+    module = compile_and_load(KERNEL_SOURCE)
+
+    err, empty_kernel = cuda.cuModuleGetFunction(module, b"empty_kernel")
+    assert_drv(err)
+    err, small_kernel = cuda.cuModuleGetFunction(module, b"small_kernel")
+    assert_drv(err)
+    err, kernel_16_args = cuda.cuModuleGetFunction(module, b"small_kernel_16_args")
+    assert_drv(err)
 
-# Get kernel handles
-_err, EMPTY_KERNEL = cuda.cuModuleGetFunction(MODULE, b"empty_kernel")
-_err, SMALL_KERNEL = cuda.cuModuleGetFunction(MODULE, b"small_kernel")
-_err, KERNEL_16_ARGS = cuda.cuModuleGetFunction(MODULE, b"small_kernel_16_args")
+    err, stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
+    assert_drv(err)
 
-# Create a non-blocking stream for launches
-_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
+    float_ptr = alloc_persistent(ctypes.sizeof(ctypes.c_float))
+    int_ptrs = tuple(alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16))
 
-# Allocate device memory for kernel arguments
-FLOAT_PTR = alloc_persistent(ctypes.sizeof(ctypes.c_float))
-INT_PTRS = [alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16)]
+    val_ps = [ctypes.c_void_p(int(ptr)) for ptr in int_ptrs]
+    packed_16 = (ctypes.c_void_p * 16)()
+    for index, value_ptr in enumerate(val_ps):
+        packed_16[index] = ctypes.addressof(value_ptr)
 
-# Pre-pack ctypes params for the pre-packed benchmark
-_val_ps = [ctypes.c_void_p(int(p)) for p in INT_PTRS]
-PACKED_16 = (ctypes.c_void_p * 16)()
-for _i in range(16):
-    PACKED_16[_i] = ctypes.addressof(_val_ps[_i])
+    MODULE = module
+    EMPTY_KERNEL = empty_kernel
+    SMALL_KERNEL = small_kernel
+    KERNEL_16_ARGS = kernel_16_args
+    STREAM = stream
+    FLOAT_PTR = float_ptr
+    INT_PTRS = int_ptrs
+    _VAL_PS = val_ps
+    PACKED_16 = packed_16
 
 
 def bench_launch_empty_kernel(loops: int) -> float:
+    _ensure_launch_state()
     _cuLaunchKernel = cuda.cuLaunchKernel
     _kernel = EMPTY_KERNEL
     _stream = STREAM
@@ -66,6 +93,7 @@ def bench_launch_empty_kernel(loops: int) -> float:
 
 
 def bench_launch_small_kernel(loops: int) -> float:
+    _ensure_launch_state()
     _cuLaunchKernel = cuda.cuLaunchKernel
     _kernel = SMALL_KERNEL
     _stream = STREAM
@@ -79,11 +107,12 @@ def bench_launch_small_kernel(loops: int) -> float:
 
 
 def bench_launch_16_args(loops: int) -> float:
+    _ensure_launch_state()
     _cuLaunchKernel = cuda.cuLaunchKernel
     _kernel = KERNEL_16_ARGS
     _stream = STREAM
-    _args = tuple(INT_PTRS)
-    _arg_types = tuple([None] * 16)
+    _args = INT_PTRS
+    _arg_types = (None,) * 16
 
     t0 = time.perf_counter()
     for _ in range(loops):
@@ -92,6 +121,7 @@ def bench_launch_16_args(loops: int) -> float:
 
 
 def bench_launch_16_args_pre_packed(loops: int) -> float:
+    _ensure_launch_state()
     _cuLaunchKernel = cuda.cuLaunchKernel
     _kernel = KERNEL_16_ARGS
     _stream = STREAM
diff --git a/cuda_bindings/benchmarks/runner/main.py b/cuda_bindings/benchmarks/runner/main.py
@@ -3,8 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import argparse
+import ast
 import importlib.util
-import inspect
+import os
 import sys
 from collections.abc import Callable
 from pathlib import Path
@@ -15,15 +16,29 @@
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
 BENCH_DIR = PROJECT_ROOT / "benchmarks"
 DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json"
+PYPERF_INHERITED_ENV_VARS = (
+    "CUDA_HOME",
+    "CUDA_PATH",
+    "CUDA_VISIBLE_DEVICES",
+    "LD_LIBRARY_PATH",
+    "NVIDIA_VISIBLE_DEVICES",
+)
+_MODULE_CACHE: dict[Path, ModuleType] = {}
 
 
 def load_module(module_path: Path) -> ModuleType:
+    module_path = module_path.resolve()
+    cached_module = _MODULE_CACHE.get(module_path)
+    if cached_module is not None:
+        return cached_module
+
     module_name = f"cuda_bindings_bench_{module_path.stem}"
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     if spec is None or spec.loader is None:
         raise RuntimeError(f"Failed to load benchmark module: {module_path}")
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
+    _MODULE_CACHE[module_path] = module
     return module
 
 
@@ -33,6 +48,29 @@ def benchmark_id(module_name: str, function_name: str) -> str:
     return f"{module_suffix}.{suffix}"
 
 
+def _discover_module_functions(module_path: Path) -> list[str]:
+    tree = ast.parse(module_path.read_text(encoding="utf-8"), filename=str(module_path))
+    return [
+        node.name
+        for node in tree.body
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("bench_")
+    ]
+
+
+def _lazy_benchmark(module_path: Path, function_name: str) -> Callable[[int], float]:
+    loaded_function: Callable[[int], float] | None = None
+
+    def run(loops: int) -> float:
+        nonlocal loaded_function
+        if loaded_function is None:
+            module = load_module(module_path)
+            loaded_function = getattr(module, function_name)
+        return loaded_function(loops)
+
+    run.__name__ = function_name
+    return run
+
+
 def discover_benchmarks() -> dict[str, Callable[[int], float]]:
     """Discover bench_ functions.
 
@@ -42,24 +80,19 @@ def discover_benchmarks() -> dict[str, Callable[[int], float]]:
     """
     registry: dict[str, Callable[[int], float]] = {}
     for module_path in sorted(BENCH_DIR.glob("bench_*.py")):
-        module = load_module(module_path)
         module_name = module_path.stem
-        for function_name, function in inspect.getmembers(module, inspect.isfunction):
-            if not function_name.startswith("bench_"):
-                continue
-            if function.__module__ != module.__name__:
-                continue
+        for function_name in _discover_module_functions(module_path):
             bench_id = benchmark_id(module_name, function_name)
             if bench_id in registry:
                 raise ValueError(f"Duplicate benchmark ID discovered: {bench_id}")
-            registry[bench_id] = function
+            registry[bench_id] = _lazy_benchmark(module_path, function_name)
     return registry
 
 
 def strip_pyperf_output_args(argv: list[str]) -> list[str]:
     cleaned: list[str] = []
     skip_next = False
-    for i, arg in enumerate(argv):
+    for arg in argv:
         if skip_next:
             skip_next = False
             continue
@@ -72,6 +105,48 @@ def strip_pyperf_output_args(argv: list[str]) -> list[str]:
     return cleaned
 
 
+def _split_env_vars(arg_value: str) -> list[str]:
+    return [env_var for env_var in arg_value.split(",") if env_var]
+
+
+def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
+    if "--copy-env" in argv:
+        return list(argv)
+
+    inherited_env: list[str] = []
+    cleaned: list[str] = []
+    skip_next = False
+    for arg in argv:
+        if skip_next:
+            inherited_env.extend(_split_env_vars(arg))
+            skip_next = False
+            continue
+        if arg == "--inherit-environ":
+            skip_next = True
+            continue
+        if arg.startswith("--inherit-environ="):
+            inherited_env.extend(_split_env_vars(arg.partition("=")[2]))
+            continue
+        cleaned.append(arg)
+
+    if skip_next:
+        raise ValueError("Missing value for --inherit-environ")
+
+    for env_var in PYPERF_INHERITED_ENV_VARS:
+        if env_var in os.environ:
+            inherited_env.append(env_var)
+
+    deduped_env: list[str] = []
+    for env_var in inherited_env:
+        if env_var not in deduped_env:
+            deduped_env.append(env_var)
+
+    if deduped_env:
+        cleaned.extend(["--inherit-environ", ",".join(deduped_env)])
+
+    return cleaned
+
+
 def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
     parser = argparse.ArgumentParser(add_help=False)
     parser.add_argument(
@@ -118,12 +193,13 @@ def main() -> None:
     else:
         benchmark_ids = sorted(registry)
 
-    # Strip any --output args to avoid conflicts with our output handling
+    # Strip any --output args to avoid conflicts with our output handling.
     output_path = parsed.output.resolve()
     remaining_argv = strip_pyperf_output_args(remaining_argv)
+    remaining_argv = ensure_pyperf_worker_env(remaining_argv)
     is_worker = "--worker" in remaining_argv
 
-    # Delete the file so this run starts fresh
+    # Delete the file so this run starts fresh.
     if not is_worker:
         output_path.unlink(missing_ok=True)
 
diff --git a/cuda_bindings/benchmarks/tests/test_runner.py b/cuda_bindings/benchmarks/tests/test_runner.py