Skip to content

Commit 86ab025

Browse files
rparolin and claude committed
[FEA]: Add find_nvidia_dynamic_lib to locate a DSO without loading it
Closes #757. Resolution is delegated to `load_nvidia_dynamic_lib` running in a fresh Python subprocess so the caller's process is left untouched while results stay consistent with the loader's full search cascade. Refactors the existing canary subprocess plumbing into shared helpers (`run_dynamic_lib_subprocess`, `raise_subprocess_child_process_error`) in `subprocess_protocol.py` so canary and find paths share one implementation. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 8a83a4f commit 86ab025

8 files changed

Lines changed: 320 additions & 76 deletions

File tree

cuda_pathfinder/cuda/pathfinder/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
from cuda.pathfinder._dynamic_libs.load_dl_common import (
1919
DynamicLibUnknownError as DynamicLibUnknownError,
2020
)
21+
from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import (
22+
find_nvidia_dynamic_lib as find_nvidia_dynamic_lib,
23+
)
2124
from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL as LoadedDL
2225
from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import load_nvidia_dynamic_lib as load_nvidia_dynamic_lib
2326
from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (

cuda_pathfinder/cuda/pathfinder/_dynamic_libs/dynamic_lib_subprocess.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,17 @@
1313
from cuda.pathfinder._dynamic_libs.platform_loader import LOADER
1414
from cuda.pathfinder._dynamic_libs.subprocess_protocol import (
1515
MODE_CANARY,
16+
MODE_FIND,
1617
MODE_LOAD,
1718
STATUS_NOT_FOUND,
1819
STATUS_OK,
1920
VALID_MODES,
2021
format_dynamic_lib_subprocess_payload,
2122
)
2223

23-
# NOTE: The main entrypoint (below) serves both production (canary probe)
24-
# and tests (full loader). Keeping them together ensures a single subprocess
25-
# protocol and CLI surface, so the test subprocess stays aligned with the
26-
# production flow while avoiding a separate test-only module.
27-
# Any production-code impact is negligible since the extra logic only runs
28-
# in the subprocess entrypoint and only in test mode.
24+
# The main entrypoint serves three modes — canary probe, find-without-load,
25+
# and (test-only) full-loader exercise — behind a single subprocess protocol
26+
# so test and production flows stay aligned.
2927

3028

3129
def _probe_canary_abs_path(libname: str) -> str | None:
@@ -94,6 +92,22 @@ def probe_dynamic_lib_and_print_json(libname: str, mode: str) -> None:
9492
print(format_dynamic_lib_subprocess_payload(status, abs_path))
9593
return
9694

95+
if mode == MODE_FIND:
96+
from cuda.pathfinder import load_nvidia_dynamic_lib
97+
98+
try:
99+
loaded = load_nvidia_dynamic_lib(libname)
100+
except DynamicLibNotFoundError as exc:
101+
error = {"type": exc.__class__.__name__, "message": str(exc)}
102+
print(format_dynamic_lib_subprocess_payload(STATUS_NOT_FOUND, None, error=error))
103+
return
104+
abs_path = loaded.abs_path
105+
if not isinstance(abs_path, str):
106+
raise RuntimeError(f"loaded.abs_path is not a string: {abs_path!r}")
107+
_validate_abs_path(abs_path)
108+
print(format_dynamic_lib_subprocess_payload(STATUS_OK, abs_path))
109+
return
110+
97111
if mode == MODE_LOAD:
98112
# Test-only path: exercises full loader behavior in isolation.
99113
try:
cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Locate an NVIDIA dynamic library on disk without loading it in this process.
5+
6+
Resolution is delegated to ``load_nvidia_dynamic_lib`` running in a fresh
7+
Python subprocess. The full loader runs (including ``dlopen`` /
8+
``LoadLibraryExW``) but only inside the child, so the caller's process is left
9+
untouched.
10+
"""
11+
12+
from __future__ import annotations
13+
14+
import functools
15+
16+
from cuda.pathfinder._dynamic_libs import load_nvidia_dynamic_lib as _load_module
17+
from cuda.pathfinder._dynamic_libs.load_dl_common import (
18+
DynamicLibNotAvailableError,
19+
DynamicLibNotFoundError,
20+
DynamicLibUnknownError,
21+
)
22+
from cuda.pathfinder._dynamic_libs.subprocess_protocol import (
23+
MODE_FIND,
24+
STATUS_OK,
25+
run_dynamic_lib_subprocess,
26+
)
27+
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
28+
29+
# The subprocess runs the full loader (site-packages / conda / CUDA_PATH /
30+
# canary cascade), which can be substantially slower than a single canary
31+
# probe. Bound it so a wedged child cannot hang the caller indefinitely.
32+
_FIND_SUBPROCESS_TIMEOUT_SECONDS = 120.0 if IS_WINDOWS else 30.0
33+
34+
35+
@functools.cache
36+
def find_nvidia_dynamic_lib(libname: str) -> str:
37+
"""Return the absolute path to an NVIDIA dynamic library without loading it.
38+
39+
Resolution is performed by running :func:`load_nvidia_dynamic_lib` in a
40+
fresh Python subprocess and reporting back the resolved absolute path.
41+
The caller's process does **not** dlopen / LoadLibrary the library.
42+
43+
Args:
44+
libname: Short name of the library (e.g., ``"cufile"``,
45+
``"nvJitLink"``, ``"cudart"``).
46+
47+
Returns:
48+
The absolute path the loader resolves in a fresh subprocess (see Notes for how this can differ from an in-process load).
49+
50+
Raises:
51+
DynamicLibUnknownError: If ``libname`` is not a recognized library.
52+
DynamicLibNotAvailableError: If ``libname`` is recognized but not
53+
supported on this platform.
54+
DynamicLibNotFoundError: If the library cannot be located.
55+
56+
Notes:
57+
Because resolution happens in a separate process, results may differ
58+
from an in-process ``load_nvidia_dynamic_lib`` if the caller's process
59+
has DSOs loaded with custom ``RPATH``s or has already loaded a matching
60+
library by some other mechanism. The intent is to report the path the
61+
loader would pick when not influenced by other DSOs in the caller.
62+
"""
63+
# Indirect attribute access (not `from ... import`) so tests can
64+
# monkeypatch the source-of-truth tables in `load_nvidia_dynamic_lib`.
65+
if libname not in _load_module._ALL_KNOWN_LIBNAMES:
66+
raise DynamicLibUnknownError(
67+
f"Unknown library name: {libname!r}. Known names: {sorted(_load_module._ALL_KNOWN_LIBNAMES)}"
68+
)
69+
if libname not in _load_module._ALL_SUPPORTED_LIBNAMES:
70+
raise DynamicLibNotAvailableError(
71+
f"Library name {libname!r} is known but not available on {_load_module._PLATFORM_NAME}. "
72+
f"Supported names on {_load_module._PLATFORM_NAME}: {sorted(_load_module._ALL_SUPPORTED_LIBNAMES)}"
73+
)
74+
75+
payload = run_dynamic_lib_subprocess(
76+
MODE_FIND,
77+
libname,
78+
timeout=_FIND_SUBPROCESS_TIMEOUT_SECONDS,
79+
error_label=f"find_nvidia_dynamic_lib subprocess for {libname!r}",
80+
)
81+
if payload.status == STATUS_OK:
82+
assert payload.abs_path is not None
83+
return payload.abs_path
84+
85+
message = (
86+
payload.error["message"]
87+
if payload.error and "message" in payload.error
88+
else f"find_nvidia_dynamic_lib could not locate {libname!r}"
89+
)
90+
raise DynamicLibNotFoundError(message)

cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py

Lines changed: 4 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
import functools
77
import struct
8-
import subprocess
98
import sys
109
from typing import TYPE_CHECKING
1110

@@ -27,12 +26,9 @@
2726
run_find_steps,
2827
)
2928
from cuda.pathfinder._dynamic_libs.subprocess_protocol import (
30-
DYNAMIC_LIB_SUBPROCESS_CWD,
3129
MODE_CANARY,
3230
STATUS_OK,
33-
DynamicLibSubprocessPayload,
34-
build_dynamic_lib_subprocess_command,
35-
parse_dynamic_lib_subprocess_payload,
31+
run_dynamic_lib_subprocess,
3632
)
3733
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
3834

@@ -74,61 +70,17 @@ def _load_driver_lib_no_cache(desc: LibDescriptor) -> LoadedDL:
7470
)
7571

7672

77-
def _coerce_subprocess_output(output: str | bytes | None) -> str:
78-
if isinstance(output, bytes):
79-
return output.decode(errors="replace")
80-
return "" if output is None else output
81-
82-
83-
def _raise_canary_probe_child_process_error(
84-
*,
85-
returncode: int | None = None,
86-
timeout: float | None = None,
87-
stderr: str | bytes | None = None,
88-
) -> None:
89-
if timeout is None:
90-
error_line = f"Canary probe child process exited with code {returncode}."
91-
else:
92-
error_line = f"Canary probe child process timed out after {timeout} seconds."
93-
raise ChildProcessError(
94-
f"{error_line}\n"
95-
"--- stderr-from-child-process ---\n"
96-
f"{_coerce_subprocess_output(stderr)}"
97-
"<end-of-stderr-from-child-process>\n"
98-
)
99-
100-
10173
@functools.cache
10274
def _resolve_system_loaded_abs_path_in_subprocess(
10375
libname: str,
10476
*,
10577
timeout: float = _CANARY_PROBE_TIMEOUT_SECONDS,
10678
) -> str | None:
10779
"""Resolve a canary library's absolute path in a fresh Python subprocess."""
108-
try:
109-
result = subprocess.run( # noqa: S603 - trusted argv: current interpreter + internal probe module
110-
build_dynamic_lib_subprocess_command(MODE_CANARY, libname),
111-
capture_output=True,
112-
text=True,
113-
timeout=timeout,
114-
check=False,
115-
cwd=DYNAMIC_LIB_SUBPROCESS_CWD,
116-
)
117-
except subprocess.TimeoutExpired as exc:
118-
_raise_canary_probe_child_process_error(timeout=exc.timeout, stderr=exc.stderr)
119-
120-
if result.returncode != 0:
121-
_raise_canary_probe_child_process_error(returncode=result.returncode, stderr=result.stderr)
122-
123-
payload: DynamicLibSubprocessPayload = parse_dynamic_lib_subprocess_payload(
124-
result.stdout,
125-
libname=libname,
126-
error_label="Canary probe child process",
80+
payload = run_dynamic_lib_subprocess(
81+
MODE_CANARY, libname, timeout=timeout, error_label="Canary probe child process"
12782
)
128-
abs_path: str | None = payload.abs_path
129-
if payload.status == STATUS_OK:
130-
return abs_path
131-
return None
83+
return payload.abs_path if payload.status == STATUS_OK else None
13284

13385

13486
def _loadable_via_canary_subprocess(libname: str, *, timeout: float = _CANARY_PROBE_TIMEOUT_SECONDS) -> bool:

cuda_pathfinder/cuda/pathfinder/_dynamic_libs/subprocess_protocol.py

Lines changed: 80 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,16 @@
44
from __future__ import annotations
55

66
import json
7+
import subprocess
78
import sys
89
from dataclasses import dataclass
910
from pathlib import Path
10-
from typing import Literal
11+
from typing import Literal, NoReturn
1112

1213
MODE_CANARY: Literal["canary"] = "canary"
1314
MODE_LOAD: Literal["load"] = "load"
14-
VALID_MODES: tuple[Literal["canary"], Literal["load"]] = (MODE_CANARY, MODE_LOAD)
15+
MODE_FIND: Literal["find"] = "find"
16+
VALID_MODES: tuple[Literal["canary"], Literal["load"], Literal["find"]] = (MODE_CANARY, MODE_LOAD, MODE_FIND)
1517

1618
STATUS_OK: Literal["ok"] = "ok"
1719
STATUS_NOT_FOUND: Literal["not-found"] = "not-found"
@@ -24,6 +26,7 @@
2426
class DynamicLibSubprocessPayload:
2527
status: Literal["ok", "not-found"]
2628
abs_path: str | None
29+
error: dict[str, str] | None = None
2730

2831

2932
def format_dynamic_lib_subprocess_payload(
@@ -60,12 +63,78 @@ def parse_dynamic_lib_subprocess_payload(
6063
raise RuntimeError(f"{error_label} emitted unexpected payload for {libname!r}: {payload!r}")
6164
status = payload.get("status")
6265
abs_path = payload.get("abs_path")
63-
if status == STATUS_OK:
64-
if not isinstance(abs_path, str):
65-
raise RuntimeError(f"{error_label} emitted unexpected payload for {libname!r}: {payload!r}")
66-
return DynamicLibSubprocessPayload(status=STATUS_OK, abs_path=abs_path)
67-
if status == STATUS_NOT_FOUND:
68-
if abs_path is not None:
69-
raise RuntimeError(f"{error_label} emitted unexpected payload for {libname!r}: {payload!r}")
70-
return DynamicLibSubprocessPayload(status=STATUS_NOT_FOUND, abs_path=None)
71-
raise RuntimeError(f"{error_label} emitted unexpected payload for {libname!r}: {payload!r}")
66+
error = payload.get("error")
67+
68+
def reject() -> NoReturn:
69+
raise RuntimeError(f"{error_label} emitted unexpected payload for {libname!r}: {payload!r}")
70+
71+
if error is not None and not (
72+
isinstance(error, dict) and all(isinstance(k, str) and isinstance(v, str) for k, v in error.items())
73+
):
74+
reject()
75+
if status == STATUS_OK and isinstance(abs_path, str):
76+
return DynamicLibSubprocessPayload(status=STATUS_OK, abs_path=abs_path, error=error)
77+
if status == STATUS_NOT_FOUND and abs_path is None:
78+
return DynamicLibSubprocessPayload(status=STATUS_NOT_FOUND, abs_path=None, error=error)
79+
reject()
80+
81+
82+
def _coerce_subprocess_output(output: str | bytes | None) -> str:
83+
if isinstance(output, bytes):
84+
return output.decode(errors="replace")
85+
return "" if output is None else output
86+
87+
88+
def raise_subprocess_child_process_error(
89+
error_label: str,
90+
*,
91+
returncode: int | None = None,
92+
timeout: float | None = None,
93+
stdout: str | bytes | None = None,
94+
stderr: str | bytes | None = None,
95+
) -> NoReturn:
96+
if timeout is not None:
97+
first_line = f"{error_label} timed out after {timeout} seconds."
98+
else:
99+
first_line = f"{error_label} exited with code {returncode}."
100+
raise ChildProcessError(
101+
f"{first_line}\n"
102+
"--- stdout-from-child-process ---\n"
103+
f"{_coerce_subprocess_output(stdout)}<end-of-stdout-from-child-process>\n"
104+
"--- stderr-from-child-process ---\n"
105+
f"{_coerce_subprocess_output(stderr)}<end-of-stderr-from-child-process>\n"
106+
)
107+
108+
109+
def run_dynamic_lib_subprocess(
110+
mode: str,
111+
libname: str,
112+
*,
113+
timeout: float,
114+
error_label: str,
115+
) -> DynamicLibSubprocessPayload:
116+
"""Run the dynamic-lib subprocess and parse its payload.
117+
118+
Raises ``ChildProcessError`` if the child times out or exits non-zero;
119+
otherwise returns the parsed payload (which may itself be ``STATUS_NOT_FOUND``).
120+
"""
121+
try:
122+
result = subprocess.run( # noqa: S603 - trusted argv: current interpreter + internal probe module
123+
build_dynamic_lib_subprocess_command(mode, libname),
124+
capture_output=True,
125+
text=True,
126+
timeout=timeout,
127+
check=False,
128+
cwd=DYNAMIC_LIB_SUBPROCESS_CWD,
129+
)
130+
except subprocess.TimeoutExpired as exc:
131+
raise_subprocess_child_process_error(
132+
error_label, timeout=exc.timeout, stdout=exc.stdout, stderr=exc.stderr
133+
)
134+
135+
if result.returncode != 0:
136+
raise_subprocess_child_process_error(
137+
error_label, returncode=result.returncode, stdout=result.stdout, stderr=result.stderr
138+
)
139+
140+
return parse_dynamic_lib_subprocess_payload(result.stdout, libname=libname, error_label=error_label)

cuda_pathfinder/docs/source/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ CUDA bitcode and static libraries.
2020

2121
SUPPORTED_NVIDIA_LIBNAMES
2222
load_nvidia_dynamic_lib
23+
find_nvidia_dynamic_lib
2324
LoadedDL
2425
DynamicLibNotFoundError
2526
DynamicLibUnknownError

0 commit comments

Comments
 (0)