Skip to content

Commit 62349f4

Browse files
cpcloudcursoragentCopilot
authored
feat(pathfinder): support "cuda"/"nvml" driver libs and reject unsupported libnames (#1602)
* feat(pathfinder): support "cuda"/"nvml" driver libs and reject unsupported libnames Add support for loading NVIDIA driver libraries ("cuda", "nvml") via load_nvidia_dynamic_lib(). These are part of the display driver (not the CTK) and use a simplified system-search-only path, skipping site-packages, conda, CUDA_HOME, and canary probe steps. Also reject unrecognized libnames with a ValueError instead of silently falling through to system search, which could produce surprising results for unsupported libs like "cupti". Closes #1288, closes #1564. Co-authored-by: Cursor <cursoragent@cursor.com> * Update cuda_pathfinder/tests/test_load_nvidia_dynamic_lib.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * fix(pathfinder): avoid real lib names in unsupported-libname test Use only obviously fake names to prevent confusion when searching the codebase for real library names like "cupti". Co-authored-by: Cursor <cursoragent@cursor.com> * fix(pathfinder): trim stale name-mapping detail from driver lib comment The duplicated soname/DLL names in the comment can drift from the dict values below; the dict itself is the source of truth. Co-authored-by: Cursor <cursoragent@cursor.com> * fix(pathfinder): remove trivial in/not-in registry tests for driver libnames These just exercise Python's `in` operator against a hardcoded list and don't provide meaningful coverage. Co-authored-by: Cursor <cursoragent@cursor.com> * test(pathfinder): add real-loading tests for driver libs Run "cuda" and "nvml" through the actual OS loader in spawned child processes, following the same pattern as test_load_nvidia_dynamic_lib. Results are tracked via INFO summary lines for CI/QA log inspection. Co-authored-by: Cursor <cursoragent@cursor.com> --------- Co-authored-by: Cursor <cursoragent@cursor.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 3e022ad commit 62349f4

File tree

4 files changed

+232
-10
lines changed

4 files changed

+232
-10
lines changed

cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66
import sys
77

88
from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib
9-
from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL, load_dependencies
9+
from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError, LoadedDL, load_dependencies
10+
from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
11+
SUPPORTED_LINUX_SONAMES,
12+
SUPPORTED_WINDOWS_DLLS,
13+
)
1014
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
1115

1216
if IS_WINDOWS:
@@ -22,8 +26,44 @@
2226
load_with_system_search,
2327
)
2428

29+
# All libnames recognized by load_nvidia_dynamic_lib, across all categories
30+
# (CTK, third-party, driver). Built from the platform-appropriate soname/DLL
31+
# registry so that platform-specific libs (e.g. cufile on Linux) are included
32+
# only where they apply.
33+
_ALL_SUPPORTED_LIBNAMES: frozenset[str] = frozenset(
    # Iterating a dict yields its keys, i.e. the recognized libnames.
    SUPPORTED_WINDOWS_DLLS if IS_WINDOWS else SUPPORTED_LINUX_SONAMES
)

# Driver libraries: shipped with the NVIDIA display driver and expected to be
# on the system linker path. They bypass the CTK search steps (site-packages,
# conda, CUDA_HOME, canary) and go straight to system search.
_DRIVER_ONLY_LIBNAMES: frozenset[str] = frozenset({"cuda", "nvml"})
41+
42+
43+
def _load_driver_lib_no_cache(libname: str) -> LoadedDL:
    """Load an NVIDIA driver library using system search only.

    Driver libraries (libcuda, libnvidia-ml) ship with the NVIDIA display
    driver rather than the CUDA Toolkit, so the CTK search cascade
    (site-packages, conda, CUDA_HOME, canary) is skipped.

    Args:
        libname: Driver library name, e.g. ``"cuda"`` or ``"nvml"``.

    Returns:
        The ``LoadedDL`` reported by the already-loaded check if there is
        one, otherwise the result of the system search.

    Raises:
        DynamicLibNotFoundError: If neither step yields a library.
    """
    # A library that is already loaded in this process takes precedence.
    already_loaded = check_if_already_loaded_from_elsewhere(libname, False)
    if already_loaded is not None:
        return already_loaded

    from_system = load_with_system_search(libname)
    if from_system is None:
        raise DynamicLibNotFoundError(
            f'"{libname}" is an NVIDIA driver library and can only be found via'
            " system search. Ensure the NVIDIA display driver is installed."
        )
    return from_system
61+
2562

2663
def _load_lib_no_cache(libname: str) -> LoadedDL:
64+
if libname in _DRIVER_ONLY_LIBNAMES:
65+
return _load_driver_lib_no_cache(libname)
66+
2767
finder = _FindNvidiaDynamicLib(libname)
2868
abs_path = finder.try_site_packages()
2969
if abs_path is not None:
@@ -83,6 +123,7 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:
83123
https://github.com/NVIDIA/cuda-python/issues/1011
84124
85125
Raises:
126+
ValueError: If ``libname`` is not a recognized library name.
86127
DynamicLibNotFoundError: If the library cannot be found or loaded.
87128
RuntimeError: If Python is not 64-bit.
88129
@@ -123,6 +164,18 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:
123164
124165
- If set, use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order).
125166
167+
**Driver libraries** (``"cuda"``, ``"nvml"``):
168+
169+
These are part of the NVIDIA display driver (not the CUDA Toolkit) and
170+
are always on the system linker path. For these libraries the search
171+
is simplified to:
172+
173+
0. Already loaded in the current process
174+
1. OS default mechanisms (``dlopen`` / ``LoadLibraryW``)
175+
176+
The CTK-specific steps (site-packages, conda, ``CUDA_HOME``, canary
177+
probe) are skipped entirely.
178+
126179
Notes:
127180
The search is performed **per library**. There is currently no mechanism to
128181
guarantee that multiple libraries are all resolved from the same location.
@@ -135,4 +188,6 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:
135188
f" Currently running: {pointer_size_bits}-bit Python"
136189
f" {sys.version_info.major}.{sys.version_info.minor}"
137190
)
191+
if libname not in _ALL_SUPPORTED_LIBNAMES:
192+
raise ValueError(f"Unsupported library name: {libname!r}. Supported names: {sorted(_ALL_SUPPORTED_LIBNAMES)}")
138193
return _load_lib_no_cache(libname)

cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,14 @@
214214
"nvpl_fftw": ("libnvpl_fftw.so.0",),
215215
"nvshmem_host": ("libnvshmem_host.so.3",),
216216
}
217-
SUPPORTED_LINUX_SONAMES = SUPPORTED_LINUX_SONAMES_CTK | SUPPORTED_LINUX_SONAMES_OTHER
217+
# Driver libraries: shipped with the NVIDIA driver, always on the system
218+
# linker path. Only system search is needed (no site-packages / conda /
219+
# CUDA_HOME).
220+
SUPPORTED_LINUX_SONAMES_DRIVER = {
    "cuda": ("libcuda.so.1",),
    "nvml": ("libnvidia-ml.so.1",),
}
# Combined Linux registry: CTK, other, and driver entries.
SUPPORTED_LINUX_SONAMES = (
    SUPPORTED_LINUX_SONAMES_CTK
    | SUPPORTED_LINUX_SONAMES_OTHER
    | SUPPORTED_LINUX_SONAMES_DRIVER
)
218225

219226
# Based on these files:
220227
# cuda_12.0.1_528.33_windows.exe
@@ -338,7 +345,11 @@
338345
"cutensor": ("cutensor.dll",),
339346
"cutensorMg": ("cutensorMg.dll",),
340347
}
341-
SUPPORTED_WINDOWS_DLLS = SUPPORTED_WINDOWS_DLLS_CTK | SUPPORTED_WINDOWS_DLLS_OTHER
348+
# Driver libraries: DLLs registered under the "cuda" and "nvml" libnames.
SUPPORTED_WINDOWS_DLLS_DRIVER = {
    "cuda": ("nvcuda.dll",),
    "nvml": ("nvml.dll",),
}
# Combined Windows registry: CTK, other, and driver entries.
SUPPORTED_WINDOWS_DLLS = (
    SUPPORTED_WINDOWS_DLLS_CTK
    | SUPPORTED_WINDOWS_DLLS_OTHER
    | SUPPORTED_WINDOWS_DLLS_DRIVER
)
342353

343354
LIBNAMES_REQUIRING_OS_ADD_DLL_DIRECTORY = (
344355
"cufft",
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Tests for NVIDIA driver library loading ("cuda", "nvml").
5+
6+
These libraries are part of the display driver, not the CUDA Toolkit.
7+
They use a simplified system-search-only path, skipping site-packages,
8+
conda, CUDA_HOME, and the canary probe.
9+
"""
10+
11+
import json
12+
import os
13+
14+
import pytest
15+
import spawned_process_runner
16+
from child_load_nvidia_dynamic_lib_helper import build_child_process_failed_for_libname_message, child_process_func
17+
18+
from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError, LoadedDL
19+
from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import (
20+
_DRIVER_ONLY_LIBNAMES,
21+
_load_driver_lib_no_cache,
22+
_load_lib_no_cache,
23+
)
24+
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS, quote_for_shell
25+
26+
STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS", "see_what_works")
27+
assert STRICTNESS in ("see_what_works", "all_must_work")
28+
29+
_MODULE = "cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib"
30+
31+
32+
def _make_loaded_dl(path, found_via):
    """Return a fake ``LoadedDL`` for ``path`` labelled with ``found_via``."""
    # The two middle positional fields are fixed dummy values; 0xDEAD is an
    # arbitrary placeholder (presumably the handle slot -- confirm against
    # the LoadedDL definition).
    fake = LoadedDL(path, False, 0xDEAD, found_via)
    return fake
34+
35+
36+
# ---------------------------------------------------------------------------
37+
# _load_driver_lib_no_cache
38+
# ---------------------------------------------------------------------------
39+
40+
41+
def test_driver_lib_returns_already_loaded(mocker):
    """An already-loaded driver lib is returned without running system search."""
    already = LoadedDL("/usr/lib/libcuda.so.1", True, 0xBEEF, "was-already-loaded-from-elsewhere")
    mocker.patch(f"{_MODULE}.check_if_already_loaded_from_elsewhere", return_value=already)
    # Keep the mock returned by mocker.patch so it can be inspected directly,
    # instead of re-importing the module under test to reach it.
    system_search = mocker.patch(f"{_MODULE}.load_with_system_search")

    result = _load_driver_lib_no_cache("cuda")

    assert result is already
    # System search should not have been called.
    system_search.assert_not_called()
53+
54+
55+
def test_driver_lib_falls_through_to_system_search(mocker):
    """With nothing already loaded, the system-search result is returned."""
    expected = _make_loaded_dl("/usr/lib/libcuda.so.1", "system-search")
    stubbed_returns = (
        ("check_if_already_loaded_from_elsewhere", None),
        ("load_with_system_search", expected),
    )
    for target, value in stubbed_returns:
        mocker.patch(f"{_MODULE}.{target}", return_value=value)

    outcome = _load_driver_lib_no_cache("cuda")

    assert outcome is expected
    assert outcome.found_via == "system-search"
64+
65+
66+
def test_driver_lib_raises_when_not_found(mocker):
    """If both lookup steps return None, DynamicLibNotFoundError is raised."""
    for target in ("check_if_already_loaded_from_elsewhere", "load_with_system_search"):
        mocker.patch(f"{_MODULE}.{target}", return_value=None)

    expected_message = "NVIDIA driver library"
    with pytest.raises(DynamicLibNotFoundError, match=expected_message):
        _load_driver_lib_no_cache("nvml")
72+
73+
74+
def test_driver_lib_does_not_search_site_packages(mocker):
    """Driver libs must not go through the CTK search cascade."""
    from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib

    system_result = _make_loaded_dl("/usr/lib/libcuda.so.1", "system-search")
    mocker.patch(f"{_MODULE}.check_if_already_loaded_from_elsewhere", return_value=None)
    mocker.patch(f"{_MODULE}.load_with_system_search", return_value=system_result)
    # Spy on the finder's site-packages step to detect any call into it.
    site_packages_spy = mocker.spy(_FindNvidiaDynamicLib, "try_site_packages")

    _load_driver_lib_no_cache("cuda")

    site_packages_spy.assert_not_called()
85+
86+
87+
# ---------------------------------------------------------------------------
88+
# _load_lib_no_cache dispatches driver libs correctly
89+
# ---------------------------------------------------------------------------
90+
91+
92+
@pytest.mark.parametrize("libname", sorted(_DRIVER_ONLY_LIBNAMES))
def test_load_lib_no_cache_dispatches_to_driver_path(libname, mocker):
    """Every driver libname is routed to ``_load_driver_lib_no_cache``."""
    fake_path = f"/usr/lib/fake_{libname}.so"
    driver_result = _make_loaded_dl(fake_path, "system-search")
    driver_loader = mocker.patch(f"{_MODULE}._load_driver_lib_no_cache", return_value=driver_result)

    assert _load_lib_no_cache(libname) is driver_result
    driver_loader.assert_called_once_with(libname)
101+
102+
103+
def test_load_lib_no_cache_does_not_dispatch_ctk_lib_to_driver_path(mocker):
    """Ensure regular CTK libs don't take the driver shortcut."""
    driver_loader = mocker.patch(f"{_MODULE}._load_driver_lib_no_cache")

    from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib

    # Stub the collaborators of the regular path so it can run without
    # touching the real environment: the finder steps and the already-loaded
    # check report nothing, and system search hands back a fake cudart.
    for finder_step in ("try_site_packages", "try_with_conda_prefix"):
        mocker.patch.object(_FindNvidiaDynamicLib, finder_step, return_value=None)
    mocker.patch(f"{_MODULE}.check_if_already_loaded_from_elsewhere", return_value=None)
    mocker.patch(f"{_MODULE}.load_dependencies")
    fake_cudart = _make_loaded_dl("/usr/lib/libcudart.so.13", "system-search")
    mocker.patch(f"{_MODULE}.load_with_system_search", return_value=fake_cudart)

    _load_lib_no_cache("cudart")

    driver_loader.assert_not_called()
122+
123+
124+
# ---------------------------------------------------------------------------
125+
# Real loading tests (spawned child process for isolation)
126+
# ---------------------------------------------------------------------------
127+
128+
129+
@pytest.mark.parametrize("libname", sorted(_DRIVER_ONLY_LIBNAMES))
def test_real_load_driver_lib(info_summary_append, libname):
    """Load a real driver library in a spawned child process.

    Complements the mock-based tests by going through the actual OS loader.
    Outcomes are recorded through ``info_summary_append`` so they can be
    inspected in CI/QA logs.
    """
    child_timeout = 30 if not IS_WINDOWS else 120
    result = spawned_process_runner.run_in_spawned_child_process(
        child_process_func,
        args=(libname,),
        timeout=child_timeout,
    )

    def fail_with_child_details():
        message = build_child_process_failed_for_libname_message(libname, result)
        raise RuntimeError(message)

    if result.returncode != 0:
        fail_with_child_details()
    assert not result.stderr

    not_found_marker = "CHILD_LOAD_NVIDIA_DYNAMIC_LIB_HELPER_DYNAMIC_LIB_NOT_FOUND_ERROR:"
    if result.stdout.startswith(not_found_marker):
        # A missing library is only an error in the strict mode.
        if STRICTNESS == "all_must_work":
            fail_with_child_details()
        info_summary_append(f"Not found: {libname=!r}")
        return

    # Otherwise stdout carries the JSON-encoded absolute path.
    abs_path = json.loads(result.stdout.rstrip())
    info_summary_append(f"abs_path={quote_for_shell(abs_path)}")
    assert os.path.isfile(abs_path)

cuda_pathfinder/tests/test_load_nvidia_dynamic_lib.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import json
55
import os
66
import platform
7-
from unittest.mock import patch
87

98
import pytest
109
import spawned_process_runner
@@ -62,12 +61,17 @@ def test_supported_libnames_windows_libnames_requiring_os_add_dll_directory_cons
6261
)
6362

6463

65-
def test_runtime_error_on_non_64bit_python():
66-
with (
67-
patch("struct.calcsize", return_value=3), # fake 24-bit pointer
68-
pytest.raises(RuntimeError, match=r"requires 64-bit Python\. Currently running: 24-bit Python"),
69-
):
70-
load_nvidia_dynamic_lib("not_used")
64+
def test_runtime_error_on_non_64bit_python(mocker):
    """A non-64-bit pointer size makes ``load_nvidia_dynamic_lib`` raise."""
    # Ensure this test is not affected by any prior cached calls.
    load_nvidia_dynamic_lib.cache_clear()
    # A calcsize of 3 bytes fakes a 24-bit pointer.
    mocker.patch("struct.calcsize", return_value=3)
    expected_message = r"requires 64-bit Python\. Currently running: 24-bit Python"
    with pytest.raises(RuntimeError, match=expected_message):
        load_nvidia_dynamic_lib("cudart")
70+
71+
72+
def test_unsupported_libname_raises_value_error():
    """An unrecognized libname is rejected with a ``ValueError``."""
    bogus_libname = "not_a_real_lib"
    # The message must name the rejected value and mention a supported name (cudart).
    expected_message = r"Unsupported library name: 'not_a_real_lib'.*cudart"
    with pytest.raises(ValueError, match=expected_message):
        load_nvidia_dynamic_lib(bogus_libname)
7175

7276

7377
IMPORTLIB_METADATA_DISTRIBUTIONS_NAMES = {

0 commit comments

Comments
 (0)