Skip to content

Commit ef9253b

Browse files
authored
fix(pathfinder): use CTK canary fallback for header discovery (#1731)
* fix(pathfinder): use CTK canary fallback for header discovery. Reuse the CTK-root canary probe for CTK header lookup when site-packages, conda, and CUDA_HOME/CUDA_PATH are all unavailable, avoiding hard-coded default install paths. Add tests for fallback success, search-order precedence, and non-fatal canary-miss behavior. Made-with: Cursor
* fix(pathfinder): surface canary probe errors in header lookup. Avoid masking canary subprocess failures during CTK header discovery, so that probe bugs are visible. Update the header-discovery tests so that only a None canary result is non-fatal, while runtime probe errors are asserted to propagate. Made-with: Cursor
1 parent cfbda9f commit ef9253b

File tree

2 files changed

+161
-1
lines changed

2 files changed

+161
-1
lines changed

cuda_pathfinder/cuda/pathfinder/_headers/find_nvidia_headers.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
import os
99
from dataclasses import dataclass
1010

11+
from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import (
12+
_resolve_system_loaded_abs_path_in_subprocess,
13+
)
14+
from cuda.pathfinder._dynamic_libs.search_steps import derive_ctk_root
1115
from cuda.pathfinder._headers import supported_nvidia_headers
1216
from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
1317
from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
@@ -91,6 +95,23 @@ def _find_based_on_conda_layout(libname: str, h_basename: str, ctk_layout: bool)
9195
return None
9296

9397

98+
def _find_ctk_header_directory_via_canary(libname: str, h_basename: str) -> str | None:
    """Look for a CTK header directory under the CTK root found by the canary probe.

    The probe is the same one used for dynamic-library CTK-root discovery:
    ``cudart`` is system-loaded in a spawned child process, the CTK root is
    derived from the resolved absolute library path, and the expected CTK
    include layout is then searched under that root.

    Returns ``None`` when the probe resolves no path or when no CTK root can
    be derived from it; otherwise returns whatever the CTK-layout lookup
    yields for ``libname`` / ``h_basename``. Exceptions raised by the probe
    are not caught here.
    """
    # Guard clause 1: the canary library could not be resolved.
    if (cudart_abs_path := _resolve_system_loaded_abs_path_in_subprocess("cudart")) is None:
        return None
    # Guard clause 2: the resolved path does not map to a CTK root.
    if (root_dir := derive_ctk_root(cudart_abs_path)) is None:
        return None
    return _locate_based_on_ctk_layout(libname, h_basename, root_dir)
113+
114+
94115
def _find_ctk_header_directory(libname: str) -> LocatedHeaderDir | None:
95116
h_basename = supported_nvidia_headers.SUPPORTED_HEADERS_CTK[libname]
96117
candidate_dirs = supported_nvidia_headers.SUPPORTED_SITE_PACKAGE_HEADER_DIRS_CTK[libname]
@@ -106,6 +127,9 @@ def _find_ctk_header_directory(libname: str) -> LocatedHeaderDir | None:
106127
if cuda_home and (result := _locate_based_on_ctk_layout(libname, h_basename, cuda_home)):
107128
return LocatedHeaderDir(abs_path=result, found_via="CUDA_HOME")
108129

130+
if result := _find_ctk_header_directory_via_canary(libname, h_basename):
131+
return LocatedHeaderDir(abs_path=result, found_via="system-ctk-root")
132+
109133
return None
110134

111135

@@ -139,6 +163,12 @@ def locate_nvidia_header_directory(libname: str) -> LocatedHeaderDir | None:
139163
3. **CUDA Toolkit environment variables**
140164
141165
- Use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order).
166+
167+
4. **CTK root canary probe**
168+
169+
- Probe a system-loaded ``cudart`` in a spawned child process,
170+
derive the CTK root from the resolved library path, then search
171+
CTK include layout under that root.
142172
"""
143173

144174
if libname in supported_nvidia_headers.SUPPORTED_HEADERS_CTK:
@@ -195,6 +225,12 @@ def find_nvidia_header_directory(libname: str) -> str | None:
195225
3. **CUDA Toolkit environment variables**
196226
197227
- Use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order).
228+
229+
4. **CTK root canary probe**
230+
231+
- Probe a system-loaded ``cudart`` in a spawned child process,
232+
derive the CTK root from the resolved library path, then search
233+
CTK include layout under that root.
198234
"""
199235
found = locate_nvidia_header_directory(libname)
200236
return found.abs_path if found else None

cuda_pathfinder/tests/test_find_nvidia_headers.py

Lines changed: 125 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,15 @@
1616
import importlib.metadata
1717
import os
1818
import re
19+
from pathlib import Path
1920

2021
import pytest
2122

23+
import cuda.pathfinder._headers.find_nvidia_headers as find_nvidia_headers_module
2224
from cuda.pathfinder import LocatedHeaderDir, find_nvidia_header_directory, locate_nvidia_header_directory
25+
from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import (
26+
_resolve_system_loaded_abs_path_in_subprocess,
27+
)
2328
from cuda.pathfinder._headers.supported_nvidia_headers import (
2429
SUPPORTED_HEADERS_CTK,
2530
SUPPORTED_HEADERS_CTK_ALL,
@@ -28,6 +33,7 @@
2833
SUPPORTED_INSTALL_DIRS_NON_CTK,
2934
SUPPORTED_SITE_PACKAGE_HEADER_DIRS_CTK,
3035
)
36+
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
3137

3238
STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS", "see_what_works")
3339
assert STRICTNESS in ("see_what_works", "all_must_work")
@@ -46,7 +52,13 @@ def test_unknown_libname():
4652

4753
def _located_hdr_dir_asserts(located_hdr_dir):
    """Assert that the argument is a ``LocatedHeaderDir`` with a recognized ``found_via`` tag."""
    # Every discovery mechanism the tests accept.
    recognized_sources = (
        "site-packages",
        "conda",
        "CUDA_HOME",
        "system-ctk-root",
        "supported_install_dir",
    )
    assert isinstance(located_hdr_dir, LocatedHeaderDir)
    assert located_hdr_dir.found_via in recognized_sources
5062

5163

5264
def test_non_ctk_importlib_metadata_distributions_names():
@@ -62,6 +74,36 @@ def have_distribution_for(libname: str) -> bool:
6274
)
6375

6476

77+
@pytest.fixture
def clear_locate_nvidia_header_cache():
    """Clear the header-lookup and canary-probe caches before and after a test."""
    cached_callables = (
        locate_nvidia_header_directory,
        _resolve_system_loaded_abs_path_in_subprocess,
    )
    # Setup: start the test with empty caches.
    for cached in cached_callables:
        cached.cache_clear()
    yield
    # Teardown: drop anything the test cached so later tests are unaffected.
    for cached in cached_callables:
        cached.cache_clear()
84+
85+
86+
def _create_ctk_header(ctk_root: Path, libname: str) -> str:
    """Create an empty stand-in CTK header for ``libname`` under ``ctk_root``.

    Returns the header directory (as a string) in which the header basename
    from ``SUPPORTED_HEADERS_CTK`` was created.
    """
    basename = SUPPORTED_HEADERS_CTK[libname]
    # Libraries whose headers do not live directly in ``<root>/include``.
    special_layouts = {
        "nvvm": ("nvvm", "include"),
        "cccl": ("include", "cccl"),
    }
    include_dir = ctk_root.joinpath(*special_layouts.get(libname, ("include",)))
    header_file = include_dir / basename
    # Create the file's own parent directory, not just ``include_dir``.
    header_file.parent.mkdir(parents=True, exist_ok=True)
    header_file.touch()
    return str(include_dir)
99+
100+
101+
def _fake_cudart_canary_abs_path(ctk_root: Path) -> str:
    """Return a platform-appropriate fake ``cudart`` library path under ``ctk_root``."""
    lib_parts = ("bin", "x64", "cudart64_13.dll") if IS_WINDOWS else ("lib64", "libcudart.so.13")
    return str(ctk_root.joinpath(*lib_parts))
105+
106+
65107
@pytest.mark.parametrize("libname", SUPPORTED_HEADERS_NON_CTK.keys())
66108
def test_locate_non_ctk_headers(info_summary_append, libname):
67109
hdr_dir = find_nvidia_header_directory(libname)
@@ -110,3 +152,85 @@ def test_locate_ctk_headers(info_summary_append, libname):
110152
assert os.path.isfile(os.path.join(hdr_dir, h_filename))
111153
if STRICTNESS == "all_must_work":
112154
assert hdr_dir is not None
155+
156+
157+
@pytest.mark.usefixtures("clear_locate_nvidia_header_cache")
def test_locate_ctk_headers_uses_canary_fallback_when_cuda_home_unset(tmp_path, monkeypatch, mocker):
    """With no site-packages, conda, or CUDA_HOME/CUDA_PATH hit, the canary root is used."""
    system_root = tmp_path / "cuda-system"
    want_hdr_dir = _create_ctk_header(system_root, "cudart")

    # Disable every earlier search step so only the canary fallback can succeed.
    for env_name in ("CONDA_PREFIX", "CUDA_HOME", "CUDA_PATH"):
        monkeypatch.delenv(env_name, raising=False)
    mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[])
    canary_probe = mocker.patch.object(
        find_nvidia_headers_module,
        "_resolve_system_loaded_abs_path_in_subprocess",
        return_value=_fake_cudart_canary_abs_path(system_root),
    )

    found = locate_nvidia_header_directory("cudart")

    assert found is not None
    assert found.abs_path == want_hdr_dir
    assert found.found_via == "system-ctk-root"
    canary_probe.assert_called_once_with("cudart")
178+
179+
180+
@pytest.mark.usefixtures("clear_locate_nvidia_header_cache")
def test_locate_ctk_headers_cuda_home_takes_priority_over_canary(tmp_path, monkeypatch, mocker):
    """A CUDA_HOME hit wins over the canary root, and the canary probe is never invoked."""
    home_root = tmp_path / "cuda-home"
    want_hdr_dir = _create_ctk_header(home_root, "cudart")
    # A second, equally valid tree that only the canary probe would point at.
    system_root = tmp_path / "cuda-system"
    _create_ctk_header(system_root, "cudart")

    monkeypatch.delenv("CONDA_PREFIX", raising=False)
    monkeypatch.setenv("CUDA_HOME", str(home_root))
    monkeypatch.delenv("CUDA_PATH", raising=False)
    mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[])
    canary_probe = mocker.patch.object(
        find_nvidia_headers_module,
        "_resolve_system_loaded_abs_path_in_subprocess",
        return_value=_fake_cudart_canary_abs_path(system_root),
    )

    found = locate_nvidia_header_directory("cudart")

    assert found is not None
    assert found.abs_path == want_hdr_dir
    assert found.found_via == "CUDA_HOME"
    canary_probe.assert_not_called()
203+
204+
205+
@pytest.mark.usefixtures("clear_locate_nvidia_header_cache")
def test_locate_ctk_headers_canary_miss_paths_are_non_fatal(monkeypatch, mocker):
    """A canary probe that resolves nothing makes both lookup APIs return ``None``."""
    for env_name in ("CONDA_PREFIX", "CUDA_HOME", "CUDA_PATH"):
        monkeypatch.delenv(env_name, raising=False)
    mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[])
    # Simulate a canary miss: the probe reports no resolved library path.
    mocker.patch.object(
        find_nvidia_headers_module,
        "_resolve_system_loaded_abs_path_in_subprocess",
        return_value=None,
    )

    assert locate_nvidia_header_directory("cudart") is None
    assert find_nvidia_header_directory("cudart") is None
219+
220+
221+
@pytest.mark.usefixtures("clear_locate_nvidia_header_cache")
def test_locate_ctk_headers_canary_probe_errors_are_not_masked(monkeypatch, mocker):
    """A ``RuntimeError`` raised by the canary probe propagates out of both lookup APIs."""
    for env_name in ("CONDA_PREFIX", "CUDA_HOME", "CUDA_PATH"):
        monkeypatch.delenv(env_name, raising=False)
    mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[])
    mocker.patch.object(
        find_nvidia_headers_module,
        "_resolve_system_loaded_abs_path_in_subprocess",
        side_effect=RuntimeError("canary probe failed"),
    )

    # Same order as before: the locate API first, then the find API.
    for lookup in (locate_nvidia_header_directory, find_nvidia_header_directory):
        with pytest.raises(RuntimeError, match="canary probe failed"):
            lookup("cudart")

0 commit comments

Comments
 (0)