Skip to content

Commit 356049d

Browse files
rwgk and cursoragent committed
pathfinder: polish guard-rails diagnostics, comments, and lock-in tests
Low-severity polish on the v1 compatibility guard rails surface, plus two new tests so that the existing invariants are asserted by tests rather than only verified by reading the code.
- _owned_distribution_candidates: note that symlinks are intentionally not chased on either side of the path comparison.
- _missing_ctk_metadata_message now appends the conflicting CTK set when wheel metadata for the same on-disk file matches more than one cuda-toolkit distribution, instead of silently collapsing to "could not determine the CTK version".
- _compatible_pair_message picks distinct wording for the same-CTK vs cross-CTK independent-pair cases so the message is no longer misleading when both items share a CTK.
- _declare_dynamic_lib_pipeline gains a docstring explaining why it stays single-underscored in v1 (taxonomy/policy still evolving).
- Block comment near _STATIC_LIBS_PACKAGED_WITH / _BITCODE_LIBS_PACKAGED_WITH calls out the lockstep requirement with SUPPORTED_*_LIBS and points at the parametrized resolver tests that enforce coverage.
- load_nvidia_dynamic_lib augments any CompatibilityCheckError raised during _register_and_check with a sentence explaining that the underlying dlopen / LoadLibraryW already happened and the OS handle remains live. Mutates exc.args in place so subclass typing (DriverCtkCompatibilityError) and __cause__ are preserved.
- _try_process_wide_guard_rails_then_fallback documents why the forward-compat hint is appended only on Linux (cuda-compat-* is NVIDIA's Linux-only contract).
- New test_register_and_check_is_idempotent_for_repeated_items asserts duplicate ResolvedItem registrations collapse to one entry.
- New test_driver_ctk_compatibility_error_is_typed_catchable asserts a driver-too-old failure raises DriverCtkCompatibilityError as itself (not just by message), is still a CompatibilityCheckError, and carries the new "OS handle remains live" augmentation.
Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent fc963ff commit 356049d

3 files changed

Lines changed: 110 additions & 10 deletions

File tree

cuda_pathfinder/cuda/pathfinder/_compatibility_guard_rails.py

Lines changed: 69 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,14 @@ class DeclaredDynamicLibPipeline:
111111
artifact_kind: PipelineArtifactKind
112112

113113

114+
# NOTE: Any new entry added to ``SUPPORTED_STATIC_LIBS`` (e.g. ``culibos``)
115+
# or ``SUPPORTED_BITCODE_LIBS`` (e.g. a future ``device``-style bitcode lib)
116+
# must be registered in the dicts below in the same change. The packaging
117+
# classification is required by ``_resolve_static_lib_item`` and
118+
# ``_resolve_bitcode_lib_item``; missing entries raise ``KeyError`` at runtime
119+
# instead of producing a guarded ``CompatibilityInsufficientMetadataError``.
120+
# Coverage is enforced by the parametrized resolver tests in
121+
# ``tests/test_compatibility_guard_rails.py``.
114122
_STATIC_LIBS_PACKAGED_WITH: dict[str, PackagedWith] = {
115123
"cudadevrt": "ctk",
116124
}
@@ -235,6 +243,11 @@ def _distribution_name(dist: importlib.metadata.Distribution) -> str | None:
235243

236244
@functools.cache
237245
def _owned_distribution_candidates(abs_path: str) -> tuple[tuple[str, str], ...]:
246+
# Symlinks are intentionally not chased: ``os.path.realpath`` is omitted on
247+
# both sides of the comparison, so editable installs that route through a
248+
# wheel-cache symlink are not matched. This keeps the wheel-metadata path
249+
# tied to the path the search actually returned, not to a different on-disk
250+
# location that a symlink happens to resolve to.
238251
normalized_abs_path = os.path.normpath(os.path.abspath(abs_path))
239252
matches: set[tuple[str, str]] = set()
240253
for dist in importlib.metadata.distributions():
@@ -282,7 +295,7 @@ def _cuda_toolkit_requirement_maps() -> tuple[tuple[str, CtkVersion, dict[str, t
282295
return tuple(results)
283296

284297

285-
def _wheel_metadata_for_abs_path(abs_path: str) -> CtkMetadata | None:
298+
def _wheel_metadata_matches_for_abs_path(abs_path: str) -> dict[CtkVersion, str]:
286299
matched_versions: dict[CtkVersion, str] = {}
287300
for owner_name, owner_version in _owned_distribution_candidates(abs_path):
288301
try:
@@ -300,6 +313,11 @@ def _wheel_metadata_for_abs_path(abs_path: str) -> CtkMetadata | None:
300313
matched_versions[ctk_version] = (
301314
f"wheel metadata via {owner_name}=={owner_version} pinned by cuda-toolkit=={toolkit_dist_version}"
302315
)
316+
return matched_versions
317+
318+
319+
def _wheel_metadata_for_abs_path(abs_path: str) -> CtkMetadata | None:
320+
matched_versions = _wheel_metadata_matches_for_abs_path(abs_path)
303321
if len(matched_versions) != 1:
304322
return None
305323
[(ctk_version, source)] = matched_versions.items()
@@ -470,11 +488,19 @@ def _unsupported_packaging_message(
470488

471489

472490
def _missing_ctk_metadata_message(item: ResolvedItem) -> str:
473-
return (
491+
base = (
474492
"v1 compatibility checks require either an enclosing CUDA Toolkit root "
475493
"with cuda.h or wheel metadata that can be traced to an installed "
476494
f"cuda-toolkit distribution. Could not determine the CTK version for {item.describe()}."
477495
)
496+
matches = _wheel_metadata_matches_for_abs_path(item.abs_path)
497+
if len(matches) > 1:
498+
# Multiple cuda-toolkit distributions claim the same wheel-installed
499+
# file; surface them so users can disambiguate (typically by removing
500+
# one of the conflicting cuda-toolkit==X.Y wheels).
501+
rendered = ", ".join(f"CTK {ctk_version} ({source})" for ctk_version, source in sorted(matches.items()))
502+
base += f" Wheel metadata matched multiple incompatible CTK versions: {rendered}."
503+
return base
478504

479505

480506
def _ctk_constraint_failure_message(item: ResolvedItem, constraint: CtkVersionConstraint) -> str:
@@ -628,12 +654,18 @@ def _compatible_pair_message(
628654
assert item1.ctk_version is not None
629655
assert item2.ctk_version is not None
630656
if relation.kind == _PAIRWISE_ITEM_RELATION_NONE:
631-
return (
632-
f"{item1.describe()} resolves to CTK {item1.ctk_version}, "
633-
f"{item2.describe()} resolves to CTK {item2.ctk_version}, "
634-
"and v1 does not require exact CTK lockstep for this pair. "
635-
f"Separately, {driver_decision.detail}."
636-
)
657+
if item1.ctk_version == item2.ctk_version:
658+
shared_clause = (
659+
f"{item1.describe()} and {item2.describe()} both resolve to CTK {item1.ctk_version}, "
660+
"and v1 does not require any direct relation between them"
661+
)
662+
else:
663+
shared_clause = (
664+
f"{item1.describe()} resolves to CTK {item1.ctk_version}, "
665+
f"{item2.describe()} resolves to CTK {item2.ctk_version}, "
666+
"and v1 does not require exact CTK lockstep for this pair"
667+
)
668+
return f"{shared_clause}. Separately, {driver_decision.detail}."
637669
assert relation.reason is not None
638670
return (
639671
f"{item1.describe()} and {item2.describe()} both resolve to CTK {item1.ctk_version}. "
@@ -956,6 +988,14 @@ def _declare_dynamic_lib_pipeline(
956988
consumer_libname: str,
957989
artifact_kind: PipelineArtifactKind,
958990
) -> None:
991+
"""Register a producer/consumer pipeline so v1 can enforce its policy.
992+
993+
Intentionally single-underscored: the pipeline API stays private in v1
994+
because the artifact taxonomy and policy matrix are expected to evolve
995+
before they are promoted to a public surface. Internal callers (e.g.
996+
``cuda_bindings``' nvJitLink/nvrtc pairings) reach into this method
997+
directly via the ``CompatibilityGuardRails`` instance.
998+
"""
959999
if producer_libname not in LIB_DESCRIPTORS:
9601000
raise ValueError(f"Unknown dynamic library producer: {producer_libname!r}")
9611001
if consumer_libname not in LIB_DESCRIPTORS:
@@ -1002,9 +1042,28 @@ def _register_and_check(self, item: ResolvedItem) -> None:
10021042
self._remember(item)
10031043

10041044
def load_nvidia_dynamic_lib(self, libname: str) -> LoadedDL:
1005-
"""Load a CUDA dynamic library and reject v1-incompatible resolutions."""
1045+
"""Load a CUDA dynamic library and reject v1-incompatible resolutions.
1046+
1047+
The underlying OS-level load (``dlopen`` / ``LoadLibraryW``) runs before
1048+
the compatibility check, so it has already happened by the time we raise.
1049+
``_load_nvidia_dynamic_lib`` is ``functools.cache``d, so subsequent calls for
1050+
the same library short-circuit and never re-trigger the loader, even after this rejection.
1051+
"""
10061052
loaded = _load_nvidia_dynamic_lib(libname)
1007-
self._register_and_check(_resolve_dynamic_lib_item(libname, loaded))
1053+
try:
1054+
self._register_and_check(_resolve_dynamic_lib_item(libname, loaded))
1055+
except CompatibilityCheckError as exc:
1056+
# Surface the irreversibility so callers don't assume the rejection
1057+
# also unwound the underlying OS load. Mutate the same exception
1058+
# instance in place so subclass typing (e.g.
1059+
# DriverCtkCompatibilityError) and the original ``__cause__`` are
1060+
# preserved.
1061+
augmented = (
1062+
f"{exc} Note: the underlying dynamic-library load already happened, "
1063+
"and the resulting OS handle remains live for the rest of this process."
1064+
)
1065+
exc.args = (augmented, *exc.args[1:])
1066+
raise
10081067
return loaded
10091068

10101069
def locate_nvidia_header_directory(self, libname: str) -> LocatedHeaderDir | None:

cuda_pathfinder/cuda/pathfinder/_process_wide_compatibility_guard_rails.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,12 @@ def _try_process_wide_guard_rails_then_fallback(guard_rails_call: Callable[[], _
181181
except DriverCtkCompatibilityError as exc:
182182
if driver_compatibility_mode == "assume_forward_compatibility":
183183
return raw_call()
184+
# The forward-compat hint is appended only on Linux because the
185+
# underlying ``cuda-compat-*`` packages (and the
186+
# ``CUDA_PATHFINDER_DRIVER_COMPATIBILITY=assume_forward_compatibility``
187+
# override they justify) are NVIDIA's Linux-only forward-compat
188+
# contract; there is no equivalent on Windows / macOS, so suggesting
189+
# the override on those platforms would be misleading.
184190
if sys.platform.startswith("linux"):
185191
raise DriverCtkCompatibilityError(_with_driver_compatibility_hint(str(exc))) from exc
186192
raise

cuda_pathfinder/tests/test_compatibility_guard_rails.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
CompatibilityCheckError,
2626
CompatibilityGuardRails,
2727
CompatibilityInsufficientMetadataError,
28+
DriverCtkCompatibilityError,
2829
LocatedHeaderDir,
2930
)
3031
from cuda.pathfinder._binaries.supported_nvidia_binaries import SUPPORTED_BINARIES_ALL
@@ -625,3 +626,37 @@ def test_find_nvidia_header_directory_returns_none_when_unresolved(monkeypatch):
625626
guard_rails = CompatibilityGuardRails(driver_cuda_version=_driver_cuda_version(13000))
626627

627628
assert guard_rails.find_nvidia_header_directory("nvrtc") is None
629+
630+
631+
def test_register_and_check_is_idempotent_for_repeated_items(monkeypatch, tmp_path):
632+
lib_path = _touch_ctk_file(tmp_path / "cuda-12.9", "12.9.20250531", "targets/x86_64-linux/lib/libnvrtc.so.12")
633+
634+
monkeypatch.setattr(compatibility_module, "_load_nvidia_dynamic_lib", lambda _libname: _loaded_dl(lib_path))
635+
636+
guard_rails = CompatibilityGuardRails(driver_cuda_version=_driver_cuda_version(13000))
637+
638+
item = compatibility_module._resolve_dynamic_lib_item("nvrtc", _loaded_dl(lib_path))
639+
640+
guard_rails._register_and_check(item)
641+
guard_rails._register_and_check(item)
642+
guard_rails._register_and_check(item)
643+
644+
matching = [resolved for resolved in guard_rails._resolved_items if resolved == item]
645+
assert len(matching) == 1
646+
647+
648+
def test_driver_ctk_compatibility_error_is_typed_catchable(monkeypatch, tmp_path):
649+
lib_path = _touch_ctk_file(tmp_path / "cuda-12.9", "12.9.20250531", "targets/x86_64-linux/lib/libnvrtc.so.12")
650+
651+
monkeypatch.setattr(compatibility_module, "_load_nvidia_dynamic_lib", lambda _libname: _loaded_dl(lib_path))
652+
653+
guard_rails = CompatibilityGuardRails(
654+
driver_cuda_version=_driver_cuda_version(12000),
655+
driver_release_version=_driver_release_version("520.30.01"),
656+
)
657+
658+
with pytest.raises(DriverCtkCompatibilityError) as exc_info:
659+
guard_rails.load_nvidia_dynamic_lib("nvrtc")
660+
661+
assert isinstance(exc_info.value, CompatibilityCheckError)
662+
assert "OS handle remains live" in str(exc_info.value)

0 commit comments

Comments
 (0)