Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
# Any production-code impact is negligible since the extra logic only runs
# in the subprocess entrypoint and only in test mode.

# Name of the environment variable consulted by _cupti_diagnostics_enabled().
# Extra diagnostics are emitted to stderr only for the "cupti" library and only
# when this variable is set to something other than "", "0", "false" or "no".
_CUPTI_DIAGNOSTICS_ENVVAR = "CUDA_PATHFINDER_WINDOWS_CUPTI_ALREADY_LOADED_DIAGNOSTICS"


def _probe_canary_abs_path(libname: str) -> str | None:
desc = LIB_DESCRIPTORS.get(libname)
Expand All @@ -48,6 +50,26 @@ def _validate_abs_path(abs_path: str) -> None:
assert os.path.isfile(abs_path), f"not a file: {abs_path=!r}"


def _cupti_diagnostics_enabled(libname: str) -> bool:
    """Report whether CUPTI diagnostics should be emitted for ``libname``.

    Diagnostics are on only when ``libname`` is ``"cupti"`` and the
    environment variable named by ``_CUPTI_DIAGNOSTICS_ENVVAR`` is set to a
    value other than an empty/blank string, ``0``, ``false`` or ``no``
    (compared case-insensitively after stripping surrounding whitespace).
    """
    setting = os.environ.get(_CUPTI_DIAGNOSTICS_ENVVAR)
    if libname != "cupti":
        return False
    if setting is None:
        return False
    disabling_values = ("", "0", "false", "no")
    normalized = setting.strip().lower()
    return normalized not in disabling_values


def _emit_cupti_diagnostic(message: str) -> None:
    """Print ``message`` to stderr, tagged with the CUPTI diagnostics prefix."""
    tagged_line = f"[cuda.pathfinder][cupti-diag] {message}"
    print(tagged_line, file=sys.stderr)


def _emit_loaded_dl_diagnostic(label: str, loaded_dl: LoadedDL) -> None:
    """Emit a single diagnostic line summarizing ``loaded_dl``.

    The line starts with ``label`` and then lists the object's ``abs_path``,
    ``found_via``, ``was_already_loaded_from_elsewhere`` and its handle
    (``_handle_uint``, rendered in hexadecimal).
    """
    path_part = f"{label}: abs_path={loaded_dl.abs_path!r}"
    origin_part = f" found_via={loaded_dl.found_via!r}"
    flag_part = f" was_already_loaded_from_elsewhere={loaded_dl.was_already_loaded_from_elsewhere}"
    handle_part = f" handle=0x{loaded_dl._handle_uint:x}"
    _emit_cupti_diagnostic(path_part + origin_part + flag_part + handle_part)


def _load_nvidia_dynamic_lib_for_test(libname: str) -> str:
"""Test-only loader used by the subprocess entrypoint."""
# Keep imports inside the subprocess body so startup stays focused on the
Expand All @@ -60,7 +82,10 @@ def _load_nvidia_dynamic_lib_for_test(libname: str) -> str:
)
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS

diagnostics_enabled = _cupti_diagnostics_enabled(libname)
loaded_dl_fresh = load_nvidia_dynamic_lib(libname)
if diagnostics_enabled:
_emit_loaded_dl_diagnostic("fresh load", loaded_dl_fresh)
if loaded_dl_fresh.was_already_loaded_from_elsewhere:
raise RuntimeError("loaded_dl_fresh.was_already_loaded_from_elsewhere")

Expand All @@ -75,6 +100,8 @@ def _load_nvidia_dynamic_lib_for_test(libname: str) -> str:
raise RuntimeError("loaded_dl_from_cache is not loaded_dl_fresh")

loaded_dl_no_cache = _load_lib_no_cache(libname)
if diagnostics_enabled:
_emit_loaded_dl_diagnostic("second uncached load", loaded_dl_no_cache)
supported_libs = SUPPORTED_WINDOWS_DLLS if IS_WINDOWS else SUPPORTED_LINUX_SONAMES
if not loaded_dl_no_cache.was_already_loaded_from_elsewhere and libname in supported_libs:
raise RuntimeError("not loaded_dl_no_cache.was_already_loaded_from_elsewhere")
Expand Down
106 changes: 106 additions & 0 deletions cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import ctypes.wintypes
import os
import struct
import sys
from collections.abc import Iterator
from typing import TYPE_CHECKING

from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL
Expand All @@ -22,11 +24,16 @@

# Set up kernel32 and psapi functions with proper types. Declaring
# argtypes/restype explicitly keeps ctypes from falling back to its default
# int-based conversions for these calls.
kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined]
psapi = ctypes.windll.psapi # type: ignore[attr-defined]

# GetModuleHandleW: takes a wide-character module name, returns an HMODULE.
kernel32.GetModuleHandleW.argtypes = [ctypes.wintypes.LPCWSTR]
kernel32.GetModuleHandleW.restype = ctypes.wintypes.HMODULE

# GetCurrentProcess: takes no arguments, returns a HANDLE. The result is
# passed to EnumProcessModules by _iter_loaded_module_handles().
kernel32.GetCurrentProcess.argtypes = []
kernel32.GetCurrentProcess.restype = ctypes.wintypes.HANDLE

# LoadLibraryExW
kernel32.LoadLibraryExW.argtypes = [
ctypes.wintypes.LPCWSTR, # lpLibFileName
Expand All @@ -47,6 +54,28 @@
kernel32.AddDllDirectory.argtypes = [ctypes.wintypes.LPCWSTR]
kernel32.AddDllDirectory.restype = ctypes.c_void_p # DLL_DIRECTORY_COOKIE

# EnumProcessModules: used by _iter_loaded_module_handles() to list the
# module handles of the current process.
psapi.EnumProcessModules.argtypes = [
ctypes.wintypes.HANDLE, # hProcess
ctypes.POINTER(ctypes.wintypes.HMODULE), # lphModule (output array of handles)
ctypes.wintypes.DWORD, # cb (size of the output array, in bytes)
ctypes.POINTER(ctypes.wintypes.DWORD), # lpcbNeeded (bytes required for all handles)
]
psapi.EnumProcessModules.restype = ctypes.wintypes.BOOL

# Name of the environment variable consulted by _cupti_diagnostics_enabled().
# Diagnostics are written to stderr only for the "cupti" descriptor and only
# when this variable is set to something other than "", "0", "false" or "no".
_CUPTI_DIAGNOSTICS_ENVVAR = "CUDA_PATHFINDER_WINDOWS_CUPTI_ALREADY_LOADED_DIAGNOSTICS"


def _cupti_diagnostics_enabled(desc_name: str) -> bool:
    """Tell whether CUPTI diagnostics are switched on for ``desc_name``.

    Returns ``True`` only when ``desc_name`` is ``"cupti"`` and the
    environment variable named by ``_CUPTI_DIAGNOSTICS_ENVVAR`` holds a value
    that, once stripped and lower-cased, is not one of ``""``, ``"0"``,
    ``"false"`` or ``"no"``.
    """
    env_value = os.environ.get(_CUPTI_DIAGNOSTICS_ENVVAR)
    is_cupti = desc_name == "cupti"
    if not is_cupti or env_value is None:
        return False
    switched_off = env_value.strip().lower() in ("", "0", "false", "no")
    return not switched_off


def _emit_cupti_diagnostic(message: str) -> None:
    """Write ``message`` to stderr as one line with the CUPTI diagnostics prefix."""
    tagged_line = f"[cuda.pathfinder][cupti-diag] {message}\n"
    sys.stderr.write(tagged_line)


def ctypes_handle_to_unsigned_int(handle: ctypes.wintypes.HMODULE) -> int:
"""Convert ctypes HMODULE to unsigned int."""
Expand Down Expand Up @@ -101,17 +130,94 @@ def abs_path_for_dynamic_library(libname: str, handle: ctypes.wintypes.HMODULE)
return buffer.value


def _iter_loaded_module_handles() -> Iterator[ctypes.wintypes.HMODULE]:
    """Yield a handle for each module EnumProcessModules reports for this process.

    The handle buffer starts with room for 64 entries. When the call reports
    that more entries are needed, the buffer is re-allocated at the reported
    size and the call is repeated. Entries that read back as ``None`` are
    skipped.

    Raises:
        RuntimeError: if ``EnumProcessModules`` reports failure.

    NOTE(review): handles are yielded wrapped in ``HMODULE`` objects, while the
    unit tests feed plain ints through this code path -- confirm that the
    downstream helpers (e.g. ``ctypes_handle_to_unsigned_int``) accept both.
    """
    current_process = kernel32.GetCurrentProcess()
    handle_size = ctypes.sizeof(ctypes.wintypes.HMODULE)
    slots = 64
    while True:
        buffer = (ctypes.wintypes.HMODULE * slots)()
        bytes_needed = ctypes.wintypes.DWORD()
        succeeded = psapi.EnumProcessModules(
            current_process,
            buffer,
            ctypes.sizeof(buffer),
            ctypes.byref(bytes_needed),
        )
        if not succeeded:
            error_code = ctypes.GetLastError()  # type: ignore[attr-defined]
            raise RuntimeError(f"EnumProcessModules failed (error code: {error_code})")
        reported = bytes_needed.value // handle_size
        if reported > slots:
            # Buffer was too small for the full list: grow it and ask again.
            slots = reported
            continue
        break
    for raw in buffer[:reported]:
        if raw is not None:
            yield ctypes.wintypes.HMODULE(int(raw))


def _find_loaded_module(
    dll_names: tuple[str, ...],
    *,
    diagnostics_enabled: bool = False,
) -> tuple[ctypes.wintypes.HMODULE, str] | None:
    """Find the first enumerated module whose file name is one of ``dll_names``.

    Module file names are compared case-insensitively (via ``casefold``)
    against the basenames in ``dll_names``.

    Args:
        dll_names: Candidate DLL basenames to look for.
        diagnostics_enabled: When true, emit diagnostic lines listing the
            modules seen so far whose basename contains ``cupti`` or
            ``nvperf``, plus details of the match (if any).

    Returns:
        ``(handle, abs_path)`` for the first match, or ``None`` if no
        enumerated module matches.
    """
    targets = {name.casefold() for name in dll_names}
    seen_relevant: list[str] = []

    def _report_relevant() -> None:
        listing = " | ".join(seen_relevant) if seen_relevant else "<none>"
        _emit_cupti_diagnostic("enumerated relevant modules: " + listing)

    for module_handle in _iter_loaded_module_handles():
        module_path = abs_path_for_dynamic_library("loaded module", module_handle)
        file_name = os.path.basename(module_path)
        folded_name = file_name.casefold()
        if diagnostics_enabled and ("cupti" in folded_name or "nvperf" in folded_name):
            seen_relevant.append(f"0x{ctypes_handle_to_unsigned_int(module_handle):x}:{module_path}")
        if folded_name not in targets:
            continue
        if diagnostics_enabled:
            _report_relevant()
            _emit_cupti_diagnostic(
                f"enumeration match: basename={file_name!r} abs_path={module_path!r}"
                f" handle=0x{ctypes_handle_to_unsigned_int(module_handle):x}"
            )
        return module_handle, module_path

    if diagnostics_enabled:
        _report_relevant()
    return None


def check_if_already_loaded_from_elsewhere(desc: LibDescriptor, have_abs_path: bool) -> LoadedDL | None:
    """Return a ``LoadedDL`` if one of ``desc.windows_dlls`` is already loaded.

    Each DLL name is first probed with ``GetModuleHandleW``. If none of the
    probes succeeds, the loaded modules of the process are enumerated and
    matched by file name instead.

    Args:
        desc: Descriptor of the library; ``name``, ``windows_dlls`` and
            ``requires_add_dll_directory`` are read.
        have_abs_path: When true and ``desc.requires_add_dll_directory`` is
            set, ``add_dll_directory`` is called with the path of the module
            that was found.

    Returns:
        A ``LoadedDL`` marked ``"was-already-loaded-from-elsewhere"``, or
        ``None`` when no matching module is found.
    """
    diagnostics_enabled = _cupti_diagnostics_enabled(desc.name)
    probe_log: list[str] = []

    def _report_probes() -> None:
        _emit_cupti_diagnostic("basename GetModuleHandleW results: " + ", ".join(probe_log))

    def _as_already_loaded(found_handle, found_path: str) -> LoadedDL:
        if have_abs_path and desc.requires_add_dll_directory:
            # This is a side-effect if the pathfinder loads the library via
            # load_with_abs_path(). To make the side-effect more deterministic,
            # activate it even if the library was already loaded from elsewhere.
            add_dll_directory(found_path)
        handle_uint = ctypes_handle_to_unsigned_int(found_handle)
        return LoadedDL(found_path, True, handle_uint, "was-already-loaded-from-elsewhere")

    for dll_name in desc.windows_dlls:
        handle = kernel32.GetModuleHandleW(dll_name)
        if diagnostics_enabled:
            handle_text = "0x0" if not handle else f"0x{ctypes_handle_to_unsigned_int(handle):x}"
            probe_log.append(f"{dll_name}={handle_text}")
        if not handle:
            continue
        abs_path = abs_path_for_dynamic_library(desc.name, handle)
        if diagnostics_enabled:
            _report_probes()
            _emit_cupti_diagnostic(
                f"basename match: dll_name={dll_name!r} abs_path={abs_path!r}"
                f" handle=0x{ctypes_handle_to_unsigned_int(handle):x}"
            )
        return _as_already_loaded(handle, abs_path)

    # Observed on newer Windows CUPTI builds: GetModuleHandleW(basename)
    # can miss an already loaded DLL, so fall back to enumerating loaded modules.
    if diagnostics_enabled:
        _report_probes()
    enumerated = _find_loaded_module(desc.windows_dlls, diagnostics_enabled=diagnostics_enabled)
    if enumerated is None:
        return None
    enumerated_handle, enumerated_path = enumerated
    return _as_already_loaded(enumerated_handle, enumerated_path)


Expand Down
56 changes: 56 additions & 0 deletions cuda_pathfinder/tests/test_load_dl_windows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import sys

import pytest

if sys.platform != "win32":
pytest.skip("Windows-only tests", allow_module_level=True)

from cuda.pathfinder._dynamic_libs import load_dl_windows
from cuda.pathfinder._dynamic_libs.lib_descriptor import LIB_DESCRIPTORS


def test_check_if_already_loaded_falls_back_to_enumerated_modules(tmp_path, mocker):
    """With every GetModuleHandleW probe returning 0, the enumerated match is used."""
    cupti_desc = LIB_DESCRIPTORS["cupti"]
    cupti_path = str(tmp_path / cupti_desc.windows_dlls[0])
    unrelated_handle, cupti_handle = 0x111, 0x222

    # Basename probing reports "not loaded" for every candidate DLL name.
    mocker.patch.object(load_dl_windows.kernel32, "GetModuleHandleW", return_value=0)
    # Enumeration sees an unrelated module first, then the CUPTI DLL.
    mocker.patch.object(
        load_dl_windows,
        "_iter_loaded_module_handles",
        return_value=iter((unrelated_handle, cupti_handle)),
    )
    mocker.patch.object(
        load_dl_windows,
        "abs_path_for_dynamic_library",
        side_effect=[r"C:\Windows\System32\kernel32.dll", cupti_path],
    )
    add_dll_directory_mock = mocker.patch.object(load_dl_windows, "add_dll_directory")

    loaded = load_dl_windows.check_if_already_loaded_from_elsewhere(cupti_desc, have_abs_path=False)

    assert loaded is not None
    assert loaded.abs_path == cupti_path
    assert loaded.was_already_loaded_from_elsewhere is True
    assert loaded.found_via == "was-already-loaded-from-elsewhere"
    assert loaded._handle_uint == cupti_handle
    add_dll_directory_mock.assert_not_called()


def test_check_if_already_loaded_fallback_preserves_add_dll_directory_side_effect(tmp_path, mocker):
    """The enumeration fallback calls add_dll_directory when have_abs_path is true."""
    nvrtc_desc = LIB_DESCRIPTORS["nvrtc"]
    nvrtc_path = str(tmp_path / nvrtc_desc.windows_dlls[0])
    enumerated_handles = iter((0x333,))

    # Basename probing finds nothing, so only the enumeration path can match.
    mocker.patch.object(load_dl_windows.kernel32, "GetModuleHandleW", return_value=0)
    mocker.patch.object(load_dl_windows, "_iter_loaded_module_handles", return_value=enumerated_handles)
    mocker.patch.object(load_dl_windows, "abs_path_for_dynamic_library", return_value=nvrtc_path)
    add_dll_directory_mock = mocker.patch.object(load_dl_windows, "add_dll_directory")

    loaded = load_dl_windows.check_if_already_loaded_from_elsewhere(nvrtc_desc, have_abs_path=True)

    assert loaded is not None
    assert loaded.abs_path == nvrtc_path
    assert loaded.was_already_loaded_from_elsewhere is True
    add_dll_directory_mock.assert_called_once_with(nvrtc_path)