Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ class _BitcodeLibInfo(TypedDict):
"site_packages_dirs": ("nvidia/nvshmem/lib",),
"available_on_windows": False,
},
"nccl": {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please make this nccl_device to match the existing nvshmem_device convention, and move the new entry before nvshmem_device to keep this list in lexicographic order?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still see nccl, could you please make it nccl_device for internal consistency?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In list DESCRIPTOR_CATALOG of dynamic libs, the name of nvshmem is "nvshmem_host" and the name of nccl is "nccl", which reflect the .so filenames libnvshmem_host.so.3 and libnccl.so.2. nccl as a key in _SUPPORTED_BITCODE_LIBS_INFO doesn't match nvshmem, but it's consistent with nccl's name in dynamic lib list DESCRIPTOR_CATALOG. I'd like to keep the key simple for nccl, so I don't have to think about it should end with _device or _host when I call these pathfinder APIs.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so I don't have to think

It's gonna be surprising/confusing for anyone else, especially because the filename is libnccl_device.bc. Unless there is more than a personal preference (?), nccl_device is clearly the more consistent name here.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

matching filename libnccl_device.bc is a good point.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

"filename": "libnccl_device.bc",
"rel_path": "lib",
"site_packages_dirs": ("nvidia/nccl/lib",),
"available_on_windows": False,
},
}

# Public API: just the supported library names
Expand Down
52 changes: 37 additions & 15 deletions cuda_pathfinder/tests/test_find_bitcode_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,13 @@
STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS", "see_what_works")
assert STRICTNESS in ("see_what_works", "all_must_work")

BCL_FILENAME = find_bitcode_lib_module._SUPPORTED_BITCODE_LIBS_INFO["device"]["filename"]

def _bitcode_lib_info(libname: str):
return find_bitcode_lib_module._SUPPORTED_BITCODE_LIBS_INFO[libname]


def _bitcode_lib_filename(libname: str) -> str:
return _bitcode_lib_info(libname)["filename"]


@pytest.fixture
Expand All @@ -30,15 +36,20 @@ def clear_find_bitcode_lib_cache():
get_cuda_path_or_home.cache_clear()


def _make_bitcode_lib_file(dir_path: Path) -> str:
def _make_bitcode_lib_file(dir_path: Path, libname: str = "device") -> str:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's safer and more readable to avoid the defaults here and below. I tried out the change below. Could you please apply?

@@ -36,14 +36,14 @@ def clear_find_bitcode_lib_cache():
     get_cuda_path_or_home.cache_clear()


-def _make_bitcode_lib_file(dir_path: Path, libname: str = "device") -> str:
+def _make_bitcode_lib_file(dir_path: Path, libname: str) -> str:
     dir_path.mkdir(parents=True, exist_ok=True)
     file_path = dir_path / _bitcode_lib_filename(libname)
     file_path.touch()
     return str(file_path)


-def _bitcode_lib_dir_under(anchor_dir: Path, libname: str = "device") -> Path:
+def _bitcode_lib_dir_under(anchor_dir: Path, libname: str) -> Path:
     return anchor_dir / _bitcode_lib_info(libname)["rel_path"]


@@ -138,7 +138,7 @@ def test_locate_bitcode_lib_search_order(monkeypatch, tmp_path, libname):
 @pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
 def test_find_bitcode_lib_not_found_error_includes_cuda_home_directory_listing(monkeypatch, tmp_path):
     cuda_home = tmp_path / "cuda-home"
-    lib_dir = _bitcode_lib_dir_under(cuda_home)
+    lib_dir = _bitcode_lib_dir_under(cuda_home, "device")
     lib_dir.mkdir(parents=True, exist_ok=True)
     extra_file = lib_dir / "README.txt"
     extra_file.write_text("placeholder", encoding="utf-8")

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

applied.

dir_path.mkdir(parents=True, exist_ok=True)
file_path = dir_path / BCL_FILENAME
file_path = dir_path / _bitcode_lib_filename(libname)
file_path.touch()
return str(file_path)


def _bitcode_lib_dir_under(anchor_dir: Path) -> Path:
return anchor_dir / "nvvm" / "libdevice"
def _bitcode_lib_dir_under(anchor_dir: Path, libname: str = "device") -> Path:
return anchor_dir / _bitcode_lib_info(libname)["rel_path"]


def _site_packages_bitcode_lib_dir_under(anchor_dir: Path, libname: str) -> Path:
rel_dir = _bitcode_lib_info(libname)["site_packages_dirs"][0]
return anchor_dir.joinpath(*rel_dir.split("/"))


def _conda_anchor(conda_prefix: Path) -> Path:
Expand Down Expand Up @@ -79,36 +90,47 @@ def test_locate_bitcode_lib(info_summary_append, libname):


@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
def test_locate_bitcode_lib_search_order(monkeypatch, tmp_path):
site_packages_lib_dir = tmp_path / "site-packages" / "nvidia" / "cu13" / "nvvm" / "libdevice"
site_packages_path = _make_bitcode_lib_file(site_packages_lib_dir)
@pytest.mark.parametrize("libname", SUPPORTED_BITCODE_LIBS)
def test_locate_bitcode_lib_search_order(monkeypatch, tmp_path, libname):
site_packages_lib_dir = _site_packages_bitcode_lib_dir_under(tmp_path / "site-packages", libname)
site_packages_path = _make_bitcode_lib_file(site_packages_lib_dir, libname)

conda_prefix = tmp_path / "conda-prefix"
conda_path = _make_bitcode_lib_file(_bitcode_lib_dir_under(_conda_anchor(conda_prefix)))
conda_path = _make_bitcode_lib_file(_bitcode_lib_dir_under(_conda_anchor(conda_prefix), libname), libname)

cuda_home = tmp_path / "cuda-home"
cuda_home_path = _make_bitcode_lib_file(_bitcode_lib_dir_under(cuda_home))
cuda_home_path = _make_bitcode_lib_file(_bitcode_lib_dir_under(cuda_home, libname), libname)

site_packages_sub_dirs = tuple(
tuple(rel_dir.split("/")) for rel_dir in _bitcode_lib_info(libname)["site_packages_dirs"]
)

def find_expected_sub_dir(sub_dir):
assert sub_dir in site_packages_sub_dirs
if sub_dir == site_packages_sub_dirs[0]:
return [str(site_packages_lib_dir)]
return []

monkeypatch.setattr(
find_bitcode_lib_module,
"find_sub_dirs_all_sitepackages",
lambda _sub_dir: [str(site_packages_lib_dir)],
find_expected_sub_dir,
)
monkeypatch.setenv("CONDA_PREFIX", str(conda_prefix))
monkeypatch.setenv("CUDA_HOME", str(cuda_home))
monkeypatch.delenv("CUDA_PATH", raising=False)

located_lib = locate_bitcode_lib("device")
located_lib = locate_bitcode_lib(libname)
assert located_lib.abs_path == site_packages_path
assert located_lib.found_via == "site-packages"
os.remove(site_packages_path)

located_lib = locate_bitcode_lib("device")
located_lib = locate_bitcode_lib(libname)
assert located_lib.abs_path == conda_path
assert located_lib.found_via == "conda"
os.remove(conda_path)

located_lib = locate_bitcode_lib("device")
located_lib = locate_bitcode_lib(libname)
assert located_lib.abs_path == cuda_home_path
assert located_lib.found_via == "CUDA_PATH"

Expand All @@ -134,7 +156,7 @@ def test_find_bitcode_lib_not_found_error_includes_cuda_home_directory_listing(m
find_bitcode_lib("device")

message = str(exc_info.value)
expected_missing_file = os.path.join(str(lib_dir), BCL_FILENAME)
expected_missing_file = os.path.join(str(lib_dir), _bitcode_lib_filename("device"))
assert f"No such file: {expected_missing_file}" in message
assert f'listdir("{lib_dir}"):' in message
assert "README.txt" in message
Expand Down
Loading