-
Notifications
You must be signed in to change notification settings - Fork 276
[FEA]: Add support for pathfinder.find_bitcode_lib("nccl_device") #1975
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+44
−16
Merged
Changes from 1 commit
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
b559dcf
Add NCCL bitcode library lookup
xiakun-lu 552c3c6
Remove default values for libname in _make_bitcode_lib_file and _bitc…
xiakun-lu 8ba018c
Keep entries of _SUPPORTED_BITCODE_LIBS_INFO in lexicographic order
xiakun-lu d42f5c3
Rename NCCL bitcode key to nccl_device
xiakun-lu File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,7 +18,13 @@ | |
| STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS", "see_what_works") | ||
| assert STRICTNESS in ("see_what_works", "all_must_work") | ||
|
|
||
| BCL_FILENAME = find_bitcode_lib_module._SUPPORTED_BITCODE_LIBS_INFO["device"]["filename"] | ||
|
|
||
| def _bitcode_lib_info(libname: str): | ||
| return find_bitcode_lib_module._SUPPORTED_BITCODE_LIBS_INFO[libname] | ||
|
|
||
|
|
||
| def _bitcode_lib_filename(libname: str) -> str: | ||
| return _bitcode_lib_info(libname)["filename"] | ||
|
|
||
|
|
||
| @pytest.fixture | ||
|
|
@@ -30,15 +36,20 @@ def clear_find_bitcode_lib_cache(): | |
| get_cuda_path_or_home.cache_clear() | ||
|
|
||
|
|
||
| def _make_bitcode_lib_file(dir_path: Path) -> str: | ||
| def _make_bitcode_lib_file(dir_path: Path, libname: str = "device") -> str: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's safer and more readable to avoid the defaults here and below. I tried out the change below. Could you please apply? @@ -36,14 +36,14 @@ def clear_find_bitcode_lib_cache():
get_cuda_path_or_home.cache_clear()
-def _make_bitcode_lib_file(dir_path: Path, libname: str = "device") -> str:
+def _make_bitcode_lib_file(dir_path: Path, libname: str) -> str:
dir_path.mkdir(parents=True, exist_ok=True)
file_path = dir_path / _bitcode_lib_filename(libname)
file_path.touch()
return str(file_path)
-def _bitcode_lib_dir_under(anchor_dir: Path, libname: str = "device") -> Path:
+def _bitcode_lib_dir_under(anchor_dir: Path, libname: str) -> Path:
return anchor_dir / _bitcode_lib_info(libname)["rel_path"]
@@ -138,7 +138,7 @@ def test_locate_bitcode_lib_search_order(monkeypatch, tmp_path, libname):
@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
def test_find_bitcode_lib_not_found_error_includes_cuda_home_directory_listing(monkeypatch, tmp_path):
cuda_home = tmp_path / "cuda-home"
- lib_dir = _bitcode_lib_dir_under(cuda_home)
+ lib_dir = _bitcode_lib_dir_under(cuda_home, "device")
lib_dir.mkdir(parents=True, exist_ok=True)
extra_file = lib_dir / "README.txt"
extra_file.write_text("placeholder", encoding="utf-8")
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. applied. |
||
| dir_path.mkdir(parents=True, exist_ok=True) | ||
| file_path = dir_path / BCL_FILENAME | ||
| file_path = dir_path / _bitcode_lib_filename(libname) | ||
| file_path.touch() | ||
| return str(file_path) | ||
|
|
||
|
|
||
| def _bitcode_lib_dir_under(anchor_dir: Path) -> Path: | ||
| return anchor_dir / "nvvm" / "libdevice" | ||
| def _bitcode_lib_dir_under(anchor_dir: Path, libname: str = "device") -> Path: | ||
| return anchor_dir / _bitcode_lib_info(libname)["rel_path"] | ||
|
|
||
|
|
||
| def _site_packages_bitcode_lib_dir_under(anchor_dir: Path, libname: str) -> Path: | ||
| rel_dir = _bitcode_lib_info(libname)["site_packages_dirs"][0] | ||
| return anchor_dir.joinpath(*rel_dir.split("/")) | ||
|
|
||
|
|
||
| def _conda_anchor(conda_prefix: Path) -> Path: | ||
|
|
@@ -79,36 +90,47 @@ def test_locate_bitcode_lib(info_summary_append, libname): | |
|
|
||
|
|
||
| @pytest.mark.usefixtures("clear_find_bitcode_lib_cache") | ||
| def test_locate_bitcode_lib_search_order(monkeypatch, tmp_path): | ||
| site_packages_lib_dir = tmp_path / "site-packages" / "nvidia" / "cu13" / "nvvm" / "libdevice" | ||
| site_packages_path = _make_bitcode_lib_file(site_packages_lib_dir) | ||
| @pytest.mark.parametrize("libname", SUPPORTED_BITCODE_LIBS) | ||
| def test_locate_bitcode_lib_search_order(monkeypatch, tmp_path, libname): | ||
| site_packages_lib_dir = _site_packages_bitcode_lib_dir_under(tmp_path / "site-packages", libname) | ||
| site_packages_path = _make_bitcode_lib_file(site_packages_lib_dir, libname) | ||
|
|
||
| conda_prefix = tmp_path / "conda-prefix" | ||
| conda_path = _make_bitcode_lib_file(_bitcode_lib_dir_under(_conda_anchor(conda_prefix))) | ||
| conda_path = _make_bitcode_lib_file(_bitcode_lib_dir_under(_conda_anchor(conda_prefix), libname), libname) | ||
|
|
||
| cuda_home = tmp_path / "cuda-home" | ||
| cuda_home_path = _make_bitcode_lib_file(_bitcode_lib_dir_under(cuda_home)) | ||
| cuda_home_path = _make_bitcode_lib_file(_bitcode_lib_dir_under(cuda_home, libname), libname) | ||
|
|
||
| site_packages_sub_dirs = tuple( | ||
| tuple(rel_dir.split("/")) for rel_dir in _bitcode_lib_info(libname)["site_packages_dirs"] | ||
| ) | ||
|
|
||
| def find_expected_sub_dir(sub_dir): | ||
| assert sub_dir in site_packages_sub_dirs | ||
| if sub_dir == site_packages_sub_dirs[0]: | ||
| return [str(site_packages_lib_dir)] | ||
| return [] | ||
|
|
||
| monkeypatch.setattr( | ||
| find_bitcode_lib_module, | ||
| "find_sub_dirs_all_sitepackages", | ||
| lambda _sub_dir: [str(site_packages_lib_dir)], | ||
| find_expected_sub_dir, | ||
| ) | ||
| monkeypatch.setenv("CONDA_PREFIX", str(conda_prefix)) | ||
| monkeypatch.setenv("CUDA_HOME", str(cuda_home)) | ||
| monkeypatch.delenv("CUDA_PATH", raising=False) | ||
|
|
||
| located_lib = locate_bitcode_lib("device") | ||
| located_lib = locate_bitcode_lib(libname) | ||
| assert located_lib.abs_path == site_packages_path | ||
| assert located_lib.found_via == "site-packages" | ||
| os.remove(site_packages_path) | ||
|
|
||
| located_lib = locate_bitcode_lib("device") | ||
| located_lib = locate_bitcode_lib(libname) | ||
| assert located_lib.abs_path == conda_path | ||
| assert located_lib.found_via == "conda" | ||
| os.remove(conda_path) | ||
|
|
||
| located_lib = locate_bitcode_lib("device") | ||
| located_lib = locate_bitcode_lib(libname) | ||
| assert located_lib.abs_path == cuda_home_path | ||
| assert located_lib.found_via == "CUDA_PATH" | ||
|
|
||
|
|
@@ -134,7 +156,7 @@ def test_find_bitcode_lib_not_found_error_includes_cuda_home_directory_listing(m | |
| find_bitcode_lib("device") | ||
|
|
||
| message = str(exc_info.value) | ||
| expected_missing_file = os.path.join(str(lib_dir), BCL_FILENAME) | ||
| expected_missing_file = os.path.join(str(lib_dir), _bitcode_lib_filename("device")) | ||
| assert f"No such file: {expected_missing_file}" in message | ||
| assert f'listdir("{lib_dir}"):' in message | ||
| assert "README.txt" in message | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you please make this
nccl_deviceto match the existingnvshmem_deviceconvention, and move the new entry beforenvshmem_deviceto keep this list in lexicographic order?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I still see
nccl, could you please make itnccl_devicefor internal consistency?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In list
DESCRIPTOR_CATALOGof dynamic libs, the name of nvshmem is "nvshmem_host" and the name of nccl is "nccl", which reflect the .so filenameslibnvshmem_host.so.3andlibnccl.so.2.ncclas a key in_SUPPORTED_BITCODE_LIBS_INFOdoesn't match nvshmem, but it's consistent with nccl's name in dynamic lib listDESCRIPTOR_CATALOG. I'd like to keep the key simple for nccl, so I don't have to think about it should end with_deviceor_hostwhen I call these pathfinder APIs.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's gonna be surprising/confusing for anyone else, especially because the filename is
libnccl_device.bc. Unless there is more than a personal preference (?),nccl_deviceis clearly the more consistent name here.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
matching filename
libnccl_device.bcis a good point.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done