Skip to content

Commit 1e9cdac

Browse files
authored
Merge branch 'main' into swiglu_offset
2 parents 447f2c4 + 50ac303 commit 1e9cdac

4 files changed

Lines changed: 21 additions & 5 deletions

File tree

3rdparty/cudnn-frontend

Submodule cudnn-frontend updated 301 files

examples/pytorch/comm_gemm_overlap/README.md

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
- `CUDA_DEVICE_MAX_CONNECTIONS=1` must be enabled in the environment.
77
- For best performance, point-to-point communication via _CUDA Multicast_ needs CUDA Toolkit 12.0+
88
and CUDA driver 535+ on devices with compute capability 9.0 or newer.
9-
- Devices older than compute capability 9.0 require `UB_SKIPMC=1` in the environment in order fall
9+
- Devices older than compute capability 9.0 require `UB_SKIPMC=1` in the environment in order to fall
1010
back on a less performant implementation based on CUDA Inter-Process Communication (IPC) handles.
1111

1212
## Examples
@@ -22,7 +22,7 @@ $ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) te_layer_with_ov
2222
# [rank0:node0] |-- Created tensor-parallel group: [0, 1, 2, 3, 4, 5, 6, 7]
2323
# !!! [UB] Create UbufP2PCommOverlap Communicator
2424
# UB_TIMEOUT is set to 110 sec, 217800000000 cycles, freq: 1980000khz
25-
# MC initialized succesfully, window size = 549755813888
25+
# MC initialized successfully, window size = 549755813888
2626
# !!! [UBP2P] Register UBuf 1
2727
# !!! [UBP2P] Register UBuf 2
2828
# !!! [UBP2P] Register UBuf 3
@@ -66,7 +66,7 @@ $ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) te_layer_with_ov
6666
```
6767
### Single node, mixed data- and tensor-parallel LayerNormMLP:
6868

69-
Uses `torch.nn.parallel.DistributedDataParallel` for replicatin the model across 2 tensor-parallel
69+
Uses `torch.nn.parallel.DistributedDataParallel` for replicating the model across 2 tensor-parallel
7070
groups in a single node.
7171

7272
```bash
@@ -81,7 +81,7 @@ $ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) te_layer_with_ov
8181
# [rank2:node0] |-- Created data-parallel group: [2, 6]
8282
# !!! [UB] Create UbufP2PCommOverlap Communicator
8383
# UB_TIMEOUT is set to 110 sec, 217800000000 cycles, freq: 1980000khz
84-
# MC initialized succesfully, window size = 549755813888
84+
# MC initialized successfully, window size = 549755813888
8585
# !!! [UBP2P] Register UBuf 1
8686
# !!! [UBP2P] Register UBuf 2
8787
# !!! [UBP2P] Register UBuf 3

transformer_engine/jax/setup.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@
4242
shutil.rmtree(build_tools_copy)
4343
shutil.copytree(build_tools_dir, build_tools_copy)
4444

45+
license_src = current_file_path.parent.parent / "LICENSE"
46+
license_dst = current_file_path / "LICENSE"
47+
if license_src.is_file():
48+
shutil.copyfile(license_src, license_dst)
49+
4550

4651
from build_tools.build_ext import get_build_ext
4752
from build_tools.utils import copy_common_headers, min_python_version_str
@@ -131,7 +136,10 @@ def get_cuda_major_version() -> int:
131136
python_requires=f">={min_python_version_str()}",
132137
install_requires=install_requires,
133138
tests_require=test_requirements(),
139+
license_files=("LICENSE",),
134140
)
135141
if any(x in sys.argv for x in (".", "sdist", "bdist_wheel")):
136142
shutil.rmtree(common_headers_dir)
137143
shutil.rmtree("build_tools")
144+
if license_dst.is_file():
145+
license_dst.unlink()

transformer_engine/pytorch/setup.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@
4343
shutil.rmtree(build_tools_copy)
4444
shutil.copytree(build_tools_dir, build_tools_copy)
4545

46+
license_src = current_file_path.parent.parent / "LICENSE"
47+
license_dst = current_file_path / "LICENSE"
48+
if license_src.is_file():
49+
shutil.copyfile(license_src, license_dst)
50+
4651

4752
from build_tools.build_ext import get_build_ext
4853
from build_tools.utils import copy_common_headers, min_python_version_str
@@ -177,7 +182,10 @@ def run(self):
177182
python_requires=f">={min_python_version_str()}",
178183
install_requires=install_requires,
179184
tests_require=test_requirements(),
185+
license_files=("LICENSE",),
180186
)
181187
if any(x in sys.argv for x in (".", "sdist", "bdist_wheel")):
182188
shutil.rmtree(common_headers_dir)
183189
shutil.rmtree("build_tools")
190+
if license_dst.is_file():
191+
license_dst.unlink()

0 commit comments

Comments
 (0)