Skip to content

Commit 9452570

Browse files
authored
Merge pull request #165 from erieaton-amd/sparse
Update to rocm 7.1 and convert rocshmem to a sparse checkout
2 parents d9f94a3 + fd41ad3 commit 9452570

10 files changed

Lines changed: 64 additions & 32 deletions

File tree

.github/workflows/amd-ci.yml

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,10 @@ jobs:
1212
runs-on: "amd-gfx942-mi325"
1313
timeout-minutes: 45
1414
container:
15-
image: rocm/pytorch:rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.5.1_preview
15+
image: rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
1616
options: >-
1717
--device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
18-
--volume /home/runner/.triton:/github/home/.triton
18+
--volume /home/runner/.triton:/github/home/.triton --shm-size=512MB
1919
env:
2020
TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
2121
TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
@@ -26,8 +26,8 @@ jobs:
2626
- name: Install dependencies
2727
run: |
2828
rm -f /usr/local/bin/cmake
29-
apt-get update -y && apt install -y libopenmpi-dev git cython3 ibverbs-utils openmpi-bin libopenmpi-dev libpci-dev libdw1 locales cmake
30-
pip3 install -i https://test.pypi.org/simple hip-python>=6.3.0 # (or whatever Rocm version you have)
29+
apt-get update -y && apt install -y libopenmpi-dev git cython3 ibverbs-utils openmpi-bin libopenmpi-dev libpci-dev libdw1 locales cmake miopen-hip autoconf libtool flex ninja-build clang lld
30+
pip3 install -i https://test.pypi.org/simple hip-python>=7.1 # (or whatever Rocm version you have)
3131
pip3 install pybind11
3232
- name: Checkout
3333
uses: actions/checkout@v4
@@ -58,10 +58,10 @@ jobs:
5858
runs-on: "amd-gfx942-mi325"
5959
timeout-minutes: 60
6060
container:
61-
image: rocm/pytorch:rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.5.1_preview
61+
image: rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
6262
options: >-
6363
--device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
64-
--volume /home/runner/.triton:/github/home/.triton
64+
--volume /home/runner/.triton:/github/home/.triton --shm-size=512MB
6565
env:
6666
TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
6767
TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
@@ -72,9 +72,9 @@ jobs:
7272
steps:
7373
- name: Install dependencies
7474
run: |
75-
apt-get update -y && apt install -y libopenmpi-dev
76-
pip3 install -i https://test.pypi.org/simple hip-python>=6.3.0 # (or whatever Rocm version you have)
77-
pip3 install pybind11
75+
apt-get update -y && apt install -y libopenmpi-dev cmake miopen-hip autoconf libtool flex ninja-build clang lld
76+
pip3 install -i https://test.pypi.org/simple hip-python>=7.1 # (or whatever Rocm version you have)
77+
pip3 install pybind11 psutil
7878
- name: Checkout
7979
uses: actions/checkout@v4
8080
with:
@@ -103,10 +103,10 @@ jobs:
103103
runs-on: "amd-gfx942-mi325"
104104
timeout-minutes: 45
105105
container:
106-
image: rocm/pytorch:rocm6.4.3_ubuntu22.04_py3.10_pytorch_release_2.5.1
106+
image: rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
107107
options: >-
108108
--device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
109-
--volume /home/runner/.triton:/github/home/.triton
109+
--volume /home/runner/.triton:/github/home/.triton --shm-size=512MB
110110
env:
111111
TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
112112
TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
@@ -118,8 +118,8 @@ jobs:
118118
- name: Install dependencies
119119
run: |
120120
rm -f /usr/local/bin/cmake
121-
apt-get update -y && apt install -y libopenmpi-dev git cython3 ibverbs-utils openmpi-bin libopenmpi-dev libpci-dev libdw1 locales cmake
122-
pip3 install -i https://test.pypi.org/simple hip-python==6.4.3.555.40
121+
apt-get update -y && apt install -y libopenmpi-dev git cython3 ibverbs-utils openmpi-bin libopenmpi-dev libpci-dev libdw1 locales cmake ninja-build clang lld
122+
pip3 install -i https://test.pypi.org/simple hip-python>=7.1
123123
pip3 install pybind11
124124
- name: Checkout
125125
uses: actions/checkout@v4

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ python/triton_dist/_C/*nvshmem
6161
hf_configs*
6262
*.whl
6363

64+
3rdparty/rocm-systems
65+
3rdparty/rocshmem
66+
6467
# conflicts
6568
temp_conflicts
6669
temp_merge_repo
@@ -82,4 +85,4 @@ tmp*
8285
*zst*
8386

8487
.cursor
85-
*.csv
88+
*.csv

.gitmodules

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
[submodule "3rdparty/rocshmem"]
2-
path = 3rdparty/rocshmem
3-
url = https://github.com/ROCm/rocSHMEM.git
41
[submodule "3rdparty/triton"]
52
path = 3rdparty/triton
63
url = https://github.com/ByteDance-Seed/triton.git

3rdparty/mori

Submodule mori updated 1 file

3rdparty/rocshmem

Lines changed: 0 additions & 1 deletion
This file was deleted.

python/triton_dist/test/amd/test_rocshmem_api.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,9 +209,8 @@ def test_rocshmem_memcpy():
209209

210210
HIP_CHECK(cp_res)
211211

212-
pyrocshmem.rocshmem_barrier_all_on_stream(cur_stream.cuda_stream)
213-
214212
torch.cuda.synchronize()
213+
pyrocshmem.rocshmem_barrier_all_on_stream(cur_stream.cuda_stream)
215214

216215
try:
217216
torch.testing.assert_close(comm_buffs[peer], one)

shmem/rocshmem_bind/build.sh

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,6 @@ set -e
66
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
77
PROJECT_ROOT=$(realpath ${SCRIPT_DIR})
88

9-
function apt_install_deps() {
10-
apt update -y
11-
apt-get install -y miopen-hip
12-
}
13-
149
function build_pyrocshmem_cmake() {
1510
pushd ${PROJECT_ROOT}/pyrocshmem
1611
mkdir -p build
@@ -58,8 +53,6 @@ export ROCM_INSTALL_DIR="/opt/rocm"
5853
export PATH="${OPENMPI_UCX_INSTALL_DIR}/bin:$PATH"
5954
export LD_LIBRARY_PATH="${OPENMPI_UCX_INSTALL_DIR}/lib:$LD_LIBRARY_PATH"
6055

61-
apt_install_deps
62-
6356
# build rocshmem
6457
bash -x ${PROJECT_ROOT}/build_rocshmem.sh
6558
# build rocshmem bitcode

shmem/rocshmem_bind/build_rocshmem.sh

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,49 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
77
PROJECT_ROOT=$(realpath ${SCRIPT_DIR})
88
ROCSHMEM_SRC_DIR=${PROJECT_ROOT}/../../3rdparty/rocshmem
99

10+
sys_path="${PROJECT_ROOT}/../../3rdparty/rocm-systems"
11+
12+
if [ -d "${ROCSHMEM_SRC_DIR}" ]; then
13+
pushd "${PROJECT_ROOT}/../.."
14+
active=$(git config submodule.3rdparty/rocshmem.active || echo "nil")
15+
if [ "${active}" = "true" ]; then
16+
echo "Error: Rocshmem submodule still active, please delete it"
17+
quit=1
18+
fi
19+
popd
20+
pushd "${ROCSHMEM_SRC_DIR}"
21+
url=$(git remote get-url origin || echo "nil")
22+
if [ "${url}" = "https://github.com/ROCm/rocSHMEM.git" ]; then
23+
echo "Error: Old rocshmem checkout found, please delete it"
24+
quit=1
25+
fi
26+
popd
27+
if ! [ -z "${quit}" ]; then
28+
exit $quit
29+
fi
30+
31+
if ! [ "$(ls -A "${ROCSHMEM_SRC_DIR}")" ]; then
32+
rmdir "${ROCSHMEM_SRC_DIR}"
33+
fi
34+
fi
35+
36+
rocm_systems_tag=hip-version_7.12.60610
37+
38+
if ! [ -d "${ROCSHMEM_SRC_DIR}" ]; then
39+
echo "Creating sparse checkout"
40+
pushd "${PROJECT_ROOT}/../.."
41+
git clone "https://github.com/ROCm/rocm-systems.git" -b "${rocm_systems_tag}" --depth 1 --sparse "${sys_path}"
42+
popd
43+
pushd "${sys_path}"
44+
git config core.sparseCheckoutCone true
45+
git sparse-checkout set projects/rocshmem
46+
ln -s rocm-systems/projects/rocshmem ../rocshmem
47+
popd
48+
fi
49+
50+
pushd "${sys_path}"
51+
git checkout "${rocm_systems_tag}"
52+
popd
1053

1154
pushd ${ROCSHMEM_SRC_DIR}
1255

@@ -17,7 +60,6 @@ OMPI_INSTALL_DIR="${OMPI_INSTALL_DIR:-/opt/ompi_build}"
1760
# build ompi, ucx
1861
if [ ! -e "${OMPI_INSTALL_DIR}" ]; then
1962
# prepare for building ompi, ucx
20-
apt-get install autoconf libtool flex -y
2163
BUILD_DIR=${OMPI_INSTALL_DIR} bash ${ROCSHMEM_SRC_DIR}/scripts/install_dependencies.sh
2264
else
2365
echo "ompi exists, skip building ompi and ucx"

shmem/rocshmem_bind/pyrocshmem/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def setup_pytorch_extension() -> setuptools.Extension:
108108
"""Setup CppExtension for PyTorch support"""
109109
include_dirs, library_dirs, libraries = [], [], []
110110

111-
deps = [hip_deps(), mpi_deps(), rocshmem_deps()]
111+
deps = [rocshmem_deps(), hip_deps(), mpi_deps()]
112112

113113
for include_dir, library_dir, library in deps:
114114
include_dirs += include_dir

shmem/rocshmem_bind/scripts/build_rocshmem_device_bc.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,11 @@ SOURCE_MAP=(
3939
["${ROCSHMEM_SRC}/src/ipc/context_ipc_device_coll.cpp"]="rocshmem_context_ipc_device_coll.bc"
4040
["${ROCSHMEM_SRC}/src/ipc_policy.cpp"]="rocshmem_ipc_policy.bc"
4141
["${ROCSHMEM_SRC}/src/gda/context_gda_device.cpp"]="rocshmem_context_gda_device.bc"
42+
["${ROCSHMEM_SRC}/src/gda/context_gda_device_coll.cpp"]="rocshmem_context_gda_device_coll.bc"
4243
["${ROCSHMEM_SRC}/src/gda/backend_gda.cpp"]="rocshmem_backend_gda.bc"
4344
["${ROCSHMEM_SRC}/src/gda/queue_pair.cpp"]="rocshmem_queue_pair.bc"
4445
["${ROCSHMEM_SRC}/src/gda/ionic/queue_pair_ionic.cpp"]="rocshmem_queue_pair_ionic.bc"
4546
["${ROCSHMEM_SRC}/src/gda/mlx5/queue_pair_mlx5.cpp"]="rocshmem_queue_pair_mlx5.bc"
46-
["${ROCSHMEM_SRC}/src/gda/mlx5/segment_builder.cpp"]="rocshmem_segment_builder.bc"
47-
["${ROCSHMEM_SRC}/src/gda/endian.cpp"]="rocshmem_endian.bc"
4847
["${ROCSHMEM_SRC}/src/team.cpp"]="rocshmem_team.bc"
4948
["${ROCSHMEM_SRC}/src/sync/abql_block_mutex.cpp"]="rocshmem_abql_block_mutex.bc"
5049
["${ROCSHMEM_SRC}/src/util.cpp"]="rocshmem_util.bc"

0 commit comments

Comments
 (0)