-
-
Notifications
You must be signed in to change notification settings - Fork 226
Expand file tree
/
Copy path.dstack.yml
More file actions
57 lines (51 loc) · 1.72 KB
/
.dstack.yml
File metadata and controls
57 lines (51 loc) · 1.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# dstack task: builds ROCm's rccl-tests with MPI support and runs the
# all_reduce_perf benchmark across all nodes of the run via mpirun.
type: task
name: rccl-tests

nodes: 2

# Uncomment to mount the system libraries folder from the host
#volumes:
#  - /usr/local/lib:/mnt/lib

image: rocm/dev-ubuntu-22.04:6.4-complete
env:
  - NCCL_DEBUG=INFO
  - OPEN_MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi
commands:
  # Setup MPI and build RCCL tests.
  # Refresh the package index first: container images commonly ship without
  # apt lists, in which case a bare `apt-get install` cannot locate packages.
  - apt-get update
  - apt-get install -y git libopenmpi-dev openmpi-bin
  - git clone https://github.com/ROCm/rccl-tests.git
  - cd rccl-tests
  - make MPI=1 MPI_HOME=${OPEN_MPI_HOME}

  # Uncomment to preload the RoCE driver library from the host (for Broadcom driver compatibility)
  #- export LD_PRELOAD=/mnt/lib/libbnxt_re-rdmav34.so

  # Run RCCL tests via MPI.
  # Node rank 0 drives the MPI run; every other node blocks on a FIFO until
  # rank 0 signals completion.
  - |
    FIFO=/tmp/${DSTACK_RUN_NAME}
    if [ "${DSTACK_NODE_RANK}" -eq 0 ]; then
      sleep 10
      # One host per line, as expected by --hostfile
      echo "$DSTACK_NODES_IPS" | tr ' ' '\n' > hostfile
      # NOTE: ${MPIRUN} is expanded unquoted on purpose so that it is
      # word-split into the command and its arguments.
      MPIRUN='mpirun --allow-run-as-root --hostfile hostfile'
      # Wait for other nodes: retry a no-op MPI launch (one process per node)
      # until it succeeds on every node.
      while true; do
        if ${MPIRUN} -n "${DSTACK_NODES_NUM}" -N 1 true >/dev/null 2>&1; then
          break
        fi
        echo 'Waiting for other nodes...'
        sleep 5
      done
      # Run RCCL tests: one process per GPU.
      # NOTE: the interface name (ens41np0) and the HCA list below are
      # specific to the target hosts; adjust them to match your cluster.
      # The exit status is captured instead of aborting so that the other
      # nodes are always notified, even when the benchmark fails.
      TEST_RC=0
      ${MPIRUN} \
        -n "${DSTACK_GPUS_NUM}" -N "${DSTACK_GPUS_PER_NODE}" \
        --mca btl_tcp_if_include ens41np0 \
        -x LD_PRELOAD \
        -x NCCL_IB_HCA=mlx5_0/1,bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 \
        -x NCCL_IB_GID_INDEX=3 \
        -x NCCL_IB_DISABLE=0 \
        ./build/all_reduce_perf -b 8M -e 8G -f 2 -g 1 -w 5 --iters 20 -c 0 || TEST_RC=$?
      # Notify other nodes the MPI run is finished, then report the
      # benchmark's status so a failed run is not masked by a successful
      # notification.
      ${MPIRUN} -n "${DSTACK_NODES_NUM}" -N 1 sh -c "echo done > ${FIFO}" \
        && [ "${TEST_RC}" -eq 0 ]
    else
      mkfifo "${FIFO}"
      # Wait for a message from the master node
      cat "${FIFO}"
    fi

resources:
  gpu: MI300X:8