diff --git a/docs/docs/concepts/tasks.md b/docs/docs/concepts/tasks.md index ed9a00db6..39a626ddf 100644 --- a/docs/docs/concepts/tasks.md +++ b/docs/docs/concepts/tasks.md @@ -139,6 +139,15 @@ Nodes can communicate using their private IP addresses. Use `DSTACK_MASTER_NODE_IP`, `DSTACK_NODES_IPS`, `DSTACK_NODE_RANK`, and other [System environment variables](#system-environment-variables) for inter-node communication. +`dstack` is easy to use with `accelerate`, `torchrun`, Ray, Spark, and any other distributed framework. + + +!!! info "MPI" + If you want to use MPI, set `startup_order` to `workers-first` and `stop_criteria` to `master-done`, and use `DSTACK_MPI_HOSTFILE`. + See the [NCCL](../../examples/clusters/nccl-tests/index.md) or [RCCL](../../examples/clusters/rccl-tests/index.md) examples. + +> For more examples, see [distributed training](../../examples.md#distributed-training). + ??? info "Network interface" Distributed frameworks usually detect the correct network interface automatically, but sometimes you need to specify it explicitly. @@ -170,7 +179,7 @@ Use `DSTACK_MASTER_NODE_IP`, `DSTACK_NODES_IPS`, `DSTACK_NODE_RANK`, and other recommended to create them via a fleet configuration to ensure the highest level of inter-node connectivity. -`dstack` is easy to use with `accelerate`, `torchrun`, Ray, Spark, and any other distributed frameworks. +> See the [Clusters](../guides/clusters.md) guide for more details on how to use `dstack` on clusters. ### Resources diff --git a/docs/docs/guides/clusters.md b/docs/docs/guides/clusters.md index b8f6c3a8f..f2502eb48 100644 --- a/docs/docs/guides/clusters.md +++ b/docs/docs/guides/clusters.md @@ -38,25 +38,28 @@ For cloud fleets, fast interconnect is currently supported only on the `aws`, `g > To request fast interconnect support for other backends, file an [issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues){:target="_blank"}. 
-## NCCL/RCCL tests - -To test the interconnect of a created fleet, ensure you run [NCCL](../../examples/clusters/nccl-tests/index.md) -(for NVIDIA) or [RCCL](../../examples/clusters/rccl-tests/index.md) (for AMD) tests. - ## Distributed tasks A distributed task is a task with `nodes` set to a value of `2` or more. In this case, `dstack` first ensures a -suitable fleet is available, then starts the master node and runs the task container on it. Once the master is up, -`dstack` starts the rest of the nodes and runs the task container on each of them. +suitable fleet is available, then selects the master node (to obtain its IP) and finally runs jobs on each node. Within the task's `commands`, it's possible to use `DSTACK_MASTER_NODE_IP`, `DSTACK_NODES_IPS`, `DSTACK_NODE_RANK`, and other [system environment variables](../concepts/tasks.md#system-environment-variables) for inter-node communication. -Refer to [distributed tasks](../concepts/tasks.md#distributed-tasks) for an example. +??? info "MPI" + If you want to use MPI, set `startup_order` to `workers-first` and `stop_criteria` to `master-done`, and use `DSTACK_MPI_HOSTFILE`. + See the [NCCL](../../examples/clusters/nccl-tests/index.md) or [RCCL](../../examples/clusters/rccl-tests/index.md) examples. !!! info "Retry policy" By default, if any of the nodes fails, `dstack` terminates the entire run. Configure a [retry policy](../concepts/tasks.md#retry-policy) to restart the run if any node fails. +Refer to [distributed tasks](../concepts/tasks.md#distributed-tasks) for an example. + +## NCCL/RCCL tests + +To test the interconnect of a created fleet, run the [NCCL](../../examples/clusters/nccl-tests/index.md) +(for NVIDIA) or [RCCL](../../examples/clusters/rccl-tests/index.md) (for AMD) tests using MPI. 
+ ## Volumes ### Network volumes diff --git a/examples/clusters/nccl-tests/.dstack.yml b/examples/clusters/nccl-tests/.dstack.yml index 59ff2ca6a..3870731e3 100644 --- a/examples/clusters/nccl-tests/.dstack.yml +++ b/examples/clusters/nccl-tests/.dstack.yml @@ -5,17 +5,19 @@ nodes: 2 startup_order: workers-first stop_criteria: master-done +# This image comes with MPI and NCCL tests pre-built image: dstackai/efa env: - NCCL_DEBUG=INFO commands: + - cd /root/nccl-tests/build - | - if [ ${DSTACK_NODE_RANK} -eq 0 ]; then - cd /root/nccl-tests/build - MPIRUN="mpirun --allow-run-as-root --hostfile $DSTACK_MPI_HOSTFILE" - # Run NCCL Tests - ${MPIRUN} \ - -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \ + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ --mca btl_tcp_if_exclude lo,docker0 \ --bind-to none \ ./all_reduce_perf -b 8 -e 8G -f 2 -g 1 diff --git a/examples/clusters/nccl-tests/README.md b/examples/clusters/nccl-tests/README.md index 76e67cd3a..ee1bf7fc9 100644 --- a/examples/clusters/nccl-tests/README.md +++ b/examples/clusters/nccl-tests/README.md @@ -20,13 +20,14 @@ image: dstackai/efa env: - NCCL_DEBUG=INFO commands: + - cd /root/nccl-tests/build - | - if [ ${DSTACK_NODE_RANK} -eq 0 ]; then - cd /root/nccl-tests/build - MPIRUN="mpirun --allow-run-as-root --hostfile $DSTACK_MPI_HOSTFILE" - # Run NCCL Tests - ${MPIRUN} \ - -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \ + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ --mca btl_tcp_if_exclude lo,docker0 \ --bind-to none \ ./all_reduce_perf -b 8 -e 8G -f 2 -g 1 @@ -37,11 +38,12 @@ commands: resources: gpu: nvidia:4:16GB shm_size: 16GB - ``` + + !!! info "Docker image" The `dstackai/efa` image used in the example comes with MPI and NCCL tests pre-installed. 
While it is optimized for [AWS EFA :material-arrow-top-right-thin:{ .external }](https://aws.amazon.com/hpc/efa/){:target="_blank"}, it can also diff --git a/examples/clusters/rccl-tests/.dstack.yml b/examples/clusters/rccl-tests/.dstack.yml index 17fb007fa..5beb1cd3e 100644 --- a/examples/clusters/rccl-tests/.dstack.yml +++ b/examples/clusters/rccl-tests/.dstack.yml @@ -2,10 +2,12 @@ type: task name: rccl-tests nodes: 2 +startup_order: workers-first +stop_criteria: master-done -# Uncomment to mount the system libraries folder from the host -#volumes: -# - /usr/local/lib:/mnt/lib +# Mount the system libraries folder from the host +volumes: + - /usr/local/lib:/mnt/lib image: rocm/dev-ubuntu-22.04:6.4-complete env: @@ -16,41 +18,26 @@ commands: - apt-get install -y git libopenmpi-dev openmpi-bin - git clone https://github.com/ROCm/rccl-tests.git - cd rccl-tests - - make MPI=1 MPI_HOME=${OPEN_MPI_HOME} + - make MPI=1 MPI_HOME=$OPEN_MPI_HOME - # Uncomment to preload the RoCE driver library from the host (for Broadcom driver compatibility) - #- export LD_PRELOAD=/mnt/lib/libbnxt_re-rdmav34.so + # Preload the RoCE driver library from the host (for Broadcom driver compatibility) + - export LD_PRELOAD=/mnt/lib/libbnxt_re-rdmav34.so # Run RCCL tests via MPI - | - FIFO=/tmp/${DSTACK_RUN_NAME} - if [ ${DSTACK_NODE_RANK} -eq 0 ]; then - sleep 10 - echo "$DSTACK_NODES_IPS" | tr ' ' '\n' > hostfile - MPIRUN='mpirun --allow-run-as-root --hostfile hostfile' - # Wait for other nodes - while true; do - if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then - break - fi - echo 'Waiting for other nodes...' 
- sleep 5 - done - # Run NCCL Tests - ${MPIRUN} \ - -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \ + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ --mca btl_tcp_if_include ens41np0 \ -x LD_PRELOAD \ -x NCCL_IB_HCA=mlx5_0/1,bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 \ -x NCCL_IB_GID_INDEX=3 \ -x NCCL_IB_DISABLE=0 \ ./build/all_reduce_perf -b 8M -e 8G -f 2 -g 1 -w 5 --iters 20 -c 0; - # Notify other nodes the MPI run is finished - ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}" else - mkfifo ${FIFO} - # Wait for a message from the master node - cat ${FIFO} + sleep infinity fi resources: diff --git a/examples/clusters/rccl-tests/README.md b/examples/clusters/rccl-tests/README.md index 1b97f9f84..ac0876fd7 100644 --- a/examples/clusters/rccl-tests/README.md +++ b/examples/clusters/rccl-tests/README.md @@ -13,6 +13,8 @@ type: task name: rccl-tests nodes: 2 +startup_order: workers-first +stop_criteria: master-done # Mount the system libraries folder from the host volumes: @@ -27,41 +29,26 @@ commands: - apt-get install -y git libopenmpi-dev openmpi-bin - git clone https://github.com/ROCm/rccl-tests.git - cd rccl-tests - - make MPI=1 MPI_HOME=${OPEN_MPI_HOME} + - make MPI=1 MPI_HOME=$OPEN_MPI_HOME # Preload the RoCE driver library from the host (for Broadcom driver compatibility) - export LD_PRELOAD=/mnt/lib/libbnxt_re-rdmav34.so # Run RCCL tests via MPI - | - FIFO=/tmp/${DSTACK_RUN_NAME} - if [ ${DSTACK_NODE_RANK} -eq 0 ]; then - sleep 10 - echo "$DSTACK_NODES_IPS" | tr ' ' '\n' > hostfile - MPIRUN='mpirun --allow-run-as-root --hostfile hostfile' - # Wait for other nodes - while true; do - if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then - break - fi - echo 'Waiting for other nodes...' 
- sleep 5 - done - # Run NCCL Tests - ${MPIRUN} \ - -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \ + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ --mca btl_tcp_if_include ens41np0 \ -x LD_PRELOAD \ -x NCCL_IB_HCA=mlx5_0/1,bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 \ -x NCCL_IB_GID_INDEX=3 \ -x NCCL_IB_DISABLE=0 \ ./build/all_reduce_perf -b 8M -e 8G -f 2 -g 1 -w 5 --iters 20 -c 0; - # Notify other nodes the MPI run is finished - ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}" else - mkfifo ${FIFO} - # Wait for a message from the master node - cat ${FIFO} + sleep infinity fi resources:
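The hand-rolled synchronization removed above built an MPI hostfile from `DSTACK_NODES_IPS` before `DSTACK_MPI_HOSTFILE` made that unnecessary. For reference, a minimal sketch of what that removed step did (the IP values below are made up for illustration; in a real run `dstack` sets `DSTACK_NODES_IPS` itself):

```shell
# Hypothetical stand-in for the value dstack would inject at runtime.
DSTACK_NODES_IPS="10.0.0.1 10.0.0.2"

hostfile=$(mktemp)

# One node IP per line -- the format mpirun's --hostfile expects.
echo "$DSTACK_NODES_IPS" | tr ' ' '\n' > "$hostfile"

cat "$hostfile"
# prints:
# 10.0.0.1
# 10.0.0.2
```

With `startup_order: workers-first` and `stop_criteria: master-done`, the polling loop and FIFO handshake are no longer needed: workers are guaranteed to be up before the master's `mpirun` starts, and the run stops as soon as the master job finishes.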