Commit fdad550: Merge branch 'master' into 2655-ux-improve-the-output-of-dstack-ps
2 parents: 00bb4b3 + c1126ba

20 files changed: 302 additions & 102 deletions

docs/docs/concepts/tasks.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -378,6 +378,7 @@ If you don't assign a value to an environment variable (see `HF_TOKEN` above),
 | `DSTACK_NODE_RANK` | The rank of the node |
 | `DSTACK_MASTER_NODE_IP` | The internal IP address of the master node |
 | `DSTACK_NODES_IPS` | The list of internal IP addresses of all nodes delimited by "\n" |
+| `DSTACK_MPI_HOSTFILE` | The path to a pre-populated MPI hostfile |
 
 ### Spot policy
```
docs/docs/reference/environment-variables.md

Lines changed: 1 addition & 0 deletions
````diff
@@ -77,6 +77,7 @@ tasks, and services:
 ```
 
 - `DSTACK_NODES_IPS`{ #DSTACK_NODES_IPS } – The list of internal IP addresses of all nodes delimited by `"\n"`.
+- `DSTACK_MPI_HOSTFILE`{ #DSTACK_MPI_HOSTFILE } – The path to a pre-populated MPI hostfile that can be used directly as `mpirun --hostfile $DSTACK_MPI_HOSTFILE`.
 
 ## Server
````
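The hostfile that `DSTACK_MPI_HOSTFILE` points to uses the standard Open MPI hostfile format: one line per node, an address followed by a `slots=` count. As a rough sketch of that format (the IPs and slot count here are illustrative, not taken from any real run), building and parsing such a file might look like:

```python
# Sketch: build and parse an Open MPI-style hostfile ("<ip> slots=<n>" per line).
# IPs and slot counts below are made up for illustration.

def build_hostfile(ips, gpus_per_node):
    """Return hostfile text, one '<ip> slots=<n>' line per node."""
    return "".join(f"{ip} slots={gpus_per_node}\n" for ip in ips)

def total_slots(hostfile_text):
    """Sum the slots= values, i.e. the total number of MPI ranks available."""
    total = 0
    for line in hostfile_text.splitlines():
        host, _, slots = line.partition(" slots=")
        if host and slots.isdigit():
            total += int(slots)
    return total

content = build_hostfile(["10.0.0.1", "10.0.0.2"], gpus_per_node=4)
print(content, end="")       # 10.0.0.1 slots=4 / 10.0.0.2 slots=4
print(total_slots(content))  # 8
```

With this format, `mpirun --hostfile $DSTACK_MPI_HOSTFILE -n $DSTACK_GPUS_NUM` can place one rank per GPU slot without any per-job hostfile generation.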

examples/clusters/nccl-tests/.dstack.yml

Lines changed: 4 additions & 23 deletions
```diff
@@ -2,44 +2,25 @@ type: task
 name: nccl-tests
 
 nodes: 2
+startup_order: workers-first
+stop_criteria: master-done
 
 image: dstackai/efa
 env:
   - NCCL_DEBUG=INFO
 commands:
   - |
-    # We use FIFO for inter-node communication
-    FIFO=/tmp/dstack_job
     if [ ${DSTACK_NODE_RANK} -eq 0 ]; then
       cd /root/nccl-tests/build
-      # Generate hostfile for mpirun
-      : > hostfile
-      for ip in ${DSTACK_NODES_IPS}; do
-        echo "${ip} slots=${DSTACK_GPUS_PER_NODE}" >> hostfile
-      done
-      MPIRUN='mpirun --allow-run-as-root --hostfile hostfile'
-      # Wait for other nodes
-      while true; do
-        if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then
-          break
-        fi
-        echo 'Waiting for nodes...'
-        sleep 5
-      done
+      MPIRUN="mpirun --allow-run-as-root --hostfile $DSTACK_MPI_HOSTFILE"
       # Run NCCL Tests
       ${MPIRUN} \
         -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \
-        --mca pml ^cm \
-        --mca btl tcp,self \
         --mca btl_tcp_if_exclude lo,docker0 \
         --bind-to none \
         ./all_reduce_perf -b 8 -e 8G -f 2 -g 1
-      # Notify nodes the job is done
-      ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}"
     else
-      mkfifo ${FIFO}
-      # Wait for a message from the first node
-      cat ${FIFO}
+      sleep infinity
     fi
 
 resources:
```

examples/clusters/nccl-tests/README.md

Lines changed: 8 additions & 36 deletions
````diff
@@ -6,51 +6,32 @@ This example shows how to run distributed [NCCL tests :material-arrow-top-right-
 
 Here's an example of a task that runs AllReduce test on 2 nodes, each with 4 GPUs (8 processes in total).
 
-<div editor-title="examples/distributed-training/nccl-tests/.dstack.yml">
+<div editor-title="examples/clusters/nccl-tests/.dstack.yml">
 
 ```yaml
 type: task
 name: nccl-tests
 
 nodes: 2
+startup_order: workers-first
+stop_criteria: master-done
 
 image: dstackai/efa
 env:
   - NCCL_DEBUG=INFO
 commands:
   - |
-    # We use FIFO for inter-node communication
-    FIFO=/tmp/dstack_job
     if [ ${DSTACK_NODE_RANK} -eq 0 ]; then
       cd /root/nccl-tests/build
-      # Generate hostfile for mpirun
-      : > hostfile
-      for ip in ${DSTACK_NODES_IPS}; do
-        echo "${ip} slots=${DSTACK_GPUS_PER_NODE}" >> hostfile
-      done
-      MPIRUN='mpirun --allow-run-as-root --hostfile hostfile'
-      # Wait for other nodes
-      while true; do
-        if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then
-          break
-        fi
-        echo 'Waiting for nodes...'
-        sleep 5
-      done
-      # Run NCCL tests
+      MPIRUN="mpirun --allow-run-as-root --hostfile $DSTACK_MPI_HOSTFILE"
+      # Run NCCL Tests
       ${MPIRUN} \
         -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \
-        --mca pml ^cm \
-        --mca btl tcp,self \
         --mca btl_tcp_if_exclude lo,docker0 \
         --bind-to none \
         ./all_reduce_perf -b 8 -e 8G -f 2 -g 1
-      # Notify nodes the job is done
-      ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}"
     else
-      mkfifo ${FIFO}
-      # Wait for a message from the first node
-      cat ${FIFO}
+      sleep infinity
     fi
 
 resources:
@@ -61,15 +42,6 @@ resources:
 
 </div>
 
-!!! info "MPI"
-    NCCL tests rely on MPI to run on multiple processes. The master node (`DSTACK_NODE_RANK=0`) generates `hostfile` (using `DSTACK_NODES_IPS`)
-    and waits until other nodes are accessible via MPI.
-    Then, it executes `/nccl-tests/build/all_reduce_perf` across all GPUs.
-
-    Non-master nodes use a `FIFO` pipe to wait until the MPI run is finished.
-
-    There is an open [issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/2467){:target="_blank"} to simplify the use of MPI with distributed tasks.
-
 !!! info "Docker image"
     The `dstackai/efa` image used in the example comes with MPI and NCCL tests pre-installed. While it is optimized for
     [AWS EFA :material-arrow-top-right-thin:{ .external }](https://aws.amazon.com/hpc/efa/){:target="_blank"}, it can also
@@ -84,7 +56,7 @@ To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/referenc
 <div class="termy">
 
 ```shell
-$ dstack apply -f examples/distributed-training/nccl-tests/.dstack.yml
+$ dstack apply -f examples/clusters/nccl-tests/.dstack.yml
 
 # BACKEND  REGION     INSTANCE       RESOURCES                                   SPOT  PRICE
 1  aws     us-east-1  g4dn.12xlarge  48xCPU, 192GB, 4xT4 (16GB), 100.0GB (disk)  no    $3.912
@@ -99,7 +71,7 @@ Submit the run nccl-tests? [y/n]: y
 ## Source code
 
 The source code of this example can be found in
-[`examples/distributed-training/nccl-tests` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/nccl-tests).
+[`examples/clusters/nccl-tests` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/clusters/nccl-tests).
 
 ## What's next?
````

examples/single-node-training/trl/README.md

Lines changed: 11 additions & 10 deletions
```diff
@@ -21,19 +21,19 @@ env:
   - WANDB_API_KEY
   - HUB_MODEL_ID
 commands:
-  - pip install "transformers>=4.43.2"
-  - pip install bitsandbytes
-  - pip install flash-attn --no-build-isolation
-  - pip install peft
-  - pip install wandb
+  # Pin torch==2.6.0 to avoid building Flash Attention from source.
+  # Prebuilt Flash Attention wheels are not available for the latest torch==2.7.0.
+  - uv pip install torch==2.6.0
+  - uv pip install transformers bitsandbytes peft wandb
+  - uv pip install flash_attn --no-build-isolation
   - git clone https://github.com/huggingface/trl
   - cd trl
-  - pip install .
-  - |
+  - uv pip install .
+  - |
     accelerate launch \
       --config_file=examples/accelerate_configs/multi_gpu.yaml \
       --num_processes $DSTACK_GPUS_PER_NODE \
-      examples/scripts/sft.py \
+      trl/scripts/sft.py \
       --model_name meta-llama/Meta-Llama-3.1-8B \
       --dataset_name OpenAssistant/oasst_top1_2023-08-25 \
       --dataset_text_field="text" \
@@ -44,14 +44,15 @@ commands:
       --report_to wandb \
      --bf16 \
       --max_seq_length 1024 \
-      --lora_r 16 --lora_alpha 32 \
+      --lora_r 16 \
+      --lora_alpha 32 \
       --lora_target_modules q_proj k_proj v_proj o_proj \
       --load_in_4bit \
       --use_peft \
       --attn_implementation "flash_attention_2" \
       --logging_steps=10 \
       --output_dir models/llama31 \
-      --hub_model_id $HUB_MODEL_ID
+      --hub_model_id peterschmidt85/FineLlama-3.1-8B
 
 resources:
   gpu:
```

examples/single-node-training/trl/train.dstack.yml

Lines changed: 10 additions & 11 deletions
```diff
@@ -12,20 +12,19 @@ env:
   - ACCELERATE_LOG_LEVEL=info
 # Commands of the task
 commands:
-  - conda install cuda
-  - pip install git+https://github.com/huggingface/transformers.git
-  - pip install bitsandbytes
-  - pip install flash-attn --no-build-isolation
-  - pip install peft
-  - pip install wandb
+  # Pin torch==2.6.0 to avoid building Flash Attention from source.
+  # Prebuilt Flash Attention wheels are not available for the latest torch==2.7.0.
+  - uv pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0
+  - uv pip install transformers bitsandbytes peft wandb
+  - uv pip install flash_attn --no-build-isolation
   - git clone https://github.com/huggingface/trl
   - cd trl
-  - pip install .
+  - uv pip install .
   - |
     accelerate launch \
       --config_file=examples/accelerate_configs/multi_gpu.yaml \
       --num_processes $DSTACK_GPUS_PER_NODE \
-      examples/scripts/sft.py \
+      trl/scripts/sft.py \
       --model_name meta-llama/Meta-Llama-3.1-8B \
       --dataset_name OpenAssistant/oasst_top1_2023-08-25 \
       --dataset_text_field="text" \
@@ -36,15 +35,15 @@ commands:
       --report_to wandb \
       --bf16 \
       --max_seq_length 1024 \
-      --lora_r 16 --lora_alpha 32 \
+      --lora_r 16 \
+      --lora_alpha 32 \
       --lora_target_modules q_proj k_proj v_proj o_proj \
       --load_in_4bit \
       --use_peft \
       --attn_implementation "flash_attention_2" \
       --logging_steps=10 \
       --output_dir models/llama31 \
-      --hub_model_id $HUB_MODEL_ID
-
+      --hub_model_id peterschmidt85/FineLlama-3.1-8B
 resources:
   gpu:
     # 24GB or more VRAM
```

runner/internal/executor/executor.go

Lines changed: 36 additions & 0 deletions
```diff
@@ -257,6 +257,8 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error
 	gpus_per_node_num := ex.clusterInfo.GPUSPerJob
 	gpus_num := nodes_num * gpus_per_node_num
 
+	mpiHostfilePath := filepath.Join(ex.homeDir, ".dstack/mpi/hostfile")
+
 	jobEnvs := map[string]string{
 		"DSTACK_RUN_ID": ex.run.Id,
 		"DSTACK_JOB_ID": ex.jobSubmission.Id,
@@ -268,6 +270,7 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error
 		"DSTACK_NODES_NUM":     strconv.Itoa(nodes_num),
 		"DSTACK_GPUS_PER_NODE": strconv.Itoa(gpus_per_node_num),
 		"DSTACK_GPUS_NUM":      strconv.Itoa(gpus_num),
+		"DSTACK_MPI_HOSTFILE":  mpiHostfilePath,
 	}
 
 	// Call buildLDLibraryPathEnv and update jobEnvs if no error occurs
@@ -390,6 +393,11 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error
 		}
 	}
 
+	err = writeMpiHostfile(ctx, ex.clusterInfo.JobIPs, gpus_per_node_num, mpiHostfilePath)
+	if err != nil {
+		return err
+	}
+
 	cmd.Env = envMap.Render()
 
 	log.Trace(ctx, "Starting exec", "cmd", cmd.String(), "working_dir", cmd.Dir, "env", cmd.Env)
@@ -696,6 +704,34 @@ func prepareSSHDir(uid int, gid int, homeDir string) (string, error) {
 	return sshDir, nil
 }
 
+func writeMpiHostfile(ctx context.Context, ips []string, gpus_per_node int, path string) error {
+	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+		return err
+	}
+	file, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+	nonEmptyIps := []string{}
+	for _, ip := range ips {
+		if ip != "" {
+			nonEmptyIps = append(nonEmptyIps, ip)
+		}
+	}
+	if len(nonEmptyIps) == len(ips) {
+		for _, ip := range nonEmptyIps {
+			line := fmt.Sprintf("%s slots=%d\n", ip, gpus_per_node)
+			if _, err = file.WriteString(line); err != nil {
+				return err
+			}
+		}
+	} else {
+		log.Info(ctx, "creating empty MPI hostfile: no internal IPs assigned")
+	}
+	return nil
+}
+
 func writeDstackProfile(env map[string]string, path string) error {
 	file, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644)
 	if err != nil {
```
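Note that `writeMpiHostfile` is all-or-nothing: it writes host lines only when every node has reported an internal IP, and otherwise leaves the file empty rather than emitting a partial host list. A minimal Python mirror of that rule (a sketch for illustration, not the shipped Go code):

```python
def mpi_hostfile_lines(ips, gpus_per_node):
    """Mirror of writeMpiHostfile's logic: emit '<ip> slots=<n>' lines only
    when every node reported an internal IP; otherwise return no lines at
    all, so consumers never see a partial host list."""
    non_empty = [ip for ip in ips if ip]
    if len(non_empty) != len(ips):
        return []  # at least one node has no IP yet: hostfile stays empty
    return [f"{ip} slots={gpus_per_node}" for ip in non_empty]

print(mpi_hostfile_lines(["10.0.0.1", "10.0.0.2"], 8))
print(mpi_hostfile_lines(["10.0.0.1", ""], 8))  # -> []
```

An empty hostfile makes the failure obvious at `mpirun` time instead of silently running on a subset of nodes.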

src/dstack/_internal/cli/services/configurators/run.py

Lines changed: 15 additions & 1 deletion
```diff
@@ -41,7 +41,7 @@
 )
 from dstack._internal.core.models.repos.base import Repo
 from dstack._internal.core.models.resources import CPUSpec
-from dstack._internal.core.models.runs import JobSubmission, RunStatus
+from dstack._internal.core.models.runs import JobStatus, JobSubmission, RunStatus
 from dstack._internal.core.services.configs import ConfigManager
 from dstack._internal.core.services.diff import diff_models
 from dstack._internal.utils.common import local_time
@@ -593,6 +593,20 @@ def get_run_exit_code(run: Run) -> int:
     return 1
 
 
+def _is_ready_to_attach(run: Run) -> bool:
+    return not (
+        run.status
+        in [
+            RunStatus.SUBMITTED,
+            RunStatus.PENDING,
+            RunStatus.PROVISIONING,
+            RunStatus.TERMINATING,
+        ]
+        or run._run.jobs[0].job_submissions[-1].status
+        in [JobStatus.SUBMITTED, JobStatus.PROVISIONING, JobStatus.PULLING]
+    )
+
+
 def _run_resubmitted(run: Run, current_job_submission: Optional[JobSubmission]) -> bool:
     if current_job_submission is None or run._run.latest_job_submission is None:
         return False
```
return False

src/dstack/_internal/core/models/configurations.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -440,7 +440,7 @@ def convert_replicas(cls, v: Any) -> Range[int]:
         raise ValueError("The minimum number of replicas must be greater than or equal to 0")
     if v.max < v.min:
         raise ValueError(
-            "The maximum number of replicas must be greater than or equal to the minium number of replicas"
+            "The maximum number of replicas must be greater than or equal to the minimum number of replicas"
         )
     return v
```
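The corrected message guards a simple invariant on the replicas range: `0 <= min <= max`. A standalone sketch of that validation, using a hypothetical `Range` dataclass rather than dstack's Pydantic model:

```python
from dataclasses import dataclass

@dataclass
class Range:
    # Illustrative stand-in for dstack's Range[int]; not the real model.
    min: int
    max: int

def validate_replicas(v: Range) -> Range:
    # Same two checks as convert_replicas: non-negative minimum, max >= min.
    if v.min < 0:
        raise ValueError("The minimum number of replicas must be greater than or equal to 0")
    if v.max < v.min:
        raise ValueError(
            "The maximum number of replicas must be greater than or equal to the minimum number of replicas"
        )
    return v

validate_replicas(Range(min=1, max=3))  # ok
try:
    validate_replicas(Range(min=3, max=1))
except ValueError as e:
    print(e)
```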

src/dstack/_internal/core/models/fleets.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -20,6 +20,7 @@
     parse_idle_duration,
 )
 from dstack._internal.core.models.resources import Range, ResourcesSpec
+from dstack._internal.utils.common import list_enum_values_for_annotation
 from dstack._internal.utils.json_schema import add_extra_schema_types
 from dstack._internal.utils.tags import tags_validator
 
@@ -207,7 +208,11 @@ class InstanceGroupParams(CoreModel):
     spot_policy: Annotated[
         Optional[SpotPolicy],
         Field(
-            description="The policy for provisioning spot or on-demand instances: `spot`, `on-demand`, or `auto`"
+            description=(
+                "The policy for provisioning spot or on-demand instances:"
+                f" {list_enum_values_for_annotation(SpotPolicy)}."
+                f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
+            )
         ),
     ] = None
     retry: Annotated[
```
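`list_enum_values_for_annotation` is used here so the documented choices stay in sync with the `SpotPolicy` enum instead of being hard-coded in the description string. A plausible implementation of such a helper (an assumption for illustration; the real one lives in `dstack._internal.utils.common`) would render enum values as a backticked, comma-separated list with a final "or":

```python
from enum import Enum

class SpotPolicy(str, Enum):
    # Values mirror the ones the old description listed; purely illustrative.
    SPOT = "spot"
    ONDEMAND = "on-demand"
    AUTO = "auto"

def list_enum_values_for_annotation(enum_cls) -> str:
    """Format enum values as `a`, `b`, or `c` for use in field descriptions."""
    values = [f"`{m.value}`" for m in enum_cls]
    if len(values) == 1:
        return values[0]
    return ", ".join(values[:-1]) + f", or {values[-1]}"

print(list_enum_values_for_annotation(SpotPolicy))  # `spot`, `on-demand`, or `auto`
```

Deriving the list from the enum means adding a new policy value updates the generated docs and JSON schema automatically.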
