4 changes: 2 additions & 2 deletions docs/blog/posts/intel-gaudi.md
@@ -98,7 +98,7 @@ model using [Optimum for Intel Gaudi :material-arrow-top-right-thin:{ .external
and [DeepSpeed :material-arrow-top-right-thin:{ .external }](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide){:target="_blank"} with
the [`lvwerra/stack-exchange-paired` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/lvwerra/stack-exchange-paired){:target="_blank"} dataset:

<div editor-title="examples/fine-tuning/trl/intel/.dstack.yml">
<div editor-title="examples/single-node-training/trl/intel/.dstack.yml">

```yaml
type: task
@@ -152,7 +152,7 @@ Submit the task using the [`dstack apply`](../../docs/reference/cli/dstack/apply
<div class="termy">

```shell
$ dstack apply -f examples/fine-tuning/trl/intel/.dstack.yml -R
$ dstack apply -f examples/single-node-training/trl/intel/.dstack.yml -R
```

</div>
6 changes: 3 additions & 3 deletions docs/blog/posts/tpu-on-gcp.md
@@ -158,7 +158,7 @@ Below is an example of fine-tuning Llama 3.1 8B using [Optimum TPU :material-arr
and the [Abirate/english_quotes :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/Abirate/english_quotes){:target="_blank"}
dataset.

<div editor-title="examples/fine-tuning/optimum-tpu/llama31/train.dstack.yml">
<div editor-title="examples/single-node-training/optimum-tpu/llama31/train.dstack.yml">

```yaml
type: task
@@ -171,8 +171,8 @@ env:
commands:
- git clone -b add_llama_31_support https://github.com/dstackai/optimum-tpu.git
- mkdir -p optimum-tpu/examples/custom/
- cp examples/fine-tuning/optimum-tpu/llama31/train.py optimum-tpu/examples/custom/train.py
- cp examples/fine-tuning/optimum-tpu/llama31/config.yaml optimum-tpu/examples/custom/config.yaml
- cp examples/single-node-training/optimum-tpu/llama31/train.py optimum-tpu/examples/custom/train.py
- cp examples/single-node-training/optimum-tpu/llama31/config.yaml optimum-tpu/examples/custom/config.yaml
- cd optimum-tpu
- pip install -e . -f https://storage.googleapis.com/libtpu-releases/index.html
- pip install datasets evaluate
6 changes: 3 additions & 3 deletions docs/docs/concepts/tasks.md
@@ -10,7 +10,7 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`

[//]: # (TODO: Make tabs - single machine & distributed tasks & web app)

<div editor-title="examples/fine-tuning/axolotl/train.dstack.yml">
<div editor-title="examples/single-node-training/axolotl/train.dstack.yml">

```yaml
type: task
@@ -26,7 +26,7 @@ env:
- WANDB_API_KEY
# Commands of the task
commands:
- accelerate launch -m axolotl.cli.train examples/fine-tuning/axolotl/config.yaml
- accelerate launch -m axolotl.cli.train examples/single-node-training/axolotl/config.yaml

resources:
gpu:
@@ -461,4 +461,4 @@ it does not block other runs with lower priority from scheduling.
!!! info "What's next?"
1. Read about [dev environments](dev-environments.md), [services](services.md), and [repos](repos.md)
2. Learn how to manage [fleets](fleets.md)
3. Check the [Axolotl](/examples/fine-tuning/axolotl) example
3. Check the [Axolotl](/examples/single-node-training/axolotl) example
90 changes: 44 additions & 46 deletions docs/examples.md
@@ -12,10 +12,21 @@ hide:
}
</style>

## Fine-tuning
## Single-node training

<div class="tx-landing__highlights_grid">
<a href="/examples/fine-tuning/axolotl"
<a href="/examples/single-node-training/trl"
class="feature-cell">
<h3>
TRL
</h3>

<p>
Fine-tune Llama 3.1 8B on a custom dataset using TRL.
</p>
</a>

<a href="/examples/single-node-training/axolotl"
class="feature-cell">
<h3>
Axolotl
@@ -25,19 +36,47 @@ hide:
Fine-tune Llama 4 on a custom dataset using Axolotl.
</p>
</a>
</div>

<a href="/examples/fine-tuning/trl"
class="feature-cell">
## Distributed training

<div class="tx-landing__highlights_grid">
<a href="/examples/distributed-training/trl"
class="feature-cell sky">
<h3>
TRL
</h3>

<p>
Fine-tune Llama 3.1 8B on a custom dataset using TRL.
Fine-tune LLM on multiple nodes
with TRL, Accelerate, and Deepspeed.
</p>
</a>
<a href="/examples/distributed-training/axolotl"
class="feature-cell sky">
<h3>
Axolotl
</h3>

<p>
Fine-tune LLM on multiple nodes
with Axolotl.
</p>
</a>
<a href="/examples/distributed-training/ray-ragen"
class="feature-cell sky">
<h3>
Ray+RAGEN
</h3>

<p>
Fine-tune an agent on multiple nodes
with RAGEN, verl, and Ray.
</p>
</a>
</div>


## Clusters

<div class="tx-landing__highlights_grid">
@@ -83,22 +122,6 @@ hide:
</a>
</div>

## Distributed training

<div class="tx-landing__highlights_grid">
<a href="/examples/distributed-training/ray-ragen"
class="feature-cell sky">
<h3>
Ray+RAGEN
</h3>

<p>
Fine-tune an agent on multiple nodes
with RAGEN, verl, and Ray.
</p>
</a>
</div>

## Inference

<div class="tx-landing__highlights_grid">
@@ -197,31 +220,6 @@ hide:
</a>
</div>

## LLMs

<div class="tx-landing__highlights_grid">
<a href="/examples/llms/deepseek"
class="feature-cell sky">
<h3>
Deepseek
</h3>

<p>
Deploy and train Deepseek models
</p>
</a>
<a href="/examples/llms/llama"
class="feature-cell sky">
<h3>
Llama
</h3>

<p>
Deploy Llama 4 models
</p>
</a>
</div>

## Misc

<div class="tx-landing__highlights_grid">
Empty file.
5 changes: 2 additions & 3 deletions docs/overrides/main.html
@@ -117,12 +117,11 @@

<div class="tx-footer__section">
<div class="tx-footer__section-title">Examples</div>
<a href="/examples#fine-tuning" class="tx-footer__section-link">Fine-tuning</a>
<a href="/examples#clusters" class="tx-footer__section-link">Clusters</a>
<a href="/examples#fine-tuning" class="tx-footer__section-link">Single-node training</a>
<a href="/examples#distributed-training" class="tx-footer__section-link">Distributed training</a>
<a href="/examples#clusters" class="tx-footer__section-link">Clusters</a>
<a href="/examples#inference" class="tx-footer__section-link">Inference</a>
<a href="/examples#accelerators" class="tx-footer__section-link">Accelerators</a>
<a href="/examples#llms" class="tx-footer__section-link">LLMs</a>
<!-- <a href="/examples#misc" class="tx-footer__section-link">Misc</a> -->
</div>

12 changes: 6 additions & 6 deletions examples/accelerators/amd/README.md
@@ -114,7 +114,7 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by
and the [`mlabonne/guanaco-llama2-1k` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/mlabonne/guanaco-llama2-1k){:target="_blank"}
dataset.

<div editor-title="examples/fine-tuning/trl/amd/.dstack.yml">
<div editor-title="examples/single-node-training/trl/amd/.dstack.yml">

```yaml
type: task
@@ -140,7 +140,7 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by
- pip install peft
- pip install transformers datasets huggingface-hub scipy
- cd ..
- python examples/fine-tuning/trl/amd/train.py
- python examples/single-node-training/trl/amd/train.py

# Uncomment to leverage spot instances
#spot_policy: auto
@@ -157,7 +157,7 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by
and the [tatsu-lab/alpaca :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/tatsu-lab/alpaca){:target="_blank"}
dataset.

<div editor-title="examples/fine-tuning/axolotl/amd/.dstack.yml">
<div editor-title="examples/single-node-training/axolotl/amd/.dstack.yml">

```yaml
type: task
@@ -213,7 +213,7 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by

> To speed up installation of `flash-attention` and `xformers`, we use pre-built binaries uploaded to S3.
> You can find the tasks that build and upload the binaries
> in [`examples/fine-tuning/axolotl/amd/` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/amd/){:target="_blank"}.
> in [`examples/single-node-training/axolotl/amd/` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/axolotl/amd/){:target="_blank"}.

## Running a configuration

@@ -238,8 +238,8 @@ $ dstack apply -f examples/inference/vllm/amd/.dstack.yml
The source-code of this example can be found in
[`examples/inference/tgi/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/tgi/amd){:target="_blank"},
[`examples/inference/vllm/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/vllm/amd){:target="_blank"},
[`examples/fine-tuning/axolotl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/amd){:target="_blank"} and
[`examples/fine-tuning/trl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/trl/amd){:target="_blank"}
[`examples/single-node-training/axolotl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/axolotl/amd){:target="_blank"} and
[`examples/single-node-training/trl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/trl/amd){:target="_blank"}

## What's next?

2 changes: 1 addition & 1 deletion examples/accelerators/intel/README.md
@@ -102,7 +102,7 @@ using [Optimum for Intel Gaudi :material-arrow-top-right-thin:{ .external }](htt
and [DeepSpeed :material-arrow-top-right-thin:{ .external }](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide){:target="_blank"} with
the [`lvwerra/stack-exchange-paired` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/lvwerra/stack-exchange-paired){:target="_blank"} dataset.

<div editor-title="examples/fine-tuning/trl/intel/.dstack.yml">
<div editor-title="examples/single-node-training/trl/intel/.dstack.yml">

```yaml
type: task
10 changes: 5 additions & 5 deletions examples/accelerators/tpu/README.md
@@ -127,7 +127,7 @@ Below is an example of fine-tuning Llama 3.1 8B using [Optimum TPU :material-arr
and the [`Abirate/english_quotes` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/Abirate/english_quotes){:target="_blank"}
dataset.

<div editor-title="examples/fine-tuning/optimum-tpu/llama31/.dstack.yml">
<div editor-title="examples/single-node-training/optimum-tpu/llama31/.dstack.yml">

```yaml
type: task
@@ -139,8 +139,8 @@ env:
commands:
- git clone -b add_llama_31_support https://github.com/dstackai/optimum-tpu.git
- mkdir -p optimum-tpu/examples/custom/
- cp examples/fine-tuning/optimum-tpu/llama31/train.py optimum-tpu/examples/custom/train.py
- cp examples/fine-tuning/optimum-tpu/llama31/config.yaml optimum-tpu/examples/custom/config.yaml
- cp examples/single-node-training/optimum-tpu/llama31/train.py optimum-tpu/examples/custom/train.py
- cp examples/single-node-training/optimum-tpu/llama31/config.yaml optimum-tpu/examples/custom/config.yaml
- cd optimum-tpu
- pip install -e . -f https://storage.googleapis.com/libtpu-releases/index.html
- pip install datasets evaluate
@@ -155,7 +155,7 @@ resources:
</div>

[//]: # (### Fine-Tuning with TRL)
[//]: # (Use the example `examples/fine-tuning/optimum-tpu/gemma/train.dstack.yml` to Finetune `Gemma-2B` model using `trl` with `dstack` and `optimum-tpu`. )
[//]: # (Use the example `examples/single-node-training/optimum-tpu/gemma/train.dstack.yml` to Finetune `Gemma-2B` model using `trl` with `dstack` and `optimum-tpu`. )

### Memory requirements

Expand All @@ -181,7 +181,7 @@ Note, `v5litepod` is optimized for fine-tuning transformer-based models. Each co
The source-code of this example can be found in
[`examples/inference/tgi/tpu` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/tgi/tpu){:target="_blank"},
[`examples/inference/vllm/tpu` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/vllm/tpu){:target="_blank"},
and [`examples/fine-tuning/optimum-tpu` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/trl){:target="_blank"}.
and [`examples/single-node-training/optimum-tpu` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/trl){:target="_blank"}.

## What's next?

49 changes: 49 additions & 0 deletions examples/distributed-training/axolotl/.dstack.yml
@@ -0,0 +1,49 @@
type: task
name: axolotl-multi-node-qlora-llama3-70b

# Size of the cluster
nodes: 2

# The axolotlai/axolotl:main-latest image does not include InfiniBand or RDMA libraries, so we need to use the NGC container.
image: nvcr.io/nvidia/pytorch:25.01-py3
# Required environment variables
env:
- HF_TOKEN
- WANDB_API_KEY
- WANDB_PROJECT
- HUB_MODEL_ID
- NCCL_DEBUG=INFO
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
- ACCELERATE_LOG_LEVEL=info
# Commands of the task
commands:
# Replacing the default Torch and FlashAttention in the NGC container with Axolotl-compatible versions.
# The preinstalled versions are incompatible with Axolotl.
- pip uninstall -y torch flash-attn
- pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/test/cu124
- pip install --no-build-isolation axolotl[flash-attn,deepspeed]
- wget https://raw.githubusercontent.com/huggingface/trl/main/examples/accelerate_configs/fsdp1.yaml
- wget https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/qlora-fsdp-70b.yaml
# Axolotl includes hf-xet version 1.1.0, which fails during downloads. Replacing it with the latest version (1.1.2).
- pip uninstall -y hf-xet
- pip install hf-xet --no-cache-dir
- |
accelerate launch \
--config_file=fsdp1.yaml \
-m axolotl.cli.train qlora-fsdp-70b.yaml \
--hub-model-id $HUB_MODEL_ID \
--output-dir /checkpoints/qlora-llama3-70b \
--wandb-project $DSTACK_RUN_NAME \
--wandb-name $WANDB_NAME \
--main_process_ip=$DSTACK_MASTER_NODE_IP \
--main_process_port=8008 \
--machine_rank=$DSTACK_NODE_RANK \
--num_processes=$DSTACK_GPUS_NUM \
--num_machines=$DSTACK_NODES_NUM

resources:
gpu: 80GB:8
shm_size: 128GB

volumes:
- /checkpoints:/checkpoints
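
Like the single-node examples earlier in this PR, the new multi-node Axolotl task can presumably be submitted with `dstack apply`. The command below is only a sketch: it points at the file added above and reuses the `-R` flag shown in the Intel Gaudi example.

```shell
$ dstack apply -f examples/distributed-training/axolotl/.dstack.yml -R
```

Once submitted, the run relies on the `DSTACK_MASTER_NODE_IP`, `DSTACK_NODE_RANK`, `DSTACK_GPUS_NUM`, and `DSTACK_NODES_NUM` variables referenced in the `accelerate launch` command, which `dstack` supplies on each node of the two-node fleet.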