forked from NVIDIA-NeMo/Automodel
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: slurm.sub
More file actions
executable file
·94 lines (86 loc) · 4.01 KB
/
slurm.sub
File metadata and controls
executable file
·94 lines (86 loc) · 4.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/bin/bash
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
# NeMo AutoModel — Reference SLURM submission script
# =============================================================================
#
# Copy this file, adapt it to your cluster, and submit with sbatch:
#
# cp slurm.sub my_cluster.sub
# vim my_cluster.sub # edit CONFIG, SBATCH directives, container, etc.
# sbatch my_cluster.sub
#
# The script launches torchrun across all allocated nodes. Each worker runs
# `automodel` which detects the torchrun environment and executes the recipe
# in-process.
# =============================================================================
# ---------------------------------------------------------------------------
# SLURM directives — change these for your cluster
# ---------------------------------------------------------------------------
#SBATCH -A <your_account> # account
#SBATCH -p batch # partition
#SBATCH -t 01:00:00 # wall time limit, hr:min:sec
#SBATCH -N 8 # number of nodes
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=1
#SBATCH -J automodel-job # job name
#SBATCH --output=slurm_jobs/%x_%j.out
#SBATCH --error=slurm_jobs/%x_%j.err
#SBATCH --dependency=singleton
# ---------------------------------------------------------------------------
# Recipe config — point this at your YAML
# ---------------------------------------------------------------------------
# Recipe YAML consumed by `automodel` (path is relative to /opt/Automodel,
# the working directory set inside the container before torchrun runs).
CONFIG=examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
# scontrol prints one hostname per line; join them with spaces for a
# one-line log (replaces the old `$(echo $(...))` word-splitting trick).
echo "Running on hosts: $(scontrol show hostname | paste -sd' ' -)"
# ---------------------------------------------------------------------------
# Container image and mounts — edit paths for your environment
# ---------------------------------------------------------------------------
# Squashfs container image. The assignment is quoted so the literal <team>
# placeholder is not parsed as shell redirections (`< team`, `> /users/...`)
# before you replace it with your team name — unquoted, the line fails.
CONT="/lustre/fsw/portfolios/<team>/users/$USER/automodel.sqsh"
CONT_NAME=automodel-training
# Comma-separated host_path:container_path pairs (pyxis/enroot mount syntax).
CONT_MOUNT="\
/lustre/fsw/portfolios/<team>/users/$USER/:/$USER/,\
/home/$USER/.ssh:/root/.ssh,\
/home/$USER/.gitconfig:/root/.gitconfig,\
/home/$USER/Automodel:/opt/Automodel"
# ---------------------------------------------------------------------------
# Multi-node environment
# ---------------------------------------------------------------------------
# Rendezvous endpoint: the first node in the allocation hosts the c10d store.
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=${MASTER_PORT:-13742}
# GPUs per node; SLURM_GPUS_PER_NODE comes from the --gpus-per-node directive.
export NUM_GPUS=${SLURM_GPUS_PER_NODE:-8}
# Total worker count across the job. Default SLURM_NNODES to 1 (matching the
# torchrun --nnodes fallback below) so the arithmetic does not error out when
# the variable is unset, e.g. during a single-node dry run.
export WORLD_SIZE=$(( NUM_GPUS * ${SLURM_NNODES:-1} ))
# ---------------------------------------------------------------------------
# Secrets — set these or export them in your shell before sbatch
# ---------------------------------------------------------------------------
# Forward experiment-tracking and model-hub credentials into the job
# environment. Each defaults to empty when not provided — export them in
# your shell (or edit here) before running sbatch.
export WANDB_API_KEY="${WANDB_API_KEY:-}"
export HF_TOKEN="${HF_TOKEN:-}"
# ---------------------------------------------------------------------------
# Launch
# ---------------------------------------------------------------------------
# NOTE(review): one srun task per node (see --ntasks-per-node=1 above); each
# task runs torchrun inside the container, which launches one worker per GPU.
# The backslash-escaped \${VAR} expansions are deliberately deferred so they
# are resolved by the remote shell on each node, while ${CONFIG},
# ${CONT_NAME}, ${CONT} and ${CONT_MOUNT} expand here at submit time.
# The --container-* and --no-container-mount-home options are pyxis/enroot
# srun plugin flags; --export=ALL forwards MASTER_ADDR/MASTER_PORT and the
# credentials exported above into the tasks' environment.
srun \
--container-name="${CONT_NAME}" \
--container-image="${CONT}" \
--container-mounts="${CONT_MOUNT}" \
--container-entrypoint \
--no-container-mount-home \
--export=ALL \
bash -c "\
cd /opt/Automodel && \
torchrun \
--nproc_per_node=\${SLURM_GPUS_PER_NODE:-8} \
--nnodes=\${SLURM_NNODES:-1} \
--rdzv_backend=c10d \
--rdzv_endpoint=\${MASTER_ADDR}:\${MASTER_PORT} \
-m nemo_automodel.cli.app ${CONFIG}"