forked from NVIDIA-NeMo/Automodel
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: slurm.sub
More file actions
executable file
·94 lines (86 loc) · 4.01 KB
/
slurm.sub
File metadata and controls
executable file
·94 lines (86 loc) · 4.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/bin/bash
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
# NeMo AutoModel — Reference SLURM submission script
# =============================================================================
#
# Copy this file, adapt it to your cluster, and submit with sbatch:
#
# cp slurm.sub my_cluster.sub
# vim my_cluster.sub # edit CONFIG, SBATCH directives, container, etc.
# sbatch my_cluster.sub
#
# The script launches torchrun across all allocated nodes. Each worker runs
# `automodel` which detects the torchrun environment and executes the recipe
# in-process.
# =============================================================================
# ---------------------------------------------------------------------------
# SLURM directives — change these for your cluster
# ---------------------------------------------------------------------------
#SBATCH -A <your_account> # account
#SBATCH -p batch # partition
#SBATCH -t 01:00:00 # wall time limit, hr:min:sec
#SBATCH -N 8 # number of nodes
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=1
#SBATCH -J automodel-job # job name
#SBATCH --output=slurm_jobs/%x_%j.out
#SBATCH --error=slurm_jobs/%x_%j.err
#SBATCH --dependency=singleton
# ---------------------------------------------------------------------------
# Recipe config — point this at your YAML
# ---------------------------------------------------------------------------
# Recipe YAML consumed by `automodel` (path is relative to /opt/Automodel,
# the working directory set inside the container before torchrun runs).
CONFIG=examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
# scontrol prints one hostname per line; join them with spaces for a
# one-line log (replaces the old `$(echo $(...))` word-splitting trick).
echo "Running on hosts: $(scontrol show hostname | paste -sd' ' -)"
# ---------------------------------------------------------------------------
# Container image and mounts — edit paths for your environment
# ---------------------------------------------------------------------------
# Squashfs container image. The assignment is quoted so the literal <team>
# placeholder is not parsed as shell redirections (`< team`, `> /users/...`)
# before you replace it with your team name — unquoted, the line fails.
CONT="/lustre/fsw/portfolios/<team>/users/$USER/automodel.sqsh"
CONT_NAME=automodel-training
# Comma-separated host_path:container_path pairs (pyxis/enroot mount syntax).
CONT_MOUNT="\
/lustre/fsw/portfolios/<team>/users/$USER/:/$USER/,\
/home/$USER/.ssh:/root/.ssh,\
/home/$USER/.gitconfig:/root/.gitconfig,\
/home/$USER/Automodel:/opt/Automodel"
# ---------------------------------------------------------------------------
# Multi-node environment
# ---------------------------------------------------------------------------
# Rendezvous endpoint: the first node in the allocation hosts the c10d store.
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=${MASTER_PORT:-13742}
# GPUs per node; SLURM_GPUS_PER_NODE comes from the --gpus-per-node directive.
export NUM_GPUS=${SLURM_GPUS_PER_NODE:-8}
# Total worker count across the job. Default SLURM_NNODES to 1 (matching the
# torchrun --nnodes fallback below) so the arithmetic does not error out when
# the variable is unset, e.g. during a single-node dry run.
export WORLD_SIZE=$(( NUM_GPUS * ${SLURM_NNODES:-1} ))
# ---------------------------------------------------------------------------
# Secrets — set these or export them in your shell before sbatch
# ---------------------------------------------------------------------------
# Forward experiment-tracking and model-hub credentials into the job
# environment. Each defaults to empty when not provided — export them in
# your shell (or edit here) before running sbatch.
export WANDB_API_KEY="${WANDB_API_KEY:-}"
export HF_TOKEN="${HF_TOKEN:-}"
# ---------------------------------------------------------------------------
# Launch
# ---------------------------------------------------------------------------
# NOTE(review): one srun task per node (see --ntasks-per-node=1 above); each
# task runs torchrun inside the container, which launches one worker per GPU.
# The backslash-escaped \${VAR} expansions are deliberately deferred so they
# are resolved by the remote shell on each node, while ${CONFIG},
# ${CONT_NAME}, ${CONT} and ${CONT_MOUNT} expand here at submit time.
# The --container-* and --no-container-mount-home options are pyxis/enroot
# srun plugin flags; --export=ALL forwards MASTER_ADDR/MASTER_PORT and the
# credentials exported above into the tasks' environment.
srun \
--container-name="${CONT_NAME}" \
--container-image="${CONT}" \
--container-mounts="${CONT_MOUNT}" \
--container-entrypoint \
--no-container-mount-home \
--export=ALL \
bash -c "\
cd /opt/Automodel && \
torchrun \
--nproc_per_node=\${SLURM_GPUS_PER_NODE:-8} \
--nnodes=\${SLURM_NNODES:-1} \
--rdzv_backend=c10d \
--rdzv_endpoint=\${MASTER_ADDR}:\${MASTER_PORT} \
-m nemo_automodel.cli.app ${CONFIG}"