-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathlaunch_slurm.sh
More file actions
executable file
·435 lines (371 loc) · 17.6 KB
/
launch_slurm.sh
File metadata and controls
executable file
·435 lines (371 loc) · 17.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
#!/bin/bash
# shellcheck disable=SC2016
#==================================================================================
# SLURM Multi-Node DeepSpeed Zero3 Training Launcher
#==================================================================================
#
# Submit with: sbatch scripts/launch_slurm.sh
# Or override: TRAIN_ARGS="+experiment=keywords/v0_rl.yaml" sbatch scripts/launch_slurm.sh
# Exclude nodes (any of these work — sbatch reads them at submission time):
# sbatch --exclude=dgx-34,dgx-35 scripts/launch_slurm.sh
# SBATCH_EXCLUDE=dgx-34,dgx-35 sbatch scripts/launch_slurm.sh
# (or uncomment the #SBATCH --exclude=... line below for a permanent default)
#
# This script replaces the manual SSH-based launch_multinode.sh for SLURM-managed
# clusters. SLURM handles node allocation, process placement, and cleanup.
#
#==================================================================================
# SLURM DIRECTIVES — edit these or override via sbatch flags
#==================================================================================
#SBATCH --job-name=rl-train
#SBATCH --nodes=5
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=32
#SBATCH --exclusive
#SBATCH --no-requeue
# #SBATCH --time=48:00:00
#SBATCH --output=/lambdafs/users/user/logs/slurm/job_%j.out
#SBATCH --error=/lambdafs/users/user/logs/slurm/job_%j.err
# Uncomment and set if your cluster requires it:
# #SBATCH --partition=gpu
# #SBATCH --account=your_account
# #SBATCH --exclude=dgx-34
set -euo pipefail
#==================================================================================
# CONFIGURATION — edit this section for your run
#==================================================================================
# Every setting below takes its value from the submitting environment when one
# is provided, e.g.:
#   TRAIN_ARGS="+experiment=keywords/v0_rl.yaml" sbatch scripts/launch_slurm.sh
# With the ":-" form an unset OR empty variable falls back to the default shown.

# Workspace: where the repo lives. Use shared filesystem for simplicity with SLURM.
# Code reads from /lambdafs are fine; heavy I/O (caches, checkpoints) goes to /raid.
WORKSPACE="${WORKSPACE:-/lambdafs/users/user/new_tests/code-interp-benchmark}"
# Accelerate config (relative to WORKSPACE)
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-pyine/configs/accelerate/deepspeed_zero3_multinode_5x8gpu.yaml}"
# Training script (relative to WORKSPACE)
TRAIN_SCRIPT="${TRAIN_SCRIPT:-pyine/apps/trainers/hf_trainer.py}"
# Training arguments (Hydra overrides)
TRAIN_ARGS="${TRAIN_ARGS:-+experiment=keywords/v0_rl.yaml}"
# GPUs per node (8 for standard multi-GPU nodes)
GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
# Fast node-local storage for caches
RAID_BASE="${RAID_BASE:-/raid}"
CACHE_BASE="${CACHE_BASE:-${RAID_BASE}/tmp/cache}"
# Application paths — these control where the training app reads data and writes outputs.
# PYINE_DATA_ROOT: where datasets (LMDB traces) live. Defaults to {WORKSPACE}/data.
# Since WORKSPACE is on /lambdafs (shared), datasets are accessible from all nodes.
# PYINE_LOGS_ROOT: where Hydra writes run outputs (configs, checkpoints, trainer state).
# On shared FS so all nodes can write and outputs survive job completion.
PYINE_DATA_ROOT="${PYINE_DATA_ROOT:-${WORKSPACE}/data}"
PYINE_LOGS_ROOT="${PYINE_LOGS_ROOT:-${WORKSPACE}/logs}"
WANDB_PROJECT="${WANDB_PROJECT:-pyine}"
LOGLEVEL="${LOGLEVEL:-INFO}"
# SLURM log directory for launcher-level logs (stdout/stderr tee, env dump).
# The default embeds the job id and a submission-time timestamp; the date
# command only runs when LOG_DIR was not supplied.
LOG_DIR="${LOG_DIR:-/lambdafs/users/user/logs/slurm/run_${SLURM_JOB_ID}_$(date +%Y%m%d_%H%M%S)}"
# Network interface for NCCL communication (check with: ip link show)
# Common values: eth0, bond0, ens5, ibp*s0 (InfiniBand)
NETWORK_INTERFACE="${NETWORK_INTERFACE:-bond0}"
# Module loads (space-separated). Set to empty string to skip.
# Deliberately uses "-" rather than ":-": the default applies only when
# MODULE_LOADS is unset, so an explicit MODULE_LOADS="" really does skip
# module loading (":-" would silently restore the default).
MODULE_LOADS="${MODULE_LOADS-cuda12.8/toolkit/12.8.1 nccl2-cuda12.8-gcc/2.25.1}"
# Option to force recreate cache
RECREATE_CACHE="${RECREATE_CACHE:-false}"
# Resume configuration (optional; empty means "do not resume")
RESUME_FROM_RUN_DIR="${RESUME_FROM_RUN_DIR:-}"
RESUME_CHECKPOINT="${RESUME_CHECKPOINT:-}"
# Checkpoint output directory override (optional; defaults to Hydra output_dir)
CHECKPOINT_DIR="${CHECKPOINT_DIR:-}"
# Debug mode for NCCL/PyTorch distributed (10-30%+ overhead)
ENABLE_DEBUG="${ENABLE_DEBUG:-false}"
#==================================================================================
# DERIVED CONFIGURATION — do not edit
#==================================================================================
# Total number of processes passed to accelerate: nodes × GPUs per node.
TOTAL_PROCESSES=$((SLURM_NNODES * GPUS_PER_NODE))

# Expand SLURM's nodelist expression into one hostname per array element.
# The first hostname is used as the main (rendezvous) node.
mapfile -t NODELIST < <(scontrol show hostnames "$SLURM_JOB_NODELIST")
MAIN_NODE="${NODELIST[0]:-}"
if [ -z "$MAIN_NODE" ]; then
  echo "ERROR: Could not expand SLURM_JOB_NODELIST='${SLURM_JOB_NODELIST}' into hostnames." >&2
  exit 1
fi

# An explicit MAIN_PROCESS_IP always wins, so honour it before probing: this
# skips two srun steps and keeps the override usable when probing cannot work.
MAIN_NODE_IP="${MAIN_PROCESS_IP:-}"

if [ -z "$MAIN_NODE_IP" ]; then
  # Resolve main node IP from network interface.
  # "|| true" is required: under `set -e` a non-zero status from the command
  # substitution (e.g. grep finding no address) would otherwise terminate the
  # script right here, before the fallback below can run.
  # head -n 1 keeps a single address if the interface carries several.
  MAIN_NODE_IP=$(srun --nodes=1 --ntasks=1 --nodelist="$MAIN_NODE" \
    bash -c "ip -4 addr show dev ${NETWORK_INTERFACE} 2>/dev/null | grep -oP 'inet \K[0-9.]+' | head -n 1" 2>/dev/null) || true
fi
if [ -z "$MAIN_NODE_IP" ]; then
  # Fallback: first address reported by `hostname -I` on the main node.
  # Same "|| true" reasoning (pipefail makes a failed srun fail the pipeline).
  MAIN_NODE_IP=$(srun --nodes=1 --ntasks=1 --nodelist="$MAIN_NODE" hostname -I 2>/dev/null | awk '{print $1}') || true
fi
if [ -z "$MAIN_NODE_IP" ]; then
  echo "ERROR: Cannot determine IP for main node $MAIN_NODE" >&2
  echo "Set NETWORK_INTERFACE or add MAIN_PROCESS_IP to override." >&2
  exit 1
fi
#==================================================================================
# HELPER: cache environment variables
#==================================================================================
#######################################
# Emit a sourceable list of `export NAME="<base>/<subdir>"` lines that point
# the cache/temp locations of the tools used by this launcher at one base dir.
# Arguments: $1 - base directory under which every cache path is placed
# Outputs:   one export statement per line on stdout, in a fixed order
#######################################
build_cache_exports() {
  local base_dir="$1"
  local pair
  # NAME=relative/path pairs; output order follows this table.
  local -a cache_layout=(
    "HF_HOME=huggingface"
    "HF_HUB_CACHE=huggingface/hub"
    "HF_DATASETS_CACHE=huggingface/datasets"
    "HF_ASSETS_CACHE=huggingface/assets"
    "HF_MODULES_CACHE=huggingface/modules"
    "TRANSFORMERS_CACHE=huggingface/transformers"
    "HUGGINGFACE_HUB_CACHE=huggingface/hub"
    "TORCH_HOME=torch"
    "TORCH_EXTENSIONS_DIR=torch_extensions"
    "TORCHINDUCTOR_CACHE_DIR=torch_inductor"
    "PYTORCH_KERNEL_CACHE_PATH=torch/kernels"
    "WANDB_DIR=wandb"
    "WANDB_CACHE_DIR=wandb/cache"
    "WANDB_CONFIG_DIR=wandb/config"
    "WANDB_DATA_DIR=wandb/data"
    "TIKTOKEN_CACHE_DIR=tiktoken"
    "TRITON_CACHE_DIR=triton"
    "MPLCONFIGDIR=matplotlib"
    "XDG_CACHE_HOME=xdg_cache"
    "XDG_CONFIG_HOME=xdg_config"
    "XDG_DATA_HOME=xdg_data"
    "CUDA_CACHE_PATH=cuda_cache"
    "FLASH_ATTENTION_CACHE_DIR=flash_attn"
    "VLLM_CACHE_DIR=vllm"
    "TMPDIR=tmp"
    "TEMP=tmp"
    "TMP=tmp"
  )
  for pair in "${cache_layout[@]}"; do
    # ${pair%%=*} is the variable name, ${pair#*=} the path below the base.
    printf 'export %s="%s/%s"\n' "${pair%%=*}" "$base_dir" "${pair#*=}"
  done
}
#==================================================================================
# JOB INFO
#==================================================================================
# Print the resolved run configuration once, at the top of the job log.
cat <<JOB_BANNER
==============================================
SLURM Multi-Node Training
==============================================
Job ID: $SLURM_JOB_ID
Nodes: ${NODELIST[*]}
Excluded: ${SBATCH_EXCLUDE:-(none)}
Num nodes: $SLURM_NNODES
GPUs per node: $GPUS_PER_NODE
Total processes: $TOTAL_PROCESSES
Main node: $MAIN_NODE ($MAIN_NODE_IP)
Workspace: $WORKSPACE
Accelerate cfg: $ACCELERATE_CONFIG
Train args: $TRAIN_ARGS
Cache base: $CACHE_BASE (on /raid, node-local)
Log dir: $LOG_DIR (on /lambdafs, shared)
Network iface: $NETWORK_INTERFACE
Debug: $ENABLE_DEBUG
JOB_BANNER

# Resume details appear only when a run directory to resume from is given;
# the checkpoint name is shown only in addition to that.
if [[ -n "$RESUME_FROM_RUN_DIR" ]]; then
  printf '%s\n' "Resume from: $RESUME_FROM_RUN_DIR"
  if [[ -n "$RESUME_CHECKPOINT" ]]; then
    printf '%s\n' "Resume ckpt: $RESUME_CHECKPOINT"
  fi
fi

# Without an explicit override, checkpoints follow Hydra's output_dir.
if [[ -z "$CHECKPOINT_DIR" ]]; then
  printf '%s\n' "Checkpoint dir: (Hydra output_dir on shared FS)"
else
  printf '%s\n' "Checkpoint dir: $CHECKPOINT_DIR"
fi

printf '%s\n' \
  "Data root: $PYINE_DATA_ROOT" \
  "Logs root: $PYINE_LOGS_ROOT (Hydra outputs, checkpoints)" \
  "==============================================" \
  ""
#==================================================================================
# PRE-FLIGHT: create dirs, validate workspace
#==================================================================================
# Create shared log directory
mkdir -p "$LOG_DIR"
# Save this script's config for reproducibility.
# The names come from `compgen -v` (all shell variables) rather than `env`:
# `env` lists only exported variables, and most settings resolved above are
# plain shell variables at this point, so their effective values would be
# missing from the dump. Output format stays NAME=value, one per line.
# Best effort by design: a failed dump must never stop the job (|| true).
{
  while IFS= read -r cfg_var; do
    # ${!cfg_var-}: indirect lookup; the "-" keeps `set -u` quiet for a name
    # that is declared but has no value.
    printf '%s=%s\n' "$cfg_var" "${!cfg_var-}"
  done < <(compgen -v \
    | grep -E '^(SLURM_|TRAIN_|WORKSPACE|CACHE_|CHECKPOINT_|RESUME_|NETWORK_|ENABLE_|MODULE_|ACCELERATE_|LOG_DIR)' \
    | sort)
} > "${LOG_DIR}/job_env.txt" 2>/dev/null || true
# The workspace is expected on the shared filesystem (same path on all nodes),
# so it is checked once here, before any per-node step is started.
if [[ ! -d "$WORKSPACE" ]]; then
  {
    echo "ERROR: Workspace not found: $WORKSPACE"
    echo "Ensure the repo is cloned on the shared filesystem (/lambdafs)."
  } >&2
  exit 1
fi

# Per-node preparation, one task on every allocated node: optionally wipe the
# cache, then create the cache tmp dir and (if set) the checkpoint dir.
# The script text is expanded HERE, at submission-side: the ${...} values are
# baked in as single-quoted literals, while \$(hostname) is escaped so that it
# is evaluated on each node.
echo "Setting up node-local directories..."
NODE_SETUP_SCRIPT="
if [ '${RECREATE_CACHE}' = true ]; then
rm -rf '${CACHE_BASE}'
fi
mkdir -p '${CACHE_BASE}/tmp'
if [ -n '${CHECKPOINT_DIR}' ]; then
mkdir -p '${CHECKPOINT_DIR}'
fi
echo \" Node \$(hostname): /raid ready\"
"
srun --ntasks-per-node=1 bash -c "$NODE_SETUP_SCRIPT"
#==================================================================================
# BUILD TRAINING COMMAND
#==================================================================================
# Hydra overrides for checkpoint dir and resume. Every fragment starts with a
# single space so the pieces can simply be concatenated; ${VAR:+...} yields the
# fragment only when VAR is non-empty.
HYDRA_OVERRIDES="${CHECKPOINT_DIR:+ config.grpo_config.output_dir=${CHECKPOINT_DIR}}"
if [[ -n "$RESUME_FROM_RUN_DIR" ]]; then
  HYDRA_OVERRIDES+=" config.resume_from_run_dir=${RESUME_FROM_RUN_DIR}"
  # A checkpoint name is only meaningful together with a run dir to resume.
  HYDRA_OVERRIDES+="${RESUME_CHECKPOINT:+ config.resume_checkpoint_name=${RESUME_CHECKPOINT}}"
fi

# Debug env exports: a newline-delimited snippet of export statements, or the
# empty string when debugging is off.
DEBUG_EXPORTS=""
if [[ "$ENABLE_DEBUG" == "true" ]]; then
  # The format is reused for every argument, giving one "export X" per line
  # with a leading newline; the trailing newline is appended afterwards.
  printf -v DEBUG_EXPORTS '\nexport %s' \
    "NCCL_DEBUG=INFO" \
    "NCCL_DEBUG_SUBSYS=ALL" \
    "TORCH_NCCL_TRACE_BUFFER_SIZE=1000" \
    "TORCH_DISTRIBUTED_DEBUG=DETAIL" \
    "TORCH_SHOW_CPP_STACKTRACES=1"
  DEBUG_EXPORTS+=$'\n'
fi

# Module load commands: a single "a && b && c" command line, or the empty
# string when no modules are requested.
MODULE_CMDS=""
if [[ -n "$MODULE_LOADS" ]]; then
  MODULE_CMDS="source /etc/profile.d/modules.sh"
  # MODULE_LOADS is intentionally unquoted: it is a space-separated list.
  for mod in $MODULE_LOADS; do
    MODULE_CMDS+=" && module load ${mod}"
  done
fi
# Materialise the cache exports as a file under LOG_DIR; the srun steps below
# source it by path instead of receiving each variable individually.
CACHE_EXPORTS_FILE="${LOG_DIR}/.cache_exports.sh"
build_cache_exports "$CACHE_BASE" > "$CACHE_EXPORTS_FILE"

# Fixed setting handed to the training application through the environment.
PYINE_PER_NODE_PREP=off

# Everything the single-quoted srun scripts read must be exported, because
# those scripts are expanded by the remote shell, not by this one.
export \
  WORKSPACE \
  ACCELERATE_CONFIG \
  TRAIN_SCRIPT \
  TRAIN_ARGS \
  GPUS_PER_NODE \
  TOTAL_PROCESSES \
  MAIN_NODE_IP \
  NETWORK_INTERFACE \
  CACHE_EXPORTS_FILE \
  HYDRA_OVERRIDES \
  DEBUG_EXPORTS \
  MODULE_CMDS \
  LOG_DIR \
  PYINE_PER_NODE_PREP \
  PYINE_DATA_ROOT \
  PYINE_LOGS_ROOT \
  WANDB_PROJECT \
  LOGLEVEL
#==================================================================================
# INSTALL / SYNC VENV (single node to avoid races on shared FS)
#==================================================================================
# Runs `uv sync` exactly once, on the main node. The script is single-quoted,
# so every $VAR in it is expanded on the node from the exported environment.
echo ""
echo "Syncing venv from main node ($MAIN_NODE)..."
srun --nodes=1 --ntasks=1 --nodelist="$MAIN_NODE" bash -c '
  if [ -n "$MODULE_CMDS" ]; then
    eval "$MODULE_CMDS"
  fi
  # Stop if the cache settings cannot be loaded or the workspace cannot be
  # entered: otherwise uv sync would run with default cache locations or in
  # whatever directory the step happened to start in.
  source "$CACHE_EXPORTS_FILE" || { echo "ERROR: cannot source $CACHE_EXPORTS_FILE" >&2; exit 1; }
  cd "$WORKSPACE" || { echo "ERROR: cannot cd to $WORKSPACE" >&2; exit 1; }
  uv sync --extra vllm --extra liger --extra gpu_monitoring
'
echo "Venv sync complete."
#==================================================================================
# LAUNCH TRAINING
#==================================================================================
# Starts one launcher task per node. Each task loads modules, applies the cache
# and NCCL environment, verifies that the GPUs are idle, optionally starts a
# resource monitor, and then runs `accelerate launch` with the node rank taken
# from SLURM_PROCID. The exit status of the srun step ends up in EXIT_CODE.
echo ""
echo "Launching training across ${SLURM_NNODES} nodes..."
echo " DeepSpeed Zero Stage 3"
echo " vLLM colocated on training GPUs"
echo ""

# The per-node script reads ENABLE_DEBUG to decide whether to start the
# resource monitor, so it has to be in the exported environment as well.
export ENABLE_DEBUG

# "|| EXIT_CODE=$?" instead of a separate "EXIT_CODE=$?" line: with `set -e`
# active a failing srun would otherwise terminate the script immediately and
# the post-training summary would never run.
# NOTE: the script below is single-quoted; it must not contain apostrophes.
EXIT_CODE=0
srun --ntasks-per-node=1 --kill-on-bad-exit=1 bash -c '
  # Load modules if configured
  if [ -n "$MODULE_CMDS" ]; then
    eval "$MODULE_CMDS"
  fi
  # Set cache environment (node-local /raid paths)
  source "$CACHE_EXPORTS_FILE"
  # Application paths (dataset reads from shared FS, outputs to shared FS)
  export PYINE_DATA_ROOT="$PYINE_DATA_ROOT"
  export PYINE_LOGS_ROOT="$PYINE_LOGS_ROOT"
  # PyTorch CUDA allocator: use expandable segments to reduce fragmentation
  # (critical for colocated vLLM + DeepSpeed ZeRO-3 training)
  export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
  # NCCL network config
  export NCCL_SOCKET_IFNAME="$NETWORK_INTERFACE"
  export NCCL_BUFFSIZE="${NCCL_BUFFSIZE:-4194304}"
  # Allow long init phase (lazy data prep can desync ranks by >8 min)
  export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC="${TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC:-3600}"
  # Debug env vars (empty string if debug disabled)
  eval "$DEBUG_EXPORTS"
  # Accelerate env vars (used by accelerate + DeepSpeed for coordination)
  export ACCELERATE_MACHINE_RANK=$SLURM_PROCID
  export ACCELERATE_MAIN_PROCESS_IP=$MAIN_NODE_IP
  export ACCELERATE_NUM_MACHINES=$SLURM_NNODES
  export ACCELERATE_NUM_PROCESSES=$TOTAL_PROCESSES
  # Relative paths (accelerate config, training script) resolve against the
  # workspace, so failing to enter it is fatal for this task.
  cd "$WORKSPACE" || { echo "[$(hostname)] ERROR: cannot cd to $WORKSPACE" >&2; exit 1; }

  #--- Pre-launch GPU memory check ---------------------------------------------
  # Ensure all GPUs have <1% memory in use (catch stale processes / leftover jobs)
  echo "[$(hostname)] Checking GPU memory before launch..."
  GPU_DIRTY=0
  # First pass: purely diagnostic, prints the offending GPUs and their processes.
  nvidia-smi --query-gpu=index,memory.used,memory.total \
    --format=csv,noheader,nounits 2>/dev/null | while IFS=", " read -r GPU_IDX GPU_MEM_USED GPU_MEM_TOTAL; do
    GPU_MEM_PCT=$((GPU_MEM_USED * 100 / GPU_MEM_TOTAL))
    if [ "$GPU_MEM_PCT" -ge 1 ]; then
      echo "[$(hostname)] ERROR: GPU $GPU_IDX has ${GPU_MEM_USED}/${GPU_MEM_TOTAL} MiB in use (${GPU_MEM_PCT}%) before training start."
      echo "[$(hostname)] Processes on GPU $GPU_IDX:"
      nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader -i "$GPU_IDX" 2>/dev/null | sed "s/^/ /"
      GPU_DIRTY=1
    fi
  done
  # The while loop runs in a subshell, so re-check to propagate the exit
  # NOTE(review): when nvidia-smi produces no output, awk prints 0 and this
  # check passes; confirm that is acceptable.
  MAX_GPU_MEM_PCT=$(nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits 2>/dev/null \
    | awk -F", " "{pct=\$1*100/\$2; if(pct>max) max=pct} END{printf \"%d\", max}")
  if [ "$MAX_GPU_MEM_PCT" -ge 1 ]; then
    echo "[$(hostname)] ABORTING: GPUs are not clean (max ${MAX_GPU_MEM_PCT}% memory in use). Kill stale processes first."
    exit 1
  fi
  echo "[$(hostname)] All GPUs clean (<1% memory in use)."
  #-----------------------------------------------------------------------------

  #--- Resource monitor (background, debug only) --------------------------------
  # Appends one CSV row per GPU every 2 seconds to a per-host file in LOG_DIR.
  MONITOR_PID=""
  if [ "$ENABLE_DEBUG" = "true" ]; then
    MONITOR_LOG="${LOG_DIR}/resource_monitor_$(hostname).csv"
    (
      echo "timestamp,hostname,cpu_mem_used_mb,cpu_mem_total_mb,cpu_mem_pct,gpu_idx,gpu_name,gpu_util_pct,gpu_mem_used_mb,gpu_mem_total_mb,gpu_mem_pct,gpu_temp_c,gpu_power_w,gpu_pids"
      while true; do
        TS=$(date "+%Y-%m-%d %H:%M:%S")
        HOST=$(hostname)
        # CPU memory (from /proc/meminfo)
        read MEM_TOTAL MEM_AVAIL <<< $(awk "/MemTotal/{t=\$2} /MemAvailable/{a=\$2} END{printf \"%d %d\", t/1024, a/1024}" /proc/meminfo)
        MEM_USED=$((MEM_TOTAL - MEM_AVAIL))
        MEM_PCT=$((MEM_USED * 100 / MEM_TOTAL))
        # GPU stats via nvidia-smi (one row per GPU)
        nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw \
          --format=csv,noheader,nounits 2>/dev/null | while IFS=", " read -r GPU_IDX GPU_NAME GPU_UTIL GPU_MEM_USED GPU_MEM_TOTAL GPU_TEMP GPU_POWER; do
          GPU_MEM_PCT=$((GPU_MEM_USED * 100 / GPU_MEM_TOTAL))
          # Get PIDs using this GPU
          GPU_PIDS=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader,nounits -i "$GPU_IDX" 2>/dev/null | tr "\n" ";" | sed "s/;$//")
          echo "${TS},${HOST},${MEM_USED},${MEM_TOTAL},${MEM_PCT},${GPU_IDX},${GPU_NAME},${GPU_UTIL},${GPU_MEM_USED},${GPU_MEM_TOTAL},${GPU_MEM_PCT},${GPU_TEMP},${GPU_POWER},${GPU_PIDS}"
        done
        sleep 2
      done
    ) >> "$MONITOR_LOG" 2>/dev/null &
    MONITOR_PID=$!
    echo "[$(hostname)] Resource monitor started (PID=$MONITOR_PID, log=$MONITOR_LOG)"
  fi
  #-----------------------------------------------------------------------------

  echo "[$(hostname)] Rank $SLURM_PROCID: launching accelerate (main=$MAIN_NODE_IP)"
  # TRAIN_SCRIPT, TRAIN_ARGS and HYDRA_OVERRIDES are intentionally unquoted:
  # they are space-separated argument lists that must be word-split.
  uv run --extra vllm --extra liger --extra gpu_monitoring accelerate launch \
    --config_file "$ACCELERATE_CONFIG" \
    --machine_rank "$SLURM_PROCID" \
    --main_process_ip "$MAIN_NODE_IP" \
    --num_machines "$SLURM_NNODES" \
    --num_processes "$TOTAL_PROCESSES" \
    $TRAIN_SCRIPT \
    $TRAIN_ARGS \
    $HYDRA_OVERRIDES \
    2>&1 | tee "${LOG_DIR}/train_$(hostname).log"
  # Status of the training command itself. The pipeline status is that of tee,
  # which would report success even when training failed. PIPESTATUS has to be
  # read immediately, before any other command overwrites it.
  TRAIN_RC=${PIPESTATUS[0]}

  # Stop resource monitor (if running)
  if [ -n "$MONITOR_PID" ]; then
    kill $MONITOR_PID 2>/dev/null
    wait $MONITOR_PID 2>/dev/null
    echo "[$(hostname)] Resource monitor stopped."
  fi

  # Report the training status as the task status so that srun (and
  # --kill-on-bad-exit) see a failed run as failed.
  exit "$TRAIN_RC"
' || EXIT_CODE=$?
#==================================================================================
# POST-TRAINING
#==================================================================================
# Final summary for the job log; the script then exits with the status that
# was recorded for the training step.
if [[ "$EXIT_CODE" -eq 0 ]]; then
  outcome_line="Training completed successfully."
else
  outcome_line="Training exited with code $EXIT_CODE."
fi

# Point at the explicit checkpoint dir when one was configured.
if [[ -z "$CHECKPOINT_DIR" ]]; then
  checkpoint_line="Checkpoints: Hydra output_dir (check config for path)"
else
  checkpoint_line="Checkpoints: $CHECKPOINT_DIR (on /raid — collect from nodes)"
fi

cat <<JOB_SUMMARY

==============================================
${outcome_line}
Job ID: $SLURM_JOB_ID
Logs: $LOG_DIR
SLURM out: /lambdafs/users/user/logs/slurm/job_${SLURM_JOB_ID}.out
${checkpoint_line}
==============================================
JOB_SUMMARY
exit $EXIT_CODE