Skip to content

Commit e36d0df

Browse files
committed
add slurm template
Signed-off-by: Ye Yu <yeyu@nvidia.com>
1 parent db14b2c commit e36d0df

2 files changed

Lines changed: 44 additions & 2 deletions

File tree

examples/speculative_decoding/launch_train.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -106,7 +106,7 @@ while [ $# -gt 0 ]; do
106106
if [[ "$1" != *=* ]]; then shift; fi
107107
NUM_NODES="${1#*=}"
108108
;;
109-
--head_node_ip*
109+
--head_node_ip*)
110110
if [[ "$1" != *=* ]]; then shift; fi
111111
HEAD_NODE_IP="${1#*=}"
112112
;;
@@ -199,7 +199,7 @@ if [[ "$NUM_NODES" != 1 ]]; then
199199
--num_machines $NUM_NODES \
200200
--machine_rank $SLURM_PROCID \
201201
--rdzv_backend c10d \
202-
--main_process_ip $HEAD_NODE_IP \
202+
--main_process_ip $HEAD_NODE_IP \
203203
--main_process_port 29500"
204204
else
205205
MULTI_NODE_ARGS=""
Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
#
# Slurm batch template that runs ./launch_train.sh inside a container on every
# allocated node (one task per node).
#
# Usage: replace every {placeholder} below with a real value, then submit the
# file with sbatch.
#
# Optional environment overrides (honoured when already set at submit time):
#   DATETIME      suffix used in the srun log file name
#   head_node_ip  address forwarded to launch_train.sh via --head_node_ip

#SBATCH -A {account}
#SBATCH --job-name={job_name}
#SBATCH --nodes={num_nodes} --ntasks-per-node=1 --gpus-per-node={num_gpus_per_node}
#SBATCH -p {partition}
#SBATCH -t {time_limit}

CONTAINER_IMAGE={container_image}
WORK_DIR={path_to_modelopt}

# The checkout is mounted at /modelopt inside the container.
CONTAINER_MOUNT="${WORK_DIR}:/modelopt"

OUTPUT_DIR={path_to_output_dir}
MODEL={path_to_model_dir}
DATA={path_to_data_dir}
OFFLINE_DATA={path_to_offline_data_dir}

# Timestamp for the log file name. Without a value the log would be named
# "<job>_<id>_.log".
DATETIME=${DATETIME:-$(date +'%Y%m%d_%H%M%S')}

# Resolve the address of the first node in the allocation. launch_train.sh
# passes it on as --main_process_ip when more than one node is used, so it must
# not be empty.
if [[ -z "${head_node_ip:-}" ]]; then
  head_node=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
  head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address \
    | awk '{print $1; exit}')
  # Fall back to the host name when no address was reported.
  head_node_ip=${head_node_ip:-$head_node}
fi
if [[ -z "$head_node_ip" ]]; then
  echo "ERROR: could not determine the head node address" >&2
  exit 1
fi

# Build the training command as an array so that values containing spaces or
# shell metacharacters survive, then serialise it for `bash -lc`.
launch_args=(
  ./launch_train.sh
  --model "$MODEL"
  --output_dir "$OUTPUT_DIR"
  --data "$DATA"
  --num_epochs 1
  --train_bs 1
  --lr 1e-4
  --eagle_config eagle_config.json
  --training_seq_len 4096
  --save_steps 1000
  --estimate_ar True
  --disable_tqdm True
  --offline-data "$OFFLINE_DATA"
  --num_nodes "$SLURM_NNODES"
  --head_node_ip "$head_node_ip"
)
printf -v CMD '%q ' "${launch_args[@]}"

# -l prefixes every output line with the task number; %x and %j in the log
# name are expanded by srun to the job name and job id.
srun -l \
  --mpi=pmix \
  --output="%x_%j_${DATETIME}.log" \
  --container-workdir "/modelopt/examples/speculative_decoding" \
  --container-image "${CONTAINER_IMAGE}" --container-mounts "${CONTAINER_MOUNT}" \
  bash -lc "$CMD"

# NOTE(review): there is no matching `set -x` in this template, so this is
# currently a no-op; kept from the original.
set +x

0 commit comments

Comments
 (0)