File tree Expand file tree Collapse file tree
examples/speculative_decoding Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -106,7 +106,7 @@ while [ $# -gt 0 ]; do
106106 if [[ " $1 " != * = * ]]; then shift ; fi
107107 NUM_NODES=" ${1#* =} "
108108 ;;
109- --head_node_ip*
109+ --head_node_ip* )
110110 if [[ " $1 " != * = * ]]; then shift ; fi
111111 HEAD_NODE_IP=" ${1#* =} "
112112 ;;
@@ -199,7 +199,7 @@ if [[ "$NUM_NODES" != 1 ]]; then
199199 --num_machines $NUM_NODES \
200200 --machine_rank $SLURM_PROCID \
201201 --rdzv_backend c10d \
202- --main_process_ip $HEAD_NODE_IP \
202+ --main_process_ip $HEAD_NODE_IP \
203203 --main_process_port 29500"
204204else
205205 MULTI_NODE_ARGS=" "
#!/bin/bash
#
# SLURM batch template that runs ./launch_train.sh inside a container on every
# allocated node (one task per node). Replace each {placeholder} before
# submitting with sbatch.
#
# Optional environment overrides:
#   head_node_ip  address the other nodes use to reach the first node
#   DATETIME      suffix used in the srun log file name

#SBATCH -A {account}
#SBATCH --job-name={job_name}
#SBATCH --nodes={num_nodes} --ntasks-per-node=1 --gpus-per-node={num_gpus_per_node}
#SBATCH -p {partition}
#SBATCH -t {time_limit}

# Must come after the #SBATCH lines: sbatch stops reading directives at the
# first executable command.
set -euo pipefail

CONTAINER_IMAGE={container_image}
WORK_DIR={path_to_modelopt}

CONTAINER_MOUNT="${WORK_DIR}:/modelopt"

OUTPUT_DIR={path_to_output_dir}
MODEL={path_to_model_dir}
DATA={path_to_data_dir}
OFFLINE_DATA={path_to_offline_data_dir}

# Timestamp for the log file name; previously unset, which produced "%x_%j_.log".
DATETIME=${DATETIME:-$(date +%Y%m%d_%H%M%S)}

# Address of the first node in the allocation, used as the rendezvous endpoint.
# It was previously referenced but never defined, so --head_node_ip received an
# empty value.
# NOTE(review): assumes the first address reported by `hostname --ip-address`
# is reachable from the other nodes — confirm for your cluster.
if [[ -z "${head_node_ip:-}" ]]; then
  head_node=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
  head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address \
    | awk '{print $1}')
fi

# The command is kept as a single string because it is handed to `bash -lc`
# inside the container; variables are expanded here, on the submitting side.
# NOTE(review): --offline-data uses a hyphen while the other flags use
# underscores — confirm it matches the option name parsed by launch_train.sh.
CMD="./launch_train.sh --model $MODEL \
  --output_dir $OUTPUT_DIR \
  --data $DATA \
  --num_epochs 1 \
  --train_bs 1 \
  --lr 1e-4 \
  --eagle_config eagle_config.json \
  --training_seq_len 4096 \
  --save_steps 1000 \
  --estimate_ar True \
  --disable_tqdm True \
  --offline-data $OFFLINE_DATA \
  --num_nodes $SLURM_NNODES \
  --head_node_ip $head_node_ip \
  "

# srun is intentionally the last command so that its exit status becomes the
# job's exit status (a trailing `set +x` used to mask failures with status 0).
srun -l \
  --mpi=pmix \
  --output="%x_%j_${DATETIME}.log" \
  --container-workdir "/modelopt/examples/speculative_decoding" \
  --container-image "${CONTAINER_IMAGE}" --container-mounts "${CONTAINER_MOUNT}" \
  bash -lc "$CMD"
You can’t perform that action at this time.
0 commit comments