Skip to content

Commit db14b2c

Browse files
committed
debug
Signed-off-by: Ye Yu <yeyu@nvidia.com>
1 parent 7fb3666 commit db14b2c

1 file changed

Lines changed: 6 additions & 2 deletions

File tree

examples/speculative_decoding/launch_train.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,10 @@ while [ $# -gt 0 ]; do
106106
if [[ "$1" != *=* ]]; then shift; fi
107107
NUM_NODES="${1#*=}"
108108
;;
109+
--head_node_ip*
110+
if [[ "$1" != *=* ]]; then shift; fi
111+
HEAD_NODE_IP="${1#*=}"
112+
;;
109113
*)
110114
>&2 printf "Error: Invalid argument ${1#*=}\n"
111115
exit 1
@@ -190,12 +194,12 @@ else
190194
DRAFT_VOCAB_CACHE_ARGS=""
191195
fi
192196

193-
if [[ "$HEAD_NODE_IP" != "" ]]; then
197+
if [[ "$NUM_NODES" != 1 ]]; then
194198
MULTI_NODE_ARGS="--num_processes $TOTAL_GPU \
195199
--num_machines $NUM_NODES \
196200
--machine_rank $SLURM_PROCID \
197201
--rdzv_backend c10d \
198-
--main_process_ip $(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) \
202+
--main_process_ip $HEAD_NODE_IP \
199203
--main_process_port 29500"
200204
else
201205
MULTI_NODE_ARGS=""

0 commit comments

Comments
 (0)