Skip to content

Commit 1a97746

Browse files
committed
now fail if server fails
1 parent f62ca37 commit 1a97746

1 file changed

Lines changed: 5 additions & 0 deletions

File tree

benchmarks/dsr1_fp8_h200_trt_slurm.sh

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,13 +69,18 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \
6969
--tp_size=$TP --ep_size=$EP_SIZE \
7070
--extra_llm_api_options=$EXTRA_CONFIG_FILE \
7171
> $SERVER_LOG 2>&1 &
# PID of the server launched in the background by the command just above;
# used below to detect the server dying before it ever becomes healthy.
SERVER_PID=$!

# Show logs until server is ready
tail -f "$SERVER_LOG" &
TAIL_PID=$!

# Turn off command tracing before entering the polling loop.
set +x

# Poll the health endpoint every 5 seconds. curl --fail returns non-zero on
# HTTP errors and on connection failures, so the loop runs until the endpoint
# answers successfully.
until curl --output /dev/null --silent --fail "http://0.0.0.0:$PORT/health"; do
    # kill -0 sends no signal; it only tests whether the process still exists.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
        echo "Server died before becoming healthy. Exiting."
        # Stop the background log follower before bailing out, otherwise it is
        # left running after this script exits. Ignore failure: tail may
        # already be gone.
        kill "$TAIL_PID" 2>/dev/null || true
        exit 1
    fi
    sleep 5
done

# Server is healthy: stop streaming its log.
kill "$TAIL_PID"

0 commit comments

Comments
 (0)