We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent f62ca37 commit 1a97746 (Copy full SHA for 1a97746)
1 file changed
benchmarks/dsr1_fp8_h200_trt_slurm.sh
@@ -69,13 +69,18 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \
69
--tp_size=$TP --ep_size=$EP_SIZE \
70
--extra_llm_api_options=$EXTRA_CONFIG_FILE \
71
> $SERVER_LOG 2>&1 &
72
# PID of the server process started in the background by the preceding
# command; used below to detect a crash during startup.
SERVER_PID=$!

# Show logs until server is ready
tail -f "$SERVER_LOG" &
TAIL_PID=$!
set +x
# Poll the health endpoint every 5 seconds until it answers successfully
# (curl --fail returns non-zero on HTTP error responses).
until curl --output /dev/null --silent --fail "http://0.0.0.0:$PORT/health"; do
  # kill -0 delivers no signal; it only checks that the PID still exists.
  if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "Server died before becoming healthy. Exiting." >&2
    # Stop the log follower so it is not left running after this script exits.
    kill "$TAIL_PID" 2>/dev/null || true
    exit 1
  fi
  sleep 5
done
kill "$TAIL_PID"
0 commit comments