Skip to content

Commit 8b847f1

Browse files
committed
adding propagation of failure for server logs
1 parent 1031ac9 commit 8b847f1

4 files changed

Lines changed: 18 additions & 1 deletion

File tree

benchmarks/dsr1_fp4_mi355x_slurm.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,15 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \
3434
--cuda-graph-max-bs=128 \
3535
> $SERVER_LOG 2>&1 &
3636

37+
# Show logs until server is ready
38+
tail -f $SERVER_LOG &
39+
TAIL_PID=$!
40+
set +x
41+
until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do
42+
sleep 5
43+
done
44+
kill $TAIL_PID
45+
3746
set -x
3847
BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
3948
git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR

benchmarks/dsr1_fp8_h200_trt_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,9 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \
6969
--tp_size=$TP --ep_size=$EP_SIZE \
7070
--extra_llm_api_options=$EXTRA_CONFIG_FILE \
7171
> $SERVER_LOG 2>&1 &
72+
7273
SERVER_PID=$!
7374

74-
7575
# Show logs until server is ready
7676
tail -f $SERVER_LOG &
7777
TAIL_PID=$!

benchmarks/gptoss_fp4_h100_docker.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,17 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
3232
--max-num-seqs=$CONC \
3333
--disable-log-requests > $SERVER_LOG 2>&1 &
3434

35+
SERVER_PID=$!
36+
3537
# Show logs until server is ready
3638
tail -f $SERVER_LOG &
3739
TAIL_PID=$!
3840
set +x
3941
until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do
42+
if ! kill -0 $SERVER_PID 2>/dev/null; then
43+
echo "Server died before becoming healthy. Exiting."
44+
exit 1
45+
fi
4046
sleep 5
4147
done
4248
kill $TAIL_PID

runners/launch_b200-nb.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,5 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \
1313
--container-workdir=/workspace/ \
1414
--no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \
1515
bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh
16+
17+
scancel $JOB_ID

0 commit comments

Comments
 (0)