
Commit 5b889c7

ChenhanYu and claude committed
fix: vLLM smoke test regression check + query.sh server liveness
- vllm_smoke_test.sh: parse acceptance length from the vLLM log instead of the missing Prometheus metric; capture server output to a temp file
- vllm_smoke_test.sh: pip install pandas for the broken nightly container
- query.sh: detect server death + 600s startup timeout (prevents infinite polling, and wasted GPU hours, when vLLM crashes)
- query.sh: pip install pandas for the broken nightly container
- hf_online_dflash.yaml: add regression baselines from the B200 100K run; add MAX_FINAL_LOSS/MIN_FINAL_ACC/MIN_ACCEPTANCE_LENGTH thresholds

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Chenhan Yu <chenhany@nvidia.com>
1 parent a161445 commit 5b889c7

3 files changed: 72 additions & 8 deletions


tools/launcher/common/specdec/vllm_smoke_test.sh

Lines changed: 31 additions & 4 deletions
@@ -32,7 +32,10 @@
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 source ${SCRIPT_DIR}/../service_utils.sh 2>/dev/null || true
 
-cleanup() { kill $SERVER_PID 2>/dev/null; sleep 2; kill -9 $SERVER_PID 2>/dev/null; }
+# Ensure pandas is available (missing in some vLLM nightly builds)
+pip install pandas 2>/dev/null || true
+
+cleanup() { kill $SERVER_PID 2>/dev/null; sleep 2; kill -9 $SERVER_PID 2>/dev/null; rm -f "${VLLM_LOG:-}" 2>/dev/null; }
 trap cleanup EXIT
 
 MODEL=${HF_MODEL_CKPT}
@@ -72,22 +75,23 @@ if [ "${DISABLE_PREFIX_CACHING:-}" = "1" ]; then
     OPTIONAL_ARGS="${OPTIONAL_ARGS} --no-enable-prefix-caching"
 fi
 
-# Start vLLM server
+# Start vLLM server (capture output for regression check parsing)
+VLLM_LOG=$(mktemp /tmp/vllm_server_XXXXXX.log)
 if [ -n "$SPEC_CONFIG" ]; then
     vllm serve ${MODEL} \
         --speculative-config "${SPEC_CONFIG}" \
         --max-num-batched-tokens 32768 \
         --tensor-parallel-size ${TP} \
         --port ${PORT} \
         ${OPTIONAL_ARGS} \
-        &
+        > >(tee -a "$VLLM_LOG") 2>&1 &
 else
     vllm serve ${MODEL} \
         --max-num-batched-tokens 32768 \
         --tensor-parallel-size ${TP} \
         --port ${PORT} \
         ${OPTIONAL_ARGS} \
-        &
+        > >(tee -a "$VLLM_LOG") 2>&1 &
 fi
 SERVER_PID=$!
 
@@ -168,4 +172,27 @@ if [ $FAIL -gt 0 ]; then
     exit 1
 fi
 
+# Regression check: minimum acceptance length for speculative decoding
+if [ -n "${MIN_ACCEPTANCE_LENGTH:-}" ]; then
+    # Parse mean acceptance length from vLLM's SpecDecoding metrics log.
+    # vLLM logs: "SpecDecoding metrics: Mean acceptance length: X.XX, ..."
+    # Take the last reported value (most accurate, covers all prompts).
+    AVG_ACCEPT=$(grep -oP 'Mean acceptance length: \K[0-9.]+' "$VLLM_LOG" 2>/dev/null | tail -1 || true)
+    if [ -n "$AVG_ACCEPT" ]; then
+        echo ""
+        echo "=== Acceptance Length Regression Check ==="
+        echo "  Mean acceptance length: ${AVG_ACCEPT}"
+        echo "  Threshold: ${MIN_ACCEPTANCE_LENGTH}"
+        PASS_CHECK=$(python3 -c "print('yes' if float('${AVG_ACCEPT}') >= float('${MIN_ACCEPTANCE_LENGTH}') else 'no')")
+        if [ "$PASS_CHECK" = "yes" ]; then
+            echo "  PASS: ${AVG_ACCEPT} >= ${MIN_ACCEPTANCE_LENGTH}"
+        else
+            echo "  REGRESSION: ${AVG_ACCEPT} < ${MIN_ACCEPTANCE_LENGTH}"
+            exit 1
+        fi
+    else
+        echo "WARNING: Could not parse acceptance length from vLLM log, skipping regression check"
+    fi
+fi
+
 echo "Done"

tools/launcher/common/vllm/query.sh

Lines changed: 18 additions & 2 deletions
@@ -58,6 +58,9 @@ source ${SCRIPT_DIR}/../service_utils.sh
 # gpus_per_node: 4
 ###################################################################################################
 
+# Ensure pandas is available (missing in some vLLM nightly builds)
+pip install pandas 2>/dev/null || true
+
 export OPENAI_API_KEY="token-abc123"
 
 if [ -z ${SLURM_ARRAY_TASK_ID} ]; then
@@ -108,13 +111,26 @@ SERVER_PID=$!
 
 # Wait for server to start up by polling the health endpoint
 echo "Waiting for server to start..."
+MAX_WAIT=${VLLM_STARTUP_TIMEOUT:-600}
+WAITED=0
 while true; do
+    if ! kill -0 $SERVER_PID 2>/dev/null; then
+        echo "ERROR: vLLM server process died during startup"
+        wait $SERVER_PID 2>/dev/null
+        exit 1
+    fi
     response=$(curl -s -o /dev/null -w "%{http_code}" "http://$(hostname -f):8000/health" || true)
     if [ "$response" -eq 200 ]; then
-        echo "Server is up!"
+        echo "Server is up! (waited ${WAITED}s)"
         break
     fi
-    echo "Server not ready yet, retrying in 10 seconds..."
+    WAITED=$((WAITED + 10))
+    if [ $WAITED -ge $MAX_WAIT ]; then
+        echo "ERROR: vLLM server failed to start within ${MAX_WAIT}s"
+        kill $SERVER_PID 2>/dev/null
+        exit 1
+    fi
+    echo "Server not ready yet (${WAITED}/${MAX_WAIT}s), retrying in 10 seconds..."
     sleep 10
 done
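With the 10-second poll interval, the default 600s budget allows at most 60 health checks before the job aborts instead of polling forever. The budget is overridable per run via the environment variable the diff introduces; for example (hypothetical invocation, launch query.sh however your pipeline normally does):

    # Give a large model 20 minutes to load instead of the default 10.
    export VLLM_STARTUP_TIMEOUT=1200
    bash query.sh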

tools/launcher/examples/Qwen/Qwen3-8B/hf_online_dflash.yaml

Lines changed: 23 additions & 2 deletions
@@ -5,6 +5,21 @@
 # task_1: vLLM smoke test with DFlash speculative decoding
 # task_2: MT-Bench per-category HF AR evaluation (1 GPU)
 #
+# Convergence baseline (8x B200, batch_size=1, seq_len=4096, 5-layer draft, block_size=16):
+#   100K samples, 1 epoch (~12,500 steps)
+#   Step   100 (epoch 0.01): loss=8.900 acc=0.029
+#   Step  1000 (epoch 0.08): loss=5.845 acc=0.096
+#   Step  2500 (epoch 0.20): loss=4.981 acc=0.138
+#   Step  5000 (epoch 0.40): loss=4.383 acc=0.176
+#   Step  7500 (epoch 0.60): loss=4.040 acc=0.196
+#   Step 10000 (epoch 0.80): loss=3.900 acc=0.210
+#   Step 12500 (epoch 1.00): loss=3.821 acc=0.200
+#   Average train_loss=4.493, training time=5094s
+#
+# Regression criteria (set via environment):
+#   MAX_FINAL_LOSS: final loss must be below this (default: 5.0)
+#   MIN_FINAL_ACC:  final accuracy must be above this (default: 0.15)
+#
 # Reference: "DFlash: Block Diffusion for Flash Speculative Decoding" (arXiv:2602.06036)
 #
 # Usage:
@@ -22,20 +37,24 @@ pipeline:
       args:
         - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
         - model.model_name_or_path=<<global_vars.hf_model>>
-        - data.data_path=/hf-local/modelopt/Speculative-Decoding-Dataset-v1-Qwen3-8B/sample-1K-openai.jsonl
+        - data.data_path=/hf-local/modelopt/Speculative-Decoding-Dataset-v1-Qwen3-8B/sample-100K-openai.jsonl
         - data.chat_template=examples/Qwen/Qwen3-8B/chat_template_train.jinja
         - training.output_dir=/scratchspace/dflash_bs16
+        - training.per_device_train_batch_size=1
         - training.num_train_epochs=1
         - training.training_seq_len=4096
         - training.save_steps=5000
-        - training.logging_steps=1000
+        - training.logging_steps=100
         - training.disable_tqdm=true
         - training.answer_only_loss=true
         - dflash.dflash_block_size=16
         - dflash.dflash_num_anchors=512
         - dflash.dflash_loss_decay_factor=7
         - dflash.dflash_mask_token_id=151669
         - dflash.dflash_architecture_config.num_hidden_layers=5
+      environment:
+        - MAX_FINAL_LOSS: "5.0"
+        - MIN_FINAL_ACC: "0.15"
       slurm_config:
         _factory_: "slurm_factory"
         nodes: 1
@@ -50,8 +69,10 @@ pipeline:
         - DRAFT_CKPT_DIR: /scratchspace/dflash_bs16
         - SPEC_METHOD: "dflash"
         - NUM_SPEC_TOKENS: "7"
+        - MIN_ACCEPTANCE_LENGTH: "1.4"
       slurm_config:
         _factory_: "slurm_factory"
+        container: "vllm/vllm-openai:nightly"
         nodes: 1
         ntasks_per_node: 1
         gpus_per_node: 1
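MAX_FINAL_LOSS and MIN_FINAL_ACC are exported to the training task, but their consumer is not part of this diff. A hypothetical training-side gate in the same style as the smoke test's acceptance-length check; the FINAL_LOSS/FINAL_ACC values are illustrative, taken from the baseline table above:

    # Hypothetical regression gate, mirroring the python3 float comparison
    # used in vllm_smoke_test.sh. Defaults match the documented criteria.
    FINAL_LOSS=3.821
    FINAL_ACC=0.200
    PASS=$(python3 -c "print('yes' if float('${FINAL_LOSS}') <= float('${MAX_FINAL_LOSS:-5.0}') and float('${FINAL_ACC}') >= float('${MIN_FINAL_ACC:-0.15}') else 'no')")
    if [ "$PASS" != "yes" ]; then
        echo "REGRESSION: loss=${FINAL_LOSS} acc=${FINAL_ACC}"
        exit 1
    fi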
