
Commit 5b889c7

ChenhanYu and claude committed
fix: vLLM smoke test regression check + query.sh server liveness
- vllm_smoke_test.sh: parse acceptance length from the vLLM log instead of the missing Prometheus metric; capture server output to a temp file
- vllm_smoke_test.sh: pip install pandas for the broken nightly container
- query.sh: detect server death + 600s startup timeout (prevents infinite polling, and wasted GPU hours, when vLLM crashes)
- query.sh: pip install pandas for the broken nightly container
- hf_online_dflash.yaml: add regression baselines from the B200 100K run; add MAX_FINAL_LOSS/MIN_FINAL_ACC/MIN_ACCEPTANCE_LENGTH thresholds

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Chenhan Yu <chenhany@nvidia.com>
1 parent a161445 commit 5b889c7

3 files changed: 72 additions & 8 deletions


tools/launcher/common/specdec/vllm_smoke_test.sh

Lines changed: 31 additions & 4 deletions
@@ -32,7 +32,10 @@
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 source ${SCRIPT_DIR}/../service_utils.sh 2>/dev/null || true
 
-cleanup() { kill $SERVER_PID 2>/dev/null; sleep 2; kill -9 $SERVER_PID 2>/dev/null; }
+# Ensure pandas is available (missing in some vLLM nightly builds)
+pip install pandas 2>/dev/null || true
+
+cleanup() { kill $SERVER_PID 2>/dev/null; sleep 2; kill -9 $SERVER_PID 2>/dev/null; rm -f "${VLLM_LOG:-}" 2>/dev/null; }
 trap cleanup EXIT
 
 MODEL=${HF_MODEL_CKPT}
@@ -72,22 +75,23 @@ if [ "${DISABLE_PREFIX_CACHING:-}" = "1" ]; then
     OPTIONAL_ARGS="${OPTIONAL_ARGS} --no-enable-prefix-caching"
 fi
 
-# Start vLLM server
+# Start vLLM server (capture output for regression check parsing)
+VLLM_LOG=$(mktemp /tmp/vllm_server_XXXXXX.log)
 if [ -n "$SPEC_CONFIG" ]; then
     vllm serve ${MODEL} \
         --speculative-config "${SPEC_CONFIG}" \
         --max-num-batched-tokens 32768 \
         --tensor-parallel-size ${TP} \
         --port ${PORT} \
         ${OPTIONAL_ARGS} \
-        &
+        > >(tee -a "$VLLM_LOG") 2>&1 &
 else
     vllm serve ${MODEL} \
         --max-num-batched-tokens 32768 \
         --tensor-parallel-size ${TP} \
         --port ${PORT} \
         ${OPTIONAL_ARGS} \
-        &
+        > >(tee -a "$VLLM_LOG") 2>&1 &
 fi
 SERVER_PID=$!
 
@@ -168,4 +172,27 @@ if [ $FAIL -gt 0 ]; then
     exit 1
 fi
 
+# Regression check: minimum acceptance length for speculative decoding
+if [ -n "${MIN_ACCEPTANCE_LENGTH:-}" ]; then
+    # Parse mean acceptance length from vLLM's SpecDecoding metrics log.
+    # vLLM logs: "SpecDecoding metrics: Mean acceptance length: X.XX, ..."
+    # Take the last reported value (most accurate, covers all prompts).
+    AVG_ACCEPT=$(grep -oP 'Mean acceptance length: \K[0-9.]+' "$VLLM_LOG" 2>/dev/null | tail -1 || true)
+    if [ -n "$AVG_ACCEPT" ]; then
+        echo ""
+        echo "=== Acceptance Length Regression Check ==="
+        echo "  Mean acceptance length: ${AVG_ACCEPT}"
+        echo "  Threshold: ${MIN_ACCEPTANCE_LENGTH}"
+        PASS_CHECK=$(python3 -c "print('yes' if float('${AVG_ACCEPT}') >= float('${MIN_ACCEPTANCE_LENGTH}') else 'no')")
+        if [ "$PASS_CHECK" = "yes" ]; then
+            echo "  PASS: ${AVG_ACCEPT} >= ${MIN_ACCEPTANCE_LENGTH}"
+        else
+            echo "  REGRESSION: ${AVG_ACCEPT} < ${MIN_ACCEPTANCE_LENGTH}"
+            exit 1
+        fi
+    else
+        echo "WARNING: Could not parse acceptance length from vLLM log, skipping regression check"
+    fi
+fi
+
 echo "Done"

tools/launcher/common/vllm/query.sh

Lines changed: 18 additions & 2 deletions
@@ -58,6 +58,9 @@ source ${SCRIPT_DIR}/../service_utils.sh
 # gpus_per_node: 4
 ###################################################################################################
 
+# Ensure pandas is available (missing in some vLLM nightly builds)
+pip install pandas 2>/dev/null || true
+
 export OPENAI_API_KEY="token-abc123"
 
 if [ -z ${SLURM_ARRAY_TASK_ID} ]; then
@@ -108,13 +111,26 @@ SERVER_PID=$!
 
 # Wait for server to start up by polling the health endpoint
 echo "Waiting for server to start..."
+MAX_WAIT=${VLLM_STARTUP_TIMEOUT:-600}
+WAITED=0
 while true; do
+    if ! kill -0 $SERVER_PID 2>/dev/null; then
+        echo "ERROR: vLLM server process died during startup"
+        wait $SERVER_PID 2>/dev/null
+        exit 1
+    fi
     response=$(curl -s -o /dev/null -w "%{http_code}" "http://$(hostname -f):8000/health" || true)
     if [ "$response" -eq 200 ]; then
-        echo "Server is up!"
+        echo "Server is up! (waited ${WAITED}s)"
         break
     fi
-    echo "Server not ready yet, retrying in 10 seconds..."
+    WAITED=$((WAITED + 10))
+    if [ $WAITED -ge $MAX_WAIT ]; then
+        echo "ERROR: vLLM server failed to start within ${MAX_WAIT}s"
+        kill $SERVER_PID 2>/dev/null
+        exit 1
+    fi
+    echo "Server not ready yet (${WAITED}/${MAX_WAIT}s), retrying in 10 seconds..."
     sleep 10
 done
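With the 10-second poll interval, the default 600s budget allows at most 60 health checks before the job aborts instead of polling forever. The budget is overridable per run via the environment variable the diff introduces; for example (hypothetical invocation, launch query.sh however your pipeline normally does):

    # Give a large model 20 minutes to load instead of the default 10.
    export VLLM_STARTUP_TIMEOUT=1200
    bash query.sh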

tools/launcher/examples/Qwen/Qwen3-8B/hf_online_dflash.yaml

Lines changed: 23 additions & 2 deletions
@@ -5,6 +5,21 @@
 # task_1: vLLM smoke test with DFlash speculative decoding
 # task_2: MT-Bench per-category HF AR evaluation (1 GPU)
 #
+# Convergence baseline (8x B200, batch_size=1, seq_len=4096, 5-layer draft, block_size=16):
+#   100K samples, 1 epoch (~12,500 steps)
+#   Step   100 (epoch 0.01): loss=8.900 acc=0.029
+#   Step  1000 (epoch 0.08): loss=5.845 acc=0.096
+#   Step  2500 (epoch 0.20): loss=4.981 acc=0.138
+#   Step  5000 (epoch 0.40): loss=4.383 acc=0.176
+#   Step  7500 (epoch 0.60): loss=4.040 acc=0.196
+#   Step 10000 (epoch 0.80): loss=3.900 acc=0.210
+#   Step 12500 (epoch 1.00): loss=3.821 acc=0.200
+#   Average train_loss=4.493, training time=5094s
+#
+# Regression criteria (set via environment):
+#   MAX_FINAL_LOSS: final loss must be below this (default: 5.0)
+#   MIN_FINAL_ACC:  final accuracy must be above this (default: 0.15)
+#
 # Reference: "DFlash: Block Diffusion for Flash Speculative Decoding" (arXiv:2602.06036)
 #
 # Usage:
@@ -22,20 +37,24 @@ pipeline:
       args:
         - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
         - model.model_name_or_path=<<global_vars.hf_model>>
-        - data.data_path=/hf-local/modelopt/Speculative-Decoding-Dataset-v1-Qwen3-8B/sample-1K-openai.jsonl
+        - data.data_path=/hf-local/modelopt/Speculative-Decoding-Dataset-v1-Qwen3-8B/sample-100K-openai.jsonl
         - data.chat_template=examples/Qwen/Qwen3-8B/chat_template_train.jinja
         - training.output_dir=/scratchspace/dflash_bs16
+        - training.per_device_train_batch_size=1
         - training.num_train_epochs=1
         - training.training_seq_len=4096
         - training.save_steps=5000
-        - training.logging_steps=1000
+        - training.logging_steps=100
         - training.disable_tqdm=true
         - training.answer_only_loss=true
         - dflash.dflash_block_size=16
         - dflash.dflash_num_anchors=512
         - dflash.dflash_loss_decay_factor=7
         - dflash.dflash_mask_token_id=151669
         - dflash.dflash_architecture_config.num_hidden_layers=5
+      environment:
+        - MAX_FINAL_LOSS: "5.0"
+        - MIN_FINAL_ACC: "0.15"
       slurm_config:
         _factory_: "slurm_factory"
         nodes: 1
@@ -50,8 +69,10 @@ pipeline:
         - DRAFT_CKPT_DIR: /scratchspace/dflash_bs16
         - SPEC_METHOD: "dflash"
         - NUM_SPEC_TOKENS: "7"
+        - MIN_ACCEPTANCE_LENGTH: "1.4"
       slurm_config:
         _factory_: "slurm_factory"
+        container: "vllm/vllm-openai:nightly"
         nodes: 1
         ntasks_per_node: 1
         gpus_per_node: 1
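MAX_FINAL_LOSS and MIN_FINAL_ACC are exported to the training task, but their consumer is not part of this diff. A hypothetical training-side gate in the same style as the smoke test's acceptance-length check; the FINAL_LOSS/FINAL_ACC values are illustrative, taken from the baseline table above:

    # Hypothetical regression gate, mirroring the python3 float comparison
    # used in vllm_smoke_test.sh. Defaults match the documented criteria.
    FINAL_LOSS=3.821
    FINAL_ACC=0.200
    PASS=$(python3 -c "print('yes' if float('${FINAL_LOSS}') <= float('${MAX_FINAL_LOSS:-5.0}') and float('${FINAL_ACC}') >= float('${MIN_FINAL_ACC:-0.15}') else 'no')")
    if [ "$PASS" != "yes" ]; then
        echo "REGRESSION: loss=${FINAL_LOSS} acc=${FINAL_ACC}"
        exit 1
    fi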
