
Commit 76b6fd5

ChenhanYu and claude authored
fix: DFlash regression tests and vLLM server liveness (#1288)
## Summary

- **hf_online_dflash.yaml**: Add 100K-sample training config with regression baselines (B200 loss curve), `MAX_FINAL_LOSS`/`MIN_FINAL_ACC`/`MIN_ACCEPTANCE_LENGTH` thresholds, vLLM nightly container for DFlash support
- **vllm_smoke_test.sh**: Parse acceptance length from vLLM server log for regression check; `pip install pandas` workaround for broken nightly container; capture server output to temp file
- **query.sh**: Detect vLLM server death during startup (PID liveness check) + 600s timeout to prevent infinite polling that wastes GPU hours; `pip install pandas` workaround
- Fix empty `environment:` key in DFlash YAML causing nemo_run `ListParseError`

## Test plan

- [x] E2E pipeline passed on 8x B200 (training + vLLM smoke test + AR eval)
- [x] Training regression: final loss 3.82 < 5.0, acc 0.20 > 0.15
- [x] vLLM acceptance length: 1.79 >= 1.4 threshold
- [x] AR evaluation: 2.02 overall on MT-Bench (8 categories)
- [x] Server liveness check prevents GPU waste on vLLM crash

🤖 Generated with [Claude Code](https://claude.com/claude-code)

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

* **New Features**
  * Added optional regression validation for vLLM acceptance metrics
  * Introduced configurable vLLM server startup timeout (default 600 seconds)

* **Improvements**
  * Enhanced logging for vLLM server startup with progress tracking and waited time reporting
  * Faster detection of vLLM server process failures during initialization

* **Configuration Updates**
  * Increased training dataset size and logging granularity
  * Scaled tensor parallelism from 4 to 8 across multiple pipelines
  * Expanded PTQ quantization to multi-step pipeline
  * Added configurable training metric thresholds

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Chenhan Yu <chenhany@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2d868d3 commit 76b6fd5

11 files changed

Lines changed: 577 additions & 73 deletions


tools/launcher/common/megatron_lm/quantize/quantize.sh

Lines changed: 12 additions & 4 deletions
```diff
@@ -36,10 +36,18 @@ CONVERT_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/convert.sh
 EXPORT_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/export.sh"
 
 export MLM_EXTRA_ARGS=${@}
-${QUANTIZE_EXE} ${MLM_MODEL_CFG} ${QUANT_CFG}
-
-export MLM_EXTRA_ARGS="--mmlu-dataset ${MMLU_DATASET:-/hf-local/cais/mmlu} --fraction 0.01 --lower-bound 0.38 --disable-tqdm"
-MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${MMLU_EXE} ${MLM_MODEL_CFG}
+TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} ${QUANTIZE_EXE} ${MLM_MODEL_CFG} ${QUANT_CFG}
+
+export MLM_EXTRA_ARGS="--mmlu-dataset ${MMLU_DATASET:-/hf-local/cais/mmlu} --fraction 0.01 --lower-bound ${MMLU_LOWER_BOUND:-0.38} --disable-tqdm"
+TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${MMLU_EXE} ${MLM_MODEL_CFG}
+
+# Export quantized checkpoint to HF format (PP=all GPUs)
+TOTAL_GPUS=$(python3 -c "import torch; print(torch.cuda.device_count())" 2>/dev/null || echo ${NUM_GPUS:-1})
+echo "=== Exporting ${MLM_MODEL_CFG} ${QUANT_CFG} (PP=${TOTAL_GPUS}) ==="
+export MLM_EXTRA_ARGS=
+TP=1 PP=${TOTAL_GPUS} EP=1 ETP=1 MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${EXPORT_EXE} ${MLM_MODEL_CFG}
+ls ${EXPORT_DIR}
+cat ${EXPORT_DIR}/hf_quant_config.json
 
 ###################################################################################################
```
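The new export stage sizes pipeline parallelism to every visible GPU. A minimal sketch of that fallback chain, assuming `torch` is importable in the container and with `NUM_GPUS` standing in for a launcher-provided variable:

```bash
#!/bin/bash
# Sketch: resolve the GPU count the same way the new export stage does.
# Preference order: torch.cuda.device_count() -> $NUM_GPUS -> 1.
NUM_GPUS=8   # assumed to be exported by the Slurm/launcher environment
TOTAL_GPUS=$(python3 -c "import torch; print(torch.cuda.device_count())" 2>/dev/null || echo ${NUM_GPUS:-1})

# The HF export then runs with pipeline parallelism only, spanning all GPUs.
echo "Export parallelism: TP=1 PP=${TOTAL_GPUS} EP=1 ETP=1"
```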

tools/launcher/common/megatron_lm/quantize/task.py

Lines changed: 12 additions & 1 deletion
```diff
@@ -66,6 +66,10 @@ class MegatronLMQuantizeConfig:
     model: str = "Qwen/Qwen3-8B"
     quant_cfg: str = "NVFP4_DEFAULT_CFG"
     tp: int = 4
+    pp: int = 1
+    ep: int = 1
+    etp: int = 1
+    extra_args: str = ""
     calib_dataset: str = "abisee/cnn_dailymail"
     calib_size: int = 32
     mmlu_dataset: str = "cais/mmlu"
@@ -92,14 +96,21 @@ def __post_init__(self):
         if self.config is not None:
             c = self.config
             self.script = self.script or "common/megatron_lm/quantize/quantize.sh"
-            self.args = [
+            args = [
                 f"--calib-dataset-path-or-name {c.hf_local}{c.calib_dataset}",
                 f"--calib-size {c.calib_size}",
             ]
+            if c.extra_args:
+                args.append(c.extra_args)
+            self.args = args
             self.environment = [
                 {"MLM_MODEL_CFG": c.model},
                 {"QUANT_CFG": c.quant_cfg},
                 {"HF_MODEL_CKPT": f"{c.hf_local}{c.model}"},
                 {"MMLU_DATASET": f"{c.hf_local}{c.mmlu_dataset}"},
                 {"TP": str(c.tp)},
+                {"PP": str(c.pp)},
+                {"EP": str(c.ep)},
+                {"ETP": str(c.etp)},
+                {"MMLU_LOWER_BOUND": str(c.mmlu_lower_bound)},
             ]
```
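For readability, a hedged sketch of the shell invocation this config ultimately resolves to once the launcher exports the `environment` entries and joins the `args`; the values are taken from the Qwen3-30B-A3B PTQ YAML added below, and composing the command by hand like this is illustrative only:

```bash
#!/bin/bash
# Illustrative only: nemo_run composes this; shown flattened for clarity.
export MLM_MODEL_CFG="Qwen/Qwen3-30B-A3B"
export QUANT_CFG="NVFP4_DEFAULT_CFG"
export HF_MODEL_CKPT="/hf-local/Qwen/Qwen3-30B-A3B"
export MMLU_DATASET="/hf-local/cais/mmlu"
export TP=1 PP=1 EP=8 ETP=1          # new parallelism knobs
export MMLU_LOWER_BOUND=0.75         # new configurable MMLU gate

bash common/megatron_lm/quantize/quantize.sh \
    --calib-dataset-path-or-name /hf-local/abisee/cnn_dailymail \
    --calib-size 32                  # plus config.extra_args, appended verbatim when set
```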

tools/launcher/common/specdec/vllm_smoke_test.sh

Lines changed: 31 additions & 4 deletions
```diff
@@ -32,7 +32,10 @@
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 source ${SCRIPT_DIR}/../service_utils.sh 2>/dev/null || true
 
-cleanup() { kill $SERVER_PID 2>/dev/null; sleep 2; kill -9 $SERVER_PID 2>/dev/null; }
+# Ensure pandas is available (missing in some vLLM nightly builds)
+pip install pandas 2>/dev/null || true
+
+cleanup() { kill $SERVER_PID 2>/dev/null; sleep 2; kill -9 $SERVER_PID 2>/dev/null; rm -f "${VLLM_LOG:-}" 2>/dev/null; }
 trap cleanup EXIT
 
 MODEL=${HF_MODEL_CKPT}
@@ -72,22 +75,23 @@ if [ "${DISABLE_PREFIX_CACHING:-}" = "1" ]; then
     OPTIONAL_ARGS="${OPTIONAL_ARGS} --no-enable-prefix-caching"
 fi
 
-# Start vLLM server
+# Start vLLM server (capture output for regression check parsing)
+VLLM_LOG=$(mktemp /tmp/vllm_server_XXXXXX.log)
 if [ -n "$SPEC_CONFIG" ]; then
     vllm serve ${MODEL} \
         --speculative-config "${SPEC_CONFIG}" \
         --max-num-batched-tokens 32768 \
         --tensor-parallel-size ${TP} \
         --port ${PORT} \
         ${OPTIONAL_ARGS} \
-        &
+        > >(tee -a "$VLLM_LOG") 2>&1 &
 else
     vllm serve ${MODEL} \
         --max-num-batched-tokens 32768 \
         --tensor-parallel-size ${TP} \
         --port ${PORT} \
         ${OPTIONAL_ARGS} \
-        &
+        > >(tee -a "$VLLM_LOG") 2>&1 &
 fi
 SERVER_PID=$!
 
@@ -168,4 +172,27 @@ if [ $FAIL -gt 0 ]; then
     exit 1
 fi
 
+# Regression check: minimum acceptance length for speculative decoding
+if [ -n "${MIN_ACCEPTANCE_LENGTH:-}" ]; then
+    # Parse mean acceptance length from vLLM's SpecDecoding metrics log.
+    # vLLM logs: "SpecDecoding metrics: Mean acceptance length: X.XX, ..."
+    # Take the last reported value (most accurate, covers all prompts).
+    AVG_ACCEPT=$(grep -oP 'Mean acceptance length: \K[0-9.]+' "$VLLM_LOG" 2>/dev/null | tail -1 || true)
+    if [ -n "$AVG_ACCEPT" ]; then
+        echo ""
+        echo "=== Acceptance Length Regression Check ==="
+        echo " Mean acceptance length: ${AVG_ACCEPT}"
+        echo " Threshold: ${MIN_ACCEPTANCE_LENGTH}"
+        PASS_CHECK=$(python3 -c "print('yes' if float('${AVG_ACCEPT}') >= float('${MIN_ACCEPTANCE_LENGTH}') else 'no')")
+        if [ "$PASS_CHECK" = "yes" ]; then
+            echo " PASS: ${AVG_ACCEPT} >= ${MIN_ACCEPTANCE_LENGTH}"
+        else
+            echo " REGRESSION: ${AVG_ACCEPT} < ${MIN_ACCEPTANCE_LENGTH}"
+            exit 1
+        fi
+    else
+        echo "WARNING: Could not parse acceptance length from vLLM log, skipping regression check"
+    fi
+fi
+
 echo "Done"
```
Lines changed: 59 additions & 0 deletions
```bash
#!/bin/bash

# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"

###################################################################################################

if [[ -z ${HF_MODEL_CKPT} ]]; then
    export HF_MODEL_CKPT=/scratchspace/export
fi

if [[ -z ${TP} ]]; then
    TP=4
fi

if [[ -z ${EP} ]]; then
    EP=4
fi

if [[ -z ${EXTRA_LLM_API_OPTIONS} ]]; then
    EXTRA_LLM_API_OPTIONS=common/tensorrt_llm/extra_llm_api_options.yaml
fi


TARGET_FILENAME="config.json"


# Find all files matching the target filename, print their paths null-terminated
find "${HF_MODEL_CKPT}" -type f -name "$TARGET_FILENAME" -print0 | while IFS= read -r -d '' filepath; do
    # Extract the directory path from the full file path
    dir_path=$(dirname "$filepath")

    echo "Processing model: $dir_path"
    # Place your commands here to run within or on the $dir_path
    # Example: cd "$dir_path" && some_command

    trtllm-llmapi-launch trtllm-eval \
        --model ${dir_path} \
        --disable_kv_cache_reuse \
        --tp_size ${TP} \
        --ep_size ${EP} \
        --trust_remote_code \
        --extra_llm_api_options ${EXTRA_LLM_API_OPTIONS} \
        mmlu
done
```
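A hedged usage sketch for the new evaluation script; its repository path is not shown in this view, so the filename below is a placeholder:

```bash
#!/bin/bash
# Evaluate every exported HF checkpoint (any directory containing config.json)
# under HF_MODEL_CKPT with trtllm-eval on MMLU. TP/EP default to 4 in the script.
export HF_MODEL_CKPT=/scratchspace/export
export TP=8 EP=8
export EXTRA_LLM_API_OPTIONS=common/tensorrt_llm/extra_llm_api_options.yaml

bash trtllm_eval_mmlu.sh   # placeholder name for the new script above
```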
Lines changed: 52 additions & 0 deletions
```yaml
context_parallel_size: 1

# backend: _autodeploy
# reasoning_parser: nano-v3
# tool_parser: qwen3_coder
#
# runtime: trtllm
# compile_backend: torch-cudagraph
# max_batch_size: 64
# max_seq_len: 16384
# enable_chunked_prefill: true
# attn_backend: flashinfer
# model_factory: AutoModelForCausalLM
# skip_loading_weights: false
# free_mem_ratio: 0.65
# cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
# kv_cache_config:
#   # disable kv_cache reuse since not supported for hybrid/ssm models
#   enable_block_reuse: false
# transforms:
#   detect_sharding:
#     sharding_dims: ['ep', 'bmm']
#     allreduce_strategy: 'AUTO'
#     manual_config:
#       head_dim: 128
#       tp_plan:
#         # mamba SSM layer
#         "in_proj": "mamba"
#         "out_proj": "rowwise"
#         # attention layer
#         "q_proj": "colwise"
#         "k_proj": "colwise"
#         "v_proj": "colwise"
#         "o_proj": "rowwise"
#         # NOTE: consider not sharding shared experts and/or
#         # latent projections at all, keeping them replicated.
#         # To do so, comment out the corresponding entries.
#         # moe layer: SHARED experts
#         "up_proj": "colwise"
#         "down_proj": "rowwise"
#         # MoLE: latent projections: simple shard
#         "fc1_latent_proj": "gather"
#         "fc2_latent_proj": "gather"
#   multi_stream_moe:
#     stage: compile
#     enabled: true
#   insert_cached_ssm_attention:
#     cache_config:
#       mamba_dtype: float32
#   fuse_mamba_a_log:
#     stage: post_load_fusion
#     enabled: true
```

tools/launcher/common/vllm/query.sh

Lines changed: 18 additions & 2 deletions
```diff
@@ -58,6 +58,9 @@ source ${SCRIPT_DIR}/../service_utils.sh
 # gpus_per_node: 4
 ###################################################################################################
 
+# Ensure pandas is available (missing in some vLLM nightly builds)
+pip install pandas 2>/dev/null || true
+
 export OPENAI_API_KEY="token-abc123"
 
 if [ -z ${SLURM_ARRAY_TASK_ID} ]; then
@@ -108,13 +111,26 @@ SERVER_PID=$!
 
 # Wait for server to start up by polling the health endpoint
 echo "Waiting for server to start..."
+MAX_WAIT=${VLLM_STARTUP_TIMEOUT:-600}
+WAITED=0
 while true; do
+    if ! kill -0 $SERVER_PID 2>/dev/null; then
+        echo "ERROR: vLLM server process died during startup"
+        wait $SERVER_PID 2>/dev/null
+        exit 1
+    fi
     response=$(curl -s -o /dev/null -w "%{http_code}" "http://$(hostname -f):8000/health" || true)
     if [ "$response" -eq 200 ]; then
-        echo "Server is up!"
+        echo "Server is up! (waited ${WAITED}s)"
         break
     fi
-    echo "Server not ready yet, retrying in 10 seconds..."
+    WAITED=$((WAITED + 10))
+    if [ $WAITED -ge $MAX_WAIT ]; then
+        echo "ERROR: vLLM server failed to start within ${MAX_WAIT}s"
+        kill $SERVER_PID 2>/dev/null
+        exit 1
+    fi
+    echo "Server not ready yet (${WAITED}/${MAX_WAIT}s), retrying in 10 seconds..."
     sleep 10
 done
```
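Because the startup deadline is read from an environment variable, a job that needs a longer warm-up can raise it without editing the script; a minimal sketch, with the 1200 s value chosen arbitrarily for illustration:

```bash
#!/bin/bash
# Give vLLM up to 20 minutes to come up instead of the default 600 s.
# If the server PID dies or the deadline passes, query.sh now exits 1
# instead of polling the /health endpoint forever.
export VLLM_STARTUP_TIMEOUT=1200
bash tools/launcher/common/vllm/query.sh
```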

Lines changed: 53 additions & 0 deletions
```yaml
# Qwen3-30B-A3B PTQ quantization (8 GPUs, MoE model).
#
# 2-step pipeline: NVFP4 then FP8, each followed by MMLU evaluation.
# MMLU uses EP for expert parallelism.
#
# Usage:
#   uv run launch.py --yaml examples/Qwen/Qwen3-30B-A3B/megatron_lm_ptq.yaml --yes

job_name: Qwen3-30B-A3B_PTQ
pipeline:
  skip: false
  allow_to_fail: false
  note:

  task_0:
    _target_: common.megatron_lm.quantize.task.MegatronLMQuantizeTask
    config:
      model: Qwen/Qwen3-30B-A3B
      quant_cfg: NVFP4_DEFAULT_CFG
      tp: 1
      pp: 1
      ep: 8
      etp: 1
      calib_dataset: abisee/cnn_dailymail
      calib_size: 32
      mmlu_dataset: cais/mmlu
      mmlu_lower_bound: 0.75
      hf_local: /hf-local/
    slurm_config:
      _factory_: "slurm_factory"
      nodes: 1
      ntasks_per_node: 8
      gpus_per_node: 8

  task_1:
    _target_: common.megatron_lm.quantize.task.MegatronLMQuantizeTask
    config:
      model: Qwen/Qwen3-30B-A3B
      quant_cfg: FP8_DEFAULT_CFG
      tp: 1
      pp: 1
      ep: 8
      etp: 1
      calib_dataset: abisee/cnn_dailymail
      calib_size: 32
      mmlu_dataset: cais/mmlu
      mmlu_lower_bound: 0.75
      hf_local: /hf-local/
    slurm_config:
      _factory_: "slurm_factory"
      nodes: 1
      ntasks_per_node: 8
      gpus_per_node: 8
```

tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml

Lines changed: 11 additions & 11 deletions
```diff
@@ -27,8 +27,8 @@ pipeline:
     script: common/tensorrt_llm/query.sh
     args:
       - --model <<global_vars.hf_model>>
-      - --tp_size 4
-      - --ep_size 4
+      - --tp_size 8
+      - --ep_size 8
       - --max_num_tokens 32000
       - --port 8000
       - --host 0.0.0.0
@@ -41,8 +41,8 @@
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 1
-      ntasks_per_node: 4
-      gpus_per_node: 4
+      ntasks_per_node: 8
+      gpus_per_node: 8
       container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
 
   # Step 2: Dump hidden states from target model
@@ -52,15 +52,15 @@ pipeline:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
       - --max-seq-len 8192
-      - --tp 4
-      - --moe-ep 4
+      - --tp 8
+      - --moe-ep 8
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
      _factory_: "slurm_factory"
       nodes: 1
-      ntasks_per_node: 4
-      gpus_per_node: 4
+      ntasks_per_node: 8
+      gpus_per_node: 8
       container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
 
   # Step 3: Train EAGLE3 draft head (offline, single task)
@@ -78,7 +78,7 @@ pipeline:
       _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
-      gpus_per_node: 4
+      gpus_per_node: 8
       container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
 
   # Step 4: Benchmark speculative decoding (VLLM backend)
@@ -89,7 +89,7 @@ pipeline:
       - --draft_length 3
       - --output_length 4096
       - --engine VLLM
-      - --tp_size 4
+      - --tp_size 8
       - --ep_size 1
       - --speculative_algorithm EAGLE3
       - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
@@ -100,5 +100,5 @@ pipeline:
       _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
-      gpus_per_node: 4
+      gpus_per_node: 8
       container: vllm/vllm-openai:latest
```
