Skip to content

Commit b04a2db

Browse files
ChenhanYu and claude committed
update: Qwen3-8B PTQ and EAGLE3 to 8 GPUs
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Chenhan Yu <chenhany@nvidia.com>
1 parent d58d290 commit b04a2db

2 files changed

Lines changed: 15 additions & 15 deletions

File tree

tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ pipeline:
2727
script: common/tensorrt_llm/query.sh
2828
args:
2929
- --model <<global_vars.hf_model>>
30-
- --tp_size 4
31-
- --ep_size 4
30+
- --tp_size 8
31+
- --ep_size 8
3232
- --max_num_tokens 32000
3333
- --port 8000
3434
- --host 0.0.0.0
@@ -41,8 +41,8 @@ pipeline:
4141
slurm_config:
4242
_factory_: "slurm_factory"
4343
nodes: 1
44-
ntasks_per_node: 4
45-
gpus_per_node: 4
44+
ntasks_per_node: 8
45+
gpus_per_node: 8
4646
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
4747

4848
# Step 2: Dump hidden states from target model
@@ -52,15 +52,15 @@ pipeline:
5252
- --input-data /scratchspace/data
5353
- --output-dir /scratchspace/offline_hidden_states
5454
- --max-seq-len 8192
55-
- --tp 4
56-
- --moe-ep 4
55+
- --tp 8
56+
- --moe-ep 8
5757
environment:
5858
- HF_MODEL_CKPT: <<global_vars.hf_model>>
5959
slurm_config:
6060
_factory_: "slurm_factory"
6161
nodes: 1
62-
ntasks_per_node: 4
63-
gpus_per_node: 4
62+
ntasks_per_node: 8
63+
gpus_per_node: 8
6464
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
6565

6666
# Step 3: Train EAGLE3 draft head (offline, single task)
@@ -78,7 +78,7 @@ pipeline:
7878
_factory_: "slurm_factory"
7979
nodes: 1
8080
ntasks_per_node: 1
81-
gpus_per_node: 4
81+
gpus_per_node: 8
8282
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
8383

8484
# Step 4: Benchmark speculative decoding (VLLM backend)
@@ -89,7 +89,7 @@ pipeline:
8989
- --draft_length 3
9090
- --output_length 4096
9191
- --engine VLLM
92-
- --tp_size 4
92+
- --tp_size 8
9393
- --ep_size 1
9494
- --speculative_algorithm EAGLE3
9595
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
@@ -100,5 +100,5 @@ pipeline:
100100
_factory_: "slurm_factory"
101101
nodes: 1
102102
ntasks_per_node: 1
103-
gpus_per_node: 4
103+
gpus_per_node: 8
104104
container: vllm/vllm-openai:latest

tools/launcher/examples/Qwen/Qwen3-8B/megatron_lm_ptq.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Qwen3-8B NVFP4 quantization (4 GPUs, for Slurm clusters).
1+
# Qwen3-8B NVFP4 quantization (8 GPUs, for Slurm clusters).
22
#
33
# Uses MegatronLMQuantizeTask with typed config — see common/megatron_lm/quantize/task.py
44
# for all available fields.
@@ -19,13 +19,13 @@ pipeline:
1919
config:
2020
model: Qwen/Qwen3-8B
2121
quant_cfg: NVFP4_DEFAULT_CFG
22-
tp: 4
22+
tp: 8
2323
calib_dataset: abisee/cnn_dailymail
2424
calib_size: 32
2525
mmlu_dataset: cais/mmlu
2626
hf_local: /hf-local/
2727
slurm_config:
2828
_factory_: "slurm_factory"
2929
nodes: 1
30-
ntasks_per_node: 4
31-
gpus_per_node: 4
30+
ntasks_per_node: 8
31+
gpus_per_node: 8

0 commit comments

Comments (0)