Skip to content

Commit b04a2db

Browse files
ChenhanYu and claude committed
update: Qwen3-8B PTQ and EAGLE3 to 8 GPUs
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Chenhan Yu <chenhany@nvidia.com>
1 parent d58d290 commit b04a2db

2 files changed

Lines changed: 15 additions & 15 deletions

File tree

tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ pipeline:
2727
script: common/tensorrt_llm/query.sh
2828
args:
2929
- --model <<global_vars.hf_model>>
30-
- --tp_size 4
31-
- --ep_size 4
30+
- --tp_size 8
31+
- --ep_size 8
3232
- --max_num_tokens 32000
3333
- --port 8000
3434
- --host 0.0.0.0
@@ -41,8 +41,8 @@ pipeline:
4141
slurm_config:
4242
_factory_: "slurm_factory"
4343
nodes: 1
44-
ntasks_per_node: 4
45-
gpus_per_node: 4
44+
ntasks_per_node: 8
45+
gpus_per_node: 8
4646
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
4747

4848
# Step 2: Dump hidden states from target model
@@ -52,15 +52,15 @@ pipeline:
5252
- --input-data /scratchspace/data
5353
- --output-dir /scratchspace/offline_hidden_states
5454
- --max-seq-len 8192
55-
- --tp 4
56-
- --moe-ep 4
55+
- --tp 8
56+
- --moe-ep 8
5757
environment:
5858
- HF_MODEL_CKPT: <<global_vars.hf_model>>
5959
slurm_config:
6060
_factory_: "slurm_factory"
6161
nodes: 1
62-
ntasks_per_node: 4
63-
gpus_per_node: 4
62+
ntasks_per_node: 8
63+
gpus_per_node: 8
6464
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
6565

6666
# Step 3: Train EAGLE3 draft head (offline, single task)
@@ -78,7 +78,7 @@ pipeline:
7878
_factory_: "slurm_factory"
7979
nodes: 1
8080
ntasks_per_node: 1
81-
gpus_per_node: 4
81+
gpus_per_node: 8
8282
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
8383

8484
# Step 4: Benchmark speculative decoding (VLLM backend)
@@ -89,7 +89,7 @@ pipeline:
8989
- --draft_length 3
9090
- --output_length 4096
9191
- --engine VLLM
92-
- --tp_size 4
92+
- --tp_size 8
9393
- --ep_size 1
9494
- --speculative_algorithm EAGLE3
9595
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
@@ -100,5 +100,5 @@ pipeline:
100100
_factory_: "slurm_factory"
101101
nodes: 1
102102
ntasks_per_node: 1
103-
gpus_per_node: 4
103+
gpus_per_node: 8
104104
container: vllm/vllm-openai:latest

tools/launcher/examples/Qwen/Qwen3-8B/megatron_lm_ptq.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Qwen3-8B NVFP4 quantization (4 GPUs, for Slurm clusters).
1+
# Qwen3-8B NVFP4 quantization (8 GPUs, for Slurm clusters).
22
#
33
# Uses MegatronLMQuantizeTask with typed config — see common/megatron_lm/quantize/task.py
44
# for all available fields.
@@ -19,13 +19,13 @@ pipeline:
1919
config:
2020
model: Qwen/Qwen3-8B
2121
quant_cfg: NVFP4_DEFAULT_CFG
22-
tp: 4
22+
tp: 8
2323
calib_dataset: abisee/cnn_dailymail
2424
calib_size: 32
2525
mmlu_dataset: cais/mmlu
2626
hf_local: /hf-local/
2727
slurm_config:
2828
_factory_: "slurm_factory"
2929
nodes: 1
30-
ntasks_per_node: 4
31-
gpus_per_node: 4
30+
ntasks_per_node: 8
31+
gpus_per_node: 8

0 commit comments

Comments (0)