File tree Expand file tree Collapse file tree
common/specdec_bench/_cells Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1+ sampling_kwargs :
2+ temperature : 0
3+ engine_args :
4+ max_model_len : 40960
Original file line number Diff line number Diff line change 1+ # SPEED-Bench MTP speculative-decoding run for Qwen3.5-4B via vLLM.
# Cell: t0_d3 (temperature=0, draft_length=3)
#
# Two tasks run the same script with the same engine, speculative algorithm,
# draft length, runtime-params file, and parallelism settings. They differ
# only in dataset split, concurrency, request count, and save directory.

job_name: Qwen3.5-4B_specdec_bench_mtp_vllm_t0_d3

pipeline:
  global_vars:
    # Referenced by both tasks as <<global_vars.hf_model>>.
    hf_model: /hf-local/Qwen/Qwen3.5-4B

  # task_0: "qualitative" split, concurrency 32, no explicit request count.
  task_0:
    script: common/specdec_bench/run.sh
    args:
      - --dataset speed
      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative
      - --engine VLLM
      - --speculative_algorithm MTP
      - --draft_length 3
      - --runtime_params common/specdec_bench/_cells/qwen35_4b_mtp_vllm_t0_d3.yaml
      # tp_size equals gpus_per_node in slurm_config below.
      - --tp_size 2
      - --ep_size 1
      - --concurrency 32
      - --output_length 4096
      - --aa_timing
      - --show_progress
      - --save_dir /scratchspace/qwen35_4b_mtp_vllm_t0_d3/qualitative
    environment:
      - HF_MODEL_CKPT: <<global_vars.hf_model>>
      - HF_LOCAL: /hf-local
    slurm_config:
      # No whitespace inside the quotes: a double-quoted scalar keeps leading
      # spaces as part of the value.
      # NOTE(review): presumably resolved by name by the launcher -- confirm.
      _factory_: "slurm_factory"
      nodes: 1
      ntasks_per_node: 1
      gpus_per_node: 2
      container: vllm/vllm-openai:qwen3_5-cu130

  # task_1: "throughput_32k" split, concurrency 8, capped at 80 requests.
  task_1:
    script: common/specdec_bench/run.sh
    args:
      - --dataset speed
      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k
      - --engine VLLM
      - --speculative_algorithm MTP
      - --draft_length 3
      - --runtime_params common/specdec_bench/_cells/qwen35_4b_mtp_vllm_t0_d3.yaml
      # tp_size equals gpus_per_node in slurm_config below.
      - --tp_size 2
      - --ep_size 1
      - --concurrency 8
      - --num_requests 80
      - --output_length 4096
      - --aa_timing
      - --show_progress
      - --save_dir /scratchspace/qwen35_4b_mtp_vllm_t0_d3/throughput_32k
    environment:
      - HF_MODEL_CKPT: <<global_vars.hf_model>>
      - HF_LOCAL: /hf-local
    slurm_config:
      # No whitespace inside the quotes: a double-quoted scalar keeps leading
      # spaces as part of the value.
      # NOTE(review): presumably resolved by name by the launcher -- confirm.
      _factory_: "slurm_factory"
      nodes: 1
      ntasks_per_node: 1
      gpus_per_node: 2
      container: vllm/vllm-openai:qwen3_5-cu130
You can’t perform that action at this time.
0 commit comments