Skip to content

Commit 7705946

Browse files
author
pensieve-intern
committed
[OMNIML-4886] cell_t0_d3 — pensieve-intern agent draft
1 parent 5eba879 commit 7705946

2 files changed

Lines changed: 67 additions & 0 deletions

File tree

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
# Runtime parameters for one speculative-decoding benchmark cell.
# NOTE(review): presumably this is the file handed to the benchmark through
# --runtime_params; the diff view does not show its filename -- confirm.
# NOTE(review): nesting was reconstructed from a flattened diff rendering;
# verify against the committed file.

# Sampling options (temperature 0 for this cell).
sampling_kwargs:
  temperature: 0

# Engine options.
# NOTE(review): assumed to be forwarded to the inference engine's
# constructor -- confirm against the consumer of this file.
engine_args:
  max_model_len: 40960
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
# SPEED-bench MTP speculative-decoding run for Qwen3.5-4B via vLLM.
# Cell t0_d3: temperature=0, draft_length=3.
# Slurm run on cw_dfw:
#   uv run slurm.py \
#     --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp_vllm_t0_d3.yaml \
#     --yes detach=true
#
# NOTE(review): nesting below was reconstructed from a flattened diff
# rendering (indentation was lost); verify against the committed file.

job_name: Qwen3.5-4B_specdec_bench_mtp_vllm_t0_d3

pipeline:
  # Values referenced elsewhere in this file as <<global_vars.NAME>>.
  global_vars:
    hf_model: /hf-local/Qwen/Qwen3.5-4B

  # task_0: "qualitative" dataset split at concurrency 32.
  task_0:
    script: common/specdec_bench/run.sh
    # Each list item is a single "flag value" string.
    args:
      - --dataset speed
      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative
      - --engine VLLM
      - --speculative_algorithm MTP
      - --draft_length 3
      - --runtime_params common/specdec_bench/_cells/qwen35_4b_mtp_vllm_t0_d3.yaml
      # tp_size equals gpus_per_node in slurm_config below; presumably the
      # two must be changed together -- confirm.
      - --tp_size 2
      - --ep_size 1
      - --concurrency 32
      - --output_length 4096
      - --aa_timing
      - --show_progress
      - --save_dir /scratchspace/qwen35_4b_mtp_vllm_t0_d3/qualitative
    environment:
      - HF_MODEL_CKPT: <<global_vars.hf_model>>
      - HF_LOCAL: /hf-local
    slurm_config:
      _factory_: "slurm_factory"
      nodes: 1
      ntasks_per_node: 1
      gpus_per_node: 2
      container: vllm/vllm-openai:qwen3_5-cu130

  # task_1: "throughput_32k" dataset split at concurrency 8, capped at
  # 80 requests; otherwise the same settings as task_0.
  task_1:
    script: common/specdec_bench/run.sh
    args:
      - --dataset speed
      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k
      - --engine VLLM
      - --speculative_algorithm MTP
      - --draft_length 3
      - --runtime_params common/specdec_bench/_cells/qwen35_4b_mtp_vllm_t0_d3.yaml
      # tp_size equals gpus_per_node in slurm_config below; presumably the
      # two must be changed together -- confirm.
      - --tp_size 2
      - --ep_size 1
      - --concurrency 8
      - --num_requests 80
      - --output_length 4096
      - --aa_timing
      - --show_progress
      - --save_dir /scratchspace/qwen35_4b_mtp_vllm_t0_d3/throughput_32k
    environment:
      - HF_MODEL_CKPT: <<global_vars.hf_model>>
      - HF_LOCAL: /hf-local
    slurm_config:
      _factory_: "slurm_factory"
      nodes: 1
      ntasks_per_node: 1
      gpus_per_node: 2
      container: vllm/vllm-openai:qwen3_5-cu130

0 commit comments

Comments
 (0)