[OMNIML-4788] tools/launcher: bump qualitative concurrency to 32, throughput_32k to 80 samples @ concurrency=8

ChenhanYu · ChenhanYu · commit ae59ce9c781b · 2026-05-29T14:12:19.000-07:00
The prior config (concurrency=8 on qualitative; --num_requests 20 + concurrency=4 on
throughput_32k) was tuned conservatively for time-budget headroom. With tp_size=2
in place, the KV budget is doubled, so we can push concurrency further:

  task_0 (qualitative):     concurrency 8 -> 32        (still tp_size=2)
  task_1 (throughput_32k):  concurrency 4 -> 8,
                            --num_requests 20 -> 80    (still tp_size=2)

AL is concurrency-independent; the bump only sacrifices aa_timing fidelity.
8 * 32K = 256K tokens of in-flight KV stays within the doubled KV budget on
tp_size=2.

Signed-off-by: chenhany <chenhany@nvidia.com>
diff --git a/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench.yaml b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench.yaml
@@ -32,7 +32,7 @@ pipeline:
     hf_model: /hf-local/Qwen/Qwen3.5-4B
 
   # Step 1: qualitative split — quality / acceptance-rate numbers.
-  # tp_size=2 + concurrency=8 trades aa_timing fidelity for ~10x wall-clock
+  # tp_size=2 + concurrency=32 trades aa_timing fidelity for ~30x wall-clock
   # speedup; acceptance-length (AL) is concurrency-independent and is the
   # primary metric we care about for this split.
   task_0:
@@ -44,7 +44,7 @@ pipeline:
       - --speculative_algorithm NONE
       - --tp_size 2
       - --ep_size 1
-      - --concurrency 8
+      - --concurrency 32
       - --output_length 4096
       - --aa_timing
       - --show_progress
@@ -60,10 +60,10 @@ pipeline:
       container: vllm/vllm-openai:qwen3_5-cu130
 
   # Step 2: throughput_32k split — long-context throughput.
-  # `--num_requests 20` caps the run at 20 samples (split has 1,536) so it fits
+  # `--num_requests 80` caps the run at 80 samples (split has 1,536) so it fits
   # in the 4h Slurm time-limit; each 32K-input sample takes ~60-90s.
-  # tp_size=2 doubles the KV-cache budget across 2 GPUs, making concurrency>1
-  # feasible at 32K prompts.
+  # tp_size=2 doubles the KV-cache budget across 2 GPUs; concurrency=8 keeps
+  # 8 * 32K = 256K tokens of in-flight KV under that doubled budget.
   task_1:
     script: common/specdec_bench/run.sh
     args:
@@ -73,8 +73,8 @@ pipeline:
       - --speculative_algorithm NONE
       - --tp_size 2
       - --ep_size 1
-      - --concurrency 4
-      - --num_requests 20
+      - --concurrency 8
+      - --num_requests 80
       - --output_length 4096
       - --runtime_params modules/Model-Optimizer/tools/launcher/common/specdec_bench/runtime_params_throughput_32k.yaml
       - --aa_timing
diff --git a/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp.yaml
@@ -25,7 +25,7 @@ pipeline:
     hf_model: /hf-local/Qwen/Qwen3.5-4B
 
   # Step 1: qualitative split — quality / acceptance-rate numbers with MTP draft=3.
-  # tp_size=2 + concurrency=8 trades aa_timing fidelity for ~10x wall-clock
+  # tp_size=2 + concurrency=32 trades aa_timing fidelity for ~30x wall-clock
   # speedup; acceptance-length (AL) is concurrency-independent and is the
   # primary metric we care about for this split.
   task_0:
@@ -38,7 +38,7 @@ pipeline:
       - --draft_length 3
       - --tp_size 2
       - --ep_size 1
-      - --concurrency 8
+      - --concurrency 32
       - --output_length 4096
       - --aa_timing
       - --show_progress
@@ -69,10 +69,10 @@ pipeline:
       container: vllm/vllm-openai:qwen3_5-cu130
 
   # Step 2: throughput_32k split — long-context throughput with MTP draft=3.
-  # `--num_requests 20` caps the run at 20 samples (split has 1,536) so it fits
+  # `--num_requests 80` caps the run at 80 samples (split has 1,536) so it fits
   # in the 4h Slurm time-limit; each 32K-input sample takes ~60-90s.
-  # tp_size=2 doubles the KV-cache budget across 2 GPUs, making concurrency>1
-  # feasible at 32K prompts.
+  # tp_size=2 doubles the KV-cache budget across 2 GPUs; concurrency=8 keeps
+  # 8 * 32K = 256K tokens of in-flight KV under that doubled budget.
   task_1:
     script: common/specdec_bench/run.sh
     args:
@@ -83,8 +83,8 @@ pipeline:
       - --draft_length 3
       - --tp_size 2
       - --ep_size 1
-      - --concurrency 4
-      - --num_requests 20
+      - --concurrency 8
+      - --num_requests 80
       - --output_length 4096
       - --runtime_params modules/Model-Optimizer/tools/launcher/common/specdec_bench/runtime_params_throughput_32k.yaml
       - --aa_timing