From 2c7cf6ac8d884fbea8c2ab2a84b10279ef64ec46 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Wed, 20 May 2026 14:05:36 -0700 Subject: [PATCH 1/5] feat: add Gemma 4 E2B-it model benchmark - 10.3GB model, fits on single L4 (g6.xlarge) with TP=1 - Uploaded to s3://dlc-cicd-models/llm-models/gemma-4-e2b-it.tar.gz - Conservative thresholds pending first benchmark run --- .github/config/model-tests/vllm-model-tests.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index a2f803d550c2..ecaf9cb58ef1 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -104,6 +104,17 @@ benchmark: min_throughput: 680 min_rps: 5.3 + - name: "gemma-4-e2b-it" + s3_model: "gemma-4-e2b-it.tar.gz" + fleet: "x86-g6xl-runner" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 20 + min_rps: 0.15 + # Pending p5e.48xlarge fleet creation. Fleet name "x86-p5e-runner" is a placeholder. # - name: "minimax-m2.7" # s3_model: "minimax-m2.7.tar.gz" From 44f386314cccf69a1c0df9a02c6af4128e9f3a5d Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Wed, 20 May 2026 15:45:31 -0700 Subject: [PATCH 2/5] test: comment out all benchmark models except gemma-4-e2b-it --- .../config/model-tests/vllm-model-tests.yml | 398 +++++++++--------- 1 file changed, 199 insertions(+), 199 deletions(-) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index ecaf9cb58ef1..d977909ab20f 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -33,76 +33,76 @@ smoke-test: benchmark: codebuild-fleet: - - name: "qwen3-embedding-0.6b" - s3_model: "qwen3-embedding-0.6b.tar.gz" - fleet: "x86-g6xl-runner" - extra_args: "--dtype bfloat16 --max-model-len 8192" - test_script: "vllm_embedding_benchmark_test.sh" - min_rps: 5 + # - name: "qwen3-embedding-0.6b" + # s3_model: "qwen3-embedding-0.6b.tar.gz" + # fleet: "x86-g6xl-runner" + # extra_args: "--dtype bfloat16 --max-model-len 8192" + # test_script: "vllm_embedding_benchmark_test.sh" + # min_rps: 5 - - name: "qwen3-vl-embedding-2b" - s3_model: "qwen3-vl-embedding-2b.tar.gz" - fleet: "x86-g6xl-runner" - extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code" - test_script: "vllm_embedding_benchmark_test.sh" - min_rps: 3 + # - name: "qwen3-vl-embedding-2b" + # s3_model: "qwen3-vl-embedding-2b.tar.gz" + # fleet: "x86-g6xl-runner" + # extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code" + # test_script: "vllm_embedding_benchmark_test.sh" + # min_rps: 3 - - name: "qwen3-asr-1.7b" - s3_model: "qwen3-asr-1.7b.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - test_script: "vllm_asr_benchmark_test.sh" - test_fixtures: - - "audio/asr_en.wav" - - "audio/asr_zh.wav" - benchmark_audio_fixture: "asr_en.wav" - min_throughput: 30 - min_rps: 1 - benchmark_profiles: "baseline,high_concurrency,sustained_load,burst" + # - name: "qwen3-asr-1.7b" + # s3_model: "qwen3-asr-1.7b.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + # test_script: "vllm_asr_benchmark_test.sh" + # test_fixtures: + # - "audio/asr_en.wav" + # - "audio/asr_zh.wav" + # benchmark_audio_fixture: "asr_en.wav" + # min_throughput: 30 + # min_rps: 1 + # benchmark_profiles: "baseline,high_concurrency,sustained_load,burst" - - name: "gpt-oss-20b" - s3_model: "gpt-oss-20b.tar.gz" - fleet: "x86-g6exl-runner" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 1200 - min_rps: 5 + # - name: "gpt-oss-20b" + # s3_model: "gpt-oss-20b.tar.gz" + # fleet: "x86-g6exl-runner" + # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 1200 + # min_rps: 5 - - name: "gemma-4-26b-a4b-it" - s3_model: "gemma-4-26b-a4b-it.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 300 - min_rps: 2.4 + # - name: "gemma-4-26b-a4b-it" + # s3_model: "gemma-4-26b-a4b-it.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 300 + # min_rps: 2.4 - - name: "gemma-4-31b-it" - s3_model: "gemma-4-31b-it.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 150 - min_rps: 1.2 + # - name: "gemma-4-31b-it" + # s3_model: "gemma-4-31b-it.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 150 + # min_rps: 1.2 - - name: "gemma-4-e4b-it" - s3_model: "gemma-4-e4b-it.tar.gz" - fleet: "x86-g6exl-runner" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 680 - min_rps: 5.3 + # - name: "gemma-4-e4b-it" + # s3_model: "gemma-4-e4b-it.tar.gz" + # fleet: "x86-g6exl-runner" + # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 680 + # min_rps: 5.3 - name: "gemma-4-e2b-it" s3_model: "gemma-4-e2b-it.tar.gz" @@ -138,27 +138,27 @@ benchmark: # min_throughput: 330 # min_rps: 2.6 - - name: "qwen3.5-9b" - s3_model: "qwen3.5-9b.tar.gz" - fleet: "x86-g6xl-runner" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --enforce-eager" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 20 - min_rps: 0.15 + # - name: "qwen3.5-9b" + # s3_model: "qwen3.5-9b.tar.gz" + # fleet: "x86-g6xl-runner" + # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --enforce-eager" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 20 + # min_rps: 0.15 - - name: "llama-3.3-70b" - s3_model: "llama-3.3-70b.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - input_len: 512 - output_len: 128 - num_prompts: 32 - batch_size: 2 - min_throughput: 80 - min_rps: 0.35 + # - name: "llama-3.3-70b" + # s3_model: "llama-3.3-70b.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + # input_len: 512 + # output_len: 128 + # num_prompts: 32 + # batch_size: 2 + # min_throughput: 80 + # min_rps: 0.35 # https://github.com/vllm-project/vllm/issues/32637 # transformer version doesn't support this model @@ -174,141 +174,141 @@ benchmark: # min_throughput: 20 # min_rps: 1 - - name: "qwen3.5-35b-a3b-fp8" - s3_model: "qwen3.5-35b-a3b-fp8.tar.gz" - fleet: "x86-g6e12xl-runner" + # - name: "qwen3.5-35b-a3b-fp8" + # s3_model: "qwen3.5-35b-a3b-fp8.tar.gz" + # fleet: "x86-g6e12xl-runner" # https://github.com/vllm-project/vllm/issues/35743 open bug for capturing CUDA graph fails # workaround with --enforce-eager tp=1 fail while tp=4 success - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 80 - min_rps: 0.35 + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 80 + # min_rps: 0.35 # A100 is compute capability 8.0 — FP8 requires 8.9+ (H100/L40S). # The Marlin fallback uses significantly more memory. - - name: "qwen3.5-27b-fp8" - s3_model: "qwen3.5-27b-fp8.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 20 - min_rps: 0.2 + # - name: "qwen3.5-27b-fp8" + # s3_model: "qwen3.5-27b-fp8.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 20 + # min_rps: 0.2 - - name: "qwen3-coder-next-fp8" - s3_model: "qwen3-coder-next-fp8.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - input_len: 512 - output_len: 256 - num_prompts: 32 - batch_size: 2 - min_throughput: 93 - min_rps: 0.25 + # - name: "qwen3-coder-next-fp8" + # s3_model: "qwen3-coder-next-fp8.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + # input_len: 512 + # output_len: 256 + # num_prompts: 32 + # batch_size: 2 + # min_throughput: 93 + # min_rps: 0.25 runner-scale-sets: - - name: "qwen3-32b" - s3_model: "qwen3-32b.tar.gz" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.85" - input_len: 512 - output_len: 256 - num_prompts: 32 - batch_size: 2 - min_throughput: 1133 - min_rps: 3 + # - name: "qwen3-32b" + # s3_model: "qwen3-32b.tar.gz" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.85" + # input_len: 512 + # output_len: 256 + # num_prompts: 32 + # batch_size: 2 + # min_throughput: 1133 + # min_rps: 3 - - name: "qwen3.5-35b-a3b-fp8" - s3_model: "qwen3.5-35b-a3b-fp8.tar.gz" + # - name: "qwen3.5-35b-a3b-fp8" + # s3_model: "qwen3.5-35b-a3b-fp8.tar.gz" # https://github.com/vllm-project/vllm/issues/35743 open bug for capturing CUDA graph fails # workaround with --enforce-eager tp=1 fail while tp=4 success - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 80 - min_rps: 0.35 + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 80 + # min_rps: 0.35 - - name: "qwen3.5-27b-fp8" - s3_model: "qwen3.5-27b-fp8.tar.gz" + # - name: "qwen3.5-27b-fp8" + # s3_model: "qwen3.5-27b-fp8.tar.gz" # A100 lacks native FP8 — vLLM dequantizes to BF16 at load, doubling weight memory - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --enforce-eager" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 20 - min_rps: 0.2 + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --enforce-eager" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 20 + # min_rps: 0.2 - - name: "qwen3-coder-next-fp8" - s3_model: "qwen3-coder-next-fp8.tar.gz" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - input_len: 512 - output_len: 256 - num_prompts: 32 - batch_size: 2 - min_throughput: 93 - min_rps: 0.25 + # - name: "qwen3-coder-next-fp8" + # s3_model: "qwen3-coder-next-fp8.tar.gz" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + # input_len: 512 + # output_len: 256 + # num_prompts: 32 + # batch_size: 2 + # min_throughput: 93 + # min_rps: 0.25 - - name: "llama-3.3-70b" - s3_model: "llama-3.3-70b.tar.gz" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - input_len: 512 - output_len: 128 - num_prompts: 32 - batch_size: 2 - min_throughput: 80 - min_rps: 0.35 + # - name: "llama-3.3-70b" + # s3_model: "llama-3.3-70b.tar.gz" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + # input_len: 512 + # output_len: 128 + # num_prompts: 32 + # batch_size: 2 + # min_throughput: 80 + # min_rps: 0.35 # --- Qwen 3.5/3.6 new models (thresholds at ~50% of observed) --- - - name: "qwen3.5-2b" - s3_model: "qwen3.5-2b.tar.gz" - runner_label: "gpu-l4-1gpu-runners" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 5256 - min_rps: 8.2 + # - name: "qwen3.5-2b" + # s3_model: "qwen3.5-2b.tar.gz" + # runner_label: "gpu-l4-1gpu-runners" + # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 5256 + # min_rps: 8.2 - - name: "qwen3.6-27b" - s3_model: "qwen3.6-27b.tar.gz" - runner_label: "gpu-l40s-4gpu-runners" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 2195 - min_rps: 3.4 + # - name: "qwen3.6-27b" + # s3_model: "qwen3.6-27b.tar.gz" + # runner_label: "gpu-l40s-4gpu-runners" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 2195 + # min_rps: 3.4 - - name: "qwen3.6-35b-a3b" - s3_model: "qwen3.6-35b-a3b.tar.gz" - runner_label: "gpu-l40s-4gpu-runners" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 2654 - min_rps: 4.1 + # - name: "qwen3.6-35b-a3b" + # s3_model: "qwen3.6-35b-a3b.tar.gz" + # runner_label: "gpu-l40s-4gpu-runners" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 2654 + # min_rps: 4.1 - - name: "qwen3.5-0.8b" - s3_model: "qwen3.5-0.8b.tar.gz" - runner_label: "gpu-l4-1gpu-runners" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 5966 - min_rps: 9.3 + # - name: "qwen3.5-0.8b" + # s3_model: "qwen3.5-0.8b.tar.gz" + # runner_label: "gpu-l4-1gpu-runners" + # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 5966 + # min_rps: 9.3 # upstream # facebook/opt-125m From a04736fd57d582b3324b5e6d807e6a9aff83b513 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Wed, 20 May 2026 16:48:50 -0700 Subject: [PATCH 3/5] fix: set benchmark runner-scale-sets to empty list --- .github/config/model-tests/vllm-model-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index d977909ab20f..2eb525fd5184 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -211,7 +211,7 @@ benchmark: # min_throughput: 93 # min_rps: 0.25 - runner-scale-sets: + runner-scale-sets: [] # - name: "qwen3-32b" # s3_model: "qwen3-32b.tar.gz" # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.85" From a28d098e5d7f60c19ff41f17d1cd2ab4f05f87f7 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Thu, 21 May 2026 10:20:00 -0700 Subject: [PATCH 4/5] feat: set gemma-4-e2b-it thresholds to ~50% of observed Observed: 944.94 output tok/s, 7.38 rps Thresholds: min_throughput=470, min_rps=3.7 --- .github/config/model-tests/vllm-model-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index 2eb525fd5184..386d17596b20 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -112,8 +112,8 @@ benchmark: output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 20 - min_rps: 0.15 + min_throughput: 470 + min_rps: 3.7 # Pending p5e.48xlarge fleet creation. Fleet name "x86-p5e-runner" is a placeholder. # - name: "minimax-m2.7" From 9d09eab9f9208f2c8fa64307763ede0d1608657c Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Thu, 21 May 2026 10:24:42 -0700 Subject: [PATCH 5/5] chore: restore all models, keep only gemma-4-e2b-it addition --- .../config/model-tests/vllm-model-tests.yml | 400 +++++++++--------- 1 file changed, 200 insertions(+), 200 deletions(-) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index 386d17596b20..332e5ffbf973 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -33,76 +33,76 @@ smoke-test: benchmark: codebuild-fleet: - # - name: "qwen3-embedding-0.6b" - # s3_model: "qwen3-embedding-0.6b.tar.gz" - # fleet: "x86-g6xl-runner" - # extra_args: "--dtype bfloat16 --max-model-len 8192" - # test_script: "vllm_embedding_benchmark_test.sh" - # min_rps: 5 + - name: "qwen3-embedding-0.6b" + s3_model: "qwen3-embedding-0.6b.tar.gz" + fleet: "x86-g6xl-runner" + extra_args: "--dtype bfloat16 --max-model-len 8192" + test_script: "vllm_embedding_benchmark_test.sh" + min_rps: 5 - # - name: "qwen3-vl-embedding-2b" - # s3_model: "qwen3-vl-embedding-2b.tar.gz" - # fleet: "x86-g6xl-runner" - # extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code" - # test_script: "vllm_embedding_benchmark_test.sh" - # min_rps: 3 + - name: "qwen3-vl-embedding-2b" + s3_model: "qwen3-vl-embedding-2b.tar.gz" + fleet: "x86-g6xl-runner" + extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code" + test_script: "vllm_embedding_benchmark_test.sh" + min_rps: 3 - # - name: "qwen3-asr-1.7b" - # s3_model: "qwen3-asr-1.7b.tar.gz" - # fleet: "x86-g6e12xl-runner" - # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - # test_script: "vllm_asr_benchmark_test.sh" - # test_fixtures: - # - "audio/asr_en.wav" - # - "audio/asr_zh.wav" - # benchmark_audio_fixture: "asr_en.wav" - # min_throughput: 30 - # min_rps: 1 - # benchmark_profiles: "baseline,high_concurrency,sustained_load,burst" + - name: "qwen3-asr-1.7b" + s3_model: "qwen3-asr-1.7b.tar.gz" + fleet: "x86-g6e12xl-runner" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + test_script: "vllm_asr_benchmark_test.sh" + test_fixtures: + - "audio/asr_en.wav" + - "audio/asr_zh.wav" + benchmark_audio_fixture: "asr_en.wav" + min_throughput: 30 + min_rps: 1 + benchmark_profiles: "baseline,high_concurrency,sustained_load,burst" - # - name: "gpt-oss-20b" - # s3_model: "gpt-oss-20b.tar.gz" - # fleet: "x86-g6exl-runner" - # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 1200 - # min_rps: 5 + - name: "gpt-oss-20b" + s3_model: "gpt-oss-20b.tar.gz" + fleet: "x86-g6exl-runner" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 1200 + min_rps: 5 - # - name: "gemma-4-26b-a4b-it" - # s3_model: "gemma-4-26b-a4b-it.tar.gz" - # fleet: "x86-g6e12xl-runner" - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 300 - # min_rps: 2.4 + - name: "gemma-4-26b-a4b-it" + s3_model: "gemma-4-26b-a4b-it.tar.gz" + fleet: "x86-g6e12xl-runner" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 300 + min_rps: 2.4 - # - name: "gemma-4-31b-it" - # s3_model: "gemma-4-31b-it.tar.gz" - # fleet: "x86-g6e12xl-runner" - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 150 - # min_rps: 1.2 + - name: "gemma-4-31b-it" + s3_model: "gemma-4-31b-it.tar.gz" + fleet: "x86-g6e12xl-runner" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 150 + min_rps: 1.2 - # - name: "gemma-4-e4b-it" - # s3_model: "gemma-4-e4b-it.tar.gz" - # fleet: "x86-g6exl-runner" - # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 680 - # min_rps: 5.3 + - name: "gemma-4-e4b-it" + s3_model: "gemma-4-e4b-it.tar.gz" + fleet: "x86-g6exl-runner" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 680 + min_rps: 5.3 - name: "gemma-4-e2b-it" s3_model: "gemma-4-e2b-it.tar.gz" @@ -138,27 +138,27 @@ benchmark: # min_throughput: 330 # min_rps: 2.6 - # - name: "qwen3.5-9b" - # s3_model: "qwen3.5-9b.tar.gz" - # fleet: "x86-g6xl-runner" - # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --enforce-eager" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 20 - # min_rps: 0.15 + - name: "qwen3.5-9b" + s3_model: "qwen3.5-9b.tar.gz" + fleet: "x86-g6xl-runner" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --enforce-eager" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 20 + min_rps: 0.15 - # - name: "llama-3.3-70b" - # s3_model: "llama-3.3-70b.tar.gz" - # fleet: "x86-g6e12xl-runner" - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - # input_len: 512 - # output_len: 128 - # num_prompts: 32 - # batch_size: 2 - # min_throughput: 80 - # min_rps: 0.35 + - name: "llama-3.3-70b" + s3_model: "llama-3.3-70b.tar.gz" + fleet: "x86-g6e12xl-runner" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + input_len: 512 + output_len: 128 + num_prompts: 32 + batch_size: 2 + min_throughput: 80 + min_rps: 0.35 # https://github.com/vllm-project/vllm/issues/32637 # transformer version doesn't support this model @@ -174,141 +174,141 @@ benchmark: # min_throughput: 20 # min_rps: 1 - # - name: "qwen3.5-35b-a3b-fp8" - # s3_model: "qwen3.5-35b-a3b-fp8.tar.gz" - # fleet: "x86-g6e12xl-runner" + - name: "qwen3.5-35b-a3b-fp8" + s3_model: "qwen3.5-35b-a3b-fp8.tar.gz" + fleet: "x86-g6e12xl-runner" # https://github.com/vllm-project/vllm/issues/35743 open bug for capturing CUDA graph fails # workaround with --enforce-eager tp=1 fail while tp=4 success - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 80 - # min_rps: 0.35 + extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 80 + min_rps: 0.35 # A100 is compute capability 8.0 — FP8 requires 8.9+ (H100/L40S). # The Marlin fallback uses significantly more memory. - # - name: "qwen3.5-27b-fp8" - # s3_model: "qwen3.5-27b-fp8.tar.gz" - # fleet: "x86-g6e12xl-runner" - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 20 - # min_rps: 0.2 + - name: "qwen3.5-27b-fp8" + s3_model: "qwen3.5-27b-fp8.tar.gz" + fleet: "x86-g6e12xl-runner" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 20 + min_rps: 0.2 - # - name: "qwen3-coder-next-fp8" - # s3_model: "qwen3-coder-next-fp8.tar.gz" - # fleet: "x86-g6e12xl-runner" - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - # input_len: 512 - # output_len: 256 - # num_prompts: 32 - # batch_size: 2 - # min_throughput: 93 - # min_rps: 0.25 + - name: "qwen3-coder-next-fp8" + s3_model: "qwen3-coder-next-fp8.tar.gz" + fleet: "x86-g6e12xl-runner" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + input_len: 512 + output_len: 256 + num_prompts: 32 + batch_size: 2 + min_throughput: 93 + min_rps: 0.25 - runner-scale-sets: [] - # - name: "qwen3-32b" - # s3_model: "qwen3-32b.tar.gz" - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.85" - # input_len: 512 - # output_len: 256 - # num_prompts: 32 - # batch_size: 2 - # min_throughput: 1133 - # min_rps: 3 + runner-scale-sets: + - name: "qwen3-32b" + s3_model: "qwen3-32b.tar.gz" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.85" + input_len: 512 + output_len: 256 + num_prompts: 32 + batch_size: 2 + min_throughput: 1133 + min_rps: 3 - # - name: "qwen3.5-35b-a3b-fp8" - # s3_model: "qwen3.5-35b-a3b-fp8.tar.gz" + - name: "qwen3.5-35b-a3b-fp8" + s3_model: "qwen3.5-35b-a3b-fp8.tar.gz" # https://github.com/vllm-project/vllm/issues/35743 open bug for capturing CUDA graph fails # workaround with --enforce-eager tp=1 fail while tp=4 success - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 80 - # min_rps: 0.35 + extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 80 + min_rps: 0.35 - # - name: "qwen3.5-27b-fp8" - # s3_model: "qwen3.5-27b-fp8.tar.gz" + - name: "qwen3.5-27b-fp8" + s3_model: "qwen3.5-27b-fp8.tar.gz" # A100 lacks native FP8 — vLLM dequantizes to BF16 at load, doubling weight memory - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --enforce-eager" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 20 - # min_rps: 0.2 + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --enforce-eager" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 20 + min_rps: 0.2 - # - name: "qwen3-coder-next-fp8" - # s3_model: "qwen3-coder-next-fp8.tar.gz" - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - # input_len: 512 - # output_len: 256 - # num_prompts: 32 - # batch_size: 2 - # min_throughput: 93 - # min_rps: 0.25 + - name: "qwen3-coder-next-fp8" + s3_model: "qwen3-coder-next-fp8.tar.gz" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + input_len: 512 + output_len: 256 + num_prompts: 32 + batch_size: 2 + min_throughput: 93 + min_rps: 0.25 - # - name: "llama-3.3-70b" - # s3_model: "llama-3.3-70b.tar.gz" - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - # input_len: 512 - # output_len: 128 - # num_prompts: 32 - # batch_size: 2 - # min_throughput: 80 - # min_rps: 0.35 + - name: "llama-3.3-70b" + s3_model: "llama-3.3-70b.tar.gz" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + input_len: 512 + output_len: 128 + num_prompts: 32 + batch_size: 2 + min_throughput: 80 + min_rps: 0.35 # --- Qwen 3.5/3.6 new models (thresholds at ~50% of observed) --- - # - name: "qwen3.5-2b" - # s3_model: "qwen3.5-2b.tar.gz" - # runner_label: "gpu-l4-1gpu-runners" - # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 5256 - # min_rps: 8.2 + - name: "qwen3.5-2b" + s3_model: "qwen3.5-2b.tar.gz" + runner_label: "gpu-l4-1gpu-runners" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 5256 + min_rps: 8.2 - # - name: "qwen3.6-27b" - # s3_model: "qwen3.6-27b.tar.gz" - # runner_label: "gpu-l40s-4gpu-runners" - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 2195 - # min_rps: 3.4 + - name: "qwen3.6-27b" + s3_model: "qwen3.6-27b.tar.gz" + runner_label: "gpu-l40s-4gpu-runners" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 2195 + min_rps: 3.4 - # - name: "qwen3.6-35b-a3b" - # s3_model: "qwen3.6-35b-a3b.tar.gz" - # runner_label: "gpu-l40s-4gpu-runners" - # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 2654 - # min_rps: 4.1 + - name: "qwen3.6-35b-a3b" + s3_model: "qwen3.6-35b-a3b.tar.gz" + runner_label: "gpu-l40s-4gpu-runners" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 2654 + min_rps: 4.1 - # - name: "qwen3.5-0.8b" - # s3_model: "qwen3.5-0.8b.tar.gz" - # runner_label: "gpu-l4-1gpu-runners" - # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 5966 - # min_rps: 9.3 + - name: "qwen3.5-0.8b" + s3_model: "qwen3.5-0.8b.tar.gz" + runner_label: "gpu-l4-1gpu-runners" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" + input_len: 512 + output_len: 128 + num_prompts: 64 + batch_size: 4 + min_throughput: 5966 + min_rps: 9.3 # upstream # facebook/opt-125m