From e77bc074585064692ce4015fba7ac5f0de3bbf90 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Mon, 4 May 2026 13:30:29 -0700 Subject: [PATCH 01/16] feat: add Qwen3.5/Qwen3.6 model smoke test and benchmark Signed-off-by: sirutBuasai --- .../config/model-tests/vllm-model-tests.yml | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index f7c1969e6715..8caf6bb46697 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -17,6 +17,26 @@ smoke-test: fleet: "x86-g6xl-runner" extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + - name: "qwen3.5-2b" + s3_model: "qwen3.5-2b.tar.gz" + fleet: "x86-g6xl-runner" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + + - name: "qwen3.5-27b-fp8" + s3_model: "qwen3.5-27b-fp8.tar.gz" + fleet: "x86-g6exl-runner" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096" + + - name: "qwen3.6-27b" + s3_model: "qwen3.6-27b.tar.gz" + fleet: "x86-g6e12xl-runner" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + + - name: "qwen3.6-35b-a3b" + s3_model: "qwen3.6-35b-a3b.tar.gz" + fleet: "x86-g6e12xl-runner" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + - name: "qwen3-embedding-0.6b" s3_model: "qwen3-embedding-0.6b.tar.gz" fleet: "x86-g6xl-runner" From 6174805afcb00c51300bde57b13ecd06f1a61fb1 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Tue, 5 May 2026 10:58:37 -0700 Subject: [PATCH 02/16] move models to p4d Signed-off-by: sirutBuasai --- .../config/model-tests/vllm-model-tests.yml | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index 8caf6bb46697..1aa0161c2ad9 100644 --- 
a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -24,18 +24,8 @@ smoke-test: - name: "qwen3.5-27b-fp8" s3_model: "qwen3.5-27b-fp8.tar.gz" - fleet: "x86-g6exl-runner" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096" - - - name: "qwen3.6-27b" - s3_model: "qwen3.6-27b.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" - - - name: "qwen3.6-35b-a3b" - s3_model: "qwen3.6-35b-a3b.tar.gz" fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - name: "qwen3-embedding-0.6b" s3_model: "qwen3-embedding-0.6b.tar.gz" @@ -49,7 +39,14 @@ smoke-test: extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code" test_script: "vllm_embedding_smoke_test.sh" - runner-scale-sets: [] + runner-scale-sets: + - name: "qwen3.6-27b" + s3_model: "qwen3.6-27b.tar.gz" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + + - name: "qwen3.6-35b-a3b" + s3_model: "qwen3.6-35b-a3b.tar.gz" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" benchmark: codebuild-fleet: From 6a731290f9e87f5801dea034525db2e0dffd8e95 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Wed, 6 May 2026 11:33:53 -0700 Subject: [PATCH 03/16] use gpu-p4d-runner Signed-off-by: sirutBuasai --- .github/workflows/reusable-vllm-model-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/reusable-vllm-model-tests.yml b/.github/workflows/reusable-vllm-model-tests.yml index fede1aadf196..48f5be947e3a 100644 --- a/.github/workflows/reusable-vllm-model-tests.yml +++ b/.github/workflows/reusable-vllm-model-tests.yml @@ -140,7 +140,7 @@ jobs: fail-fast: false matrix: include: ${{ fromJson(needs.load-models.outputs.runner-scale-sets-matrix) }} - runs-on: gpu-efa-runners + 
runs-on: gpu-p4d-runners steps: - name: Checkout code uses: actions/checkout@v5 From e3af51b783f041b1f27d921db25d3e5eaec5e215 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Wed, 6 May 2026 13:46:13 -0700 Subject: [PATCH 04/16] use efa runner Signed-off-by: sirutBuasai --- .github/workflows/reusable-vllm-model-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/reusable-vllm-model-tests.yml b/.github/workflows/reusable-vllm-model-tests.yml index 48f5be947e3a..fede1aadf196 100644 --- a/.github/workflows/reusable-vllm-model-tests.yml +++ b/.github/workflows/reusable-vllm-model-tests.yml @@ -140,7 +140,7 @@ jobs: fail-fast: false matrix: include: ${{ fromJson(needs.load-models.outputs.runner-scale-sets-matrix) }} - runs-on: gpu-p4d-runners + runs-on: gpu-efa-runners steps: - name: Checkout code uses: actions/checkout@v5 From cda2dee59d220ab0c3fa6a332d891fefe83649b6 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Wed, 6 May 2026 15:08:33 -0700 Subject: [PATCH 05/16] empty commit Signed-off-by: sirutBuasai From 8ec0d6b4b4210990b226cf95d8fa1b22b9714082 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Wed, 6 May 2026 15:30:28 -0700 Subject: [PATCH 06/16] use p4d runners Signed-off-by: sirutBuasai --- .github/workflows/reusable-vllm-model-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/reusable-vllm-model-tests.yml b/.github/workflows/reusable-vllm-model-tests.yml index fede1aadf196..48f5be947e3a 100644 --- a/.github/workflows/reusable-vllm-model-tests.yml +++ b/.github/workflows/reusable-vllm-model-tests.yml @@ -140,7 +140,7 @@ jobs: fail-fast: false matrix: include: ${{ fromJson(needs.load-models.outputs.runner-scale-sets-matrix) }} - runs-on: gpu-efa-runners + runs-on: gpu-p4d-runners steps: - name: Checkout code uses: actions/checkout@v5 From 036eecceebf81cc55ad30b976f543c4c1bd1c27f Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Wed, 6 May 2026 15:50:48 -0700 Subject: [PATCH 
07/16] use efa runner Signed-off-by: sirutBuasai --- .github/workflows/reusable-vllm-model-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/reusable-vllm-model-tests.yml b/.github/workflows/reusable-vllm-model-tests.yml index 48f5be947e3a..fede1aadf196 100644 --- a/.github/workflows/reusable-vllm-model-tests.yml +++ b/.github/workflows/reusable-vllm-model-tests.yml @@ -140,7 +140,7 @@ jobs: fail-fast: false matrix: include: ${{ fromJson(needs.load-models.outputs.runner-scale-sets-matrix) }} - runs-on: gpu-p4d-runners + runs-on: gpu-efa-runners steps: - name: Checkout code uses: actions/checkout@v5 From f01c3c7b538bfd11216adc6a2a5e2fff6a666431 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Wed, 6 May 2026 16:33:58 -0700 Subject: [PATCH 08/16] empty commit Signed-off-by: sirutBuasai From b884423be1cd3a9b224afa2c67b690f82f3fd217 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Thu, 7 May 2026 13:37:53 -0700 Subject: [PATCH 09/16] migrate to runner scale Signed-off-by: sirutBuasai --- .github/actionlint.yaml | 5 ++++- .../config/model-tests/vllm-model-tests.yml | 19 +++++++++++-------- .../workflows/reusable-vllm-model-tests.yml | 2 +- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 8cc347272ac9..89b77b28065d 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -1,5 +1,8 @@ self-hosted-runner: labels: - - gpu-standard-runners + - gpu-l4-1gpu-runners + - gpu-l4-4gpu-runners + - gpu-l40s-1gpu-runners + - gpu-l40s-4gpu-runners - gpu-efa-runners - gpu-p4d-runners diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index 1aa0161c2ad9..b83973cdf5ae 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -11,41 +11,44 @@ s3_prefix: "s3://dlc-cicd-models/llm-models" test_fixtures_prefix: "s3://dlc-cicd-models/test-fixtures" 
smoke-test: - codebuild-fleet: + codebuild-fleet: [] + + runner-scale-sets: - name: "qwen3.5-0.8b" s3_model: "qwen3.5-0.8b.tar.gz" - fleet: "x86-g6xl-runner" + runner_label: "gpu-l4-1gpu-runners" extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - name: "qwen3.5-2b" s3_model: "qwen3.5-2b.tar.gz" - fleet: "x86-g6xl-runner" + runner_label: "gpu-l4-1gpu-runners" extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - name: "qwen3.5-27b-fp8" s3_model: "qwen3.5-27b-fp8.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + runner_label: "gpu-l40s-1gpu-runners" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096" - name: "qwen3-embedding-0.6b" s3_model: "qwen3-embedding-0.6b.tar.gz" - fleet: "x86-g6xl-runner" + runner_label: "gpu-l4-1gpu-runners" extra_args: "--dtype bfloat16 --max-model-len 8192" test_script: "vllm_embedding_smoke_test.sh" - name: "qwen3-vl-embedding-2b" s3_model: "qwen3-vl-embedding-2b.tar.gz" - fleet: "x86-g6xl-runner" + runner_label: "gpu-l4-1gpu-runners" extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code" test_script: "vllm_embedding_smoke_test.sh" - runner-scale-sets: - name: "qwen3.6-27b" s3_model: "qwen3.6-27b.tar.gz" + runner_label: "gpu-l40s-4gpu-runners" extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" - name: "qwen3.6-35b-a3b" s3_model: "qwen3.6-35b-a3b.tar.gz" + runner_label: "gpu-l40s-4gpu-runners" extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" benchmark: diff --git a/.github/workflows/reusable-vllm-model-tests.yml b/.github/workflows/reusable-vllm-model-tests.yml index fede1aadf196..9f2b0546c13c 100644 --- a/.github/workflows/reusable-vllm-model-tests.yml +++ b/.github/workflows/reusable-vllm-model-tests.yml @@ -140,7 +140,7 @@ jobs: fail-fast: false matrix: include: ${{ fromJson(needs.load-models.outputs.runner-scale-sets-matrix) }} - 
runs-on: gpu-efa-runners + runs-on: ${{ matrix.runner_label }} steps: - name: Checkout code uses: actions/checkout@v5 From 63345e78314b38474fa7fe73726f7d3cbbc1f3cb Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Thu, 7 May 2026 14:57:41 -0700 Subject: [PATCH 10/16] increase node sizes Signed-off-by: sirutBuasai --- .github/actionlint.yaml | 14 ++++++++------ .github/config/model-tests/vllm-model-tests.yml | 8 ++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 89b77b28065d..c9fc83b8e374 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -1,8 +1,10 @@ self-hosted-runner: labels: - - gpu-l4-1gpu-runners - - gpu-l4-4gpu-runners - - gpu-l40s-1gpu-runners - - gpu-l40s-4gpu-runners - - gpu-efa-runners - - gpu-p4d-runners + - gpu-l4-1gpu-runners # 1x L4 24GB VRAM + - gpu-l4-2gpu-runners # 2x L4 48GB VRAM + - gpu-l4-4gpu-runners # 4x L4 96GB VRAM + - gpu-l40s-1gpu-runners # 1x L40S 48GB VRAM + - gpu-l40s-2gpu-runners # 2x L40S 96GB VRAM + - gpu-l40s-4gpu-runners # 4x L40S 192GB VRAM + - gpu-efa-runners # 4x A100/H100 160-320GB VRAM + - gpu-p4d-runners # 4x A100 160GB VRAM (reserved) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index b83973cdf5ae..7135a9bd4e06 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -16,18 +16,18 @@ smoke-test: runner-scale-sets: - name: "qwen3.5-0.8b" s3_model: "qwen3.5-0.8b.tar.gz" - runner_label: "gpu-l4-1gpu-runners" + runner_label: "gpu-l4-2gpu-runners" extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - name: "qwen3.5-2b" s3_model: "qwen3.5-2b.tar.gz" - runner_label: "gpu-l4-1gpu-runners" + runner_label: "gpu-l4-2gpu-runners" extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - name: "qwen3.5-27b-fp8" s3_model: "qwen3.5-27b-fp8.tar.gz" - runner_label: 
"gpu-l40s-1gpu-runners" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096" + runner_label: "gpu-l40s-4gpu-runners" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - name: "qwen3-embedding-0.6b" s3_model: "qwen3-embedding-0.6b.tar.gz" From 9792e2e651935405885f29250184b9926ca0fdeb Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Thu, 7 May 2026 15:34:25 -0700 Subject: [PATCH 11/16] fix gpu Signed-off-by: sirutBuasai --- .github/config/model-tests/vllm-model-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index 7135a9bd4e06..e2289f5d7328 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -17,12 +17,12 @@ smoke-test: - name: "qwen3.5-0.8b" s3_model: "qwen3.5-0.8b.tar.gz" runner_label: "gpu-l4-2gpu-runners" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + extra_args: "--tensor-parallel-size 2 --max-model-len 4096 --dtype bfloat16" - name: "qwen3.5-2b" s3_model: "qwen3.5-2b.tar.gz" runner_label: "gpu-l4-2gpu-runners" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + extra_args: "--tensor-parallel-size 2 --max-model-len 4096 --dtype bfloat16" - name: "qwen3.5-27b-fp8" s3_model: "qwen3.5-27b-fp8.tar.gz" From 67bd46a12d15d0540014415ae038289445bbb780 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Thu, 7 May 2026 16:11:21 -0700 Subject: [PATCH 12/16] update memory util Signed-off-by: sirutBuasai --- .github/config/model-tests/vllm-model-tests.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index e2289f5d7328..0e6c66940bb1 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -17,39 +17,39 @@ 
smoke-test: - name: "qwen3.5-0.8b" s3_model: "qwen3.5-0.8b.tar.gz" runner_label: "gpu-l4-2gpu-runners" - extra_args: "--tensor-parallel-size 2 --max-model-len 4096 --dtype bfloat16" + extra_args: "--tensor-parallel-size 2 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" - name: "qwen3.5-2b" s3_model: "qwen3.5-2b.tar.gz" runner_label: "gpu-l4-2gpu-runners" - extra_args: "--tensor-parallel-size 2 --max-model-len 4096 --dtype bfloat16" + extra_args: "--tensor-parallel-size 2 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" - name: "qwen3.5-27b-fp8" s3_model: "qwen3.5-27b-fp8.tar.gz" runner_label: "gpu-l40s-4gpu-runners" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.8" - name: "qwen3-embedding-0.6b" s3_model: "qwen3-embedding-0.6b.tar.gz" runner_label: "gpu-l4-1gpu-runners" - extra_args: "--dtype bfloat16 --max-model-len 8192" + extra_args: "--dtype bfloat16 --max-model-len 8192 --gpu-memory-utilization 0.6" test_script: "vllm_embedding_smoke_test.sh" - name: "qwen3-vl-embedding-2b" s3_model: "qwen3-vl-embedding-2b.tar.gz" runner_label: "gpu-l4-1gpu-runners" - extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code" + extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code --gpu-memory-utilization 0.6" test_script: "vllm_embedding_smoke_test.sh" - name: "qwen3.6-27b" s3_model: "qwen3.6-27b.tar.gz" runner_label: "gpu-l40s-4gpu-runners" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8" - name: "qwen3.6-35b-a3b" s3_model: "qwen3.6-35b-a3b.tar.gz" runner_label: "gpu-l40s-4gpu-runners" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype 
bfloat16 --gpu-memory-utilization 0.8" benchmark: codebuild-fleet: From de5ca12ac038a49608bb3581d658ceb9cf5baef5 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Thu, 7 May 2026 16:33:22 -0700 Subject: [PATCH 13/16] use gpu uuid Signed-off-by: sirutBuasai --- .github/workflows/reusable-vllm-model-tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/reusable-vllm-model-tests.yml b/.github/workflows/reusable-vllm-model-tests.yml index 9f2b0546c13c..d74ac5e4e4f5 100644 --- a/.github/workflows/reusable-vllm-model-tests.yml +++ b/.github/workflows/reusable-vllm-model-tests.yml @@ -169,7 +169,10 @@ jobs: - name: Start container run: | docker pull ${{ inputs.image-uri }} - CONTAINER_ID=$(docker run -d -it --gpus all --entrypoint /bin/bash \ + # Get GPU UUIDs visible to this pod (k8s assigns a subset of host GPUs) + POD_GPUS=$(nvidia-smi --query-gpu=uuid --format=csv,noheader | paste -sd,) + echo "Pod GPU UUIDs: ${POD_GPUS}" + CONTAINER_ID=$(docker run -d -it --gpus "\"device=${POD_GPUS}\"" --entrypoint /bin/bash \ --ipc=host --shm-size=10g \ ${{ inputs.image-uri }}) echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV From cbaa7788bc49b54c0f2768f32feb48f7327f3e57 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Fri, 8 May 2026 11:46:31 -0700 Subject: [PATCH 14/16] benchmark Signed-off-by: sirutBuasai --- .../config/model-tests/vllm-model-tests.yml | 339 ++++++++++-------- 1 file changed, 184 insertions(+), 155 deletions(-) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index 0e6c66940bb1..326b9dbb3e01 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -53,213 +53,242 @@ smoke-test: benchmark: codebuild-fleet: - - name: "qwen3-embedding-0.6b" - s3_model: "qwen3-embedding-0.6b.tar.gz" - fleet: "x86-g6xl-runner" - extra_args: "--dtype bfloat16 --max-model-len 8192" - test_script: 
"vllm_embedding_benchmark_test.sh" - min_rps: 5 + # --- Existing benchmarks commented out for Qwen 3.5/3.6 validation --- + # - name: "qwen3-embedding-0.6b" + # s3_model: "qwen3-embedding-0.6b.tar.gz" + # fleet: "x86-g6xl-runner" + # extra_args: "--dtype bfloat16 --max-model-len 8192" + # test_script: "vllm_embedding_benchmark_test.sh" + # min_rps: 5 - - name: "qwen3-vl-embedding-2b" - s3_model: "qwen3-vl-embedding-2b.tar.gz" - fleet: "x86-g6xl-runner" - extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code" - test_script: "vllm_embedding_benchmark_test.sh" - min_rps: 3 + # - name: "qwen3-vl-embedding-2b" + # s3_model: "qwen3-vl-embedding-2b.tar.gz" + # fleet: "x86-g6xl-runner" + # extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code" + # test_script: "vllm_embedding_benchmark_test.sh" + # min_rps: 3 - - name: "qwen3-asr-1.7b" - s3_model: "qwen3-asr-1.7b.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - test_script: "vllm_asr_benchmark_test.sh" - test_fixtures: - - "audio/asr_en.wav" - - "audio/asr_zh.wav" - benchmark_audio_fixture: "asr_en.wav" - min_throughput: 30 - min_rps: 1 - benchmark_profiles: "baseline,high_concurrency,sustained_load,burst" + # - name: "qwen3-asr-1.7b" + # s3_model: "qwen3-asr-1.7b.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + # test_script: "vllm_asr_benchmark_test.sh" + # test_fixtures: + # - "audio/asr_en.wav" + # - "audio/asr_zh.wav" + # benchmark_audio_fixture: "asr_en.wav" + # min_throughput: 30 + # min_rps: 1 + # benchmark_profiles: "baseline,high_concurrency,sustained_load,burst" + + # - name: "gpt-oss-20b" + # s3_model: "gpt-oss-20b.tar.gz" + # fleet: "x86-g6exl-runner" + # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # 
batch_size: 4 + # min_throughput: 1200 + # min_rps: 5 - - name: "gpt-oss-20b" - s3_model: "gpt-oss-20b.tar.gz" + # - name: "gemma-4-26b-a4b-it" + # s3_model: "gemma-4-26b-a4b-it.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 300 + # min_rps: 2.4 + + # - name: "gemma-4-31b-it" + # s3_model: "gemma-4-31b-it.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 150 + # min_rps: 1.2 + + # - name: "gemma-4-e4b-it" + # s3_model: "gemma-4-e4b-it.tar.gz" + # fleet: "x86-g6exl-runner" + # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 680 + # min_rps: 5.3 + + # - name: "qwen3.5-9b" + # s3_model: "qwen3.5-9b.tar.gz" + # fleet: "x86-g6xl-runner" + # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --enforce-eager" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 20 + # min_rps: 0.15 + + # - name: "llama-3.3-70b" + # s3_model: "llama-3.3-70b.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + # input_len: 512 + # output_len: 128 + # num_prompts: 32 + # batch_size: 2 + # min_throughput: 80 + # min_rps: 0.35 + + # - name: "qwen3.5-35b-a3b-fp8" + # s3_model: "qwen3.5-35b-a3b-fp8.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 80 + # min_rps: 0.35 + + # - name: "qwen3.5-27b-fp8" + # s3_model: "qwen3.5-27b-fp8.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: 
"--tensor-parallel-size 4 --max-model-len 4096" + # input_len: 512 + # output_len: 128 + # num_prompts: 64 + # batch_size: 4 + # min_throughput: 20 + # min_rps: 0.2 + + # - name: "qwen3-coder-next-fp8" + # s3_model: "qwen3-coder-next-fp8.tar.gz" + # fleet: "x86-g6e12xl-runner" + # extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + # input_len: 512 + # output_len: 256 + # num_prompts: 32 + # batch_size: 2 + # min_throughput: 93 + # min_rps: 0.25 + + # --- Qwen 3.5/3.6 benchmark (placeholder thresholds — update after manual run) --- + - name: "qwen3.5-0.8b" + s3_model: "qwen3.5-0.8b.tar.gz" fleet: "x86-g6exl-runner" extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" input_len: 512 output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 1200 - min_rps: 5 + min_throughput: 1 + min_rps: 0.1 - - name: "gemma-4-26b-a4b-it" - s3_model: "gemma-4-26b-a4b-it.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + - name: "qwen3.5-2b" + s3_model: "qwen3.5-2b.tar.gz" + fleet: "x86-g6exl-runner" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" input_len: 512 output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 300 - min_rps: 2.4 + min_throughput: 1 + min_rps: 0.1 - - name: "gemma-4-31b-it" - s3_model: "gemma-4-31b-it.tar.gz" + - name: "qwen3.5-27b-fp8" + s3_model: "qwen3.5-27b-fp8.tar.gz" fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096" input_len: 512 output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 150 - min_rps: 1.2 + min_throughput: 1 + min_rps: 0.1 - - name: "gemma-4-e4b-it" - s3_model: "gemma-4-e4b-it.tar.gz" - fleet: "x86-g6exl-runner" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" + - name: "qwen3.6-27b" + s3_model: "qwen3.6-27b.tar.gz" + fleet: 
"x86-g6e12xl-runner" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" input_len: 512 output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 680 - min_rps: 5.3 + min_throughput: 1 + min_rps: 0.1 - - name: "qwen3.5-9b" - s3_model: "qwen3.5-9b.tar.gz" - fleet: "x86-g6xl-runner" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --enforce-eager" + - name: "qwen3.6-35b-a3b" + s3_model: "qwen3.6-35b-a3b.tar.gz" + fleet: "x86-g6e12xl-runner" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" input_len: 512 output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 20 - min_rps: 0.15 + min_throughput: 1 + min_rps: 0.1 - - name: "llama-3.3-70b" - s3_model: "llama-3.3-70b.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + runner-scale-sets: + # --- Qwen 3.5/3.6 benchmark (placeholder thresholds — update after manual run) --- + - name: "qwen3.5-0.8b" + s3_model: "qwen3.5-0.8b.tar.gz" + runner_label: "gpu-l4-1gpu-runners" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" input_len: 512 output_len: 128 - num_prompts: 32 - batch_size: 2 - min_throughput: 80 - min_rps: 0.35 - - # https://github.com/vllm-project/vllm/issues/32637 - # transformer version doesn't support this model - # https://github.com/vllm-project/vllm/issues/34098 - # - name: "glm-4.7-flash" - # s3_model: "glm-4.7-flash.tar.gz" - # fleet: "x86-g6xl-runner" - # extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - # input_len: 512 - # output_len: 128 - # num_prompts: 64 - # batch_size: 4 - # min_throughput: 20 - # min_rps: 1 + num_prompts: 64 + batch_size: 4 + min_throughput: 1 + min_rps: 0.1 - - name: "qwen3.5-35b-a3b-fp8" - s3_model: "qwen3.5-35b-a3b-fp8.tar.gz" - fleet: "x86-g6e12xl-runner" - # https://github.com/vllm-project/vllm/issues/35743 open bug for capturing CUDA graph fails - # workaround 
with --enforce-eager tp=1 fail while tp=4 success - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + - name: "qwen3.5-2b" + s3_model: "qwen3.5-2b.tar.gz" + runner_label: "gpu-l4-1gpu-runners" + extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6" input_len: 512 output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 80 - min_rps: 0.35 + min_throughput: 1 + min_rps: 0.1 -# A100 is compute capability 8.0 — FP8 requires 8.9+ (H100/L40S). -# The Marlin fallback uses significantly more memory. - name: "qwen3.5-27b-fp8" s3_model: "qwen3.5-27b-fp8.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + runner_label: "gpu-l40s-4gpu-runners" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.8" input_len: 512 output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 20 - min_rps: 0.2 - - - name: "qwen3-coder-next-fp8" - s3_model: "qwen3-coder-next-fp8.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - input_len: 512 - output_len: 256 - num_prompts: 32 - batch_size: 2 - min_throughput: 93 - min_rps: 0.25 - - runner-scale-sets: - - name: "qwen3-32b" - s3_model: "qwen3-32b.tar.gz" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.85" - input_len: 512 - output_len: 256 - num_prompts: 32 - batch_size: 2 - min_throughput: 1133 - min_rps: 3 + min_throughput: 1 + min_rps: 0.1 - - name: "qwen3.5-35b-a3b-fp8" - s3_model: "qwen3.5-35b-a3b-fp8.tar.gz" - # https://github.com/vllm-project/vllm/issues/35743 open bug for capturing CUDA graph fails - # workaround with --enforce-eager tp=1 fail while tp=4 success - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" + - name: "qwen3.6-27b" + s3_model: "qwen3.6-27b.tar.gz" + runner_label: "gpu-l40s-4gpu-runners" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype 
bfloat16 --gpu-memory-utilization 0.8" input_len: 512 output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 80 - min_rps: 0.35 + min_throughput: 1 + min_rps: 0.1 - - name: "qwen3.5-27b-fp8" - s3_model: "qwen3.5-27b-fp8.tar.gz" - # A100 lacks native FP8 — vLLM dequantizes to BF16 at load, doubling weight memory - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --enforce-eager" + - name: "qwen3.6-35b-a3b" + s3_model: "qwen3.6-35b-a3b.tar.gz" + runner_label: "gpu-l40s-4gpu-runners" + extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8" input_len: 512 output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 20 - min_rps: 0.2 - - - name: "qwen3-coder-next-fp8" - s3_model: "qwen3-coder-next-fp8.tar.gz" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - input_len: 512 - output_len: 256 - num_prompts: 32 - batch_size: 2 - min_throughput: 93 - min_rps: 0.25 - - - name: "llama-3.3-70b" - s3_model: "llama-3.3-70b.tar.gz" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - input_len: 512 - output_len: 128 - num_prompts: 32 - batch_size: 2 - min_throughput: 80 - min_rps: 0.35 - -# upstream -# facebook/opt-125m -# meta-llama/Llama-3.2-1B-Instruct -# Qwen/Qwen3-0.6B -# fixie-ai/ultravox-v0_5-llama-3_2-1b -# llava-hf/llava-1.5-7b-hf -# microsoft/Phi-3.5-vision-instruct -# openai/whisper-large-v3-turbo -# jason9693/Qwen2.5-1.5B-apeach -# intfloat/e5-small -# BAAI/bge-reranker-v2-m3 -# meta-llama/Llama-3.1-8B-Instruct + min_throughput: 1 + min_rps: 0.1 From ef8d0570c4d01bcc9353f6a95a97a5f6303fe15b Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Fri, 8 May 2026 12:09:40 -0700 Subject: [PATCH 15/16] benchmark Signed-off-by: sirutBuasai --- .../config/model-tests/vllm-model-tests.yml | 58 +------------------ 1 file changed, 1 insertion(+), 57 deletions(-) diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml index 
326b9dbb3e01..dc8364e11a8e 100644 --- a/.github/config/model-tests/vllm-model-tests.yml +++ b/.github/config/model-tests/vllm-model-tests.yml @@ -52,7 +52,7 @@ smoke-test: extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8" benchmark: - codebuild-fleet: + codebuild-fleet: [] # --- Existing benchmarks commented out for Qwen 3.5/3.6 validation --- # - name: "qwen3-embedding-0.6b" # s3_model: "qwen3-embedding-0.6b.tar.gz" @@ -180,62 +180,6 @@ benchmark: # min_throughput: 93 # min_rps: 0.25 - # --- Qwen 3.5/3.6 benchmark (placeholder thresholds — update after manual run) --- - - name: "qwen3.5-0.8b" - s3_model: "qwen3.5-0.8b.tar.gz" - fleet: "x86-g6exl-runner" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 1 - min_rps: 0.1 - - - name: "qwen3.5-2b" - s3_model: "qwen3.5-2b.tar.gz" - fleet: "x86-g6exl-runner" - extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 1 - min_rps: 0.1 - - - name: "qwen3.5-27b-fp8" - s3_model: "qwen3.5-27b-fp8.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 1 - min_rps: 0.1 - - - name: "qwen3.6-27b" - s3_model: "qwen3.6-27b.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 1 - min_rps: 0.1 - - - name: "qwen3.6-35b-a3b" - s3_model: "qwen3.6-35b-a3b.tar.gz" - fleet: "x86-g6e12xl-runner" - extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16" - input_len: 512 - output_len: 128 - num_prompts: 64 - batch_size: 4 - min_throughput: 1 - min_rps: 0.1 - 
runner-scale-sets: # --- Qwen 3.5/3.6 benchmark (placeholder thresholds — update after manual run) --- - name: "qwen3.5-0.8b" From 68c5384783afef0c10cb59e6990d6f72895d7969 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Fri, 8 May 2026 14:05:53 -0700 Subject: [PATCH 16/16] fix cleanup runner scale Signed-off-by: sirutBuasai --- .github/workflows/reusable-vllm-model-tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/reusable-vllm-model-tests.yml b/.github/workflows/reusable-vllm-model-tests.yml index d74ac5e4e4f5..d34b1922146f 100644 --- a/.github/workflows/reusable-vllm-model-tests.yml +++ b/.github/workflows/reusable-vllm-model-tests.yml @@ -187,7 +187,6 @@ jobs: if [ -f "test/vllm/scripts/amzn2023/${{ matrix.test_script || '' }}" ]; then docker cp "test/vllm/scripts/amzn2023/${{ matrix.test_script }}" ${CONTAINER_ID}:/models/ fi - rm -rf /dlc-models - name: Download and copy test fixtures if: ${{ matrix.test_fixtures_paths != '' }} @@ -212,6 +211,4 @@ jobs: if: always() run: | docker stop ${CONTAINER_ID} 2>/dev/null || true - docker rm -f ${CONTAINER_ID} 2>/dev/null || true - docker rmi ${{ inputs.image-uri }} 2>/dev/null || true - rm -rf /dlc-models + docker rm -f ${CONTAINER_ID} 2>/dev/null || true \ No newline at end of file