Skip to content

Commit 858e3ba

Browse files
authored
feat: add Qwen3.5/Qwen3.6 model smoke test and benchmark (#6032)
* feat: add Qwen3.5/Qwen3.6 model smoke test and benchmark Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * move models to p4d Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * use gpu-p4d-runner Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * use efa runner Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * empty commit Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * use p4d runners Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * use efa runner Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * empty commit Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * migrate to runner scale Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * increase node sizes Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * fix gpu Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * update memory util Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * use gpu uuid Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * benchmark Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * benchmark Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * fix cleanup runner scale Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * reset model lists Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * update vllm amzn2023 latest Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> * fix allowlist Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com> --------- Signed-off-by: sirutBuasai <sirutbuasai27@outlook.com>
1 parent d80f25d commit 858e3ba

4 files changed

Lines changed: 72 additions & 7 deletions

File tree

.github/config/image/vllm-ec2-amzn2023.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ common:
1919
os_version: "amzn2023"
2020
customer_type: "ec2"
2121
arch_type: "x86"
22-
prod_image: "vllm:0.18-gpu-py312-ec2"
22+
prod_image: "vllm:server-cuda-v1"
2323
device_type: "gpu"
2424
contributor: "None"
2525

.github/config/model-tests/vllm-model-tests.yml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,51 @@ benchmark:
231231
min_throughput: 80
232232
min_rps: 0.35
233233

234+
# --- Qwen 3.5/3.6 new models (thresholds at ~50% of observed) ---
235+
- name: "qwen3.5-2b"
236+
s3_model: "qwen3.5-2b.tar.gz"
237+
runner_label: "gpu-l4-1gpu-runners"
238+
extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6"
239+
input_len: 512
240+
output_len: 128
241+
num_prompts: 64
242+
batch_size: 4
243+
min_throughput: 5256
244+
min_rps: 8.2
245+
246+
- name: "qwen3.6-27b"
247+
s3_model: "qwen3.6-27b.tar.gz"
248+
runner_label: "gpu-l40s-4gpu-runners"
249+
extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8"
250+
input_len: 512
251+
output_len: 128
252+
num_prompts: 64
253+
batch_size: 4
254+
min_throughput: 2195
255+
min_rps: 3.4
256+
257+
- name: "qwen3.6-35b-a3b"
258+
s3_model: "qwen3.6-35b-a3b.tar.gz"
259+
runner_label: "gpu-l40s-4gpu-runners"
260+
extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8"
261+
input_len: 512
262+
output_len: 128
263+
num_prompts: 64
264+
batch_size: 4
265+
min_throughput: 2654
266+
min_rps: 4.1
267+
268+
- name: "qwen3.5-0.8b"
269+
s3_model: "qwen3.5-0.8b.tar.gz"
270+
runner_label: "gpu-l4-1gpu-runners"
271+
extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6"
272+
input_len: 512
273+
output_len: 128
274+
num_prompts: 64
275+
batch_size: 4
276+
min_throughput: 5966
277+
min_rps: 9.3
278+
234279
# upstream
235280
# facebook/opt-125m
236281
# meta-llama/Llama-3.2-1B-Instruct

.github/workflows/reusable-vllm-model-tests.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ jobs:
140140
fail-fast: false
141141
matrix:
142142
include: ${{ fromJson(needs.load-models.outputs.runner-scale-sets-matrix) }}
143-
runs-on: gpu-efa-runners
143+
runs-on: ${{ matrix.runner_label }}
144144
steps:
145145
- name: Checkout code
146146
uses: actions/checkout@v5
@@ -169,7 +169,10 @@ jobs:
169169
- name: Start container
170170
run: |
171171
docker pull ${{ inputs.image-uri }}
172-
CONTAINER_ID=$(docker run -d -it --gpus all --entrypoint /bin/bash \
172+
# Get GPU UUIDs visible to this pod (k8s assigns a subset of host GPUs)
173+
POD_GPUS=$(nvidia-smi --query-gpu=uuid --format=csv,noheader | paste -sd,)
174+
echo "Pod GPU UUIDs: ${POD_GPUS}"
175+
CONTAINER_ID=$(docker run -d -it --gpus "\"device=${POD_GPUS}\"" --entrypoint /bin/bash \
173176
--ipc=host --shm-size=10g \
174177
${{ inputs.image-uri }})
175178
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
@@ -184,7 +187,6 @@ jobs:
184187
if [ -f "test/vllm/scripts/amzn2023/${{ matrix.test_script || '' }}" ]; then
185188
docker cp "test/vllm/scripts/amzn2023/${{ matrix.test_script }}" ${CONTAINER_ID}:/models/
186189
fi
187-
rm -rf /dlc-models
188190
189191
- name: Download and copy test fixtures
190192
if: ${{ matrix.test_fixtures_paths != '' }}
@@ -209,6 +211,4 @@ jobs:
209211
if: always()
210212
run: |
211213
docker stop ${CONTAINER_ID} 2>/dev/null || true
212-
docker rm -f ${CONTAINER_ID} 2>/dev/null || true
213-
docker rmi ${{ inputs.image-uri }} 2>/dev/null || true
214-
rm -rf /dlc-models
214+
docker rm -f ${CONTAINER_ID} 2>/dev/null || true

test/security/data/ecr_scan_allowlist/vllm_server/framework_allowlist.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,5 +91,25 @@
9191
"vulnerability_id": "GHSA-82j2-j2ch-gfr8",
9292
"reason": "rustls-webpki 0.103.12 bundled in uv binary (Rust). Fix requires rustls-webpki>=0.104.0-alpha.7 (pre-release). Not exploitable — uv only connects to PyPI over TLS, no CRL checking enabled.",
9393
"review_by": "2026-06-04"
94+
},
95+
{
96+
"vulnerability_id": "CVE-2026-33811",
97+
"reason": "go/stdlib 1.24.13 statically linked inside mooncake libetcd_wrapper.so, cannot be patched independently of mooncake-transfer-engine rebuild"
98+
},
99+
{
100+
"vulnerability_id": "CVE-2026-39820",
101+
"reason": "go/stdlib 1.24.13 statically linked inside mooncake libetcd_wrapper.so, cannot be patched independently of mooncake-transfer-engine rebuild"
102+
},
103+
{
104+
"vulnerability_id": "CVE-2026-33814",
105+
"reason": "go/stdlib 1.24.13 statically linked inside mooncake libetcd_wrapper.so, cannot be patched independently of mooncake-transfer-engine rebuild"
106+
},
107+
{
108+
"vulnerability_id": "CVE-2026-39836",
109+
"reason": "go/stdlib 1.24.13 statically linked inside mooncake libetcd_wrapper.so, cannot be patched independently of mooncake-transfer-engine rebuild"
110+
},
111+
{
112+
"vulnerability_id": "CVE-2026-42499",
113+
"reason": "go/stdlib 1.24.13 statically linked inside mooncake libetcd_wrapper.so, cannot be patched independently of mooncake-transfer-engine rebuild"
94114
}
95115
]

0 commit comments

Comments
 (0)