diff --git a/.github/config/model-tests/vllm-model-tests.yml b/.github/config/model-tests/vllm-model-tests.yml
index f7c1969e6715..dc8364e11a8e 100644
--- a/.github/config/model-tests/vllm-model-tests.yml
+++ b/.github/config/model-tests/vllm-model-tests.yml
@@ -11,235 +11,228 @@ s3_prefix: "s3://dlc-cicd-models/llm-models"
 test_fixtures_prefix: "s3://dlc-cicd-models/test-fixtures"
 
 smoke-test:
-  codebuild-fleet:
+  codebuild-fleet: []
+
+  runner-scale-sets:
     - name: "qwen3.5-0.8b"
       s3_model: "qwen3.5-0.8b.tar.gz"
-      fleet: "x86-g6xl-runner"
-      extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16"
+      runner_label: "gpu-l4-2gpu-runners"
+      extra_args: "--tensor-parallel-size 2 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6"
+
+    - name: "qwen3.5-2b"
+      s3_model: "qwen3.5-2b.tar.gz"
+      runner_label: "gpu-l4-2gpu-runners"
+      extra_args: "--tensor-parallel-size 2 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6"
+
+    - name: "qwen3.5-27b-fp8"
+      s3_model: "qwen3.5-27b-fp8.tar.gz"
+      runner_label: "gpu-l40s-4gpu-runners"
+      extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.8"
 
     - name: "qwen3-embedding-0.6b"
       s3_model: "qwen3-embedding-0.6b.tar.gz"
-      fleet: "x86-g6xl-runner"
-      extra_args: "--dtype bfloat16 --max-model-len 8192"
+      runner_label: "gpu-l4-1gpu-runners"
+      extra_args: "--dtype bfloat16 --max-model-len 8192 --gpu-memory-utilization 0.6"
       test_script: "vllm_embedding_smoke_test.sh"
 
     - name: "qwen3-vl-embedding-2b"
       s3_model: "qwen3-vl-embedding-2b.tar.gz"
-      fleet: "x86-g6xl-runner"
-      extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code"
+      runner_label: "gpu-l4-1gpu-runners"
+      extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code --gpu-memory-utilization 0.6"
       test_script: "vllm_embedding_smoke_test.sh"
 
-  runner-scale-sets: []
+    - name: "qwen3.6-27b"
+      s3_model: "qwen3.6-27b.tar.gz"
+      runner_label: "gpu-l40s-4gpu-runners"
+      extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8"
+
+    - name: "qwen3.6-35b-a3b"
+      s3_model: "qwen3.6-35b-a3b.tar.gz"
+      runner_label: "gpu-l40s-4gpu-runners"
+      extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8"
 
 benchmark:
-  codebuild-fleet:
-    - name: "qwen3-embedding-0.6b"
-      s3_model: "qwen3-embedding-0.6b.tar.gz"
-      fleet: "x86-g6xl-runner"
-      extra_args: "--dtype bfloat16 --max-model-len 8192"
-      test_script: "vllm_embedding_benchmark_test.sh"
-      min_rps: 5
+  codebuild-fleet: []
+  # --- Existing benchmarks commented out for Qwen 3.5/3.6 validation ---
+  # - name: "qwen3-embedding-0.6b"
+  #   s3_model: "qwen3-embedding-0.6b.tar.gz"
+  #   fleet: "x86-g6xl-runner"
+  #   extra_args: "--dtype bfloat16 --max-model-len 8192"
+  #   test_script: "vllm_embedding_benchmark_test.sh"
+  #   min_rps: 5
 
-    - name: "qwen3-vl-embedding-2b"
-      s3_model: "qwen3-vl-embedding-2b.tar.gz"
-      fleet: "x86-g6xl-runner"
-      extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code"
-      test_script: "vllm_embedding_benchmark_test.sh"
-      min_rps: 3
-
-    - name: "qwen3-asr-1.7b"
-      s3_model: "qwen3-asr-1.7b.tar.gz"
-      fleet: "x86-g6e12xl-runner"
-      extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16"
-      test_script: "vllm_asr_benchmark_test.sh"
-      test_fixtures:
-        - "audio/asr_en.wav"
-        - "audio/asr_zh.wav"
-      benchmark_audio_fixture: "asr_en.wav"
-      min_throughput: 30
-      min_rps: 1
-      benchmark_profiles: "baseline,high_concurrency,sustained_load,burst"
-
-    - name: "gpt-oss-20b"
-      s3_model: "gpt-oss-20b.tar.gz"
-      fleet: "x86-g6exl-runner"
-      extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16"
-      input_len: 512
-      output_len: 128
-      num_prompts: 64
-      batch_size: 4
-      min_throughput: 1200
-      min_rps: 5
+  # - name: "qwen3-vl-embedding-2b"
+  #   s3_model: "qwen3-vl-embedding-2b.tar.gz"
+  #   fleet: "x86-g6xl-runner"
+  #   extra_args: "--runner pooling --dtype bfloat16 --max-model-len 8192 --trust-remote-code"
+  #   test_script: "vllm_embedding_benchmark_test.sh"
+  #   min_rps: 3
 
-    - name: "gemma-4-26b-a4b-it"
-      s3_model: "gemma-4-26b-a4b-it.tar.gz"
-      fleet: "x86-g6e12xl-runner"
-      extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16"
-      input_len: 512
-      output_len: 128
-      num_prompts: 64
-      batch_size: 4
-      min_throughput: 300
-      min_rps: 2.4
+  # - name: "qwen3-asr-1.7b"
+  #   s3_model: "qwen3-asr-1.7b.tar.gz"
+  #   fleet: "x86-g6e12xl-runner"
+  #   extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16"
+  #   test_script: "vllm_asr_benchmark_test.sh"
+  #   test_fixtures:
+  #     - "audio/asr_en.wav"
+  #     - "audio/asr_zh.wav"
+  #   benchmark_audio_fixture: "asr_en.wav"
+  #   min_throughput: 30
+  #   min_rps: 1
+  #   benchmark_profiles: "baseline,high_concurrency,sustained_load,burst"
 
-    - name: "gemma-4-31b-it"
-      s3_model: "gemma-4-31b-it.tar.gz"
-      fleet: "x86-g6e12xl-runner"
-      extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16"
-      input_len: 512
-      output_len: 128
-      num_prompts: 64
-      batch_size: 4
-      min_throughput: 150
-      min_rps: 1.2
+  # - name: "gpt-oss-20b"
+  #   s3_model: "gpt-oss-20b.tar.gz"
+  #   fleet: "x86-g6exl-runner"
+  #   extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16"
+  #   input_len: 512
+  #   output_len: 128
+  #   num_prompts: 64
+  #   batch_size: 4
+  #   min_throughput: 1200
+  #   min_rps: 5
 
-    - name: "gemma-4-e4b-it"
-      s3_model: "gemma-4-e4b-it.tar.gz"
-      fleet: "x86-g6exl-runner"
-      extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16"
-      input_len: 512
-      output_len: 128
-      num_prompts: 64
-      batch_size: 4
-      min_throughput: 680
-      min_rps: 5.3
+  # - name: "gemma-4-26b-a4b-it"
+  #   s3_model: "gemma-4-26b-a4b-it.tar.gz"
+  #   fleet: "x86-g6e12xl-runner"
+  #   extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16"
+  #   input_len: 512
+  #   output_len: 128
+  #   num_prompts: 64
+  #   batch_size: 4
+  #   min_throughput: 300
+  #   min_rps: 2.4
 
-    - name: "qwen3.5-9b"
-      s3_model: "qwen3.5-9b.tar.gz"
-      fleet: "x86-g6xl-runner"
-      extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --enforce-eager"
-      input_len: 512
-      output_len: 128
-      num_prompts: 64
-      batch_size: 4
-      min_throughput: 20
-      min_rps: 0.15
+  # - name: "gemma-4-31b-it"
+  #   s3_model: "gemma-4-31b-it.tar.gz"
+  #   fleet: "x86-g6e12xl-runner"
+  #   extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16"
+  #   input_len: 512
+  #   output_len: 128
+  #   num_prompts: 64
+  #   batch_size: 4
+  #   min_throughput: 150
+  #   min_rps: 1.2
 
-    - name: "llama-3.3-70b"
-      s3_model: "llama-3.3-70b.tar.gz"
-      fleet: "x86-g6e12xl-runner"
-      extra_args: "--tensor-parallel-size 4 --max-model-len 4096"
-      input_len: 512
-      output_len: 128
-      num_prompts: 32
-      batch_size: 2
-      min_throughput: 80
-      min_rps: 0.35
-
-  # https://github.com/vllm-project/vllm/issues/32637
-  # transformer version doesn't support this model
-  # https://github.com/vllm-project/vllm/issues/34098
-  # - name: "glm-4.7-flash"
-  #   s3_model: "glm-4.7-flash.tar.gz"
-  #   fleet: "x86-g6xl-runner"
+  # - name: "gemma-4-e4b-it"
+  #   s3_model: "gemma-4-e4b-it.tar.gz"
+  #   fleet: "x86-g6exl-runner"
   #   extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16"
   #   input_len: 512
   #   output_len: 128
   #   num_prompts: 64
   #   batch_size: 4
+  #   min_throughput: 680
+  #   min_rps: 5.3
+
+  # - name: "qwen3.5-9b"
+  #   s3_model: "qwen3.5-9b.tar.gz"
+  #   fleet: "x86-g6xl-runner"
+  #   extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --enforce-eager"
+  #   input_len: 512
+  #   output_len: 128
+  #   num_prompts: 64
+  #   batch_size: 4
   #   min_throughput: 20
-  #   min_rps: 1
+  #   min_rps: 0.15
+
+  # - name: "llama-3.3-70b"
+  #   s3_model: "llama-3.3-70b.tar.gz"
+  #   fleet: "x86-g6e12xl-runner"
+  #   extra_args: "--tensor-parallel-size 4 --max-model-len 4096"
+  #   input_len: 512
+  #   output_len: 128
+  #   num_prompts: 32
+  #   batch_size: 2
+  #   min_throughput: 80
+  #   min_rps: 0.35
+
+  # - name: "qwen3.5-35b-a3b-fp8"
+  #   s3_model: "qwen3.5-35b-a3b-fp8.tar.gz"
+  #   fleet: "x86-g6e12xl-runner"
+  #   extra_args: "--tensor-parallel-size 4 --max-model-len 4096"
+  #   input_len: 512
+  #   output_len: 128
+  #   num_prompts: 64
+  #   batch_size: 4
+  #   min_throughput: 80
+  #   min_rps: 0.35
+
+  # - name: "qwen3.5-27b-fp8"
+  #   s3_model: "qwen3.5-27b-fp8.tar.gz"
+  #   fleet: "x86-g6e12xl-runner"
+  #   extra_args: "--tensor-parallel-size 4 --max-model-len 4096"
+  #   input_len: 512
+  #   output_len: 128
+  #   num_prompts: 64
+  #   batch_size: 4
+  #   min_throughput: 20
+  #   min_rps: 0.2
+
+  # - name: "qwen3-coder-next-fp8"
+  #   s3_model: "qwen3-coder-next-fp8.tar.gz"
+  #   fleet: "x86-g6e12xl-runner"
+  #   extra_args: "--tensor-parallel-size 4 --max-model-len 4096"
+  #   input_len: 512
+  #   output_len: 256
+  #   num_prompts: 32
+  #   batch_size: 2
+  #   min_throughput: 93
+  #   min_rps: 0.25
 
-    - name: "qwen3.5-35b-a3b-fp8"
-      s3_model: "qwen3.5-35b-a3b-fp8.tar.gz"
-      fleet: "x86-g6e12xl-runner"
-      # https://github.com/vllm-project/vllm/issues/35743 open bug for capturing CUDA graph fails
-      # workaround with --enforce-eager tp=1 fail while tp=4 success
-      extra_args: "--tensor-parallel-size 4 --max-model-len 4096"
+  runner-scale-sets:
+    # --- Qwen 3.5/3.6 benchmark (placeholder thresholds — update after manual run) ---
+    - name: "qwen3.5-0.8b"
+      s3_model: "qwen3.5-0.8b.tar.gz"
+      runner_label: "gpu-l4-1gpu-runners"
+      extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6"
       input_len: 512
       output_len: 128
       num_prompts: 64
       batch_size: 4
-      min_throughput: 80
-      min_rps: 0.35
+      min_throughput: 1
+      min_rps: 0.1
 
-# A100 is compute capability 8.0 — FP8 requires 8.9+ (H100/L40S).
-# The Marlin fallback uses significantly more memory.
-    - name: "qwen3.5-27b-fp8"
-      s3_model: "qwen3.5-27b-fp8.tar.gz"
-      fleet: "x86-g6e12xl-runner"
-      extra_args: "--tensor-parallel-size 4 --max-model-len 4096"
+    - name: "qwen3.5-2b"
+      s3_model: "qwen3.5-2b.tar.gz"
+      runner_label: "gpu-l4-1gpu-runners"
+      extra_args: "--tensor-parallel-size 1 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.6"
       input_len: 512
       output_len: 128
       num_prompts: 64
       batch_size: 4
-      min_throughput: 20
-      min_rps: 0.2
+      min_throughput: 1
+      min_rps: 0.1
 
-    - name: "qwen3-coder-next-fp8"
-      s3_model: "qwen3-coder-next-fp8.tar.gz"
-      fleet: "x86-g6e12xl-runner"
-      extra_args: "--tensor-parallel-size 4 --max-model-len 4096"
-      input_len: 512
-      output_len: 256
-      num_prompts: 32
-      batch_size: 2
-      min_throughput: 93
-      min_rps: 0.25
-
-  runner-scale-sets:
-    - name: "qwen3-32b"
-      s3_model: "qwen3-32b.tar.gz"
-      extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.85"
-      input_len: 512
-      output_len: 256
-      num_prompts: 32
-      batch_size: 2
-      min_throughput: 1133
-      min_rps: 3
-
-    - name: "qwen3.5-35b-a3b-fp8"
-      s3_model: "qwen3.5-35b-a3b-fp8.tar.gz"
-      # https://github.com/vllm-project/vllm/issues/35743 open bug for capturing CUDA graph fails
-      # workaround with --enforce-eager tp=1 fail while tp=4 success
-      extra_args: "--tensor-parallel-size 4 --max-model-len 4096"
+    - name: "qwen3.5-27b-fp8"
+      s3_model: "qwen3.5-27b-fp8.tar.gz"
+      runner_label: "gpu-l40s-4gpu-runners"
+      extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.8"
       input_len: 512
       output_len: 128
       num_prompts: 64
       batch_size: 4
-      min_throughput: 80
-      min_rps: 0.35
+      min_throughput: 1
+      min_rps: 0.1
 
-    - name: "qwen3.5-27b-fp8"
-      s3_model: "qwen3.5-27b-fp8.tar.gz"
-      # A100 lacks native FP8 — vLLM dequantizes to BF16 at load, doubling weight memory
-      extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --enforce-eager"
+    - name: "qwen3.6-27b"
+      s3_model: "qwen3.6-27b.tar.gz"
+      runner_label: "gpu-l40s-4gpu-runners"
+      extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8"
       input_len: 512
       output_len: 128
       num_prompts: 64
       batch_size: 4
-      min_throughput: 20
-      min_rps: 0.2
+      min_throughput: 1
+      min_rps: 0.1
 
-    - name: "qwen3-coder-next-fp8"
-      s3_model: "qwen3-coder-next-fp8.tar.gz"
-      extra_args: "--tensor-parallel-size 4 --max-model-len 4096"
-      input_len: 512
-      output_len: 256
-      num_prompts: 32
-      batch_size: 2
-      min_throughput: 93
-      min_rps: 0.25
-
-    - name: "llama-3.3-70b"
-      s3_model: "llama-3.3-70b.tar.gz"
-      extra_args: "--tensor-parallel-size 4 --max-model-len 4096"
+    - name: "qwen3.6-35b-a3b"
+      s3_model: "qwen3.6-35b-a3b.tar.gz"
+      runner_label: "gpu-l40s-4gpu-runners"
+      extra_args: "--tensor-parallel-size 4 --max-model-len 4096 --dtype bfloat16 --gpu-memory-utilization 0.8"
       input_len: 512
       output_len: 128
-      num_prompts: 32
-      batch_size: 2
-      min_throughput: 80
-      min_rps: 0.35
-
-# upstream
-# facebook/opt-125m
-# meta-llama/Llama-3.2-1B-Instruct
-# Qwen/Qwen3-0.6B
-# fixie-ai/ultravox-v0_5-llama-3_2-1b
-# llava-hf/llava-1.5-7b-hf
-# microsoft/Phi-3.5-vision-instruct
-# openai/whisper-large-v3-turbo
-# jason9693/Qwen2.5-1.5B-apeach
-# intfloat/e5-small
-# BAAI/bge-reranker-v2-m3
-# meta-llama/Llama-3.1-8B-Instruct
+      num_prompts: 64
+      batch_size: 4
+      min_throughput: 1
+      min_rps: 0.1
diff --git a/.github/workflows/reusable-vllm-model-tests.yml b/.github/workflows/reusable-vllm-model-tests.yml
index fede1aadf196..d34b1922146f 100644
--- a/.github/workflows/reusable-vllm-model-tests.yml
+++ b/.github/workflows/reusable-vllm-model-tests.yml
@@ -140,7 +140,7 @@ jobs:
       fail-fast: false
       matrix:
         include: ${{ fromJson(needs.load-models.outputs.runner-scale-sets-matrix) }}
-    runs-on: gpu-efa-runners
+    runs-on: ${{ matrix.runner_label }}
     steps:
       - name: Checkout code
        uses: actions/checkout@v5
@@ -169,7 +169,10 @@ jobs:
       - name: Start container
         run: |
           docker pull ${{ inputs.image-uri }}
-          CONTAINER_ID=$(docker run -d -it --gpus all --entrypoint /bin/bash \
+          # Get GPU UUIDs visible to this pod (k8s assigns a subset of host GPUs)
+          POD_GPUS=$(nvidia-smi --query-gpu=uuid --format=csv,noheader | paste -sd,)
+          echo "Pod GPU UUIDs: ${POD_GPUS}"
+          CONTAINER_ID=$(docker run -d -it --gpus "\"device=${POD_GPUS}\"" --entrypoint /bin/bash \
             --ipc=host --shm-size=10g \
             ${{ inputs.image-uri }})
           echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
@@ -184,7 +187,6 @@ jobs:
           if [ -f "test/vllm/scripts/amzn2023/${{ matrix.test_script || '' }}" ]; then
             docker cp "test/vllm/scripts/amzn2023/${{ matrix.test_script }}" ${CONTAINER_ID}:/models/
           fi
-          rm -rf /dlc-models
 
       - name: Download and copy test fixtures
         if: ${{ matrix.test_fixtures_paths != '' }}
@@ -209,6 +211,4 @@ jobs:
         if: always()
         run: |
           docker stop ${CONTAINER_ID} 2>/dev/null || true
-          docker rm -f ${CONTAINER_ID} 2>/dev/null || true
-          docker rmi ${{ inputs.image-uri }} 2>/dev/null || true
-          rm -rf /dlc-models
+          docker rm -f ${CONTAINER_ID} 2>/dev/null || true
\ No newline at end of file
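
Note on the --gpus change in the "Start container" step: on Kubernetes-backed
runner scale sets a pod is usually allocated only a subset of the host's GPUs,
so "--gpus all" could hand the container devices that belong to neighboring
pods. Querying nvidia-smi for the UUIDs visible to the pod and passing them as
an explicit device list pins the container to exactly the allocated GPUs. A
minimal standalone sketch of the same technique, assuming the NVIDIA Container
Toolkit on the host and nvidia-smi available in the runner environment (the
CUDA image tag below is illustrative only):

    # Gather the UUIDs of the GPUs visible here (one per line) and join them
    # into a single comma-separated string.
    POD_GPUS=$(nvidia-smi --query-gpu=uuid --format=csv,noheader | paste -sd,)

    # The escaped inner quotes matter: Docker parses the --gpus value as CSV,
    # so a multi-device list must arrive as one quoted device=... option.
    docker run --rm --gpus "\"device=${POD_GPUS}\"" \
        nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi -L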