diff --git a/.github/actions/build-image/action.yml b/.github/actions/build-image/action.yml index 62e3374be9bd..027177f5e485 100644 --- a/.github/actions/build-image/action.yml +++ b/.github/actions/build-image/action.yml @@ -69,6 +69,10 @@ inputs: description: 'Transformers library version (e.g., 4.28.1)' required: false default: '' + runtime-base: # optional; exported to the build step as RUNTIME_BASE + description: 'Pre-built runtime base image URI. When set, skips compile stages.' + required: false + default: '' outputs: image-uri: @@ -120,3 +124,4 @@ runs: INFERENCE_TOOLKIT_VERSION: ${{ inputs.inference-toolkit-version }} TORCHSERVE_VERSION: ${{ inputs.torchserve-version }} TRANSFORMERS_VERSION: ${{ inputs.transformers-version }} + RUNTIME_BASE: ${{ inputs.runtime-base }} diff --git a/.github/config/vllm-omni-ec2-amzn2023.yml b/.github/config/vllm-omni-ec2-amzn2023.yml new file mode 100644 index 000000000000..00f051d150a1 --- /dev/null +++ b/.github/config/vllm-omni-ec2-amzn2023.yml @@ -0,0 +1,26 @@ +# vLLM-Omni EC2 AL2023 Image Configuration + +image: + name: "vllm-omni-ec2-amzn2023" + description: "vLLM-Omni for EC2 instances (AL2023, omni-modality serving)" + +common: + framework: "vllm-omni" + framework_version: "0.18.0" # also embedded in the PR workflow's CI image tag and runtime-base tag + job_type: "general" + python_version: "py312" + cuda_version: "cu129" + os_version: "amzn2023" + customer_type: "ec2" + arch_type: "x86" + prod_image: "vllm-omni:0.18-gpu-py312-ec2" + device_type: "gpu" + contributor: "None" + +release: + release: false + force_release: false + public_registry: false + private_registry: true + enable_soci: true + environment: production diff --git a/.github/config/vllm-omni-model-tests.yml b/.github/config/vllm-omni-model-tests.yml new file mode 100644 index 000000000000..f093bf77c2d8 --- /dev/null +++ b/.github/config/vllm-omni-model-tests.yml @@ -0,0 +1,57 @@ +# vLLM-Omni Model Test Configuration +# Tests for omni-modality models (TTS, image generation, video, omni-chat) +# +# Each model defines its test_request (sent to /invocations via middleware) +# 
and the route for the SageMaker routing middleware. +# +# Models use s3_model (pre-cached in S3) downloaded by the download-model action. + +s3_prefix: "s3://dlc-cicd-models/omni-models" # load-models joins this with each s3_model as <s3_prefix>/<s3_model> + +smoke-test: + codebuild-fleet: + # --- TTS models (route: /v1/audio/speech) --- + - name: "qwen3-tts-1.7b-customvoice" + s3_model: "qwen3-tts-1.7b-customvoice.tar.gz" + fleet: "x86-g6xl-runner" + extra_args: "" + route: "/v1/audio/speech" + test_request: '{"input": "Hello, how are you?", "voice": "vivian", "language": "English"}' + validate: "binary_size_gt:1000" + + # --- Image generation models (route: /v1/images/generations) --- + - name: "flux2-klein-4b" + s3_model: "flux2-klein-4b.tar.gz" + fleet: "x86-g6xl-runner" + extra_args: "" + route: "/v1/images/generations" + test_request: '{"prompt": "a red apple on a white table", "size": "512x512", "n": 1}' + validate: "json_field:data[0].b64_json" + + # --- Video generation models (route: /v1/videos) --- + - name: "wan2.1-t2v-1.3b" + s3_model: "wan2.1-t2v-1.3b.tar.gz" + fleet: "x86-g6exl-runner" + extra_args: "" + route: "/v1/videos" + content_type: "multipart/form-data" # optional; the smoke-test step defaults to application/json when omitted + test_request: 'prompt=a dog running on a beach&num_frames=17&num_inference_steps=4&size=480x320&seed=42' + validate: "json_field:id" + + # --- Omni chat models (route: /v1/chat/completions, fallthrough) --- + # model is big, won't run for now + # - name: "bagel-7b-mot" + # s3_model: "bagel-7b-mot.tar.gz" + # fleet: "x86-g6e4xl-runner" + # extra_args: "" + # route: "/v1/chat/completions" + # test_request: '{"messages": [{"role": "user", "content": [{"type": "text", "text": "<|im_start|>A cute cat<|im_end|>"}]}], "modalities": ["image"], "height": 512, "width": 512, "num_inference_steps": 4, "seed": 42}' + # validate: "json_field:choices[0].message.content" + + - name: "qwen2.5-omni-3b" + s3_model: "qwen2.5-omni-3b.tar.gz" + fleet: "x86-g6e12xl-runner" + extra_args: "" + route: "/v1/chat/completions" + test_request: '{"messages": [{"role": "user", "content": 
"Say hello in one sentence."}], "max_tokens": 64}' + validate: "json_field:choices[0].message.content" diff --git a/.github/config/vllm-omni-sagemaker-amzn2023.yml b/.github/config/vllm-omni-sagemaker-amzn2023.yml new file mode 100644 index 000000000000..87b9e3b35f17 --- /dev/null +++ b/.github/config/vllm-omni-sagemaker-amzn2023.yml @@ -0,0 +1,26 @@ +# vLLM-Omni SageMaker AL2023 Image Configuration + +image: + name: "vllm-omni-sagemaker-amzn2023" + description: "vLLM-Omni for SageMaker (AL2023, omni-modality serving)" + +common: + framework: "vllm-omni" + framework_version: "0.18.0" # also embedded in the PR workflow's CI image tag and runtime-base tag + job_type: "general" + python_version: "py312" + cuda_version: "cu129" + os_version: "amzn2023" + customer_type: "sagemaker" + arch_type: "x86" + prod_image: "vllm-omni:0.18-gpu-py312-sagemaker" + device_type: "gpu" + contributor: "None" + +release: + release: false + force_release: false + public_registry: false + private_registry: true + enable_soci: true + environment: production diff --git a/.github/scripts/build_image.sh b/.github/scripts/build_image.sh index 224712f97e7e..4aca4dfc3dbd 100755 --- a/.github/scripts/build_image.sh +++ b/.github/scripts/build_image.sh @@ -26,6 +26,7 @@ CUSTOMER_TYPE="${CUSTOMER_TYPE:-}" INFERENCE_TOOLKIT_VERSION="${INFERENCE_TOOLKIT_VERSION:-}" TORCHSERVE_VERSION="${TORCHSERVE_VERSION:-}" TRANSFORMERS_VERSION="${TRANSFORMERS_VERSION:-}" +RUNTIME_BASE="${RUNTIME_BASE:-}" # Resolve image URI CI_IMAGE_URI="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/ci:${TAG_PR}" @@ -67,6 +68,13 @@ BUILD_CMD="docker buildx build --progress plain \ --build-arg FRAMEWORK=\"${FRAMEWORK}\" \ --build-arg FRAMEWORK_VERSION=\"${FRAMEWORK_VERSION}\"" +# When RUNTIME_BASE is non-empty, forward it as a build arg so the build starts from the pre-built runtime image (skips compile stages) +if [[ -n "${RUNTIME_BASE}" ]]; then + echo "Using pre-built runtime base: ${RUNTIME_BASE}" + BUILD_CMD="${BUILD_CMD} \ + --build-arg RUNTIME_BASE=\"${RUNTIME_BASE}\"" +fi + # Add SageMaker labels if customer-type is 'sagemaker' if [[ "${CUSTOMER_TYPE}" == 
"sagemaker" ]]; then BUILD_CMD="${BUILD_CMD} \ diff --git a/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml b/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml new file mode 100644 index 000000000000..924ddf62fe80 --- /dev/null +++ b/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml @@ -0,0 +1,276 @@ +name: PR - vLLM-Omni EC2 AMZN2023 + +on: + pull_request: + branches: [main] + types: [opened, reopened, synchronize] + paths: + - "docker/vllm/Dockerfile.amzn2023" + - "scripts/vllm/omni_*" + - "scripts/common/**" + - "scripts/telemetry/**" + - ".github/config/vllm-omni-ec2-amzn2023.yml" + - ".github/config/vllm-omni-model-tests.yml" + - ".github/workflows/pr-vllm-omni-ec2-amzn2023.yml" + - ".github/workflows/reusable-vllm-omni-model-tests.yml" + - "test/vllm-omni/**" + - "test/telemetry/**" + +permissions: + contents: read + pull-requests: read + +env: + FORCE_COLOR: "1" + CONFIG_FILE: ".github/config/vllm-omni-ec2-amzn2023.yml" + +jobs: + gatekeeper: + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-gate-${{ github.event.pull_request.number }} + cancel-in-progress: true + steps: + - name: Checkout base branch (safe) + uses: actions/checkout@v5 + with: + ref: ${{ github.event.pull_request.base.sha }} + fetch-depth: 1 + + - name: Run permission gate (from base) + uses: ./.github/actions/pr-permission-gate + + load-config: # parses CONFIG_FILE and republishes its "common" fields as job outputs + needs: [gatekeeper] + if: success() + runs-on: ubuntu-latest + outputs: + framework: ${{ steps.parse.outputs.framework }} + framework-version: ${{ steps.parse.outputs.framework-version }} + python-version: ${{ steps.parse.outputs.python-version }} + cuda-version: ${{ steps.parse.outputs.cuda-version }} + os-version: ${{ steps.parse.outputs.os-version }} + container-type: ${{ steps.parse.outputs.container-type }} + device-type: ${{ steps.parse.outputs.device-type }} + arch-type: ${{ steps.parse.outputs.arch-type }} + contributor: ${{ steps.parse.outputs.contributor }} + customer-type: ${{ steps.parse.outputs.customer-type }} + 
prod-image: ${{ steps.parse.outputs.prod-image }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Load configuration + id: load + uses: ./.github/actions/load-config + with: + config-file: ${{ env.CONFIG_FILE }} + + - name: Parse configuration + id: parse + env: # hand the JSON to the script through the environment so it is never expanded into the script text + CONFIG_JSON: ${{ steps.load.outputs.config }} + run: | + printf '%s\n' "$CONFIG_JSON" > config.json + echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT + echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT + echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT + echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT + echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT + echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT + echo "device-type=$(jq -r '.common.device_type // "gpu"' config.json)" >> $GITHUB_OUTPUT + echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT + echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT + echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT + echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> $GITHUB_OUTPUT + + check-changes: + needs: [gatekeeper] + if: success() + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-check-changes-${{ github.event.pull_request.number }} + cancel-in-progress: true + outputs: + build-change: ${{ steps.changes.outputs.build-change }} + telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} + steps: + - name: Checkout DLC source + uses: actions/checkout@v5 + + - name: Setup python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Run pre-commit + uses: pre-commit/action@v3.0.1 + with: + extra_args: --all-files + + - name: Detect file changes + id: changes + uses: dorny/paths-filter@v3 + with: + filters: | + 
build-change: + - "docker/vllm/Dockerfile.amzn2023" + - "scripts/vllm/omni_*" + - "scripts/common/**" + - "scripts/telemetry/**" + - ".github/config/vllm-omni-ec2-amzn2023.yml" + - ".github/config/vllm-omni-model-tests.yml" + - "test/vllm-omni/**" + telemetry-test-change: + - "test/telemetry/**" + + build-runtime: # reuses the runtime-base image when its tag already exists in the registry, otherwise builds and pushes it + needs: [check-changes, load-config] + if: needs.check-changes.outputs.build-change == 'true' + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:x86-vllm-build-runner + buildspec-override:true + timeout-minutes: 720 # 12 hours + outputs: + runtime-base: ${{ steps.check.outputs.image }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Setup buildkitd + run: .github/scripts/buildkitd.sh + + - name: ECR login + uses: ./.github/actions/ecr-authenticate + with: + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + + - name: Check or build runtime base # NOTE(review): the tag encodes only the framework, CUDA and Python versions, so an existing image is reused even after Dockerfile.amzn2023 changes + id: check + run: | + TAG="vllm-runtime-v${{ needs.load-config.outputs.framework-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.python-version }}" + IMAGE="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:${TAG}" + echo "image=${IMAGE}" >> $GITHUB_OUTPUT + + # Skip build if image already exists + if docker manifest inspect "${IMAGE}" >/dev/null 2>&1; then + echo "Runtime base exists: ${IMAGE}" + exit 0 + fi + + echo "Building runtime base: ${IMAGE}" + docker buildx build --progress plain \ + --target runtime-build \ + --tag "${IMAGE}" \ + --push \ + -f docker/vllm/Dockerfile.amzn2023 . 
+ + build-image: # builds the PR image, starting from the runtime base reported by build-runtime + needs: [check-changes, load-config, build-runtime] + if: needs.check-changes.outputs.build-change == 'true' + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:x86-vllm-build-runner + buildspec-override:true + timeout-minutes: 720 # 12 hours + concurrency: + group: ${{ github.workflow }}-build-image-${{ github.event.pull_request.number }} + cancel-in-progress: true + outputs: + ci-image: ${{ steps.build.outputs.image-uri }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Build image + id: build + uses: ./.github/actions/build-image + with: + framework: ${{ needs.load-config.outputs.framework }} + target: vllm-omni-ec2-amzn2023 + base-image: nvidia/cuda:12.9.1-devel-amzn2023 + framework-version: ${{ needs.load-config.outputs.framework-version }} + container-type: ${{ needs.load-config.outputs.container-type }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + tag-pr: vllm-omni-${{ needs.load-config.outputs.framework-version }}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-ec2-pr-${{ github.event.pull_request.number }} + dockerfile-path: docker/vllm/Dockerfile.amzn2023 + arch-type: ${{ needs.load-config.outputs.arch-type }} + device-type: ${{ needs.load-config.outputs.device-type }} + cuda-version: ${{ needs.load-config.outputs.cuda-version }} + python-version: ${{ needs.load-config.outputs.python-version }} + os-version: ${{ needs.load-config.outputs.os-version }} + contributor: ${{ needs.load-config.outputs.contributor }} + customer-type: ${{ needs.load-config.outputs.customer-type }} + runtime-base: ${{ needs.build-runtime.outputs.runtime-base }} # image URI emitted by the build-runtime job + + sanity-test: + needs: [check-changes, build-image, load-config] + if: | + always() && !failure() && !cancelled() && + needs.check-changes.outputs.build-change == 'true' + concurrency: + group: ${{ github.workflow 
}}-sanity-test-${{ github.event.pull_request.number }} + cancel-in-progress: true + uses: ./.github/workflows/reusable-sanity-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + framework: ${{ needs.load-config.outputs.framework }} + framework-version: ${{ needs.load-config.outputs.framework-version }} + python-version: ${{ needs.load-config.outputs.python-version }} + cuda-version: ${{ needs.load-config.outputs.cuda-version }} + os-version: ${{ needs.load-config.outputs.os-version }} + customer-type: ${{ needs.load-config.outputs.customer-type }} + arch-type: ${{ needs.load-config.outputs.arch-type }} + device-type: ${{ needs.load-config.outputs.device-type }} + contributor: ${{ needs.load-config.outputs.contributor }} + container-type: ${{ needs.load-config.outputs.container-type }} + + security-test: + needs: [build-image, load-config] + if: success() + concurrency: + group: ${{ github.workflow }}-security-test-${{ github.event.pull_request.number }} + cancel-in-progress: true + uses: ./.github/workflows/reusable-security-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + framework: ${{ needs.load-config.outputs.framework }} + framework-version: ${{ needs.load-config.outputs.framework-version }} + + telemetry-test: # NOTE(review): also runs when only test/telemetry/** changed; build-image is skipped then and ci-image is empty - confirm the reusable workflow handles an empty image-uri + needs: [check-changes, build-image, load-config] + if: | + always() && !failure() && !cancelled() && + (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true') + concurrency: + group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }} + cancel-in-progress: false # unlike the other test jobs, a running telemetry test is not cancelled by a newer run + uses: ./.github/workflows/reusable-telemetry-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ 
vars.AWS_REGION }} + framework: ${{ needs.load-config.outputs.framework }} + framework-version: ${{ needs.load-config.outputs.framework-version }} + container-type: ${{ needs.load-config.outputs.container-type }} + + omni-model-smoke-tests: + needs: [build-image, load-config] + if: success() + concurrency: + group: ${{ github.workflow }}-omni-model-tests-${{ github.event.pull_request.number }} + cancel-in-progress: true + uses: ./.github/workflows/reusable-vllm-omni-model-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + customer-type: ${{ needs.load-config.outputs.customer-type }} + secrets: inherit diff --git a/.github/workflows/pr-vllm-omni-sagemaker-amzn2023.yml b/.github/workflows/pr-vllm-omni-sagemaker-amzn2023.yml new file mode 100644 index 000000000000..6eaec90c0a40 --- /dev/null +++ b/.github/workflows/pr-vllm-omni-sagemaker-amzn2023.yml @@ -0,0 +1,316 @@ +name: PR - vLLM-Omni SageMaker AMZN2023 + +on: + pull_request: + branches: [main] + types: [opened, reopened, synchronize] + paths: + - "docker/vllm/Dockerfile.amzn2023" + - "scripts/vllm/omni_*" + - "scripts/common/**" + - "scripts/telemetry/**" + - ".github/config/vllm-omni-sagemaker-amzn2023.yml" + - ".github/config/vllm-omni-model-tests.yml" + - ".github/workflows/pr-vllm-omni-sagemaker-amzn2023.yml" + - ".github/workflows/reusable-vllm-omni-model-tests.yml" + - "test/vllm-omni/**" + - "test/telemetry/**" + +permissions: + contents: read + pull-requests: read + +env: + FORCE_COLOR: "1" + CONFIG_FILE: ".github/config/vllm-omni-sagemaker-amzn2023.yml" + +jobs: + gatekeeper: + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-gate-${{ github.event.pull_request.number }} + cancel-in-progress: true + steps: + - name: Checkout base branch (safe) + uses: actions/checkout@v5 + with: + ref: ${{ github.event.pull_request.base.sha }} + fetch-depth: 1 + + - name: Run permission gate 
(from base) + uses: ./.github/actions/pr-permission-gate + + load-config: # parses CONFIG_FILE and republishes its "common" fields as job outputs + needs: [gatekeeper] + if: success() + runs-on: ubuntu-latest + outputs: + framework: ${{ steps.parse.outputs.framework }} + framework-version: ${{ steps.parse.outputs.framework-version }} + python-version: ${{ steps.parse.outputs.python-version }} + cuda-version: ${{ steps.parse.outputs.cuda-version }} + os-version: ${{ steps.parse.outputs.os-version }} + container-type: ${{ steps.parse.outputs.container-type }} + device-type: ${{ steps.parse.outputs.device-type }} + arch-type: ${{ steps.parse.outputs.arch-type }} + contributor: ${{ steps.parse.outputs.contributor }} + customer-type: ${{ steps.parse.outputs.customer-type }} + prod-image: ${{ steps.parse.outputs.prod-image }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Load configuration + id: load + uses: ./.github/actions/load-config + with: + config-file: ${{ env.CONFIG_FILE }} + + - name: Parse configuration + id: parse + env: # hand the JSON to the script through the environment so it is never expanded into the script text + CONFIG_JSON: ${{ steps.load.outputs.config }} + run: | + printf '%s\n' "$CONFIG_JSON" > config.json + echo "framework=$(jq -r '.common.framework' config.json)" >> $GITHUB_OUTPUT + echo "framework-version=$(jq -r '.common.framework_version' config.json)" >> $GITHUB_OUTPUT + echo "python-version=$(jq -r '.common.python_version' config.json)" >> $GITHUB_OUTPUT + echo "cuda-version=$(jq -r '.common.cuda_version' config.json)" >> $GITHUB_OUTPUT + echo "os-version=$(jq -r '.common.os_version' config.json)" >> $GITHUB_OUTPUT + echo "container-type=$(jq -r '.common.job_type' config.json)" >> $GITHUB_OUTPUT + echo "device-type=$(jq -r '.common.device_type // "gpu"' config.json)" >> $GITHUB_OUTPUT + echo "arch-type=$(jq -r '.common.arch_type // "x86"' config.json)" >> $GITHUB_OUTPUT + echo "contributor=$(jq -r '.common.contributor // "None"' config.json)" >> $GITHUB_OUTPUT + echo "customer-type=$(jq -r '.common.customer_type // ""' config.json)" >> $GITHUB_OUTPUT + echo "prod-image=$(jq -r '.common.prod_image' config.json)" >> 
$GITHUB_OUTPUT + + check-changes: # runs pre-commit and classifies the changed paths for the jobs below + needs: [gatekeeper] + if: success() + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-check-changes-${{ github.event.pull_request.number }} + cancel-in-progress: true + outputs: + build-change: ${{ steps.changes.outputs.build-change }} # gates build-runtime and build-image + telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }} + steps: + - name: Checkout DLC source + uses: actions/checkout@v5 + + - name: Setup python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Run pre-commit + uses: pre-commit/action@v3.0.1 + with: + extra_args: --all-files + + - name: Detect file changes + id: changes + uses: dorny/paths-filter@v3 + with: + filters: | + build-change: + - "docker/vllm/Dockerfile.amzn2023" + - "scripts/vllm/omni_*" + - "scripts/common/**" + - "scripts/telemetry/**" + - ".github/config/vllm-omni-sagemaker-amzn2023.yml" + - ".github/config/vllm-omni-model-tests.yml" + - "test/vllm-omni/**" + telemetry-test-change: + - "test/telemetry/**" + + build-runtime: # reuses the runtime-base image when its tag already exists in the registry, otherwise builds and pushes it + needs: [check-changes, load-config] + if: needs.check-changes.outputs.build-change == 'true' + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:x86-vllm-build-runner + buildspec-override:true + timeout-minutes: 720 # 12 hours + outputs: + runtime-base: ${{ steps.check.outputs.image }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Setup buildkitd + run: .github/scripts/buildkitd.sh + + - name: ECR login + uses: ./.github/actions/ecr-authenticate + with: + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + + - name: Check or build runtime base # NOTE(review): the tag encodes only the framework, CUDA and Python versions, so an existing image is reused even after Dockerfile.amzn2023 changes + id: check + run: | + TAG="vllm-runtime-v${{ needs.load-config.outputs.framework-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.python-version }}" + IMAGE="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:${TAG}" + echo "image=${IMAGE}" >> 
$GITHUB_OUTPUT + + if docker manifest inspect "${IMAGE}" >/dev/null 2>&1; then + echo "Runtime base exists: ${IMAGE}" + exit 0 + fi + + echo "Building runtime base: ${IMAGE}" + docker buildx build --progress plain \ + --target runtime-build \ + --tag "${IMAGE}" \ + --push \ + -f docker/vllm/Dockerfile.amzn2023 . + + build-image: # builds the PR image, starting from the runtime base reported by build-runtime + needs: [check-changes, load-config, build-runtime] + if: needs.check-changes.outputs.build-change == 'true' + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:x86-vllm-build-runner + buildspec-override:true + timeout-minutes: 720 # 12 hours + concurrency: + group: ${{ github.workflow }}-build-image-${{ github.event.pull_request.number }} + cancel-in-progress: true # a newer run for the same workflow and PR cancels this build + outputs: + ci-image: ${{ steps.build.outputs.image-uri }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Build image + id: build + uses: ./.github/actions/build-image + with: + framework: ${{ needs.load-config.outputs.framework }} + target: vllm-omni-sagemaker-amzn2023 + base-image: nvidia/cuda:12.9.1-devel-amzn2023 + framework-version: ${{ needs.load-config.outputs.framework-version }} + container-type: ${{ needs.load-config.outputs.container-type }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + tag-pr: vllm-omni-${{ needs.load-config.outputs.framework-version }}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-pr-${{ github.event.pull_request.number }} + dockerfile-path: docker/vllm/Dockerfile.amzn2023 + arch-type: ${{ needs.load-config.outputs.arch-type }} + device-type: ${{ needs.load-config.outputs.device-type }} + cuda-version: ${{ needs.load-config.outputs.cuda-version }} + python-version: ${{ needs.load-config.outputs.python-version }} + os-version: ${{ needs.load-config.outputs.os-version }} + contributor: ${{ needs.load-config.outputs.contributor }} + customer-type: ${{ 
needs.load-config.outputs.customer-type }} + runtime-base: ${{ needs.build-runtime.outputs.runtime-base }} # image URI emitted by the build-runtime job + + sanity-test: + needs: [check-changes, build-image, load-config] + if: | + always() && !failure() && !cancelled() && + needs.check-changes.outputs.build-change == 'true' + concurrency: + group: ${{ github.workflow }}-sanity-test-${{ github.event.pull_request.number }} + cancel-in-progress: true + uses: ./.github/workflows/reusable-sanity-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + framework: ${{ needs.load-config.outputs.framework }} + framework-version: ${{ needs.load-config.outputs.framework-version }} + python-version: ${{ needs.load-config.outputs.python-version }} + cuda-version: ${{ needs.load-config.outputs.cuda-version }} + os-version: ${{ needs.load-config.outputs.os-version }} + customer-type: ${{ needs.load-config.outputs.customer-type }} + arch-type: ${{ needs.load-config.outputs.arch-type }} + device-type: ${{ needs.load-config.outputs.device-type }} + contributor: ${{ needs.load-config.outputs.contributor }} + container-type: ${{ needs.load-config.outputs.container-type }} + + security-test: + needs: [build-image, load-config] + if: success() + concurrency: + group: ${{ github.workflow }}-security-test-${{ github.event.pull_request.number }} + cancel-in-progress: true + uses: ./.github/workflows/reusable-security-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + framework: ${{ needs.load-config.outputs.framework }} + framework-version: ${{ needs.load-config.outputs.framework-version }} + + telemetry-test: # NOTE(review): also runs when only test/telemetry/** changed; build-image is skipped then and ci-image is empty - confirm the reusable workflow handles an empty image-uri + needs: [check-changes, build-image, load-config] + if: | + always() && !failure() && !cancelled() && + (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true') 
+ concurrency: + group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }} + cancel-in-progress: false # unlike the other test jobs, a running telemetry test is not cancelled by a newer run + uses: ./.github/workflows/reusable-telemetry-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + framework: ${{ needs.load-config.outputs.framework }} + framework-version: ${{ needs.load-config.outputs.framework-version }} + container-type: ${{ needs.load-config.outputs.container-type }} + + omni-model-smoke-tests: + needs: [build-image, load-config] + if: success() + concurrency: + group: ${{ github.workflow }}-omni-model-tests-${{ github.event.pull_request.number }} + cancel-in-progress: true + uses: ./.github/workflows/reusable-vllm-omni-model-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + customer-type: ${{ needs.load-config.outputs.customer-type }} + secrets: inherit # the reusable workflow declares an optional HF_TOKEN secret + + sagemaker-endpoint-test: + needs: [build-image, load-config] + if: success() + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:default-runner + buildspec-override:true + concurrency: + group: ${{ github.workflow }}-sm-endpoint-${{ github.event.pull_request.number }} + cancel-in-progress: true + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Install test dependencies + run: | + uv venv --python 3.12 + source .venv/bin/activate + uv pip install -r test/requirements.txt + uv pip install -r test/vllm-omni/sagemaker/requirements.txt + + - name: Run SageMaker endpoint test + run: | + source .venv/bin/activate + cd test/ + python3 -m pytest -vs -rA --image-uri ${{ needs.build-image.outputs.ci-image }} vllm-omni/sagemaker + + - name: Cleanup orphaned endpoints # NOTE(review): deletes every InService endpoint whose name contains 'vllm-omni', not only this run's - confirm it cannot remove endpoints of concurrent runs + if: always() + run: | + source .venv/bin/activate + python3 -c " + import boto3 + sm = boto3.client('sagemaker') + for ep in 
sm.list_endpoints(NameContains='vllm-omni', StatusEquals='InService').get('Endpoints', []): + name = ep['EndpointName'] + print(f'Deleting orphaned endpoint: {name}') + sm.delete_endpoint(EndpointName=name) + sm.delete_endpoint_config(EndpointConfigName=name) + " diff --git a/.github/workflows/reusable-vllm-omni-model-tests.yml b/.github/workflows/reusable-vllm-omni-model-tests.yml new file mode 100644 index 000000000000..dad843e1c853 --- /dev/null +++ b/.github/workflows/reusable-vllm-omni-model-tests.yml @@ -0,0 +1,186 @@ +name: Reusable vLLM-Omni Model Smoke Tests + +permissions: + contents: read + +on: + workflow_call: + inputs: + image-uri: + description: "Image URI to test" + required: true + type: string + aws-account-id: + description: "AWS account ID for ECR authentication" + required: true + type: string + aws-region: + description: "AWS region for ECR authentication" + required: true + type: string + customer-type: + description: "Customer type: ec2 or sagemaker" + required: true + type: string + secrets: + HF_TOKEN: + description: "HuggingFace token for downloading models" + required: false + +jobs: + load-models: # turns .github/config/vllm-omni-model-tests.yml into the smoke-test matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.parse.outputs.matrix }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Parse model config + id: parse + run: | + python3 -c " + import yaml, json + with open('.github/config/vllm-omni-model-tests.yml') as f: + cfg = yaml.safe_load(f) + prefix = cfg.get('s3_prefix', '') + models = cfg.get('smoke-test', {}).get('codebuild-fleet', []) + for m in models: + if 's3_model' in m: + m['s3_path'] = prefix + '/' + m.pop('s3_model') + m['model_source'] = 's3' + elif 'hf_model' in m: + m['model_source'] = 'hf' + print(f'matrix={json.dumps(models)}') + " >> "$GITHUB_OUTPUT" + + smoke-test: + name: smoke-test (${{ matrix.model.name }}) + needs: load-models + if: needs.load-models.outputs.matrix != '[]' + strategy: + fail-fast: false + matrix: + model: ${{ 
fromJson(needs.load-models.outputs.matrix) }} + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:${{ matrix.model.fleet }} + buildspec-override:true + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: ECR login + uses: ./.github/actions/ecr-authenticate + with: + aws-account-id: ${{ inputs.aws-account-id }} + aws-region: ${{ inputs.aws-region }} + image-uri: ${{ inputs.image-uri }} + + - name: Download model from S3 + if: matrix.model.model_source == 's3' + uses: ./.github/actions/download-model + id: model + with: + s3-path: ${{ matrix.model.s3_path }} + model-name: ${{ matrix.model.name }} + + - name: Resolve model path + id: resolve + run: | + if [ "${{ matrix.model.model_source }}" = "s3" ]; then + echo "model_path=/models/${{ matrix.model.name }}" >> $GITHUB_OUTPUT + echo "volume=-v /dlc-models:/models" >> $GITHUB_OUTPUT + else + echo "model_path=${{ matrix.model.hf_model }}" >> $GITHUB_OUTPUT + echo "volume=" >> $GITHUB_OUTPUT + fi + + # EC2: entrypoint accepts CLI args directly + - name: Start container (EC2) + if: inputs.customer-type == 'ec2' + env: # keep the token out of the script text and command line; "-e HF_TOKEN" below forwards it by name + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + docker pull ${{ inputs.image-uri }} + CONTAINER_ID=$(docker run -d --gpus all --shm-size=4g \ + ${{ steps.resolve.outputs.volume }} \ + -e HF_TOKEN \ + -p 8080:8080 \ + ${{ inputs.image-uri }} \ + --model ${{ steps.resolve.outputs.model_path }} \ + --port 8080 \ + --stage-init-timeout 600 \ + ${{ matrix.model.extra_args }}) + echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV + + - name: Convert extra_args to SM env vars + if: inputs.customer-type == 'sagemaker' + id: sm-env + run: | + # Convert --key value pairs to SM_VLLM_KEY=value env vars + EXTRA_ENV="" + ARGS="${{ matrix.model.extra_args }}" + while [[ -n "$ARGS" ]]; do + if [[ "$ARGS" =~ ^--([a-z][a-z0-9-]*)[[:space:]]*(.*) ]]; then + KEY=$(echo "${BASH_REMATCH[1]}" | tr '-' '_' | tr '[:lower:]' '[:upper:]') + REST="${BASH_REMATCH[2]}" + if [[ "$REST" =~ ^--[a-z] ]] || [[ 
-z "$REST" ]]; then + EXTRA_ENV="$EXTRA_ENV -e SM_VLLM_${KEY}=true" + ARGS="$REST" + else + VALUE="${REST%% --*}" + EXTRA_ENV="$EXTRA_ENV -e SM_VLLM_${KEY}=${VALUE}" + ARGS="${REST#"$VALUE"}" + fi + ARGS="${ARGS# }" + else + break + fi + done + echo "env_flags=$EXTRA_ENV" >> $GITHUB_OUTPUT + + # SageMaker: entrypoint reads SM_VLLM_* env vars + - name: Start container (SageMaker) + if: inputs.customer-type == 'sagemaker' + env: # keep the token out of the script text and command line; "-e HF_TOKEN" below forwards it by name + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + docker pull ${{ inputs.image-uri }} + CONTAINER_ID=$(docker run -d --gpus all --shm-size=4g \ + ${{ steps.resolve.outputs.volume }} \ + -e SM_VLLM_MODEL=${{ steps.resolve.outputs.model_path }} \ + -e SM_VLLM_STAGE_INIT_TIMEOUT=600 \ + -e HF_TOKEN \ + ${{ steps.sm-env.outputs.env_flags }} \ + -p 8080:8080 \ + ${{ inputs.image-uri }}) + echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV + + - name: Copy test scripts + run: | + docker cp test/vllm-omni/scripts/vllm_omni_${{ inputs.customer-type }}_smoke_test.sh \ + ${CONTAINER_ID}:/tmp/smoke_test.sh + + - name: Run smoke test + env: # pass the request body by value so quotes inside it cannot break the shell command + TEST_REQUEST: ${{ matrix.model.test_request }} + run: | + docker exec ${CONTAINER_ID} bash /tmp/smoke_test.sh \ + "${{ matrix.model.route }}" \ + "$TEST_REQUEST" \ + "${{ matrix.model.validate }}" \ + "${{ matrix.model.content_type || 'application/json' }}" + + - name: Dump container logs + if: always() + run: | + docker logs ${CONTAINER_ID} 2>&1 | tail -500 || true + + - name: Cleanup + if: always() + run: | + kill ${{ steps.model.outputs.lock-pid }} 2>/dev/null || true + docker stop ${CONTAINER_ID} 2>/dev/null || true + docker rm -f ${CONTAINER_ID} 2>/dev/null || true + docker rmi ${{ inputs.image-uri }} 2>/dev/null || true diff --git a/docker/sglang/Dockerfile.amzn2023 b/docker/sglang/Dockerfile.amzn2023 index 121ca2d22d2f..901ca7f8d0b4 100644 --- a/docker/sglang/Dockerfile.amzn2023 +++ b/docker/sglang/Dockerfile.amzn2023 @@ -241,7 +241,8 @@ RUN uv pip install --system --no-cache \ "pillow>=12.1.1" \ "python_multipart>=0.0.22" \ "xgrammar>=0.1.32" \ - "setuptools>=78.1.1" 
+ "setuptools>=78.1.1" \ + "aiohttp>=3.13.4" # Re-pin NCCL/cuDNN/cuSparseLt after CVE patches (transitive deps may downgrade or remove them) # cuSparseLt installed without --no-deps in case it wasn't present from builder diff --git a/docker/vllm/Dockerfile.amzn2023 b/docker/vllm/Dockerfile.amzn2023 index 2c580138665a..410e721df89d 100644 --- a/docker/vllm/Dockerfile.amzn2023 +++ b/docker/vllm/Dockerfile.amzn2023 @@ -1,6 +1,11 @@ ARG CUDA_VERSION=12.9.1 ARG PYTHON_VERSION=3.12 +# Pre-built runtime image. When set, skips the compile stages (source/build/deps) +# and uses this image directly as the runtime base. Build it with: +# docker buildx build --target runtime --tag /vllm-runtime: --push ... +ARG RUNTIME_BASE="" + # ============================================================================= # STAGE 0: source — clone vLLM and apply patches # ============================================================================= @@ -201,8 +206,9 @@ RUN PATH="/opt/venv/bin:${PATH}" bash /tmp/setup_oss_compliance.sh python${PYTHO # ============================================================================= # STAGE 3: runtime — minimal image with clean venv +# Built from scratch (compile path) or pulled from pre-built RUNTIME_BASE. 
# ============================================================================= -FROM nvidia/cuda:${CUDA_VERSION}-runtime-amzn2023 AS runtime +FROM nvidia/cuda:${CUDA_VERSION}-runtime-amzn2023 AS runtime-build ARG CUDA_VERSION ARG PYTHON_VERSION=3.12 @@ -238,6 +244,10 @@ ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:${LD_LIBRARY_P ENV VLLM_USAGE_SOURCE=production-docker-image ENV HF_HUB_ENABLE_HF_TRANSFER=1 +# Pre-built runtime (fast path) — used when RUNTIME_BASE is set +ARG RUNTIME_BASE +FROM ${RUNTIME_BASE:-runtime-build} AS runtime + # ============================================================================= # STAGE 4: DLC overlay — Amazon DLC customizations on top of vLLM runtime # ============================================================================= @@ -339,4 +349,137 @@ RUN dnf upgrade -y --security --releasever latest --setopt=install_weak_deps=Fal COPY ./scripts/vllm/sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh +ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"] + +# ============================================================================= +# STAGE: omni-deps — install vllm-omni on top of runtime venv +# ============================================================================= +FROM runtime AS omni-deps + +ARG VLLM_OMNI_VERSION=0.18.0 + +# System deps for omni-modality (TTS, audio, image/video) +# Enable SPAL (Supplementary Packages for Amazon Linux) for espeak-ng, sox, ffmpeg +RUN dnf upgrade -y --releasever=latest --setopt=install_weak_deps=False system-release \ + && dnf install -y spal-release \ + && dnf install -y --setopt=install_weak_deps=False espeak-ng sox ffmpeg-free \ + && dnf clean all && rm -rf /var/cache/dnf + +# Install vllm-omni (pure Python, no compilation) +RUN --mount=type=cache,target=/root/.cache/uv uv pip install vllm-omni==${VLLM_OMNI_VERSION} + +# 
# telemetry
# Renders bash_telemetry.sh from its template (substituting the FRAMEWORK,
# FRAMEWORK_VERSION and CONTAINER_TYPE build args), sources it from the system
# and root bashrc files, and points /usr/bin/python at the venv interpreter.
# Only the final cache/temp cleanup is best-effort: the `|| true` is scoped to
# that group so a failure while installing the telemetry hook fails the build
# instead of being silently ignored.
RUN chmod +x /usr/local/bin/deep_learning_container.py \
    && sed -e "s/{{FRAMEWORK}}/${FRAMEWORK}/g" \
        -e "s/{{FRAMEWORK_VERSION}}/${FRAMEWORK_VERSION}/g" \
        -e "s/{{CONTAINER_TYPE}}/${CONTAINER_TYPE}/g" \
        /tmp/bash_telemetry.sh.template >/usr/local/bin/bash_telemetry.sh \
    && chmod +x /usr/local/bin/bash_telemetry.sh \
    && rm /tmp/bash_telemetry.sh.template \
    && echo 'source /usr/local/bin/bash_telemetry.sh' >>/etc/bashrc \
    && echo 'source /usr/local/bin/bash_telemetry.sh' >>/root/.bashrc \
    && ln -sf /opt/venv/bin/python3 /usr/bin/python \
    && { rm -rf /tmp/tmp* /tmp/uv* /var/cache/dnf /root/.cache || true; }
#!/usr/bin/env bash
# EC2 entrypoint for the vLLM-Omni image.
#
# Runs the telemetry hook when it is present, then replaces this shell with
# `vllm serve --omni`, forwarding every container argument unchanged.

TELEMETRY_SCRIPT=/usr/local/bin/bash_telemetry.sh

# Telemetry is best-effort: a missing or failing hook must never keep the
# server from starting, so its output is discarded and its status ignored.
if [[ -f "${TELEMETRY_SCRIPT}" ]]; then
    bash "${TELEMETRY_SCRIPT}" >/dev/null 2>&1 || true
fi

exec vllm serve --omni "$@"
/v1/audio/speech for TTS) +ARGS+=(--middleware omni_sagemaker_serve.SageMakerRouteMiddleware) + +exec vllm serve --omni "${ARGS[@]}" diff --git a/scripts/vllm/omni_sagemaker_serve.py b/scripts/vllm/omni_sagemaker_serve.py new file mode 100644 index 000000000000..7db1bb80aeaf --- /dev/null +++ b/scripts/vllm/omni_sagemaker_serve.py @@ -0,0 +1,50 @@ +"""SageMaker routing middleware for vLLM-Omni. + +Routes /invocations requests based on the X-Amzn-SageMaker-Custom-Attributes +header. Clients specify the target endpoint via route=, e.g.: + + CustomAttributes="route=/v1/audio/speech" + +If no route is specified, falls through to vLLM's built-in /invocations +handler (chat/completion/embed). + +Usage: vllm serve --omni --middleware omni_sagemaker_serve.SageMakerRouteMiddleware +""" + +import logging +import re + +from starlette.types import ASGIApp, Receive, Scope, Send + +logger = logging.getLogger("omni_sagemaker") + + +def _parse_route(headers: list[tuple[bytes, bytes]]) -> str | None: + """Extract route= from SageMaker custom attributes header.""" + for key, value in headers: + if key.lower() == b"x-amzn-sagemaker-custom-attributes": + m = re.search(r"route=(/[^\s,]+)", value.decode()) + return m.group(1) if m else None + return None + + +class SageMakerRouteMiddleware: + """ASGI middleware that reroutes /invocations based on CustomAttributes. + + Explicit route via header -> rewrites path to that endpoint. + No route specified -> falls through to vLLM's built-in /invocations handler. 
+ """ + + def __init__(self, app: ASGIApp) -> None: + self.app = app + + async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: + if scope["type"] == "http" and scope["path"] == "/invocations": + route = _parse_route(scope.get("headers", [])) + if route: + logger.info("Rerouting /invocations -> %s", route) + scope = dict(scope) + scope["path"] = route + scope["raw_path"] = route.encode() + + await self.app(scope, receive, send) diff --git a/test/security/data/ecr_scan_allowlist/vllm/framework_allowlist.json b/test/security/data/ecr_scan_allowlist/vllm/framework_allowlist.json index 95591f599e4a..1dae8903a160 100644 --- a/test/security/data/ecr_scan_allowlist/vllm/framework_allowlist.json +++ b/test/security/data/ecr_scan_allowlist/vllm/framework_allowlist.json @@ -103,11 +103,6 @@ "vulnerability_id": "CVE-2026-31812", "reason": "Coming in as a dependency from the latest uv 0.10.9" }, - { - "vulnerability_id": "CVE-2026-33055", - "reason": "Rust tar crate 0.4.44 bundled in uv binary, fix requires uv upstream update to tar>=0.4.45", - "review_by": "2026-04-06" - }, { "vulnerability_id": "CVE-2026-27893", "reason": "vllm 0.10.2 RayServe image - trust_remote_code=True hardcoded, fixed in vllm>=0.18.0. RayServe image not updated in this PR." 
@@ -124,4 +119,4 @@ "vulnerability_id": "CVE-2026-34520", "reason": "aiohttp 3.12.15 vendored inside ray/_private/runtime_env/agent/thirdparty_files/, unpatchable without Ray upgrade" } -] \ No newline at end of file +] diff --git a/test/vllm-omni/sagemaker/requirements.txt b/test/vllm-omni/sagemaker/requirements.txt new file mode 100644 index 000000000000..6a4743d65577 --- /dev/null +++ b/test/vllm-omni/sagemaker/requirements.txt @@ -0,0 +1,2 @@ +sagemaker>=2,<3 +starlette diff --git a/test/vllm-omni/sagemaker/test_sagemaker_middleware.py b/test/vllm-omni/sagemaker/test_sagemaker_middleware.py new file mode 100644 index 000000000000..fa7ce616a6e3 --- /dev/null +++ b/test/vllm-omni/sagemaker/test_sagemaker_middleware.py @@ -0,0 +1,116 @@ +"""Unit tests for SageMaker routing middleware.""" + +import asyncio +import os +import sys + +# Allow importing omni_sagemaker_serve from scripts/vllm/ +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", "scripts", "vllm")) + +import pytest +from omni_sagemaker_serve import SageMakerRouteMiddleware, _parse_route + + +class TestParseRoute: + def test_extracts_route(self): + headers = [(b"x-amzn-sagemaker-custom-attributes", b"route=/v1/audio/speech")] + assert _parse_route(headers) == "/v1/audio/speech" + + def test_extracts_route_with_extra_attrs(self): + headers = [(b"x-amzn-sagemaker-custom-attributes", b"foo=bar,route=/v1/audio/speech,baz=1")] + assert _parse_route(headers) == "/v1/audio/speech" + + def test_no_route(self): + headers = [(b"x-amzn-sagemaker-custom-attributes", b"foo=bar")] + assert _parse_route(headers) is None + + def test_no_header(self): + assert _parse_route([]) is None + + def test_case_insensitive_header(self): + headers = [(b"X-Amzn-SageMaker-Custom-Attributes", b"route=/v1/chat/completions")] + assert _parse_route(headers) == "/v1/chat/completions" + + +class TestMiddleware: + @pytest.fixture + def captured(self): + return {} + + @pytest.fixture + def app(self, captured): + 
async def inner(scope, receive, send): + captured["path"] = scope["path"] + + return inner + + @pytest.fixture + def middleware(self, app): + return SageMakerRouteMiddleware(app) + + def _make_scope(self, path="/invocations", headers=None): + return { + "type": "http", + "path": path, + "raw_path": path.encode(), + "headers": headers or [], + } + + def _run(self, coro): + return asyncio.get_event_loop().run_until_complete(coro) + + def test_rewrites_with_route_header(self, middleware, captured): + scope = self._make_scope( + headers=[ + (b"x-amzn-sagemaker-custom-attributes", b"route=/v1/audio/speech"), + ] + ) + self._run(middleware(scope, None, None)) + assert captured["path"] == "/v1/audio/speech" + + def test_falls_through_without_route(self, middleware, captured): + scope = self._make_scope() + self._run(middleware(scope, None, None)) + assert captured["path"] == "/invocations" + + def test_ignores_non_invocations(self, middleware, captured): + scope = self._make_scope(path="/health") + self._run(middleware(scope, None, None)) + assert captured["path"] == "/health" + + def test_ignores_non_http(self, middleware, captured): + scope = {"type": "websocket", "path": "/invocations"} + self._run(middleware(scope, None, None)) + assert captured["path"] == "/invocations" + + def test_rewrites_raw_path(self, middleware, captured): + scope = self._make_scope( + headers=[ + (b"x-amzn-sagemaker-custom-attributes", b"route=/v1/completions"), + ] + ) + self._run(middleware(scope, None, None)) + assert captured["path"] == "/v1/completions" + + def test_adapter_attrs_without_route_falls_through(self, middleware, captured): + """Adapter attributes (no route=) should fall through to /invocations.""" + scope = self._make_scope( + headers=[ + (b"x-amzn-sagemaker-custom-attributes", b"adapter=my-lora-adapter"), + ] + ) + self._run(middleware(scope, None, None)) + assert captured["path"] == "/invocations" + + def test_adapter_attrs_with_route_rewrites(self, middleware, captured): + 
"""Both adapter and route attrs — route takes effect, adapter preserved in headers.""" + scope = self._make_scope( + headers=[ + ( + b"x-amzn-sagemaker-custom-attributes", + b"adapter=my-lora,route=/v1/audio/speech", + ), + ] + ) + self._run(middleware(scope, None, None)) + assert captured["path"] == "/v1/audio/speech" diff --git a/test/vllm-omni/sagemaker/test_sm_omni_endpoint.py b/test/vllm-omni/sagemaker/test_sm_omni_endpoint.py new file mode 100644 index 000000000000..6920c78a09c8 --- /dev/null +++ b/test/vllm-omni/sagemaker/test_sm_omni_endpoint.py @@ -0,0 +1,235 @@ +"""Integration test for vLLM-Omni SageMaker endpoint""" + +import json +import logging +import time + +import pytest +from sagemaker.async_inference import AsyncInferenceConfig +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer +from test_utils import clean_string, random_suffix_name, wait_for_status +from test_utils.constants import INFERENCE_AMI_VERSION, SAGEMAKER_ROLE +from test_utils.huggingface_helper import get_hf_token + +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.INFO) + +ENDPOINT_WAIT_PERIOD = 60 +ENDPOINT_WAIT_LENGTH = 30 +ENDPOINT_INSERVICE = "InService" + + +def get_endpoint_status(sagemaker_client, endpoint_name): + response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name) + return response["EndpointStatus"] + + +@pytest.fixture(scope="function") +def model_id(request): + return request.param + + +@pytest.fixture(scope="function") +def instance_type(request): + return request.param + + +@pytest.fixture(scope="function") +def model_package(aws_session, image_uri, model_id): + sagemaker_client = aws_session.sagemaker + cleaned_id = clean_string(model_id.split("/")[1], "_./") + model_name = random_suffix_name(f"vllm-omni-{cleaned_id}", 50) + + try: + LOGGER.info(f"Creating SageMaker model: {model_name}") + hf_token = get_hf_token(aws_session) + model = Model( + name=model_name, 
@pytest.fixture(scope="function")
def model_endpoint(aws_session, model_package, instance_type):
    """Deploy ``model_package`` to a real-time endpoint and yield its predictor.

    The endpoint name is derived from ``instance_type`` plus a random suffix.
    After deployment the fixture waits for the endpoint to report
    ``ENDPOINT_INSERVICE`` before yielding. On teardown both the endpoint and
    its endpoint config (which shares the endpoint's name) are deleted.
    """
    sagemaker_client = aws_session.sagemaker
    model = model_package
    cleaned_instance = clean_string(instance_type, "_./")
    endpoint_name = random_suffix_name(f"vllm-omni-{cleaned_instance}", 50)

    try:
        LOGGER.info("Starting endpoint deployment...")
        predictor = model.deploy(
            instance_type=instance_type,
            initial_instance_count=1,
            endpoint_name=endpoint_name,
            inference_ami_version=INFERENCE_AMI_VERSION,
            serializer=JSONSerializer(),
            wait=True,
        )

        LOGGER.info(f"Waiting for endpoint {ENDPOINT_INSERVICE} status...")
        assert wait_for_status(
            ENDPOINT_INSERVICE,
            ENDPOINT_WAIT_PERIOD,
            ENDPOINT_WAIT_LENGTH,
            get_endpoint_status,
            sagemaker_client,
            endpoint_name,
        )
        yield predictor
    finally:
        LOGGER.info(f"Deleting endpoint: {endpoint_name}")
        # The config deletion sits in its own ``finally`` so it is still
        # attempted when deleting the endpoint raises; otherwise a failed
        # endpoint deletion would leak the endpoint config.
        try:
            sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
        finally:
            sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_name)
@pytest.fixture(scope="function")
def async_endpoint(aws_session, model_package, instance_type):
    """Deploy an async inference endpoint (no 60s timeout limit).

    Yields a ``(predictor, s3_output)`` pair, where ``s3_output`` is the S3
    prefix configured as the async output path. The bucket name is built from
    the session's region and the caller's account id. On teardown both the
    endpoint and its endpoint config (which shares the endpoint's name) are
    deleted.
    """
    sagemaker_client = aws_session.sagemaker
    model = model_package
    cleaned_instance = clean_string(instance_type, "_./")
    endpoint_name = random_suffix_name(f"vllm-omni-async-{cleaned_instance}", 50)
    account_id = aws_session.sts.get_caller_identity()["Account"]
    s3_output = f"s3://sagemaker-{aws_session.region}-{account_id}/vllm-omni-async-output/"

    try:
        LOGGER.info(f"Deploying async endpoint: {endpoint_name}")
        predictor = model.deploy(
            instance_type=instance_type,
            initial_instance_count=1,
            endpoint_name=endpoint_name,
            inference_ami_version=INFERENCE_AMI_VERSION,
            serializer=JSONSerializer(),
            async_inference_config=AsyncInferenceConfig(
                output_path=s3_output,
                max_concurrent_invocations_per_instance=1,
            ),
            wait=True,
        )

        LOGGER.info(f"Waiting for endpoint {ENDPOINT_INSERVICE} status...")
        assert wait_for_status(
            ENDPOINT_INSERVICE,
            ENDPOINT_WAIT_PERIOD,
            ENDPOINT_WAIT_LENGTH,
            get_endpoint_status,
            sagemaker_client,
            endpoint_name,
        )
        yield predictor, s3_output
    finally:
        LOGGER.info(f"Deleting async endpoint: {endpoint_name}")
        # The config deletion sits in its own ``finally`` so it is still
        # attempted when deleting the endpoint raises; otherwise a failed
        # endpoint deletion would leak the endpoint config.
        try:
            sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
        finally:
            sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_name)
ContentType="application/json") + return f"s3://{bucket}/{key}" + + +def _parse_s3_uri(uri): + """Parse s3://bucket/key into (bucket, key).""" + parts = uri.replace("s3://", "").split("/", 1) + return parts[0], parts[1] if len(parts) > 1 else "" diff --git a/test/vllm-omni/scripts/vllm_omni_ec2_smoke_test.sh b/test/vllm-omni/scripts/vllm_omni_ec2_smoke_test.sh new file mode 100755 index 000000000000..d97fa684f908 --- /dev/null +++ b/test/vllm-omni/scripts/vllm_omni_ec2_smoke_test.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Smoke test for vLLM-Omni EC2 images +# Uses the OpenAI-compatible API directly (no /invocations middleware). +# Request payload and validation are passed as arguments from the model config. +set -eux + +ROUTE="${1:?Usage: $0 [content_type]}" +REQUEST="${2:?Usage: $0 [content_type]}" +VALIDATE="${3:?Usage: $0 [content_type]}" +CONTENT_TYPE="${4:-application/json}" +PORT=8080 + +echo "=== vLLM-Omni EC2 smoke test ===" +echo "Route: ${ROUTE}" +echo "Content-Type: ${CONTENT_TYPE}" +echo "Validate: ${VALIDATE}" + +# Wait for server +for i in $(seq 1 300); do + if curl -s http://localhost:${PORT}/health >/dev/null 2>&1; then + echo "Server ready after ${i}s" + break + fi + sleep 1 +done + +curl -sf http://localhost:${PORT}/health || { echo "Health check failed"; exit 1; } + +# Send request directly to the API endpoint +if [ "${CONTENT_TYPE}" = "multipart/form-data" ]; then + CURL_CMD=(curl -sf -X POST "http://localhost:${PORT}${ROUTE}") + IFS='&' read -ra PAIRS <<< "${REQUEST}" + for pair in "${PAIRS[@]}"; do + CURL_CMD+=(-F "${pair}") + done + CURL_CMD+=(--output /tmp/omni_response --max-time 300) + "${CURL_CMD[@]}" +else + curl -sf -X POST "http://localhost:${PORT}${ROUTE}" \ + -H "Content-Type: application/json" \ + -d "${REQUEST}" \ + --output /tmp/omni_response --max-time 300 +fi + +# Validate response +if [[ "${VALIDATE}" == binary_size_gt:* ]]; then + MIN_SIZE="${VALIDATE#binary_size_gt:}" + FILE_SIZE=$(stat -c%s /tmp/omni_response 2>/dev/null || stat 
#!/bin/bash
# Smoke test for vLLM-Omni SageMaker images.
#
# Sends one request to /invocations and relies on the routing middleware to
# forward it to <route>, which is passed in the
# X-Amzn-SageMaker-Custom-Attributes header as "route=<route>".
#
# Arguments (supplied from the model test config):
#   $1 route         target API path, e.g. /v1/audio/speech
#   $2 request       JSON body, or "k=v&k=v" pairs for multipart/form-data
#   $3 validate      "binary_size_gt:<bytes>" or "json_field:<path>"
#   $4 content_type  optional, defaults to application/json
set -eux

USAGE="Usage: $0 <route> <request> <validate> [content_type]"
ROUTE="${1:?${USAGE}}"
REQUEST="${2:?${USAGE}}"
VALIDATE="${3:?${USAGE}}"
CONTENT_TYPE="${4:-application/json}"
PORT=8080
RESPONSE_FILE=/tmp/omni_response

echo "=== vLLM-Omni SageMaker smoke test ==="
echo "Route: ${ROUTE}"
echo "Content-Type: ${CONTENT_TYPE}"
echo "Validate: ${VALIDATE}"

# Wait for server: poll /ping once per second, up to 300 attempts. -f makes an
# HTTP error status count as "not ready yet" rather than ending the wait early.
for attempt in $(seq 1 300); do
    if curl -sf "http://localhost:${PORT}/ping" >/dev/null 2>&1; then
        echo "Server ready after ${attempt}s"
        break
    fi
    sleep 1
done

curl -sf "http://localhost:${PORT}/ping" || { echo "Ping failed"; exit 1; }

# Send request via /invocations with route header
if [ "${CONTENT_TYPE}" = "multipart/form-data" ]; then
    # Each "key=value" pair becomes one -F form field; curl then sets the
    # multipart Content-Type (with boundary) itself.
    CURL_CMD=(curl -sf -X POST "http://localhost:${PORT}/invocations"
        -H "X-Amzn-SageMaker-Custom-Attributes: route=${ROUTE}")
    IFS='&' read -ra PAIRS <<< "${REQUEST}"
    for pair in "${PAIRS[@]}"; do
        CURL_CMD+=(-F "${pair}")
    done
    CURL_CMD+=(--output "${RESPONSE_FILE}" --max-time 300)
    "${CURL_CMD[@]}"
else
    curl -sf -X POST "http://localhost:${PORT}/invocations" \
        -H "Content-Type: application/json" \
        -H "X-Amzn-SageMaker-Custom-Attributes: route=${ROUTE}" \
        -d "${REQUEST}" \
        --output "${RESPONSE_FILE}" --max-time 300
fi

# Validate response
if [[ "${VALIDATE}" == binary_size_gt:* ]]; then
    MIN_SIZE="${VALIDATE#binary_size_gt:}"
    # GNU stat first, BSD stat as the fallback.
    FILE_SIZE=$(stat -c%s "${RESPONSE_FILE}" 2>/dev/null || stat -f%z "${RESPONSE_FILE}")
    echo "Response size: ${FILE_SIZE} bytes (min: ${MIN_SIZE})"
    [ "${FILE_SIZE}" -gt "${MIN_SIZE}" ] || { echo "FAIL: response too small"; exit 1; }

elif [[ "${VALIDATE}" == json_field:* ]]; then
    FIELD="${VALIDATE#json_field:}"
    # The field path and file name are passed as arguments rather than pasted
    # into the Python source, so quotes or other special characters in them
    # cannot break or alter the validation code.
    python3 - "${FIELD}" "${RESPONSE_FILE}" <<'PY'
import json
import sys

field, response_file = sys.argv[1], sys.argv[2]
with open(response_file) as fh:
    obj = json.load(fh)
# "data[0].b64_json" -> ["data", "0", "b64_json"]; digits index into lists.
for part in field.replace("]", "").replace("[", ".").split("."):
    obj = obj[int(part)] if part.isdigit() else obj[part]
assert obj, f"Field {field} is empty"
print(f"Validated: {field} present ({type(obj).__name__})")
PY

else
    # An unrecognised validator must not let the test pass unchecked.
    echo "FAIL: unknown validate spec '${VALIDATE}'"
    exit 1
fi

echo "=== vLLM-Omni SageMaker smoke test PASSED ==="