diff --git a/.github/actions/build-image/action.yml b/.github/actions/build-image/action.yml
index 62e3374be9bd..027177f5e485 100644
--- a/.github/actions/build-image/action.yml
+++ b/.github/actions/build-image/action.yml
@@ -69,6 +69,10 @@ inputs:
     description: 'Transformers library version (e.g., 4.28.1)'
     required: false
     default: ''
+  runtime-base:  # exported to the build step as the RUNTIME_BASE environment variable
+    description: 'Pre-built runtime base image URI. When set, skips compile stages.'
+    required: false
+    default: ''  # empty by default; build_image.sh only adds the RUNTIME_BASE build arg when this is non-empty
 
 outputs:
   image-uri:
@@ -120,3 +124,4 @@ runs:
         INFERENCE_TOOLKIT_VERSION: ${{ inputs.inference-toolkit-version }}
         TORCHSERVE_VERSION: ${{ inputs.torchserve-version }}
         TRANSFORMERS_VERSION: ${{ inputs.transformers-version }}
+        RUNTIME_BASE: ${{ inputs.runtime-base }}
diff --git a/.github/config/vllm-omni-ec2-amzn2023.yml b/.github/config/vllm-omni-ec2-amzn2023.yml
new file mode 100644
index 000000000000..00f051d150a1
--- /dev/null
+++ b/.github/config/vllm-omni-ec2-amzn2023.yml
@@ -0,0 +1,26 @@
+# vLLM-Omni EC2 AL2023 image configuration; loaded by pr-vllm-omni-ec2-amzn2023.yml through its CONFIG_FILE variable.
+
+image:
+  name: "vllm-omni-ec2-amzn2023"
+  description: "vLLM-Omni for EC2 instances (AL2023, omni-modality serving)"
+
+common:  # the PR workflow's "Parse configuration" step reads only keys under .common
+  framework: "vllm-omni"
+  framework_version: "0.18.0"  # part of both the runtime-base tag and the PR image tag
+  job_type: "general"  # surfaced by the PR workflow as the container-type output
+  python_version: "py312"  # part of both the runtime-base tag and the PR image tag
+  cuda_version: "cu129"  # part of both the runtime-base tag and the PR image tag
+  os_version: "amzn2023"  # part of the PR image tag
+  customer_type: "ec2"  # the workflow falls back to "" when this key is absent
+  arch_type: "x86"  # the workflow falls back to "x86" when this key is absent
+  prod_image: "vllm-omni:0.18-gpu-py312-ec2"  # parsed into the prod-image output, which no job in the PR workflow consumes
+  device_type: "gpu"  # the workflow falls back to "gpu" when this key is absent
+  contributor: "None"  # the string "None", not YAML null; also the workflow's fallback value
+
+release:  # not read by the PR workflow's parse step; presumably consumed by release tooling (TODO confirm)
+  release: false
+  force_release: false
+  public_registry: false
+  private_registry: true
+  enable_soci: true
+  environment: production
diff --git a/.github/config/vllm-omni-model-tests.yml b/.github/config/vllm-omni-model-tests.yml
new file mode 100644
index 000000000000..f093bf77c2d8
--- /dev/null
+++ b/.github/config/vllm-omni-model-tests.yml
@@ -0,0 +1,57 @@
+# vLLM-Omni Model Test Configuration
+# Tests for omni-modality models (TTS, image generation, video, omni-chat)
+#
+# Each model defines its test_request (sent to /invocations via middleware)
+# and the route for the SageMaker routing middleware.
+#
+# Models use s3_model (pre-cached in S3) downloaded by the download-model action.
+
+s3_prefix: "s3://dlc-cicd-models/omni-models"  # the load-models job builds s3_path as s3_prefix + "/" + s3_model
+
+smoke-test:
+  codebuild-fleet:  # each list entry becomes one matrix job in reusable-vllm-omni-model-tests.yml
+    # --- TTS models (route: /v1/audio/speech) ---
+    - name: "qwen3-tts-1.7b-customvoice"  # shown in the matrix job name: smoke-test (<name>)
+      s3_model: "qwen3-tts-1.7b-customvoice.tar.gz"  # replaced by s3_path (and model_source "s3") in the generated matrix
+      fleet: "x86-g6xl-runner"  # used as the fleet:<name> label in the matrix job's runs-on
+      extra_args: ""
+      route: "/v1/audio/speech"
+      test_request: '{"input": "Hello, how are you?", "voice": "vivian", "language": "English"}'
+      validate: "binary_size_gt:1000"
+
+    # --- Image generation models (route: /v1/images/generations) ---
+    - name: "flux2-klein-4b"
+      s3_model: "flux2-klein-4b.tar.gz"
+      fleet: "x86-g6xl-runner"
+      extra_args: ""
+      route: "/v1/images/generations"
+      test_request: '{"prompt": "a red apple on a white table", "size": "512x512", "n": 1}'
+      validate: "json_field:data[0].b64_json"
+
+    # --- Video generation models (route: /v1/videos) ---
+    - name: "wan2.1-t2v-1.3b"
+      s3_model: "wan2.1-t2v-1.3b.tar.gz"
+      fleet: "x86-g6exl-runner"
+      extra_args: ""
+      route: "/v1/videos"
+      content_type: "multipart/form-data"  # the only entry that sets content_type; its test_request is key=value pairs, not JSON
+      test_request: 'prompt=a dog running on a beach&num_frames=17&num_inference_steps=4&size=480x320&seed=42'
+      validate: "json_field:id"
+
+    # --- Omni chat models (route: /v1/chat/completions, fallthrough) ---
+    # Disabled for now: the bagel-7b-mot model is too big to run. Uncomment the entry below to re-enable it.
+    # - name: "bagel-7b-mot"
+    #   s3_model: "bagel-7b-mot.tar.gz"
+    #   fleet: "x86-g6e4xl-runner"
+    #   extra_args: ""
+    #   route: "/v1/chat/completions"
+    #   test_request: '{"messages": [{"role": "user", "content": [{"type": "text", "text": "<|im_start|>A cute cat<|im_end|>"}]}], "modalities": ["image"], "height": 512, "width": 512, "num_inference_steps": 4, "seed": 42}'
+    #   validate: "json_field:choices[0].message.content"
+
+    - name: "qwen2.5-omni-3b"
+      s3_model: "qwen2.5-omni-3b.tar.gz"
+      fleet: "x86-g6e12xl-runner"
+      extra_args: ""
+      route: "/v1/chat/completions"
+      test_request: '{"messages": [{"role": "user", "content": "Say hello in one sentence."}], "max_tokens": 64}'
+      validate: "json_field:choices[0].message.content"
diff --git a/.github/config/vllm-omni-sagemaker-amzn2023.yml b/.github/config/vllm-omni-sagemaker-amzn2023.yml
new file mode 100644
index 000000000000..87b9e3b35f17
--- /dev/null
+++ b/.github/config/vllm-omni-sagemaker-amzn2023.yml
@@ -0,0 +1,26 @@
+# vLLM-Omni SageMaker AL2023 image configuration; loaded by pr-vllm-omni-sagemaker-amzn2023.yml through its CONFIG_FILE variable.
+
+image:
+  name: "vllm-omni-sagemaker-amzn2023"
+  description: "vLLM-Omni for SageMaker (AL2023, omni-modality serving)"
+
+common:  # the PR workflow's "Parse configuration" step reads only keys under .common
+  framework: "vllm-omni"
+  framework_version: "0.18.0"  # part of both the runtime-base tag and the PR image tag
+  job_type: "general"  # surfaced by the PR workflow as the container-type output
+  python_version: "py312"  # part of both the runtime-base tag and the PR image tag
+  cuda_version: "cu129"  # part of both the runtime-base tag and the PR image tag
+  os_version: "amzn2023"  # part of the PR image tag
+  customer_type: "sagemaker"  # build_image.sh adds SageMaker labels when CUSTOMER_TYPE is "sagemaker"
+  arch_type: "x86"  # the workflow falls back to "x86" when this key is absent
+  prod_image: "vllm-omni:0.18-gpu-py312-sagemaker"  # parsed into the prod-image output, which no job in the PR workflow consumes
+  device_type: "gpu"  # the workflow falls back to "gpu" when this key is absent
+  contributor: "None"  # the string "None", not YAML null; also the workflow's fallback value
+
+release:  # not read by the PR workflow's parse step; presumably consumed by release tooling (TODO confirm)
+  release: false
+  force_release: false
+  public_registry: false
+  private_registry: true
+  enable_soci: true
+  environment: production
diff --git a/.github/scripts/build_image.sh b/.github/scripts/build_image.sh
index 224712f97e7e..4aca4dfc3dbd 100755
--- a/.github/scripts/build_image.sh
+++ b/.github/scripts/build_image.sh
@@ -26,6 +26,7 @@ CUSTOMER_TYPE="${CUSTOMER_TYPE:-}"
 INFERENCE_TOOLKIT_VERSION="${INFERENCE_TOOLKIT_VERSION:-}"
 TORCHSERVE_VERSION="${TORCHSERVE_VERSION:-}"
 TRANSFORMERS_VERSION="${TRANSFORMERS_VERSION:-}"
+RUNTIME_BASE="${RUNTIME_BASE:-}"
 
 # Resolve image URI
 CI_IMAGE_URI="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/ci:${TAG_PR}"
@@ -67,6 +68,13 @@ BUILD_CMD="docker buildx build --progress plain \
   --build-arg FRAMEWORK=\"${FRAMEWORK}\" \
   --build-arg FRAMEWORK_VERSION=\"${FRAMEWORK_VERSION}\""
 
+# When RUNTIME_BASE is non-empty, forward it to docker buildx as a build arg; the Dockerfile presumably uses it to skip the compile stages (TODO confirm).
+if [[ -n "${RUNTIME_BASE}" ]]; then  # empty (the default) leaves BUILD_CMD unchanged
+  echo "Using pre-built runtime base: ${RUNTIME_BASE}"
+  BUILD_CMD="${BUILD_CMD} \
+  --build-arg RUNTIME_BASE=\"${RUNTIME_BASE}\""
+fi
+
 # Add SageMaker labels if customer-type is 'sagemaker'
 if [[ "${CUSTOMER_TYPE}" == "sagemaker" ]]; then
   BUILD_CMD="${BUILD_CMD} \
diff --git a/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml b/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml
new file mode 100644
index 000000000000..924ddf62fe80
--- /dev/null
+++ b/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml
@@ -0,0 +1,274 @@
+name: PR - vLLM-Omni EC2 AMZN2023
+
+on:
+  pull_request:
+    branches: [main]
+    types: [opened, reopened, synchronize]
+    paths:  # the workflow only starts when the PR touches at least one of these paths
+      - "docker/vllm/Dockerfile.amzn2023"
+      - "scripts/vllm/omni_*"
+      - "scripts/common/**"
+      - "scripts/telemetry/**"
+      - ".github/config/vllm-omni-ec2-amzn2023.yml"
+      - ".github/config/vllm-omni-model-tests.yml"
+      - ".github/workflows/pr-vllm-omni-ec2-amzn2023.yml"  # NOTE(review): not in the build-change filter, so a workflow-only edit builds and tests nothing - confirm intended
+      - ".github/workflows/reusable-vllm-omni-model-tests.yml"  # NOTE(review): same as above, not in the build-change filter
+      - "test/vllm-omni/**"
+      - "test/telemetry/**"  # matched by the telemetry-test-change filter, not by build-change
+
+permissions:
+  contents: read
+  pull-requests: read
+
+env:
+  FORCE_COLOR: "1"
+  CONFIG_FILE: ".github/config/vllm-omni-ec2-amzn2023.yml"  # passed to the load-config action as config-file
+
+jobs:
+  gatekeeper:  # first job in the graph; load-config and check-changes both list it in needs
+    runs-on: ubuntu-latest
+    concurrency:
+      group: ${{ github.workflow }}-gate-${{ github.event.pull_request.number }}  # one group per workflow and PR
+      cancel-in-progress: true  # a newer run for the same PR cancels an in-flight gate
+    steps:
+      - name: Checkout base branch (safe)
+        uses: actions/checkout@v5
+        with:
+          ref: ${{ github.event.pull_request.base.sha }}  # base commit, so the local action below is the base branch's copy, not the PR's
+          fetch-depth: 1
+
+      - name: Run permission gate (from base)
+        uses: ./.github/actions/pr-permission-gate  # resolved from the workspace checked out above
+
+  load-config:  # loads CONFIG_FILE and republishes the .common values as job outputs for the downstream jobs
+    needs: [gatekeeper]
+    if: success()
+    runs-on: ubuntu-latest
+    outputs:
+      framework: ${{ steps.parse.outputs.framework }}
+      framework-version: ${{ steps.parse.outputs.framework-version }}
+      python-version: ${{ steps.parse.outputs.python-version }}
+      cuda-version: ${{ steps.parse.outputs.cuda-version }}
+      os-version: ${{ steps.parse.outputs.os-version }}
+      container-type: ${{ steps.parse.outputs.container-type }}  # sourced from .common.job_type
+      device-type: ${{ steps.parse.outputs.device-type }}
+      arch-type: ${{ steps.parse.outputs.arch-type }}
+      contributor: ${{ steps.parse.outputs.contributor }}
+      customer-type: ${{ steps.parse.outputs.customer-type }}
+      prod-image: ${{ steps.parse.outputs.prod-image }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Load configuration
+        id: load
+        uses: ./.github/actions/load-config
+        with:
+          config-file: ${{ env.CONFIG_FILE }}
+
+      - name: Parse configuration
+        id: parse
+        env:
+          CONFIG_JSON: ${{ steps.load.outputs.config }}
+        run: |
+          # Pass the JSON via env instead of pasting it into the script, so quotes in the config cannot
+          # break or inject shell code. One jq pass emits all outputs; a jq error now fails the step.
+          printf '%s\n' "$CONFIG_JSON" > config.json
+          jq -r '.common |
+            "framework=\(.framework)", "framework-version=\(.framework_version)",
+            "python-version=\(.python_version)", "cuda-version=\(.cuda_version)",
+            "os-version=\(.os_version)", "container-type=\(.job_type)",
+            "device-type=\(.device_type // "gpu")", "arch-type=\(.arch_type // "x86")",
+            "contributor=\(.contributor // "None")", "customer-type=\(.customer_type // "")",
+            "prod-image=\(.prod_image)"' config.json >> "$GITHUB_OUTPUT"
+
+  check-changes:  # runs pre-commit and classifies the changed paths for the downstream job conditions
+    needs: [gatekeeper]
+    if: success()
+    runs-on: ubuntu-latest
+    concurrency:
+      group: ${{ github.workflow }}-check-changes-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
+    outputs:
+      build-change: ${{ steps.changes.outputs.build-change }}  # 'true' enables build-runtime, build-image and sanity-test
+      telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }}  # 'true' alone is enough to enable telemetry-test
+    steps:
+      - name: Checkout DLC source
+        uses: actions/checkout@v5
+
+      - name: Setup python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.12"
+
+      - name: Run pre-commit
+        uses: pre-commit/action@v3.0.1
+        with:
+          extra_args: --all-files  # hooks run over the whole repository, not only the files changed in the PR
+
+      - name: Detect file changes
+        id: changes
+        uses: dorny/paths-filter@v3  # NOTE(review): build-change below omits the two workflow files listed under on.pull_request.paths
+        with:
+          filters: |
+            build-change:
+              - "docker/vllm/Dockerfile.amzn2023"
+              - "scripts/vllm/omni_*"
+              - "scripts/common/**"
+              - "scripts/telemetry/**"
+              - ".github/config/vllm-omni-ec2-amzn2023.yml"
+              - ".github/config/vllm-omni-model-tests.yml"
+              - "test/vllm-omni/**"
+            telemetry-test-change:
+              - "test/telemetry/**"
+
+  build-runtime:  # makes sure the shared runtime-base image exists in the CI registry and publishes its URI
+    needs: [check-changes, load-config]
+    if: needs.check-changes.outputs.build-change == 'true'
+    runs-on:
+      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
+        fleet:x86-vllm-build-runner
+        buildspec-override:true
+    timeout-minutes: 720  # 12 hours
+    outputs:
+      runtime-base: ${{ steps.check.outputs.image }}  # written before the existence check, so it is set whether or not a build runs
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Setup buildkitd
+        run: .github/scripts/buildkitd.sh
+
+      - name: ECR login
+        uses: ./.github/actions/ecr-authenticate
+        with:
+          aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+          aws-region: ${{ vars.AWS_REGION }}
+
+      - name: Check or build runtime base
+        id: check
+        run: |  # NOTE(review): the SageMaker workflow computes the same TAG, so both workflows share (and may race to push) one image
+          TAG="vllm-runtime-v${{ needs.load-config.outputs.framework-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.python-version }}"
+          IMAGE="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:${TAG}"
+          echo "image=${IMAGE}" >> $GITHUB_OUTPUT
+
+          # Reuse the image when the tag already exists. NOTE(review): the tag has no Dockerfile or commit component, so edits to the runtime-build stage are not rebuilt while the tag exists - confirm intended.
+          if docker manifest inspect "${IMAGE}" >/dev/null 2>&1; then  # any inspect failure, not only a missing tag, falls through to a rebuild
+            echo "Runtime base exists: ${IMAGE}"
+            exit 0
+          fi
+
+          echo "Building runtime base: ${IMAGE}"  # no --build-arg is passed below, so the Dockerfile's ARG defaults apply
+          docker buildx build --progress plain \
+            --target runtime-build \
+            --tag "${IMAGE}" \
+            --push \
+            -f docker/vllm/Dockerfile.amzn2023 .
+
+  build-image:  # builds the PR image through the build-image composite action
+    needs: [check-changes, load-config, build-runtime]
+    if: needs.check-changes.outputs.build-change == 'true'
+    runs-on:
+      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
+        fleet:x86-vllm-build-runner
+        buildspec-override:true
+    timeout-minutes: 720  # 12 hours
+    concurrency:
+      group: ${{ github.workflow }}-build-image-${{ github.event.pull_request.number }}
+      cancel-in-progress: true  # a newer run for the same PR cancels an in-flight build
+    outputs:
+      ci-image: ${{ steps.build.outputs.image-uri }}  # consumed as image-uri by every test job below
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Build image
+        id: build
+        uses: ./.github/actions/build-image
+        with:
+          framework: ${{ needs.load-config.outputs.framework }}
+          target: vllm-omni-ec2-amzn2023
+          base-image: nvidia/cuda:12.9.1-devel-amzn2023
+          framework-version: ${{ needs.load-config.outputs.framework-version }}
+          container-type: ${{ needs.load-config.outputs.container-type }}
+          aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+          aws-region: ${{ vars.AWS_REGION }}
+          tag-pr: vllm-omni-${{ needs.load-config.outputs.framework-version }}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-ec2-pr-${{ github.event.pull_request.number }}
+          dockerfile-path: docker/vllm/Dockerfile.amzn2023
+          arch-type: ${{ needs.load-config.outputs.arch-type }}
+          device-type: ${{ needs.load-config.outputs.device-type }}
+          cuda-version: ${{ needs.load-config.outputs.cuda-version }}
+          python-version: ${{ needs.load-config.outputs.python-version }}
+          os-version: ${{ needs.load-config.outputs.os-version }}
+          contributor: ${{ needs.load-config.outputs.contributor }}
+          customer-type: ${{ needs.load-config.outputs.customer-type }}
+          runtime-base: ${{ needs.build-runtime.outputs.runtime-base }}  # URI from build-runtime; build_image.sh forwards it as --build-arg RUNTIME_BASE
+
+  sanity-test:  # calls the reusable sanity-test workflow against the PR image
+    needs: [check-changes, build-image, load-config]  # the always() in the condition below stops a skipped dependency from auto-skipping this job
+    if: |
+      always() && !failure() && !cancelled() &&
+      needs.check-changes.outputs.build-change == 'true'
+    concurrency:
+      group: ${{ github.workflow }}-sanity-test-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
+    uses: ./.github/workflows/reusable-sanity-tests.yml
+    with:
+      image-uri: ${{ needs.build-image.outputs.ci-image }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+      aws-region: ${{ vars.AWS_REGION }}
+      framework: ${{ needs.load-config.outputs.framework }}
+      framework-version: ${{ needs.load-config.outputs.framework-version }}
+      python-version: ${{ needs.load-config.outputs.python-version }}
+      cuda-version: ${{ needs.load-config.outputs.cuda-version }}
+      os-version: ${{ needs.load-config.outputs.os-version }}
+      customer-type: ${{ needs.load-config.outputs.customer-type }}
+      arch-type: ${{ needs.load-config.outputs.arch-type }}
+      device-type: ${{ needs.load-config.outputs.device-type }}
+      contributor: ${{ needs.load-config.outputs.contributor }}
+      container-type: ${{ needs.load-config.outputs.container-type }}
+
+  security-test:  # calls the reusable security-test workflow against the PR image
+    needs: [build-image, load-config]
+    if: success()  # does not run when build-image was skipped or failed
+    concurrency:
+      group: ${{ github.workflow }}-security-test-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
+    uses: ./.github/workflows/reusable-security-tests.yml
+    with:
+      image-uri: ${{ needs.build-image.outputs.ci-image }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+      aws-region: ${{ vars.AWS_REGION }}
+      framework: ${{ needs.load-config.outputs.framework }}
+      framework-version: ${{ needs.load-config.outputs.framework-version }}
+
+  telemetry-test:  # calls the reusable telemetry-test workflow; also enabled by telemetry-only changes
+    needs: [check-changes, build-image, load-config]
+    if: |
+      always() && !failure() && !cancelled() &&
+      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true')
+    concurrency:
+      group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }}
+      cancel-in-progress: false  # unlike the other test jobs, an in-flight run is not cancelled by a newer one
+    uses: ./.github/workflows/reusable-telemetry-tests.yml
+    with:
+      image-uri: ${{ needs.build-image.outputs.ci-image }}  # NOTE(review): empty when only test/telemetry/** changed, because build-image is skipped yet this job still runs - confirm the reusable workflow copes
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+      aws-region: ${{ vars.AWS_REGION }}
+      framework: ${{ needs.load-config.outputs.framework }}
+      framework-version: ${{ needs.load-config.outputs.framework-version }}
+      container-type: ${{ needs.load-config.outputs.container-type }}
+
+  omni-model-smoke-tests:  # runs the per-model smoke-test matrix defined in reusable-vllm-omni-model-tests.yml
+    needs: [build-image, load-config]
+    if: success()  # does not run when build-image was skipped or failed
+    concurrency:
+      group: ${{ github.workflow }}-omni-model-tests-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
+    uses: ./.github/workflows/reusable-vllm-omni-model-tests.yml
+    with:
+      image-uri: ${{ needs.build-image.outputs.ci-image }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+      aws-region: ${{ vars.AWS_REGION }}
+      customer-type: ${{ needs.load-config.outputs.customer-type }}
+    secrets: inherit  # the reusable workflow declares an optional HF_TOKEN secret
diff --git a/.github/workflows/pr-vllm-omni-sagemaker-amzn2023.yml b/.github/workflows/pr-vllm-omni-sagemaker-amzn2023.yml
new file mode 100644
index 000000000000..6eaec90c0a40
--- /dev/null
+++ b/.github/workflows/pr-vllm-omni-sagemaker-amzn2023.yml
@@ -0,0 +1,314 @@
+name: PR - vLLM-Omni SageMaker AMZN2023
+
+on:
+  pull_request:
+    branches: [main]
+    types: [opened, reopened, synchronize]
+    paths:  # the workflow only starts when the PR touches at least one of these paths
+      - "docker/vllm/Dockerfile.amzn2023"
+      - "scripts/vllm/omni_*"
+      - "scripts/common/**"
+      - "scripts/telemetry/**"
+      - ".github/config/vllm-omni-sagemaker-amzn2023.yml"
+      - ".github/config/vllm-omni-model-tests.yml"
+      - ".github/workflows/pr-vllm-omni-sagemaker-amzn2023.yml"  # NOTE(review): not in the build-change filter, so a workflow-only edit builds and tests nothing - confirm intended
+      - ".github/workflows/reusable-vllm-omni-model-tests.yml"  # NOTE(review): same as above, not in the build-change filter
+      - "test/vllm-omni/**"
+      - "test/telemetry/**"  # matched by the telemetry-test-change filter, not by build-change
+
+permissions:
+  contents: read
+  pull-requests: read
+
+env:
+  FORCE_COLOR: "1"
+  CONFIG_FILE: ".github/config/vllm-omni-sagemaker-amzn2023.yml"  # passed to the load-config action as config-file
+
+jobs:
+  gatekeeper:  # first job in the graph; load-config and check-changes both list it in needs
+    runs-on: ubuntu-latest
+    concurrency:
+      group: ${{ github.workflow }}-gate-${{ github.event.pull_request.number }}  # one group per workflow and PR
+      cancel-in-progress: true  # a newer run for the same PR cancels an in-flight gate
+    steps:
+      - name: Checkout base branch (safe)
+        uses: actions/checkout@v5
+        with:
+          ref: ${{ github.event.pull_request.base.sha }}  # base commit, so the local action below is the base branch's copy, not the PR's
+          fetch-depth: 1
+
+      - name: Run permission gate (from base)
+        uses: ./.github/actions/pr-permission-gate  # resolved from the workspace checked out above
+
+  load-config:  # loads CONFIG_FILE and republishes the .common values as job outputs for the downstream jobs
+    needs: [gatekeeper]
+    if: success()
+    runs-on: ubuntu-latest
+    outputs:
+      framework: ${{ steps.parse.outputs.framework }}
+      framework-version: ${{ steps.parse.outputs.framework-version }}
+      python-version: ${{ steps.parse.outputs.python-version }}
+      cuda-version: ${{ steps.parse.outputs.cuda-version }}
+      os-version: ${{ steps.parse.outputs.os-version }}
+      container-type: ${{ steps.parse.outputs.container-type }}  # sourced from .common.job_type
+      device-type: ${{ steps.parse.outputs.device-type }}
+      arch-type: ${{ steps.parse.outputs.arch-type }}
+      contributor: ${{ steps.parse.outputs.contributor }}
+      customer-type: ${{ steps.parse.outputs.customer-type }}
+      prod-image: ${{ steps.parse.outputs.prod-image }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Load configuration
+        id: load
+        uses: ./.github/actions/load-config
+        with:
+          config-file: ${{ env.CONFIG_FILE }}
+
+      - name: Parse configuration
+        id: parse
+        env:
+          CONFIG_JSON: ${{ steps.load.outputs.config }}
+        run: |
+          # Pass the JSON via env instead of pasting it into the script, so quotes in the config cannot
+          # break or inject shell code. One jq pass emits all outputs; a jq error now fails the step.
+          printf '%s\n' "$CONFIG_JSON" > config.json
+          jq -r '.common |
+            "framework=\(.framework)", "framework-version=\(.framework_version)",
+            "python-version=\(.python_version)", "cuda-version=\(.cuda_version)",
+            "os-version=\(.os_version)", "container-type=\(.job_type)",
+            "device-type=\(.device_type // "gpu")", "arch-type=\(.arch_type // "x86")",
+            "contributor=\(.contributor // "None")", "customer-type=\(.customer_type // "")",
+            "prod-image=\(.prod_image)"' config.json >> "$GITHUB_OUTPUT"
+
+  check-changes:  # runs pre-commit and classifies the changed paths for the downstream job conditions
+    needs: [gatekeeper]
+    if: success()
+    runs-on: ubuntu-latest
+    concurrency:
+      group: ${{ github.workflow }}-check-changes-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
+    outputs:
+      build-change: ${{ steps.changes.outputs.build-change }}  # 'true' enables build-runtime, build-image and sanity-test
+      telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }}  # 'true' alone is enough to enable telemetry-test
+    steps:
+      - name: Checkout DLC source
+        uses: actions/checkout@v5
+
+      - name: Setup python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.12"
+
+      - name: Run pre-commit
+        uses: pre-commit/action@v3.0.1
+        with:
+          extra_args: --all-files  # hooks run over the whole repository, not only the files changed in the PR
+
+      - name: Detect file changes
+        id: changes
+        uses: dorny/paths-filter@v3  # NOTE(review): build-change below omits the two workflow files listed under on.pull_request.paths
+        with:
+          filters: |
+            build-change:
+              - "docker/vllm/Dockerfile.amzn2023"
+              - "scripts/vllm/omni_*"
+              - "scripts/common/**"
+              - "scripts/telemetry/**"
+              - ".github/config/vllm-omni-sagemaker-amzn2023.yml"
+              - ".github/config/vllm-omni-model-tests.yml"
+              - "test/vllm-omni/**"
+            telemetry-test-change:
+              - "test/telemetry/**"
+
+  build-runtime:  # makes sure the shared runtime-base image exists in the CI registry and publishes its URI
+    needs: [check-changes, load-config]
+    if: needs.check-changes.outputs.build-change == 'true'
+    runs-on:
+      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
+        fleet:x86-vllm-build-runner
+        buildspec-override:true
+    timeout-minutes: 720  # 12 hours
+    outputs:
+      runtime-base: ${{ steps.check.outputs.image }}  # written before the existence check, so it is set whether or not a build runs
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Setup buildkitd
+        run: .github/scripts/buildkitd.sh
+
+      - name: ECR login
+        uses: ./.github/actions/ecr-authenticate
+        with:
+          aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+          aws-region: ${{ vars.AWS_REGION }}
+
+      - name: Check or build runtime base
+        id: check
+        run: |  # NOTE(review): the EC2 workflow computes the same TAG, so both workflows share (and may race to push) one image
+          TAG="vllm-runtime-v${{ needs.load-config.outputs.framework-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.python-version }}"
+          IMAGE="${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:${TAG}"
+          echo "image=${IMAGE}" >> $GITHUB_OUTPUT
+
+          if docker manifest inspect "${IMAGE}" >/dev/null 2>&1; then  # reuse an existing tag. NOTE(review): the tag has no Dockerfile or commit component, so runtime-build edits are not rebuilt while it exists - confirm intended
+            echo "Runtime base exists: ${IMAGE}"
+            exit 0
+          fi
+
+          echo "Building runtime base: ${IMAGE}"  # no --build-arg is passed below, so the Dockerfile's ARG defaults apply
+          docker buildx build --progress plain \
+            --target runtime-build \
+            --tag "${IMAGE}" \
+            --push \
+            -f docker/vllm/Dockerfile.amzn2023 .
+
+  build-image:  # builds the PR image through the build-image composite action
+    needs: [check-changes, load-config, build-runtime]
+    if: needs.check-changes.outputs.build-change == 'true'
+    runs-on:
+      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
+        fleet:x86-vllm-build-runner
+        buildspec-override:true
+    timeout-minutes: 720  # 12 hours
+    concurrency:
+      group: ${{ github.workflow }}-build-image-${{ github.event.pull_request.number }}
+      cancel-in-progress: true  # a newer run for the same PR cancels an in-flight build
+    outputs:
+      ci-image: ${{ steps.build.outputs.image-uri }}  # consumed as image-uri by every test job below
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Build image
+        id: build
+        uses: ./.github/actions/build-image
+        with:
+          framework: ${{ needs.load-config.outputs.framework }}
+          target: vllm-omni-sagemaker-amzn2023
+          base-image: nvidia/cuda:12.9.1-devel-amzn2023
+          framework-version: ${{ needs.load-config.outputs.framework-version }}
+          container-type: ${{ needs.load-config.outputs.container-type }}
+          aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+          aws-region: ${{ vars.AWS_REGION }}
+          tag-pr: vllm-omni-${{ needs.load-config.outputs.framework-version }}-gpu-${{ needs.load-config.outputs.python-version }}-${{ needs.load-config.outputs.cuda-version }}-${{ needs.load-config.outputs.os-version }}-sagemaker-pr-${{ github.event.pull_request.number }}
+          dockerfile-path: docker/vllm/Dockerfile.amzn2023
+          arch-type: ${{ needs.load-config.outputs.arch-type }}
+          device-type: ${{ needs.load-config.outputs.device-type }}
+          cuda-version: ${{ needs.load-config.outputs.cuda-version }}
+          python-version: ${{ needs.load-config.outputs.python-version }}
+          os-version: ${{ needs.load-config.outputs.os-version }}
+          contributor: ${{ needs.load-config.outputs.contributor }}
+          customer-type: ${{ needs.load-config.outputs.customer-type }}  # "sagemaker" here makes build_image.sh add the SageMaker labels
+          runtime-base: ${{ needs.build-runtime.outputs.runtime-base }}  # URI from build-runtime; build_image.sh forwards it as --build-arg RUNTIME_BASE
+
+  sanity-test:  # calls the reusable sanity-test workflow against the PR image
+    needs: [check-changes, build-image, load-config]  # the always() in the condition below stops a skipped dependency from auto-skipping this job
+    if: |
+      always() && !failure() && !cancelled() &&
+      needs.check-changes.outputs.build-change == 'true'
+    concurrency:
+      group: ${{ github.workflow }}-sanity-test-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
+    uses: ./.github/workflows/reusable-sanity-tests.yml
+    with:
+      image-uri: ${{ needs.build-image.outputs.ci-image }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+      aws-region: ${{ vars.AWS_REGION }}
+      framework: ${{ needs.load-config.outputs.framework }}
+      framework-version: ${{ needs.load-config.outputs.framework-version }}
+      python-version: ${{ needs.load-config.outputs.python-version }}
+      cuda-version: ${{ needs.load-config.outputs.cuda-version }}
+      os-version: ${{ needs.load-config.outputs.os-version }}
+      customer-type: ${{ needs.load-config.outputs.customer-type }}
+      arch-type: ${{ needs.load-config.outputs.arch-type }}
+      device-type: ${{ needs.load-config.outputs.device-type }}
+      contributor: ${{ needs.load-config.outputs.contributor }}
+      container-type: ${{ needs.load-config.outputs.container-type }}
+
+  security-test:  # calls the reusable security-test workflow against the PR image
+    needs: [build-image, load-config]
+    if: success()  # does not run when build-image was skipped or failed
+    concurrency:
+      group: ${{ github.workflow }}-security-test-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
+    uses: ./.github/workflows/reusable-security-tests.yml
+    with:
+      image-uri: ${{ needs.build-image.outputs.ci-image }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+      aws-region: ${{ vars.AWS_REGION }}
+      framework: ${{ needs.load-config.outputs.framework }}
+      framework-version: ${{ needs.load-config.outputs.framework-version }}
+
+  telemetry-test:  # calls the reusable telemetry-test workflow; also enabled by telemetry-only changes
+    needs: [check-changes, build-image, load-config]
+    if: |
+      always() && !failure() && !cancelled() &&
+      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.telemetry-test-change == 'true')
+    concurrency:
+      group: ${{ github.workflow }}-telemetry-test-${{ github.event.pull_request.number }}
+      cancel-in-progress: false  # unlike the other test jobs, an in-flight run is not cancelled by a newer one
+    uses: ./.github/workflows/reusable-telemetry-tests.yml
+    with:
+      image-uri: ${{ needs.build-image.outputs.ci-image }}  # NOTE(review): empty when only test/telemetry/** changed, because build-image is skipped yet this job still runs - confirm the reusable workflow copes
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+      aws-region: ${{ vars.AWS_REGION }}
+      framework: ${{ needs.load-config.outputs.framework }}
+      framework-version: ${{ needs.load-config.outputs.framework-version }}
+      container-type: ${{ needs.load-config.outputs.container-type }}
+
+  omni-model-smoke-tests:  # runs the per-model smoke-test matrix defined in reusable-vllm-omni-model-tests.yml
+    needs: [build-image, load-config]
+    if: success()  # does not run when build-image was skipped or failed
+    concurrency:
+      group: ${{ github.workflow }}-omni-model-tests-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
+    uses: ./.github/workflows/reusable-vllm-omni-model-tests.yml
+    with:
+      image-uri: ${{ needs.build-image.outputs.ci-image }}
+      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+      aws-region: ${{ vars.AWS_REGION }}
+      customer-type: ${{ needs.load-config.outputs.customer-type }}
+    secrets: inherit  # the reusable workflow declares an optional HF_TOKEN secret
+
+  sagemaker-endpoint-test:  # runs the pytest suite under test/vllm-omni/sagemaker against the PR image, then deletes leftover endpoints
+    needs: [build-image, load-config]
+    if: success()  # does not run when build-image was skipped or failed
+    runs-on:
+      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
+        fleet:default-runner
+        buildspec-override:true
+    concurrency:
+      group: ${{ github.workflow }}-sm-endpoint-${{ github.event.pull_request.number }}
+      cancel-in-progress: true  # NOTE(review): if a run is cancelled before the cleanup step finishes, its endpoints may be left behind - confirm
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Install test dependencies
+        run: |
+          uv venv --python 3.12
+          source .venv/bin/activate
+          uv pip install -r test/requirements.txt
+          uv pip install -r test/vllm-omni/sagemaker/requirements.txt
+
+      - name: Run SageMaker endpoint test
+        run: |
+          source .venv/bin/activate
+          cd test/
+          python3 -m pytest -vs -rA --image-uri ${{ needs.build-image.outputs.ci-image }} vllm-omni/sagemaker  # the URI is substituted into the script text before the shell runs
+
+      - name: Cleanup orphaned endpoints
+        if: always()  # runs even when an earlier step in this job failed
+        run: |
+          source .venv/bin/activate  # the step fails here if the install step never created .venv
+          python3 -c "
+          import boto3
+          sm = boto3.client('sagemaker')
+          for ep in sm.list_endpoints(NameContains='vllm-omni', StatusEquals='InService').get('Endpoints', []):  # NOTE(review): matches every InService endpoint with vllm-omni in its name, not only ones from this run; only the first result page is read - confirm concurrent PR runs are safe
+              name = ep['EndpointName']
+              print(f'Deleting orphaned endpoint: {name}')
+              sm.delete_endpoint(EndpointName=name)
+              sm.delete_endpoint_config(EndpointConfigName=name)  # assumes the endpoint config has the same name as the endpoint - TODO confirm against the test code
+          "
diff --git a/.github/workflows/reusable-vllm-omni-model-tests.yml b/.github/workflows/reusable-vllm-omni-model-tests.yml
new file mode 100644
index 000000000000..dad843e1c853
--- /dev/null
+++ b/.github/workflows/reusable-vllm-omni-model-tests.yml
@@ -0,0 +1,180 @@
+name: Reusable vLLM-Omni Model Smoke Tests
+
+permissions:
+  contents: read
+
+on:
+  workflow_call:
+    inputs:
+      image-uri:
+        description: "Image URI to test"
+        required: true
+        type: string
+      aws-account-id:
+        description: "AWS account ID for ECR authentication"
+        required: true
+        type: string
+      aws-region:
+        description: "AWS region for ECR authentication"
+        required: true
+        type: string
+      customer-type:
+        description: "Customer type: ec2 or sagemaker"
+        required: true
+        type: string
+    secrets:
+      HF_TOKEN:
+        description: "HuggingFace token for downloading models"
+        required: false
+
+jobs:
+  load-models:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.parse.outputs.matrix }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Parse model config
+        id: parse
+        run: |
+          python3 -c "
+          import yaml, json
+          with open('.github/config/vllm-omni-model-tests.yml') as f:
+              cfg = yaml.safe_load(f)
+          prefix = cfg.get('s3_prefix', '')
+          models = cfg.get('smoke-test', {}).get('codebuild-fleet', [])
+          for m in models:
+              if 's3_model' in m:
+                  m['s3_path'] = prefix + '/' + m.pop('s3_model')
+                  m['model_source'] = 's3'
+              elif 'hf_model' in m:
+                  m['model_source'] = 'hf'
+          print(f'matrix={json.dumps(models)}')
+          " >> "$GITHUB_OUTPUT"
+
+  smoke-test:
+    name: smoke-test (${{ matrix.model.name }})
+    needs: load-models
+    if: needs.load-models.outputs.matrix != '[]'
+    strategy:
+      fail-fast: false
+      matrix:
+        model: ${{ fromJson(needs.load-models.outputs.matrix) }}
+    runs-on:
+      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
+        fleet:${{ matrix.model.fleet }}
+        buildspec-override:true
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: ECR login
+        uses: ./.github/actions/ecr-authenticate
+        with:
+          aws-account-id: ${{ inputs.aws-account-id }}
+          aws-region: ${{ inputs.aws-region }}
+          image-uri: ${{ inputs.image-uri }}
+
+      - name: Download model from S3
+        if: matrix.model.model_source == 's3'
+        uses: ./.github/actions/download-model
+        id: model
+        with:
+          s3-path: ${{ matrix.model.s3_path }}
+          model-name: ${{ matrix.model.name }}
+
+      - name: Resolve model path
+        id: resolve
+        run: |
+          # s3 source: serve /models/<name> and bind-mount host /dlc-models at /models.
+          # any other source: pass the hf_model id through and mount nothing.
+          if [ "${{ matrix.model.model_source }}" != "s3" ]; then
+            printf '%s\n' "model_path=${{ matrix.model.hf_model }}" "volume=" >> "$GITHUB_OUTPUT"
+          else
+            printf '%s\n' "model_path=/models/${{ matrix.model.name }}" "volume=-v /dlc-models:/models" >> "$GITHUB_OUTPUT"
+          fi
+
+      # EC2: entrypoint accepts CLI args directly; HF_TOKEN is forwarded from the step env, never inlined
+      - name: Start container (EC2)
+        if: inputs.customer-type == 'ec2'
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          docker pull ${{ inputs.image-uri }}
+          CONTAINER_ID=$(docker run -d --gpus all --shm-size=4g \
+            ${{ steps.resolve.outputs.volume }} \
+            -e HF_TOKEN -p 8080:8080 \
+            ${{ inputs.image-uri }} \
+            --model ${{ steps.resolve.outputs.model_path }} \
+            --port 8080 --stage-init-timeout 600 \
+            ${{ matrix.model.extra_args }})
+          echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
+
+      - name: Convert extra_args to SM env vars
+        if: inputs.customer-type == 'sagemaker'
+        id: sm-env
+        run: |
+          # Turn "--key value" pairs into "-e SM_VLLM_KEY=value" docker flags; a bare "--key" becomes SM_VLLM_KEY=true
+          EXTRA_ENV=""
+          ARGS="${{ matrix.model.extra_args }}"
+          while [[ -n "$ARGS" ]]; do
+            if [[ "$ARGS" =~ ^--([a-z][a-z0-9-]*)[[:space:]]*(.*) ]]; then
+              KEY=$(echo "${BASH_REMATCH[1]}" | tr '-' '_' | tr '[:lower:]' '[:upper:]')
+              REST="${BASH_REMATCH[2]}"
+              if [[ "$REST" =~ ^--[a-z] ]] || [[ -z "$REST" ]]; then
+                EXTRA_ENV="$EXTRA_ENV -e SM_VLLM_${KEY}=true"
+                ARGS="$REST"
+              else
+                VALUE="${REST%% --*}"
+                EXTRA_ENV="$EXTRA_ENV -e SM_VLLM_${KEY}=${VALUE}"
+                ARGS="${REST#"$VALUE"}"
+              fi
+              ARGS="${ARGS# }"
+            else
+              break
+            fi
+          done
+          echo "env_flags=$EXTRA_ENV" >> $GITHUB_OUTPUT
+
+      # SageMaker: entrypoint reads SM_VLLM_* env vars; HF_TOKEN is forwarded from the step env, never inlined
+      - name: Start container (SageMaker)
+        if: inputs.customer-type == 'sagemaker'
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          docker pull ${{ inputs.image-uri }}
+          CONTAINER_ID=$(docker run -d --gpus all --shm-size=4g \
+            ${{ steps.resolve.outputs.volume }} \
+            -e SM_VLLM_MODEL=${{ steps.resolve.outputs.model_path }} \
+            -e SM_VLLM_STAGE_INIT_TIMEOUT=600 -e HF_TOKEN \
+            ${{ steps.sm-env.outputs.env_flags }} \
+            -p 8080:8080 ${{ inputs.image-uri }})
+          echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
+
+      - name: Copy test scripts
+        run: |
+          docker cp test/vllm-omni/scripts/vllm_omni_${{ inputs.customer-type }}_smoke_test.sh \
+            ${CONTAINER_ID}:/tmp/smoke_test.sh
+
+      - name: Run smoke test
+        env: # config values reach the script as data, so quotes inside them cannot break or inject shell
+          ROUTE: ${{ matrix.model.route }}
+          REQUEST: ${{ matrix.model.test_request }}
+          VALIDATE: ${{ matrix.model.validate }}
+          CONTENT_TYPE: ${{ matrix.model.content_type || 'application/json' }}
+        run: docker exec "${CONTAINER_ID}" bash /tmp/smoke_test.sh "$ROUTE" "$REQUEST" "$VALIDATE" "$CONTENT_TYPE"
+
+      - name: Dump container logs
+        if: always()
+        run: |
+          docker logs ${CONTAINER_ID} 2>&1 | tail -500 || true
+
+      - name: Cleanup
+        if: always()
+        run: |
+          kill ${{ steps.model.outputs.lock-pid }} 2>/dev/null || true
+          docker stop ${CONTAINER_ID} 2>/dev/null || true
+          docker rm -f ${CONTAINER_ID} 2>/dev/null || true
+          docker rmi ${{ inputs.image-uri }} 2>/dev/null || true
diff --git a/docker/sglang/Dockerfile.amzn2023 b/docker/sglang/Dockerfile.amzn2023
index 121ca2d22d2f..901ca7f8d0b4 100644
--- a/docker/sglang/Dockerfile.amzn2023
+++ b/docker/sglang/Dockerfile.amzn2023
@@ -241,7 +241,8 @@ RUN uv pip install --system --no-cache \
   "pillow>=12.1.1" \
   "python_multipart>=0.0.22" \
   "xgrammar>=0.1.32" \
-  "setuptools>=78.1.1"
+  "setuptools>=78.1.1" \
+  "aiohttp>=3.13.4"
 
 # Re-pin NCCL/cuDNN/cuSparseLt after CVE patches (transitive deps may downgrade or remove them)
 # cuSparseLt installed without --no-deps in case it wasn't present from builder
diff --git a/docker/vllm/Dockerfile.amzn2023 b/docker/vllm/Dockerfile.amzn2023
index 2c580138665a..410e721df89d 100644
--- a/docker/vllm/Dockerfile.amzn2023
+++ b/docker/vllm/Dockerfile.amzn2023
@@ -1,6 +1,11 @@
 ARG CUDA_VERSION=12.9.1
 ARG PYTHON_VERSION=3.12
 
+# Pre-built runtime image. When set, skips the compile stages (source/build/deps)
+# and uses this image directly as the runtime base. Build it with:
+#   docker buildx build --target runtime --tag <ecr>/vllm-runtime:<tag> --push ...
+ARG RUNTIME_BASE=""
+
 # =============================================================================
 # STAGE 0: source — clone vLLM and apply patches
 # =============================================================================
@@ -201,8 +206,9 @@ RUN PATH="/opt/venv/bin:${PATH}" bash /tmp/setup_oss_compliance.sh python${PYTHO
 
 # =============================================================================
 # STAGE 3: runtime — minimal image with clean venv
+# Built from scratch (compile path) or pulled from pre-built RUNTIME_BASE.
 # =============================================================================
-FROM nvidia/cuda:${CUDA_VERSION}-runtime-amzn2023 AS runtime
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-amzn2023 AS runtime-build
 
 ARG CUDA_VERSION
 ARG PYTHON_VERSION=3.12
@@ -238,6 +244,10 @@ ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:${LD_LIBRARY_P
 ENV VLLM_USAGE_SOURCE=production-docker-image
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 
+# Pre-built runtime (fast path) — used when RUNTIME_BASE is set
+ARG RUNTIME_BASE
+FROM ${RUNTIME_BASE:-runtime-build} AS runtime
+
 # =============================================================================
 # STAGE 4: DLC overlay — Amazon DLC customizations on top of vLLM runtime
 # =============================================================================
@@ -339,4 +349,137 @@ RUN dnf upgrade -y --security --releasever latest --setopt=install_weak_deps=Fal
 COPY ./scripts/vllm/sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
 RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh
 
+ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
+
+# =============================================================================
+# STAGE: omni-deps — install vllm-omni on top of runtime venv
+# =============================================================================
+FROM runtime AS omni-deps
+
+ARG VLLM_OMNI_VERSION=0.18.0
+
+# System deps for omni-modality (TTS, audio, image/video)
+# Enable SPAL (Supplementary Packages for Amazon Linux) for espeak-ng, sox, ffmpeg
+RUN dnf upgrade -y --releasever=latest --setopt=install_weak_deps=False system-release \
+  && dnf install -y spal-release \
+  && dnf install -y --setopt=install_weak_deps=False espeak-ng sox ffmpeg-free \
+  && dnf clean all && rm -rf /var/cache/dnf
+
+# Install vllm-omni (pure Python, no compilation)
+RUN --mount=type=cache,target=/root/.cache/uv uv pip install vllm-omni==${VLLM_OMNI_VERSION}
+
+# =============================================================================
+# STAGE: builder-oss-omni — OSS compliance for omni venv
+# =============================================================================
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-amzn2023 AS builder-oss-omni
+ARG PYTHON_VERSION
+RUN dnf install -y --allowerasing python${PYTHON_VERSION} curl && dnf clean all
+COPY --from=omni-deps /opt/venv /opt/venv
+COPY scripts/common/setup_oss_compliance.sh /tmp/setup_oss_compliance.sh
+RUN PATH="/opt/venv/bin:${PATH}" bash /tmp/setup_oss_compliance.sh python${PYTHON_VERSION} \
+  && touch /root/THIRD_PARTY_SOURCE_CODE_URLS
+
+# =============================================================================
+# STAGE: omni-base — DLC overlay for vLLM-Omni
+# =============================================================================
+FROM omni-deps AS omni-base
+
+ARG PYTHON="python3"
+ARG PYTHON_VERSION=3.12
+ARG CUDA_VERSION
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+LABEL dlc_minor_version="0"
+
+ENV LANG=C.UTF-8 \
+  LC_ALL=C.UTF-8 \
+  DLC_CONTAINER_TYPE=general \
+  PYTHONDONTWRITEBYTECODE=1 \
+  PYTHONUNBUFFERED=1 \
+  PYTHONIOENCODING=UTF-8 \
+  LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
+  PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"
+
+WORKDIR /
+
+# Install DLC Python dependencies
+RUN uv pip install --no-cache-dir botocore
+
+# Patch CVEs
+RUN uv pip install --no-cache-dir \
+  "pillow>=12.1.1" \
+  "xgrammar>=0.1.32" \
+  "PyJWT>=2.12.0" \
+  "cbor2>=5.9.0" \
+  "gradio>=6.7.0"
+
+COPY ./scripts/telemetry/deep_learning_container.py /usr/local/bin/deep_learning_container.py
+COPY ./scripts/telemetry/bash_telemetry.sh.template /tmp/bash_telemetry.sh.template
+
+ARG FRAMEWORK
+ARG FRAMEWORK_VERSION
+ARG CONTAINER_TYPE
+
+# telemetry: every step must succeed (rm -f already tolerates missing paths, so no "|| true")
+RUN chmod +x /usr/local/bin/deep_learning_container.py \
+  && sed -e "s/{{FRAMEWORK}}/${FRAMEWORK}/g" \
+    -e "s/{{FRAMEWORK_VERSION}}/${FRAMEWORK_VERSION}/g" \
+    -e "s/{{CONTAINER_TYPE}}/${CONTAINER_TYPE}/g" \
+    /tmp/bash_telemetry.sh.template >/usr/local/bin/bash_telemetry.sh \
+  && chmod +x /usr/local/bin/bash_telemetry.sh \
+  && rm /tmp/bash_telemetry.sh.template \
+  && echo 'source /usr/local/bin/bash_telemetry.sh' >>/etc/bashrc \
+  && echo 'source /usr/local/bin/bash_telemetry.sh' >>/root/.bashrc \
+  && ln -sf /opt/venv/bin/python3 /usr/bin/python \
+  && rm -rf /tmp/tmp* \
+  && rm -rf /tmp/uv* \
+  && rm -rf /var/cache/dnf \
+  && rm -rf /root/.cache
+
+# OSS compliance (from omni-specific builder)
+COPY --from=builder-oss-omni /root/THIRD_PARTY_SOURCE_CODE_URLS /root/THIRD_PARTY_SOURCE_CODE_URLS
+COPY --from=builder-oss-omni /root/PYTHON_PACKAGES_LICENSES /root/PYTHON_PACKAGES_LICENSES
+COPY --from=builder-oss-omni /root/LINUX_PACKAGES_LICENSES /root/LINUX_PACKAGES_LICENSES
+COPY --from=builder-oss-omni /root/BUILD_FROM_SOURCE_PACKAGES_LICENCES /root/BUILD_FROM_SOURCE_PACKAGES_LICENCES
+COPY --from=builder-oss-omni /usr/local/bin/testOSSCompliance /usr/local/bin/testOSSCompliance
+
+# install EFA
+COPY ./scripts/common/install_efa_amzn2023.sh install_efa_amzn2023.sh
+ARG EFA_VERSION="1.47.0"
+RUN echo -e '[cuda-rhel9]\nname=cuda-rhel9\nbaseurl=https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64\nenabled=1\ngpgcheck=0' >/etc/yum.repos.d/cuda-rhel9.repo \
+  && dnf install -y --setopt=install_weak_deps=False libnccl libnccl-devel \
+  && ldconfig \
+  && bash install_efa_amzn2023.sh ${EFA_VERSION} \
+  && rm install_efa_amzn2023.sh \
+  && dnf remove -y libnccl-devel \
+  && dnf clean all && rm -rf /var/cache/dnf \
+  && rm -rf /usr/local/cuda/bin/nvdisasm*
+
+# ====================== omni ec2 =========================================
+FROM omni-base AS vllm-omni-ec2-amzn2023
+
+ARG CACHE_REFRESH=0
+RUN dnf upgrade -y --security --releasever latest --setopt=install_weak_deps=False \
+  && dnf clean all && rm -rf /var/cache/dnf /tmp/* \
+  && ln -sf /opt/venv/bin/python3 /usr/bin/python3
+
+COPY ./scripts/vllm/omni_dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
+RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
+
+ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"]
+
+# ====================== omni sagemaker =========================================
+FROM omni-base AS vllm-omni-sagemaker-amzn2023
+
+ARG CACHE_REFRESH=0
+RUN dnf upgrade -y --security --releasever latest --setopt=install_weak_deps=False \
+  && dnf clean all && rm -rf /var/cache/dnf /tmp/* \
+  && ln -sf /opt/venv/bin/python3 /usr/bin/python3
+
+COPY ./scripts/vllm/omni_sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
+COPY ./scripts/vllm/omni_sagemaker_serve.py /usr/local/bin/omni_sagemaker_serve.py
+ENV PYTHONPATH="/usr/local/bin:${PYTHONPATH}"
+RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh
+
 ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
\ No newline at end of file
diff --git a/scripts/telemetry/deep_learning_container.py b/scripts/telemetry/deep_learning_container.py
index a9122e2bce64..910a2c19dca6 100755
--- a/scripts/telemetry/deep_learning_container.py
+++ b/scripts/telemetry/deep_learning_container.py
@@ -228,7 +228,17 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--framework",
-        choices=["tensorflow", "mxnet", "pytorch", "base", "vllm", "sglang", "lambda", "ray"],
+        choices=[
+            "tensorflow",
+            "mxnet",
+            "pytorch",
+            "base",
+            "vllm",
+            "sglang",
+            "lambda",
+            "ray",
+            "vllm-omni",
+        ],
         help="framework of container image.",
         required=True,
     )
diff --git a/scripts/vllm/omni_dockerd_entrypoint.sh b/scripts/vllm/omni_dockerd_entrypoint.sh
new file mode 100755
index 000000000000..82166d04814c
--- /dev/null
+++ b/scripts/vllm/omni_dockerd_entrypoint.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Run the telemetry hook on a best-effort basis: its output is discarded and
+# a missing or failing script never blocks server startup.
+bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true
+
+exec vllm serve --omni "$@"
diff --git a/scripts/vllm/omni_sagemaker_entrypoint.sh b/scripts/vllm/omni_sagemaker_entrypoint.sh
new file mode 100755
index 000000000000..94b15f0a4091
--- /dev/null
+++ b/scripts/vllm/omni_sagemaker_entrypoint.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Run the telemetry hook on a best-effort basis: its output is discarded and
+# a missing or failing script never blocks server startup.
+bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true
+
+PREFIX="SM_VLLM_"
+ARG_PREFIX="--"
+
+ARGS=(--port 8080)
+
+# Auto-detect model if SM_VLLM_MODEL is not set
+if [ -z "${SM_VLLM_MODEL}" ]; then
+    if [ -d "/opt/ml/model" ] && [ "$(ls -A /opt/ml/model 2>/dev/null)" ]; then
+        echo "INFO: SM_VLLM_MODEL not set, auto-detected model at /opt/ml/model"
+        ARGS+=(--model /opt/ml/model)
+    elif [ -n "${HF_MODEL_ID}" ]; then
+        echo "INFO: SM_VLLM_MODEL not set, using HF_MODEL_ID=${HF_MODEL_ID}"
+        ARGS+=(--model "${HF_MODEL_ID}")
+    else
+        echo "WARNING: No model specified. Set SM_VLLM_MODEL, HF_MODEL_ID, or mount a model to /opt/ml/model."
+    fi
+fi
+
+# Map each SM_VLLM_<NAME>=<value> environment entry onto vllm CLI arguments (value compared
+# case-insensitively): true -> bare --<name>, false -> nothing, otherwise --<name> [value].
+while IFS='=' read -r key value; do
+    opt="${ARG_PREFIX}$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')"
+    case "$(echo "$value" | tr '[:upper:]' '[:lower:]')" in
+        false)
+            continue ;;
+        true)
+            ARGS+=("$opt") ;;
+        *)
+            ARGS+=("$opt")
+            # An empty value leaves the option without an operand.
+            [ -z "$value" ] || ARGS+=("$value")
+            ;;
+    esac
+done < <(env | grep "^${PREFIX}")
+
+# Add SageMaker routing middleware to dispatch /invocations to the correct
+# vllm-omni endpoint (e.g. /v1/audio/speech for TTS)
+ARGS+=(--middleware omni_sagemaker_serve.SageMakerRouteMiddleware)
+
+exec vllm serve --omni "${ARGS[@]}"
diff --git a/scripts/vllm/omni_sagemaker_serve.py b/scripts/vllm/omni_sagemaker_serve.py
new file mode 100644
index 000000000000..7db1bb80aeaf
--- /dev/null
+++ b/scripts/vllm/omni_sagemaker_serve.py
@@ -0,0 +1,50 @@
+"""SageMaker routing middleware for vLLM-Omni.
+
+Routes /invocations requests based on the X-Amzn-SageMaker-Custom-Attributes
+header. Clients specify the target endpoint via route=<path>, e.g.:
+
+  CustomAttributes="route=/v1/audio/speech"
+
+If no route is specified, falls through to vLLM's built-in /invocations
+handler (chat/completion/embed).
+
+Usage: vllm serve --omni --middleware omni_sagemaker_serve.SageMakerRouteMiddleware
+"""
+
+import logging
+import re
+
+from starlette.types import ASGIApp, Receive, Scope, Send
+
+logger = logging.getLogger("omni_sagemaker")
+
+
+def _parse_route(headers: list[tuple[bytes, bytes]]) -> str | None:
+    """Extract route=<path> from the SageMaker custom attributes header, or None if absent."""
+    for key, value in headers:
+        if key.lower() == b"x-amzn-sagemaker-custom-attributes":
+            m = re.search(r"route=(/[^\s,]+)", value.decode("latin-1"))  # latin-1 cannot fail
+            return m.group(1) if m else None
+    return None
+
+
+class SageMakerRouteMiddleware:
+    """Rewrite the path of /invocations requests that name an explicit route.
+
+    With route=<path> in the custom attributes header, the request is
+    forwarded to <path>. Without it, the scope is passed on unchanged, so
+    /invocations reaches whatever handler the wrapped app registers for it.
+    """
+
+    def __init__(self, app: ASGIApp) -> None:
+        self.app = app
+
+    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
+        is_invocation = scope["type"] == "http" and scope["path"] == "/invocations"
+        target = _parse_route(scope.get("headers", [])) if is_invocation else None
+        if target:
+            logger.info("Rerouting /invocations -> %s", target)
+            # Build a new mapping so the caller's scope object is left unmodified.
+            scope = {**scope, "path": target, "raw_path": target.encode()}
+
+        await self.app(scope, receive, send)
diff --git a/test/security/data/ecr_scan_allowlist/vllm/framework_allowlist.json b/test/security/data/ecr_scan_allowlist/vllm/framework_allowlist.json
index 95591f599e4a..1dae8903a160 100644
--- a/test/security/data/ecr_scan_allowlist/vllm/framework_allowlist.json
+++ b/test/security/data/ecr_scan_allowlist/vllm/framework_allowlist.json
@@ -103,11 +103,6 @@
         "vulnerability_id": "CVE-2026-31812",
         "reason": "Coming in as a dependency from the latest uv 0.10.9"
     },
-    {
-        "vulnerability_id": "CVE-2026-33055",
-        "reason": "Rust tar crate 0.4.44 bundled in uv binary, fix requires uv upstream update to tar>=0.4.45",
-        "review_by": "2026-04-06"
-    },
     {
         "vulnerability_id": "CVE-2026-27893",
         "reason": "vllm 0.10.2 RayServe image - trust_remote_code=True hardcoded, fixed in vllm>=0.18.0. RayServe image not updated in this PR."
@@ -124,4 +119,4 @@
         "vulnerability_id": "CVE-2026-34520",
         "reason": "aiohttp 3.12.15 vendored inside ray/_private/runtime_env/agent/thirdparty_files/, unpatchable without Ray upgrade"
     }
-]
\ No newline at end of file
+]
diff --git a/test/vllm-omni/sagemaker/requirements.txt b/test/vllm-omni/sagemaker/requirements.txt
new file mode 100644
index 000000000000..6a4743d65577
--- /dev/null
+++ b/test/vllm-omni/sagemaker/requirements.txt
@@ -0,0 +1,2 @@
+sagemaker>=2,<3
+starlette
diff --git a/test/vllm-omni/sagemaker/test_sagemaker_middleware.py b/test/vllm-omni/sagemaker/test_sagemaker_middleware.py
new file mode 100644
index 000000000000..fa7ce616a6e3
--- /dev/null
+++ b/test/vllm-omni/sagemaker/test_sagemaker_middleware.py
@@ -0,0 +1,116 @@
+"""Unit tests for SageMaker routing middleware."""
+
+import asyncio
+import os
+import sys
+
+# Allow importing omni_sagemaker_serve from scripts/vllm/
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", "scripts", "vllm"))
+
+import pytest
+from omni_sagemaker_serve import SageMakerRouteMiddleware, _parse_route
+
+
+class TestParseRoute:
+    def test_extracts_route(self):
+        headers = [(b"x-amzn-sagemaker-custom-attributes", b"route=/v1/audio/speech")]
+        assert _parse_route(headers) == "/v1/audio/speech"
+
+    def test_extracts_route_with_extra_attrs(self):
+        headers = [(b"x-amzn-sagemaker-custom-attributes", b"foo=bar,route=/v1/audio/speech,baz=1")]
+        assert _parse_route(headers) == "/v1/audio/speech"
+
+    def test_no_route(self):
+        headers = [(b"x-amzn-sagemaker-custom-attributes", b"foo=bar")]
+        assert _parse_route(headers) is None
+
+    def test_no_header(self):
+        assert _parse_route([]) is None
+
+    def test_case_insensitive_header(self):
+        headers = [(b"X-Amzn-SageMaker-Custom-Attributes", b"route=/v1/chat/completions")]
+        assert _parse_route(headers) == "/v1/chat/completions"
+
+
+class TestMiddleware:
+    """Drive SageMakerRouteMiddleware with hand-built ASGI scopes."""
+
+    @pytest.fixture
+    def captured(self):
+        return {}
+
+    @pytest.fixture
+    def app(self, captured):
+        async def inner(scope, receive, send):
+            # Record the forwarded path pair; the websocket scope below carries no raw_path.
+            captured.update(path=scope["path"], raw_path=scope.get("raw_path"))
+
+        return inner
+
+    @pytest.fixture
+    def middleware(self, app):
+        return SageMakerRouteMiddleware(app)
+
+    def _make_scope(self, path="/invocations", headers=None):
+        return {
+            "type": "http",
+            "path": path,
+            "raw_path": path.encode(),
+            "headers": headers or [],
+        }
+
+    def _run(self, coro):
+        # asyncio.run owns its loop; get_event_loop() is deprecated when no loop is running.
+        return asyncio.run(coro)
+
+    def test_rewrites_with_route_header(self, middleware, captured):
+        scope = self._make_scope(
+            headers=[(b"x-amzn-sagemaker-custom-attributes", b"route=/v1/audio/speech")]
+        )
+        self._run(middleware(scope, None, None))
+        assert captured["path"] == "/v1/audio/speech"
+
+    def test_falls_through_without_route(self, middleware, captured):
+        scope = self._make_scope()
+        self._run(middleware(scope, None, None))
+        assert captured["path"] == "/invocations"
+
+    def test_ignores_non_invocations(self, middleware, captured):
+        scope = self._make_scope(path="/health")
+        self._run(middleware(scope, None, None))
+        assert captured["path"] == "/health"
+
+    def test_ignores_non_http(self, middleware, captured):
+        scope = {"type": "websocket", "path": "/invocations"}
+        self._run(middleware(scope, None, None))
+        assert captured["path"] == "/invocations"
+
+    def test_rewrites_raw_path(self, middleware, captured):
+        """raw_path must be rewritten together with path."""
+        scope = self._make_scope(
+            headers=[(b"x-amzn-sagemaker-custom-attributes", b"route=/v1/completions")]
+        )
+        self._run(middleware(scope, None, None))
+        assert captured["path"] == "/v1/completions"
+        assert captured["raw_path"] == b"/v1/completions"
+
+    def test_adapter_attrs_without_route_falls_through(self, middleware, captured):
+        """Adapter attributes (no route=) should fall through to /invocations."""
+        scope = self._make_scope(
+            headers=[(b"x-amzn-sagemaker-custom-attributes", b"adapter=my-lora-adapter")]
+        )
+        self._run(middleware(scope, None, None))
+        assert captured["path"] == "/invocations"
+
+    def test_adapter_attrs_with_route_rewrites(self, middleware, captured):
+        """Both adapter and route attrs — route takes effect, adapter preserved in headers."""
+        scope = self._make_scope(
+            headers=[
+                (
+                    b"x-amzn-sagemaker-custom-attributes",
+                    b"adapter=my-lora,route=/v1/audio/speech",
+                ),
+            ]
+        )
+        self._run(middleware(scope, None, None))
+        assert captured["path"] == "/v1/audio/speech"
diff --git a/test/vllm-omni/sagemaker/test_sm_omni_endpoint.py b/test/vllm-omni/sagemaker/test_sm_omni_endpoint.py
new file mode 100644
index 000000000000..6920c78a09c8
--- /dev/null
+++ b/test/vllm-omni/sagemaker/test_sm_omni_endpoint.py
@@ -0,0 +1,235 @@
+"""Integration test for vLLM-Omni SageMaker endpoint"""
+
+import json
+import logging
+import time
+
+import pytest
+from sagemaker.async_inference import AsyncInferenceConfig
+from sagemaker.model import Model
+from sagemaker.predictor import Predictor
+from sagemaker.serializers import JSONSerializer
+from test_utils import clean_string, random_suffix_name, wait_for_status
+from test_utils.constants import INFERENCE_AMI_VERSION, SAGEMAKER_ROLE
+from test_utils.huggingface_helper import get_hf_token
+
+LOGGER = logging.getLogger(__name__)
+LOGGER.setLevel(logging.INFO)
+
+ENDPOINT_WAIT_PERIOD = 60
+ENDPOINT_WAIT_LENGTH = 30
+ENDPOINT_INSERVICE = "InService"
+
+
+def get_endpoint_status(sagemaker_client, endpoint_name):
+    """Return the EndpointStatus field of describe_endpoint for endpoint_name."""
+    return sagemaker_client.describe_endpoint(EndpointName=endpoint_name)["EndpointStatus"]
+
+
+@pytest.fixture(scope="function")
+def model_id(request):
+    return request.param
+
+
+@pytest.fixture(scope="function")
+def instance_type(request):
+    return request.param
+
+
+@pytest.fixture(scope="function")
+def model_package(aws_session, image_uri, model_id):
+    """Yield a SageMaker Model serving model_id from image_uri; the model is deleted on teardown.
+
+    The model name uses the last path component of model_id, so ids without an org prefix work too.
+    """
+    sagemaker_client = aws_session.sagemaker
+    cleaned_id = clean_string(model_id.split("/")[-1], "_./")
+    model_name = random_suffix_name(f"vllm-omni-{cleaned_id}", 50)
+
+    try:
+        LOGGER.info(f"Creating SageMaker model: {model_name}")
+        model = Model(
+            name=model_name,
+            image_uri=image_uri,
+            role=SAGEMAKER_ROLE,
+            predictor_cls=Predictor,
+            env={"SM_VLLM_MODEL": model_id, "HF_TOKEN": get_hf_token(aws_session)},
+        )
+        yield model
+    finally:
+        LOGGER.info(f"Deleting model: {model_name}")
+        sagemaker_client.delete_model(ModelName=model_name)
+
+
+@pytest.fixture(scope="function")
+def model_endpoint(aws_session, model_package, instance_type):
+    """Yield a predictor for model_package on instance_type; the endpoint is deleted afterwards."""
+    sm_client = aws_session.sagemaker
+    instance_slug = clean_string(instance_type, "_./")
+    endpoint_name = random_suffix_name(f"vllm-omni-{instance_slug}", 50)
+
+    try:
+        LOGGER.info("Starting endpoint deployment...")
+        predictor = model_package.deploy(
+            instance_type=instance_type,
+            initial_instance_count=1,
+            endpoint_name=endpoint_name,
+            inference_ami_version=INFERENCE_AMI_VERSION,
+            serializer=JSONSerializer(),
+            wait=True,
+        )
+
+        LOGGER.info(f"Waiting for endpoint {ENDPOINT_INSERVICE} status...")
+        assert wait_for_status(
+            ENDPOINT_INSERVICE,
+            ENDPOINT_WAIT_PERIOD,
+            ENDPOINT_WAIT_LENGTH,
+            get_endpoint_status,
+            sm_client,
+            endpoint_name,
+        )
+        yield predictor
+    finally:
+        LOGGER.info(f"Deleting endpoint: {endpoint_name}")
+        sm_client.delete_endpoint(EndpointName=endpoint_name)
+        sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name)
+
+
+@pytest.mark.parametrize("instance_type", ["ml.g5.xlarge"], indirect=True)
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"], indirect=True)
+def test_vllm_omni_tts_endpoint(model_endpoint):
+    """TTS via /invocations, sent with the route=/v1/audio/speech custom attribute."""
+    predictor = model_endpoint
+    sm_runtime = predictor.sagemaker_session.sagemaker_runtime_client
+
+    payload = json.dumps(
+        {
+            "input": "Hello, this is a test of the text to speech system.",
+            "voice": "vivian",
+            "language": "English",
+        }
+    )
+
+    LOGGER.info("Sending TTS request via /invocations with route=/v1/audio/speech")
+    # First request triggers torch.compile + CUDA graph capture (~67s),
+    # which exceeds SageMaker's 60s invoke timeout. Retry after warmup completes.
+    # See https://github.com/aws/sagemaker-python-sdk/issues/1119
+    max_attempts = 3
+    retry_delay_s = 30
+    for attempt in range(max_attempts):
+        try:
+            response = sm_runtime.invoke_endpoint(
+                EndpointName=predictor.endpoint_name,
+                ContentType="application/json",
+                Body=payload,
+                CustomAttributes="route=/v1/audio/speech",
+            )
+            break
+        except Exception as e:
+            LOGGER.warning(f"Attempt {attempt + 1}/{max_attempts} failed: {e}")
+            if attempt == max_attempts - 1:
+                raise
+            time.sleep(retry_delay_s)
+
+    audio_bytes = response["Body"].read()
+    LOGGER.info(f"TTS audio response: {len(audio_bytes)} bytes")
+    assert len(audio_bytes) > 1000, f"TTS output too small: {len(audio_bytes)} bytes"
+    LOGGER.info("TTS endpoint test PASSED")
+
+
+@pytest.fixture(scope="function")
+def async_endpoint(aws_session, model_package, instance_type):
+    """Yield (predictor, s3_output) for an async inference endpoint (no 60s timeout limit)."""
+    sm_client = aws_session.sagemaker
+    instance_slug = clean_string(instance_type, "_./")
+    endpoint_name = random_suffix_name(f"vllm-omni-async-{instance_slug}", 50)
+    caller_account = aws_session.sts.get_caller_identity()["Account"]
+    # Results land under the sagemaker-<region>-<account> bucket of the calling account.
+    s3_output = f"s3://sagemaker-{aws_session.region}-{caller_account}/vllm-omni-async-output/"
+
+    try:
+        LOGGER.info(f"Deploying async endpoint: {endpoint_name}")
+        predictor = model_package.deploy(
+            instance_type=instance_type,
+            initial_instance_count=1,
+            endpoint_name=endpoint_name,
+            inference_ami_version=INFERENCE_AMI_VERSION,
+            serializer=JSONSerializer(),
+            async_inference_config=AsyncInferenceConfig(
+                output_path=s3_output,
+                max_concurrent_invocations_per_instance=1,
+            ),
+            wait=True,
+        )
+
+        LOGGER.info(f"Waiting for endpoint {ENDPOINT_INSERVICE} status...")
+        assert wait_for_status(
+            ENDPOINT_INSERVICE,
+            ENDPOINT_WAIT_PERIOD,
+            ENDPOINT_WAIT_LENGTH,
+            get_endpoint_status,
+            sm_client,
+            endpoint_name,
+        )
+        yield predictor, s3_output
+    finally:
+        LOGGER.info(f"Deleting async endpoint: {endpoint_name}")
+        sm_client.delete_endpoint(EndpointName=endpoint_name)
+        sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name)
+
+
+@pytest.mark.parametrize("instance_type", ["ml.g5.xlarge"], indirect=True)
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"], indirect=True)
+def test_vllm_omni_tts_async_endpoint(async_endpoint):
+    """TTS via async inference — no 60s timeout, up to 1 hour."""
+    predictor, s3_output = async_endpoint
+    sm_runtime = predictor.sagemaker_session.sagemaker_runtime_client
+    s3_client = predictor.sagemaker_session.boto_session.client("s3")
+
+    payload = json.dumps(
+        {
+            "input": "Hello, this is a test of async text to speech.",
+            "voice": "vivian",
+            "language": "English",
+        }
+    )
+
+    LOGGER.info("Sending async TTS request")
+    response = sm_runtime.invoke_endpoint_async(
+        EndpointName=predictor.endpoint_name,
+        ContentType="application/json",
+        InputLocation=_upload_payload_to_s3(s3_client, payload, s3_output, predictor.endpoint_name),
+        CustomAttributes="route=/v1/audio/speech",
+    )
+
+    output_location = response["OutputLocation"]
+    LOGGER.info(f"Async output location: {output_location}")
+
+    # Poll for result (up to 5 minutes)
+    bucket, key = _parse_s3_uri(output_location)
+    for i in range(60):
+        try:
+            obj = s3_client.get_object(Bucket=bucket, Key=key)
+            audio_bytes = obj["Body"].read()
+            LOGGER.info(f"Async TTS response: {len(audio_bytes)} bytes (after {i * 5}s)")
+            assert len(audio_bytes) > 1000, f"TTS output too small: {len(audio_bytes)} bytes"
+            LOGGER.info("Async TTS endpoint test PASSED")
+            return
+        except s3_client.exceptions.NoSuchKey:
+            time.sleep(5)
+
+    pytest.fail("Async inference timed out after 300s")
+
+
+def _upload_payload_to_s3(s3_client, payload, s3_output, endpoint_name):
+    """Put payload at <s3_output><endpoint_name>-input.json and return its s3:// URI."""
+    bucket, prefix = _parse_s3_uri(s3_output)
+    dest = "{}{}-input.json".format(prefix, endpoint_name)
+    s3_client.put_object(Bucket=bucket, Key=dest, Body=payload, ContentType="application/json")
+    return "s3://{}/{}".format(bucket, dest)
+
+
+def _parse_s3_uri(uri):
+    """Split s3://bucket/key into (bucket, key); only a leading s3:// is stripped, key may be empty."""
+    bucket, _, key = (uri[len("s3://") :] if uri.startswith("s3://") else uri).partition("/")
+    return bucket, key
diff --git a/test/vllm-omni/scripts/vllm_omni_ec2_smoke_test.sh b/test/vllm-omni/scripts/vllm_omni_ec2_smoke_test.sh
new file mode 100755
index 000000000000..d97fa684f908
--- /dev/null
+++ b/test/vllm-omni/scripts/vllm_omni_ec2_smoke_test.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Smoke test for vLLM-Omni EC2 images
+# Uses the OpenAI-compatible API directly (no /invocations middleware).
+# Request payload and validation are passed as arguments from the model config.
+#
+# Arguments:
+#   $1 route         path appended to http://localhost:8080 for the POST
+#   $2 test_request  JSON body, or '&'-separated form fields when multipart
+#   $3 validate      binary_size_gt:<bytes> or json_field:<path>
+#   $4 content_type  optional; defaults to application/json
+set -eux
+
+ROUTE="${1:?Usage: $0 <route> <test_request> <validate> [content_type]}"
+REQUEST="${2:?Usage: $0 <route> <test_request> <validate> [content_type]}"
+VALIDATE="${3:?Usage: $0 <route> <test_request> <validate> [content_type]}"
+CONTENT_TYPE="${4:-application/json}"
+PORT=8080
+
+echo "=== vLLM-Omni EC2 smoke test ==="
+echo "Route: ${ROUTE}"
+echo "Content-Type: ${CONTENT_TYPE}"
+echo "Validate: ${VALIDATE}"
+
+# Wait for server: up to 300 probes, one second apart. -f makes curl exit
+# non-zero on HTTP error statuses, so an error response is not taken as ready.
+for i in $(seq 1 300); do
+    if curl -sf http://localhost:${PORT}/health >/dev/null 2>&1; then
+        echo "Server ready after ${i}s"
+        break
+    fi
+    sleep 1
+done
+
+# Final probe: abort if the server never became healthy within the wait loop.
+curl -sf http://localhost:${PORT}/health || { echo "Health check failed"; exit 1; }
+
+# Send request directly to the API endpoint
+if [ "${CONTENT_TYPE}" = "multipart/form-data" ]; then
+    # Each '&'-separated pair in REQUEST becomes one -F form field.
+    CURL_CMD=(curl -sf -X POST "http://localhost:${PORT}${ROUTE}")
+    IFS='&' read -ra PAIRS <<< "${REQUEST}"
+    for pair in "${PAIRS[@]}"; do
+        CURL_CMD+=(-F "${pair}")
+    done
+    CURL_CMD+=(--output /tmp/omni_response --max-time 300)
+    "${CURL_CMD[@]}"
+else
+    curl -sf -X POST "http://localhost:${PORT}${ROUTE}" \
+      -H "Content-Type: application/json" \
+      -d "${REQUEST}" \
+      --output /tmp/omni_response --max-time 300
+fi
+
+# Validate response
+if [[ "${VALIDATE}" == binary_size_gt:* ]]; then
+    MIN_SIZE="${VALIDATE#binary_size_gt:}"
+    # GNU stat syntax first, BSD stat syntax as the fallback.
+    FILE_SIZE=$(stat -c%s /tmp/omni_response 2>/dev/null || stat -f%z /tmp/omni_response)
+    echo "Response size: ${FILE_SIZE} bytes (min: ${MIN_SIZE})"
+    [ "${FILE_SIZE}" -gt "${MIN_SIZE}" ] || { echo "FAIL: response too small"; exit 1; }
+
+elif [[ "${VALIDATE}" == json_field:* ]]; then
+    FIELD="${VALIDATE#json_field:}"
+    # The field path is handed to Python as argv[1] instead of being spliced
+    # into the source text, so quotes in the path cannot break or alter the code.
+    python3 -c '
+import json, sys
+field = sys.argv[1]
+with open("/tmp/omni_response") as fh:
+    obj = json.load(fh)
+# "a[0].b" -> ["a", "0", "b"]; all-digit parts are used as list indexes.
+for part in field.replace("]", "").replace("[", ".").split("."):
+    if part.isdigit():
+        obj = obj[int(part)]
+    else:
+        obj = obj[part]
+assert obj, f"Field {field} is empty"
+print(f"Validated: {field} present ({type(obj).__name__})")
+' "${FIELD}"
+
+else
+    # Neither known prefix matched: say so rather than passing silently.
+    echo "WARNING: unrecognized validate spec '${VALIDATE}'; response not validated" >&2
+fi
+
+echo "=== vLLM-Omni EC2 smoke test PASSED ==="
diff --git a/test/vllm-omni/scripts/vllm_omni_sagemaker_smoke_test.sh b/test/vllm-omni/scripts/vllm_omni_sagemaker_smoke_test.sh
new file mode 100755
index 000000000000..c7e63d5f8f91
--- /dev/null
+++ b/test/vllm-omni/scripts/vllm_omni_sagemaker_smoke_test.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+# Smoke test for vLLM-Omni SageMaker images
+# Uses /invocations with the routing middleware (CustomAttributes: route=<path>)
+# Request payload and validation are passed as arguments from the model config.
+#
+# Arguments:
+#   $1 route         sent as X-Amzn-SageMaker-Custom-Attributes: route=<route>
+#   $2 test_request  JSON body, or '&'-separated form fields when multipart
+#   $3 validate      binary_size_gt:<bytes> or json_field:<path>
+#   $4 content_type  optional; defaults to application/json
+set -eux
+
+ROUTE="${1:?Usage: $0 <route> <test_request> <validate> [content_type]}"
+REQUEST="${2:?Usage: $0 <route> <test_request> <validate> [content_type]}"
+VALIDATE="${3:?Usage: $0 <route> <test_request> <validate> [content_type]}"
+CONTENT_TYPE="${4:-application/json}"
+PORT=8080
+
+echo "=== vLLM-Omni SageMaker smoke test ==="
+echo "Route: ${ROUTE}"
+echo "Content-Type: ${CONTENT_TYPE}"
+echo "Validate: ${VALIDATE}"
+
+# Wait for server: up to 300 probes, one second apart. -f makes curl exit
+# non-zero on HTTP error statuses, so an error response is not taken as ready.
+for i in $(seq 1 300); do
+    if curl -sf http://localhost:${PORT}/ping >/dev/null 2>&1; then
+        echo "Server ready after ${i}s"
+        break
+    fi
+    sleep 1
+done
+
+# Final probe: abort if the server never answered /ping within the wait loop.
+curl -sf http://localhost:${PORT}/ping || { echo "Ping failed"; exit 1; }
+
+# Send request via /invocations with route header
+if [ "${CONTENT_TYPE}" = "multipart/form-data" ]; then
+    # Each '&'-separated pair in REQUEST becomes one -F form field.
+    CURL_CMD=(curl -sf -X POST "http://localhost:${PORT}/invocations"
+      -H "X-Amzn-SageMaker-Custom-Attributes: route=${ROUTE}")
+    IFS='&' read -ra PAIRS <<< "${REQUEST}"
+    for pair in "${PAIRS[@]}"; do
+        CURL_CMD+=(-F "${pair}")
+    done
+    CURL_CMD+=(--output /tmp/omni_response --max-time 300)
+    "${CURL_CMD[@]}"
+else
+    curl -sf -X POST http://localhost:${PORT}/invocations \
+      -H "Content-Type: application/json" \
+      -H "X-Amzn-SageMaker-Custom-Attributes: route=${ROUTE}" \
+      -d "${REQUEST}" \
+      --output /tmp/omni_response --max-time 300
+fi
+
+# Validate response
+if [[ "${VALIDATE}" == binary_size_gt:* ]]; then
+    MIN_SIZE="${VALIDATE#binary_size_gt:}"
+    # GNU stat syntax first, BSD stat syntax as the fallback.
+    FILE_SIZE=$(stat -c%s /tmp/omni_response 2>/dev/null || stat -f%z /tmp/omni_response)
+    echo "Response size: ${FILE_SIZE} bytes (min: ${MIN_SIZE})"
+    [ "${FILE_SIZE}" -gt "${MIN_SIZE}" ] || { echo "FAIL: response too small"; exit 1; }
+
+elif [[ "${VALIDATE}" == json_field:* ]]; then
+    FIELD="${VALIDATE#json_field:}"
+    # The field path is handed to Python as argv[1] instead of being spliced
+    # into the source text, so quotes in the path cannot break or alter the code.
+    python3 -c '
+import json, sys
+field = sys.argv[1]
+with open("/tmp/omni_response") as fh:
+    obj = json.load(fh)
+# "a[0].b" -> ["a", "0", "b"]; all-digit parts are used as list indexes.
+for part in field.replace("]", "").replace("[", ".").split("."):
+    if part.isdigit():
+        obj = obj[int(part)]
+    else:
+        obj = obj[part]
+assert obj, f"Field {field} is empty"
+print(f"Validated: {field} present ({type(obj).__name__})")
+' "${FIELD}"
+
+else
+    # Neither known prefix matched: say so rather than passing silently.
+    echo "WARNING: unrecognized validate spec '${VALIDATE}'; response not validated" >&2
+fi
+
+echo "=== vLLM-Omni SageMaker smoke test PASSED ==="