Qualcomm AI Engine Direct - Support backend awareness pass infrastruc… #99
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ExecuTorch CUDA Windows artifact tests.
#
# Models targeting CUDA Windows are exported on Linux with optimum-executorch;
# the exported artifacts are then executed on a Windows CI machine.
name: Test CUDA Windows Export and E2E

on:
  # Pushes to main / release branches and to ciflow/cuda/* tags.
  push:
    branches:
      - main
      - "release/*"
    tags:
      - "ciflow/cuda/*"
  # Pull requests are path-filtered to this workflow file and the CUDA / AOTI backends.
  pull_request:
    paths:
      - ".github/workflows/cuda-windows.yml"
      - "backends/cuda/**"
      - "backends/aoti/**"
  # Manual runs from the Actions UI / API.
  workflow_dispatch:

# One concurrency group per workflow + PR number (or commit SHA when there is
# no PR), further split by whether the run was dispatched manually.
# NOTE(review): the trailing `schedule` term always evaluates to false with the
# triggers declared above (no `schedule:` trigger is configured).
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  # Runs that are already in progress are not cancelled by newer ones.
  cancel-in-progress: false

# Default token scope for every job; jobs that need more declare it themselves.
permissions:
  contents: read
jobs:
  # Calls the reusable changed-files workflow. Its `changed-files` output is
  # what the path checks in the `if:` conditions of the jobs below inspect.
  changed-files:
    name: Get changed files
    uses: ./.github/workflows/_get-changed-files.yml
    with:
      include-push-diff: true

  # Calls the reusable run-decision workflow. Its `is-full-run` output lets the
  # jobs below run even when no CUDA-relevant path changed.
  run-decision:
    name: CI run decision
    uses: ./.github/workflows/_ci-run-decision.yml

  # Exports each matrix model for the cuda-windows target on a Linux GPU runner
  # and uploads the result as an artifact named
  # <model_repo>-<model_name>-cuda-windows-<quant>.
  export-model-cuda-windows-artifact:
    name: export-model-cuda-windows-artifact
    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available).
    # Path-filtered on push: mirrors the workflow-level pull_request `paths:`
    # filter so push commits that don't touch CUDA-relevant paths skip
    # this job on non-sampled commits. See _ci-run-decision.yml for
    # the sampling policy.
    needs: [changed-files, run-decision]
    if: |
      (github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request') &&
      (
        contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
        contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
        contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') ||
        needs.run-decision.outputs.is-full-run == 'true'
      )
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    secrets: inherit
    strategy:
      fail-fast: false
      # Keep this matrix identical to the one in test-model-cuda-windows-e2e:
      # the upload/download artifact names are built from the same three fields.
      matrix:
        include:
          - model_repo: "mistralai"
            model_name: "Voxtral-Mini-3B-2507"
            quant: "non-quantized"
          - model_repo: "mistralai"
            model_name: "Voxtral-Mini-3B-2507"
            quant: "quantized-int4-weight-only"
          - model_repo: "nvidia"
            model_name: "parakeet-tdt"
            quant: "non-quantized"
          - model_repo: "nvidia"
            model_name: "parakeet-tdt"
            quant: "quantized-int4-weight-only"
          # TODO: sortformer produces 0 segments on Windows after D97788666.
          # Temporarily disabled until root cause is debugged.
          # - model_repo: "nvidia"
          #   model_name: "diar_streaming_sortformer_4spk-v2"
          #   quant: "non-quantized"
          - model_repo: "mistralai"
            model_name: "Voxtral-Mini-4B-Realtime-2602"
            quant: "quantized-int4-tile-packed"
          - model_repo: "facebook"
            model_name: "dinov2-small-imagenet1k-1-layer"
            quant: "non-quantized"
    with:
      timeout: 90
      secrets-env: EXECUTORCH_HF_TOKEN
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "13.0"
      docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
      submodules: recursive
      upload-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
      # On pull requests build the PR head commit; otherwise the triggering commit.
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Fix libstdc++ GLIBCXX version"
        # The executorch pybindings require GLIBCXX_3.4.30 which conda's libstdc++ doesn't have.
        # Replace conda's libstdc++ with the system version to fix ImportError.
        # Verify system version has GLIBCXX_3.4.30
        strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep GLIBCXX_3.4.30
        # Backup and replace conda's version
        mv /opt/conda/lib/libstdc++.so.6 /opt/conda/lib/libstdc++.so.6.bak || true
        ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/libstdc++.so.6
        echo "::endgroup::"

        echo "::group::Verify pre-installed dependencies"
        x86_64-w64-mingw32-g++ --version
        nvcc --version
        echo "WINDOWS_CUDA_HOME=${WINDOWS_CUDA_HOME}"
        ls -la "${WINDOWS_CUDA_HOME}"
        echo "::endgroup::"

        echo "::group::Setup ExecuTorch"
        # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
        export USE_MKL=OFF
        PYTHON_EXECUTABLE=python ./install_executorch.sh
        echo "::endgroup::"

        # Setup Huggingface only for models that need it (not dinov2)
        if [ "${{ matrix.model_name }}" != "dinov2-small-imagenet1k-1-layer" ]; then
          echo "::group::Setup Huggingface"
          pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1"
          # Turn xtrace off while the token is handled: with `set -x` bash would
          # echo the expanded assignment and the login command (including the
          # token) into the job log. The token is stripped of CR/LF first, so the
          # traced value may not match the registered secret byte-for-byte and
          # log masking cannot be relied on.
          set +x
          HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')"
          hf auth login --token "$HF_AUTH_TOKEN"
          set -x
          OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
          pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
          echo "::endgroup::"
        fi

        # Only the Voxtral realtime model passes a non-empty mode argument to the export script.
        VR_MODE=""
        if [ "${{ matrix.model_name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
          VR_MODE="vr-offline"
        fi

        source .ci/scripts/export_model_artifact.sh cuda-windows "${{ matrix.model_repo }}/${{ matrix.model_name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "${VR_MODE}"

  # Downloads the artifact produced by the export job for the same matrix entry
  # and runs the end-to-end model test on a Windows GPU runner.
  test-model-cuda-windows-e2e:
    name: test-model-cuda-windows-e2e
    # Same path filter as the export job above. Also explicitly gated
    # on the export job succeeding — when needs: jobs are *skipped*
    # (e.g. fork PR), GitHub still evaluates this if:, so without the
    # explicit success-check this job would run and then fail trying
    # to download an artifact that was never produced.
    needs: [changed-files, export-model-cuda-windows-artifact, run-decision]
    if: |
      needs.export-model-cuda-windows-artifact.result == 'success' &&
      (
        contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
        contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
        contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') ||
        needs.run-decision.outputs.is-full-run == 'true'
      )
    uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
    strategy:
      fail-fast: false
      # Keep this matrix identical to the one in export-model-cuda-windows-artifact:
      # the download-artifact name must match an uploaded artifact name.
      matrix:
        include:
          - model_repo: "mistralai"
            model_name: "Voxtral-Mini-3B-2507"
            quant: "non-quantized"
          - model_repo: "mistralai"
            model_name: "Voxtral-Mini-3B-2507"
            quant: "quantized-int4-weight-only"
          - model_repo: "nvidia"
            model_name: "parakeet-tdt"
            quant: "non-quantized"
          - model_repo: "nvidia"
            model_name: "parakeet-tdt"
            quant: "quantized-int4-weight-only"
          # TODO: sortformer produces 0 segments on Windows after D97788666.
          # Temporarily disabled until root cause is debugged.
          # - model_repo: "nvidia"
          #   model_name: "diar_streaming_sortformer_4spk-v2"
          #   quant: "non-quantized"
          - model_repo: "mistralai"
            model_name: "Voxtral-Mini-4B-Realtime-2602"
            quant: "quantized-int4-tile-packed"
          - model_repo: "facebook"
            model_name: "dinov2-small-imagenet1k-1-layer"
            quant: "non-quantized"
    with:
      timeout: 240
      runner: windows.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "13.0"
      download-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
      # On pull requests test the PR head commit; otherwise the triggering commit.
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        git config --global http.sslBackend openssl
        git submodule update --init --recursive

        conda init powershell

        # The PowerShell program is passed as one double-quoted shell string, so
        # every PowerShell `$` and inner `"` is backslash-escaped to stop the
        # outer shell from expanding it.
        powershell -Command "& {
          Set-PSDebug -Trace 1
          \$ErrorActionPreference = 'Stop'
          \$PSNativeCommandUseErrorActionPreference = \$true

          \$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0'
          \$env:CUDA_PATH = \$env:CUDA_HOME
          \$env:PATH = \"\$env:CUDA_HOME\bin;\$env:PATH\"
          nvcc --version

          .ci/scripts/setup-windows.ps1

          \$artifactDir = \$env:RUNNER_ARTIFACT_DIR
          if ([string]::IsNullOrWhiteSpace(\$artifactDir)) {
            throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
          }

          .ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '13.0'
        }"