Qualcomm AI Engine Direct - Support backend awareness pass infrastruc… #99

Workflow file for this run

.github/workflows/cuda-windows.yml at 2759ef1

	# Test ExecuTorch CUDA Windows Artifacts
	# This workflow exports models targeting CUDA Windows using optimum-executorch on Linux.
	# Then it runs those exported artifacts on a Windows CI machine.
	#
	# Hand-off between the two stages: the export job uploads one artifact per
	# (model_repo, model_name, quant) matrix entry, and the e2e job downloads the
	# artifact with the same name before running .ci/scripts/test_model_e2e_windows.ps1.

	name: Test CUDA Windows Export and E2E

	# Triggers:
	#   - push to main / release branches, or a ciflow/cuda/* tag
	#   - pull requests that touch this workflow or the CUDA / AOTI backends
	#   - manual dispatch
	on:
	  push:
	    branches:
	      - main
	      - release/*
	    tags:
	      - ciflow/cuda/*
	  pull_request:
	    paths:
	      - .github/workflows/cuda-windows.yml
	      - backends/cuda/**
	      - backends/aoti/**
	  workflow_dispatch:

	# One concurrency group per workflow + (PR number, or commit SHA when there is no
	# PR) + whether the run was manually dispatched + whether it was scheduled.
	# cancel-in-progress is false, so a newer run in the same group does not cancel a
	# run that is already in progress.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
	  cancel-in-progress: false

	# Workflow-wide default token scope: read-only repository contents. Jobs that
	# need more declare their own `permissions:` block.
	permissions:
	  contents: read

	jobs:
	  # Computes the list of changed files consumed by the path filters below.
	  changed-files:
	    name: Get changed files
	    uses: ./.github/workflows/_get-changed-files.yml
	    with:
	      include-push-diff: true

	  # Produces the `is-full-run` output consumed by the path filters below.
	  run-decision:
	    name: CI run decision
	    uses: ./.github/workflows/_ci-run-decision.yml

	  export-model-cuda-windows-artifact:
	    name: export-model-cuda-windows-artifact
	    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available).
	    # Path-filtered on push: mirrors the workflow-level pull_request `paths:`
	    # filter so push commits that don't touch CUDA-relevant paths skip
	    # this job on non-sampled commits. See _ci-run-decision.yml for
	    # the sampling policy.
	    needs: [changed-files, run-decision]
	    if: |
	      (github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request') &&
	      (
	        contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
	        contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
	        contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') ||
	        needs.run-decision.outputs.is-full-run == 'true'
	      )
	    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
	    permissions:
	      id-token: write
	      contents: read
	    secrets: inherit
	    strategy:
	      fail-fast: false
	      # Keep in sync with the matrix of test-model-cuda-windows-e2e: each entry
	      # uploads the artifact that the matching e2e entry downloads by name.
	      matrix:
	        include:
	          - model_repo: "mistralai"
	            model_name: "Voxtral-Mini-3B-2507"
	            quant: "non-quantized"
	          - model_repo: "mistralai"
	            model_name: "Voxtral-Mini-3B-2507"
	            quant: "quantized-int4-weight-only"
	          - model_repo: "nvidia"
	            model_name: "parakeet-tdt"
	            quant: "non-quantized"
	          - model_repo: "nvidia"
	            model_name: "parakeet-tdt"
	            quant: "quantized-int4-weight-only"
	          # TODO: sortformer produces 0 segments on Windows after D97788666.
	          # Temporarily disabled until root cause is debugged.
	          # - model_repo: "nvidia"
	          #   model_name: "diar_streaming_sortformer_4spk-v2"
	          #   quant: "non-quantized"
	          - model_repo: "mistralai"
	            model_name: "Voxtral-Mini-4B-Realtime-2602"
	            quant: "quantized-int4-tile-packed"
	          - model_repo: "facebook"
	            model_name: "dinov2-small-imagenet1k-1-layer"
	            quant: "non-quantized"
	    with:
	      timeout: 90
	      secrets-env: EXECUTORCH_HF_TOKEN
	      runner: linux.g5.4xlarge.nvidia.gpu
	      gpu-arch-type: cuda
	      gpu-arch-version: "13.0"
	      docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
	      submodules: recursive
	      upload-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
	      # PR runs check out the PR head commit; every other event uses the triggering commit.
	      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	      script: |
	        set -eux

	        echo "::group::Fix libstdc++ GLIBCXX version"
	        # The executorch pybindings require GLIBCXX_3.4.30 which conda's libstdc++ doesn't have.
	        # Replace conda's libstdc++ with the system version to fix ImportError.
	        # Verify system version has GLIBCXX_3.4.30
	        strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep GLIBCXX_3.4.30
	        # Backup and replace conda's version
	        mv /opt/conda/lib/libstdc++.so.6 /opt/conda/lib/libstdc++.so.6.bak || true
	        ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/libstdc++.so.6
	        echo "::endgroup::"

	        echo "::group::Verify pre-installed dependencies"
	        x86_64-w64-mingw32-g++ --version
	        nvcc --version
	        echo "WINDOWS_CUDA_HOME=${WINDOWS_CUDA_HOME}"
	        ls -la "${WINDOWS_CUDA_HOME}"
	        echo "::endgroup::"

	        echo "::group::Setup ExecuTorch"
	        # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
	        export USE_MKL=OFF
	        PYTHON_EXECUTABLE=python ./install_executorch.sh
	        echo "::endgroup::"

	        # Setup Huggingface only for models that need it (not dinov2)
	        if [ "${{ matrix.model_name }}" != "dinov2-small-imagenet1k-1-layer" ]; then
	          echo "::group::Setup Huggingface"
	          pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1"
	          # Turn xtrace off while the token is handled: `set -x` would otherwise echo the
	          # CR/LF-stripped token, which may no longer match the value GitHub masks in logs.
	          set +x
	          HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')"
	          hf auth login --token "$HF_AUTH_TOKEN"
	          set -x
	          OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
	          pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
	          echo "::endgroup::"
	        fi

	        # Only the Voxtral realtime model passes a non-empty mode to the export script.
	        VR_MODE=""
	        if [ "${{ matrix.model_name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
	          VR_MODE="vr-offline"
	        fi
	        source .ci/scripts/export_model_artifact.sh cuda-windows "${{ matrix.model_repo }}/${{ matrix.model_name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "${VR_MODE}"

	  test-model-cuda-windows-e2e:
	    name: test-model-cuda-windows-e2e
	    # Same path filter as the export job above. Also explicitly gated
	    # on the export job succeeding — when needs: jobs are skipped
	    # (e.g. fork PR), GitHub still evaluates this if:, so without the
	    # explicit success-check this job would run and then fail trying
	    # to download an artifact that was never produced.
	    needs: [changed-files, export-model-cuda-windows-artifact, run-decision]
	    if: |
	      needs.export-model-cuda-windows-artifact.result == 'success' &&
	      (
	        contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
	        contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
	        contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') ||
	        needs.run-decision.outputs.is-full-run == 'true'
	      )
	    uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
	    strategy:
	      fail-fast: false
	      # Keep in sync with the matrix of export-model-cuda-windows-artifact: the
	      # download-artifact name below is built from the same three fields.
	      matrix:
	        include:
	          - model_repo: "mistralai"
	            model_name: "Voxtral-Mini-3B-2507"
	            quant: "non-quantized"
	          - model_repo: "mistralai"
	            model_name: "Voxtral-Mini-3B-2507"
	            quant: "quantized-int4-weight-only"
	          - model_repo: "nvidia"
	            model_name: "parakeet-tdt"
	            quant: "non-quantized"
	          - model_repo: "nvidia"
	            model_name: "parakeet-tdt"
	            quant: "quantized-int4-weight-only"
	          # TODO: sortformer produces 0 segments on Windows after D97788666.
	          # Temporarily disabled until root cause is debugged.
	          # - model_repo: "nvidia"
	          #   model_name: "diar_streaming_sortformer_4spk-v2"
	          #   quant: "non-quantized"
	          - model_repo: "mistralai"
	            model_name: "Voxtral-Mini-4B-Realtime-2602"
	            quant: "quantized-int4-tile-packed"
	          - model_repo: "facebook"
	            model_name: "dinov2-small-imagenet1k-1-layer"
	            quant: "non-quantized"
	    with:
	      timeout: 240
	      runner: windows.g5.4xlarge.nvidia.gpu
	      gpu-arch-type: cuda
	      gpu-arch-version: "13.0"
	      download-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
	      # PR runs check out the PR head commit; every other event uses the triggering commit.
	      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	      # The `\$` and `\"` escapes below are intentional: they keep `$` and `"` literal
	      # inside the double-quoted string that is handed to `powershell -Command`.
	      script: |
	        git config --global http.sslBackend openssl
	        git submodule update --init --recursive
	        conda init powershell
	        powershell -Command "& {
	          Set-PSDebug -Trace 1
	          \$ErrorActionPreference = 'Stop'
	          \$PSNativeCommandUseErrorActionPreference = \$true

	          \$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0'
	          \$env:CUDA_PATH = \$env:CUDA_HOME
	          \$env:PATH = \"\$env:CUDA_HOME\bin;\$env:PATH\"
	          nvcc --version

	          .ci/scripts/setup-windows.ps1
	          \$artifactDir = \$env:RUNNER_ARTIFACT_DIR
	          if ([string]::IsNullOrWhiteSpace(\$artifactDir)) {
	            throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
	          }

	          .ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '13.0'
	        }"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Qualcomm AI Engine Direct - Support backend awareness pass infrastruc… #99

Workflow file

Qualcomm AI Engine Direct - Support backend awareness pass infrastruc… #99

Uh oh!

Workflow file for this run