Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ case "${IMAGE_NAME}" in
LINTRUNNER=""
GCC_VERSION=11
CUDA_WINDOWS_CROSS_COMPILE=yes
CUDA_VERSION=12.8
CUDA_VERSION=12.6
SKIP_PYTORCH=yes
;;
*)
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/pytorch.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
release/2.11
358117c166b75167a09bca81ac9925940feda339
10 changes: 6 additions & 4 deletions .ci/docker/common/install_cuda_windows_cross_compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ set -ex

INSTALL_DIR="${WINDOWS_CUDA_INSTALL_DIR:-/opt/cuda-windows}"

# Mapping of CUDA versions to their corresponding driver versions for Windows installers
# Mapping of CUDA versions to their corresponding driver versions for Windows installers.
# Source: https://developer.nvidia.com/cuda-toolkit-archive
# Format: "PATCH_VERSION:DRIVER_VERSION". Starting with CUDA 13.0, NVIDIA dropped the
# driver suffix from the Windows installer filename, so the driver field is empty.
declare -A CUDA_DRIVER_MAP=(
["12.6"]="12.6.3:561.17"
["12.8"]="12.8.1:572.61"
["12.9"]="12.9.1:576.57"
["13.0"]="13.0.3:"
)

install_mingw() {
Expand Down Expand Up @@ -83,7 +84,8 @@ install_windows_cuda() {
mkdir -p "${INSTALL_DIR}"
cd "${INSTALL_DIR}"

CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
# CUDA 13.0+ installers no longer include the driver version in the filename.
CUDA_INSTALLER="cuda_${CUDA_VERSION}${CUDA_DRIVER_VERSION:+_${CUDA_DRIVER_VERSION}}_windows.exe"
CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_INSTALLER}"

# Check if already downloaded and extracted
Expand Down
9 changes: 7 additions & 2 deletions .ci/docker/common/install_pytorch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,19 @@ install_pytorch_and_domains() {
chown -R ci-user .

export _GLIBCXX_USE_CXX11_ABI=1
# PyTorch's FindARM.cmake hard-fails when the SVE+BF16 compile probe
# doesn't pass — gcc-11 in this image is too old to accept the combined
# NEON/SVE/bfloat16 intrinsics the probe exercises. ExecuTorch's aarch64
# runtime targets (phones, embedded) don't use SVE, so bypass the check.
export BUILD_IGNORE_SVE_UNAVAILABLE=1
# Then build and install PyTorch
conda_run python setup.py bdist_wheel
pip_install "$(echo dist/*.whl)"

# Grab the pinned audio and vision commits from PyTorch
TORCHAUDIO_VERSION=release/2.11
TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
export TORCHAUDIO_VERSION
TORCHVISION_VERSION=release/0.26
TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
export TORCHVISION_VERSION
Comment on lines 39 to 43
Comment on lines 39 to 43

install_domains
Expand Down
2 changes: 1 addition & 1 deletion .ci/scripts/test_model_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ if [ "$AUDIO_URL" != "" ]; then
elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then
conda install -y -c conda-forge "ffmpeg<8"
pip install datasets soundfile
pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu
pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
fi

Expand Down
22 changes: 11 additions & 11 deletions .ci/scripts/test_wheel_package_qnn.sh
Original file line number Diff line number Diff line change
Expand Up @@ -158,17 +158,17 @@ print(module_vars["TORCH_VERSION"])
PY
)

# NIGHTLY_VERSION=$(
# "$PYBIN" - <<'PY'
# import runpy
# module_vars = runpy.run_path("torch_pin.py")
# print(module_vars["NIGHTLY_VERSION"])
# PY
# )
echo "=== [$LABEL] Install torch==${TORCH_VERSION} ==="

# Install torch based on the pinned PyTorch version, preferring the PyTorch test index
"$PIPBIN" install torch=="${TORCH_VERSION}" --extra-index-url "https://download.pytorch.org/whl/test"
NIGHTLY_VERSION=$(
"$PYBIN" - <<'PY'
import runpy
module_vars = runpy.run_path("torch_pin.py")
print(module_vars["NIGHTLY_VERSION"])
PY
)
echo "=== [$LABEL] Install torch==${TORCH_VERSION}.${NIGHTLY_VERSION} ==="

# Install torch based on the pinned PyTorch nightly version
Comment thread
mergennachin marked this conversation as resolved.
Comment thread
mergennachin marked this conversation as resolved.
Comment thread
mergennachin marked this conversation as resolved.
"$PIPBIN" install torch=="${TORCH_VERSION}.${NIGHTLY_VERSION}" --index-url "https://download.pytorch.org/whl/nightly/cpu"
Comment thread
mergennachin marked this conversation as resolved.
Comment thread
mergennachin marked this conversation as resolved.
Comment thread
mergennachin marked this conversation as resolved.
"$PIPBIN" install wheel

# Install torchao based on the pinned commit from third-party/ao submodule
Expand Down
6 changes: 3 additions & 3 deletions .ci/scripts/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ dedupe_macos_loader_path_rpaths() {
pushd ..
torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib
popd

Comment thread
mergennachin marked this conversation as resolved.
if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then
return
fi
Expand Down Expand Up @@ -141,9 +141,9 @@ install_pytorch_and_domains() {

dedupe_macos_loader_path_rpaths
# Grab the pinned audio and vision commits from PyTorch
TORCHAUDIO_VERSION=release/2.11
TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
export TORCHAUDIO_VERSION
TORCHVISION_VERSION=release/0.26
TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
export TORCHVISION_VERSION
Comment on lines 143 to 147
Comment on lines 143 to 147

install_domains
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/cuda-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ jobs:
secrets-env: EXECUTORCH_HF_TOKEN
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.8
gpu-arch-version: 12.6
docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
submodules: recursive
upload-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
Expand Down Expand Up @@ -146,7 +146,7 @@ jobs:
timeout: 240
runner: windows.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.8
gpu-arch-version: 12.6
download-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
Expand All @@ -158,7 +158,7 @@ jobs:
\$ErrorActionPreference = 'Stop'
\$PSNativeCommandUseErrorActionPreference = \$true

\$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8'
\$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6'
\$env:CUDA_PATH = \$env:CUDA_HOME
\$env:PATH = \"\$env:CUDA_HOME\bin;\$env:PATH\"
nvcc --version
Expand All @@ -169,5 +169,5 @@ jobs:
throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
}

.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.6'
}"
6 changes: 3 additions & 3 deletions .github/workflows/cuda.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Test ExecuTorch CUDA Build Compatibility
# This workflow tests whether ExecuTorch can be successfully built with CUDA support
# across different CUDA versions (12.6, 12.8, 12.9, 13.0) using the command:
# across different CUDA versions (12.6, 13.0) using the command:
# ./install_executorch.sh
#
# Note: ExecuTorch automatically detects the system CUDA version using nvcc and
Expand Down Expand Up @@ -31,7 +31,7 @@ jobs:
strategy:
fail-fast: false
matrix:
cuda-version: ["12.6", "12.8", "12.9", "13.0"]
cuda-version: ["12.6", "13.0"]

name: test-executorch-cuda-build-${{ matrix.cuda-version }}
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
Expand Down Expand Up @@ -66,7 +66,7 @@ jobs:
echo "CUDA build results: ${{ needs.test-cuda-builds.result }}"
exit 1
else
echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9, 13.0) completed successfully!"
echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 13.0) completed successfully!"
fi

test-models-cuda:
Expand Down
7 changes: 5 additions & 2 deletions .github/workflows/docker-builds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,20 @@ jobs:
matrix:
runner: [linux.4xlarge]
docker-image-name: [
executorch-ubuntu-22.04-gcc11,
executorch-ubuntu-22.04-gcc9-nopytorch,
executorch-ubuntu-22.04-clang12,
executorch-ubuntu-22.04-linter,
executorch-ubuntu-22.04-arm-sdk,
executorch-ubuntu-22.04-zephyr-sdk,
executorch-ubuntu-22.04-qnn-sdk,
executorch-ubuntu-22.04-mediatek-sdk,
executorch-ubuntu-22.04-clang12-android
]
include:
# PyTorch is built from source in these images; 4xlarge OOMs mid-build.
- docker-image-name: executorch-ubuntu-22.04-gcc11
runner: linux.12xlarge
- docker-image-name: executorch-ubuntu-22.04-zephyr-sdk
runner: linux.12xlarge
- docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64
runner: linux.arm64.2xlarge
- docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64-android
Expand Down
2 changes: 1 addition & 1 deletion examples/models/moshi/mimi/install_requirements.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
set -x

sudo apt install ffmpeg -y
pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu
pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip install moshi==0.2.11
pip install bitsandbytes soundfile einops
# Run llama2/install requirements for torchao deps
Expand Down
26 changes: 20 additions & 6 deletions install_requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@

from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible

from torch_pin import NIGHTLY_VERSION, TORCH_VERSION

# The pip repository that hosts nightly torch packages.
# This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled.
TORCH_URL_BASE = "https://download.pytorch.org/whl/test"
TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly"

# Since ExecuTorch often uses main-branch features of pytorch, only the nightly
# pip versions will have the required features.
Expand Down Expand Up @@ -42,14 +44,18 @@ def install_requirements(use_pytorch_nightly):
sys.exit(1)

# Determine the appropriate PyTorch URL based on CUDA delegate status
torch_url = determine_torch_url(TORCH_URL_BASE)
torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)

# pip packages needed by exir.
TORCH_PACKAGE = [
# Set use_pytorch_nightly to false to test against the pinned PyTorch commit.
# In that case no version number is needed here because the packages have
# already been installed on CI before this step, so pip won't reinstall them.
("torch==2.11.0" if use_pytorch_nightly else "torch"),
(
f"torch=={TORCH_VERSION}.{NIGHTLY_VERSION}"
if use_pytorch_nightly
else "torch"
),
]

# Install the requirements for core ExecuTorch package.
Expand Down Expand Up @@ -108,12 +114,20 @@ def install_requirements(use_pytorch_nightly):

def install_optional_example_requirements(use_pytorch_nightly):
# Determine the appropriate PyTorch URL based on CUDA delegate status
torch_url = determine_torch_url(TORCH_URL_BASE)
torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)

print("Installing torch domain libraries")
DOMAIN_LIBRARIES = [
("torchvision==0.26.0" if use_pytorch_nightly else "torchvision"),
("torchaudio==2.11.0" if use_pytorch_nightly else "torchaudio"),
(
f"torchvision==0.27.0.{NIGHTLY_VERSION}"
if use_pytorch_nightly
else "torchvision"
),
(
f"torchaudio==2.11.0.{NIGHTLY_VERSION}"
Comment thread
mergennachin marked this conversation as resolved.
Comment thread
mergennachin marked this conversation as resolved.
if use_pytorch_nightly
else "torchaudio"
),
]
# Then install domain libraries
subprocess.run(
Expand Down
79 changes: 63 additions & 16 deletions runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -325,41 +325,88 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
#define C10_HIP_HOST_DEVICE
#endif

#if defined(USE_ROCM)
// C10_WARP_SIZE is only allowed for device code.
// Host code _must_ use at::cuda::warp_size()
// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
// Host or device statically-sized arrays _must_ use either
// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
//
// HIP header used to define warpSize as a constexpr that was either 32 or 64
// depending on the target device, and then always set it to 64 for host code.
// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
// set it to something unreasonable to trigger obvious host code errors.

// For a time, that allowed C10_WARP_SIZE to be defined like so:
//
// #ifdef USE_ROCM
// #define C10_WARP_SIZE warpSize
// #else
// #define C10_WARP_SIZE 32
// #endif
//
// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
// We can now only use warpSize for C10_WARP_SIZE in device code and this is
// enforced by using __device__ in its definition. In host code where
// C10_WARP_SIZE was previously used as a compile-time constant, this will now
// cause a compile-time error.
//
// If an array was previously expected to be sized at compile-time using
// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
//
// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
// must now use at::cuda::warp_size() for the dynamic runtime query.
//
// Unfortunately, C10_WARP_SIZE has been public and available for both host and
// device since approximately 2019, so forcing it to be device-only would break
// existing code in the wild.
#if defined(USE_ROCM)
namespace at::cuda {
TORCH_CUDA_CPP_API int warp_size();
}
#ifdef __HIPCC__
static inline int __host__ C10_WARP_SIZE_INTERNAL() {
#if defined(__HIPCC__)
static __host__ inline int C10_WARP_SIZE_INTERNAL() {
return at::cuda::warp_size();
}

static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
// NOTE: __device__ C10_WARP_SIZE_INTERNAL
// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
// we can use constexpr. This matches prior behavior. We preserve this for
// backward compatibility instead of forcing old code to use dynamic warpSize
// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
// could expose where C10_WARP_SIZE was used incorrectly where the dynamic
// warpSize is not allowed.
#if defined(__SPIRV__)
static __device__ inline int C10_WARP_SIZE_INTERNAL() {
return warpSize;
}
#else // __SPIRV__
static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {
#if defined(__GFX9__)
return 64;
#else // __GFX9__
return 32;
#endif // __GFX9__
}
#else // __HIPCC__
#endif // __SPIRV__
#if defined(__SPIRV__)
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 64
#elif defined(__GFX9__)
#define C10_WARP_SIZE_LOWER_BOUND 64
#define C10_WARP_SIZE_UPPER_BOUND 64
#else
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 32
#endif
#else // !__HIPCC__
static inline int C10_WARP_SIZE_INTERNAL() {
return at::cuda::warp_size();
}
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 64
#endif // __HIPCC__

#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
#define C10_WARP_SIZE_STATIC 64

#else // defined(USE_ROCM)
#else // !USE_ROCM
#define C10_WARP_SIZE 32
#endif
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 32
#endif // USE_ROCM

#if defined(_MSC_VER) && _MSC_VER <= 1900
#define __func__ __FUNCTION__
Expand Down Expand Up @@ -629,7 +676,7 @@ __host__ __device__
// This macro is used to find older C++ compilers
// that don't support move optimization for return values.

#if (defined(__GNUC__) && __GNUC__ < 13) || \
#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \
(defined(__clang_major__) && __clang_major__ < 13)
#define C10_RETURN_MOVE_IF_OLD_COMPILER 1
#else
Expand Down
Loading
Loading