Skip to content

Commit 232574c

Browse files
authored
Merge branch 'main' into gemma4-mlx-install-path
2 parents e527890 + acffcb0 commit 232574c

399 files changed

Lines changed: 34544 additions & 5491 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.ci/docker/build.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,18 @@ esac
9292
TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt)
9393
BUILD_DOCS=1
9494

95+
# Pull channel + spec/url helpers out of torch_pin.py so install_pytorch.sh
96+
# (which runs inside the docker build, where torch_pin.py isn't available)
97+
# can decide between wheel install (test/release) and source build (nightly).
98+
# Self-hosted runners often have python3 but not the unversioned python alias.
99+
PYTHON_BIN=$(command -v python3 || command -v python)
100+
TORCH_PIN_HELPERS=$(cd ../.. && "$PYTHON_BIN" -c "from torch_pin import CHANNEL, torch_spec, torchaudio_spec, torchvision_spec, torch_index_url_base; print(CHANNEL); print(torch_spec()); print(torchaudio_spec()); print(torchvision_spec()); print(torch_index_url_base())")
101+
TORCH_CHANNEL=$(echo "${TORCH_PIN_HELPERS}" | sed -n '1p')
102+
TORCH_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '2p')
103+
TORCHAUDIO_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '3p')
104+
TORCHVISION_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '4p')
105+
TORCH_INDEX_URL=$(echo "${TORCH_PIN_HELPERS}" | sed -n '5p')
106+
95107
# Copy requirements-lintrunner.txt from root to here
96108
cp ../../requirements-lintrunner.txt ./
97109

@@ -104,6 +116,11 @@ docker build \
104116
--build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
105117
--build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
106118
--build-arg "TORCH_VERSION=${TORCH_VERSION}" \
119+
--build-arg "TORCH_CHANNEL=${TORCH_CHANNEL}" \
120+
--build-arg "TORCH_SPEC=${TORCH_SPEC}" \
121+
--build-arg "TORCHAUDIO_SPEC=${TORCHAUDIO_SPEC}" \
122+
--build-arg "TORCHVISION_SPEC=${TORCHVISION_SPEC}" \
123+
--build-arg "TORCH_INDEX_URL=${TORCH_INDEX_URL}" \
107124
--build-arg "BUCK2_VERSION=${BUCK2_VERSION}" \
108125
--build-arg "LINTRUNNER=${LINTRUNNER:-}" \
109126
--build-arg "BUILD_DOCS=${BUILD_DOCS}" \
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
release/2.11
1+
release/2.11

.ci/docker/common/install_cuda.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,15 @@ apt-get update
3838
# - libcublas-dev: cuBLAS development files
3939
# - libcusparse-dev: cuSPARSE development files
4040
# - libcufft-dev: cuFFT development files
41+
# - libcurand-dev: cuRAND development files
4142
apt-get install -y --no-install-recommends \
4243
"cuda-nvcc-${CUDA_VERSION_DASH}" \
4344
"cuda-cudart-dev-${CUDA_VERSION_DASH}" \
4445
"cuda-nvrtc-dev-${CUDA_VERSION_DASH}" \
4546
"libcublas-dev-${CUDA_VERSION_DASH}" \
4647
"libcusparse-dev-${CUDA_VERSION_DASH}" \
47-
"libcufft-dev-${CUDA_VERSION_DASH}"
48+
"libcufft-dev-${CUDA_VERSION_DASH}" \
49+
"libcurand-dev-${CUDA_VERSION_DASH}"
4850

4951
# Clean up
5052
apt-get clean

.ci/docker/common/install_pytorch.sh

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,24 @@ install_domains() {
1717
}
1818

1919
install_pytorch_and_domains() {
20+
if [ "${TORCH_CHANNEL}" != "nightly" ]; then
21+
# Test/release: install the published wheels directly. The specs and URL
22+
# are passed in as docker build args (computed from torch_pin.py by
23+
# .ci/docker/build.sh). RC wheels at /whl/test/ get re-uploaded under the
24+
# same version, so use --no-cache-dir there to avoid stale cache hits.
25+
local cache_flag=""
26+
if [ "${TORCH_CHANNEL}" = "test" ]; then
27+
cache_flag="--no-cache-dir"
28+
fi
29+
pip_install --force-reinstall ${cache_flag} \
30+
"${TORCH_SPEC}" "${TORCHVISION_SPEC}" "${TORCHAUDIO_SPEC}" \
31+
--index-url "${TORCH_INDEX_URL}/cpu"
32+
return
33+
fi
34+
35+
# Nightly: build pytorch from source against the pinned SHA in pytorch.txt
36+
# so we catch upstream regressions, then install audio/vision from the
37+
# commits that pytorch itself pins.
2038
git clone https://github.com/pytorch/pytorch.git
2139

2240
# Fetch the target commit
@@ -27,14 +45,19 @@ install_pytorch_and_domains() {
2745
chown -R ci-user .
2846

2947
export _GLIBCXX_USE_CXX11_ABI=1
48+
# PyTorch's FindARM.cmake hard-fails when the SVE+BF16 compile probe
49+
# doesn't pass — gcc-11 in this image is too old to accept the combined
50+
# NEON/SVE/bfloat16 intrinsics the probe exercises. Executorch's aarch64
51+
# runtime targets (phones, embedded) don't use SVE, so bypass the check.
52+
export BUILD_IGNORE_SVE_UNAVAILABLE=1
3053
# Then build and install PyTorch
3154
conda_run python setup.py bdist_wheel
3255
pip_install "$(echo dist/*.whl)"
3356

34-
# Grab the pinned audio and vision commits from PyTorch
35-
TORCHAUDIO_VERSION=release/2.11
57+
# Defer to PyTorch's own pinned audio/vision commits.
58+
TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
3659
export TORCHAUDIO_VERSION
37-
TORCHVISION_VERSION=release/0.26
60+
TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
3861
export TORCHVISION_VERSION
3962

4063
install_domains

.ci/docker/ubuntu/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ ENV SCCACHE_S3_KEY_PREFIX executorch
6464
ENV SCCACHE_REGION us-east-1
6565

6666
ARG TORCH_VERSION
67+
ARG TORCH_CHANNEL
68+
ARG TORCH_SPEC
69+
ARG TORCHAUDIO_SPEC
70+
ARG TORCHVISION_SPEC
71+
ARG TORCH_INDEX_URL
6772
ARG SKIP_PYTORCH
6873
COPY ./common/install_pytorch.sh install_pytorch.sh
6974
COPY ./common/utils.sh utils.sh

.ci/scripts/export_model_artifact.sh

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,12 +415,40 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
415415

416416
# Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
417417
echo "::group::Export"
418+
EXPORT_LOG=$(mktemp)
418419
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
419420
python -m executorch.examples.models.qwen3_5_moe.export \
420421
--prequantized "$LOCAL_MODEL_DIR" \
421-
--output-dir "${OUTPUT_DIR}"
422+
--output-dir "${OUTPUT_DIR}" \
423+
--dense-prefill dequant \
424+
--moe-activation-dtype int8 2>&1 | tee "$EXPORT_LOG"
425+
EXPORT_RC=${PIPESTATUS[0]}
422426
echo "::endgroup::"
423427

428+
if [ "$EXPORT_RC" -ne 0 ]; then
429+
echo "ERROR: Qwen3.5 MoE export failed (exit $EXPORT_RC)"
430+
rm -f "$EXPORT_LOG"
431+
exit "$EXPORT_RC"
432+
fi
433+
434+
# Gate peak GPU memory so we keep the export viable on consumer GPUs
435+
# (e.g. RTX 4090 with 24 GB). The export script prints a machine-
436+
# parseable marker line "EXPORT_GPU_PEAK_MEMORY_MB: <float>".
437+
EXPORT_GPU_PEAK_MB_LIMIT="${EXPORT_GPU_PEAK_MB_LIMIT:-20480}"
438+
PEAK_LINE=$(grep -E '^EXPORT_GPU_PEAK_MEMORY_MB:' "$EXPORT_LOG" | tail -1)
439+
rm -f "$EXPORT_LOG"
440+
if [ -z "$PEAK_LINE" ]; then
441+
echo "ERROR: export did not emit EXPORT_GPU_PEAK_MEMORY_MB marker; cannot enforce GPU memory budget"
442+
exit 1
443+
fi
444+
PEAK_MB=$(echo "$PEAK_LINE" | awk '{print $2}')
445+
echo "Export GPU peak memory: ${PEAK_MB} MB (limit ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
446+
if awk -v p="$PEAK_MB" -v l="$EXPORT_GPU_PEAK_MB_LIMIT" 'BEGIN{exit !(p>l)}'; then
447+
echo "ERROR: export exceeded GPU memory budget (${PEAK_MB} MB > ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
448+
echo " — this would prevent the model from being exported on a 24 GB consumer GPU."
449+
exit 1
450+
fi
451+
424452
test -f "${OUTPUT_DIR}/model.pte"
425453
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
426454
ls -al "${OUTPUT_DIR}"
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
from __future__ import annotations
9+
10+
"""Validate that backend Python modules can be imported.
11+
12+
The workflow passes backend-specific paths and package prefixes so the same
13+
checker can be reused for different backends.
14+
"""
15+
16+
import argparse
17+
import importlib
18+
import sys
19+
from pathlib import Path
20+
21+
22+
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the backend import checker.

    Returns:
        argparse.Namespace with `name`, `package_root`, `package_prefix`,
        and `skip_segment` (a list of path segments to skip).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--name",
        required=True,
        help="Display name for log messages, for example `QNN`.",
    )
    parser.add_argument(
        "--package-root",
        required=True,
        help="Path to the backend package root, relative to ExecuTorch root.",
    )
    parser.add_argument(
        "--package-prefix",
        required=True,
        help="Python package prefix, for example `executorch.backends.qualcomm`.",
    )
    parser.add_argument(
        "--skip-segment",
        action="append",
        # NOTE(review): with action="append" and a non-empty default, any
        # --skip-segment given on the command line is APPENDED to this list
        # rather than replacing it, so callers can never opt back in to
        # `fb`/`test`/`tests`. Confirm extend-only semantics are intended.
        default=["fb", "test", "tests"],
        help="Package path segment to skip while walking modules.",
    )
    return parser.parse_args()
46+
47+
48+
def resolve_executorch_root() -> Path:
    """Walk upward from this file to the ExecuTorch repository root.

    The root is recognized as the first ancestor directory that contains
    both a `backends/` and an `examples/` directory.

    Raises:
        RuntimeError: if no ancestor looks like the repository root.
    """
    here = Path(__file__).resolve()
    for candidate in here.parents:
        looks_like_root = (candidate / "backends").is_dir() and (
            candidate / "examples"
        ).is_dir()
        if looks_like_root:
            return candidate
    raise RuntimeError(f"Could not locate ExecuTorch root from {here}")
55+
56+
57+
def resolve_directory(executorch_root: Path, relative_path: str) -> Path:
    """Join `relative_path` onto the repo root, insisting it is a directory.

    Raises:
        RuntimeError: if the joined path does not exist or is not a directory.
    """
    candidate = executorch_root / relative_path
    if candidate.is_dir():
        return candidate
    raise RuntimeError(
        f"Directory `{relative_path}` was not found under {executorch_root}"
    )
64+
65+
66+
def normalize_package_prefix(package_prefix: str) -> str:
    """Drop a single trailing dot so the prefix composes with module paths."""
    if package_prefix.endswith("."):
        return package_prefix[:-1]
    return package_prefix
68+
69+
70+
def should_skip_path(path: Path, skip_segments: list[str]) -> bool:
    """Decide whether `path` should be excluded from module discovery.

    A path is skipped when any of its segments matches a skip entry, or
    when its filename stem equals an entry or starts with `<entry>_`
    (e.g. `test_foo.py` is skipped for entry `test`).
    """
    stem = path.stem
    for segment in skip_segments:
        if segment in path.parts:
            return True
        if stem == segment or stem.startswith(segment + "_"):
            return True
    return False
78+
79+
80+
def discover_modules(
    package_root: Path,
    package_prefix: str,
    skip_segments: list[str],
) -> list[str]:
    """Map every non-skipped `*.py` file under `package_root` to the dotted
    module name it imports as, rooted at `package_prefix`.

    `__init__.py` files resolve to their package name (the bare prefix at
    the package root itself); every other file resolves to
    `<prefix>.<dotted relative path>`. Results follow sorted path order.
    """
    discovered = []
    for candidate in sorted(package_root.rglob("*.py")):
        rel = candidate.relative_to(package_root)
        if should_skip_path(rel, skip_segments):
            continue

        if rel.name == "__init__.py":
            # A package's __init__ imports as the package itself.
            dotted = ".".join(rel.parent.parts)
            discovered.append(
                f"{package_prefix}.{dotted}" if dotted else package_prefix
            )
        else:
            dotted = ".".join(rel.with_suffix("").parts)
            discovered.append(f"{package_prefix}.{dotted}")
    return discovered
103+
104+
105+
def main() -> None:
    """Entry point: import every discovered backend module and report.

    Exits non-zero when no modules are found or when any module fails to
    import; otherwise prints a success summary.
    """
    args = parse_args()
    root = resolve_executorch_root()
    pkg_root = resolve_directory(root, args.package_root)
    prefix = normalize_package_prefix(args.package_prefix)

    modules = discover_modules(pkg_root, prefix, args.skip_segment)
    count = len(modules)
    # An empty module list almost certainly means a misconfigured
    # --package-root, so treat it as a failure rather than a silent pass.
    if not modules:
        print(f"No {args.name} Python modules found under {pkg_root}")
        sys.exit(1)

    failures: list[tuple[str, str, str]] = []
    for position, module_name in enumerate(modules, 1):
        print(f"[{position}/{count}] importing {module_name}", flush=True)
        try:
            importlib.import_module(module_name)
        except Exception as exc:
            failures.append((module_name, type(exc).__name__, str(exc)))

    if failures:
        print(f"{len(failures)}/{count} {args.name} import failure(s):")
        for module_name, exc_type, message in failures:
            print(f" FAIL: {module_name} -- {exc_type}: {message}")
        sys.exit(1)

    print(f"All {count} {args.name} modules imported successfully")
132+
133+
134+
# Allow the checker to be invoked directly as a script.
if __name__ == "__main__":
    main()

.ci/scripts/test_cortex_m_e2e.sh

Lines changed: 12 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -6,76 +6,20 @@
66
# This source code is licensed under the BSD-style license found in the
77
# LICENSE file in the root directory of this source tree.
88

9-
# End-to-end test for Cortex-M backend: export a model via aot_arm_compiler
10-
# with cortex-m55+int8 target, then run the .bpte on Corstone-300 FVP.
11-
#
12-
# Usage: bash .ci/scripts/test_cortex_m_e2e.sh <model_name>
13-
# Example: bash .ci/scripts/test_cortex_m_e2e.sh mv2
9+
# CI wrapper: export a model for the Cortex-M backend and run it on the
10+
# Corstone-300 FVP via examples/arm/run.sh. The real work (export, runner
11+
# build, FVP launch, Test_result: PASS/FAIL check) is done by run.sh and
12+
# the run_fvp.sh it invokes.
1413

15-
set -eux
14+
set -eu
1615

1716
MODEL=$1
18-
mkdir -p "./cortex_m_e2e/${MODEL}"
19-
WORK_DIR=$(realpath "./cortex_m_e2e/${MODEL}")
17+
script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
18+
et_root_dir=$(realpath "${script_dir}/../..")
2019

21-
echo "=== Exporting ${MODEL} with cortex-m55+int8 ==="
22-
python -m backends.arm.scripts.aot_arm_compiler \
23-
-m "${MODEL}" \
20+
# Quantization is the default for the cortex-m55+int8 target; run.sh's
21+
# arg parser only recognizes --no_quantize, so we omit any explicit flag.
22+
bash "${et_root_dir}/examples/arm/run.sh" \
23+
--model_name="${MODEL}" \
2424
--target=cortex-m55+int8 \
25-
--quantize \
26-
--bundleio \
27-
--intermediates="${WORK_DIR}/intermediates" \
28-
--output="${WORK_DIR}/${MODEL}.bpte"
29-
30-
BPTE="${WORK_DIR}/${MODEL}.bpte"
31-
test -f "${BPTE}" || { echo "FAIL: ${BPTE} not produced"; exit 1; }
32-
echo "=== Exported ${BPTE} ($(stat --printf='%s' "${BPTE}") bytes) ==="
33-
34-
ELF="arm_test/arm_semihosting_executor_runner_corstone-300/arm_executor_runner"
35-
test -f "${ELF}" || { echo "FAIL: executor runner not found at ${ELF}"; exit 1; }
36-
37-
LOG_FILE=$(mktemp)
38-
39-
# Create a tiny dummy input file — the runner requires -i but BundleIO
40-
# ignores it and uses the embedded test inputs instead.
41-
dd if=/dev/zero of="${WORK_DIR}/dummy.bin" bs=4 count=1 2>/dev/null
42-
43-
echo "=== Running ${MODEL} on Corstone-300 FVP ==="
44-
FVP_Corstone_SSE-300_Ethos-U55 \
45-
-C ethosu.num_macs=128 \
46-
-C mps3_board.visualisation.disable-visualisation=1 \
47-
-C mps3_board.telnetterminal0.start_telnet=0 \
48-
-C mps3_board.uart0.out_file='-' \
49-
-C mps3_board.uart0.shutdown_on_eot=1 \
50-
-C cpu0.semihosting-enable=1 \
51-
-C cpu0.semihosting-stack_base=0 \
52-
-C cpu0.semihosting-heap_limit=0 \
53-
-C "cpu0.semihosting-cwd=${WORK_DIR}" \
54-
-C "ethosu.extra_args='--fast'" \
55-
-C "cpu0.semihosting-cmd_line='executor_runner -m ${MODEL}.bpte -i dummy.bin -o out'" \
56-
-a "${ELF}" \
57-
--timelimit 300 2>&1 | tee "${LOG_FILE}" || true
58-
59-
echo "=== Checking FVP output ==="
60-
61-
if grep -q "Test_result: PASS" "${LOG_FILE}"; then
62-
echo "=== SUCCESS: ${MODEL} e2e BundleIO test PASSED on FVP ==="
63-
rm "${LOG_FILE}"
64-
exit 0
65-
fi
66-
67-
if grep -q "Test_result: FAIL" "${LOG_FILE}"; then
68-
echo "FAIL: ${MODEL} BundleIO output mismatch"
69-
rm "${LOG_FILE}"
70-
exit 1
71-
fi
72-
73-
if grep -qE "(^[EF][: ].*$)|(^.*Hard fault.*$)|(^.*Assertion.*$)" "${LOG_FILE}"; then
74-
echo "FAIL: ${MODEL} FVP run hit a fatal error"
75-
rm "${LOG_FILE}"
76-
exit 1
77-
fi
78-
79-
echo "FAIL: ${MODEL} no BundleIO test result found in FVP output"
80-
rm "${LOG_FILE}"
81-
exit 1
25+
--bundleio

0 commit comments

Comments
 (0)