Update on "[Executorch] Add non-flash SDPA for decode"
Add a cpu_sdpa template function in op_sdpa_impl.h that provides a
simpler SDPA implementation using standard GEMM (no tiling). This is
useful as a baseline and for cases where flash attention is not optimal.
The implementation uses a single SeqDim parameter for all tensors and
supports causal masking, attention masks, GQA, and multi-threading.
During decode (seq_len == 1), the tiled flash attention implementation
has unnecessary overhead from its blocking/tiling logic. The simpler
unfused SDPA path using direct GEMM is more efficient for single-query
attention, yielding a ~25-30% decode throughput improvement on S25
(41 -> 53 tok/s for a 1.4B-parameter model).
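As a rough sketch of why the unfused path is cheap for decode (names, layouts, and the GQA head mapping below are illustrative assumptions, not the actual op_sdpa_impl.h code): with seq_len == 1, per-head SDPA collapses to two GEMV-like loops plus one full-row softmax, with no tiling or running-max bookkeeping.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical single-query SDPA for decode.
// q: [n_heads_q, head_dim]; k, v: [n_heads_kv, kv_len, head_dim], contiguous.
// Returns out: [n_heads_q, head_dim].
std::vector<float> sdpa_decode(const float* q, const float* k, const float* v,
                               std::size_t n_heads_q, std::size_t n_heads_kv,
                               std::size_t kv_len, std::size_t head_dim) {
  std::vector<float> out(n_heads_q * head_dim, 0.0f);
  const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
  for (std::size_t h = 0; h < n_heads_q; ++h) {
    // GQA: several query heads share one KV head (illustrative mapping).
    const std::size_t h_kv = h * n_heads_kv / n_heads_q;
    const float* qh = q + h * head_dim;
    const float* kh = k + h_kv * kv_len * head_dim;
    const float* vh = v + h_kv * kv_len * head_dim;
    // Scores: one scaled q . K^T pass (a GEMV since there is one query row).
    std::vector<float> s(kv_len);
    float max_s = -INFINITY;
    for (std::size_t t = 0; t < kv_len; ++t) {
      float dot = 0.0f;
      for (std::size_t d = 0; d < head_dim; ++d)
        dot += qh[d] * kh[t * head_dim + d];
      s[t] = dot * scale;
      max_s = std::max(max_s, s[t]);
    }
    // Softmax over the whole row at once: no blocking, no running rescale.
    float denom = 0.0f;
    for (std::size_t t = 0; t < kv_len; ++t) {
      s[t] = std::exp(s[t] - max_s);
      denom += s[t];
    }
    // Output: weights . V, the second GEMV.
    float* oh = out.data() + h * head_dim;
    for (std::size_t t = 0; t < kv_len; ++t) {
      const float w = s[t] / denom;
      for (std::size_t d = 0; d < head_dim; ++d)
        oh[d] += w * vh[t * head_dim + d];
    }
  }
  return out;
}
```

Note that with a single query row, causal masking is a no-op (the query attends to the whole cache), which is part of why the tiled flash-attention machinery buys nothing here.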
This makes cpu_sdpa always available (previously gated behind
ET_USE_UNFUSED_SDPA) and dispatches to it when seq_len == 1 and
inputs are not quantized. Prefill continues to use flash attention.
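The dispatch condition described above can be summarized as a predicate like the following (a hypothetical helper, not the actual ExecuTorch call site):

```cpp
// Decode (single query token, unquantized inputs) takes the unfused GEMM
// path; everything else, including prefill, keeps flash attention.
bool use_unfused_sdpa(long seq_len, bool is_quantized) {
  return seq_len == 1 && !is_quantized;
}
```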
Differential Revision: [D96044318](https://our.internmc.facebook.com/intern/diff/D96044318/)
[ghstack-poisoned]