Skip to content

Commit 0c0b205

Browse files
author
ssjia
committed
Update on "[ET-VK][ez] Implement helper functions to get fastest moving dim"
Add C++ and GLSL helpers to query the fastest moving dimension (the dimension with stride 1 in buffer layout). This is useful for optimizing memory access patterns in shaders, as iterating along the fastest moving dimension maximizes cache locality. The C++ `fastest_whcn_dim()` method accounts for block-transposed layouts by returning `outer_packed_dim` instead of `packed_dim` when applicable. A corresponding GLSL macro extracts this info from the hashed layout. Differential Revision: [D92061369](https://our.internmc.facebook.com/intern/diff/D92061369/) [ghstack-poisoned]
2 parents 73b4a69 + e029e51 commit 0c0b205

100 files changed

Lines changed: 1902 additions & 778 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.ci/scripts/export_model_artifact.sh

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
77

8-
# Export model to CUDA/Metal format with optional quantization
8+
# Export model to CUDA/Metal/XNNPACK format with optional quantization
99

1010
show_help() {
1111
cat << EOF
1212
Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir]
1313
14-
Export a HuggingFace model to CUDA/Metal format with optional quantization.
14+
Export a HuggingFace model to CUDA/Metal/XNNPACK format with optional quantization.
1515
1616
Arguments:
17-
device cuda or metal (required)
17+
device cuda, metal, or xnnpack (required)
1818
1919
hf_model HuggingFace model ID (required)
2020
Supported models:
@@ -28,6 +28,7 @@ Arguments:
2828
- non-quantized
2929
- quantized-int4-tile-packed
3030
- quantized-int4-weight-only
31+
- quantized-8da4w (XNNPACK only)
3132
3233
output_dir Output directory for artifacts (optional, default: current directory)
3334
@@ -36,6 +37,7 @@ Examples:
3637
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
3738
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
3839
export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
40+
export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
3941
EOF
4042
}
4143

@@ -64,9 +66,11 @@ case "$DEVICE" in
6466
;;
6567
metal)
6668
;;
69+
xnnpack)
70+
;;
6771
*)
6872
echo "Error: Unsupported device '$DEVICE'"
69-
echo "Supported devices: cuda, cuda-windows, metal"
73+
echo "Supported devices: cuda, cuda-windows, metal, xnnpack"
7074
exit 1
7175
;;
7276
esac
@@ -139,9 +143,16 @@ case "$QUANT_NAME" in
139143
fi
140144
EXTRA_ARGS="--qlinear_encoder 4w"
141145
;;
146+
quantized-8da4w)
147+
if [ "$DEVICE" != "xnnpack" ]; then
148+
echo "Error: quantized-8da4w is only supported with xnnpack device"
149+
exit 1
150+
fi
151+
EXTRA_ARGS="--qlinear 8da4w --qlinear_group_size 32 --qlinear_encoder 8da4w --qlinear_encoder_group_size 32"
152+
;;
142153
*)
143154
echo "Error: Unsupported quantization '$QUANT_NAME'"
144-
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only"
155+
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only, quantized-8da4w"
145156
exit 1
146157
;;
147158
esac
@@ -157,10 +168,17 @@ pip list
157168
if [ "$MODEL_NAME" = "parakeet" ]; then
158169
pip install -r examples/models/parakeet/install_requirements.txt
159170

171+
# Set dtype based on backend (XNNPACK uses fp32, CUDA/Metal use bf16)
172+
if [ "$DEVICE" = "xnnpack" ]; then
173+
DTYPE_ARG=""
174+
else
175+
DTYPE_ARG="--dtype bf16"
176+
fi
177+
160178
python -m executorch.examples.models.parakeet.export_parakeet_tdt \
161179
--backend "$DEVICE" \
162180
--output-dir "${OUTPUT_DIR}" \
163-
--dtype bf16 \
181+
${DTYPE_ARG} \
164182
${EXTRA_ARGS}
165183

166184
test -f "${OUTPUT_DIR}/model.pte"

.ci/scripts/test_model_e2e.sh

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
77

8-
# Test CUDA/Metal model end-to-end, need to run .ci/scripts/export_model_artifact.sh first
8+
# Test CUDA/Metal/XNNPACK model end-to-end, need to run .ci/scripts/export_model_artifact.sh first
99

1010
show_help() {
1111
cat << EOF
1212
Usage: test_model_e2e.sh <device> <hf_model> <quant_name> [model_dir]
1313
14-
Build and run end-to-end tests for CUDA/Metal models.
14+
Build and run end-to-end tests for CUDA/Metal/XNNPACK models.
1515
1616
Arguments:
17-
device cuda or metal (required)
17+
device cuda, metal, or xnnpack (required)
1818
1919
hf_model HuggingFace model ID (required)
2020
Supported models:
@@ -28,6 +28,7 @@ Arguments:
2828
- non-quantized
2929
- quantized-int4-tile-packed
3030
- quantized-int4-weight-only
31+
- quantized-8da4w (XNNPACK only)
3132
3233
model_dir Directory containing model artifacts (optional, default: current directory)
3334
Expected files: model.pte, aoti_cuda_blob.ptd (CUDA only)
@@ -37,6 +38,7 @@ Examples:
3738
test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
3839
test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
3940
test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
41+
test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
4042
EOF
4143
}
4244

@@ -174,12 +176,17 @@ echo "::endgroup::"
174176

175177
echo "::group::Build $MODEL_NAME Runner"
176178

177-
if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ]; then
178-
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda' or 'metal'."
179+
if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ] && [ "$DEVICE" != "xnnpack" ]; then
180+
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda', 'metal', or 'xnnpack'."
179181
exit 1
180182
fi
181183

182-
MAKE_TARGET="${RUNNER_PATH}-${DEVICE}"
184+
# Map device to make target (xnnpack uses cpu target which includes XNNPACK)
185+
if [ "$DEVICE" = "xnnpack" ]; then
186+
MAKE_TARGET="${RUNNER_PATH}-cpu"
187+
else
188+
MAKE_TARGET="${RUNNER_PATH}-${DEVICE}"
189+
fi
183190
make "${MAKE_TARGET}"
184191
echo "::endgroup::"
185192

.github/workflows/add-unanswered-to-project.yml

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,16 +43,16 @@ jobs:
4343
"ethansfng", "ThomasJannaud", "nirvanagth", "marcinkwiatkowski", "3l1", "omerjerk", "nitish2112", "yipjustin",
4444
"ejnguyen", "andrewor14", "phaiting", "mgiordy", "LeeOHzzZ", "adicatana", "Polyomino", "ezrilow", "navsud",
4545
"michaelmaitland", "RahulC7", "seyeong-han", "thdusdl1219", "jaejunku", "felixweilbach", "apullin", "trviv", "junluan01",
46-
"YifanShenSZ", "RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat", "azad-meta", "junpi", "pytorchbot", "pytorchmergebot",
47-
"pytorchupdatebot", "facebook-github-bot", "app/dependabot", "Erik-Lundell", "zingo", "AdrianLundell", "oscarandersson8218",
48-
"per", "Sebastian-Larsson", "SaoirseARM", "robell", "mansnils", "martinlsm", "freddan80", "YufengShi-dudu", "tom-arm",
49-
"perheld", "Jerry-Ge", "gggekov", "fumchin", "wwwind", "benkli01", "Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed",
50-
"agrima1304", "emmakujala", "annietllnd", "MatthiasHertel80", "AlexTawseArm", "jmahbs", "morgolock", "Christoffer-JL",
51-
"ArmRyan", "xingguo01", "tgonzalezorlandoarm", "chizkiyahu", "sarah-blades", "haowhsu-quic", "shewu-quic", "winskuo-quic",
52-
"chunit-quic", "DannyYuyang-quic", "chuntl", "thchenqti", "jethroqti", "chenweng-quic", "cymbalrush", "DenisVieriu97",
53-
"billmguo", "StrycekSimon", "jirioc", "robert-kalmar", "skywall", "MartinPavella", "roman-janik-nxp", "novak-vaclav",
54-
"neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio", "ynimmaga", "daniil-lyakhov", "emmanuel-ferdman", "cavusmustafa",
55-
"anzr299", "Jiseong-oh", "alexdean08",
46+
"mvartani-meta", "abeakkas", "elpdumont", "corporateshark", "YifanShenSZ", "RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat",
47+
"azad-meta", "junpi", "pytorchbot", "pytorchmergebot", "pytorchupdatebot", "facebook-github-bot", "app/dependabot",
48+
"Erik-Lundell", "zingo", "AdrianLundell", "oscarandersson8218", "per", "Sebastian-Larsson", "SaoirseARM", "robell", "mansnils",
49+
"martinlsm", "freddan80", "YufengShi-dudu", "tom-arm", "perheld", "Jerry-Ge", "gggekov", "fumchin", "wwwind", "benkli01",
50+
"Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed", "agrima1304", "emmakujala", "annietllnd", "MatthiasHertel80",
51+
"AlexTawseArm", "jmahbs", "morgolock", "Christoffer-JL", "ArmRyan", "xingguo01", "tgonzalezorlandoarm", "chizkiyahu",
52+
"sarah-blades", "haowhsu-quic", "shewu-quic", "winskuo-quic", "chunit-quic", "DannyYuyang-quic", "chuntl", "thchenqti",
53+
"jethroqti", "chenweng-quic", "cymbalrush", "DenisVieriu97", "billmguo", "StrycekSimon", "jirioc", "robert-kalmar",
54+
"skywall", "MartinPavella", "roman-janik-nxp", "novak-vaclav", "neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio",
55+
"ynimmaga", "daniil-lyakhov", "emmanuel-ferdman", "cavusmustafa", "anzr299", "Jiseong-oh", "alexdean08",
5656
// explicitly include the dependabot bot login seen in PRs
5757
"dependabot[bot]"
5858
]);
@@ -139,6 +139,11 @@ jobs:
139139
} else {
140140
console.log(`Skipping issue #${issue.number} by ${issue.user && issue.user.login}`);
141141
}
142+
if (!issue.pull_request && !isBotOrExcluded(issue.user) && !(await isMemberOfExcludedOrg(issue.user))) {
143+
await addItem(issue.node_id, 'issue', issue.number);
144+
} else {
145+
console.log(`Skipping issue #${issue.number} by ${issue.user && issue.user.login}`);
146+
}
142147
}
143148
144149
// Add open, non-draft PRs (regardless of review state), exclude by author/bots
@@ -156,6 +161,11 @@ jobs:
156161
} else {
157162
console.log(`Skipping PR #${pr.number} by ${pr.user && pr.user.login}`);
158163
}
164+
if (!pr.draft && !isBotOrExcluded(pr.user) && !(await isMemberOfExcludedOrg(pr.user))) {
165+
await addItem(pr.node_id, 'pr', pr.number);
166+
} else {
167+
console.log(`Skipping PR #${pr.number} by ${pr.user && pr.user.login}`);
168+
}
159169
}
160170
} catch (error) {
161171
core.setFailed(`Workflow failed: ${error.message}`);

.github/workflows/pull.yml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,39 @@ jobs:
163163
# Build and test ExecuTorch
164164
PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}"
165165
166+
test-parakeet-xnnpack-linux:
167+
name: test-parakeet-xnnpack-linux
168+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
169+
permissions:
170+
id-token: write
171+
contents: read
172+
strategy:
173+
fail-fast: false
174+
with:
175+
runner: linux.4xlarge.memory
176+
docker-image: ci-image:executorch-ubuntu-22.04-clang12
177+
submodules: 'recursive'
178+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
179+
timeout: 120
180+
script: |
181+
set -eux
182+
183+
# The generic Linux job chooses to use base env, not the one setup by the image
184+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
185+
conda activate "${CONDA_ENV}"
186+
187+
echo "::group::Setup ExecuTorch"
188+
./install_executorch.sh
189+
echo "::endgroup::"
190+
191+
echo "::group::Export Parakeet with XNNPACK"
192+
bash .ci/scripts/export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./parakeet_output"
193+
echo "::endgroup::"
194+
195+
echo "::group::Test Parakeet with XNNPACK"
196+
bash .ci/scripts/test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./parakeet_output"
197+
echo "::endgroup::"
198+
166199
test-llama-runner-linux:
167200
# Test Both linux x86 and linux aarch64
168201
name: test-llama-runner-linux

.lintrunner.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,15 @@ include_patterns = [
510510
'backends/arm/vgf/**/*.py',
511511
'backends/arm/tosa/**/*.py',
512512
'backends/arm/ethosu/**/*.py',
513+
'backends/arm/operators/**/*.py',
514+
'backends/arm/common/**/*.py',
515+
'backends/arm/util/**/*.py',
516+
'backends/arm/runtime/**/*.py',
517+
'backends/arm/quantizer/**/*.py',
518+
'backends/arm/debug/**/*.py',
519+
'backends/arm/scripts/**/*.py',
520+
'backends/arm/operator_support/**/*.py',
521+
'backends/arm/*.py',
513522
]
514523
exclude_patterns = ['third-party/**', '**/third-party/**']
515524
command = [

CLAUDE.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,37 @@ Refer to the repo/framework/runtime "executorch" (in lower cases) or "ExecuTorch
44
camel cases), not "ExecutorTorch". With limited code or comment length, maybe refer
55
to the framework "ET" but consider it as very unofficial and not recommended.
66

7+
# Install
8+
9+
## Python
10+
11+
If the user is mostly importing `executorch` module and experimenting with Ahead-Of-Time
12+
export flow, installation means installing `executorch` python package.
13+
14+
Python virtual environment or conda environment is highly recommended for installing
15+
executorch from source. Double check if the user wants to enable virtual enablement before
16+
building from source.
17+
18+
First time install: run `install_executorch.sh` (or `install_executorch.bat` for Windows).
19+
20+
This script handles dependencies properly (since `executorch` depends on nightly versions
21+
of `torch`, those packages won't be available in pip so need special index url).
22+
23+
Subsequent install: run `pip install . -v --no-build-isolation` inside `executorch`
24+
directory.
25+
26+
Editable mode is available (either through the `install_executorch.sh` script or via `pip install -e .`).
27+
28+
Refer to more details in this [doc](docs/source/using-executorch-building-from-source.md).
29+
30+
## C++
31+
If the user is building basic executorch C++ libraries, refer to root level [CMakeLists.txt](CMakeLists.txt).
32+
33+
If working with LLM/ASR runners, prefer to use [Makefile](Makefile) and cmake [presets](CMakePresets.json).
34+
35+
Again refer to this [doc](docs/source/using-executorch-building-from-source.md#building-the-c-runtime)
36+
for more details.
37+
738
# Commit messages
839

940
Don't commit unless the user explicitly asks you to.

backends/aoti/CMakeLists.txt

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,48 @@ install(
6363
DESTINATION ${CMAKE_INSTALL_LIBDIR}
6464
)
6565

66+
# ==============================================================================
67+
# SlimTensor INTERFACE library (header-only) Provides lightweight tensor
68+
# operations for AOTI backends
69+
# ==============================================================================
70+
add_library(slimtensor INTERFACE)
71+
target_include_directories(
72+
slimtensor
73+
INTERFACE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
74+
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
75+
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/runtime/core/portable_type/c10>
76+
$<INSTALL_INTERFACE:include>
77+
)
78+
79+
# Use custom macros instead of cmake-generated ones (same as Buck build)
80+
target_compile_definitions(
81+
slimtensor INTERFACE C10_USING_CUSTOM_GENERATED_MACROS
82+
)
83+
84+
# Add CUDA support for SlimTensor when building with CUDA
85+
if(EXECUTORCH_BUILD_CUDA)
86+
find_package(CUDAToolkit REQUIRED)
87+
target_include_directories(slimtensor INTERFACE ${CUDAToolkit_INCLUDE_DIRS})
88+
target_link_libraries(slimtensor INTERFACE CUDA::cudart)
89+
endif()
90+
91+
install(
92+
TARGETS slimtensor
93+
EXPORT ExecuTorchTargets
94+
DESTINATION ${CMAKE_INSTALL_LIBDIR}
95+
)
96+
6697
# ==============================================================================
6798
# AOTI common shims using SlimTensor (for CUDA backend) Uses SlimTensor for all
6899
# tensor operations
69100
# TODO(gasoonjia): Replace aoti_common with this one after metal migration
70101
# ==============================================================================
71-
add_library(aoti_common_shims_slim STATIC common_shims_slim.cpp)
72-
target_include_directories(
73-
aoti_common_shims_slim
74-
PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
75-
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
76-
)
102+
set(_aoti_common_shims_slim_sources common_shims_slim.cpp)
103+
if(EXECUTORCH_BUILD_CUDA)
104+
list(APPEND _aoti_common_shims_slim_sources slim/cuda/guard.cpp)
105+
endif()
106+
107+
add_library(aoti_common_shims_slim STATIC ${_aoti_common_shims_slim_sources})
77108
target_compile_options(
78109
aoti_common_shims_slim
79110
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
@@ -83,18 +114,7 @@ target_compile_definitions(
83114
aoti_common_shims_slim PUBLIC $<$<PLATFORM_ID:Windows>:EXPORT_AOTI_FUNCTIONS>
84115
)
85116

86-
# Add CUDA include directories and link CUDA runtime when building with CUDA
87-
if(EXECUTORCH_BUILD_CUDA)
88-
find_package(CUDAToolkit REQUIRED)
89-
target_include_directories(
90-
aoti_common_shims_slim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
91-
)
92-
target_link_libraries(aoti_common_shims_slim PUBLIC CUDA::cudart)
93-
endif()
94-
95-
target_link_libraries(
96-
aoti_common_shims_slim PUBLIC slimtensor extension_tensor ${CMAKE_DL_LIBS}
97-
)
117+
target_link_libraries(aoti_common_shims_slim PUBLIC slimtensor ${CMAKE_DL_LIBS})
98118

99119
install(
100120
TARGETS aoti_common_shims_slim

backends/arm/_passes/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@
7575
from .decompose_select import DecomposeSelectPass # noqa
7676
from .decompose_select_scatter_pass import DecomposeSelectScatterPass # noqa
7777
from .decompose_sign_pass import DecomposeSignPass # noqa
78-
from .decompose_silu_pass import DecomposeSiluPass # noqa
7978
from .decompose_sinh_pass import DecomposeSinhPass # noqa
8079
from .decompose_softmax_pass import DecomposeSoftmaxPass # noqa
8180
from .decompose_softmax_unstable_pass import DecomposeSoftmaxUnstablePass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@
7676
DecomposeSelectPass,
7777
DecomposeSelectScatterPass,
7878
DecomposeSignPass,
79-
DecomposeSiluPass,
8079
DecomposeSinhPass,
8180
DecomposeSoftmaxPass,
8281
DecomposeSoftmaxUnstablePass,
@@ -434,7 +433,6 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
434433
DecomposeLeakyReLUPass(tfa_pass=True),
435434
DecomposeLinalgVectorNormPass(tfa_pass=True),
436435
DecomposeSqrtPass(tfa_pass=True),
437-
DecomposeSiluPass(tfa_pass=True),
438436
DecomposeAvgPool2dPass(tfa_pass=True),
439437
DecomposeSoftmaxUnstablePass(tfa_pass=True),
440438
DecomposeSoftmaxPass(tfa_pass=True),

0 commit comments

Comments
 (0)