Skip to content

Commit f675b01

Browse files
Merge branch 'main' into change-1225529
2 parents 6ae1b6a + 490ec5c commit f675b01

258 files changed

Lines changed: 14288 additions & 3908 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.ci/scripts/test_cortex_m_e2e.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env bash
22
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# Copyright 2026 Arm Limited and/or its affiliates.
34
# All rights reserved.
45
#
56
# This source code is licensed under the BSD-style license found in the
@@ -18,7 +19,7 @@ mkdir -p "./cortex_m_e2e/${MODEL}"
1819
WORK_DIR=$(realpath "./cortex_m_e2e/${MODEL}")
1920

2021
echo "=== Exporting ${MODEL} with cortex-m55+int8 ==="
21-
python -m examples.arm.aot_arm_compiler \
22+
python -m backends.arm.scripts.aot_arm_compiler \
2223
-m "${MODEL}" \
2324
--target=cortex-m55+int8 \
2425
--quantize \

.ci/scripts/test_qnn_static_llm.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,11 @@ if [[ "${TASK_NAME}" == "stories_110m" ]]; then
4747
$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
4848

4949
# Compile only as weight sharing is not applicable on x86.
50-
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
50+
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --soc_model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
5151
exit_code1=$?
5252

5353
# Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
54-
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
54+
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --soc_model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
5555
exit_code2=$?
5656

5757
# Check the exit codes and print messages
@@ -84,7 +84,7 @@ elif [[ "${TASK_NAME}" == "smollm2_135m" ]]; then
8484
if [ -n "$2" ]; then
8585
EXTRA_FLAGS="$EXTRA_FLAGS --static_llm_eval_method $2"
8686
fi
87-
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_llm_model --model_name smollm2_135m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64 $EXTRA_FLAGS
87+
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_llm_model --model_name smollm2_135m --soc_model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64 $EXTRA_FLAGS
8888
exit_code1=$?
8989
if [ $exit_code1 -ne 0 ]; then
9090
exit 1

.github/workflows/add-unanswered-to-project.yml

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,19 @@ jobs:
4444
"ejnguyen", "andrewor14", "phaiting", "mgiordy", "LeeOHzzZ", "adicatana", "Polyomino", "ezrilow", "navsud",
4545
"michaelmaitland", "RahulC7", "seyeong-han", "thdusdl1219", "jaejunku", "felixweilbach", "apullin", "trviv", "junluan01",
4646
"mvartani-meta", "abeakkas", "elpdumont", "corporateshark", "bdemirb", "GeorgeTzoupis", "AdithyaReddy9", "drinkmorewaterr",
47-
"YifanShenSZ", "RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat","azad-meta", "junpi", "pytorchbot", "pytorchmergebot", "pytorchupdatebot",
48-
"facebook-github-bot", "app/dependabot", "Erik-Lundell", "zingo", "AdrianLundell", "oscarandersson8218", "per",
49-
"Sebastian-Larsson", "SaoirseARM", "robell", "mansnils", "martinlsm", "freddan80", "YufengShi-dudu", "tom-arm", "perheld",
50-
"Jerry-Ge", "gggekov", "fumchin", "wwwind", "benkli01", "Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed", "agrima1304",
51-
"emmakujala", "annietllnd", "MatthiasHertel80", "AlexTawseArm", "jmahbs", "morgolock", "Christoffer-JL", "ArmRyan", "xingguo01",
52-
"tgonzalezorlandoarm", "chizkiyahu", "sarah-blades", "itsMarco-G", "usamahz", "haowhsu-quic", "shewu-quic", "winskuo-quic",
53-
"chunit-quic", "DannyYuyang-quic", "chuntl", "thchenqti", "jethroqti", "chenweng-quic", "qti-horodnic", "qti-mmadhava", "quic-boyuc",
54-
"cymbalrush", "DenisVieriu97", "billmguo", "StrycekSimon", "jirioc", "robert-kalmar", "skywall", "MartinPavella", "roman-janik-nxp",
55-
"novak-vaclav", "neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio", "ynimmaga", "daniil-lyakhov", "emmanuel-ferdman",
56-
"cavusmustafa", "anzr299", "suryasidd", "Jiseong-oh", "alexdean08",
47+
"aliafzal", "YifanShenSZ", "RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat","azad-meta", "junpi",
48+
"pytorchbot", "pytorchmergebot", "pytorchupdatebot", "facebook-github-bot", "app/dependabot",
49+
"Erik-Lundell", "zingo", "AdrianLundell", "oscarandersson8218", "per", "Sebastian-Larsson", "SaoirseARM", "robell",
50+
"mansnils", "martinlsm", "freddan80", "YufengShi-dudu", "tom-arm", "perheld", "Jerry-Ge", "gggekov", "fumchin", "wwwind",
51+
"benkli01", "Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed", "agrima1304", "emmakujala", "annietllnd",
52+
"MatthiasHertel80", "AlexTawseArm", "jmahbs", "morgolock", "Christoffer-JL", "ArmRyan", "xingguo01", "tgonzalezorlandoarm",
53+
"chizkiyahu", "sarah-blades", "itsMarco-G", "usamahz", "Rob-Hughes-Arm",
54+
"haowhsu-quic", "shewu-quic", "winskuo-quic", "chunit-quic", "DannyYuyang-quic", "chuntl", "thchenqti", "jethroqti",
55+
"chenweng-quic", "qti-horodnic", "qti-mmadhava", "quic-boyuc", "zhaoxul-qti",
56+
"cymbalrush", "DenisVieriu97", "billmguo",
57+
"StrycekSimon", "jirioc", "robert-kalmar", "skywall", "MartinPavella", "roman-janik-nxp", "novak-vaclav", "irtrukhina",
58+
"neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio", "ynimmaga", "daniil-lyakhov",
59+
"emmanuel-ferdman", "cavusmustafa", "anzr299", "suryasidd", "Jiseong-oh", "alexdean08",
5760
// explicitly include the dependabot bot login seen in PRs
5861
"dependabot[bot]"
5962
]);
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
name: Pico2 Build Validation
2+
3+
on:
4+
push:
5+
branches: [main, release/*]
6+
paths:
7+
- examples/raspberry_pi/pico2/**
8+
- backends/cortex_m/**
9+
- .ci/scripts/**
10+
- examples/arm/**
11+
- .github/workflows/test-pico2-build.yml
12+
schedule:
13+
# Run daily at 3 AM UTC to catch upstream breakages
14+
- cron: '0 3 * * *'
15+
16+
concurrency:
17+
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.pull_request.number || github.sha }}
18+
cancel-in-progress: true
19+
20+
jobs:
21+
test-pico2-fp32-build:
22+
name: test-pico2-fp32-build
23+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
24+
permissions:
25+
id-token: write
26+
contents: read
27+
with:
28+
runner: linux.2xlarge.memory
29+
docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
30+
submodules: 'recursive'
31+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
32+
timeout: 120
33+
script: |
34+
set -euo pipefail
35+
# Activate conda environment from the docker image
36+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
37+
conda activate "${CONDA_ENV}"
38+
39+
source .ci/scripts/utils.sh
40+
install_executorch "--use-pt-pinned-commit"
41+
42+
# Install ARM baremetal toolchain
43+
.ci/scripts/setup-arm-baremetal-tools.sh
44+
source examples/arm/arm-scratch/setup_path.sh
45+
46+
echo "=== Step 1: Export FP32 model ==="
47+
cd examples/raspberry_pi/pico2
48+
python export_mlp_mnist.py
49+
test -f balanced_tiny_mlp_mnist.pte
50+
echo "FP32 model exported: $(ls -la balanced_tiny_mlp_mnist.pte)"
51+
cd -
52+
53+
echo "=== Step 2: Build FP32 firmware ==="
54+
./examples/raspberry_pi/pico2/build_firmware_pico.sh --model=balanced_tiny_mlp_mnist.pte
55+
56+
echo "=== Step 3: Validate FP32 artifacts ==="
57+
ELF_FILE=examples/raspberry_pi/pico2/build/executorch_pico.elf
58+
UF2_FILE=examples/raspberry_pi/pico2/build/executorch_pico.uf2
59+
60+
test -f "${ELF_FILE}" || { echo "FAIL: .elf not found"; exit 1; }
61+
test -f "${UF2_FILE}" || { echo "FAIL: .uf2 not found"; exit 1; }
62+
63+
echo "--- Section sizes ---"
64+
arm-none-eabi-size -A "${ELF_FILE}"
65+
66+
echo "--- Section headers ---"
67+
arm-none-eabi-objdump -h "${ELF_FILE}"
68+
69+
echo "--- Key symbols ---"
70+
arm-none-eabi-nm --print-size --size-sort --radix=d "${ELF_FILE}" | tail -20
71+
72+
# Validate binary fits in Pico2 memory using aggregated totals:
73+
# flash = text + data (4MB = 4194304 bytes)
74+
# SRAM = data + bss (520KB = 532480 bytes)
75+
eval $(arm-none-eabi-size "${ELF_FILE}" | awk 'NR==2 {printf "TEXT_SIZE=%d DATA_SIZE=%d BSS_SIZE=%d", $1, $2, $3}')
76+
77+
FLASH_USED=$((TEXT_SIZE + DATA_SIZE))
78+
SRAM_USED=$((DATA_SIZE + BSS_SIZE))
79+
echo "FP32 binary: text=${TEXT_SIZE} data=${DATA_SIZE} bss=${BSS_SIZE} => flash=${FLASH_USED} sram=${SRAM_USED}"
80+
81+
if [ "${FLASH_USED}" -gt 4194304 ]; then
82+
echo "FAIL: flash usage (${FLASH_USED}) exceeds 4MB"
83+
exit 1
84+
fi
85+
if [ "${SRAM_USED}" -gt 532480 ]; then
86+
echo "FAIL: SRAM usage (${SRAM_USED}) exceeds 520KB"
87+
exit 1
88+
fi
89+
echo "PASS: FP32 firmware fits in Pico2 memory (SRAM: ${SRAM_USED}/532480, Flash: ${FLASH_USED}/4194304)"
90+
91+
test-pico2-cmsis-build:
92+
name: test-pico2-cmsis-nn-build
93+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
94+
permissions:
95+
id-token: write
96+
contents: read
97+
with:
98+
runner: linux.2xlarge.memory
99+
docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
100+
submodules: 'recursive'
101+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
102+
timeout: 120
103+
script: |
104+
set -euo pipefail
105+
# Activate conda environment from the docker image
106+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
107+
conda activate "${CONDA_ENV}"
108+
109+
source .ci/scripts/utils.sh
110+
install_executorch "--use-pt-pinned-commit"
111+
112+
# Install ARM baremetal toolchain
113+
.ci/scripts/setup-arm-baremetal-tools.sh
114+
source examples/arm/arm-scratch/setup_path.sh
115+
116+
echo "=== Step 1: Export INT8 CMSIS-NN model ==="
117+
cd examples/raspberry_pi/pico2
118+
python export_mlp_mnist_cmsis.py
119+
test -f balanced_tiny_mlp_mnist_cmsis.pte
120+
echo "INT8 model exported: $(ls -la balanced_tiny_mlp_mnist_cmsis.pte)"
121+
cd -
122+
123+
echo "=== Step 2: Build CMSIS-NN firmware ==="
124+
./examples/raspberry_pi/pico2/build_firmware_pico.sh --cmsis --model=balanced_tiny_mlp_mnist_cmsis.pte
125+
126+
echo "=== Step 3: Validate CMSIS-NN artifacts ==="
127+
ELF_FILE=examples/raspberry_pi/pico2/build/executorch_pico.elf
128+
UF2_FILE=examples/raspberry_pi/pico2/build/executorch_pico.uf2
129+
130+
test -f "${ELF_FILE}" || { echo "FAIL: .elf not found"; exit 1; }
131+
test -f "${UF2_FILE}" || { echo "FAIL: .uf2 not found"; exit 1; }
132+
133+
echo "--- Section sizes ---"
134+
arm-none-eabi-size -A "${ELF_FILE}"
135+
136+
echo "--- Section headers ---"
137+
arm-none-eabi-objdump -h "${ELF_FILE}"
138+
139+
echo "--- Key symbols ---"
140+
arm-none-eabi-nm --print-size --size-sort --radix=d "${ELF_FILE}" | tail -20
141+
142+
# Verify CMSIS-NN symbols are linked
143+
CMSIS_NN_SYMBOLS=$(arm-none-eabi-nm "${ELF_FILE}" | grep -E '(cmsis_nn|arm_nn)' || true)
144+
if [ -n "${CMSIS_NN_SYMBOLS}" ]; then
145+
echo "PASS: CMSIS-NN symbols found in binary"
146+
printf '%s\n' "${CMSIS_NN_SYMBOLS}" | head -20
147+
else
148+
echo "FAIL: No CMSIS-NN symbols detected — cortex_m backend may not be linked correctly"
149+
exit 1
150+
fi
151+
152+
# Validate binary fits in Pico2 memory using aggregated totals:
153+
# flash = text + data (4MB = 4194304 bytes)
154+
# SRAM = data + bss (520KB = 532480 bytes)
155+
eval $(arm-none-eabi-size "${ELF_FILE}" | awk 'NR==2 {printf "TEXT_SIZE=%d DATA_SIZE=%d BSS_SIZE=%d", $1, $2, $3}')
156+
157+
FLASH_USED=$((TEXT_SIZE + DATA_SIZE))
158+
SRAM_USED=$((DATA_SIZE + BSS_SIZE))
159+
echo "CMSIS-NN binary: text=${TEXT_SIZE} data=${DATA_SIZE} bss=${BSS_SIZE} => flash=${FLASH_USED} sram=${SRAM_USED}"
160+
161+
if [ "${FLASH_USED}" -gt 4194304 ]; then
162+
echo "FAIL: flash usage (${FLASH_USED}) exceeds 4MB"
163+
exit 1
164+
fi
165+
if [ "${SRAM_USED}" -gt 532480 ]; then
166+
echo "FAIL: SRAM usage (${SRAM_USED}) exceeds 520KB"
167+
exit 1
168+
fi
169+
echo "PASS: CMSIS-NN firmware fits in Pico2 memory (SRAM: ${SRAM_USED}/532480, Flash: ${FLASH_USED}/4194304)"

backends/aoti/aoti_backend.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
class COMPILE_SPEC_KEYS(Enum):
2727
METHOD_NAME = "method_name"
28+
SHARE_KV_CACHE_ACROSS_METHODS = "share_kv_cache_across_methods"
2829

2930

3031
@experimental(
@@ -286,3 +287,13 @@ def method_name_from_compile_specs(
286287
raise RuntimeError(
287288
f"Could not find method name in compile specs: {compile_specs}"
288289
)
290+
291+
@classmethod
292+
def generate_share_kv_cache_compile_spec(cls) -> CompileSpec:
293+
"""
294+
Generate a CompileSpec to enable cross-method KV cache sharing.
295+
"""
296+
return CompileSpec(
297+
COMPILE_SPEC_KEYS.SHARE_KV_CACHE_ACROSS_METHODS.value,
298+
bytes([1]),
299+
)

backends/apple/mps/runtime/MPSDevice.mm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
138138
ET_CHECK_OR_RETURN_ERROR(
139139
err == Error::Ok,
140140
Internal,
141-
"An error occured occured while compiling library %d", libraryType
141+
"An error occurred while compiling library %d", libraryType
142142
);
143143
}
144144
if (_m_pso_cache.find(kernelName) == _m_pso_cache.end()) {

backends/arm/README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,13 @@ Setup:
106106
./examples/arm/setup.sh --disable-ethos-u-deps --enable-mlsdk-deps
107107
```
108108

109+
This is the default setup path and installs the MLSDK components from pip.
110+
Developers who need local source builds can use:
111+
112+
```
113+
./backends/arm/scripts/setup-mlsdk-from-source.sh
114+
```
115+
109116
The current flow lowers to TOSA and converts to VGF for use in external projects,
110117
so the `executor_runner` is not typically used here.
111118

@@ -155,7 +162,7 @@ scp -P 2222 arm_test/cmake-out/executor_runner root@127.0.0.1:/tmp/
155162
Create a PTE file:
156163

157164
```
158-
python3 -m examples.arm.aot_arm_compiler \
165+
python3 -m backends.arm.scripts.aot_arm_compiler \
159166
--model_name examples/arm/example_modules/add.py \
160167
--delegate \
161168
--quantize \

backends/arm/_passes/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
from .decompose_glu_pass import DecomposeGluPass # noqa
5454
from .decompose_grouped_conv_pass import DecomposeGroupedConvPass # noqa
5555
from .decompose_groupnorm_pass import DecomposeGroupNormPass # noqa
56+
from .decompose_gru_pass import DecomposeGruPass # noqa
5657
from .decompose_index_copy_pass import DecomposeIndexCopyPass # noqa
5758
from .decompose_index_select_to_gather_pass import ( # noqa
5859
DecomposeIndexSelectToGatherPass,
@@ -70,13 +71,15 @@
7071
from .decompose_linear_pass import DecomposeLinearPass # noqa
7172
from .decompose_log1p_pass import DecomposeLog1pPass # noqa
7273
from .decompose_logit_pass import DecomposeLogitPass # noqa
74+
from .decompose_lstm_pass import DecomposeLstmPass # noqa
7375
from .decompose_masked_fill_pass import DecomposeMaskedFillPass # noqa
7476
from .decompose_matmul import DecomposeMatmulPass # noqa
7577
from .decompose_maxpool2d_with_dilation_pass import DecomposeMaxPool2dPass # noqa
7678
from .decompose_meandim_pass import DecomposeMeanDimPass # noqa
7779
from .decompose_ne_pass import DecomposeNotEqualPass # noqa
7880
from .decompose_quant_nodes import DecomposeQuantNodesPass # noqa
7981
from .decompose_remainder_pass import DecomposeRemainderPass # noqa
82+
from .decompose_rnn_pass import DecomposeRnnPass # noqa
8083
from .decompose_round_pass import DecomposeRoundPass # noqa
8184
from .decompose_sdpa_pass import DecomposeScaledDotProductAttentionPass # noqa
8285
from .decompose_select import DecomposeSelectPass # noqa
@@ -141,6 +144,7 @@
141144
from .replace_scalar_with_tensor_pass import ( # noqa
142145
ReplaceScalarWithTensorByProfilePass,
143146
)
147+
from .rewrite_avg_pool2d_pass import RewriteAvgPool2dPass # noqa
144148
from .rewrite_bool_bitwise_to_logical_pass import ( # noqa
145149
RewriteBoolBitwiseToLogicalPass,
146150
)
@@ -155,6 +159,7 @@
155159
from .rewrite_inplace_arithmetic_pass import RewriteInplaceArithmeticPass # noqa
156160
from .rewrite_le_lt_to_ge_gt_pass import RewriteLeLtToGeGtPass # noqa
157161
from .rewrite_matmul import RewriteMatmulPass # noqa
162+
from .rewrite_max_pool2d_pass import RewriteMaxPool2dPass # noqa
158163
from .rewrite_pad import RewritePadPass # noqa
159164
from .rewrite_slice import RewriteSlicePass # noqa
160165
from .rewrite_upsample import RewriteUpsamplePass # noqa

0 commit comments

Comments
 (0)