Skip to content

Commit f675b01

Browse files
Merge branch 'main' into change-1225529
2 parents 6ae1b6a + 490ec5c commit f675b01

258 files changed

Lines changed: 14288 additions & 3908 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.ci/scripts/test_cortex_m_e2e.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env bash
22
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# Copyright 2026 Arm Limited and/or its affiliates.
34
# All rights reserved.
45
#
56
# This source code is licensed under the BSD-style license found in the
@@ -18,7 +19,7 @@ mkdir -p "./cortex_m_e2e/${MODEL}"
1819
WORK_DIR=$(realpath "./cortex_m_e2e/${MODEL}")
1920

2021
echo "=== Exporting ${MODEL} with cortex-m55+int8 ==="
21-
python -m examples.arm.aot_arm_compiler \
22+
python -m backends.arm.scripts.aot_arm_compiler \
2223
-m "${MODEL}" \
2324
--target=cortex-m55+int8 \
2425
--quantize \

.ci/scripts/test_qnn_static_llm.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,11 @@ if [[ "${TASK_NAME}" == "stories_110m" ]]; then
4747
$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
4848

4949
# Compile only as weight sharing is not applicable on x86.
50-
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
50+
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --soc_model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
5151
exit_code1=$?
5252

5353
# Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
54-
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
54+
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --soc_model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
5555
exit_code2=$?
5656

5757
# Check the exit codes and print messages
@@ -84,7 +84,7 @@ elif [[ "${TASK_NAME}" == "smollm2_135m" ]]; then
8484
if [ -n "$2" ]; then
8585
EXTRA_FLAGS="$EXTRA_FLAGS --static_llm_eval_method $2"
8686
fi
87-
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_llm_model --model_name smollm2_135m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64 $EXTRA_FLAGS
87+
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_llm_model --model_name smollm2_135m --soc_model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64 $EXTRA_FLAGS
8888
exit_code1=$?
8989
if [ $exit_code1 -ne 0 ]; then
9090
exit 1

.github/workflows/add-unanswered-to-project.yml

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,19 @@ jobs:
4444
"ejnguyen", "andrewor14", "phaiting", "mgiordy", "LeeOHzzZ", "adicatana", "Polyomino", "ezrilow", "navsud",
4545
"michaelmaitland", "RahulC7", "seyeong-han", "thdusdl1219", "jaejunku", "felixweilbach", "apullin", "trviv", "junluan01",
4646
"mvartani-meta", "abeakkas", "elpdumont", "corporateshark", "bdemirb", "GeorgeTzoupis", "AdithyaReddy9", "drinkmorewaterr",
47-
"YifanShenSZ", "RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat","azad-meta", "junpi", "pytorchbot", "pytorchmergebot", "pytorchupdatebot",
48-
"facebook-github-bot", "app/dependabot", "Erik-Lundell", "zingo", "AdrianLundell", "oscarandersson8218", "per",
49-
"Sebastian-Larsson", "SaoirseARM", "robell", "mansnils", "martinlsm", "freddan80", "YufengShi-dudu", "tom-arm", "perheld",
50-
"Jerry-Ge", "gggekov", "fumchin", "wwwind", "benkli01", "Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed", "agrima1304",
51-
"emmakujala", "annietllnd", "MatthiasHertel80", "AlexTawseArm", "jmahbs", "morgolock", "Christoffer-JL", "ArmRyan", "xingguo01",
52-
"tgonzalezorlandoarm", "chizkiyahu", "sarah-blades", "itsMarco-G", "usamahz", "haowhsu-quic", "shewu-quic", "winskuo-quic",
53-
"chunit-quic", "DannyYuyang-quic", "chuntl", "thchenqti", "jethroqti", "chenweng-quic", "qti-horodnic", "qti-mmadhava", "quic-boyuc",
54-
"cymbalrush", "DenisVieriu97", "billmguo", "StrycekSimon", "jirioc", "robert-kalmar", "skywall", "MartinPavella", "roman-janik-nxp",
55-
"novak-vaclav", "neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio", "ynimmaga", "daniil-lyakhov", "emmanuel-ferdman",
56-
"cavusmustafa", "anzr299", "suryasidd", "Jiseong-oh", "alexdean08",
47+
"aliafzal", "YifanShenSZ", "RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat","azad-meta", "junpi",
48+
"pytorchbot", "pytorchmergebot", "pytorchupdatebot", "facebook-github-bot", "app/dependabot",
49+
"Erik-Lundell", "zingo", "AdrianLundell", "oscarandersson8218", "per", "Sebastian-Larsson", "SaoirseARM", "robell",
50+
"mansnils", "martinlsm", "freddan80", "YufengShi-dudu", "tom-arm", "perheld", "Jerry-Ge", "gggekov", "fumchin", "wwwind",
51+
"benkli01", "Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed", "agrima1304", "emmakujala", "annietllnd",
52+
"MatthiasHertel80", "AlexTawseArm", "jmahbs", "morgolock", "Christoffer-JL", "ArmRyan", "xingguo01", "tgonzalezorlandoarm",
53+
"chizkiyahu", "sarah-blades", "itsMarco-G", "usamahz", "Rob-Hughes-Arm",
54+
"haowhsu-quic", "shewu-quic", "winskuo-quic", "chunit-quic", "DannyYuyang-quic", "chuntl", "thchenqti", "jethroqti",
55+
"chenweng-quic", "qti-horodnic", "qti-mmadhava", "quic-boyuc", "zhaoxul-qti",
56+
"cymbalrush", "DenisVieriu97", "billmguo",
57+
"StrycekSimon", "jirioc", "robert-kalmar", "skywall", "MartinPavella", "roman-janik-nxp", "novak-vaclav", "irtrukhina",
58+
"neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio", "ynimmaga", "daniil-lyakhov",
59+
"emmanuel-ferdman", "cavusmustafa", "anzr299", "suryasidd", "Jiseong-oh", "alexdean08",
5760
// explicitly include the dependabot bot login seen in PRs
5861
"dependabot[bot]"
5962
]);
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
name: Pico2 Build Validation
2+
3+
on:
4+
push:
5+
branches: [main, release/*]
6+
paths:
7+
- examples/raspberry_pi/pico2/**
8+
- backends/cortex_m/**
9+
- .ci/scripts/**
10+
- examples/arm/**
11+
- .github/workflows/test-pico2-build.yml
12+
schedule:
13+
# Run daily at 3 AM UTC to catch upstream breakages
14+
- cron: '0 3 * * *'
15+
16+
concurrency:
17+
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.pull_request.number || github.sha }}
18+
cancel-in-progress: true
19+
20+
jobs:
21+
test-pico2-fp32-build:
22+
name: test-pico2-fp32-build
23+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
24+
permissions:
25+
id-token: write
26+
contents: read
27+
with:
28+
runner: linux.2xlarge.memory
29+
docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
30+
submodules: 'recursive'
31+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
32+
timeout: 120
33+
script: |
34+
set -euo pipefail
35+
# Activate conda environment from the docker image
36+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
37+
conda activate "${CONDA_ENV}"
38+
39+
source .ci/scripts/utils.sh
40+
install_executorch "--use-pt-pinned-commit"
41+
42+
# Install ARM baremetal toolchain
43+
.ci/scripts/setup-arm-baremetal-tools.sh
44+
source examples/arm/arm-scratch/setup_path.sh
45+
46+
echo "=== Step 1: Export FP32 model ==="
47+
cd examples/raspberry_pi/pico2
48+
python export_mlp_mnist.py
49+
test -f balanced_tiny_mlp_mnist.pte
50+
echo "FP32 model exported: $(ls -la balanced_tiny_mlp_mnist.pte)"
51+
cd -
52+
53+
echo "=== Step 2: Build FP32 firmware ==="
54+
./examples/raspberry_pi/pico2/build_firmware_pico.sh --model=balanced_tiny_mlp_mnist.pte
55+
56+
echo "=== Step 3: Validate FP32 artifacts ==="
57+
ELF_FILE=examples/raspberry_pi/pico2/build/executorch_pico.elf
58+
UF2_FILE=examples/raspberry_pi/pico2/build/executorch_pico.uf2
59+
60+
test -f "${ELF_FILE}" || { echo "FAIL: .elf not found"; exit 1; }
61+
test -f "${UF2_FILE}" || { echo "FAIL: .uf2 not found"; exit 1; }
62+
63+
echo "--- Section sizes ---"
64+
arm-none-eabi-size -A "${ELF_FILE}"
65+
66+
echo "--- Section headers ---"
67+
arm-none-eabi-objdump -h "${ELF_FILE}"
68+
69+
echo "--- Key symbols ---"
70+
arm-none-eabi-nm --print-size --size-sort --radix=d "${ELF_FILE}" | tail -20
71+
72+
# Validate binary fits in Pico2 memory using aggregated totals:
73+
# flash = text + data (4MB = 4194304 bytes)
74+
# SRAM = data + bss (520KB = 532480 bytes)
75+
eval $(arm-none-eabi-size "${ELF_FILE}" | awk 'NR==2 {printf "TEXT_SIZE=%d DATA_SIZE=%d BSS_SIZE=%d", $1, $2, $3}')
76+
77+
FLASH_USED=$((TEXT_SIZE + DATA_SIZE))
78+
SRAM_USED=$((DATA_SIZE + BSS_SIZE))
79+
echo "FP32 binary: text=${TEXT_SIZE} data=${DATA_SIZE} bss=${BSS_SIZE} => flash=${FLASH_USED} sram=${SRAM_USED}"
80+
81+
if [ "${FLASH_USED}" -gt 4194304 ]; then
82+
echo "FAIL: flash usage (${FLASH_USED}) exceeds 4MB"
83+
exit 1
84+
fi
85+
if [ "${SRAM_USED}" -gt 532480 ]; then
86+
echo "FAIL: SRAM usage (${SRAM_USED}) exceeds 520KB"
87+
exit 1
88+
fi
89+
echo "PASS: FP32 firmware fits in Pico2 memory (SRAM: ${SRAM_USED}/532480, Flash: ${FLASH_USED}/4194304)"
90+
91+
test-pico2-cmsis-build:
92+
name: test-pico2-cmsis-nn-build
93+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
94+
permissions:
95+
id-token: write
96+
contents: read
97+
with:
98+
runner: linux.2xlarge.memory
99+
docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
100+
submodules: 'recursive'
101+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
102+
timeout: 120
103+
script: |
104+
set -euo pipefail
105+
# Activate conda environment from the docker image
106+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
107+
conda activate "${CONDA_ENV}"
108+
109+
source .ci/scripts/utils.sh
110+
install_executorch "--use-pt-pinned-commit"
111+
112+
# Install ARM baremetal toolchain
113+
.ci/scripts/setup-arm-baremetal-tools.sh
114+
source examples/arm/arm-scratch/setup_path.sh
115+
116+
echo "=== Step 1: Export INT8 CMSIS-NN model ==="
117+
cd examples/raspberry_pi/pico2
118+
python export_mlp_mnist_cmsis.py
119+
test -f balanced_tiny_mlp_mnist_cmsis.pte
120+
echo "INT8 model exported: $(ls -la balanced_tiny_mlp_mnist_cmsis.pte)"
121+
cd -
122+
123+
echo "=== Step 2: Build CMSIS-NN firmware ==="
124+
./examples/raspberry_pi/pico2/build_firmware_pico.sh --cmsis --model=balanced_tiny_mlp_mnist_cmsis.pte
125+
126+
echo "=== Step 3: Validate CMSIS-NN artifacts ==="
127+
ELF_FILE=examples/raspberry_pi/pico2/build/executorch_pico.elf
128+
UF2_FILE=examples/raspberry_pi/pico2/build/executorch_pico.uf2
129+
130+
test -f "${ELF_FILE}" || { echo "FAIL: .elf not found"; exit 1; }
131+
test -f "${UF2_FILE}" || { echo "FAIL: .uf2 not found"; exit 1; }
132+
133+
echo "--- Section sizes ---"
134+
arm-none-eabi-size -A "${ELF_FILE}"
135+
136+
echo "--- Section headers ---"
137+
arm-none-eabi-objdump -h "${ELF_FILE}"
138+
139+
echo "--- Key symbols ---"
140+
arm-none-eabi-nm --print-size --size-sort --radix=d "${ELF_FILE}" | tail -20
141+
142+
# Verify CMSIS-NN symbols are linked
143+
CMSIS_NN_SYMBOLS=$(arm-none-eabi-nm "${ELF_FILE}" | grep -E '(cmsis_nn|arm_nn)' || true)
144+
if [ -n "${CMSIS_NN_SYMBOLS}" ]; then
145+
echo "PASS: CMSIS-NN symbols found in binary"
146+
printf '%s\n' "${CMSIS_NN_SYMBOLS}" | head -20
147+
else
148+
echo "FAIL: No CMSIS-NN symbols detected — cortex_m backend may not be linked correctly"
149+
exit 1
150+
fi
151+
152+
# Validate binary fits in Pico2 memory using aggregated totals:
153+
# flash = text + data (4MB = 4194304 bytes)
154+
# SRAM = data + bss (520KB = 532480 bytes)
155+
eval $(arm-none-eabi-size "${ELF_FILE}" | awk 'NR==2 {printf "TEXT_SIZE=%d DATA_SIZE=%d BSS_SIZE=%d", $1, $2, $3}')
156+
157+
FLASH_USED=$((TEXT_SIZE + DATA_SIZE))
158+
SRAM_USED=$((DATA_SIZE + BSS_SIZE))
159+
echo "CMSIS-NN binary: text=${TEXT_SIZE} data=${DATA_SIZE} bss=${BSS_SIZE} => flash=${FLASH_USED} sram=${SRAM_USED}"
160+
161+
if [ "${FLASH_USED}" -gt 4194304 ]; then
162+
echo "FAIL: flash usage (${FLASH_USED}) exceeds 4MB"
163+
exit 1
164+
fi
165+
if [ "${SRAM_USED}" -gt 532480 ]; then
166+
echo "FAIL: SRAM usage (${SRAM_USED}) exceeds 520KB"
167+
exit 1
168+
fi
169+
echo "PASS: CMSIS-NN firmware fits in Pico2 memory (SRAM: ${SRAM_USED}/532480, Flash: ${FLASH_USED}/4194304)"

backends/aoti/aoti_backend.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
class COMPILE_SPEC_KEYS(Enum):
2727
METHOD_NAME = "method_name"
28+
SHARE_KV_CACHE_ACROSS_METHODS = "share_kv_cache_across_methods"
2829

2930

3031
@experimental(
@@ -286,3 +287,13 @@ def method_name_from_compile_specs(
286287
raise RuntimeError(
287288
f"Could not find method name in compile specs: {compile_specs}"
288289
)
290+
291+
@classmethod
292+
def generate_share_kv_cache_compile_spec(cls) -> CompileSpec:
293+
"""
294+
Generate a CompileSpec to enable cross-method KV cache sharing.
295+
"""
296+
return CompileSpec(
297+
COMPILE_SPEC_KEYS.SHARE_KV_CACHE_ACROSS_METHODS.value,
298+
bytes([1]),
299+
)

backends/apple/mps/runtime/MPSDevice.mm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
138138
ET_CHECK_OR_RETURN_ERROR(
139139
err == Error::Ok,
140140
Internal,
141-
"An error occured occured while compiling library %d", libraryType
141+
"An error occurred while compiling library %d", libraryType
142142
);
143143
}
144144
if (_m_pso_cache.find(kernelName) == _m_pso_cache.end()) {

backends/arm/README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,13 @@ Setup:
106106
./examples/arm/setup.sh --disable-ethos-u-deps --enable-mlsdk-deps
107107
```
108108

109+
This is the default setup path and installs the MLSDK components from pip.
110+
Developers who need local source builds can use:
111+
112+
```
113+
./backends/arm/scripts/setup-mlsdk-from-source.sh
114+
```
115+
109116
The current flow lowers to TOSA and converts to VGF for use in external projects,
110117
so the `executor_runner` is not typically used here.
111118

@@ -155,7 +162,7 @@ scp -P 2222 arm_test/cmake-out/executor_runner root@127.0.0.1:/tmp/
155162
Create a PTE file:
156163

157164
```
158-
python3 -m examples.arm.aot_arm_compiler \
165+
python3 -m backends.arm.scripts.aot_arm_compiler \
159166
--model_name examples/arm/example_modules/add.py \
160167
--delegate \
161168
--quantize \

backends/arm/_passes/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
from .decompose_glu_pass import DecomposeGluPass # noqa
5454
from .decompose_grouped_conv_pass import DecomposeGroupedConvPass # noqa
5555
from .decompose_groupnorm_pass import DecomposeGroupNormPass # noqa
56+
from .decompose_gru_pass import DecomposeGruPass # noqa
5657
from .decompose_index_copy_pass import DecomposeIndexCopyPass # noqa
5758
from .decompose_index_select_to_gather_pass import ( # noqa
5859
DecomposeIndexSelectToGatherPass,
@@ -70,13 +71,15 @@
7071
from .decompose_linear_pass import DecomposeLinearPass # noqa
7172
from .decompose_log1p_pass import DecomposeLog1pPass # noqa
7273
from .decompose_logit_pass import DecomposeLogitPass # noqa
74+
from .decompose_lstm_pass import DecomposeLstmPass # noqa
7375
from .decompose_masked_fill_pass import DecomposeMaskedFillPass # noqa
7476
from .decompose_matmul import DecomposeMatmulPass # noqa
7577
from .decompose_maxpool2d_with_dilation_pass import DecomposeMaxPool2dPass # noqa
7678
from .decompose_meandim_pass import DecomposeMeanDimPass # noqa
7779
from .decompose_ne_pass import DecomposeNotEqualPass # noqa
7880
from .decompose_quant_nodes import DecomposeQuantNodesPass # noqa
7981
from .decompose_remainder_pass import DecomposeRemainderPass # noqa
82+
from .decompose_rnn_pass import DecomposeRnnPass # noqa
8083
from .decompose_round_pass import DecomposeRoundPass # noqa
8184
from .decompose_sdpa_pass import DecomposeScaledDotProductAttentionPass # noqa
8285
from .decompose_select import DecomposeSelectPass # noqa
@@ -141,6 +144,7 @@
141144
from .replace_scalar_with_tensor_pass import ( # noqa
142145
ReplaceScalarWithTensorByProfilePass,
143146
)
147+
from .rewrite_avg_pool2d_pass import RewriteAvgPool2dPass # noqa
144148
from .rewrite_bool_bitwise_to_logical_pass import ( # noqa
145149
RewriteBoolBitwiseToLogicalPass,
146150
)
@@ -155,6 +159,7 @@
155159
from .rewrite_inplace_arithmetic_pass import RewriteInplaceArithmeticPass # noqa
156160
from .rewrite_le_lt_to_ge_gt_pass import RewriteLeLtToGeGtPass # noqa
157161
from .rewrite_matmul import RewriteMatmulPass # noqa
162+
from .rewrite_max_pool2d_pass import RewriteMaxPool2dPass # noqa
158163
from .rewrite_pad import RewritePadPass # noqa
159164
from .rewrite_slice import RewriteSlicePass # noqa
160165
from .rewrite_upsample import RewriteUpsamplePass # noqa

0 commit comments

Comments
 (0)