Skip to content

Commit 72134cf

Browse files
committed
Update on "Fix SLEEF preprocessor macro name to match ATen vec headers"
The ATen NEON vectorized math headers (vec128_float_neon.h) check for AT_BUILD_ARM_VEC256_WITH_SLEEF to enable SLEEF intrinsics for exp(), log(), etc. ExecuTorch's get_vec_preprocessor_flags() was defining ET_BUILD_ARM_VEC256_WITH_SLEEF instead (wrong prefix: ET_ rather than AT_), so the USE_SLEEF macro always took the fallback path, map(std::exp): scalar std::exp called once per element, wrapped in the overhead of a full vector load/store. With this fix, Vectorized<float>::exp() correctly dispatches to Sleef_expf4_u10 on ARM, which is the intended behavior. Differential Revision: [D96044314](https://our.internmc.facebook.com/intern/diff/D96044314/) [ghstack-poisoned]
2 parents f8bda7f + 0997271 commit 72134cf

36 files changed

Lines changed: 2921 additions & 363 deletions

.ci/scripts/export_model_artifact.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,9 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
418418
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
419419
python -m executorch.examples.models.qwen3_5_moe.export \
420420
--prequantized "$LOCAL_MODEL_DIR" \
421-
--output-dir "${OUTPUT_DIR}"
421+
--output-dir "${OUTPUT_DIR}" \
422+
--dense-prefill dequant \
423+
--moe-activation-dtype int8
422424
echo "::endgroup::"
423425

424426
test -f "${OUTPUT_DIR}/model.pte"

.ci/scripts/test_cortex_m_e2e.sh

Lines changed: 12 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -6,76 +6,20 @@
66
# This source code is licensed under the BSD-style license found in the
77
# LICENSE file in the root directory of this source tree.
88

9-
# End-to-end test for Cortex-M backend: export a model via aot_arm_compiler
10-
# with cortex-m55+int8 target, then run the .bpte on Corstone-300 FVP.
11-
#
12-
# Usage: bash .ci/scripts/test_cortex_m_e2e.sh <model_name>
13-
# Example: bash .ci/scripts/test_cortex_m_e2e.sh mv2
9+
# CI wrapper: export a model for the Cortex-M backend and run it on the
10+
# Corstone-300 FVP via examples/arm/run.sh. The real work (export, runner
11+
# build, FVP launch, Test_result: PASS/FAIL check) is done by run.sh and
12+
# the run_fvp.sh it invokes.
1413

15-
set -eux
14+
set -eu
1615

1716
MODEL=$1
18-
mkdir -p "./cortex_m_e2e/${MODEL}"
19-
WORK_DIR=$(realpath "./cortex_m_e2e/${MODEL}")
17+
script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
18+
et_root_dir=$(realpath "${script_dir}/../..")
2019

21-
echo "=== Exporting ${MODEL} with cortex-m55+int8 ==="
22-
python -m backends.arm.scripts.aot_arm_compiler \
23-
-m "${MODEL}" \
20+
# Quantization is the default for the cortex-m55+int8 target; run.sh's
21+
# arg parser only recognizes --no_quantize, so we omit any explicit flag.
22+
bash "${et_root_dir}/examples/arm/run.sh" \
23+
--model_name="${MODEL}" \
2424
--target=cortex-m55+int8 \
25-
--quantize \
26-
--bundleio \
27-
--intermediates="${WORK_DIR}/intermediates" \
28-
--output="${WORK_DIR}/${MODEL}.bpte"
29-
30-
BPTE="${WORK_DIR}/${MODEL}.bpte"
31-
test -f "${BPTE}" || { echo "FAIL: ${BPTE} not produced"; exit 1; }
32-
echo "=== Exported ${BPTE} ($(stat --printf='%s' "${BPTE}") bytes) ==="
33-
34-
ELF="arm_test/arm_semihosting_executor_runner_corstone-300/arm_executor_runner"
35-
test -f "${ELF}" || { echo "FAIL: executor runner not found at ${ELF}"; exit 1; }
36-
37-
LOG_FILE=$(mktemp)
38-
39-
# Create a tiny dummy input file — the runner requires -i but BundleIO
40-
# ignores it and uses the embedded test inputs instead.
41-
dd if=/dev/zero of="${WORK_DIR}/dummy.bin" bs=4 count=1 2>/dev/null
42-
43-
echo "=== Running ${MODEL} on Corstone-300 FVP ==="
44-
FVP_Corstone_SSE-300_Ethos-U55 \
45-
-C ethosu.num_macs=128 \
46-
-C mps3_board.visualisation.disable-visualisation=1 \
47-
-C mps3_board.telnetterminal0.start_telnet=0 \
48-
-C mps3_board.uart0.out_file='-' \
49-
-C mps3_board.uart0.shutdown_on_eot=1 \
50-
-C cpu0.semihosting-enable=1 \
51-
-C cpu0.semihosting-stack_base=0 \
52-
-C cpu0.semihosting-heap_limit=0 \
53-
-C "cpu0.semihosting-cwd=${WORK_DIR}" \
54-
-C "ethosu.extra_args='--fast'" \
55-
-C "cpu0.semihosting-cmd_line='executor_runner -m ${MODEL}.bpte -i dummy.bin -o out'" \
56-
-a "${ELF}" \
57-
--timelimit 300 2>&1 | tee "${LOG_FILE}" || true
58-
59-
echo "=== Checking FVP output ==="
60-
61-
if grep -q "Test_result: PASS" "${LOG_FILE}"; then
62-
echo "=== SUCCESS: ${MODEL} e2e BundleIO test PASSED on FVP ==="
63-
rm "${LOG_FILE}"
64-
exit 0
65-
fi
66-
67-
if grep -q "Test_result: FAIL" "${LOG_FILE}"; then
68-
echo "FAIL: ${MODEL} BundleIO output mismatch"
69-
rm "${LOG_FILE}"
70-
exit 1
71-
fi
72-
73-
if grep -qE "(^[EF][: ].*$)|(^.*Hard fault.*$)|(^.*Assertion.*$)" "${LOG_FILE}"; then
74-
echo "FAIL: ${MODEL} FVP run hit a fatal error"
75-
rm "${LOG_FILE}"
76-
exit 1
77-
fi
78-
79-
echo "FAIL: ${MODEL} no BundleIO test result found in FVP output"
80-
rm "${LOG_FILE}"
81-
exit 1
25+
--bundleio

.github/workflows/_test_cortex_m_e2e.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,5 @@ jobs:
4343
.ci/scripts/setup-arm-baremetal-tools.sh
4444
source examples/arm/arm-scratch/setup_path.sh
4545
46-
# Build cortex-m test runner with bundled IO support
47-
backends/cortex_m/test/build_test_runner.sh
48-
49-
# Export model and run on FVP
46+
# Export and run model on FVP (run.sh internally builds the test runner).
5047
bash .ci/scripts/test_cortex_m_e2e.sh ${{ matrix.model }}

backends/arm/_passes/rewrite_avg_pool2d_pass.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def call_operator(self, op, args, kwargs, meta, updated=False):
4242

4343
pad_h, pad_w = to_2tuple(args[3]) if len(args) > 3 else (0, 0)
4444
# Make sure pad corresponds to TOSA
45-
pad = [pad_h, pad_w, pad_h, pad_w]
45+
pad = [pad_h, pad_h, pad_w, pad_w]
4646

4747
ceil_mode = args[4] if len(args) > 4 else False
4848

backends/arm/requirements-arm-ethos-u.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,7 @@
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
55

6+
# These dependencies need to match pyproject.toml
7+
68
ethos-u-vela == 5.0.0
79
pte-adapter-model-explorer == 0.0.2

backends/arm/requirements-arm-tosa.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
55

6+
# These dependencies need to match pyproject.toml
7+
68
ml_dtypes == 0.5.1
79
flatbuffers == 24.3.25
810
tosa-adapter-model-explorer == 0.1.0

backends/arm/requirements-arm-vgf.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
55

6+
# These dependencies need to match pyproject.toml
7+
68
ai_ml_emulation_layer_for_vulkan == 0.9.0
79
ai_ml_sdk_model_converter == 0.9.0
810
ai_ml_sdk_vgf_library == 0.9.0

backends/arm/scripts/aot_arm_compiler.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -847,8 +847,8 @@ def _to_edge_TOSA_delegate(
847847
)
848848

849849
# Replace quantized_decomposed::{quantize,dequantize}_per_tensor nodes
850-
# with cortex_m:: equivalents for int8 QDQ ops remaining outside the
851-
# delegated subgraph.
850+
# with cortex_m:: equivalents for int8/int16 QDQ ops remaining outside
851+
# the delegated subgraph.
852852
edge = _apply_replace_quant_nodes(edge, target, direct_drive)
853853

854854
return model_quant, edge
@@ -955,8 +955,8 @@ def _to_edge_no_delegate(
955955
)
956956

957957
# Replace quantized_decomposed::{quantize,dequantize}_per_tensor nodes
958-
# with cortex_m:: equivalents for int8 QDQ ops remaining outside the
959-
# delegated subgraph.
958+
# with cortex_m:: equivalents for int8/int16 QDQ ops remaining outside
959+
# the delegated subgraph.
960960
edge = _apply_replace_quant_nodes(edge, args.target, args.direct_drive)
961961

962962
return model_quant, edge

backends/arm/scripts/run_fvp.sh

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ _setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly ins
1919

2020
elf_file=""
2121
data_file=""
22+
bundle_file=""
2223
target="ethos-u55-128"
2324
timeout="600"
2425
etrecord_file=""
@@ -29,6 +30,7 @@ help() {
2930
echo "Options:"
3031
echo " --elf=<ELF_FILE> elf file to run"
3132
echo " --data=<FILE>@<ADDRESS> Place a file in memory at this address, useful to emulate a PTE flashed into memory instead as part of the code."
33+
echo " --bundle=<BPTE_FILE> Bundled program (.bpte) to load via semihosting. Required for cortex-m targets; the FVP launches a semihosting executor_runner that reads the bundle from the host filesystem and checks the embedded reference outputs."
3234
echo " --target=<TARGET> Target to build and run for Default: ${target}"
3335
echo " --timeout=<TIME_IN_SEC> Maximum target runtime, used to detect hanging, might need to be higer on large models Default: ${timeout}"
3436
echo " --etrecord=<FILE> If ETDump is used you can supply a ETRecord file matching the PTE"
@@ -41,6 +43,7 @@ for arg in "$@"; do
4143
-h|--help) help ;;
4244
--elf=*) elf_file="${arg#*=}";;
4345
--data=*) data_file="--data ${arg#*=}";;
46+
--bundle=*) bundle_file="${arg#*=}";;
4447
--target=*) target="${arg#*=}";;
4548
--timeout=*) timeout="${arg#*=}";;
4649
--etrecord=*) etrecord_file="${arg#*=}";;
@@ -52,7 +55,9 @@ done
5255

5356
elf_file=$(realpath ${elf_file})
5457

55-
if [[ ${target} == *"ethos-u55"* ]]; then
58+
# cortex-m55 is the only Cortex-M CPU on the Corstone-300 board today;
59+
# cortex-m85 lives on Corstone-320, so it falls through to the SSE-320 FVP.
60+
if [[ ${target} == *"ethos-u55"* || ${target} == cortex-m55* ]]; then
5661
fvp_model=FVP_Corstone_SSE-300_Ethos-U55
5762
else
5863
fvp_model=FVP_Corstone_SSE-320
@@ -71,7 +76,12 @@ hash ${fvp_model} \
7176

7277

7378
[[ ! -f $elf_file ]] && { echo "[${BASH_SOURCE[0]}]: Unable to find executor_runner elf: ${elf_file}"; exit 1; }
74-
num_macs=$(echo ${target} | cut -d - -f 3)
79+
if [[ ${target} == cortex-m* ]]; then
80+
# Cortex-M CPU-only; the NPU is unused but the FVP still needs a value.
81+
num_macs=128
82+
else
83+
num_macs=$(echo ${target} | cut -d - -f 3)
84+
fi
7585

7686
echo "--------------------------------------------------------------------------------"
7787
echo "Running ${elf_file} for ${target} run with FVP:${fvp_model} num_macs:${num_macs} timeout:${timeout}"
@@ -97,7 +107,44 @@ if [[ -n "${trace_file}" ]]; then
97107
extra_args_u85+=(-C "mps4_board.subsystem.ethosu.extra_args=--pmu-trace ${trace_file}")
98108
fi
99109

100-
if [[ ${target} == *"ethos-u55"* ]]; then
110+
if [[ ${target} == cortex-m* ]]; then
111+
[[ -z "${bundle_file}" ]] \
112+
&& { echo "[${BASH_SOURCE[0]}] --bundle=<BPTE_FILE> is required for cortex-m targets"; exit 1; }
113+
bundle_file=$(realpath "${bundle_file}")
114+
bundle_dir=$(dirname "${bundle_file}")
115+
bundle_name=$(basename "${bundle_file}")
116+
# Bundled-IO runner needs -i to point at a real file even though
117+
# inputs come from the bundle.
118+
dd if=/dev/zero of="${bundle_dir}/fvp_dummy_input.bin" bs=4 count=1 2>/dev/null
119+
${nobuf} ${fvp_model} \
120+
-C ethosu.num_macs=${num_macs} \
121+
-C mps3_board.visualisation.disable-visualisation=1 \
122+
-C mps3_board.telnetterminal0.start_telnet=0 \
123+
-C mps3_board.uart0.out_file='-' \
124+
-C mps3_board.uart0.shutdown_on_eot=1 \
125+
-C cpu0.semihosting-enable=1 \
126+
-C cpu0.semihosting-stack_base=0 \
127+
-C cpu0.semihosting-heap_limit=0 \
128+
-C "cpu0.semihosting-cwd=${bundle_dir}" \
129+
-C "ethosu.extra_args=--fast" \
130+
-C "cpu0.semihosting-cmd_line=executor_runner -m ${bundle_name} -i fvp_dummy_input.bin -o out" \
131+
-a "${elf_file}" \
132+
--timelimit ${timeout} 2>&1 | sed 's/\r$//' | tee ${log_file} || true
133+
echo "[${BASH_SOURCE[0]}] Simulation complete, $?"
134+
if grep -q "Test_result: PASS" "${log_file}"; then
135+
echo "[${BASH_SOURCE[0]}] Bundled I/O check PASSED for ${bundle_name}"
136+
rm "${log_file}"
137+
exit 0
138+
elif grep -q "Test_result: FAIL" "${log_file}"; then
139+
echo "[${BASH_SOURCE[0]}] Bundled I/O check FAILED for ${bundle_name}"
140+
rm "${log_file}"
141+
exit 1
142+
else
143+
echo "[${BASH_SOURCE[0]}] No Test_result line found in FVP output for ${bundle_name}"
144+
rm "${log_file}"
145+
exit 1
146+
fi
147+
elif [[ ${target} == *"ethos-u55"* ]]; then
101148
${nobuf} ${fvp_model} \
102149
-C ethosu.num_macs=${num_macs} \
103150
-C mps3_board.visualisation.disable-visualisation=1 \

0 commit comments

Comments
 (0)