Skip to content

Commit 787239e

Browse files
authored
Merge branch 'main' into android-combined-v2
2 parents 6ffdd62 + f3e49ff commit 787239e

11 files changed

Lines changed: 773 additions & 80 deletions

File tree

.ci/scripts/export_model_artifact.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,8 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
418418
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
419419
python -m executorch.examples.models.qwen3_5_moe.export \
420420
--prequantized "$LOCAL_MODEL_DIR" \
421-
--output-dir "${OUTPUT_DIR}"
421+
--output-dir "${OUTPUT_DIR}" \
422+
--moe-activation-dtype int8
422423
echo "::endgroup::"
423424

424425
test -f "${OUTPUT_DIR}/model.pte"

.ci/scripts/test_cortex_m_e2e.sh

Lines changed: 12 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -6,76 +6,20 @@
66
# This source code is licensed under the BSD-style license found in the
77
# LICENSE file in the root directory of this source tree.
88

9-
# End-to-end test for Cortex-M backend: export a model via aot_arm_compiler
10-
# with cortex-m55+int8 target, then run the .bpte on Corstone-300 FVP.
11-
#
12-
# Usage: bash .ci/scripts/test_cortex_m_e2e.sh <model_name>
13-
# Example: bash .ci/scripts/test_cortex_m_e2e.sh mv2
9+
# CI wrapper: export a model for the Cortex-M backend and run it on the
10+
# Corstone-300 FVP via examples/arm/run.sh. The real work (export, runner
11+
# build, FVP launch, Test_result: PASS/FAIL check) is done by run.sh and
12+
# the run_fvp.sh it invokes.
1413

15-
set -eux
14+
set -eu
1615

1716
MODEL=$1
18-
mkdir -p "./cortex_m_e2e/${MODEL}"
19-
WORK_DIR=$(realpath "./cortex_m_e2e/${MODEL}")
17+
script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
18+
et_root_dir=$(realpath "${script_dir}/../..")
2019

21-
echo "=== Exporting ${MODEL} with cortex-m55+int8 ==="
22-
python -m backends.arm.scripts.aot_arm_compiler \
23-
-m "${MODEL}" \
20+
# Quantization is the default for the cortex-m55+int8 target; run.sh's
21+
# arg parser only recognizes --no_quantize, so we omit any explicit flag.
22+
bash "${et_root_dir}/examples/arm/run.sh" \
23+
--model_name="${MODEL}" \
2424
--target=cortex-m55+int8 \
25-
--quantize \
26-
--bundleio \
27-
--intermediates="${WORK_DIR}/intermediates" \
28-
--output="${WORK_DIR}/${MODEL}.bpte"
29-
30-
BPTE="${WORK_DIR}/${MODEL}.bpte"
31-
test -f "${BPTE}" || { echo "FAIL: ${BPTE} not produced"; exit 1; }
32-
echo "=== Exported ${BPTE} ($(stat --printf='%s' "${BPTE}") bytes) ==="
33-
34-
ELF="arm_test/arm_semihosting_executor_runner_corstone-300/arm_executor_runner"
35-
test -f "${ELF}" || { echo "FAIL: executor runner not found at ${ELF}"; exit 1; }
36-
37-
LOG_FILE=$(mktemp)
38-
39-
# Create a tiny dummy input file — the runner requires -i but BundleIO
40-
# ignores it and uses the embedded test inputs instead.
41-
dd if=/dev/zero of="${WORK_DIR}/dummy.bin" bs=4 count=1 2>/dev/null
42-
43-
echo "=== Running ${MODEL} on Corstone-300 FVP ==="
44-
FVP_Corstone_SSE-300_Ethos-U55 \
45-
-C ethosu.num_macs=128 \
46-
-C mps3_board.visualisation.disable-visualisation=1 \
47-
-C mps3_board.telnetterminal0.start_telnet=0 \
48-
-C mps3_board.uart0.out_file='-' \
49-
-C mps3_board.uart0.shutdown_on_eot=1 \
50-
-C cpu0.semihosting-enable=1 \
51-
-C cpu0.semihosting-stack_base=0 \
52-
-C cpu0.semihosting-heap_limit=0 \
53-
-C "cpu0.semihosting-cwd=${WORK_DIR}" \
54-
-C "ethosu.extra_args='--fast'" \
55-
-C "cpu0.semihosting-cmd_line='executor_runner -m ${MODEL}.bpte -i dummy.bin -o out'" \
56-
-a "${ELF}" \
57-
--timelimit 300 2>&1 | tee "${LOG_FILE}" || true
58-
59-
echo "=== Checking FVP output ==="
60-
61-
if grep -q "Test_result: PASS" "${LOG_FILE}"; then
62-
echo "=== SUCCESS: ${MODEL} e2e BundleIO test PASSED on FVP ==="
63-
rm "${LOG_FILE}"
64-
exit 0
65-
fi
66-
67-
if grep -q "Test_result: FAIL" "${LOG_FILE}"; then
68-
echo "FAIL: ${MODEL} BundleIO output mismatch"
69-
rm "${LOG_FILE}"
70-
exit 1
71-
fi
72-
73-
if grep -qE "(^[EF][: ].*$)|(^.*Hard fault.*$)|(^.*Assertion.*$)" "${LOG_FILE}"; then
74-
echo "FAIL: ${MODEL} FVP run hit a fatal error"
75-
rm "${LOG_FILE}"
76-
exit 1
77-
fi
78-
79-
echo "FAIL: ${MODEL} no BundleIO test result found in FVP output"
80-
rm "${LOG_FILE}"
81-
exit 1
25+
--bundleio

.github/workflows/_test_cortex_m_e2e.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,5 @@ jobs:
4343
.ci/scripts/setup-arm-baremetal-tools.sh
4444
source examples/arm/arm-scratch/setup_path.sh
4545
46-
# Build cortex-m test runner with bundled IO support
47-
backends/cortex_m/test/build_test_runner.sh
48-
49-
# Export model and run on FVP
46+
# Export and run model on FVP (run.sh internally builds the test runner).
5047
bash .ci/scripts/test_cortex_m_e2e.sh ${{ matrix.model }}

backends/arm/scripts/run_fvp.sh

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ _setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly ins
1919

2020
elf_file=""
2121
data_file=""
22+
bundle_file=""
2223
target="ethos-u55-128"
2324
timeout="600"
2425
etrecord_file=""
@@ -29,6 +30,7 @@ help() {
2930
echo "Options:"
3031
echo " --elf=<ELF_FILE> elf file to run"
3132
echo "  --data=<FILE>@<ADDRESS>              Place a file in memory at this address, useful to emulate a PTE flashed into memory instead of as part of the code."
33+
echo " --bundle=<BPTE_FILE> Bundled program (.bpte) to load via semihosting. Required for cortex-m targets; the FVP launches a semihosting executor_runner that reads the bundle from the host filesystem and checks the embedded reference outputs."
3234
echo " --target=<TARGET> Target to build and run for Default: ${target}"
3335
echo "  --timeout=<TIME_IN_SEC>               Maximum target runtime, used to detect hanging, might need to be higher on large models Default: ${timeout}"
3436
echo " --etrecord=<FILE> If ETDump is used you can supply a ETRecord file matching the PTE"
@@ -41,6 +43,7 @@ for arg in "$@"; do
4143
-h|--help) help ;;
4244
--elf=*) elf_file="${arg#*=}";;
4345
--data=*) data_file="--data ${arg#*=}";;
46+
--bundle=*) bundle_file="${arg#*=}";;
4447
--target=*) target="${arg#*=}";;
4548
--timeout=*) timeout="${arg#*=}";;
4649
--etrecord=*) etrecord_file="${arg#*=}";;
@@ -52,7 +55,9 @@ done
5255

5356
elf_file=$(realpath ${elf_file})
5457

55-
if [[ ${target} == *"ethos-u55"* ]]; then
58+
# cortex-m55 is the only Cortex-M CPU on the Corstone-300 board today;
59+
# cortex-m85 lives on Corstone-320, so it falls through to the SSE-320 FVP.
60+
if [[ ${target} == *"ethos-u55"* || ${target} == cortex-m55* ]]; then
5661
fvp_model=FVP_Corstone_SSE-300_Ethos-U55
5762
else
5863
fvp_model=FVP_Corstone_SSE-320
@@ -71,7 +76,12 @@ hash ${fvp_model} \
7176

7277

7378
[[ ! -f $elf_file ]] && { echo "[${BASH_SOURCE[0]}]: Unable to find executor_runner elf: ${elf_file}"; exit 1; }
74-
num_macs=$(echo ${target} | cut -d - -f 3)
79+
if [[ ${target} == cortex-m* ]]; then
80+
# Cortex-M CPU-only; the NPU is unused but the FVP still needs a value.
81+
num_macs=128
82+
else
83+
num_macs=$(echo ${target} | cut -d - -f 3)
84+
fi
7585

7686
echo "--------------------------------------------------------------------------------"
7787
echo "Running ${elf_file} for ${target} run with FVP:${fvp_model} num_macs:${num_macs} timeout:${timeout}"
@@ -97,7 +107,44 @@ if [[ -n "${trace_file}" ]]; then
97107
extra_args_u85+=(-C "mps4_board.subsystem.ethosu.extra_args=--pmu-trace ${trace_file}")
98108
fi
99109

100-
if [[ ${target} == *"ethos-u55"* ]]; then
110+
if [[ ${target} == cortex-m* ]]; then
111+
[[ -z "${bundle_file}" ]] \
112+
&& { echo "[${BASH_SOURCE[0]}] --bundle=<BPTE_FILE> is required for cortex-m targets"; exit 1; }
113+
bundle_file=$(realpath "${bundle_file}")
114+
bundle_dir=$(dirname "${bundle_file}")
115+
bundle_name=$(basename "${bundle_file}")
116+
# Bundled-IO runner needs -i to point at a real file even though
117+
# inputs come from the bundle.
118+
dd if=/dev/zero of="${bundle_dir}/fvp_dummy_input.bin" bs=4 count=1 2>/dev/null
119+
${nobuf} ${fvp_model} \
120+
-C ethosu.num_macs=${num_macs} \
121+
-C mps3_board.visualisation.disable-visualisation=1 \
122+
-C mps3_board.telnetterminal0.start_telnet=0 \
123+
-C mps3_board.uart0.out_file='-' \
124+
-C mps3_board.uart0.shutdown_on_eot=1 \
125+
-C cpu0.semihosting-enable=1 \
126+
-C cpu0.semihosting-stack_base=0 \
127+
-C cpu0.semihosting-heap_limit=0 \
128+
-C "cpu0.semihosting-cwd=${bundle_dir}" \
129+
-C "ethosu.extra_args=--fast" \
130+
-C "cpu0.semihosting-cmd_line=executor_runner -m ${bundle_name} -i fvp_dummy_input.bin -o out" \
131+
-a "${elf_file}" \
132+
--timelimit ${timeout} 2>&1 | sed 's/\r$//' | tee ${log_file} || true
133+
echo "[${BASH_SOURCE[0]}] Simulation complete, $?"
134+
if grep -q "Test_result: PASS" "${log_file}"; then
135+
echo "[${BASH_SOURCE[0]}] Bundled I/O check PASSED for ${bundle_name}"
136+
rm "${log_file}"
137+
exit 0
138+
elif grep -q "Test_result: FAIL" "${log_file}"; then
139+
echo "[${BASH_SOURCE[0]}] Bundled I/O check FAILED for ${bundle_name}"
140+
rm "${log_file}"
141+
exit 1
142+
else
143+
echo "[${BASH_SOURCE[0]}] No Test_result line found in FVP output for ${bundle_name}"
144+
rm "${log_file}"
145+
exit 1
146+
fi
147+
elif [[ ${target} == *"ethos-u55"* ]]; then
101148
${nobuf} ${fvp_model} \
102149
-C ethosu.num_macs=${num_macs} \
103150
-C mps3_board.visualisation.disable-visualisation=1 \

backends/cortex_m/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ backends/cortex_m/test/build_test_runner.sh # Buil
2020
pytest --config-file=backends/arm/test/pytest.ini backends/cortex_m/test # Run tests with correct configuration file
2121
```
2222

23+
For an end-to-end bundled-IO FVP run of a single model (export → build → FVP → `Test_result: PASS`), use `examples/arm/run.sh`:
24+
```
25+
examples/arm/run.sh --model_name=<model> --target=cortex-m55+int8 --bundleio
26+
```
27+
This drives `aot_arm_compiler --bundleio`, invokes `build_test_runner.sh`, and launches the Corstone-300 FVP via `backends/arm/scripts/run_fvp.sh`.
28+
2329
## Supported operators
2430
Refer to `backends/cortex_m/test/ops` for currently supported accelerated ops/dtypes. Additionally, the quantizer targets pure "data-movement ops" such as data copies, slicing and concatenations to use quantized dtypes using the portable-kernels operator library.
2531
In general however, operators not supported by Cortex-M are kept in `fp32` using non-accelerated portable-kernels. It is recommended to analyze the graph after lowering to understand how much of the graph has been accelerated.

backends/cuda/tests/test_fused_moe.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from executorch.backends.cuda.triton.kernels.fused_moe import (
3232
fused_moe as triton_fused_moe,
3333
fused_moe_batched as triton_fused_moe_batched,
34+
fused_moe_batched_gemm_int8 as triton_fused_moe_batched_int8,
3435
moe_align_block_size,
3536
)
3637
from executorch.exir import (
@@ -212,6 +213,11 @@ def _run_cpp_runner(runner_path, pte_path, ptd_path, input_files, output_base):
212213

213214

214215
class TestFusedMoE(unittest.TestCase):
216+
# TODO: migrate from manual max_abs/max_ref relative checks to
217+
# torch.allclose(atol=, rtol=). Current tests use per-tensor-max relative
218+
# error which is looser than per-element allclose — need to calibrate atol
219+
# for INT4 quantization noise floor across random weight magnitudes.
220+
215221
def setUp(self):
216222
if not torch.cuda.is_available():
217223
self.skipTest("CUDA is not available")
@@ -487,6 +493,152 @@ def test_e2e_cpp_runner(self):
487493
)
488494

489495

496+
class TestFusedMoEBatchedInt8(unittest.TestCase):
497+
"""Correctness tests for the INT8 dynamic-activation batched MoE kernel."""
498+
499+
INT8_TEST_CONFIGS = [
500+
(42, 8, 64, 32, 4, 2, 32, "8tok_small"),
501+
(7, 16, 64, 32, 8, 4, 32, "16tok_8exp_top4"),
502+
(13, 32, 128, 64, 8, 2, 64, "32tok_gs64"),
503+
(55, 64, 64, 32, 4, 2, 32, "64tok"),
504+
(99, 128, 128, 64, 8, 2, 32, "128tok"),
505+
(0, 256, 128, 64, 8, 2, 32, "256tok"),
506+
]
507+
508+
def test_int8_correctness(self):
509+
"""INT8 batched kernel matches reference across M values."""
510+
for (
511+
seed,
512+
M,
513+
hidden,
514+
intermediate,
515+
num_experts,
516+
top_k,
517+
gs,
518+
desc,
519+
) in self.INT8_TEST_CONFIGS:
520+
with self.subTest(desc=desc):
521+
torch.manual_seed(seed)
522+
x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
523+
w1_weight = torch.randn(
524+
num_experts,
525+
2 * intermediate,
526+
hidden,
527+
dtype=torch.bfloat16,
528+
device="cuda",
529+
)
530+
w2_weight = torch.randn(
531+
num_experts,
532+
hidden,
533+
intermediate,
534+
dtype=torch.bfloat16,
535+
device="cuda",
536+
)
537+
w1, w1s = _quantize_weights_int4(w1_weight.cpu(), gs)
538+
w2, w2s = _quantize_weights_int4(w2_weight.cpu(), gs)
539+
w1, w1s, w2, w2s = w1.cuda(), w1s.cuda(), w2.cuda(), w2s.cuda()
540+
541+
scores = torch.randn(M, num_experts, device="cuda")
542+
topk_weights, topk_ids = torch.topk(scores, top_k, dim=-1)
543+
topk_weights = topk_weights.softmax(dim=-1).float()
544+
545+
out_int8 = triton_fused_moe_batched_int8(
546+
x,
547+
w1,
548+
w1s,
549+
w2,
550+
w2s,
551+
topk_weights,
552+
topk_ids,
553+
top_k,
554+
num_experts,
555+
gs,
556+
)
557+
558+
w1_dq = _dequantize_int4(w1.cpu(), w1s.cpu(), gs).cuda()
559+
w2_dq = _dequantize_int4(w2.cpu(), w2s.cpu(), gs).cuda()
560+
ref = _reference_moe(x, w1_dq, w2_dq, topk_weights, topk_ids, top_k)
561+
562+
diff = (out_int8.float() - ref.float()).abs().max().item()
563+
rel = diff / (ref.float().abs().max().item() + 1e-10)
564+
self.assertLess(
565+
rel,
566+
0.10,
567+
f"{desc}: relative diff {rel:.4f} (abs {diff:.6f})",
568+
)
569+
570+
def test_int8_matches_bf16_batched(self):
571+
"""INT8 batched output is close to BF16 batched output."""
572+
for (
573+
seed,
574+
M,
575+
hidden,
576+
intermediate,
577+
num_experts,
578+
top_k,
579+
gs,
580+
desc,
581+
) in self.INT8_TEST_CONFIGS:
582+
with self.subTest(desc=desc):
583+
torch.manual_seed(seed)
584+
x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
585+
w1_weight = torch.randn(
586+
num_experts,
587+
2 * intermediate,
588+
hidden,
589+
dtype=torch.bfloat16,
590+
device="cuda",
591+
)
592+
w2_weight = torch.randn(
593+
num_experts,
594+
hidden,
595+
intermediate,
596+
dtype=torch.bfloat16,
597+
device="cuda",
598+
)
599+
w1, w1s = _quantize_weights_int4(w1_weight.cpu(), gs)
600+
w2, w2s = _quantize_weights_int4(w2_weight.cpu(), gs)
601+
w1, w1s, w2, w2s = w1.cuda(), w1s.cuda(), w2.cuda(), w2s.cuda()
602+
603+
scores = torch.randn(M, num_experts, device="cuda")
604+
topk_weights, topk_ids = torch.topk(scores, top_k, dim=-1)
605+
topk_weights = topk_weights.softmax(dim=-1).float()
606+
607+
out_bf16 = triton_fused_moe_batched(
608+
x,
609+
w1,
610+
w1s,
611+
w2,
612+
w2s,
613+
topk_weights,
614+
topk_ids,
615+
top_k,
616+
num_experts,
617+
gs,
618+
)
619+
620+
out_int8 = triton_fused_moe_batched_int8(
621+
x,
622+
w1,
623+
w1s,
624+
w2,
625+
w2s,
626+
topk_weights,
627+
topk_ids,
628+
top_k,
629+
num_experts,
630+
gs,
631+
)
632+
633+
diff = (out_int8.float() - out_bf16.float()).abs().max().item()
634+
rel = diff / (out_bf16.float().abs().max().item() + 1e-10)
635+
self.assertLess(
636+
rel,
637+
0.15,
638+
f"{desc}: int8 vs bf16 relative diff {rel:.4f} (abs {diff:.6f})",
639+
)
640+
641+
490642
class TestMoeAlignBlockSize(unittest.TestCase):
491643
def setUp(self):
492644
if not torch.cuda.is_available():

0 commit comments

Comments
 (0)