Skip to content

Commit 787239e

Browse files
authored
Merge branch 'main' into android-combined-v2
2 parents 6ffdd62 + f3e49ff commit 787239e

11 files changed

Lines changed: 773 additions & 80 deletions

File tree

.ci/scripts/export_model_artifact.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,8 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
418418
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
419419
python -m executorch.examples.models.qwen3_5_moe.export \
420420
--prequantized "$LOCAL_MODEL_DIR" \
421-
--output-dir "${OUTPUT_DIR}"
421+
--output-dir "${OUTPUT_DIR}" \
422+
--moe-activation-dtype int8
422423
echo "::endgroup::"
423424

424425
test -f "${OUTPUT_DIR}/model.pte"

.ci/scripts/test_cortex_m_e2e.sh

Lines changed: 12 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -6,76 +6,20 @@
66
# This source code is licensed under the BSD-style license found in the
77
# LICENSE file in the root directory of this source tree.
88

9-
# End-to-end test for Cortex-M backend: export a model via aot_arm_compiler
10-
# with cortex-m55+int8 target, then run the .bpte on Corstone-300 FVP.
11-
#
12-
# Usage: bash .ci/scripts/test_cortex_m_e2e.sh <model_name>
13-
# Example: bash .ci/scripts/test_cortex_m_e2e.sh mv2
9+
# CI wrapper: export a model for the Cortex-M backend and run it on the
10+
# Corstone-300 FVP via examples/arm/run.sh. The real work (export, runner
11+
# build, FVP launch, Test_result: PASS/FAIL check) is done by run.sh and
12+
# the run_fvp.sh it invokes.
1413

15-
set -eux
14+
set -eu
1615

1716
MODEL=$1
18-
mkdir -p "./cortex_m_e2e/${MODEL}"
19-
WORK_DIR=$(realpath "./cortex_m_e2e/${MODEL}")
17+
script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
18+
et_root_dir=$(realpath "${script_dir}/../..")
2019

21-
echo "=== Exporting ${MODEL} with cortex-m55+int8 ==="
22-
python -m backends.arm.scripts.aot_arm_compiler \
23-
-m "${MODEL}" \
20+
# Quantization is the default for the cortex-m55+int8 target; run.sh's
21+
# arg parser only recognizes --no_quantize, so we omit any explicit flag.
22+
bash "${et_root_dir}/examples/arm/run.sh" \
23+
--model_name="${MODEL}" \
2424
--target=cortex-m55+int8 \
25-
--quantize \
26-
--bundleio \
27-
--intermediates="${WORK_DIR}/intermediates" \
28-
--output="${WORK_DIR}/${MODEL}.bpte"
29-
30-
BPTE="${WORK_DIR}/${MODEL}.bpte"
31-
test -f "${BPTE}" || { echo "FAIL: ${BPTE} not produced"; exit 1; }
32-
echo "=== Exported ${BPTE} ($(stat --printf='%s' "${BPTE}") bytes) ==="
33-
34-
ELF="arm_test/arm_semihosting_executor_runner_corstone-300/arm_executor_runner"
35-
test -f "${ELF}" || { echo "FAIL: executor runner not found at ${ELF}"; exit 1; }
36-
37-
LOG_FILE=$(mktemp)
38-
39-
# Create a tiny dummy input file — the runner requires -i but BundleIO
40-
# ignores it and uses the embedded test inputs instead.
41-
dd if=/dev/zero of="${WORK_DIR}/dummy.bin" bs=4 count=1 2>/dev/null
42-
43-
echo "=== Running ${MODEL} on Corstone-300 FVP ==="
44-
FVP_Corstone_SSE-300_Ethos-U55 \
45-
-C ethosu.num_macs=128 \
46-
-C mps3_board.visualisation.disable-visualisation=1 \
47-
-C mps3_board.telnetterminal0.start_telnet=0 \
48-
-C mps3_board.uart0.out_file='-' \
49-
-C mps3_board.uart0.shutdown_on_eot=1 \
50-
-C cpu0.semihosting-enable=1 \
51-
-C cpu0.semihosting-stack_base=0 \
52-
-C cpu0.semihosting-heap_limit=0 \
53-
-C "cpu0.semihosting-cwd=${WORK_DIR}" \
54-
-C "ethosu.extra_args='--fast'" \
55-
-C "cpu0.semihosting-cmd_line='executor_runner -m ${MODEL}.bpte -i dummy.bin -o out'" \
56-
-a "${ELF}" \
57-
--timelimit 300 2>&1 | tee "${LOG_FILE}" || true
58-
59-
echo "=== Checking FVP output ==="
60-
61-
if grep -q "Test_result: PASS" "${LOG_FILE}"; then
62-
echo "=== SUCCESS: ${MODEL} e2e BundleIO test PASSED on FVP ==="
63-
rm "${LOG_FILE}"
64-
exit 0
65-
fi
66-
67-
if grep -q "Test_result: FAIL" "${LOG_FILE}"; then
68-
echo "FAIL: ${MODEL} BundleIO output mismatch"
69-
rm "${LOG_FILE}"
70-
exit 1
71-
fi
72-
73-
if grep -qE "(^[EF][: ].*$)|(^.*Hard fault.*$)|(^.*Assertion.*$)" "${LOG_FILE}"; then
74-
echo "FAIL: ${MODEL} FVP run hit a fatal error"
75-
rm "${LOG_FILE}"
76-
exit 1
77-
fi
78-
79-
echo "FAIL: ${MODEL} no BundleIO test result found in FVP output"
80-
rm "${LOG_FILE}"
81-
exit 1
25+
--bundleio

.github/workflows/_test_cortex_m_e2e.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,5 @@ jobs:
4343
.ci/scripts/setup-arm-baremetal-tools.sh
4444
source examples/arm/arm-scratch/setup_path.sh
4545
46-
# Build cortex-m test runner with bundled IO support
47-
backends/cortex_m/test/build_test_runner.sh
48-
49-
# Export model and run on FVP
46+
# Export and run model on FVP (run.sh internally builds the test runner).
5047
bash .ci/scripts/test_cortex_m_e2e.sh ${{ matrix.model }}

backends/arm/scripts/run_fvp.sh

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ _setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly ins
1919

2020
elf_file=""
2121
data_file=""
22+
bundle_file=""
2223
target="ethos-u55-128"
2324
timeout="600"
2425
etrecord_file=""
@@ -29,6 +30,7 @@ help() {
2930
echo "Options:"
3031
echo " --elf=<ELF_FILE> elf file to run"
3132
echo "  --data=<FILE>@<ADDRESS>              Place a file in memory at this address, useful to emulate a PTE flashed into memory instead of as part of the code."
33+
echo " --bundle=<BPTE_FILE> Bundled program (.bpte) to load via semihosting. Required for cortex-m targets; the FVP launches a semihosting executor_runner that reads the bundle from the host filesystem and checks the embedded reference outputs."
3234
echo " --target=<TARGET> Target to build and run for Default: ${target}"
3335
echo "  --timeout=<TIME_IN_SEC>               Maximum target runtime, used to detect hanging, might need to be higher on large models Default: ${timeout}"
3436
echo " --etrecord=<FILE> If ETDump is used you can supply a ETRecord file matching the PTE"
@@ -41,6 +43,7 @@ for arg in "$@"; do
4143
-h|--help) help ;;
4244
--elf=*) elf_file="${arg#*=}";;
4345
--data=*) data_file="--data ${arg#*=}";;
46+
--bundle=*) bundle_file="${arg#*=}";;
4447
--target=*) target="${arg#*=}";;
4548
--timeout=*) timeout="${arg#*=}";;
4649
--etrecord=*) etrecord_file="${arg#*=}";;
@@ -52,7 +55,9 @@ done
5255

5356
elf_file=$(realpath ${elf_file})
5457

55-
if [[ ${target} == *"ethos-u55"* ]]; then
58+
# cortex-m55 is the only Cortex-M CPU on the Corstone-300 board today;
59+
# cortex-m85 lives on Corstone-320, so it falls through to the SSE-320 FVP.
60+
if [[ ${target} == *"ethos-u55"* || ${target} == cortex-m55* ]]; then
5661
fvp_model=FVP_Corstone_SSE-300_Ethos-U55
5762
else
5863
fvp_model=FVP_Corstone_SSE-320
@@ -71,7 +76,12 @@ hash ${fvp_model} \
7176

7277

7378
[[ ! -f $elf_file ]] && { echo "[${BASH_SOURCE[0]}]: Unable to find executor_runner elf: ${elf_file}"; exit 1; }
74-
num_macs=$(echo ${target} | cut -d - -f 3)
79+
if [[ ${target} == cortex-m* ]]; then
80+
# Cortex-M CPU-only; the NPU is unused but the FVP still needs a value.
81+
num_macs=128
82+
else
83+
num_macs=$(echo ${target} | cut -d - -f 3)
84+
fi
7585

7686
echo "--------------------------------------------------------------------------------"
7787
echo "Running ${elf_file} for ${target} run with FVP:${fvp_model} num_macs:${num_macs} timeout:${timeout}"
@@ -97,7 +107,44 @@ if [[ -n "${trace_file}" ]]; then
97107
extra_args_u85+=(-C "mps4_board.subsystem.ethosu.extra_args=--pmu-trace ${trace_file}")
98108
fi
99109

100-
if [[ ${target} == *"ethos-u55"* ]]; then
110+
if [[ ${target} == cortex-m* ]]; then
111+
[[ -z "${bundle_file}" ]] \
112+
&& { echo "[${BASH_SOURCE[0]}] --bundle=<BPTE_FILE> is required for cortex-m targets"; exit 1; }
113+
bundle_file=$(realpath "${bundle_file}")
114+
bundle_dir=$(dirname "${bundle_file}")
115+
bundle_name=$(basename "${bundle_file}")
116+
# Bundled-IO runner needs -i to point at a real file even though
117+
# inputs come from the bundle.
118+
dd if=/dev/zero of="${bundle_dir}/fvp_dummy_input.bin" bs=4 count=1 2>/dev/null
119+
${nobuf} ${fvp_model} \
120+
-C ethosu.num_macs=${num_macs} \
121+
-C mps3_board.visualisation.disable-visualisation=1 \
122+
-C mps3_board.telnetterminal0.start_telnet=0 \
123+
-C mps3_board.uart0.out_file='-' \
124+
-C mps3_board.uart0.shutdown_on_eot=1 \
125+
-C cpu0.semihosting-enable=1 \
126+
-C cpu0.semihosting-stack_base=0 \
127+
-C cpu0.semihosting-heap_limit=0 \
128+
-C "cpu0.semihosting-cwd=${bundle_dir}" \
129+
-C "ethosu.extra_args=--fast" \
130+
-C "cpu0.semihosting-cmd_line=executor_runner -m ${bundle_name} -i fvp_dummy_input.bin -o out" \
131+
-a "${elf_file}" \
132+
--timelimit ${timeout} 2>&1 | sed 's/\r$//' | tee ${log_file} || true
133+
echo "[${BASH_SOURCE[0]}] Simulation complete, $?"
134+
if grep -q "Test_result: PASS" "${log_file}"; then
135+
echo "[${BASH_SOURCE[0]}] Bundled I/O check PASSED for ${bundle_name}"
136+
rm "${log_file}"
137+
exit 0
138+
elif grep -q "Test_result: FAIL" "${log_file}"; then
139+
echo "[${BASH_SOURCE[0]}] Bundled I/O check FAILED for ${bundle_name}"
140+
rm "${log_file}"
141+
exit 1
142+
else
143+
echo "[${BASH_SOURCE[0]}] No Test_result line found in FVP output for ${bundle_name}"
144+
rm "${log_file}"
145+
exit 1
146+
fi
147+
elif [[ ${target} == *"ethos-u55"* ]]; then
101148
${nobuf} ${fvp_model} \
102149
-C ethosu.num_macs=${num_macs} \
103150
-C mps3_board.visualisation.disable-visualisation=1 \

backends/cortex_m/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ backends/cortex_m/test/build_test_runner.sh # Buil
2020
pytest --config-file=backends/arm/test/pytest.ini backends/cortex_m/test # Run tests with correct configuration file
2121
```
2222

23+
For an end-to-end bundled-IO FVP run of a single model (export → build → FVP → `Test_result: PASS`), use `examples/arm/run.sh`:
24+
```
25+
examples/arm/run.sh --model_name=<model> --target=cortex-m55+int8 --bundleio
26+
```
27+
This drives `aot_arm_compiler --bundleio`, invokes `build_test_runner.sh`, and launches the Corstone-300 FVP via `backends/arm/scripts/run_fvp.sh`.
28+
2329
## Supported operators
2430
Refer to `backends/cortex_m/test/ops` for currently supported accelerated ops/dtypes. Additionally, the quantizer targets pure "data-movement ops" such as data copies, slicing and concatenations to use quantized dtypes using the portable-kernels operator library.
2531
In general however, operators not supported by Cortex-M are kept in `fp32` using non-accelerated portable-kernels. It is recommended to analyze the graph after lowering to understand how much of the graph has been accelerated.

backends/cuda/tests/test_fused_moe.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from executorch.backends.cuda.triton.kernels.fused_moe import (
3232
fused_moe as triton_fused_moe,
3333
fused_moe_batched as triton_fused_moe_batched,
34+
fused_moe_batched_gemm_int8 as triton_fused_moe_batched_int8,
3435
moe_align_block_size,
3536
)
3637
from executorch.exir import (
@@ -212,6 +213,11 @@ def _run_cpp_runner(runner_path, pte_path, ptd_path, input_files, output_base):
212213

213214

214215
class TestFusedMoE(unittest.TestCase):
216+
# TODO: migrate from manual max_abs/max_ref relative checks to
217+
# torch.allclose(atol=, rtol=). Current tests use per-tensor-max relative
218+
# error which is looser than per-element allclose — need to calibrate atol
219+
# for INT4 quantization noise floor across random weight magnitudes.
220+
215221
def setUp(self):
216222
if not torch.cuda.is_available():
217223
self.skipTest("CUDA is not available")
@@ -487,6 +493,152 @@ def test_e2e_cpp_runner(self):
487493
)
488494

489495

496+
class TestFusedMoEBatchedInt8(unittest.TestCase):
497+
"""Correctness tests for the INT8 dynamic-activation batched MoE kernel."""
498+
499+
INT8_TEST_CONFIGS = [
500+
(42, 8, 64, 32, 4, 2, 32, "8tok_small"),
501+
(7, 16, 64, 32, 8, 4, 32, "16tok_8exp_top4"),
502+
(13, 32, 128, 64, 8, 2, 64, "32tok_gs64"),
503+
(55, 64, 64, 32, 4, 2, 32, "64tok"),
504+
(99, 128, 128, 64, 8, 2, 32, "128tok"),
505+
(0, 256, 128, 64, 8, 2, 32, "256tok"),
506+
]
507+
508+
def test_int8_correctness(self):
509+
"""INT8 batched kernel matches reference across M values."""
510+
for (
511+
seed,
512+
M,
513+
hidden,
514+
intermediate,
515+
num_experts,
516+
top_k,
517+
gs,
518+
desc,
519+
) in self.INT8_TEST_CONFIGS:
520+
with self.subTest(desc=desc):
521+
torch.manual_seed(seed)
522+
x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
523+
w1_weight = torch.randn(
524+
num_experts,
525+
2 * intermediate,
526+
hidden,
527+
dtype=torch.bfloat16,
528+
device="cuda",
529+
)
530+
w2_weight = torch.randn(
531+
num_experts,
532+
hidden,
533+
intermediate,
534+
dtype=torch.bfloat16,
535+
device="cuda",
536+
)
537+
w1, w1s = _quantize_weights_int4(w1_weight.cpu(), gs)
538+
w2, w2s = _quantize_weights_int4(w2_weight.cpu(), gs)
539+
w1, w1s, w2, w2s = w1.cuda(), w1s.cuda(), w2.cuda(), w2s.cuda()
540+
541+
scores = torch.randn(M, num_experts, device="cuda")
542+
topk_weights, topk_ids = torch.topk(scores, top_k, dim=-1)
543+
topk_weights = topk_weights.softmax(dim=-1).float()
544+
545+
out_int8 = triton_fused_moe_batched_int8(
546+
x,
547+
w1,
548+
w1s,
549+
w2,
550+
w2s,
551+
topk_weights,
552+
topk_ids,
553+
top_k,
554+
num_experts,
555+
gs,
556+
)
557+
558+
w1_dq = _dequantize_int4(w1.cpu(), w1s.cpu(), gs).cuda()
559+
w2_dq = _dequantize_int4(w2.cpu(), w2s.cpu(), gs).cuda()
560+
ref = _reference_moe(x, w1_dq, w2_dq, topk_weights, topk_ids, top_k)
561+
562+
diff = (out_int8.float() - ref.float()).abs().max().item()
563+
rel = diff / (ref.float().abs().max().item() + 1e-10)
564+
self.assertLess(
565+
rel,
566+
0.10,
567+
f"{desc}: relative diff {rel:.4f} (abs {diff:.6f})",
568+
)
569+
570+
def test_int8_matches_bf16_batched(self):
571+
"""INT8 batched output is close to BF16 batched output."""
572+
for (
573+
seed,
574+
M,
575+
hidden,
576+
intermediate,
577+
num_experts,
578+
top_k,
579+
gs,
580+
desc,
581+
) in self.INT8_TEST_CONFIGS:
582+
with self.subTest(desc=desc):
583+
torch.manual_seed(seed)
584+
x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
585+
w1_weight = torch.randn(
586+
num_experts,
587+
2 * intermediate,
588+
hidden,
589+
dtype=torch.bfloat16,
590+
device="cuda",
591+
)
592+
w2_weight = torch.randn(
593+
num_experts,
594+
hidden,
595+
intermediate,
596+
dtype=torch.bfloat16,
597+
device="cuda",
598+
)
599+
w1, w1s = _quantize_weights_int4(w1_weight.cpu(), gs)
600+
w2, w2s = _quantize_weights_int4(w2_weight.cpu(), gs)
601+
w1, w1s, w2, w2s = w1.cuda(), w1s.cuda(), w2.cuda(), w2s.cuda()
602+
603+
scores = torch.randn(M, num_experts, device="cuda")
604+
topk_weights, topk_ids = torch.topk(scores, top_k, dim=-1)
605+
topk_weights = topk_weights.softmax(dim=-1).float()
606+
607+
out_bf16 = triton_fused_moe_batched(
608+
x,
609+
w1,
610+
w1s,
611+
w2,
612+
w2s,
613+
topk_weights,
614+
topk_ids,
615+
top_k,
616+
num_experts,
617+
gs,
618+
)
619+
620+
out_int8 = triton_fused_moe_batched_int8(
621+
x,
622+
w1,
623+
w1s,
624+
w2,
625+
w2s,
626+
topk_weights,
627+
topk_ids,
628+
top_k,
629+
num_experts,
630+
gs,
631+
)
632+
633+
diff = (out_int8.float() - out_bf16.float()).abs().max().item()
634+
rel = diff / (out_bf16.float().abs().max().item() + 1e-10)
635+
self.assertLess(
636+
rel,
637+
0.15,
638+
f"{desc}: int8 vs bf16 relative diff {rel:.4f} (abs {diff:.6f})",
639+
)
640+
641+
490642
class TestMoeAlignBlockSize(unittest.TestCase):
491643
def setUp(self):
492644
if not torch.cuda.is_available():

0 commit comments

Comments
 (0)