Add Cortex-M as a first-class target in aot_arm_compiler

Github Executorch · Github Executorch · commit e9deeb66d68e · 2026-03-02T12:49:38.000-08:00
Previously, Cortex-M op conversion was applied as an afterthought to all
non-vgf targets via transform_for_cortex_m_backend(). This made the flow
hard to follow, used a bare EdgeCompileConfig that decomposed ops like
linear into addmm (requiring unnecessary workarounds), and didn't use the
CortexMQuantizer or CortexMPassManager.

Add a dedicated to_edge_cortex_m() path selected via --target=cortex-m55+int8
that owns the full pipeline: CortexMQuantizer for INT8 quantization, an
EdgeCompileConfig with preserve_ops to prevent premature decomposition, and
CortexMPassManager.transform() for op conversion. Remove the old scattered
transform_for_cortex_m_backend() function.

Verified that all ops are fully lowered to cortex_m::quantized_* operators
for both MobileNetV2 (70 nodes) and MobileNetV3 (122 nodes). End-to-end (E2E)
inference was tested on an Alif E8 board.

Test Plan:
python3 -m examples.arm.aot_arm_compiler -m mv2 --target=cortex-m55+int8 --quantize --intermediates=./mv2_intermediates --output=./mv2_cortex_m.pte
python3 -m examples.arm.aot_arm_compiler -m mv3 --target=cortex-m55+int8 --quantize --intermediates=./mv3_intermediates --output=./mv3_cortex_m.pte

Also ran E2E inference on an Alif E8 board.
diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
@@ -71,7 +71,7 @@ def _get_batch_size_from_conv(self, conv_node: torch.fx.Node):
 
     def _get_linear_replacement(self, node):
         """
-         Let
+        Let
         - yi be the output activations (y1, ... yn)
         - xj be the input activations (x1, ... xm)
         - wij be the weights (w11, ... wnm)
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
@@ -37,10 +37,11 @@
 
 from executorch.backends.arm.vgf import VgfCompileSpec
 
-# To use Cortex-M backend
 from executorch.backends.cortex_m.passes.replace_quant_nodes_pass import (
     ReplaceQuantNodesPass,
 )
+from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
+from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer
 
 from executorch.devtools import generate_etrecord
 from executorch.devtools.backend_debug import get_delegation_info
@@ -396,6 +397,7 @@ def forward(self, x):
     "TOSA-1.0+INT",
     "TOSA-1.0+FP",
     "TOSA-1.0+INT+int16",
+    "cortex-m55+int8",
 ]
 
 
@@ -528,7 +530,7 @@ def get_args():
         required=False,
         default="ethos-u55-128",
         choices=TARGETS,
-        help=f"For ArmBackend delegated models, pick the target, and therefore the instruction set generated. valid targets are {TARGETS}",
+        help=f"Target backend. For delegated models: Ethos-U/VGF/TOSA variants. For non-delegated: cortex-m55+int8 (CMSIS-NN portable kernels). Valid targets: {TARGETS}",
     )
     parser.add_argument(
         "-e",
@@ -795,6 +797,75 @@ def to_edge_TOSA_delegate(
     return model_quant, edge
 
 
+def to_edge_cortex_m(
+    exported_program: ExportedProgram,
+    args,
+    model: GraphModule,
+    example_inputs: Tuple[torch.Tensor],
+):
+    """Cortex-M/CMSIS-NN compilation path with no delegation."""
+    logging.info("Using Cortex-M/CMSIS-NN compilation path (no delegation)")
+
+    def _to_channels_last(x):
+        if isinstance(x, torch.Tensor):
+            if x.dim() == 4 and not x.is_contiguous(memory_format=torch.channels_last):
+                logging.warning(
+                    "Converting input tensor with shape %s to channels_last",
+                    list(x.shape),
+                )
+                return x.to(memory_format=torch.channels_last)
+            return x
+        elif isinstance(x, tuple):
+            return tuple(_to_channels_last(t) for t in x)
+        return x
+
+    if not args.quantize:
+        logging.warning(
+            "Quantization is DISABLED. Cortex-M typically requires quantization."
+        )
+    else:
+        model = model.to(memory_format=torch.channels_last)
+        example_inputs = tuple(_to_channels_last(x) for x in example_inputs)
+
+        quantizer = CortexMQuantizer()
+        prepared = prepare_pt2e(model, quantizer)
+
+        dataset = get_calibration_data(
+            args.model_name, example_inputs, args.evaluate, args.evaluate_config
+        )
+
+        if isinstance(dataset, DataLoader):
+            for sample, _ in dataset:
+                prepared(_to_channels_last(sample))
+        else:
+            prepared(*tuple(_to_channels_last(x) for x in dataset))
+
+        model_quant = convert_pt2e(prepared)
+
+        exported_program = torch.export.export(
+            model_quant, example_inputs, strict=args.strict_export
+        )
+
+    edge = to_edge_transform_and_lower(
+        exported_program,
+        compile_config=EdgeCompileConfig(
+            preserve_ops=[
+                torch.ops.aten.linear.default,
+                torch.ops.aten.hardsigmoid.default,
+                torch.ops.aten.hardsigmoid_.default,
+                torch.ops.aten.hardswish.default,
+                torch.ops.aten.hardswish_.default,
+            ],
+            _check_ir_validity=False,
+        ),
+    )
+
+    pass_manager = CortexMPassManager(edge.exported_program())
+    edge._edge_programs["forward"] = pass_manager.transform()
+
+    return model_quant if args.quantize else None, edge
+
+
 def to_edge_no_delegate(
     exported_program: ExportedProgram,
     args,
@@ -873,7 +944,12 @@ def to_edge_no_delegate(
 
     # Quantize if required
     model_quant = None
-    if args.delegate:
+    if args.target == "cortex-m55+int8":
+        # Cortex-M path: CMSIS-NN portable kernels, no delegation
+        model_quant, edge = to_edge_cortex_m(
+            exported_program, args, model, example_inputs
+        )
+    elif args.delegate:
         model_quant, edge = to_edge_TOSA_delegate(
             exported_program, args, model, example_inputs
         )