Skip to content

Commit e9deeb6

Browse files
author
Github Executorch
committed
Add Cortex-M as a first-class target in aot_arm_compiler
Previously, Cortex-M op conversion was applied as an afterthought to all non-VGF targets via transform_for_cortex_m_backend(). This made the flow hard to follow, used a bare EdgeCompileConfig that decomposed ops such as linear into addmm (requiring unnecessary workarounds), and did not use the CortexMQuantizer or the CortexMPassManager.

Add a dedicated to_edge_cortex_m() path, selected via --target=cortex-m55+int8, that owns the full pipeline: CortexMQuantizer for INT8 quantization, an EdgeCompileConfig with preserve_ops to prevent premature decomposition, and CortexMPassManager.transform() for op conversion. Remove the old, scattered transform_for_cortex_m_backend() function.

Verified that all ops are fully lowered to cortex_m::quantized_* operators for both MobileNetV2 (70 nodes) and MobileNetV3 (122 nodes). E2E inference was tested on an Alif E8 board.

Test Plan:
python3 -m examples.arm.aot_arm_compiler -m mv2 --target=cortex-m55+int8 --quantize --intermediates=./mv2_intermediates --output=./mv2_cortex_m.pte
python3 -m examples.arm.aot_arm_compiler -m mv3 --target=cortex-m55+int8 --quantize --intermediates=./mv3_intermediates --output=./mv3_cortex_m.pte
Also ran E2E inference on an Alif E8 board.
1 parent 0c2ff55 commit e9deeb6

2 files changed

Lines changed: 80 additions & 4 deletions

File tree

backends/cortex_m/passes/convert_to_cortex_m_pass.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def _get_batch_size_from_conv(self, conv_node: torch.fx.Node):
7171

7272
def _get_linear_replacement(self, node):
7373
"""
74-
Let
74+
Let
7575
- yi be the output activations (y1, ... yn)
7676
- xj be the input activations (x1, ... xm)
7777
- wij be the weights (w11, ... wnm)

examples/arm/aot_arm_compiler.py

Lines changed: 79 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,11 @@
3737

3838
from executorch.backends.arm.vgf import VgfCompileSpec
3939

40-
# To use Cortex-M backend
4140
from executorch.backends.cortex_m.passes.replace_quant_nodes_pass import (
4241
ReplaceQuantNodesPass,
4342
)
43+
from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
44+
from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer
4445

4546
from executorch.devtools import generate_etrecord
4647
from executorch.devtools.backend_debug import get_delegation_info
@@ -396,6 +397,7 @@ def forward(self, x):
396397
"TOSA-1.0+INT",
397398
"TOSA-1.0+FP",
398399
"TOSA-1.0+INT+int16",
400+
"cortex-m55+int8",
399401
]
400402

401403

@@ -528,7 +530,7 @@ def get_args():
528530
required=False,
529531
default="ethos-u55-128",
530532
choices=TARGETS,
531-
help=f"For ArmBackend delegated models, pick the target, and therefore the instruction set generated. valid targets are {TARGETS}",
533+
help=f"Target backend. For delegated models: Ethos-U/VGF/TOSA variants. For non-delegated: cortex-m55+int8 (CMSIS-NN portable kernels). Valid targets: {TARGETS}",
532534
)
533535
parser.add_argument(
534536
"-e",
@@ -795,6 +797,75 @@ def to_edge_TOSA_delegate(
795797
return model_quant, edge
796798

797799

800+
def to_edge_cortex_m(
    exported_program: ExportedProgram,
    args,
    model: GraphModule,
    example_inputs: Tuple[torch.Tensor],
):
    """Lower a model to an edge program on the Cortex-M path (no delegation).

    When ``args.quantize`` is set, the model and example inputs are moved to
    ``torch.channels_last``, the model is prepared with ``CortexMQuantizer``,
    calibrated on the data returned by ``get_calibration_data``, converted,
    and re-exported; that fresh export replaces ``exported_program``.
    Otherwise a warning is logged and the incoming ``exported_program`` is
    lowered as-is.

    Args:
        exported_program: Program to lower when quantization is disabled.
        args: Parsed command-line arguments. Reads ``quantize``,
            ``model_name``, ``evaluate``, ``evaluate_config`` and
            ``strict_export``.
        model: Graph module to quantize.
        example_inputs: Example inputs used for calibration and re-export.

    Returns:
        A ``(model_quant, edge)`` tuple, where ``model_quant`` is the
        converted quantized model (``None`` when quantization is disabled)
        and ``edge`` is the lowered edge program manager after the
        ``CortexMPassManager`` transform.
    """
    logging.info("Using Cortex-M/CMSIS-NN compilation path (no delegation)")

    def _as_channels_last(value):
        # Tuples are converted element-wise (recursively).
        if isinstance(value, tuple):
            return tuple(_as_channels_last(item) for item in value)
        # Anything that is not a tensor passes through untouched.
        if not isinstance(value, torch.Tensor):
            return value
        # Only 4-D tensors that are not already channels_last are converted.
        already_ok = value.dim() != 4 or value.is_contiguous(
            memory_format=torch.channels_last
        )
        if already_ok:
            return value
        logging.warning(
            "Converting input tensor with shape %s to channels_last",
            list(value.shape),
        )
        return value.to(memory_format=torch.channels_last)

    quantized_model = None
    if args.quantize:
        model = model.to(memory_format=torch.channels_last)
        example_inputs = tuple(_as_channels_last(inp) for inp in example_inputs)

        observed_model = prepare_pt2e(model, CortexMQuantizer())

        calibration_data = get_calibration_data(
            args.model_name, example_inputs, args.evaluate, args.evaluate_config
        )

        # Calibrate: a DataLoader yields (sample, label) pairs; anything else
        # is treated as a single set of positional inputs.
        if isinstance(calibration_data, DataLoader):
            for batch, _ in calibration_data:
                observed_model(_as_channels_last(batch))
        else:
            observed_model(*(_as_channels_last(item) for item in calibration_data))

        quantized_model = convert_pt2e(observed_model)

        # The quantized model is re-exported; this replaces the program that
        # was passed in.
        exported_program = torch.export.export(
            quantized_model, example_inputs, strict=args.strict_export
        )
    else:
        logging.warning(
            "Quantization is DISABLED. Cortex-M typically requires quantization."
        )

    # Ops listed here are kept intact instead of being decomposed during
    # lowering.
    preserved_ops = [
        torch.ops.aten.linear.default,
        torch.ops.aten.hardsigmoid.default,
        torch.ops.aten.hardsigmoid_.default,
        torch.ops.aten.hardswish.default,
        torch.ops.aten.hardswish_.default,
    ]
    lowering_config = EdgeCompileConfig(
        preserve_ops=preserved_ops,
        _check_ir_validity=False,
    )
    edge = to_edge_transform_and_lower(
        exported_program,
        compile_config=lowering_config,
    )

    # Run the Cortex-M passes and store the result as the "forward" program.
    cortex_m_passes = CortexMPassManager(edge.exported_program())
    edge._edge_programs["forward"] = cortex_m_passes.transform()

    return quantized_model, edge
867+
868+
798869
def to_edge_no_delegate(
799870
exported_program: ExportedProgram,
800871
args,
@@ -873,7 +944,12 @@ def to_edge_no_delegate(
873944

874945
# Quantize if required
875946
model_quant = None
876-
if args.delegate:
947+
if args.target == "cortex-m55+int8":
948+
# Cortex-M path: CMSIS-NN portable kernels, no delegation
949+
model_quant, edge = to_edge_cortex_m(
950+
exported_program, args, model, example_inputs
951+
)
952+
elif args.delegate:
877953
model_quant, edge = to_edge_TOSA_delegate(
878954
exported_program, args, model, example_inputs
879955
)

0 commit comments

Comments
 (0)