
Commit 7f14a9d

Author: Github Executorch (committed)
Summary: MV2 CortexM PassManager changes for Alif E8
Test Plan:

python3 -m examples.arm.aot_arm_compiler -m mv2 --target=cortex-m --quantize --enable_qdq_fusion_pass --intermediates=./mv2_intermediates --output=./mv2_cortex_m.pte
cat ./mv2_intermediates/delegation_info.txt

Delegation info:
Total delegated subgraphs: 0
Number of delegated nodes: 0
Number of non-delegated nodes: 72

Delegation table:

|    | op_type                                     | occurrences_in_delegated_graphs | occurrences_in_non_delegated_graphs |
|----|---------------------------------------------|---------------------------------|-------------------------------------|
|  0 | aten_as_strided_copy_default                | 0                               | 1                                   |
|  1 | aten_mean_dim                               | 0                               | 1                                   |
|  2 | aten_view_copy_default                      | 0                               | 1                                   |
|  3 | cortex_m_dequantize_per_tensor_default      | 0                               | 2                                   |
|  4 | cortex_m_quantize_per_tensor_default        | 0                               | 2                                   |
|  5 | cortex_m_quantized_add_default              | 0                               | 10                                  |
|  6 | cortex_m_quantized_conv2d_default           | 0                               | 35                                  |
|  7 | cortex_m_quantized_depthwise_conv2d_default | 0                               | 17                                  |
|  8 | cortex_m_quantized_linear_default           | 0                               | 1                                   |
|  9 | dim_order_ops__clone_dim_order_default      | 0                               | 1                                   |
| 10 | Total                                       | 0                               | 71                                  |

Reviewers:
Subscribers:
Tasks:
Tags:
1 parent ec4c462 commit 7f14a9d

5 files changed

Lines changed: 376 additions & 18 deletions


backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py

Lines changed: 64 additions & 0 deletions
@@ -33,6 +33,14 @@
 from torch.fx import GraphModule, Node


+# Passthrough ops that preserve quantization parameters from input to output.
+# These ops should be foldable even without explicit annotation metadata.
+PASSTHROUGH_OPS = {
+    exir_ops.edge.aten.hardtanh.default,
+    exir_ops.edge.aten.relu.default,
+    exir_ops.edge.aten.clamp.default,
+}
+
 def _get_special_dtype(qspec: QuantArgs) -> TosaSpecialDtype | None:
     if qspec.dtype == torch.int8:
         if qspec.qmax == 7 and qspec.qmin == -7:
@@ -248,6 +256,26 @@ def _handle_control_flow_node(self, node: Node, graph_module: GraphModule):
                 submodule.graph.erase_node(node_to_remove)
         return

+    @staticmethod
+    def _has_dq_input_and_q_output(node: Node) -> bool:
+        """
+        Check if a node has dequantize input(s) and quantize output(s).
+        This indicates the node is part of a quantized computation path.
+        """
+        # Check if any input is from a dequantize op
+        has_dq_input = any(
+            isinstance(arg, Node) and arg.target in DQ_OPS
+            for arg in node.args
+            if isinstance(arg, Node)
+        )
+
+        # Check if any output goes to a quantize op
+        has_q_output = any(
+            user.target in Q_OPS
+            for user in node.users
+        )
+        return has_dq_input and has_q_output
+
     @staticmethod
     def is_foldable(node: Node) -> bool:
         if node.op != "call_function":
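Note (illustrative): to make the new predicate concrete, here is a minimal, self-contained sketch that applies the same dq-input / q-output check to a toy torch.fx graph. The `dq`/`q` stand-ins and the tiny graph are hypothetical; the real pass matches against the edge-dialect targets in `DQ_OPS`/`Q_OPS`.

```python
import torch
from torch import fx

# Hypothetical stand-ins for the edge-dialect quantize/dequantize targets.
def dq(x): return x
def q(x): return x

DQ_OPS, Q_OPS = {dq}, {q}

# Build dq -> relu -> q, the pattern the pass now treats as foldable.
g = fx.Graph()
x = g.placeholder("x")
d = g.call_function(dq, (x,))
r = g.call_function(torch.relu, (d,))
out = g.call_function(q, (r,))
g.output(out)

def has_dq_input_and_q_output(node: fx.Node) -> bool:
    has_dq = any(isinstance(a, fx.Node) and a.target in DQ_OPS for a in node.args)
    has_q = any(u.target in Q_OPS for u in node.users)
    return has_dq and has_q

print(has_dq_input_and_q_output(r))  # True: relu sits between a dq and a q
```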
@@ -263,6 +291,13 @@ def is_foldable(node: Node) -> bool:
         ):
             return True

+        # Passthrough ops (hardtanh, relu, clamp) that have dq inputs and q outputs
+        # should be foldable even without explicit annotation. These ops preserve
+        # quantization parameters and are common in quantized models like MobileNetV2.
+        if node.target in PASSTHROUGH_OPS:
+            if FoldAndAnnotateQParamsPass._has_dq_input_and_q_output(node):
+                return True
+
         # We should not fold q-dq nodes into non-quantized nodes.
         if not (
             ArmAnnotationInfo.CUSTOM_META_KEY in node.meta.get("custom", {})
@@ -335,6 +370,35 @@ def call(self, graph_module: GraphModule) -> PassResult: # noqa: C901
             ):
                 self._handle_control_flow_node(n, graph_module)

+        # Second pass: Propagate qparams through passthrough ops.
+        # For ops like hardtanh that share qparams with their input, we need to:
+        # 1. Copy output_qparams from the passthrough op to its input node
+        # 2. Set input_qparams on the passthrough op
+        for n in graph_module.graph.nodes:
+            n = cast(Node, n)
+            if n.target not in PASSTHROUGH_OPS:
+                continue
+
+            # Check if this passthrough op has output_qparams but missing input_qparams
+            has_output = "output_qparams" in n.meta and len(n.meta.get("output_qparams", {})) > 0
+            has_input = "input_qparams" in n.meta and len(n.meta.get("input_qparams", {})) > 0
+
+            if not has_output or has_input:
+                continue
+
+            # Get the input node
+            if len(n.args) == 0 or not isinstance(n.args[0], Node):
+                continue
+
+            input_node = n.args[0]
+
+            # Propagate: For passthrough ops, output qparams equal input qparams
+            if "output_qparams" not in input_node.meta:
+                input_node.meta["output_qparams"] = n.meta["output_qparams"]
+
+            # Set input_qparams from output_qparams (same for passthrough ops)
+            n.meta["input_qparams"] = {0: n.meta["output_qparams"][0]}
+
         # retrace the graph to update the fake tensor types
         graph_module = super().call(graph_module).graph_module
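Note (illustrative): a short numeric example of why sharing qparams across a passthrough activation is safe. The values are made up, not taken from MobileNetV2: when the output scale/zero-point already encode a 0..6 range, hardtanh(0, 6) reduces to the saturation that int8 quantization performs anyway.

```python
# Illustrative values only: an int8 quantization chosen to cover the range [0, 6].
scale, zp = 6.0 / 255.0, -128

def quantize(x: float) -> int:
    return max(-128, min(127, round(x / scale) + zp))

# hardtanh(0, 6) in the float domain maps to clamping q-values to [q(0), q(6)],
# which is exactly the int8 range, so input and output qparams can be identical.
print(quantize(0.0), quantize(6.0))  # -128 127
```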

backends/cortex_m/passes/convert_to_cortex_m_pass.py

Lines changed: 162 additions & 1 deletion
@@ -69,9 +69,164 @@ def _get_batch_size_from_conv(self, conv_node: torch.fx.Node):
                 pass
         return None

+    def _get_addmm_replacement(self, node):
+        """
+        Handle aten.addmm (decomposed linear):
+        addmm(bias, input, weight.T) = input @ weight.T + bias
+
+        In the graph, weight is already transposed via cortex_m.transpose or aten.t
+        so we need to trace back to find the original weight placeholder.
+
+        input_qparams indices for addmm:
+        [0] = bias (int32)
+        [1] = input activation (int8)
+        [2] = weight (int8) - often missing because weight goes through transpose
+        """
+        # addmm args: (bias, input, weight_transposed)
+        bias_node = node.args[0]
+        input_node = node.args[1]
+        weights_node = node.args[2]  # This is the transposed weight
+
+        # Get input qparams - use index 1 for input activation (index 0 is bias!)
+        input_scale = node.meta["input_qparams"][1].scale
+        input_zp = node.meta["input_qparams"][1].zp
+
+        # Get output qparams
+        output_scale = node.meta["output_qparams"][0].scale
+        output_zp = node.meta["output_qparams"][0].zp
+        output_min = node.meta["output_qparams"][0].qmin
+        output_max = node.meta["output_qparams"][0].qmax
+
+        # Trace back through graph to find original weight placeholder and its qparams
+        current_node = weights_node
+        max_depth = 10
+        found_transpose = False
+        original_weight_node = None
+        weight_qparams = None
+
+        # Check if weights_node (transpose) has qparams in its metadata
+        if "input_qparams" in weights_node.meta:
+            if 0 in weights_node.meta["input_qparams"]:
+                weight_qparams = weights_node.meta["input_qparams"][0]
+        if "output_qparams" in weights_node.meta:
+            if weight_qparams is None and 0 in weights_node.meta["output_qparams"]:
+                weight_qparams = weights_node.meta["output_qparams"][0]
+
+        # Trace back to find original weight placeholder
+        for depth in range(max_depth):
+            # Check for qparams in current node
+            if weight_qparams is None and "output_qparams" in current_node.meta:
+                oq = current_node.meta.get("output_qparams", {})
+                if 0 in oq:
+                    weight_qparams = oq[0]
+
+            if current_node.op == "placeholder":
+                original_weight_node = current_node
+                if "val" in original_weight_node.meta:
+                    val = original_weight_node.meta["val"]
+                # Check placeholder for output_qparams
+                if weight_qparams is None and "output_qparams" in original_weight_node.meta:
+                    oq = original_weight_node.meta.get("output_qparams", {})
+                    if 0 in oq:
+                        weight_qparams = oq[0]
+                break
+            elif current_node.op == "call_function":
+                target_name = str(current_node.target)
+                if ".t." in target_name or "transpose" in target_name.lower():
+                    found_transpose = True
+                if len(current_node.args) > 0:
+                    current_node = current_node.args[0]
+                else:
+                    break
+            else:
+                break
+
+        if original_weight_node is None:
+            raise RuntimeError(f"Could not find original weight placeholder for addmm node {node.name}")
+
+        # Get the weight tensor from the original placeholder
+        weights_tensor = get_param_tensor(self.exported_program, original_weight_node)
+
+        # If transpose found, original weights are [out_feat, in_feat]
+        # CMSIS-NN expects [out_feat, in_feat], so use original directly
+        if found_transpose:
+            final_weights = weights_tensor.contiguous()
+        else:
+            final_weights = weights_tensor.T.contiguous()
+
+        # Extract weight scale and zero_point
+        if weight_qparams is not None:
+            weight_scale = weight_qparams.scale
+            weight_zp = weight_qparams.zp
+        elif 2 in node.meta.get("input_qparams", {}):
+            # Fallback: check if weight qparams are at index 2
+            weight_scale = node.meta["input_qparams"][2].scale
+            weight_zp = node.meta["input_qparams"][2].zp
+        else:
+            # Derive weight scale from bias scale!
+            # For quantized linear: bias_scale = input_scale * weight_scale
+            # Therefore: weight_scale = bias_scale / input_scale
+            if 0 in node.meta.get("input_qparams", {}):
+                bias_scale = node.meta["input_qparams"][0].scale
+                weight_scale = bias_scale / input_scale
+                weight_zp = 0  # Symmetric quantization
+            else:
+                # Last resort: derive from weight tensor (symmetric quantization assumed)
+                weight_min = final_weights.min().item()
+                weight_max = final_weights.max().item()
+                weight_absmax = max(abs(weight_min), abs(weight_max))
+                weight_scale = weight_absmax / 127.0 if weight_absmax > 0 else 1.0
+                weight_zp = 0
+
+        # Calculate quantization multiplier and shift
+        quantized_multiplier, quantized_shift = quantize_multiplier_aot(
+            (input_scale * weight_scale) / output_scale
+        )
+
+        # Compute kernel_sum WITHOUT bias (pass None)
+        # Pass bias separately to C++ operator
+        kernel_sum_tensor = self._compute_kernel_sum(
+            final_weights, None, -input_zp, -weight_zp
+        )
+
+        # Create placeholders
+        with node.graph.inserting_after(original_weight_node):
+            weights_placeholder = create_constant_placeholder(
+                self.exported_program,
+                node.graph,
+                node.name + "_weights_correct",
+                InputKind.PARAMETER,
+                final_weights,
+            )
+
+            kernel_sum = create_constant_placeholder(
+                self.exported_program,
+                node.graph,
+                node.name + "_kernel_sum",
+                InputKind.PARAMETER,
+                kernel_sum_tensor,
+            )
+
+        # CMSIS-NN shift convention: use the shift as-is (not negated)
+        args = (
+            input_node,
+            weights_placeholder,
+            bias_node,  # Pass original bias (kernel_sum doesn't include it)
+            kernel_sum,
+            -input_zp,
+            -weight_zp,
+            output_zp,
+            [quantized_multiplier],
+            [quantized_shift],  # Use shift as-is
+            output_max,
+            output_min,
+        )
+
+        return exir_ops.edge.cortex_m.quantized_linear.default, args
+
     def _get_linear_replacement(self, node):
         """
-        Let 
+        Let
         - yi be the output activations (y1, ... yn)
         - xj be the input activations (x1, ... xm)
         - wij be the weights (w11, ... wnm)
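Note (illustrative, not part of the commit): a quick, self-contained check of the addmm identity that the new handler relies on.

```python
import torch

x = torch.randn(2, 4)
linear = torch.nn.Linear(4, 3)

# addmm(bias, input, weight.T) == input @ weight.T + bias == Linear(input)
out_addmm = torch.addmm(linear.bias, x, linear.weight.T)
assert torch.allclose(out_addmm, linear(x), atol=1e-6)
```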
@@ -175,6 +330,7 @@ def _get_convolution_replacement(self, node) -> int:

         weight_tensor = get_param_tensor(self.exported_program, weight)

+
         # Detect depthwise convolution:
         # Depthwise means groups == in_channels, out_channels == K * in_channels
         # Weight shape is [out_ch, in_ch_per_group, H, W]
@@ -386,6 +542,11 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
             match node.target:
                 case exir_ops.edge.aten.linear.default:
                     op, args = self._get_linear_replacement(node)
+                case exir_ops.edge.aten.addmm.default:
+                    result = self._get_addmm_replacement(node)
+                    if result is None:
+                        continue
+                    op, args = result
                 case exir_ops.edge.aten.convolution.default:
                     # Check if it's transposed convolution (arg index 6)
                     transposed = node.args[6] if len(node.args) > 6 else False
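Note (illustrative): for orientation, a minimal sketch of the ahead-of-time arithmetic behind `quantize_multiplier_aot` and `_compute_kernel_sum`, assuming the usual CMSIS-NN/TFLite fixed-point convention; the helpers in this commit may differ in details such as rounding or whether the weight offset is folded in. All scale values below are hypothetical.

```python
import math
import torch

def quantize_multiplier(real_multiplier: float):
    """Decompose M into a Q31 mantissa and a power-of-two shift: M = q * 2**shift / 2**31."""
    if real_multiplier == 0.0:
        return 0, 0
    mantissa, shift = math.frexp(real_multiplier)   # mantissa in [0.5, 1)
    q = round(mantissa * (1 << 31))
    if q == (1 << 31):                              # rounding pushed mantissa up to 1.0
        q //= 2
        shift += 1
    return q, shift

# Hypothetical per-tensor scales for one addmm node.
input_scale, weight_scale, output_scale = 0.02, 0.005, 0.1
mult, shift = quantize_multiplier(input_scale * weight_scale / output_scale)

# Sketch of the kernel-sum term: per output row, the weight sum times the input
# offset can be precomputed AOT so the runtime kernel adds one constant per output.
w_q = torch.tensor([[10, -3, 7], [0, 5, -8]], dtype=torch.int32)  # [out_features, in_features]
input_offset = 12                                                 # == -input_zp
kernel_sum = w_q.sum(dim=1) * input_offset                        # bias is passed separately here

print(mult, shift, kernel_sum.tolist())
```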

backends/cortex_m/passes/cortex_m_pass_manager.py

Lines changed: 16 additions & 2 deletions
@@ -13,6 +13,9 @@
 from executorch.backends.transforms.replace_scalar_with_tensor import (
     ReplaceScalarWithTensorArgPass,
 )
+from executorch.backends.arm._passes.decompose_adaptive_avg_pool2d_pass import (
+    DecomposeAdaptiveAvgPool2dPass,
+)
 from executorch.exir.pass_base import ExportPass
 from executorch.exir.pass_manager import PassManager
 from executorch.exir.program._program import _transform
@@ -33,6 +36,7 @@ class CortexMPassManager(PassManager):
         ReplaceScalarWithTensorArgPass,
         ReplaceQuantNodesPass,
         ActivationFusionPass,
+        DecomposeAdaptiveAvgPool2dPass,
         DecomposeHardswishPass,
         QuantizedOpFusionPass,
         ConvertToCortexMPass,
@@ -44,12 +48,22 @@ class CortexMPassManager(PassManager):
         ClampHardswishPass,
     ]

-    def __init__(self, exported_program, passes=None):
+    def __init__(self, exported_program, passes=None, skip_passes=None):
+        """
+        Initialize CortexMPassManager.
+
+        Args:
+            exported_program: The ExportedProgram to transform.
+            passes: Optional custom pass list. Uses default pass_list if None.
+            skip_passes: Optional list of pass classes to skip.
+        """
         self.exported_program = exported_program
         if passes is not None:
             self.passes = passes
         else:
-            self.passes = self.pass_list
+            self.passes = list(self.pass_list)
+            if skip_passes:
+                self.passes = [p for p in self.passes if p not in skip_passes]

     def transform_for_annotation(self, model):
         passes = self.pass_list_transform_for_annotation
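Note (illustrative): a hypothetical usage of the new skip_passes argument. The import path is assumed from this diff's file layout, and exported_program stands for an ExportedProgram produced earlier (e.g. by torch.export).

```python
from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager

exported_program = ...  # an ExportedProgram from torch.export.export(...)

# Build the default pipeline once, then rebuild it without the hardswish passes.
default_pm = CortexMPassManager(exported_program)
hardswish_passes = [p for p in default_pm.passes if "Hardswish" in p.__name__]

pm = CortexMPassManager(exported_program, skip_passes=hardswish_passes)
assert all("Hardswish" not in p.__name__ for p in pm.passes)
```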

backends/cortex_m/quantizer/quantizer.py

Lines changed: 5 additions & 0 deletions
@@ -448,6 +448,11 @@ class SharedQspecQuantizer(Quantizer):
         torch.ops.aten._unsafe_view.default,
         torch.ops.aten.unflatten.int,
         torch.ops.aten.flatten.using_ints,
+        # Additional passthrough ops for MobileNetV2 and similar architectures
+        torch.ops.aten.hardtanh.default,
+        torch.ops.aten.hardtanh_.default,
+        torch.ops.aten.max_pool2d.default,
+        torch.ops.aten.dropout.default,
     ]

     def __init__(self, targets: Optional[List[OpOverload]] = None) -> None:
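Note (illustrative, not part of the commit): the MobileNetV2 motivation in the comment can be seen directly by exporting a ReLU6 block; depending on the capture path it appears as aten.hardtanh.default or the in-place aten.hardtanh_.default, which is why both overloads join the shared-qspec list.

```python
import torch

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU6(inplace=True))
ep = torch.export.export(model, (torch.randn(1, 3, 32, 32),))

# Expect a conv followed by hardtanh(0, 6) (or hardtanh_ before functionalization).
print([n.target for n in ep.graph.nodes if n.op == "call_function"])
```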
