Skip to content

Commit 34e160d

Browse files
authored
Merge branch 'main' into android-combined-v2
2 parents 04727f0 + 48a8d58 commit 34e160d

42 files changed

Lines changed: 1417 additions & 401 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

backends/arm/_passes/decompose_meandim_pass.py

Lines changed: 7 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
)
1717
from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass
1818
from executorch.backends.arm.constants import DQ_OPS, Q_OPS
19-
from executorch.exir.backend.utils import WhyNoPartitionReporter
2019
from executorch.exir.dialects._ops import ops as exir_ops
2120
from executorch.exir.pass_base import ExportPass
2221

@@ -51,14 +50,6 @@ def get_dynamic_meandim_decomposition(op) -> tuple:
5150
raise RuntimeError(f"Can't get meandim decomposition for op {op}")
5251

5352

54-
def get_avgpool(op):
55-
if op in (exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.mean.default):
56-
return exir_ops.edge.aten.avg_pool2d.default
57-
if op in (torch.ops.aten.mean.dim, torch.ops.aten.mean.default):
58-
return torch.ops.aten.avg_pool2d.default
59-
raise RuntimeError(f"Can't get meandim decomposition for op {op}")
60-
61-
6253
def get_view(op):
6354
if op in (exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.mean.default):
6455
return exir_ops.edge.aten.view_copy.default
@@ -79,23 +70,21 @@ def get_quantization(op):
7970

8071

8172
class DecomposeMeanDimPass(ArmPass):
82-
"""Decomposes a meandim into avg_pool and/or sum + mul (1/N).
83-
84-
::
73+
"""Decomposes a meandim into sum + mul (1/N).
8574
86-
h, w -> avg_pool
87-
n, c -> sum + mul(1/N)
75+
Each reduction dimension is handled via REDUCE_SUM followed by
76+
multiplication by 1/N, which works on any axis without layout
77+
constraints (unlike AVG_POOL2D which only pools over spatial H×W).
8878
8979
For rank < 4, the input is reshaped to 4D by padding with dim=1 from the
9080
left.
9181
9282
Example:
9383
x = mean_dim(x, (0,2), keepdim=False) # x = (c,h,w)
9484
Becomes:
95-
x = view_copy.default(x, new_shape=(1,c,h,w)) # Reshape to work with avg_pool
96-
x = avg_pool2d.default(x, kernel=(1,w), stride=(1,1)) # Reduce w with avg_pool
97-
x = sum.dim_IntList(x, dim=1, keepdims=True) # Reduce c with sum
98-
x = mul.Tensor(x, 1/c) # Divide by number of channels to get mean
85+
x = view_copy.default(x, new_shape=(1,c,h,w)) # Reshape to 4D
86+
x = sum.dim_IntList(x, dim=(1,3), keepdims=True) # Reduce c,w with sum
87+
x = mul.Tensor(x, 1/(c*w)) # Divide by number of elements to get mean
9988
x = view_copy.default(x, new_shape=(h)) # Squeeze dims since keepdims = False
10089
10190
"""
@@ -110,14 +99,6 @@ def __init__(self, graph_module, tosa_spec, *args, **kwargs):
11099
super().__init__(*args, **kwargs)
111100
self._graph_module = graph_module
112101
self._tosa_spec = tosa_spec
113-
# Lazy import to avoid circular dependency with operator_support
114-
from executorch.backends.arm.operator_support.pool_2d_support import (
115-
AvgPool2dSupported,
116-
)
117-
118-
self._avg_pool_checker = AvgPool2dSupported(
119-
self._tosa_spec, WhyNoPartitionReporter()
120-
)
121102

122103
def call_operator(self, op, args, kwargs, meta, updated=False):
123104
if op not in (
@@ -168,12 +149,6 @@ def call_operator(self, op, args, kwargs, meta, updated=False):
168149
x = super().call_operator(view_op, (x, new_shape), {}, meta, True)
169150
x = self._maybe_insert_q_dq_after(x, meta)
170151

171-
# Reduce (h,w) dims by avg pool if possible
172-
if not has_symbolic_reduce_dim:
173-
x, dims_to_reduce = self._reduce_by_average_pool(
174-
op, x, dims_to_reduce, meta
175-
)
176-
177152
# Reshape back to 5D if necessary
178153
if len(input_shape) > 4:
179154
original_dims = input_shape[:-3]
@@ -259,44 +234,6 @@ def _reduce_by_sum(self, op, input_node, dims, meta):
259234

260235
return super().call_operator(mul_op, (sum, divisor), {}, meta, True)
261236

262-
def _reduce_by_average_pool(self, op, input_node, dims, meta):
263-
dims_to_reduce_by_avgpool = [dim for dim in dims if dim >= 2]
264-
if len(dims_to_reduce_by_avgpool) == 0:
265-
return input_node, dims
266-
267-
dims_to_reduce_by_sum = [dim for dim in dims if dim < 2]
268-
269-
avgpool_op = get_avgpool(op)
270-
input_shape = input_node.data.size()
271-
272-
stride = [1, 1]
273-
if dims_to_reduce_by_avgpool in ([2, 3], [3, 2]):
274-
kernel_size = [input_shape[2], input_shape[3]]
275-
elif dims_to_reduce_by_avgpool == [3]:
276-
kernel_size = [1, input_shape[3]]
277-
elif dims_to_reduce_by_avgpool == [2]:
278-
kernel_size = [input_shape[2], 1]
279-
else:
280-
raise RuntimeError(
281-
f"Bad dims {dims_to_reduce_by_avgpool} for {op} decomposition of mean_dim."
282-
)
283-
284-
args = (input_node, kernel_size, stride)
285-
286-
avg_pool_node = self._graph_module.graph.create_node(
287-
"call_function", avgpool_op, args
288-
)
289-
is_supported = self._avg_pool_checker.is_node_tosa_supported(
290-
avg_pool_node, self._tosa_spec
291-
)
292-
293-
if is_supported:
294-
out = super().call_operator(avgpool_op, args, {}, meta, True)
295-
out = self._maybe_insert_q_dq_after(out, meta)
296-
return out, dims_to_reduce_by_sum
297-
298-
return input_node, dims
299-
300237
def _maybe_insert_q_dq_after(self, op, meta):
301238
"""If the input node of op is a dequant node, insert a q-dq pair after
302239
op with identical quantization parameters.

backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py

Lines changed: 63 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,19 @@ def _get_special_dtype(qspec: QuantArgs) -> TosaSpecialDtype | None:
4040
return None
4141

4242

43+
def _merge_qparams(qspec_1: QuantArgs, qspec_2: QuantArgs) -> QuantArgs:
44+
"""Merge two QuantArgs when inputs are quantized differently.
45+
46+
Requires same dtype; picks the first's parameters by default.
47+
48+
"""
49+
if qspec_1.dtype != qspec_2.dtype:
50+
raise RuntimeError(
51+
f"Cannot merge qparams of different dtypes: {qspec_1.dtype} vs {qspec_2.dtype}"
52+
)
53+
return qspec_1
54+
55+
4356
def get_input_qparams(node: Node) -> dict[int, QuantArgs]:
4457
"""Get the input quantization parameters from a node, set by the
4558
'FoldAndAnnotateQParamsPass'.
@@ -121,57 +134,72 @@ def __init__(
121134
super().__init__(*args, **kwargs)
122135
self.exported_program = exported_program
123136

124-
def fold_and_annotate_arg(
125-
self, graph_module: GraphModule, node: Node, arg_list: list[Node], i: int
126-
) -> None:
127-
input_qparams = None
128-
nodes_to_remove = set()
137+
def _extract_input_params(
138+
self, arg_list: list[Node]
139+
) -> tuple[Optional[QuantArgs], set[Node]]:
140+
input_qparams: Optional[QuantArgs] = None
141+
nodes_to_remove: set[Node] = set()
129142
for arg in arg_list:
130143
if not isinstance(arg, Node):
131-
return
132-
133-
arg_quant_params = None
144+
return None, set()
145+
arg_quant: Optional[QuantArgs] = None
134146
if arg.target in DQ_OPS:
135147
args = arg.args
136148
scales = args[1]
137149
if (
138-
isinstance(args[1], Node)
150+
isinstance(scales, Node)
139151
and self.exported_program is not None
140-
and is_param_node(self.exported_program, args[1])
152+
and is_param_node(self.exported_program, scales)
141153
):
142-
scales = get_param_tensor(self.exported_program, args[1])
154+
scales = get_param_tensor(self.exported_program, scales)
143155
zps = args[2]
144156
if (
145-
isinstance(args[2], Node)
157+
isinstance(zps, Node)
146158
and self.exported_program is not None
147-
and is_param_node(self.exported_program, args[2])
159+
and is_param_node(self.exported_program, zps)
148160
):
149-
zps = get_param_tensor(self.exported_program, args[2])
150-
arg_quant_params = QuantArgs.from_operator(
161+
zps = get_param_tensor(self.exported_program, zps)
162+
arg_quant = QuantArgs.from_operator(
151163
arg.target, (args[0], scales, zps, *args[3:])
152164
)
153-
# add arg to nodes_to_remove to fold the dq-node
154165
nodes_to_remove.add(arg)
155-
if input_qparams is not None and input_qparams != arg_quant_params:
156-
# Two args are quantized differently
157-
raise RuntimeError("Input qparams do not match")
158-
input_qparams = arg_quant_params
159-
if input_qparams is not None:
160-
node.meta["input_qparams"][i] = input_qparams
161-
for n in nodes_to_remove:
162-
if n.target not in DQ_OPS:
163-
raise RuntimeError(
164-
f"Expected one of {DQ_OPS} dq_op, got {n.target}"
165-
)
166+
if arg_quant is not None:
167+
if input_qparams is None:
168+
input_qparams = arg_quant
169+
elif input_qparams != arg_quant:
170+
input_qparams = _merge_qparams(input_qparams, arg_quant)
171+
return input_qparams, nodes_to_remove
172+
173+
def _annotate_input_params(
174+
self,
175+
graph_module: GraphModule,
176+
node: Node,
177+
index: int,
178+
input_qparams: QuantArgs,
179+
nodes_to_remove: set[Node],
180+
) -> None:
181+
node.meta["input_qparams"][index] = input_qparams
182+
183+
for dq in nodes_to_remove:
184+
if dq.target not in DQ_OPS:
185+
raise RuntimeError(f"Expected one of {DQ_OPS} dq_op, got {dq.target}")
186+
node.replace_input_with(dq, cast(Node, dq.args[0]))
187+
if not dq.users:
188+
graph_module.graph.erase_node(dq)
189+
190+
special = _get_special_dtype(input_qparams)
191+
if special:
192+
node.all_input_nodes[index].meta[TosaSpecialDtype.meta_key()] = special
166193

167-
node.replace_input_with(n, cast(Node, n.args[0]))
168-
if len(n.users) == 0:
169-
graph_module.graph.erase_node(n)
170-
special_dtype = _get_special_dtype(input_qparams)
171-
if special_dtype:
172-
node.all_input_nodes[i].meta[
173-
TosaSpecialDtype.meta_key()
174-
] = special_dtype
194+
def fold_and_annotate_arg(
195+
self, graph_module: GraphModule, node: Node, arg_list: list[Node], i: int
196+
) -> None:
197+
input_qparams, nodes_to_remove = self._extract_input_params(arg_list)
198+
if input_qparams is None:
199+
return
200+
self._annotate_input_params(
201+
graph_module, node, i, input_qparams, nodes_to_remove
202+
)
175203

176204
def _handle_control_flow_node(self, node: Node, graph_module: GraphModule):
177205
"""Fold outmost quant nodes inside submodule.

backends/arm/_passes/normalize_while_initial_args_pass.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2025 Arm Limited and/or its affiliates.
1+
# Copyright 2025-2026 Arm Limited and/or its affiliates.
22
#
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
@@ -82,6 +82,8 @@ def _normalize_node(self, graph_module: GraphModule, node: Node) -> bool:
8282
new_carried = tuple(carried_inputs + additional_inputs)
8383
node.update_arg(2, new_carried)
8484
node.update_arg(3, ())
85+
# annotate node so later keying of captured vs loop-carried args is possible
86+
node.meta["additional_inputs"] = additional_inputs
8587

8688
body_module_name = str(cast(Node, node.args[1]).target)
8789
body_module = cast(GraphModule, graph_module.get_submodule(body_module_name)) # type: ignore

backends/arm/quantizer/quantization_annotator.py

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -890,29 +890,33 @@ def any_or_hardtanh_min_zero(n: Node):
890890
submodule_args_pos = -1 if node.target == torch.ops.higher_order.cond else -2
891891
submodule_args = node.args[submodule_args_pos]
892892
output_qspec = output_act_qspec
893-
if len(submodule_args) > 0: # type: ignore[arg-type]
894-
# The way the TOSA backend handles quantized inputs, arrays of input tensors (such as the input to a
895-
# conditional graph) need shared quantization.
896-
shared_qspec = SharedQuantizationSpec(
897-
(cast(list[Node], submodule_args)[0], node)
898-
)
899-
quant_properties.quant_inputs = [
900-
_QuantProperty(
901-
submodule_args_pos,
902-
[
903-
input_act_qspec,
904-
*([shared_qspec] * (len(submodule_args) - 1)), # type: ignore[arg-type]
905-
],
893+
# Annotate each control-flow tensor independently using the default input qspec
894+
if submodule_args:
895+
if node.meta.get("additional_inputs", None):
896+
qspecs = [input_act_qspec] * len(cast(Sequence[Node], submodule_args)) # type: ignore[arg-type]
897+
quant_properties.quant_inputs = [
898+
_QuantProperty(submodule_args_pos, qspecs)
899+
]
900+
else:
901+
shared_qspec = SharedQuantizationSpec(
902+
(cast(list[Node], submodule_args)[0], node)
906903
)
907-
]
908-
if node.target == torch.ops.higher_order.while_loop:
909-
# The output of the while loop body can either re-enter the body, or exit the while loop.
910-
# Therefore, A and B in the diagram below need to share the same quantization parameters.
911-
# A -> while ( RESCALE -> ... RESCALE -> ) -> B
912-
output_qspec = shared_qspec
904+
quant_properties.quant_inputs = [
905+
_QuantProperty(
906+
submodule_args_pos,
907+
[
908+
input_act_qspec,
909+
*([shared_qspec] * (len(submodule_args) - 1)), # type: ignore[arg-type]
910+
],
911+
)
912+
]
913+
if node.target == torch.ops.higher_order.while_loop:
914+
# The output of the while loop body can either re-enter the body, or exit the while loop.
915+
# Therefore, A and B in the diagram below need to share the same quantization parameters.
916+
# A -> while ( RESCALE -> ... RESCALE -> ) -> B
917+
output_qspec = shared_qspec
913918

914919
quant_properties.quant_output = _QuantProperty(0, output_qspec)
915-
916920
else:
917921
return None
918922

backends/arm/scripts/aot_arm_compiler.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -847,8 +847,8 @@ def _to_edge_TOSA_delegate(
847847
)
848848

849849
# Replace quantized_decomposed::{quantize,dequantize}_per_tensor nodes
850-
# with cortex_m:: equivalents for int8 QDQ ops remaining outside the
851-
# delegated subgraph.
850+
# with cortex_m:: equivalents for int8/int16 QDQ ops remaining outside
851+
# the delegated subgraph.
852852
edge = _apply_replace_quant_nodes(edge, target, direct_drive)
853853

854854
return model_quant, edge
@@ -955,8 +955,8 @@ def _to_edge_no_delegate(
955955
)
956956

957957
# Replace quantized_decomposed::{quantize,dequantize}_per_tensor nodes
958-
# with cortex_m:: equivalents for int8 QDQ ops remaining outside the
959-
# delegated subgraph.
958+
# with cortex_m:: equivalents for int8/int16 QDQ ops remaining outside
959+
# the delegated subgraph.
960960
edge = _apply_replace_quant_nodes(edge, args.target, args.direct_drive)
961961

962962
return model_quant, edge

backends/arm/test/misc/test_transpose_counts.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,7 @@ def forward(self, x):
404404
"groupnorm": TransposeCountCase(
405405
GroupNormModule(),
406406
(torch.randn(1, 4, 4, 4),),
407-
1,
407+
0,
408408
),
409409
"multihead_attention_rank2": TransposeCountCase(
410410
MultiheadAttentionModule(),
@@ -430,16 +430,16 @@ def forward(self, x):
430430
Model1ConvMaxPoolResidualLinear(), (torch.randn(2, 8, 64),), 5
431431
),
432432
"model_2_conv_mha_linear_layernorm": TransposeCountCase(
433-
Model2ConvMhaLinearLayerNorm(), (torch.randn(2, 8, 32),), 11
433+
Model2ConvMhaLinearLayerNorm(), (torch.randn(2, 8, 32),), 9
434434
),
435435
"model_3_lstm_linear": TransposeCountCase(
436436
Model3LstmLinear(), (torch.randn(2, 16, 8),), 2
437437
),
438438
"model_4_conv_lstm_linear_layernorm": TransposeCountCase(
439-
Model4ConvLstmLinearLayerNorm(), (torch.randn(2, 8, 32),), 5
439+
Model4ConvLstmLinearLayerNorm(), (torch.randn(2, 8, 32),), 3
440440
),
441441
"model_5_dwconv_gelu_layernorm_avgpool": TransposeCountCase(
442-
Model5DwConvGeluLayerNormAvgPool(), (torch.randn(1, 8, 16, 16),), 6
442+
Model5DwConvGeluLayerNormAvgPool(), (torch.randn(1, 8, 16, 16),), 4
443443
),
444444
"model_6_gru_linear": TransposeCountCase(
445445
Model6GruLinear(), (torch.randn(2, 16, 8),), 2
@@ -521,7 +521,7 @@ def forward(self, x):
521521
"groupnorm_channels_last": TransposeCountCase(
522522
GroupNormModule(),
523523
(torch.randn(1, 4, 4, 4).to(memory_format=torch.channels_last),),
524-
3,
524+
2,
525525
),
526526
"cumsum_rank4_dim3_channels_last": TransposeCountCase(
527527
CumsumModule(),

backends/arm/test/ops/test_cond.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def true_branch(arg: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
8282
return arg + torch.sin(arg), arg - torch.sin(arg)
8383

8484
def false_branch(arg: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
85-
return arg - arg.mean(), arg + arg.mean()
85+
return arg - torch.cos(arg), arg + torch.cos(arg)
8686

8787
predicate = x.flatten().sum() > 0
8888
return torch.cond(predicate, true_branch, false_branch, [x])

0 commit comments

Comments
 (0)