Skip to content

Commit ec8b798

Browse files
committed
Arm backend: Use channels-first
Signed-off-by: Adrian Lundell <adrian.lundell@arm.com> Change-Id: I7584e519cd876d63f1cafe98710f1c7fb0378581
1 parent 75ba558 commit ec8b798

24 files changed

Lines changed: 1438 additions & 73 deletions

backends/arm/_passes/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@
105105
QuantizeClampArgumentsPass,
106106
)
107107
from .fuse_batch_norm2d_pass import FuseBatchNorm2dPass # noqa
108+
from .fuse_canceling_transposes_pass import FuseCancelingTransposesPass # noqa
108109
from .fuse_consecutive_concat_shapes import FuseConsecutiveConcatShapesPass # noqa
109110
from .fuse_consecutive_rescales_pass import FuseConsecutiveRescalesPass # noqa
110111
from .fuse_constant_ops_pass import ( # noqa
@@ -129,6 +130,7 @@
129130
from .match_arg_dtype_pass import MatchArgDtypePass # noqa
130131
from .match_arg_ranks_pass import MatchArgRanksPass # noqa
131132
from .mm_to_bmm_pass import ConvertMmToBmmPass # noqa
133+
from .normalize_delegate_io_layout_pass import NormalizeDelegateIOLayoutPass # noqa
132134
from .normalize_index_put_bool_index_tensor_pass import ( # noqa
133135
NormalizeIndexPutBoolIndexTensorPass,
134136
)
@@ -158,6 +160,7 @@
158160
from .rewrite_le_lt_to_ge_gt_pass import RewriteLeLtToGeGtPass # noqa
159161
from .rewrite_matmul import RewriteMatmulPass # noqa
160162
from .rewrite_pad import RewritePadPass # noqa
163+
from .rewrite_pool_pass import RewritePoolPass # noqa
161164
from .rewrite_slice import RewriteSlicePass # noqa
162165
from .rewrite_upsample import RewriteUpsamplePass # noqa
163166
from .scalars_to_attribute_pass import ScalarsToAttributePass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
DecorateFp32toInt32CastingPass,
102102
FoldAndAnnotateQParamsPass,
103103
FuseBatchNorm2dPass,
104+
FuseCancelingTransposesPass,
104105
FuseConsecutiveConcatShapesPass,
105106
FuseConsecutiveRescalesPass,
106107
FuseConstantArgsPass,
@@ -116,6 +117,7 @@
116117
InsertTableOpsPass,
117118
MatchArgDtypePass,
118119
MatchArgRanksPass,
120+
NormalizeDelegateIOLayoutPass,
119121
NormalizeIndexPutBoolIndexTensorPass,
120122
NormalizeIndexPutNoneIndicesPass,
121123
NormalizeWhileInitialArgsPass,
@@ -135,11 +137,11 @@
135137
RewriteLeLtToGeGtPass,
136138
RewriteMatmulPass,
137139
RewritePadPass,
140+
RewritePoolPass,
138141
RewriteSlicePass,
139142
RewriteUpsamplePass,
140143
ScalarsToAttributePass,
141144
SizeAdjustInputPass,
142-
ToTosaMemoryFormatPass,
143145
UnsqueezeBeforeRepeatPass,
144146
UnsqueezeScalarPlaceholdersPass,
145147
)
@@ -497,9 +499,12 @@ def _tosa_pipeline(
497499
[
498500
RewriteUpsamplePass(),
499501
RewriteConvPass(exported_program),
502+
RewritePoolPass(),
500503
RewriteMatmulPass(),
501504
RewritePadPass(),
502505
RewriteSlicePass(),
506+
NormalizeDelegateIOLayoutPass(exported_program),
507+
FuseCancelingTransposesPass(),
503508
InsertConstShapesPass(),
504509
]
505510
)
@@ -510,7 +515,6 @@ def _tosa_pipeline(
510515
CastInt64BuffersToInt32Pass(exported_program),
511516
FuseEqualPlaceholdersPass(exported_program),
512517
FuseConsecutiveConcatShapesPass(),
513-
ToTosaMemoryFormatPass(exported_program),
514518
RemoveNoopPass(),
515519
InsertRescalePass(),
516520
]

backends/arm/_passes/decompose_int16_activation_conv_pass.py

Lines changed: 67 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,66 @@ def __init__(self) -> None:
3030
super().__init__()
3131

3232
_passes_required_after: Set[Type[ExportPass]] = set()
33+
_NHWC_ORDER = [0, 2, 3, 1]
34+
_NHWC_INVERSE_ORDER = [0, 3, 1, 2]
35+
_NDHWC_ORDER = [0, 2, 3, 4, 1]
36+
_NDHWC_INVERSE_ORDER = [0, 4, 1, 2, 3]
3337

3438
def bias_view_shape(
    self, bias: torch.Tensor, activation_rank: int
) -> Sequence[int]:
    """Return a view shape that broadcasts *bias* over a conv output.

    The channel count goes in dim 1; every other dim is 1, padded out to
    ``activation_rank`` so elementwise addition broadcasts per channel.
    """
    trailing_ones = [1] * (activation_rank - 2)
    return [1, bias.shape[0]] + trailing_ones
3943

44+
def _insert_rescale_with_optional_transpose(
    self,
    input_node,
    output_dtype,
    scales: list[float],
    in_zp: int,
    out_zp: int,
    meta,
    activation_rank: int,
):
    """Insert a TOSA RESCALE, transposing to channels-last when needed.

    A single (per-tensor) scale is rescaled in place. Per-channel scales
    need the channel dimension last, so the input is permuted to
    (N)HWC / (N)DHWC first and permuted back to channels-first afterwards.

    Raises:
        RuntimeError: if ``activation_rank`` is not 4 or 5 for a
            per-channel rescale.
    """
    if len(scales) <= 1:
        # Per-tensor scale: no layout change required.
        return super().call_operator(
            exir_ops.backend.tosa.RESCALE.default,
            (input_node, output_dtype, scales, in_zp, out_zp),
            {},
            meta,
        )

    # Map activation rank to (to-channels-last, back-to-channels-first).
    permute_orders = {
        4: (self._NHWC_ORDER, self._NHWC_INVERSE_ORDER),
        5: (self._NDHWC_ORDER, self._NDHWC_INVERSE_ORDER),
    }
    if activation_rank not in permute_orders:
        raise RuntimeError(
            f"Unsupported rank {activation_rank} for per-channel rescale"
        )
    to_channels_last, to_channels_first = permute_orders[activation_rank]

    nhwc_input = super().call_operator(
        exir_ops.edge.aten.permute_copy.default,
        (input_node, to_channels_last),
        {},
        meta,
    )
    nhwc_rescaled = super().call_operator(
        exir_ops.backend.tosa.RESCALE.default,
        (nhwc_input, output_dtype, scales, in_zp, out_zp),
        {},
        meta,
    )
    return super().call_operator(
        exir_ops.edge.aten.permute_copy.default,
        (nhwc_rescaled, to_channels_first),
        {},
        meta,
    )
92+
4093
def call_operator(self, op, args, kwargs, meta):
4194
if op != exir_ops.edge.aten.convolution.default:
4295
return super().call_operator(op, args, kwargs, meta)
@@ -112,11 +165,14 @@ def call_operator(self, op, args, kwargs, meta):
112165
conv_rescale_factors = [1.0] * len(bias_scale)
113166
final_output_scale = [b / conv_output_scale for b in bias_scale]
114167

115-
conv_output = super().call_operator(
116-
exir_ops.backend.tosa.RESCALE.default,
117-
(convolution, torch.int32, conv_rescale_factors, 0, 0),
118-
{},
168+
conv_output = self._insert_rescale_with_optional_transpose(
169+
convolution,
170+
torch.int32,
171+
conv_rescale_factors,
172+
0,
173+
0,
119174
new_meta,
175+
activation_rank,
120176
)
121177

122178
add = super().call_operator(
@@ -126,17 +182,14 @@ def call_operator(self, op, args, kwargs, meta):
126182
new_meta,
127183
)
128184

129-
res_rescale = super().call_operator(
130-
exir_ops.backend.tosa.RESCALE.default,
131-
(
132-
add,
133-
output_dtype,
134-
final_output_scale,
135-
0,
136-
0,
137-
),
138-
{},
185+
res_rescale = self._insert_rescale_with_optional_transpose(
186+
add,
187+
output_dtype,
188+
final_output_scale,
189+
0,
190+
0,
139191
new_meta,
192+
activation_rank,
140193
)
141194

142195
else:
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Copyright 2026 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
from typing import Set, Type
7+
8+
import torch
9+
from executorch.backends.arm._passes.arm_pass import ArmPass
10+
from executorch.exir.dialects._ops import ops as exir_ops
11+
from executorch.exir.pass_base import ExportPass, PassResult
12+
13+
14+
class FuseCancelingTransposesPass(ArmPass):
    """Fold chains of consecutive ``aten.permute_copy`` nodes.

    For ``permute(permute(...permute(x, p0)..., pN-1), pN)`` the
    permutations are composed into a single order applied to ``x``:
    - a non-identity composition keeps a single ``permute_copy``
    - an identity composition removes the chain altogether
    """

    _passes_required_after: Set[Type[ExportPass]] = set()

    @staticmethod
    def _compose_permutations(
        first: list[int] | tuple[int, ...],
        second: list[int] | tuple[int, ...],
    ) -> list[int]:
        # Applying ``first`` then ``second`` equals one permute by this order.
        return [first[index] for index in second]

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        permute_target = exir_ops.edge.aten.permute_copy.default
        graph = graph_module.graph
        changed = False

        for perm_node in list(graph.nodes):
            if perm_node.op != "call_function" or perm_node.target != permute_target:
                continue

            outer_order = perm_node.args[1]
            if not isinstance(outer_order, (list, tuple)):
                continue

            base = perm_node.args[0]
            total_order = list(outer_order)
            fold_count = 0

            # Walk producer permutes upstream, composing their orders.
            while (
                isinstance(base, torch.fx.Node)
                and base.op == "call_function"
                and base.target == permute_target
            ):
                inner_order = base.args[1]
                if not isinstance(inner_order, (list, tuple)):
                    break
                if len(inner_order) != len(total_order):
                    break
                total_order = self._compose_permutations(
                    list(inner_order), total_order
                )
                base = base.args[0]
                fold_count += 1

            if fold_count == 0:
                continue

            if total_order == list(range(len(total_order))):
                # The chain cancels out completely; bypass it.
                perm_node.replace_all_uses_with(base)
            else:
                perm_node.update_arg(0, base)
                perm_node.update_arg(1, total_order)
            changed = True

        if changed:
            graph.eliminate_dead_code()
            graph.lint()
            graph_module.recompile()
            graph_module = super().call(graph_module).graph_module

        return PassResult(graph_module, changed)

0 commit comments

Comments
 (0)