Arm backend: Address review comments on tosa_dim_order removal

AdrianLundell · AdrianLundell · commit b772b64067d8 · 2026-04-23T16:28:04.000+02:00
- Fix [] stride in avg/max_pool2d + add tests
- Fix meta-data of rescale in rewrite_upsample
- Merge conv weight permutes into single helper function
- Nits: Remove dead code, stale comments, TEMP path

Signed-off-by: Adrian Lundell <adrian.lundell@arm.com>
Change-Id: I6aa9221467a575e1c42a40cc5ca7237a810f782d
diff --git a/backends/arm/_passes/arm_pass_utils.py b/backends/arm/_passes/arm_pass_utils.py
@@ -397,3 +397,12 @@ def get_cond_while_submodules_nested(
     }
     # collect cond/while submodules (using mapping indices)
     return _get_control_flow_submodules(graph_module, mapping)
+
+
+def to_2tuple(value):
+    """Normalizes scalars, and 1-element sequences to a tuple of length 2."""
+    if isinstance(value, int):
+        return (value, value)
+    if len(value) == 1:
+        return (value[0], value[0])
+    return tuple(value)
diff --git a/backends/arm/_passes/rewrite_avg_pool2d_pass.py b/backends/arm/_passes/rewrite_avg_pool2d_pass.py
@@ -7,6 +7,7 @@
 
 import torch
 from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes.arm_pass_utils import to_2tuple
 from executorch.backends.arm.constants import NHWC_INVERSE_ORDER, NHWC_ORDER
 from executorch.backends.arm.operators.operator_validation_utils import (
     adjust_pooling_pad_if_needed,
@@ -33,19 +34,25 @@ def call_operator(self, op, args, kwargs, meta, updated=False):
             return super().call_operator(op, args, kwargs, meta, updated)
 
         x = args[0]
-        pad_h, pad_w = args[3] if len(args) > 3 else (0, 0)
+        kernel = to_2tuple(args[1])
+
+        stride = to_2tuple(args[2]) if len(args) > 2 else ()
+        if not stride:
+            stride = kernel  # default to kernel_size
+
+        pad_h, pad_w = to_2tuple(args[3]) if len(args) > 3 else (0, 0)
         # Make sure pad corresponds to TOSA
         pad = [pad_h, pad_w, pad_h, pad_w]
 
-        _, _, h, w = x.data.shape
-        kernel_h, kernel_w = args[1]
-        stride_h, stride_w = args[2] if len(args) > 2 else (1, 1)
-
         ceil_mode = args[4] if len(args) > 4 else False
 
         # Adjust padding if necessary
-        pad[1] = adjust_pooling_pad_if_needed(h, kernel_h, stride_h, pad[1], ceil_mode)
-        pad[3] = adjust_pooling_pad_if_needed(w, kernel_w, stride_w, pad[3], ceil_mode)
+        pad[1] = adjust_pooling_pad_if_needed(
+            x.data.shape[2], kernel[0], stride[0], pad[1], ceil_mode
+        )
+        pad[3] = adjust_pooling_pad_if_needed(
+            x.data.shape[3], kernel[1], stride[1], pad[3], ceil_mode
+        )
 
         # Materialize zero-point constants
         in_qparams = meta.data.get("input_qparams", {})
@@ -76,8 +83,8 @@ def call_operator(self, op, args, kwargs, meta, updated=False):
             pre_permute,
             input_zp,
             output_zp,
-            [kernel_h, kernel_w],
-            [stride_h, stride_w],
+            list(kernel),
+            list(stride),
             pad,
             acc_type,
         )
diff --git a/backends/arm/_passes/rewrite_conv_pass.py b/backends/arm/_passes/rewrite_conv_pass.py
diff --git a/backends/arm/_passes/rewrite_max_pool2d_pass.py b/backends/arm/_passes/rewrite_max_pool2d_pass.py
@@ -6,6 +6,7 @@
 from typing import Set, Type
 
 from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes.arm_pass_utils import to_2tuple
 from executorch.backends.arm.constants import NHWC_INVERSE_ORDER, NHWC_ORDER
 from executorch.backends.arm.operators.operator_validation_utils import (
     adjust_pooling_pad_if_needed,
@@ -16,14 +17,6 @@
 edge_max_pool2d_ops = (exir_ops.edge.aten.max_pool2d.default,)
 
 
-def _to_2tuple(value):
-    if isinstance(value, int):
-        return (value, value)
-    if len(value) == 1:
-        return (value[0], value[0])
-    return tuple(value)
-
-
 class RewriteMaxPool2dPass(ArmPass):
     """Rewrite max_pool2d ops to TOSA MAX_POOL2D."""
 
@@ -34,19 +27,23 @@ def call_operator(self, op, args, kwargs, meta):
             return super().call_operator(op, args, kwargs, meta)
 
         x = args[0]
-        kernel = _to_2tuple(args[1])
-
-        if len(args) > 2 and args[2] is not None:
-            stride = _to_2tuple(args[2])
-        else:
-            stride = kernel
+        kernel = args[1]
+        stride = to_2tuple(args[2]) if len(args) > 2 else ()
+        if not stride:
+            stride = kernel  # default to kernel_size
 
-        padding = _to_2tuple(args[3]) if len(args) > 3 else (0, 0)
-        dilation = _to_2tuple(args[4]) if len(args) > 4 else (1, 1)
+        padding = to_2tuple(args[3]) if len(args) > 3 else (0, 0)
+        dilation = to_2tuple(args[4]) if len(args) > 4 else (1, 1)
         ceil_mode = args[5] if len(args) > 5 else False
 
-        if dilation != (1, 1):
-            return super().call_operator(op, args, kwargs, meta)
+        if not dilation == (1, 1):
+            from executorch.backends.arm._passes.decompose_maxpool2d_with_dilation_pass import (
+                DecomposeMaxPool2dPass,
+            )
+
+            raise RuntimeError(
+                f"Dilation > 1 is not supported for tosa.MAX_POOL2D, has {DecomposeMaxPool2dPass.__name__} run?"
+            )
 
         # TOSA MAX_POOL2D pad order is [top, bottom, left, right]
         pad = [padding[0], padding[0], padding[1], padding[1]]
diff --git a/backends/arm/_passes/rewrite_upsample.py b/backends/arm/_passes/rewrite_upsample.py
@@ -227,7 +227,10 @@ def call(self, graph_module):
                     rescale_node = create_node(
                         graph_module.graph,
                         exir_ops.backend.tosa.RESCALE.default,
+                        from_node=node,
                     )
+                    rescale_node.meta["val"] = node_replacement_fake
+
                     if input_dtype == torch.int16:
                         tosa_resize_node.meta[TosaSpecialDtype.meta_key()] = (
                             TosaSpecialDtype.INT48
diff --git a/backends/arm/constants.py b/backends/arm/constants.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -41,6 +41,8 @@
 NNCHW_ORDER: Final = (0, 1, 2, 3, 4)
 NNNCHW_ORDER: Final = (0, 1, 2, 3, 4, 5)
 
+OHWI_ORDER: Final = (1, 2, 3, 0)
+ODHWI_ORDER: Final = (0, 2, 3, 4, 1)
 HWCM_ORDER: Final = (2, 3, 0, 1)
 
 MAX_RANK: Final = 6
diff --git a/backends/arm/test/misc/test_transpose_counts.py b/backends/arm/test/misc/test_transpose_counts.py
@@ -169,7 +169,7 @@ def forward(self, x: torch.Tensor, dim: int):
         return torch.cumsum(x, dim)
 
 
-class ConvMaxPoolResidualLinear(torch.nn.Module):
+class Model1ConvMaxPoolResidualLinear(torch.nn.Module):
     def __init__(self):
         super().__init__()
         self.conv = torch.nn.Conv1d(8, 8, kernel_size=3, padding=1)
@@ -427,7 +427,7 @@ def forward(self, x):
         0,
     ),
     "model_1_conv_maxpool_residual_linear": TransposeCountCase(
-        ConvMaxPoolResidualLinear(), (torch.randn(2, 8, 64),), 5
+        Model1ConvMaxPoolResidualLinear(), (torch.randn(2, 8, 64),), 5
     ),
     "model_2_conv_mha_linear_layernorm": TransposeCountCase(
         Model2ConvMhaLinearLayerNorm(), (torch.randn(2, 8, 32),), 11
@@ -486,7 +486,7 @@ def forward(self, x):
             torch.randn(2, 2, 2, 3).to(memory_format=torch.channels_last),
             torch.randn(2, 2, 3, 4).to(memory_format=torch.channels_last),
         ),
-        2,  # The test crashes before reaching the transpose count
+        2,
     ),
     "pixel_shuffle_channels_last": TransposeCountCase(
         PixelShuffleModule(),
@@ -526,7 +526,7 @@ def forward(self, x):
     "cumsum_rank4_dim3_channels_last": TransposeCountCase(
         CumsumModule(),
         (torch.randn(1, 2, 3, 4).to(memory_format=torch.channels_last), 3),
-        1,  # The test crashes before reaching the transpose count
+        1,
     ),
 }
 
diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py
@@ -758,6 +758,5 @@ def test_convolution_2d_u85_INT_a16w8(test_data: input_t):
         a16w8_quantization=True,
         use_to_edge_transform_and_lower=True,
         per_channel_quantization=per_channel_quantization,
-        custom_path="TEMP",
     )
     pipeline.run()
diff --git a/backends/arm/test/passes/test_rewrite_avg_pool2d_pass.py b/backends/arm/test/passes/test_rewrite_avg_pool2d_pass.py
@@ -9,6 +9,8 @@
 from executorch.backends.arm._passes.rewrite_avg_pool2d_pass import RewriteAvgPool2dPass
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import PassPipeline
+from executorch.backends.test.harness.stages import StageType
+from executorch.exir.dialects._ops import ops as exir_ops
 
 input_t = Tuple[torch.Tensor]
 
@@ -41,6 +43,22 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return torch.nn.functional.avg_pool2d(x, kernel_size=[2, 3])
 
 
+class AvgPool2dScalarPadding(torch.nn.Module):
+    def get_inputs(self) -> input_t:
+        return (torch.rand(1, 3, 8, 8),)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.avg_pool2d(x, kernel_size=3, stride=2, padding=1)
+
+
+class AvgPool2dWithEmptyStride(torch.nn.Module):
+    def get_inputs(self) -> input_t:
+        return (torch.rand(1, 3, 8, 8),)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.avg_pool2d(x, kernel_size=[2, 3], stride=[])
+
+
 modules: Dict[str, ModuleWithInputs] = {
     "avg_pool2d_with_stride": AvgPool2dWithStride(),
     "avg_pool2d_without_stride": AvgPool2dWithoutStride(),
@@ -67,3 +85,42 @@ def test_rewrite_avg_pool2d_tosa(module: ModuleWithInputs) -> None:
         "run_method_and_compare_outputs"
     )  # Cannot run aten graph with tosa dialect ops
     pipeline.run()
+
+
+def _get_tosa_avg_pool2d_node(
+    pipeline: PassPipeline[input_t],
+) -> torch.fx.Node:
+    exported_program = pipeline.tester.get_artifact(
+        StageType.RUN_PASSES
+    ).exported_program()
+    graph_module = exported_program.graph_module
+
+    tosa_nodes = [
+        node
+        for node in graph_module.graph.nodes
+        if node.op == "call_function"
+        and node.target == exir_ops.backend.tosa.AVG_POOL2D.default
+    ]
+    assert len(tosa_nodes) == 1
+    return tosa_nodes[0]
+
+
+def test_rewrite_avg_pool2d_tosa_empty_stride_uses_kernel_size() -> None:
+    module = AvgPool2dWithEmptyStride()
+    pipeline = PassPipeline[input_t](
+        module,
+        module.get_inputs(),
+        ops_before_pass={
+            "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default": 1,
+        },
+        ops_after_pass={
+            "executorch_exir_dialects_backend__ops_tosa_AVG_POOL2D_default": 1,
+            "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 2,
+        },
+        pass_list=[RewriteAvgPool2dPass],
+    )
+    pipeline.pop_stage("run_method_and_compare_outputs")
+    pipeline.run()
+
+    tosa_node = _get_tosa_avg_pool2d_node(pipeline)
+    assert tosa_node.args[4] == [2, 3]
diff --git a/backends/arm/test/passes/test_rewrite_max_pool2d_pass.py b/backends/arm/test/passes/test_rewrite_max_pool2d_pass.py
@@ -10,6 +10,8 @@
 from executorch.backends.arm._passes.rewrite_max_pool2d_pass import RewriteMaxPool2dPass
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import PassPipeline
+from executorch.backends.test.harness.stages import StageType
+from executorch.exir.dialects._ops import ops as exir_ops
 
 input_t = Tuple[torch.Tensor]
 
@@ -42,6 +44,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return torch.nn.functional.max_pool2d(x, kernel_size=[2, 3])
 
 
+class MaxPool2dWithEmptyStride(torch.nn.Module):
+    def get_inputs(self) -> input_t:
+        return (torch.rand(1, 3, 8, 8),)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.max_pool2d(x, kernel_size=[2, 3], stride=[])
+
+
 modules: Dict[str, ModuleWithInputs] = {
     "max_pool2d_with_stride": MaxPool2dWithStride(),
     "max_pool2d_without_stride": MaxPool2dWithoutStride(),
@@ -67,3 +77,41 @@ def test_rewrite_max_pool2d_tosa(module: ModuleWithInputs) -> None:
         "run_method_and_compare_outputs"
     )  # Cannnot run aten graph with tosa dialect ops
     pipeline.run()
+
+
+def _get_tosa_max_pool2d_node(
+    pipeline: PassPipeline[input_t],
+) -> torch.fx.Node:
+    exported_program = pipeline.tester.get_artifact(
+        StageType.RUN_PASSES
+    ).exported_program()
+    graph_module = exported_program.graph_module
+
+    tosa_nodes = [
+        node
+        for node in graph_module.graph.nodes
+        if node.op == "call_function"
+        and node.target == exir_ops.backend.tosa.MAX_POOL2D.default
+    ]
+    assert len(tosa_nodes) == 1
+    return tosa_nodes[0]
+
+
+def test_rewrite_max_pool2d_tosa_empty_stride_uses_kernel_size() -> None:
+    module = MaxPool2dWithEmptyStride()
+    pipeline = PassPipeline[input_t](
+        module,
+        module.get_inputs(),
+        ops_before_pass={
+            "executorch_exir_dialects_edge__ops_aten_max_pool2d_with_indices_default": 1,
+        },
+        ops_after_pass={
+            "executorch_exir_dialects_backend__ops_tosa_MAX_POOL2D_default": 1,
+        },
+        pass_list=[RemoveGetItemPass, RewriteMaxPool2dPass],
+    )
+    pipeline.pop_stage("run_method_and_compare_outputs")
+    pipeline.run()
+
+    tosa_node = _get_tosa_max_pool2d_node(pipeline)
+    assert tosa_node.args[2] == [2, 3]

Original file line number	Diff line number	Diff line change
`@@ -758,6 +758,5 @@ def test_convolution_2d_u85_INT_a16w8(test_data: input_t):`
`758`	`758`	`a16w8_quantization=True,`
`759`	`759`	`use_to_edge_transform_and_lower=True,`
`760`	`760`	`per_channel_quantization=per_channel_quantization,`
`761`		`- custom_path="TEMP",`
`762`	`761`	`)`
`763`	`762`	`pipeline.run()`