Add FuseConsecutiveRescalesPass to fuse redundant RESCALE pairs (#17830)

Ninja91 · meta-codesync[bot] · commit 7df3e6dfce84 · 2026-03-06T16:19:43.000-08:00
Summary: Pull Request resolved: #17830 TOSA requires INT32 arithmetic for add/sub/mul ops. `InsertRescaleInt32Pass` wraps each such op with input RESCALEs (INT8→INT32) and output RESCALE (INT32→INT8). When two such ops are chained, the output RESCALE of op1 feeds directly into the input RESCALE of op2, creating a redundant INT32→INT8→INT32 round-trip that wastes NPU cycles and loses precision. `FuseConsecutiveRescalesPass` detects these pairs and either: - Removes both if the composed scale is ~1.0 (identity) - Replaces both with a single INT32→INT32 RESCALE with composed scale Handles multi-user R1 nodes (e.g., residual connections, LayerNorm branching) by fusing each R1→R2 pair individually while preserving R1 for non-RESCALE users. ## Context Each unnecessary RESCALE is decomposed by Vela into Add+Mul NPU instructions (~1,130 cycles each on Ethos-U55-128). In meta-internal quantized models, RESCALE overhead accounts for 25-50% of total NPU cycles. This pass eliminates consecutive pairs at op boundaries, with multi-user handling catching additional pairs from branching patterns (LayerNorm's sub feeding both mul_square and mul_normalize). This diff also adds a `ResidualConvBlock` toy model and pass-level unit tests. Reviewed By: 3l1 Differential Revision: D94483331
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
@@ -96,6 +96,7 @@
     QuantizeClampArgumentsPass,
 )
 from .fuse_batch_norm2d_pass import FuseBatchNorm2dPass  # noqa
+from .fuse_consecutive_rescales_pass import FuseConsecutiveRescalesPass  # noqa
 from .fuse_constant_ops_pass import (  # noqa
     ComputeConstantOpsAOTPass,
     FuseConstantArgsPass,
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -93,6 +93,7 @@
     DecorateFp32toInt32CastingPass,
     FoldAndAnnotateQParamsPass,
     FuseBatchNorm2dPass,
+    FuseConsecutiveRescalesPass,
     FuseConstantArgsPass,
     FuseDuplicateUsersPass,
     FuseEqualPlaceholdersPass,
@@ -161,8 +162,8 @@ def configure_skip_passes(
         self,
         override_config: ArmPassPipelineConfig | None = None,
     ) -> tuple[type, ...]:
-        """Configures the pass manager to skip certain passes based on the
-        ArmPassPipelineConfig class found in the compile spec.
+        """Configures the pass manager to skip certain passes based on
+        the ArmPassPipelineConfig class found in the compile spec.
         """
         skip_set: set[type] = set()
 
@@ -189,11 +190,11 @@ def configure_skip_passes(
         return self._skip_pass_types
 
     def validate_constraints_mandatory(self):
-        """Validates that necessary passes have run before transforming to
-        backend.
+        """Validates that necessary passes have run before
+        transforming to backend.
 
-        Note that this differs from the original validate_constraints function,
-        which only checks the order of passes.
+        Note that this differs from the original validate_constraints
+        function, which only checks the order of passes.
 
         """
         passes_to_run = defaultdict(list)
@@ -264,6 +265,7 @@ def _tosa_pipeline(
                 # Ticket: MLETORCH-1539
                 DecomposeLinearPass(),
                 InsertRescaleInt32Pass(),
+                FuseConsecutiveRescalesPass(),
                 InsertControlFlowRescalesPass(),
                 DecomposeQuantNodesPass(),
             ]
diff --git a/backends/arm/_passes/fuse_consecutive_rescales_pass.py b/backends/arm/_passes/fuse_consecutive_rescales_pass.py
@@ -0,0 +1,124 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import cast, Set, Type
+
+import torch
+from executorch.backends.arm._passes.arm_pass import ArmPass
+from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+from torch.fx import GraphModule, Node
+from torch.fx.passes.infra.pass_base import PassResult
+
+
+class FuseConsecutiveRescalesPass(ArmPass):
+    """Fuse consecutive RESCALE(INT32->INT8/INT16) ->
+    RESCALE(INT8/INT16->INT32) pairs.
+
+    InsertRescaleInt32Pass wraps each add/mul/sub with input rescales
+    (INT8/INT16->INT32) and an output rescale (INT32->INT8/INT16). When
+    two such ops are chained (e.g., add1 -> add2), the output rescale
+    of add1 feeds directly into an input rescale of add2, creating a
+    redundant INT32->INT8/INT16->INT32 round-trip that loses precision.
+
+    This pass detects such pairs and either:
+    - Removes both if the composed scale is ~1.0 and zero points match
+    - Replaces both with a single INT32->INT32 RESCALE with composed
+      scale
+
+    Handles multi-user R1 nodes: when R1 feeds both RESCALE and
+    non-RESCALE users, each R1->R2 RESCALE pair is fused individually
+    while preserving R1 for its non-RESCALE users.
+
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        graph = graph_module.graph
+        modified = False
+        nodes_to_erase = []
+
+        for node in list(graph.nodes):
+            node = cast(Node, node)
+            if not _is_rescale(node):
+                continue
+
+            # R1 = node: output rescale (INT32 -> INT8/INT16)
+            r1_output_dtype = node.args[1]
+            if r1_output_dtype not in (torch.int8, torch.int16):
+                continue
+
+            r1_input = node.args[0]
+            r1_input_zp = node.args[3]
+            r1_output_zp = node.args[4]
+            r1_scale = float(node.args[2][0])
+
+            # Check each user individually (handles multi-user R1)
+            for user in list(node.users):
+                if not _is_rescale(user):
+                    continue
+
+                # R2 = user: input rescale (INT8/INT16 -> INT32)
+                r2_output_dtype = user.args[1]
+                if r2_output_dtype != torch.int32:
+                    continue
+
+                r2_input_zp = user.args[3]
+
+                # Guard: intermediate zero points must match for correct
+                # composition. Without this, the offset term
+                # (r1_output_zp - r2_input_zp) * r2_scale is silently lost.
+                if r1_output_zp != r2_input_zp:
+                    continue
+
+                r2_scale = float(user.args[2][0])
+                composed_scale = r1_scale * r2_scale
+                r2_output_zp = user.args[4]
+
+                if abs(composed_scale - 1.0) < 1e-6 and r1_input_zp == r2_output_zp:
+                    # Identity: wire R1's input directly to R2's users
+                    user.replace_all_uses_with(r1_input)
+                    nodes_to_erase.append(user)
+                else:
+                    # Non-identity: replace with single INT32->INT32 RESCALE
+                    with graph.inserting_before(user):
+                        composed_node = create_node(
+                            graph,
+                            exir_ops.backend.tosa.RESCALE.default,
+                            (
+                                r1_input,
+                                r2_output_dtype,
+                                [composed_scale],
+                                r1_input_zp,
+                                r2_output_zp,
+                            ),
+                            from_node=user,
+                        )
+                    user.replace_all_uses_with(composed_node)
+                    nodes_to_erase.append(user)
+
+                modified = True
+
+            # Always consider R1 for removal; actual erasure is guarded below
+            nodes_to_erase.append(node)
+
+        for node in nodes_to_erase:
+            if len(node.users) == 0:
+                graph.erase_node(node)
+
+        if modified:
+            graph_module = super().call(graph_module).graph_module
+            graph_module.recompile()
+
+        return PassResult(graph_module, modified)
+
+
+def _is_rescale(node: Node) -> bool:
+    return (
+        node.op == "call_function"
+        and node.target == exir_ops.backend.tosa.RESCALE.default
+    )
diff --git a/backends/arm/test/models/test_residual_conv_block.py b/backends/arm/test/models/test_residual_conv_block.py
@@ -0,0 +1,141 @@
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Residual conv block model test for ARM TOSA backend.
+
+Tests a minimal residual architecture with conv->batchnorm->relu->add
+blocks and permute operations, representative of quantized signal
+processing models where FuseConsecutiveRescalesPass eliminates
+redundant RESCALE pairs.
+
+"""
+
+from typing import Tuple
+
+import torch
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineINT,
+    EthosU85PipelineINT,
+    TosaPipelineFP,
+    TosaPipelineINT,
+    VgfPipeline,
+)
+
+
+class ResidualConvBlock(torch.nn.Module):
+    """Residual conv block with batchnorm and permute operations.
+
+    Architecture: conv->bn->relu->add (residual) -> permute ->
+    conv->bn->relu->add. When quantized, each residual add is
+    wrapped with INT32 RESCALEs by InsertRescaleInt32Pass. Stacked
+    blocks create consecutive RESCALE pairs (INT32->INT8->INT32)
+    between adjacent adds that FuseConsecutiveRescalesPass
+    eliminates.
+
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.conv1 = torch.nn.Conv2d(3, 3, 3, padding=1)
+        self.bn1 = torch.nn.BatchNorm2d(3)
+        self.relu1 = torch.nn.ReLU()
+        self.conv2 = torch.nn.Conv2d(3, 3, 3, padding=1)
+        self.bn2 = torch.nn.BatchNorm2d(3)
+        self.relu2 = torch.nn.ReLU()
+
+    def forward(self, x):
+        # Block 1: conv → batchnorm → relu → residual add
+        out = self.relu1(self.bn1(self.conv1(x)))
+        out = out + x  # residual add 1
+
+        # Channel reordering (common in signal processing models)
+        out = out.permute(0, 1, 3, 2)
+
+        # Block 2: conv → batchnorm → relu → residual add
+        out2 = self.relu2(self.bn2(self.conv2(out)))
+        out2 = out2 + out  # residual add 2
+        return out2
+
+
+model = ResidualConvBlock().eval()
+model_inputs = (torch.randn(1, 3, 8, 8),)
+input_t = Tuple[torch.Tensor]
+
+
+def test_residual_conv_block_tosa_FP():
+    pipeline = TosaPipelineFP[input_t](
+        model,
+        model_inputs,
+        aten_op=[],
+        exir_op=[],
+        use_to_edge_transform_and_lower=True,
+    )
+    pipeline.run()
+
+
+def test_residual_conv_block_tosa_INT():
+    pipeline = TosaPipelineINT[input_t](
+        model,
+        model_inputs,
+        aten_op=[],
+        exir_op=[],
+        use_to_edge_transform_and_lower=True,
+        atol=0.25,
+        qtol=1,
+        frobenius_threshold=None,
+        cosine_threshold=None,
+    )
+    pipeline.run()
+
+
+@common.XfailIfNoCorstone300
+def test_residual_conv_block_u55_INT():
+    pipeline = EthosU55PipelineINT[input_t](
+        model,
+        model_inputs,
+        aten_ops=[],
+        exir_ops=[],
+        use_to_edge_transform_and_lower=True,
+    )
+    pipeline.run()
+
+
+@common.XfailIfNoCorstone320
+def test_residual_conv_block_u85_INT():
+    pipeline = EthosU85PipelineINT[input_t](
+        model,
+        model_inputs,
+        aten_ops=[],
+        exir_ops=[],
+        use_to_edge_transform_and_lower=True,
+    )
+    pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_residual_conv_block_vgf_quant():
+    pipeline = VgfPipeline[input_t](
+        model,
+        model_inputs,
+        aten_op=[],
+        exir_op=[],
+        use_to_edge_transform_and_lower=True,
+        quantize=True,
+    )
+    pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_residual_conv_block_vgf_no_quant():
+    pipeline = VgfPipeline[input_t](
+        model,
+        model_inputs,
+        aten_op=[],
+        exir_op=[],
+        use_to_edge_transform_and_lower=True,
+        quantize=False,
+    )
+    pipeline.run()
diff --git a/backends/arm/test/passes/test_fuse_quantized_activation_pass.py b/backends/arm/test/passes/test_fuse_quantized_activation_pass.py
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+from executorch.backends.arm._passes.fuse_quantized_activation_pass import (
+    FuseQuantizedActivationPass,
+)
+from executorch.backends.arm.test.tester.test_pipeline import PassPipeline
+
+input_t = Tuple[torch.Tensor]
+
+
+class ConvRelu(torch.nn.Module):
+    """Conv2d followed by ReLU — existing fuseable behavior."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 3, 3, padding=1)
+        self.relu = torch.nn.ReLU()
+
+    def get_inputs(self) -> input_t:
+        return (torch.randn(1, 3, 8, 8),)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.relu(self.conv(x))
+
+
+def test_fuse_relu_after_conv_quantized() -> None:
+    """Existing behavior: ReLU after conv is fused in quantized graph."""
+    module = ConvRelu()
+    pipeline = PassPipeline[input_t](
+        module,
+        module.get_inputs(),
+        quantize=True,
+        ops_before_pass={
+            "executorch_exir_dialects_edge__ops_aten_relu_default": 1,
+        },
+        ops_not_after_pass=[
+            "executorch_exir_dialects_edge__ops_aten_relu_default",
+        ],
+        pass_list=[FuseQuantizedActivationPass],
+    )
+    pipeline.pop_stage("run_method_and_compare_outputs")
+    pipeline.run()
diff --git a/backends/arm/test/passes/test_rescale_optimization.py b/backends/arm/test/passes/test_rescale_optimization.py

Original file line number	Diff line number	Diff line change
`@@ -96,6 +96,7 @@`
`96`	`96`	`QuantizeClampArgumentsPass,`
`97`	`97`	`)`
`98`	`98`	`from .fuse_batch_norm2d_pass import FuseBatchNorm2dPass # noqa`
	`99`	`+from .fuse_consecutive_rescales_pass import FuseConsecutiveRescalesPass # noqa`
`99`	`100`	`from .fuse_constant_ops_pass import ( # noqa`
`100`	`101`	`ComputeConstantOpsAOTPass,`
`101`	`102`	`FuseConstantArgsPass,`