pytorch
diff --git a/‎.ci/scripts/wheel/envvar_macos.sh‎
Lines changed: 5 additions & 0 deletions b/‎.ci/scripts/wheel/envvar_macos.sh‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎.github/workflows/cuda.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/cuda.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/pull.yml‎
Lines changed: 30 additions & 0 deletions b/‎.github/workflows/pull.yml‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 0 additions & 30 deletions b/‎.github/workflows/trunk.yml‎
Lines changed: 0 additions & 30 deletions
diff --git a/‎backends/arm/MODELS.md‎
Lines changed: 6 additions & 0 deletions b/‎backends/arm/MODELS.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 2 additions & 0 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/_passes/fuse_consecutive_rescales_pass.py‎
Lines changed: 175 additions & 0 deletions b/‎backends/arm/_passes/fuse_consecutive_rescales_pass.py‎
Lines changed: 175 additions & 0 deletions
@@ -9,3 +9,8 @@
 # any variables so that subprocesses will see them.
 
 source "${GITHUB_WORKSPACE}/${REPOSITORY}/.ci/scripts/wheel/envvar_base.sh"
+
+# Force Apple Clang to avoid Homebrew LLVM, which doesn't properly handle
+# Apple SDK Objective-C framework headers (e.g. NSIntegerMax in NSObjCRuntime.h).
+export CC=/usr/bin/clang
+export CXX=/usr/bin/clang++
@@ -132,7 +132,7 @@ jobs:
         # Build executor_runner (needed by CUDA backend e2e tests)
         cmake --build cmake-out --target executor_runner
 
-        # Run all CUDA backend Python tests (including chunk_gated_delta e2e)
+        # Run CUDA backend Python tests
         python -m pytest backends/cuda/tests backends/cuda/passes/tests -v -o "addopts="
 
   export-model-cuda-artifact:
 
@@ -607,6 +607,36 @@ jobs:
           exit 1
         fi
 
+  test-mcu-cortex-m-backend:
+    name: test-mcu-cortex-m-backend
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge.memory
+      docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 120
+      script: |
+        # The generic Linux job chooses to use base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        source .ci/scripts/utils.sh
+        install_executorch "--use-pt-pinned-commit"
+
+        # Install arm dependencies
+        .ci/scripts/setup-arm-baremetal-tools.sh
+        source examples/arm/arm-scratch/setup_path.sh
+
+        # To build cortex-m test runner
+        backends/cortex_m/test/build_test_runner.sh
+
+        # To run cortex_m tests
+        pytest --config-file=backends/arm/test/pytest.ini backends/cortex_m/test
+
   android:
     uses: ./.github/workflows/_android.yml
     permissions:
 
@@ -1054,33 +1054,3 @@ jobs:
 
           .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }}
         }"
-
-  test-mcu-cortex-m-backend:
-    name: test-mcu-cortex-m-backend
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    permissions:
-      id-token: write
-      contents: read
-    with:
-      runner: linux.2xlarge.memory
-      docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
-      submodules: 'recursive'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 120
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        source .ci/scripts/utils.sh
-        install_executorch "--use-pt-pinned-commit"
-
-        # Install arm dependencies
-        .ci/scripts/setup-arm-baremetal-tools.sh
-        source examples/arm/arm-scratch/setup_path.sh
-
-        # To build cortex-m test runner
-        backends/cortex_m/test/build_test_runner.sh
-
-        # To run cortex_m tests
-        pytest --config-file=backends/arm/test/pytest.ini backends/cortex_m/test
@@ -1,12 +1,17 @@
 <!-- Copyright 2025-2026 Arm Limited and/or its affiliates. -->
 # The following file contains all models that have been confirmed to be functional and tested for the Arm backend:
+# Note: Deep AutoEncoder requires manual Linear+BatchNorm1d fusion as the quantizer does not yet support this pattern.
+# Note: DS CNN requires AvgPool2d workaround for Ethos-U55 due to stride > 3 limitation.
 - Conformer
+- Deep AutoEncoder
 - Deit Tiny
 - DeepLab v3 (DL3)
+- DS CNN
 - Inception v3 (IC3)
 - Llama
 - Gemma3n
 - Long Short-Term Memory (LSTM)
+- MobileNet v1 0.25
 - MobileNet v2 (MV2)
 - MobileNet v3 (MV3)
 - Some popular torch.nn.functional models (NN functional)
@@ -16,6 +21,7 @@
 - Neural Super Sampler (NSS)
 - Phi-3
 - ResNet 18
+- ResNet-8
 - Wav2Letter (W2L)
 - Stable Diffusion:
     * CLIP Text Encoder (CLIP Text with Projection)
 
@@ -102,6 +102,7 @@
     QuantizeClampArgumentsPass,
 )
 from .fuse_batch_norm2d_pass import FuseBatchNorm2dPass  # noqa
+from .fuse_consecutive_rescales_pass import FuseConsecutiveRescalesPass  # noqa
 from .fuse_constant_ops_pass import (  # noqa
     ComputeConstantOpsAOTPass,
     FuseConstantArgsPass,
 
@@ -98,6 +98,7 @@
     DecorateFp32toInt32CastingPass,
     FoldAndAnnotateQParamsPass,
     FuseBatchNorm2dPass,
+    FuseConsecutiveRescalesPass,
     FuseConstantArgsPass,
     FuseDuplicateUsersPass,
     FuseEqualPlaceholdersPass,
@@ -380,6 +381,7 @@ def _tosa_pipeline(
                 # Ticket: MLETORCH-1539
                 DecomposeLinearPass(),
                 InsertRescaleInt32Pass(),
+                FuseConsecutiveRescalesPass(),
                 InsertControlFlowRescalesPass(),
                 DecomposeQuantNodesPass(),
             ]
 
@@ -0,0 +1,175 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import cast, Set, Type
+
+import torch
+from executorch.backends.arm._passes.arm_pass import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.fx import GraphModule, Node
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+# TOSA RESCALE argument positions:
+#   args[0] = input tensor (Node)
+#   args[1] = output dtype (e.g., torch.int8, torch.int32)
+#   args[2] = scale list (List[float]; per-tensor when len == 1)
+#   args[3] = input zero point (int)
+#   args[4] = output zero point (int)
+_ARG_INPUT = 0
+_ARG_OUTPUT_DTYPE = 1
+_ARG_SCALE = 2
+_ARG_INPUT_ZP = 3
+_ARG_OUTPUT_ZP = 4
+
+
+class FuseConsecutiveRescalesPass(ArmPass):
+    """Fuse consecutive RESCALE(INT32->INT8/INT16) -> RESCALE(INT8/INT16->INT32)
+    pairs.
+
+    InsertRescaleInt32Pass wraps each quantized arithmetic and comparison
+    operator (add, sub, mul, abs, eq, ge, gt, le, lt, max, min, sum) with
+    input rescales (INT8/INT16->INT32) and an output rescale
+    (INT32->INT8/INT16). When two such ops are chained (e.g., add1 -> add2),
+    the output rescale of add1 feeds directly into an input rescale of add2,
+    creating a redundant INT32->INT8/INT16->INT32 round-trip that loses
+    precision.
+
+    This pass detects such pairs and handles two cases:
+
+    - **Identity** (composed scale ~1.0, matching zero points): Removes both
+      RESCALEs and directly wires R1's input to R2's users.  This eliminates
+      the entire round-trip.  Bypassing the intermediate INT8/INT16 clamp can
+      in theory cause up to ~120 INT8 steps of output difference when all
+      inputs are near the clamp boundary; in practice, observed differences
+      are 0-1 steps for typical distributions.  Tests use qtol=1.
+
+    - **Non-identity**: Leaves the pair unchanged.  The Vela NPU compiler
+      cannot correctly process INT32->INT32 RESCALE (produces all-zero NPU
+      outputs), so non-identity pairs retain their INT8/INT16 intermediate.
+
+    Handles multi-user R1 nodes: when R1 feeds both RESCALE and
+    non-RESCALE users, each R1->R2 RESCALE pair is fused individually
+    while preserving R1 for its non-RESCALE users.
+
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        graph = graph_module.graph
+        modified = False
+        rescale_before = sum(1 for n in graph.nodes if _is_rescale(n))
+        identity_pairs_fused = 0
+
+        for node in list(graph.nodes):
+            node = cast(Node, node)
+            if not _is_fuseable_r1(node):
+                continue
+
+            r1_input = node.args[_ARG_INPUT]
+            r1_input_zp = node.args[_ARG_INPUT_ZP]
+            r1_scale = float(node.args[_ARG_SCALE][0])  # type: ignore[arg-type]
+
+            node_fused = False
+            for user in list(node.users):
+                if _try_fuse_identity_pair(node, user, r1_input, r1_input_zp, r1_scale):
+                    node_fused = True
+                    identity_pairs_fused += 1
+
+            if node_fused:
+                modified = True
+
+        if modified:
+            graph.eliminate_dead_code()
+            rescale_after = sum(1 for n in graph.nodes if _is_rescale(n))
+            removed = rescale_before - rescale_after
+            logger.info(
+                "FuseConsecutiveRescalesPass: removed %d identity pairs "
+                "(%d RESCALEs: %d -> %d)",
+                identity_pairs_fused,
+                removed,
+                rescale_before,
+                rescale_after,
+            )
+            graph_module.recompile()
+            graph.lint()
+            # Note: we deliberately skip super().call() — retracing is
+            # unnecessary since this pass only rewires edges and removes
+            # nodes without introducing new operations.
+
+        return PassResult(graph_module, modified)
+
+
+def _is_rescale(node: Node) -> bool:
+    return (
+        node.op == "call_function"
+        and node.target == exir_ops.backend.tosa.RESCALE.default
+    )
+
+
+def _is_fuseable_r1(node: Node) -> bool:
+    """Check if node is an R1 candidate.
+
+    R1 is RESCALE(INT32 -> INT8/INT16) with per-tensor scale.
+
+    """
+    if not _is_rescale(node):
+        return False
+    if node.args[_ARG_OUTPUT_DTYPE] not in (torch.int8, torch.int16):
+        return False
+    if len(node.args[_ARG_SCALE]) != 1:  # type: ignore[arg-type]
+        return False
+    r1_input = node.args[_ARG_INPUT]
+    if not isinstance(r1_input, Node) or "val" not in r1_input.meta:
+        return False
+    if r1_input.meta["val"].dtype != torch.int32:
+        return False
+    return True
+
+
+def _try_fuse_identity_pair(
+    r1: Node,
+    r2: Node,
+    r1_input: Node,
+    r1_input_zp: int,
+    r1_scale: float,
+) -> bool:
+    """Try to fuse an R1->R2 identity pair.
+
+    Returns True if fused.
+
+    """
+    if not _is_rescale(r2):
+        return False
+    if r2.args[_ARG_OUTPUT_DTYPE] != torch.int32:
+        return False
+    if r1.args[_ARG_OUTPUT_ZP] != r2.args[_ARG_INPUT_ZP]:
+        return False
+    if len(r2.args[_ARG_SCALE]) != 1:  # type: ignore[arg-type]
+        return False
+
+    r2_scale = float(r2.args[_ARG_SCALE][0])  # type: ignore[arg-type, index]
+    composed_scale = r1_scale * r2_scale
+    r2_output_zp = r2.args[_ARG_OUTPUT_ZP]
+
+    if abs(composed_scale - 1.0) < 1e-6 and r1_input_zp == r2_output_zp:
+        # Identity case: remove both RESCALEs and directly wire
+        # R1's input (INT32) to R2's users.  The composed scale
+        # is ~1.0 so the round-trip is a no-op modulo the INT8
+        # clamp.  Bypassing the clamp can in theory cause up to
+        # ~120 INT8 steps of difference near clamp boundaries;
+        # observed differences are 0-1 steps.  Tests use qtol=1.
+        r2.replace_all_uses_with(r1_input)
+        return True
+
+    # Non-identity: leave the pair unchanged.  Creating a
+    # single INT32->INT32 RESCALE with the composed scale would
+    # be semantically correct (and the TOSA ref model handles
+    # it), but the Vela NPU compiler produces all-zero outputs
+    # for INT32->INT32 RESCALE operations.
+    return False
Original file line number	Diff line number	Diff line change
`@@ -102,6 +102,7 @@`
`102`	`102`	`QuantizeClampArgumentsPass,`
`103`	`103`	`)`
`104`	`104`	`from .fuse_batch_norm2d_pass import FuseBatchNorm2dPass # noqa`
	`105`	`+from .fuse_consecutive_rescales_pass import FuseConsecutiveRescalesPass # noqa`
`105`	`106`	`from .fuse_constant_ops_pass import ( # noqa`
`106`	`107`	`ComputeConstantOpsAOTPass,`
`107`	`108`	`FuseConstantArgsPass,`