psiddh
diff --git a/‎.ci/scripts/unittest-macos-cmake.sh‎
Lines changed: 13 additions & 2 deletions b/‎.ci/scripts/unittest-macos-cmake.sh‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎.github/scripts/propose_ghstack_orig_pr.py‎
Lines changed: 1 addition & 4 deletions b/‎.github/scripts/propose_ghstack_orig_pr.py‎
Lines changed: 1 addition & 4 deletions
diff --git a/‎.github/workflows/_unittest.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/_unittest.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/mlx.yml‎
Lines changed: 12 additions & 0 deletions b/‎.github/workflows/mlx.yml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/accumulate_index_put_pass.py‎
Lines changed: 5 additions & 3 deletions b/‎backends/arm/_passes/accumulate_index_put_pass.py‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎backends/arm/_passes/arm_pass.py‎
Lines changed: 98 additions & 1 deletion b/‎backends/arm/_passes/arm_pass.py‎
Lines changed: 98 additions & 1 deletion
diff --git a/‎backends/arm/_passes/canonicalize_gather_pass.py‎
Lines changed: 4 additions & 4 deletions b/‎backends/arm/_passes/canonicalize_gather_pass.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎backends/arm/_passes/conv1d_unsqueeze_pass.py‎
Lines changed: 4 additions & 3 deletions b/‎backends/arm/_passes/conv1d_unsqueeze_pass.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎backends/arm/_passes/convert_expand_copy_to_repeat.py‎
Lines changed: 4 additions & 3 deletions b/‎backends/arm/_passes/convert_expand_copy_to_repeat.py‎
Lines changed: 4 additions & 3 deletions
@@ -12,8 +12,19 @@ set -eux
 export TORCHINDUCTOR_CACHE_DIR="$(mktemp -d "${RUNNER_TEMP:-/tmp}/torchinductor_cache_XXXXXX")"
 trap 'rm -rf "${TORCHINDUCTOR_CACHE_DIR}"' EXIT
 
-# Run pytest with coverage
-${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml
+# TODO(SS-JIA): AOTI tests hang on macOS CI runners — the thread blocks in
+# native C/C++ code (dlopen / inductor compilation) so faulthandler cannot
+# even produce a traceback. Diagnosis ongoing in #19886.
+AOTI_SKIPS=(
+  --ignore=examples/models/llama3_2_vision/preprocess/test_preprocess.py
+  --ignore=examples/models/llama3_2_vision/vision_encoder/test/test_vision_encoder.py
+  --ignore=examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py
+  --deselect=extension/llm/modules/test/test_position_embeddings.py::TilePositionalEmbeddingTest::test_tile_positional_embedding_aoti
+  --deselect=extension/llm/modules/test/test_position_embeddings.py::TiledTokenPositionalEmbeddingTest::test_tiled_token_positional_embedding_aoti
+  --deselect=extension/llm/modules/test/test_attention.py::AttentionTest::test_attention_aoti
+)
+
+${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml "${AOTI_SKIPS[@]}"
 # Run gtest
 LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \
 ${CONDA_RUN} test/run_oss_cpp_tests.sh
@@ -52,12 +52,9 @@ def extract_stack_from_body(pr_body: str) -> List[int]:
     """
 
     prs = []
-    ghstack_begin = (
-        "Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):"
-    )
     ghstack_begin_seen = False
     for line in pr_body.splitlines():
-        if ghstack_begin in line:
+        if line.startswith("Stack from [ghstack]"):
             ghstack_begin_seen = True
         if not ghstack_begin_seen:
             continue
 
@@ -49,6 +49,7 @@ jobs:
       python-version: '3.11'
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 120
       script: |
         set -eux
         # This is needed to get the prebuilt PyTorch wheel from S3
 
@@ -80,6 +80,18 @@ jobs:
         ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_gated_delta_rule run -v
         echo "::endgroup::"
 
+        echo "::group::Run tq_norm op tests"
+        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq_norm run -v
+        echo "::endgroup::"
+
+        echo "::group::Run tq4_compress op tests"
+        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq4_compress run -v
+        echo "::endgroup::"
+
+        echo "::group::Run tq_dequant op tests"
+        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq_dequant run -v
+        echo "::endgroup::"
+
   test-mlx-qwen35-moe:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     with:
 
@@ -5,7 +5,7 @@
 
 
 from . import arm_pass_utils  # noqa
-from .arm_pass import ArmPass  # noqa  # usort: skip
+from .arm_pass import ArmOpTargetedPass, ArmPass  # noqa  # usort: skip
 from .accumulate_index_put_pass import AccumulateIndexPutPass  # noqa
 from .broadcast_args_pass import BroadcastArgsPass  # noqa
 from .canonicalize_gather_pass import CanonicalizeGatherPass  # noqa
 
@@ -6,7 +6,7 @@
 
 import torch
 
-from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes import ArmOpTargetedPass
 from executorch.backends.arm._passes.decompose_index_tensor_to_gather_pass import (
     DecomposeIndexTensorToGatherPass,
 )
@@ -32,7 +32,7 @@ def get_ops(op):
     raise RuntimeError(f"Can't get index_put decomposition for op {op}")
 
 
-class AccumulateIndexPutPass(ArmPass):
+class AccumulateIndexPutPass(ArmOpTargetedPass):
     """This pass adjusts the values arg when the accumulate arg is set to true
     for the index_put op.
     """
@@ -41,9 +41,11 @@ class AccumulateIndexPutPass(ArmPass):
         DecomposeIndexTensorToGatherPass,
         RewriteIndexPutPass,
     }
+    target_ops = aten_ops + edge_ops
+    check_allowed_to_transform = True
 
     def call_operator(self, op, args, kwargs, meta):
-        if op not in (aten_ops + edge_ops) or not self.allowed_to_transform(meta):
+        if op not in self.target_ops or not self.allowed_to_transform(meta):
             return super().call_operator(op, args, kwargs, meta)
 
         source, indices, values = args[:3]
 
@@ -7,14 +7,15 @@
 import copy
 import traceback
 from abc import abstractmethod
+from collections.abc import Collection
 from typing import Any, List, Optional, Set, Type
 
 import torch
 from executorch.backends.arm.constants import DISALLOW_TFA_META_KEY
 from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
-from torch.fx import GraphModule
+from torch.fx import GraphModule, Node
 from torch.fx.passes.infra.pass_base import PassResult
 from torch.utils import _pytree as pytree
 
@@ -191,3 +192,99 @@ def call_scalar(self, value: int | float, meta: NodeMetadata | dict[str, Any]):
             meta=meta,
             updated=True,
         )
+
+    def should_run_pass(self, graph_module: GraphModule) -> bool:
+        """Return whether this pass should run on the graph module.
+
+        Subclasses can override this to cheaply skip the pass before
+        ``call()`` starts the normal ``ExportPass`` retracing path.
+
+        Args:
+            graph_module (GraphModule): The graph module to inspect.
+
+        Returns:
+            bool: True when the pass should run.
+
+        """
+        return True  # base default: never skip; subclasses may override
+
+    def __call__(self, graph_module: GraphModule) -> PassResult | None:
+        self.requires(graph_module)
+        if not self.should_run_pass(graph_module):  # pre-check before call()
+            self.ensures(graph_module)  # ensures() still runs on the skip path
+            return PassResult(graph_module, False)  # skipped: modified=False
+        res = self.call(graph_module)
+        self.ensures(graph_module)
+        return res
+
+
+class ArmOpTargetedPass(ArmPass):
+    """Base class for passes that only transform selected operators.
+
+    Subclasses set ``target_ops`` to the call_function targets they can
+    transform. If the current graph and nested control-flow subgraphs do not
+    contain any target, the pass returns immediately without paying the default
+    ExportPass retracing cost.
+
+    Set ``check_allowed_to_transform`` to ``True`` when the target pre-scan
+    should also apply ``allowed_to_transform()`` to matching target nodes. This
+    is useful for TFA passes whose ``call_operator()`` leaves disallowed target
+    nodes unchanged. If all matching targets are disallowed, the pass can
+    return before entering the normal ``ExportPass`` path.
+
+    """
+
+    target_ops: Collection[Any] = ()  # empty default: nothing matches, pass is skipped
+    check_allowed_to_transform = False  # opt-in: pre-scan also calls allowed_to_transform()
+
+    def has_target_node(self, graph_module: GraphModule) -> bool:
+        """Return whether the graph module tree contains a target node.
+
+        Args:
+            graph_module (GraphModule): The graph module tree to inspect.
+
+        Returns:
+            bool: True if a matching call_function node is present.
+
+        """
+        visited_graph_modules = set()  # id()s of modules already scanned
+
+        def target_node_can_trigger_pass(node: Node) -> bool:
+            if not self.check_allowed_to_transform:  # no gating: any match counts
+                return True
+            if self.allowed_to_transform(node.meta):
+                return True
+            return False
+
+        def graph_has_target(module: GraphModule) -> bool:
+            if id(module) in visited_graph_modules:  # scan each module only once
+                return False
+            visited_graph_modules.add(id(module))
+
+            for target in self.target_ops:
+                for node in module.graph.find_nodes(
+                    op="call_function",
+                    target=target,
+                    sort=False,  # existence check only; node order is irrelevant
+                ):
+                    if target_node_can_trigger_pass(node):  # disallowed matches are ignored
+                        return True
+
+            return any(  # no hit here: recurse into child GraphModules
+                isinstance(child, GraphModule) and graph_has_target(child)
+                for child in module.children()
+            )
+
+        return graph_has_target(graph_module)
+
+    def should_run_pass(self, graph_module: GraphModule) -> bool:
+        """Return whether this pass has a target node to transform.
+
+        Args:
+            graph_module (GraphModule): The graph module tree to inspect.
+
+        Returns:
+            bool: True when a matching target node is present.
+
+        """
+        return self.has_target_node(graph_module)  # result honours check_allowed_to_transform
@@ -6,12 +6,12 @@
 from typing import Set, Type
 
 import torch
-from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes import ArmOpTargetedPass
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
 
 
-class CanonicalizeGatherPass(ArmPass):
+class CanonicalizeGatherPass(ArmOpTargetedPass):
     """Canonicalize gather so it can be lowered to TOSA.GATHER via the backend
     dialect.
 
@@ -40,10 +40,10 @@ class CanonicalizeGatherPass(ArmPass):
 
     _passes_required_after: Set[Type[ExportPass]] = set()
 
-    _TARGET_OPS = {exir_ops.edge.aten.gather.default}
+    target_ops = {exir_ops.edge.aten.gather.default}
 
     def call_operator(self, op, args, kwargs, meta):
-        if op not in self._TARGET_OPS:
+        if op not in self.target_ops:
             return super().call_operator(op, args, kwargs, meta)
 
         # edge.aten.gather.default: (x, dim, index) with kw-only sparse_grad
 
@@ -8,7 +8,7 @@
 
 from typing import Set, Type
 
-from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes import ArmOpTargetedPass
 
 from executorch.backends.arm._passes.rewrite_conv_pass import RewriteConvPass
 from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass
@@ -17,7 +17,7 @@
 from executorch.exir.pass_base import ExportPass
 
 
-class Conv1dUnsqueezePass(ArmPass):
+class Conv1dUnsqueezePass(ArmOpTargetedPass):
     """This pass is used to change conv1d ops into conv2d since TOSA only
     supports 2d and 3d convolution.
 
@@ -34,9 +34,10 @@ class Conv1dUnsqueezePass(ArmPass):
         RewriteConvPass,
         SizeAdjustInputPass,
     }
+    target_ops = (exir_ops.edge.aten.convolution.default,)
 
     def call_operator(self, op, args, kwargs, meta):
-        if op != exir_ops.edge.aten.convolution.default:
+        if op not in self.target_ops:
             return super().call_operator(op, args, kwargs, meta)
         stride = list(args[3])
         if len(stride) != 1:
 
@@ -9,7 +9,7 @@
 
 import torch
 
-from executorch.backends.arm._passes.arm_pass import ArmPass
+from executorch.backends.arm._passes.arm_pass import ArmOpTargetedPass
 from executorch.backends.arm._passes.unsqueeze_before_repeat_pass import (
     UnsqueezeBeforeRepeatPass,
 )
@@ -51,7 +51,7 @@ def calculate_multiples(args):
     return multiples, expanded_rank != len(input_shape)
 
 
-class ConvertExpandCopyToRepeatPass(ArmPass):
+class ConvertExpandCopyToRepeatPass(ArmOpTargetedPass):
     """Replace expand copy with repeat since it is a repeat that can only repeat
     singleton dimensions.
     """
@@ -60,9 +60,10 @@ class ConvertExpandCopyToRepeatPass(ArmPass):
 
     expand_copy = exir_ops.edge.aten.expand_copy.default
     repeat = exir_ops.edge.aten.repeat.default
+    target_ops = (expand_copy,)
 
     def call_operator(self, op, args, kwargs, meta):
-        if op != self.expand_copy:
+        if op not in self.target_ops:
             return super().call_operator(op, args, kwargs, meta)
 
         multiples, changes_rank = calculate_multiples(args)