pytorch
diff --git a/‎.github/scripts/docathon-label-sync.py‎
Lines changed: 1 addition & 1 deletion b/‎.github/scripts/docathon-label-sync.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/cuda.yml‎
Lines changed: 4 additions & 0 deletions b/‎.github/workflows/cuda.yml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎CONTRIBUTING.md‎
Lines changed: 19 additions & 0 deletions b/‎CONTRIBUTING.md‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 11 additions & 1 deletion b/‎Makefile‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎backends/arm/README.md‎
Lines changed: 8 additions & 3 deletions b/‎backends/arm/README.md‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎backends/arm/_passes/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 3 additions & 5 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 3 additions & 5 deletions
diff --git a/‎backends/arm/_passes/fuse_constant_ops_pass.py‎
Lines changed: 0 additions & 1 deletion b/‎backends/arm/_passes/fuse_constant_ops_pass.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎backends/arm/_passes/insert_table_ops.py‎
Lines changed: 2 additions & 1 deletion b/‎backends/arm/_passes/insert_table_ops.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/arm/_passes/remove_permutes_around_elementwise_tosa_ops.py‎
Lines changed: 33 additions & 0 deletions b/‎backends/arm/_passes/remove_permutes_around_elementwise_tosa_ops.py‎
Lines changed: 33 additions & 0 deletions
@@ -9,7 +9,7 @@ def main() -> None:
     token = os.environ.get("GITHUB_TOKEN")
 
     repo_owner = "pytorch"
-    repo_name = "pytorch"
+    repo_name = "executorch"
     pull_request_number = int(sys.argv[1])
 
     g = Github(token)
 
@@ -148,6 +148,10 @@ jobs:
         # Run Qwen 3.5 MoE tests (quantize roundtrip + TurboQuant KV cache + sampler)
         python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py examples/models/qwen3_5_moe/test_sampler.py -v -o "addopts="
 
+        # Run Gemma 4 31B tests (quant unit tests + pipeline integration tests)
+        pip install gguf
+        python -m pytest examples/models/gemma4_31b/quant/tests/ examples/models/gemma4_31b/tests/ -v -o "addopts="
+
   export-model-cuda-artifact:
     name: export-model-cuda-artifact
     # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
 
@@ -321,6 +321,25 @@ CI is run automatically on all pull requests. However, if you want to run tests
 - The `test/run_oss_cpp_tests.sh` script will build and run C++ tests locally
 - Running `pytest` from the root directory will run Python tests locally. Make sure to run this after finishing [Dev Install](#dev-install).
 
+To build C++ tests manually with CMake, run the following from the repository root:
+
+```bash
+cmake . -Bcmake-out -DCMAKE_INSTALL_PREFIX=cmake-out -DEXECUTORCH_BUILD_TESTS=ON
+cmake --build cmake-out -j9 --target install
+```
+
+You can then use `ctest` to list or run individual C++ tests directly:
+
+```bash
+ctest --test-dir cmake-out -N
+ctest --test-dir cmake-out -R <test_name_regex> --output-on-failure
+```
+
+This workflow is useful when you want to rerun one test, attach a debugger to a
+test binary under `cmake-out`, or keep a build directory around for quick rebuild
+cycles. Add the same `-DEXECUTORCH_BUILD_*` options used by
+`test/run_oss_cpp_tests.sh` when the test needs optional kernels or extensions.
+
 ### Writing Tests
 To help keep code quality high, ExecuTorch uses a combination of unit tests and
 end-to-end (e2e) tests. If you add a new feature or fix a bug, please add tests
 
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda qwen3_5_moe-metal clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -126,6 +126,7 @@ help:
 	@echo "  llava-cpu           - Build Llava runner with CPU backend"
 	@echo "  gemma3-cuda         - Build Gemma3 runner with CUDA backend"
 	@echo "  gemma3-cpu          - Build Gemma3 runner with CPU backend"
+	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner with CUDA backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
 	@echo "  clean               - Clean build artifacts"
@@ -425,6 +426,15 @@ qwen3_5_moe-cuda:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
 
+# Runs the llm-release-cuda CMake workflow preset from the repository root, then the gemma4-31b-cuda workflow preset from examples/models/gemma4_31b.
+gemma4_31b-cuda:
+	@echo "==> Building and installing ExecuTorch with CUDA..."
+	cmake --workflow --preset llm-release-cuda
+	@echo "==> Building Gemma 4 31B runner with CUDA..."
+	cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+
 qwen3_5_moe-metal:
 	@echo "==> Building and installing ExecuTorch with Metal..."
 	cmake --workflow --preset llm-release-metal
 
@@ -249,10 +249,15 @@ Some tests, with `u55`, `u85` and `vgf` in the name require external dependencie
   ```
 
 In addition, some model tests in the Arm backend require third-party libraries or packages.
-To run these tests, you need to install the required dependencies by running the script `examples/arm/setup.sh` with the flag `--setup-test-dependency`.
+To run these tests, install the required dependencies directly:
 
-Please note that installing model test dependencies is a standalone process. When using the `--setup-test-dependency` flag,
-the script will install only the necessary dependencies for model tests, skipping all other setup procedures.
+```bash
+bash backends/arm/scripts/install_models_for_test.sh
+```
+
+Installing model test dependencies is a standalone process. The script installs
+only the dependencies needed for model tests, skipping all other setup
+procedures.
 
 ## Using git hooks
 
 
@@ -140,6 +140,9 @@
 from .remove_getitem_pass import RemoveGetItemPass  # noqa
 from .remove_graph_asserts_pass import RemoveGraphAssertsPass  # noqa
 from .remove_noop_pass import RemoveNoopPass  # noqa
+from .remove_permutes_around_elementwise_tosa_ops import (  # noqa
+    RemovePermutesAroundElementwiseTosaOps,
+)
 from .replace_scalar_with_tensor_pass import (  # noqa
     ReplaceScalarWithTensorByProfilePass,
 )
 
@@ -125,6 +125,7 @@
     RemoveGetItemPass,
     RemoveGraphAssertsPass,
     RemoveNoopPass,
+    RemovePermutesAroundElementwiseTosaOps,
     ReplaceInfAndLimitValuesPass,
     ReplaceScalarWithTensorByProfilePass,
     RewriteAvgPool2dPass,
@@ -164,9 +165,6 @@
     PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView,
 )
 
-from executorch.backends.transforms.remove_permutes_around_elementwise_ops import (
-    RemovePermutesAroundElementwiseOps,
-)
 from executorch.exir import ExportedProgram
 from executorch.exir.pass_base import ExportPass
 from executorch.exir.pass_manager import PassManager
@@ -523,6 +521,7 @@ def _tosa_pipeline(
                 DecomposeSumPass(),
                 InsertTableOpsPass(exported_program),
                 RemoveNoopPass(),
+                InsertDataLayoutCastsPass(),
             ]
         )
 
@@ -535,7 +534,7 @@ def _tosa_pipeline(
                 RewriteMatmulPass(),
                 RewritePadPass(),
                 FuseViewCopyTransformPass(),
-                RemovePermutesAroundElementwiseOps(),
+                RemovePermutesAroundElementwiseTosaOps(),
                 PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView(),
                 FuseCascadedTransposeOrPermuteOps(),
                 ConvertPermuteSingletonToViewPass(),
@@ -555,7 +554,6 @@ def _tosa_pipeline(
                 EnsureUniqueOutputNodesPass(),
                 RemoveNoopPass(),
                 InsertRescalePass(),
-                InsertDataLayoutCastsPass(),
             ]
         )
 
 
@@ -204,7 +204,6 @@ def call(self, graph_module):
                         f"{[input_node.name for input_node in input_nodes]}"
                     )
                     modified |= did_fuse
-                    graph_module.recompile()  # Recompile needed to catch chains of constant ops
                     input_nodes_to_maybe_delete.update(input_nodes)
             except Exception as e:
                 logger.warning(
 
@@ -278,11 +278,12 @@ def call(self, graph_module: GraphModule) -> PassResult:
                     out_quantargs=output_qparams[0],
                 )
                 # Register buffer in self.exported_program.state_dict
+                # The b_ prefix is needed for this node to be recognized as a constant in RemovePermutesAroundElementwiseOps
                 const_table_node = create_constant_placeholder(
                     exp_program=self.exported_program,
                     graph=node.graph,
                     kind=InputKind.BUFFER,
-                    name=node.name + "_table_constant",
+                    name="b_" + node.name + "_table_constant",
                     data=buffer,
                     persistent_buffer=True,
                 )
 
@@ -0,0 +1,33 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.arm._passes.insert_table_ops import TableOps
+from executorch.backends.transforms.remove_permutes_around_elementwise_ops import (
+    RemovePermutesAroundElementwiseOps,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+class RemovePermutesAroundElementwiseTosaOps(RemovePermutesAroundElementwiseOps):
+    """Arm/TOSA flavour of ``RemovePermutesAroundElementwiseOps``.
+
+    Widens the set of permutable ops with the table ops listed in ``TableOps``
+    and with the ``tosa.RESCALE`` and ``tosa.TABLE`` backend ops.
+    """
+
+    # Base-class op set plus the TableOps entries and the two TOSA backend ops.
+    permutable_ops = set().union(
+        RemovePermutesAroundElementwiseOps.permutable_ops,
+        TableOps.unary_table_ops.keys(),
+        TableOps.special_table_ops,
+        (
+            exir_ops.backend.tosa.RESCALE.default,
+            exir_ops.backend.tosa.TABLE.default,
+        ),
+    )
+
+    def permute_subgraph(self, subgraph):
+        """Filter out constant edges into ``tosa.TABLE`` nodes, then delegate.
+
+        ``subgraph.constant_edges_in`` is replaced in place with the filtered
+        set before the base-class ``permute_subgraph`` is invoked.
+        """
+        # The base implementation always permutes constant inputs, which is
+        # wrong for table ops, so those edges are hidden from it.
+        table_op = exir_ops.backend.tosa.TABLE.default
+        subgraph.constant_edges_in = {
+            (const_node, user_node)
+            for const_node, user_node in subgraph.constant_edges_in
+            if user_node.target != table_op
+        }
+        super().permute_subgraph(subgraph)
Original file line number	Diff line number	Diff line change
`@@ -204,7 +204,6 @@ def call(self, graph_module):`
`204`	`204`	`f"{[input_node.name for input_node in input_nodes]}"`
`205`	`205`	`)`
`206`	`206`	`modified \|= did_fuse`
`207`		`- graph_module.recompile() # Recompile needed to catch chains of constant ops`
`208`	`207`	`input_nodes_to_maybe_delete.update(input_nodes)`
`209`	`208`	`except Exception as e:`
`210`	`209`	`logger.warning(`
Original file line number	Diff line number	Diff line change
`@@ -278,11 +278,12 @@ def call(self, graph_module: GraphModule) -> PassResult:`
`278`	`278`	`out_quantargs=output_qparams[0],`
`279`	`279`	`)`
`280`	`280`	`# Register buffer in self.exported_program.state_dict`
	`281`	`+ # b_ prefix is important to be recognized as a constant in RemovePermutesAroundElementwiseOps`
`281`	`282`	`const_table_node = create_constant_placeholder(`
`282`	`283`	`exp_program=self.exported_program,`
`283`	`284`	`graph=node.graph,`
`284`	`285`	`kind=InputKind.BUFFER,`
`285`		`- name=node.name + "_table_constant",`
	`286`	`+ name="b_" + node.name + "_table_constant",`
`286`	`287`	`data=buffer,`
`287`	`288`	`persistent_buffer=True,`
`288`	`289`	`)`