pytorch
diff --git a/.ci/scripts/test_cortex_m_e2e.sh b/.ci/scripts/test_cortex_m_e2e.sh
Lines changed: 2 additions & 2 deletions
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
Lines changed: 4 additions & 0 deletions
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
Lines changed: 28 additions & 3 deletions
diff --git a/Makefile b/Makefile
Lines changed: 11 additions & 1 deletion
diff --git a/backends/aoti/slim/core/slim_tensor.h b/backends/aoti/slim/core/slim_tensor.h
Lines changed: 34 additions & 14 deletions
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
Lines changed: 0 additions & 7 deletions
diff --git a/backends/arm/_passes/decompose_gru_pass.py b/backends/arm/_passes/decompose_gru_pass.py
Lines changed: 2 additions & 2 deletions
diff --git a/backends/arm/_passes/decompose_lstm_pass.py b/backends/arm/_passes/decompose_lstm_pass.py
Lines changed: 2 additions & 2 deletions
diff --git a/backends/arm/_passes/decompose_rnn_pass.py b/backends/arm/_passes/decompose_rnn_pass.py
Lines changed: 2 additions & 2 deletions
diff --git a/backends/arm/_passes/decompose_sum_pass.py b/backends/arm/_passes/decompose_sum_pass.py
Lines changed: 1 addition & 1 deletion
@@ -17,9 +17,9 @@ MODEL=$1
 script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
 et_root_dir=$(realpath "${script_dir}/../..")
 
-# Quantization is the default for the cortex-m55+int8 target; run.sh's
+# Quantization is the default for the cortex-m55 target; run.sh's
 # arg parser only recognizes --no_quantize, so we omit any explicit flag.
 bash "${et_root_dir}/examples/arm/run.sh" \
     --model_name="${MODEL}" \
-    --target=cortex-m55+int8 \
+    --target=cortex-m55 \
     --bundleio
@@ -148,6 +148,10 @@ jobs:
         # Run Qwen 3.5 MoE tests (quantize roundtrip + TurboQuant KV cache + sampler)
         python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py examples/models/qwen3_5_moe/test_sampler.py -v -o "addopts="
 
+        # Run Gemma 4 31B tests (quant unit tests + pipeline integration tests)
+        pip install gguf
+        python -m pytest examples/models/gemma4_31b/quant/tests/ examples/models/gemma4_31b/tests/ -v -o "addopts="
+
   export-model-cuda-artifact:
     name: export-model-cuda-artifact
     # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
 
@@ -501,12 +501,26 @@ jobs:
             name: "gemma3-1b"
         use-custom: [false, true]
         qconfig: ["4w", "nvfp4"]
+        runner: ["macos-14-xlarge"]
+        include:
+          - model:
+              id: "google/gemma-4-E2B-it"
+              name: "gemma4-e2b"
+            use-custom: true
+            qconfig: "4w"
+            runner: "macos-15-xlarge"
+          - model:
+              id: "google/gemma-4-E2B-it"
+              name: "gemma4-e2b"
+            use-custom: false
+            qconfig: "4w"
+            runner: "macos-15-xlarge"
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:
       default-packages: ""
       job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}-${{ matrix.qconfig }}
-      runner: macos-14-xlarge
+      runner: ${{ matrix.runner }}
       python-version: "3.12"
       submodules: recursive
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -521,12 +535,16 @@ jobs:
         MODEL_NAME="${{ matrix.model.name }}"
         USE_CUSTOM="${{ matrix.use-custom }}"
         QCONFIG="${{ matrix.qconfig }}"
-
         CUSTOM_ARGS=""
         if [ "${USE_CUSTOM}" = "true" ]; then
           CUSTOM_ARGS="--use-custom-sdpa --use-custom-kv-cache"
         fi
 
+        QEMBEDDING_ARGS="--qembedding ${QCONFIG}"
+        if [ "${MODEL_ID}" = "google/gemma-4-E2B-it" ]; then
+          QEMBEDDING_ARGS=""
+        fi
+
         echo "::group::Install ExecuTorch and configure MLX build"
         ${CONDA_RUN} python install_executorch.py > /dev/null
         ${CONDA_RUN} cmake --preset mlx-release
@@ -537,6 +555,13 @@ jobs:
         ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         ${CONDA_RUN} pip install transformers "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
+        if [ "${MODEL_ID}" = "google/gemma-4-E2B-it" ]; then
+          # Gemma 4 requires a newer Transformers build than the CI-wide
+          # optimum-executorch pin currently brings in. Keep this pinned to the
+          # locally validated commit instead of floating on Transformers HEAD.
+          GEMMA4_TRANSFORMERS_COMMIT=61461a7bcb458db7cf6eeea49678b9ab776a7821
+          ${CONDA_RUN} pip install -U "transformers @ git+https://github.com/huggingface/transformers.git@${GEMMA4_TRANSFORMERS_COMMIT}"
+        fi
         echo "::endgroup::"
 
         ${CONDA_RUN} pip list
@@ -546,7 +571,7 @@ jobs:
           --model-id "${MODEL_ID}" \
           --output /tmp/${MODEL_NAME}.pte \
           --qlinear ${QCONFIG} \
-          --qembedding ${QCONFIG} \
+          ${QEMBEDDING_ARGS} \
           ${CUSTOM_ARGS}
         echo "::endgroup::"
 
 
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda qwen3_5_moe-metal clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -126,6 +126,7 @@ help:
 	@echo "  llava-cpu           - Build Llava runner with CPU backend"
 	@echo "  gemma3-cuda         - Build Gemma3 runner with CUDA backend"
 	@echo "  gemma3-cpu          - Build Gemma3 runner with CPU backend"
+	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner with CUDA backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
 	@echo "  clean               - Clean build artifacts"
@@ -425,6 +426,15 @@ qwen3_5_moe-cuda:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
 
+gemma4_31b-cuda:
+	@echo "==> Building and installing ExecuTorch with CUDA..."
+	cmake --workflow --preset llm-release-cuda
+	@echo "==> Building Gemma 4 31B runner with CUDA..."
+	cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+
 qwen3_5_moe-metal:
 	@echo "==> Building and installing ExecuTorch with Metal..."
 	cmake --workflow --preset llm-release-metal
 
@@ -433,13 +433,19 @@ class SlimTensor {
   /**
    * Copy data from another tensor to this tensor.
    *
-   * Both tensors must have the same numel and dtype.
-   * Currently only supports CPU-to-CPU copy (contiguous tensors only).
+   * Both tensors must have the same numel, sizes and dtype.
    *
    * @param other The source tensor to copy from
    * @return Reference to this tensor
    */
   SlimTensor& copy_(const SlimTensor& other) {
+    ET_CHECK_MSG(
+        this->dim() == other.dim(),
+        "copy_: dim of tensors must match (%zu vs %zu)",
+        this->dim(),
+        other.dim());
+    ET_CHECK_MSG(
+        this->sizes() == other.sizes(), "copy_: sizes of tensors must match");
     ET_CHECK_MSG(
         this->numel() == other.numel(), "copy_: numel of tensors must match");
     ET_CHECK_MSG(this->dtype() == other.dtype(), "copy_: dtype must match");
@@ -463,29 +469,43 @@ class SlimTensor {
 
     std::vector<int64_t> counter(this->dim(), 0);
     for (size_t i = 0; i < this->numel(); i++) {
-      // Compute src offset in elements
       int64_t src_offset = 0;
-      for (size_t d = 0; d < other.dim(); d++) {
-        src_offset += counter[d] * other.stride(d);
-      }
-
-      // Compute dst offset in elements
       int64_t dst_offset = 0;
       for (size_t d = 0; d < this->dim(); d++) {
-        dst_offset += counter[d] * this->stride(d);
+        int64_t src_term = 0;
+        int64_t dst_term = 0;
+        // src_offset = src_offset + counter[d] * other.stride(d)
+        // dst_offset = dst_offset + counter[d] * this->stride(d)
+        ET_CHECK_MSG(
+            !::c10::mul_overflows(counter[d], other.stride(d), &src_term) &&
+                !::c10::add_overflows(src_offset, src_term, &src_offset) &&
+                !::c10::mul_overflows(counter[d], this->stride(d), &dst_term) &&
+                !::c10::add_overflows(dst_offset, dst_term, &dst_offset),
+            "copy_: offset computation overflow");
       }
+      size_t src_byte_offset = 0;
+      size_t dst_byte_offset = 0;
+      // src_byte_offset = src_offset * elem_size
+      // dst_byte_offset = dst_offset * elem_size
+      ET_CHECK_MSG(
+          src_offset >= 0 && dst_offset >= 0 &&
+              !::c10::mul_overflows(
+                  static_cast<size_t>(src_offset),
+                  elem_size,
+                  &src_byte_offset) &&
+              !::c10::mul_overflows(
+                  static_cast<size_t>(dst_offset), elem_size, &dst_byte_offset),
+          "copy_: byte offset overflow");
 
       // Copy elem_size bytes from src to dst
       if (this->device().is_cpu() && other.device().is_cpu()) {
         std::memcpy(
-            dst_data + dst_offset * elem_size,
-            src_data + src_offset * elem_size,
-            elem_size);
+            dst_data + dst_byte_offset, src_data + src_byte_offset, elem_size);
       } else if (this->device().is_cuda() || other.device().is_cuda()) {
 #if defined(CUDA_AVAILABLE)
         DeviceTraits<c10::DeviceType::CUDA>::memcpy(
-            dst_data + dst_offset * elem_size,
-            src_data + src_offset * elem_size,
+            dst_data + dst_byte_offset,
+            src_data + src_byte_offset,
             elem_size,
             device(), // dst device
             other.device() // src device
 
@@ -150,7 +150,6 @@
 from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
 from executorch.backends.arm.common.pipeline_config import (
     ArmPassPipelineConfig,
-    FuseDuplicateUsersConfig,
     SoftmaxDecompositionConfig,
 )
 from executorch.backends.arm.tosa.specification import (
@@ -238,9 +237,6 @@ def configure_skip_passes(
             case SoftmaxDecompositionConfig.STABLE:
                 skip_set.add(DecomposeMaskedFillPass)
 
-        if config.fuse_duplicate_users is FuseDuplicateUsersConfig.DISABLED:
-            skip_set.add(FuseDuplicateUsersPass)
-
         self._skip_pass_types = tuple(skip_set)
         skip_names = [skipped_pass.__name__ for skipped_pass in self._skip_pass_types]
         logger.debug(f"Passes in skip list: {skip_names}")
@@ -403,9 +399,6 @@ def _tosa_pipeline(
                 ConvertToClampPass(),
                 DecomposeTOSAUnsupportedClampPass(),
                 DecomposeGroupNormPass(),
-                DecomposeGruPass(),
-                DecomposeLstmPass(),
-                DecomposeRnnPass(),
                 DecomposeLayerNormPass(),
                 DecomposeVarPass(),
                 DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec),
 
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -13,7 +14,6 @@
     create_node,
     get_getitem_users,
 )
-from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass
 from executorch.exir.pass_base import ExportPass, PassResult
 
 
@@ -34,7 +34,7 @@ class DecomposeGruPass(ArmPass):
 
     """
 
-    _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass}
+    _passes_required_after: Set[Type[ExportPass]] = set()
 
     _TARGET = torch.ops.aten.gru.input
 
 
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -13,7 +14,6 @@
     create_node,
     get_getitem_users,
 )
-from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass
 from executorch.exir.pass_base import ExportPass, PassResult
 
 
@@ -36,7 +36,7 @@ class DecomposeLstmPass(ArmPass):
 
     """
 
-    _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass}
+    _passes_required_after: Set[Type[ExportPass]] = set()
 
     _TARGET = torch.ops.aten.lstm.input
 
 
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -13,7 +14,6 @@
     create_node,
     get_getitem_users,
 )
-from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass
 from executorch.exir.pass_base import ExportPass, PassResult
 
 
@@ -30,7 +30,7 @@ class DecomposeRnnPass(ArmPass):
 
     """
 
-    _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass}
+    _passes_required_after: Set[Type[ExportPass]] = set()
 
     _TARGETS = {
         torch.ops.aten.rnn_tanh.input,
 
@@ -78,7 +78,7 @@ def call_operator(self, op, args, kwargs, meta):
         for dim in dims:
             input_node = super().call_operator(
                 sum_op,
-                (input_node, dim, True),
+                (input_node, [dim], True),
                 kwargs,
                 meta,
                 updated=True,