Skip to content

Commit 8ea7387

Browse files
authored
Merge branch 'main' into baris_mletorch-1945-Part2_2_vgf_integration
2 parents 1020f0f + 3a9230d commit 8ea7387

231 files changed

Lines changed: 15898 additions & 1382 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.ci/scripts/test_cortex_m_e2e.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@ MODEL=$1
1717
script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
1818
et_root_dir=$(realpath "${script_dir}/../..")
1919

20-
# Quantization is the default for the cortex-m55+int8 target; run.sh's
20+
# Quantization is the default for the cortex-m55 target; run.sh's
2121
# arg parser only recognizes --no_quantize, so we omit any explicit flag.
2222
bash "${et_root_dir}/examples/arm/run.sh" \
2323
--model_name="${MODEL}" \
24-
--target=cortex-m55+int8 \
24+
--target=cortex-m55 \
2525
--bundleio

.github/workflows/cuda.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,10 @@ jobs:
148148
# Run Qwen 3.5 MoE tests (quantize roundtrip + TurboQuant KV cache + sampler)
149149
python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py examples/models/qwen3_5_moe/test_sampler.py -v -o "addopts="
150150
151+
# Run Gemma 4 31B tests (quant unit tests + pipeline integration tests)
152+
pip install gguf
153+
python -m pytest examples/models/gemma4_31b/quant/tests/ examples/models/gemma4_31b/tests/ -v -o "addopts="
154+
151155
export-model-cuda-artifact:
152156
name: export-model-cuda-artifact
153157
# Skip this job if the pull request is from a fork (HuggingFace secrets are not available)

.github/workflows/mlx.yml

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -501,12 +501,26 @@ jobs:
501501
name: "gemma3-1b"
502502
use-custom: [false, true]
503503
qconfig: ["4w", "nvfp4"]
504+
runner: ["macos-14-xlarge"]
505+
include:
506+
- model:
507+
id: "google/gemma-4-E2B-it"
508+
name: "gemma4-e2b"
509+
use-custom: true
510+
qconfig: "4w"
511+
runner: "macos-15-xlarge"
512+
- model:
513+
id: "google/gemma-4-E2B-it"
514+
name: "gemma4-e2b"
515+
use-custom: false
516+
qconfig: "4w"
517+
runner: "macos-15-xlarge"
504518
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
505519
secrets: inherit
506520
with:
507521
default-packages: ""
508522
job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}-${{ matrix.qconfig }}
509-
runner: macos-14-xlarge
523+
runner: ${{ matrix.runner }}
510524
python-version: "3.12"
511525
submodules: recursive
512526
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -521,12 +535,16 @@ jobs:
521535
MODEL_NAME="${{ matrix.model.name }}"
522536
USE_CUSTOM="${{ matrix.use-custom }}"
523537
QCONFIG="${{ matrix.qconfig }}"
524-
525538
CUSTOM_ARGS=""
526539
if [ "${USE_CUSTOM}" = "true" ]; then
527540
CUSTOM_ARGS="--use-custom-sdpa --use-custom-kv-cache"
528541
fi
529542
543+
QEMBEDDING_ARGS="--qembedding ${QCONFIG}"
544+
if [ "${MODEL_ID}" = "google/gemma-4-E2B-it" ]; then
545+
QEMBEDDING_ARGS=""
546+
fi
547+
530548
echo "::group::Install ExecuTorch and configure MLX build"
531549
${CONDA_RUN} python install_executorch.py > /dev/null
532550
${CONDA_RUN} cmake --preset mlx-release
@@ -537,6 +555,13 @@ jobs:
537555
${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
538556
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
539557
${CONDA_RUN} pip install transformers "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
558+
if [ "${MODEL_ID}" = "google/gemma-4-E2B-it" ]; then
559+
# Gemma 4 requires a newer Transformers build than the CI-wide
560+
# optimum-executorch pin currently brings in. Keep this pinned to the
561+
# locally validated commit instead of floating on Transformers HEAD.
562+
GEMMA4_TRANSFORMERS_COMMIT=61461a7bcb458db7cf6eeea49678b9ab776a7821
563+
${CONDA_RUN} pip install -U "transformers @ git+https://github.com/huggingface/transformers.git@${GEMMA4_TRANSFORMERS_COMMIT}"
564+
fi
540565
echo "::endgroup::"
541566
542567
${CONDA_RUN} pip list
@@ -546,7 +571,7 @@ jobs:
546571
--model-id "${MODEL_ID}" \
547572
--output /tmp/${MODEL_NAME}.pte \
548573
--qlinear ${QCONFIG} \
549-
--qembedding ${QCONFIG} \
574+
${QEMBEDDING_ARGS} \
550575
${CUSTOM_ARGS}
551576
echo "::endgroup::"
552577

.github/workflows/pull.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1489,11 +1489,11 @@ jobs:
14891489
.ci/scripts/setup-linux.sh --build-tool "cmake"
14901490
14911491
# Custom operator tests
1492-
PYTHON_EXECUTABLE=python bash backends/vulkan/test/custom_ops/build_and_run.sh add
1493-
./cmake-out/backends/vulkan/test/custom_ops/q8csw_linear
1494-
./cmake-out/backends/vulkan/test/custom_ops/q8csw_conv2d
1495-
./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear
1496-
./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row
1492+
PYTHON_EXECUTABLE=python bash backends/vulkan/test/custom_ops/build_and_run.sh test_add
1493+
./cmake-out/backends/vulkan/test/custom_ops/test_q8csw_linear
1494+
./cmake-out/backends/vulkan/test/custom_ops/test_q8csw_conv2d
1495+
./cmake-out/backends/vulkan/test/custom_ops/test_q4gsw_linear
1496+
./cmake-out/backends/vulkan/test/custom_ops/test_choose_qparams_per_row
14971497
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq
14981498
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_clone
14991499
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_binary

.github/workflows/trunk.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,11 @@ jobs:
380380
381381
ARM_TEST=${{ matrix.test_arm_baremetal }}
382382
383+
# Output test report on pytest runs so that GitHub can surface failing tests.
384+
if [[ -n "${RUNNER_TEST_RESULTS_DIR:-}" ]]; then
385+
export PYTEST_ADDOPTS="--junit-xml=${RUNNER_TEST_RESULTS_DIR}/${ARM_TEST}.xml ${PYTEST_ADDOPTS:-}"
386+
fi
387+
383388
# Test test_arm_baremetal.sh with test
384389
backends/arm/test/test_arm_baremetal.sh "${ARM_TEST}"
385390
@@ -415,6 +420,11 @@ jobs:
415420
416421
ARM_TEST=${{ matrix.test_arm_baremetal }}
417422
423+
# Output test report on pytest runs so that GitHub can surface failing tests.
424+
if [[ -n "${RUNNER_TEST_RESULTS_DIR:-}" ]]; then
425+
export PYTEST_ADDOPTS="--junit-xml=${RUNNER_TEST_RESULTS_DIR}/${ARM_TEST}.xml ${PYTEST_ADDOPTS:-}"
426+
fi
427+
418428
backends/arm/test/test_arm_baremetal.sh "${ARM_TEST}"
419429
420430
test-arm-ootb-linux:

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,9 @@ if(NOT EXECUTORCH_ENABLE_PROGRAM_VERIFICATION)
189189
add_definitions(-DET_ENABLE_PROGRAM_VERIFICATION=0)
190190
endif()
191191

192+
# Disable the deprecated constant_buffer path.
193+
add_definitions(-DET_ENABLE_DEPRECATED_CONSTANT_BUFFER=0)
194+
192195
if(EXECUTORCH_ENABLE_EVENT_TRACER)
193196
add_definitions(-DET_EVENT_TRACER_ENABLED)
194197
endif()

Makefile

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
#
9292
# ==============================================================================
9393

94-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda qwen3_5_moe-metal clean help
94+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help
9595

9696
help:
9797
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -126,6 +126,7 @@ help:
126126
@echo " llava-cpu - Build Llava runner with CPU backend"
127127
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
128128
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
129+
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner with CUDA backend"
129130
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
130131
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
131132
@echo " clean - Clean build artifacts"
@@ -425,6 +426,15 @@ qwen3_5_moe-cuda:
425426
@echo "✓ Build complete!"
426427
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
427428

429+
gemma4_31b-cuda:
430+
@echo "==> Building and installing ExecuTorch with CUDA..."
431+
cmake --workflow --preset llm-release-cuda
432+
@echo "==> Building Gemma 4 31B runner with CUDA..."
433+
cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda
434+
@echo ""
435+
@echo "✓ Build complete!"
436+
@echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
437+
428438
qwen3_5_moe-metal:
429439
@echo "==> Building and installing ExecuTorch with Metal..."
430440
cmake --workflow --preset llm-release-metal

Package.swift

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
// https://pytorch.org/executorch/main/using-executorch-ios
1919

2020
import PackageDescription
21+
import Foundation
2122

2223
let debug_suffix = "_debug"
2324
let dependencies_suffix = "_with_dependencies"
@@ -126,6 +127,48 @@ for (key, value) in products {
126127
packageTargets.append(target)
127128
}
128129

130+
// Test fixtures. add_coreml.pte and add_mul_coreml.pte are generated at CI
131+
// time by extension/apple/ExecuTorch/__tests__/resources/generate_coreml_test_models.py
132+
// (invoked by scripts/build_apple_frameworks.sh before `swift test`). They
133+
// are gitignored, so include them in test resources only when present so
134+
// that `swift test` runs on dev machines without CoreML python deps don't
135+
// fail at the SwiftPM resolve stage.
136+
let testResourcesDir = "extension/apple/ExecuTorch/__tests__/resources"
137+
var testResources: [Resource] = [.copy("resources/add.pte")]
138+
if FileManager.default.fileExists(atPath: "\(testResourcesDir)/add_coreml.pte") {
139+
testResources.append(.copy("resources/add_coreml.pte"))
140+
}
141+
if FileManager.default.fileExists(atPath: "\(testResourcesDir)/add_mul_coreml.pte") {
142+
testResources.append(.copy("resources/add_mul_coreml.pte"))
143+
}
144+
145+
// SwiftPM resources must live under the target's path, so the ObjC test
146+
// target uses copies of the canonical resources directory's fixtures. The
147+
// copies themselves are gitignored and (re)created by scripts/build_apple_frameworks.sh.
148+
let objcTestsDir = "extension/apple/ExecuTorch/__tests__/ObjC"
149+
var objcTestResources: [Resource] = []
150+
if FileManager.default.fileExists(atPath: "\(objcTestsDir)/add.pte") {
151+
objcTestResources.append(.copy("add.pte"))
152+
}
153+
if FileManager.default.fileExists(atPath: "\(objcTestsDir)/add_coreml.pte") {
154+
objcTestResources.append(.copy("add_coreml.pte"))
155+
}
156+
if FileManager.default.fileExists(atPath: "\(objcTestsDir)/add_mul_coreml.pte") {
157+
objcTestResources.append(.copy("add_mul_coreml.pte"))
158+
}
159+
160+
let testLinkerSettings: [LinkerSetting] = [
161+
.unsafeFlags([
162+
"-Xlinker", "-force_load",
163+
"-Xlinker", "cmake-out/kernels_optimized.xcframework/macos-arm64/libkernels_optimized_macos.a",
164+
// CoreML backend registers itself with the global delegate registry via a
165+
// static initializer; -force_load ensures that initializer is pulled in so
166+
// the CoreML-delegated test fixtures can actually instantiate the backend.
167+
"-Xlinker", "-force_load",
168+
"-Xlinker", "cmake-out/backend_coreml.xcframework/macos-arm64/libbackend_coreml_macos.a",
169+
])
170+
]
171+
129172
let package = Package(
130173
name: "executorch",
131174
platforms: [
@@ -139,17 +182,24 @@ let package = Package(
139182
dependencies: [
140183
.target(name: "executorch\(debug_suffix)"),
141184
.target(name: "kernels_optimized\(dependencies_suffix)"),
185+
.target(name: "backend_coreml\(dependencies_suffix)"),
142186
],
143187
path: "extension/apple/ExecuTorch/__tests__",
144-
resources: [
145-
.copy("resources/add.pte"),
188+
exclude: ["ObjC", "resources/generate_coreml_test_models.py", "resources/.gitignore"],
189+
resources: testResources,
190+
linkerSettings: testLinkerSettings
191+
),
192+
.testTarget(
193+
name: "objc_tests",
194+
dependencies: [
195+
.target(name: "executorch\(debug_suffix)"),
196+
.target(name: "kernels_optimized\(dependencies_suffix)"),
197+
.target(name: "backend_coreml\(dependencies_suffix)"),
146198
],
147-
linkerSettings: [
148-
.unsafeFlags([
149-
"-Xlinker", "-force_load",
150-
"-Xlinker", "cmake-out/kernels_optimized.xcframework/macos-arm64/libkernels_optimized_macos.a",
151-
])
152-
]
199+
path: "extension/apple/ExecuTorch/__tests__/ObjC",
200+
exclude: [".gitignore"],
201+
resources: objcTestResources,
202+
linkerSettings: testLinkerSettings
153203
)
154204
]
155205
)

backends/aoti/slim/core/slim_tensor.h

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -433,13 +433,19 @@ class SlimTensor {
433433
/**
434434
* Copy data from another tensor to this tensor.
435435
*
436-
* Both tensors must have the same numel and dtype.
437-
* Currently only supports CPU-to-CPU copy (contiguous tensors only).
436+
* Both tensors must have the same numel, sizes and dtype.
438437
*
439438
* @param other The source tensor to copy from
440439
* @return Reference to this tensor
441440
*/
442441
SlimTensor& copy_(const SlimTensor& other) {
442+
ET_CHECK_MSG(
443+
this->dim() == other.dim(),
444+
"copy_: dim of tensors must match (%zu vs %zu)",
445+
this->dim(),
446+
other.dim());
447+
ET_CHECK_MSG(
448+
this->sizes() == other.sizes(), "copy_: sizes of tensors must match");
443449
ET_CHECK_MSG(
444450
this->numel() == other.numel(), "copy_: numel of tensors must match");
445451
ET_CHECK_MSG(this->dtype() == other.dtype(), "copy_: dtype must match");
@@ -463,29 +469,43 @@ class SlimTensor {
463469

464470
std::vector<int64_t> counter(this->dim(), 0);
465471
for (size_t i = 0; i < this->numel(); i++) {
466-
// Compute src offset in elements
467472
int64_t src_offset = 0;
468-
for (size_t d = 0; d < other.dim(); d++) {
469-
src_offset += counter[d] * other.stride(d);
470-
}
471-
472-
// Compute dst offset in elements
473473
int64_t dst_offset = 0;
474474
for (size_t d = 0; d < this->dim(); d++) {
475-
dst_offset += counter[d] * this->stride(d);
475+
int64_t src_term = 0;
476+
int64_t dst_term = 0;
477+
// src_offset = src_offset + counter[d] * other.stride(d)
478+
// dst_offset = dst_offset + counter[d] * this->stride(d)
479+
ET_CHECK_MSG(
480+
!::c10::mul_overflows(counter[d], other.stride(d), &src_term) &&
481+
!::c10::add_overflows(src_offset, src_term, &src_offset) &&
482+
!::c10::mul_overflows(counter[d], this->stride(d), &dst_term) &&
483+
!::c10::add_overflows(dst_offset, dst_term, &dst_offset),
484+
"copy_: offset computation overflow");
476485
}
486+
size_t src_byte_offset = 0;
487+
size_t dst_byte_offset = 0;
488+
// src_byte_offset = src_offset * elem_size
489+
// dst_byte_offset = dst_offset * elem_size
490+
ET_CHECK_MSG(
491+
src_offset >= 0 && dst_offset >= 0 &&
492+
!::c10::mul_overflows(
493+
static_cast<size_t>(src_offset),
494+
elem_size,
495+
&src_byte_offset) &&
496+
!::c10::mul_overflows(
497+
static_cast<size_t>(dst_offset), elem_size, &dst_byte_offset),
498+
"copy_: byte offset overflow");
477499

478500
// Copy elem_size bytes from src to dst
479501
if (this->device().is_cpu() && other.device().is_cpu()) {
480502
std::memcpy(
481-
dst_data + dst_offset * elem_size,
482-
src_data + src_offset * elem_size,
483-
elem_size);
503+
dst_data + dst_byte_offset, src_data + src_byte_offset, elem_size);
484504
} else if (this->device().is_cuda() || other.device().is_cuda()) {
485505
#if defined(CUDA_AVAILABLE)
486506
DeviceTraits<c10::DeviceType::CUDA>::memcpy(
487-
dst_data + dst_offset * elem_size,
488-
src_data + src_offset * elem_size,
507+
dst_data + dst_byte_offset,
508+
src_data + src_byte_offset,
489509
elem_size,
490510
device(), // dst device
491511
other.device() // src device

backends/arm/_passes/arm_pass_manager.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,6 @@
150150
from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
151151
from executorch.backends.arm.common.pipeline_config import (
152152
ArmPassPipelineConfig,
153-
FuseDuplicateUsersConfig,
154153
SoftmaxDecompositionConfig,
155154
)
156155
from executorch.backends.arm.tosa.specification import (
@@ -238,9 +237,6 @@ def configure_skip_passes(
238237
case SoftmaxDecompositionConfig.STABLE:
239238
skip_set.add(DecomposeMaskedFillPass)
240239

241-
if config.fuse_duplicate_users is FuseDuplicateUsersConfig.DISABLED:
242-
skip_set.add(FuseDuplicateUsersPass)
243-
244240
self._skip_pass_types = tuple(skip_set)
245241
skip_names = [skipped_pass.__name__ for skipped_pass in self._skip_pass_types]
246242
logger.debug(f"Passes in skip list: {skip_names}")
@@ -403,9 +399,6 @@ def _tosa_pipeline(
403399
ConvertToClampPass(),
404400
DecomposeTOSAUnsupportedClampPass(),
405401
DecomposeGroupNormPass(),
406-
DecomposeGruPass(),
407-
DecomposeLstmPass(),
408-
DecomposeRnnPass(),
409402
DecomposeLayerNormPass(),
410403
DecomposeVarPass(),
411404
DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec),

0 commit comments

Comments
 (0)