pytorch
diff --git a/‎.github/workflows/build-cadence-runner.yml‎
Lines changed: 28 additions & 36 deletions b/‎.github/workflows/build-cadence-runner.yml‎
Lines changed: 28 additions & 36 deletions
diff --git a/‎.github/workflows/mlx.yml‎
Lines changed: 28 additions & 1 deletion b/‎.github/workflows/mlx.yml‎
Lines changed: 28 additions & 1 deletion
diff --git a/‎.github/workflows/pull.yml‎
Lines changed: 27 additions & 0 deletions b/‎.github/workflows/pull.yml‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 11 additions & 1 deletion b/‎Makefile‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎backends/aoti/aoti_backend.py‎
Lines changed: 13 additions & 6 deletions b/‎backends/aoti/aoti_backend.py‎
Lines changed: 13 additions & 6 deletions
diff --git a/‎backends/aoti/aoti_delegate_handle.h‎
Lines changed: 26 additions & 0 deletions b/‎backends/aoti/aoti_delegate_handle.h‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎backends/aoti/tests/TARGETS‎
Lines changed: 12 additions & 0 deletions b/‎backends/aoti/tests/TARGETS‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎backends/aoti/tests/test_resolve_blob_keys.cpp‎
Lines changed: 53 additions & 0 deletions b/‎backends/aoti/tests/test_resolve_blob_keys.cpp‎
Lines changed: 53 additions & 0 deletions
@@ -19,36 +19,18 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  gate:
-    runs-on: ubuntu-latest
-    outputs:
-      run-cadence: ${{ steps.decide.outputs.run }}
-    steps:
-      - id: decide
-        env:
-          EVENT: ${{ github.event_name }}
-          IS_FORK: ${{ github.event.pull_request.head.repo.full_name != github.repository }}
-          HAS_CLA: ${{ contains(github.event.pull_request.labels.*.name, 'CLA Signed') }}
-          HAS_EXPORT: ${{ contains(github.event.pull_request.labels.*.name, 'meta-exported') }}
-        run: |
-          run=false
-          case "${EVENT}" in
-            push|schedule|workflow_dispatch)
-              run=true
-              ;;
-            pull_request)
-              [ "${IS_FORK}" = "false" ] && run=true
-              ;;
-            pull_request_target)
-              if [ "${IS_FORK}" = "true" ] && [ "${HAS_CLA}" = "true" ] && [ "${HAS_EXPORT}" = "true" ]; then
-                run=true
-              fi
-              ;;
-          esac
-          echo "run=${run}" >> "${GITHUB_OUTPUT}"
-
+  # Same-repo PRs run on pull_request, which reads the PR's own workflow AND code
+  # -- so CI changes, new test jobs, code, and tests are all validated pre-merge.
+  # Fork PRs can't get credentials (OIDC) on pull_request, so Meta-exported forks
+  # (labeled CLA Signed + meta-exported) run on pull_request_target instead. The
+  # run condition is inlined per job (GitHub Actions has no YAML anchors and env
+  # is unavailable in job-level if), so keep the copies in sync.
   cpu-build:
-    if: github.event_name != 'pull_request_target'
+    if: >-
+      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
+      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
+      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -58,7 +40,7 @@ jobs:
       runner: linux.2xlarge
       docker-image: ci-image:executorch-ubuntu-22.04-clang12
       submodules: recursive
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       upload-artifact: cadence-runner-build
       script: |
@@ -75,21 +57,28 @@ jobs:
 
   cpu-test:
     needs: cpu-build
-    if: github.event_name != 'pull_request_target'
+    if: >-
+      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
+      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
+      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
     permissions:
       id-token: write
       contents: read
     uses: ./.github/workflows/_test_cadence.yml
     with:
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }}
 
   # Cross-compile cadence_executor_runner for each Cadence Xtensa core, one job
   # per backend so they show as separate lines (no matrix grouping). Shared logic
   # lives in _xtensa_build.yml. fusion_g3 is omitted until the upstream fusion_g3
   # <-> nnlib-FusionG3 API skew is fixed (its runner does not link).
   hifi-build:
-    needs: gate
-    if: needs.gate.outputs.run-cadence == 'true'
+    if: >-
+      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
+      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
+      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
     permissions:
       id-token: write
       contents: read
@@ -99,8 +88,11 @@ jobs:
       ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }}
 
   vision-build:
-    needs: gate
-    if: needs.gate.outputs.run-cadence == 'true'
+    if: >-
+      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
+      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
+      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
     permissions:
       id-token: write
       contents: read
 
@@ -66,7 +66,11 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Build test runners"
-        ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner -j$(( $(sysctl -n hw.ncpu) - 1 ))
+        ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner mlx_mutable_state_test -j$(( $(sysctl -n hw.ncpu) - 1 ))
+        echo "::endgroup::"
+
+        echo "::group::Run mutable-state (multi-session) unit test"
+        ./cmake-out/backends/mlx/test/mlx_mutable_state_test
         echo "::endgroup::"
 
         echo "::group::Run op unit tests"
@@ -161,6 +165,29 @@ jobs:
         fi
         echo "::endgroup::"
 
+        echo "::group::Verify chunked == unchunked prefill"
+        QWEN_TINY_PTE=/tmp/qwen35_moe_mlx_tiny/model.pte \
+          ${CONDA_RUN} python -m pytest \
+          examples/models/qwen3_5_moe/test_chunked_prefill.py -v
+        echo "::endgroup::"
+
+        echo "::group::Build Qwen 3.5 MoE MLX C++ runner"
+        # Validates the MLX C++ runner build wiring (compile + link + metallib).
+        # The tiny model has no compatible tokenizer (vocab 256, random weights),
+        # so we don't run C++ inference here — only confirm it builds.
+        ${CONDA_RUN} make qwen3_5_moe-mlx
+        RUNNER=cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner
+        if [ ! -x "$RUNNER" ]; then
+          echo "Failed: runner not found at $RUNNER"
+          exit 1
+        fi
+        if [ ! -f "$(dirname "$RUNNER")/mlx.metallib" ]; then
+          echo "Failed: mlx.metallib not copied next to runner"
+          exit 1
+        fi
+        echo "Success: built $RUNNER"
+        echo "::endgroup::"
+
   backend-tester:
     needs: run-decision
     if: |
 
@@ -816,6 +816,33 @@ jobs:
         # Test test_arm_backend.sh with test
         backends/arm/test/test_arm_backend.sh "${ARM_TEST}"
 
+  # Backward-compatibility check for the Arm backend's public API: validates
+  # the public API manifests, then runs the public-API BC scenarios.
+  test-arm-backend-public-api-backward-compatibility:
+    name: test-arm-backend-public-api-backward-compatibility
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge.memory
+      docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
+      submodules: 'recursive'
+      # On pull_request events check out the PR head commit; on any other event
+      # check out the commit that triggered the workflow.
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      # NOTE(review): presumably minutes, per the reusable workflow's contract;
+      # confirm against linux_job_v2.yml.
+      timeout: 120
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        source .ci/scripts/utils.sh
+        install_executorch "--use-pt-pinned-commit"
+
+        .ci/scripts/setup-arm-baremetal-tools.sh --enable-mlsdk-deps --install-mlsdk-deps-with-pip
+        source examples/arm/arm-scratch/setup_path.sh
+
+        backends/arm/scripts/public_api_manifest/validate_all_public_api_manifests.sh
+
+        python backends/arm/test/public_api_bc/run_public_api_bc_scenarios.py
+
   test-llama-runner-qnn-linux:
     name: test-llama-runner-qnn-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
 
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal qwen3_5_moe-mlx clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -131,6 +131,7 @@ help:
 	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
+	@echo "  qwen3_5_moe-mlx     - Build Qwen3.5 MoE runner with MLX backend"
 	@echo "  clean               - Clean build artifacts"
 
 voxtral-cuda:
@@ -467,6 +468,15 @@ qwen3_5_moe-metal:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
 
+# Build the Qwen3.5 MoE runner on the MLX backend in two steps: build and
+# install ExecuTorch with the mlx-release workflow preset, then build the runner
+# with the qwen3-5-moe-mlx workflow preset from the example's own directory.
+qwen3_5_moe-mlx:
+	@echo "==> Building and installing ExecuTorch with MLX..."
+	cmake --workflow --preset mlx-release
+	@echo "==> Building Qwen3.5 MoE runner with MLX..."
+	cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-mlx
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
+
 clean:
 	rm -rf cmake-out \
 	       extension/llm/tokenizers/build \
 
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import contextlib
+import hashlib
 import os
 import typing
 from abc import ABC, abstractmethod
@@ -276,18 +277,21 @@ def preprocess(
 
         # Create named data store
         named_data_store = NamedDataStore()
-        method_name = cls.method_name_from_compile_specs(compile_specs)
 
-        named_data_store.add_named_data(method_name + "_so_blob", so_data, 1, None)
+        # Key each blob by a content hash so partitions in one method get distinct
+        # keys (a method-name-only key collides). Runtime recovers them from
+        # processed_bytes below.
+        so_blob_key = hashlib.sha256(so_data).hexdigest() + "_so_blob"
+        weights_blob_key = hashlib.sha256(blob_data).hexdigest() + "_weights_blob"
+
+        named_data_store.add_named_data(so_blob_key, so_data, 1, None)
         # Determine whether to save named data externally based on backend setting
         # External: save to separate .ptd file, otherwise merge with .pte file
         external_tag = (
             f"aoti_{device_name}_blob" if cls.save_data_externally() else None
         )
 
-        named_data_store.add_named_data(
-            method_name + "_weights_blob", blob_data, 1, external_tag
-        )
+        named_data_store.add_named_data(weights_blob_key, blob_data, 1, external_tag)
 
         # Clean up the generated files
         os.remove(so_path)
@@ -299,8 +303,11 @@ def preprocess(
         # the next preprocess call (e.g. for the next method).
         cls.release_moved_tensors(device_edge_program, compile_specs)
 
+        # The runtime cannot recompute these hash keys, so carry them (one per line).
+        processed_bytes = (so_blob_key + "\n" + weights_blob_key).encode("utf-8")
+
         return PreprocessResult(
-            processed_bytes=b"",
+            processed_bytes=processed_bytes,
             debug_handle_map={},
             data_store_output=named_data_store.get_named_data_store_output(),
         )
 
@@ -10,13 +10,15 @@
 
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/freeable_buffer.h>
 #include <string>
 
 namespace executorch {
 namespace backends {
 namespace aoti {
 
 using executorch::runtime::Error;
+using executorch::runtime::FreeableBuffer;
 using executorch::runtime::etensor::Tensor;
 
 extern "C" {
@@ -148,6 +150,30 @@ struct AOTIDelegateHandle {
       update_user_managed_constant_buffer_pairs;
 };
 
+// Resolves the keys under which a delegate's .so blob and weights blob were
+// stored at export time.
+//
+// New-format payload is "<so_key>\n<weights_key>"; an empty (or null) payload
+// is a pre-this-change artifact, so fall back to the legacy method-name keys
+// ("<method_name>_so_blob" / "<method_name>_weights_blob", or the bare
+// "so_blob" / "weights_blob" when method_name is empty).
+//
+// Returns Error::Ok and sets both output keys on success. Returns
+// Error::Internal, leaving both outputs untouched, when a non-empty payload
+// has no '\n' separator or has an empty key on either side of it.
+inline Error resolve_blob_keys(
+    const FreeableBuffer* processed,
+    const std::string& method_name,
+    std::string& so_blob_key,
+    std::string& weights_blob_key) {
+  if (processed == nullptr || processed->size() == 0) {
+    // Legacy artifact: the keys depend on the method name only.
+    so_blob_key = method_name.empty() ? "so_blob" : method_name + "_so_blob";
+    weights_blob_key =
+        method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
+    return Error::Ok;
+  }
+
+  // Construct with an explicit length; the buffer is not assumed to be
+  // NUL-terminated.
+  const std::string keys(
+      static_cast<const char*>(processed->data()), processed->size());
+  const size_t newline = keys.find('\n');
+  // A missing separator is malformed. So is an empty key on either side of
+  // it: an empty key cannot name a stored blob, so fail here rather than at
+  // lookup time.
+  if (newline == std::string::npos || newline == 0 ||
+      newline + 1 == keys.size()) {
+    return Error::Internal;
+  }
+  so_blob_key = keys.substr(0, newline);
+  weights_blob_key = keys.substr(newline + 1);
+  return Error::Ok;
+}
+
 } // namespace aoti
 } // namespace backends
 } // namespace executorch
@@ -3,6 +3,18 @@ load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils")
 
 oncall("executorch")
 
+# Unit tests for resolve_blob_keys() (sources in test_resolve_blob_keys.cpp).
+cpp_unittest(
+    name = "test_resolve_blob_keys",
+    srcs = [
+        "test_resolve_blob_keys.cpp",
+    ],
+    deps = [
+        "//executorch/backends/aoti:delegate_handle",
+        "//executorch/runtime/core:core",
+        "//executorch/runtime/core:evalue",
+    ],
+)
+
 cpp_unittest(
     name = "test_common_shims",
     srcs = [
 
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/aoti/aoti_delegate_handle.h>
+
+#include <gtest/gtest.h>
+#include <string>
+
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/freeable_buffer.h>
+
+using executorch::backends::aoti::resolve_blob_keys;
+using executorch::runtime::Error;
+using executorch::runtime::FreeableBuffer;
+
+// A "<so_key>\n<weights_key>" payload is split at the newline into the two
+// output keys.
+TEST(ResolveBlobKeysTest, ParsesKeysFromPayload) {
+  const std::string raw = "aaa_so_blob\nbbb_weights_blob";
+  FreeableBuffer buffer(raw.data(), raw.size(), nullptr);
+  std::string so_out;
+  std::string weights_out;
+
+  const Error status =
+      resolve_blob_keys(&buffer, "forward", so_out, weights_out);
+  ASSERT_EQ(status, Error::Ok);
+  EXPECT_EQ(so_out, "aaa_so_blob");
+  EXPECT_EQ(weights_out, "bbb_weights_blob");
+}
+
+// A zero-sized payload (a pre-this-change artifact) makes the resolver derive
+// both keys from the method name.
+TEST(ResolveBlobKeysTest, FallsBackToMethodNameKeysWhenEmpty) {
+  FreeableBuffer empty_buffer;
+  std::string so_out;
+  std::string weights_out;
+
+  const Error status =
+      resolve_blob_keys(&empty_buffer, "forward", so_out, weights_out);
+  ASSERT_EQ(status, Error::Ok);
+  EXPECT_EQ(so_out, "forward_so_blob");
+  EXPECT_EQ(weights_out, "forward_weights_blob");
+}
+
+// A non-empty payload without the '\n' separator is rejected.
+TEST(ResolveBlobKeysTest, FailsOnMalformedPayload) {
+  const std::string raw = "missing_the_newline_separator";
+  FreeableBuffer buffer(raw.data(), raw.size(), nullptr);
+  std::string so_out;
+  std::string weights_out;
+
+  const Error status =
+      resolve_blob_keys(&buffer, "forward", so_out, weights_out);
+  EXPECT_EQ(status, Error::Internal);
+}