Commit 9dede18
Update on "Add GEMM-based standard SDPA benchmark"
Add bench_sdpa.cpp with a standalone GEMM-based SDPA implementation (run_standard_sdpa) alongside ExecuTorch's tiled flash attention (custom_sdpa_out) for comparative benchmarking. The standalone SDPA uses a full GEMM per head with a 3-pass softmax, and supports both [B,S,H,D] and [B,H,S,D] layouts via BLAS leading-dimension parameters, so that algorithm effects can be isolated from layout effects. Includes validation tests that verify the GEMM-based implementation matches custom_sdpa_out within tolerance.

Differential Revision: [D96044313](https://our.internmc.facebook.com/intern/diff/D96044313/)

[ghstack-poisoned]
2 parents a37f82c + 68dbb0c commit 9dede18

158 files changed

Lines changed: 14353 additions & 1466 deletions
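For context on what the commit message calls a "GEMM-based SDPA with 3-pass softmax", here is a minimal C++ sketch of that structure. It is illustrative only: the function name `sdpa_one_head` and the `ld_*` stride parameters are assumptions rather than the actual bench_sdpa.cpp API, and plain loops stand in for the BLAS GEMM calls.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Single-head SDPA: O = softmax(Q * K^T * scale) * V.
// q/k/v/o point at the first row of one head; each ld_* gives the distance
// (in floats) between consecutive sequence positions. For a [B,H,S,D] layout
// that stride is D; for [B,S,H,D] it is H*D. The algorithm is identical in
// both cases, which is what lets the benchmark separate layout from algorithm.
void sdpa_one_head(const float* q, const float* k, const float* v, float* o,
                   int seq_len, int head_dim,
                   int ld_q, int ld_k, int ld_v, int ld_o, float scale) {
  // Full score matrix S, materialized per head (unlike tiled flash attention).
  std::vector<float> s(static_cast<std::size_t>(seq_len) * seq_len);

  // GEMM 1: S = Q * K^T * scale (naive loops standing in for a BLAS sgemm).
  for (int i = 0; i < seq_len; ++i) {
    for (int j = 0; j < seq_len; ++j) {
      float acc = 0.0f;
      for (int d = 0; d < head_dim; ++d) {
        acc += q[i * ld_q + d] * k[j * ld_k + d];
      }
      s[static_cast<std::size_t>(i) * seq_len + j] = acc * scale;
    }
  }

  // 3-pass softmax over each row of S.
  for (int i = 0; i < seq_len; ++i) {
    float* row = s.data() + static_cast<std::size_t>(i) * seq_len;
    // Pass 1: row maximum, for numerical stability.
    const float m = *std::max_element(row, row + seq_len);
    // Pass 2: exponentiate and accumulate the normalizer.
    float sum = 0.0f;
    for (int j = 0; j < seq_len; ++j) {
      row[j] = std::exp(row[j] - m);
      sum += row[j];
    }
    // Pass 3: normalize.
    for (int j = 0; j < seq_len; ++j) {
      row[j] /= sum;
    }
  }

  // GEMM 2: O = P * V.
  for (int i = 0; i < seq_len; ++i) {
    for (int d = 0; d < head_dim; ++d) {
      float acc = 0.0f;
      for (int j = 0; j < seq_len; ++j) {
        acc += s[static_cast<std::size_t>(i) * seq_len + j] * v[j * ld_v + d];
      }
      o[i * ld_o + d] = acc;
    }
  }
}
```

The `ld_*` parameters carry the layout trick the commit message describes: in [B,H,S,D] consecutive rows of one head are `head_dim` floats apart, while in [B,S,H,D] they are `num_heads * head_dim` apart, so the same routine handles both layouts and any timing difference is attributable to memory stride rather than to the math.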


.ci/scripts/test_lora.sh

Lines changed: 1 addition & 2 deletions
```diff
@@ -139,8 +139,7 @@ Okay, so I need to calculate 15% of 80."
 EXPECTED_QUANT_LORA_PREFIX="
 <|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
 To calculate 15% of 80, we can multiply 80 by 15/100.
-80 * 15/100 = 12.
-So, 15% of 80 is 12.
+So, 15% of 80 is equal to (80 * 15) / 100 = 1200 / 100 = 12.
 #### 12
 The answer is: 12<|im_end|>"
 
```

.ci/scripts/test_model_e2e.sh

Lines changed: 22 additions & 1 deletion
```diff
@@ -354,7 +354,7 @@ EOF
     fi
     ;;
   qwen3_5_moe)
-    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --prompt 'What is the capital of France?' --max_new_tokens 128 --temperature 0"
+    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --prompt 'What is the capital of France?' --max_new_tokens 128 --temperature 0 --cuda_graph"
     ;;
   voxtral_realtime)
     RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
@@ -397,6 +397,27 @@ if [ -n "$EXPECTED_OUTPUT" ]; then
 else
   echo "SUCCESS: Runner completed successfully"
 fi
+
+# Validate GPU peak memory usage for models with known memory budgets.
+# The runner prints "GPU peak memory usage: XXXX.X MiB" at the end.
+case "$MODEL_NAME" in
+  qwen3_5_moe)
+    MAX_MEMORY_MIB=20480 # 20 GB — must fit on a single GPU (e.g. 4090)
+    PEAK_MEM=$(echo "$OUTPUT" | grep -oP 'GPU peak memory usage: \K[0-9.]+' || true)
+    if [ -n "$PEAK_MEM" ]; then
+      # Compare as integers (truncate decimals)
+      PEAK_MEM_INT=${PEAK_MEM%%.*}
+      if [ "$PEAK_MEM_INT" -gt "$MAX_MEMORY_MIB" ]; then
+        echo "FAIL: GPU peak memory ${PEAK_MEM} MiB exceeds budget ${MAX_MEMORY_MIB} MiB"
+        exit 1
+      else
+        echo "Success: GPU peak memory ${PEAK_MEM} MiB within budget (max ${MAX_MEMORY_MIB} MiB)"
+      fi
+    else
+      echo "WARNING: GPU peak memory usage not found in output"
+    fi
+    ;;
+esac
 echo "::endgroup::"
 
 popd
```

.claude/skills/qualcomm/SKILL.md

Lines changed: 0 additions & 1 deletion
```diff
@@ -93,6 +93,5 @@ Required flags: `-m` (SoC model), `-b` (Android build dir). Optional: `-s` (devi
 | `TestExampleLLMScript` | LLM script tests |
 | `TestExampleMultimodalityScript` | Multimodality script tests |
 | `TestExampleOssScript` | OSS model script tests |
-| `TestExampleQaihubScript` | QAI Hub script tests |
 | `TestExampleScript` | General example script tests |
 | `TestUtilsScript` | Utility script tests |
```

.github/pytorch-probot.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -6,6 +6,7 @@ ciflow_push_tags:
 - ciflow/cuda
 - ciflow/cuda-perf
 - ciflow/metal
+- ciflow/mlx
 - ciflow/nightly
 - ciflow/trunk
 - ciflow/binaries
```

.github/workflows/cuda.yml

Lines changed: 2 additions & 2 deletions
```diff
@@ -145,8 +145,8 @@ jobs:
         # Run CUDA backend Python tests
         python -m pytest backends/cuda/tests backends/cuda/passes/tests -v -o "addopts="
 
-        # Run Qwen 3.5 MoE tests (quantize roundtrip + TurboQuant KV cache)
-        python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py -v -o "addopts="
+        # Run Qwen 3.5 MoE tests (quantize roundtrip + TurboQuant KV cache + sampler)
+        python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py examples/models/qwen3_5_moe/test_sampler.py -v -o "addopts="
 
   export-model-cuda-artifact:
     name: export-model-cuda-artifact
```

.github/workflows/mlx.yml

Lines changed: 19 additions & 0 deletions
```diff
@@ -5,6 +5,8 @@ on:
     branches:
       - main
       - release/*
+    tags:
+      - ciflow/mlx/*
   pull_request:
     paths:
       - .github/workflows/mlx.yml
@@ -16,6 +18,10 @@ on:
       - examples/models/qwen3_5_moe/**
   workflow_dispatch:
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
 permissions: {}
 
 jobs:
@@ -218,6 +224,10 @@ jobs:
         echo "::endgroup::"
 
   test-mlx-voxtral:
+    # Requires HuggingFace secrets — skip on fork PRs.
+    # Maintainers can opt-in by applying the ciflow/mlx label, which
+    # pushes a ciflow/mlx/<PR> tag that re-runs this workflow with secrets.
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:
@@ -275,6 +285,9 @@ jobs:
         echo "::endgroup::"
 
   test-mlx-voxtral-realtime:
+    # Requires HuggingFace secrets — skip on fork PRs.
+    # Maintainers can opt-in by applying the ciflow/mlx label.
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:
@@ -347,6 +360,9 @@ jobs:
         echo "::endgroup::"
 
   test-mlx-whisper:
+    # Requires HuggingFace secrets — skip on fork PRs.
+    # Maintainers can opt-in by applying the ciflow/mlx label.
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:
@@ -458,6 +474,9 @@ jobs:
         echo "::endgroup::"
 
   test-mlx-llm:
+    # Requires HuggingFace secrets — skip on fork PRs.
+    # Maintainers can opt-in by applying the ciflow/mlx label.
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     strategy:
       fail-fast: false
       matrix:
```

CMakeLists.txt

Lines changed: 28 additions & 0 deletions
```diff
@@ -1124,6 +1124,8 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/memory_allocator)
+  list(APPEND _executorch_extensions extension_memory_allocator)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
   list(APPEND _executorch_extensions extension_llm_runner)
 endif()
@@ -1228,6 +1230,32 @@ if(NOT EXECUTORCH_SELECT_OPS_YAML STREQUAL ""
   )
   list(APPEND _executorch_kernels executorch_selected_kernels)
 
+  # Auto-right-size the kernel registry unless the user has pinned
+  # MAX_KERNEL_NUM.
+  if(NOT DEFINED CACHE{MAX_KERNEL_NUM} AND NOT DEFINED MAX_KERNEL_NUM)
+    gen_selected_max_kernel_num(
+      LIB_NAME "executorch_selected_kernels" OPLIST_YAMLS
+      ${gen_selected_ops_output_yaml}
+    )
+    target_include_directories(
+      executorch_core
+      PRIVATE ${executorch_selected_kernels_max_kernel_num_include_dir}
+    )
+    add_dependencies(
+      executorch_core executorch_selected_kernels_max_kernel_num_header
+    )
+    if(TARGET executorch_core_shared)
+      target_include_directories(
+        executorch_core_shared
+        PRIVATE ${executorch_selected_kernels_max_kernel_num_include_dir}
+      )
+      add_dependencies(
+        executorch_core_shared
+        executorch_selected_kernels_max_kernel_num_header
+      )
+    endif()
+  endif()
+
 install(
   TARGETS executorch_selected_kernels
   EXPORT ExecuTorchTargets
```

Makefile

Lines changed: 21 additions & 1 deletion
```diff
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda qwen3_5_moe-metal clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda qwen3_5_moe-metal clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -103,6 +103,8 @@ help:
 	@echo " voxtral_realtime-cpu - Build Voxtral Realtime runner with CPU backend"
 	@echo " voxtral_realtime-metal - Build Voxtral Realtime runner with Metal backend (macOS only)"
 	@echo " voxtral_realtime-mlx - Build Voxtral Realtime runner with MLX backend"
+	@echo " voxtral_tts-cpu - Build Voxtral TTS runner (CPU)"
+	@echo " voxtral_tts-cuda - Build Voxtral TTS runner with CUDA backend"
 	@echo " whisper-cuda - Build Whisper runner with CUDA backend"
 	@echo " whisper-cuda-debug - Build Whisper runner with CUDA backend (debug mode)"
 	@echo " whisper-cpu - Build Whisper runner with CPU backend"
@@ -396,6 +398,24 @@ gemma3-cpu:
 	@echo "✓ Build complete!"
 	@echo " Binary: cmake-out/examples/models/gemma3/gemma3_e2e_runner"
 
+voxtral_tts-cpu:
+	@echo "==> Building and installing ExecuTorch..."
+	cmake --workflow --preset llm-release
+	@echo "==> Building Voxtral TTS runner (CPU)..."
+	cd examples/models/voxtral_tts && cmake --workflow --preset voxtral-tts-cpu
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo " Binary: cmake-out/examples/models/voxtral_tts/voxtral_tts_runner"
+
+voxtral_tts-cuda:
+	@echo "==> Building and installing ExecuTorch with CUDA..."
+	cmake --workflow --preset llm-release-cuda
+	@echo "==> Building Voxtral TTS runner with CUDA..."
+	cd examples/models/voxtral_tts && cmake --workflow --preset voxtral-tts-cuda
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo " Binary: cmake-out/examples/models/voxtral_tts/voxtral_tts_runner"
+
 qwen3_5_moe-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
 	cmake --workflow --preset llm-release-cuda
```

backends/aoti/aoti_backend.py

Lines changed: 0 additions & 11 deletions
```diff
@@ -25,7 +25,6 @@
 
 class COMPILE_SPEC_KEYS(Enum):
     METHOD_NAME = "method_name"
-    SHARE_KV_CACHE_ACROSS_METHODS = "share_kv_cache_across_methods"
 
 
 @experimental(
@@ -287,13 +286,3 @@ def method_name_from_compile_specs(
         raise RuntimeError(
             f"Could not find method name in compile specs: {compile_specs}"
         )
-
-
-    @classmethod
-    def generate_share_kv_cache_compile_spec(cls) -> CompileSpec:
-        """
-        Generate a CompileSpec to enable cross-method KV cache sharing.
-        """
-        return CompileSpec(
-            COMPILE_SPEC_KEYS.SHARE_KV_CACHE_ACROSS_METHODS.value,
-            bytes([1]),
```

backends/apple/metal/metal_backend.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -35,6 +35,7 @@ def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
         "aoti_torch_mps_convolution": None,
         "aoti_torch_mps_mm_out": None,
         "at::_ops::_scaled_dot_product_attention_math_for_mps::call": None,
+        "at::_ops::_scaled_dot_product_attention_math_for_mps_v2::call": None,
         "torchao::_linear_fp_act_4bit_weight": None,
         "at::_ops::topk::call": None,
         "metal::gather_qmv": None,
```
