Skip to content

Commit 18565c4

Browse files
committed
up
1 parent 7071b04 commit 18565c4

147 files changed

Lines changed: 7879 additions & 4829 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,10 @@ if(EXECUTORCH_BUILD_CUDA)
751751
endif()
752752

753753
if(EXECUTORCH_BUILD_METAL)
  # backends/metal/ provides the metal_v2 runtime library. The AOTI v2
  # shim under backends/apple/metal (and any other consumer) depends on
  # it for the metal_v2 runtime types, so it is added before the shim.
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/metal)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/metal)
  list(APPEND _executorch_backends metal_backend)
endif()

backends/apple/metal/CMakeLists.txt

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,20 +35,43 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
3535
find_package_torch()

# Sources built regardless of shim generation. utils.cpp has no v2
# replacement yet, so it is compiled in both branches below.
set(_aoti_metal_sources
    runtime/stats.cpp
    runtime/shims/utils.cpp
)

# AOTI shim layer selection. v1 carries the vendored MLX qmv_fast int4
# path plus the legacy stream/encoding/buffer-management shims; v2
# routes through metal_v2::MetalStream and the MetalOpRegistry
# (AffineQuantizedLinearOp, SDPAOp with NAX / qmm_t / qmm_t_splitk
# dispatch, per-command-buffer residency).
#
# The two generations deliberately export the same extern "C" AOTI shim
# symbol names — PTE files lower against a single shim ABI — so their
# sources can never coexist in one build; exactly one branch is taken.
if(EXECUTORCH_USE_METAL_V2)
  list(APPEND _aoti_metal_sources
      runtime/metal_backend_v2.cpp
      runtime/shims/v2/aoti_tensor.cpp
      runtime/shims/v2/aoti_dtype_stubs.cpp
      runtime/shims/v2/runtime.mm
      runtime/shims/v2/aoti_kernel.mm
      runtime/shims/v2/aoti_fallback_op.mm
  )
else()
  list(APPEND _aoti_metal_sources
      runtime/metal_backend.cpp
      runtime/shims/memory.cpp
      runtime/shims/et_metal.mm
      runtime/shims/shim_mps.mm
      runtime/shims/tensor_attribute.cpp
      runtime/ops/common.mm
      runtime/ops/op_bmm.mm
      runtime/ops/op_convolution.mm
      runtime/ops/op_linear_4bit.mm
      runtime/ops/op_mm.mm
      runtime/ops/op_sdpa.mm
  )
endif()

add_library(metal_backend STATIC ${_aoti_metal_sources})
5477
target_include_directories(
@@ -87,6 +110,14 @@ endif()
87110

88111
target_link_options(metal_backend PUBLIC -Wl,-export_dynamic)

# With the v2 shim selected, metal_backend needs the metal_v2 library
# defined in backends/metal/ (MetalStream, MetalOpRegistry,
# AffineQuantizedLinearOp, ...). Linked PRIVATE so the dependency does
# not propagate through install exports to metal_backend's downstream
# consumers.
if(EXECUTORCH_USE_METAL_V2)
  target_link_libraries(metal_backend PRIVATE metal_v2)
endif()

# Find PyTorch's OpenMP library specifically for libtorch-less AOTI
get_torch_base_path(TORCH_BASE_PATH)
92123
find_library(
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// Dtype + layout helpers shared across the v2 AOTI shim layer.
10+
11+
#pragma once
12+
13+
#include <executorch/backends/aoti/utils.h>
14+
#include <executorch/backends/apple/metal/runtime/shims/v2/aoti_types.h>
15+
#include <executorch/runtime/core/exec_aten/exec_aten.h>
16+
17+
#include <cstddef>
18+
#include <cstdint>
19+
#include <vector>
20+
21+
namespace executorch {
22+
namespace backends {
23+
namespace metal {
24+
25+
// Both enums use the standard PyTorch dtype encoding; value-cast is safe.
26+
inline executorch::aten::ScalarType to_aten_scalar_type(
27+
executorch::backends::aoti::slim::c10::ScalarType slim_dt) {
28+
return static_cast<executorch::aten::ScalarType>(static_cast<int>(slim_dt));
29+
}
30+
31+
// Byte width of one element of the given PyTorch dtype code.
// Forwards to the shared AOTI helper; exists so v2 shim code has a
// consistently named local entry point.
inline size_t dtype_to_bytes(int32_t dtype) {
  return executorch::backends::aoti::dtype_to_element_size(dtype);
}
34+
35+
// Standard PyTorch-style contiguous strides, in element units.
//
// Matches torch's contiguous-stride convention for degenerate shapes:
// each dimension contributes a factor of max(size, 1), so a 0-sized
// dim never collapses the higher-dim strides to 0. For example,
// sizes (5, 0) yields strides (1, 1), exactly like
// torch.empty(5, 0).contiguous().stride(). (The previous version
// multiplied by the raw size, producing 0 strides above an empty dim,
// which contradicted the torch convention it claimed to follow.)
inline std::vector<int64_t> compute_contiguous_strides(
    const std::vector<int64_t>& sizes) {
  std::vector<int64_t> strides(sizes.size());
  int64_t stride = 1;
  // Countdown over size_t avoids POSIX-only ssize_t (not ISO C++).
  for (size_t j = sizes.size(); j > 0; --j) {
    const size_t i = j - 1;
    strides[i] = stride;
    // max(size, 1): 0- and 1-sized dims keep strides torch-compatible.
    stride *= std::max<int64_t>(sizes[i], int64_t{1});
  }
  return strides;
}
49+
50+
// Upper bound on tensor rank accepted by StackTensorView, which backs
// the MetalOpRegistry fallback path (aoti_fallback_op.mm). Direct AOTI
// shader dispatch (aoti_kernel.mm) is rank-agnostic and unaffected by
// this limit.
constexpr size_t kMaxTensorDim = 8;
54+
55+
} // namespace metal
56+
} // namespace backends
57+
} // namespace executorch
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// Linker stubs for AOTI dtype trampolines that aoti/common_shims_slim
// does not define. dlopen() of an AOTI-compiled .so must resolve every
// referenced symbol even when the model never touches the dtype, so
// these exist solely to satisfy the dynamic linker.

#include <cstdint>

extern "C" {

// 5 is c10::ScalarType::Half in PyTorch's dtype numbering.
// NOTE(review): per the original author's comment, a model that truly
// uses float16 is expected to fault later inside SlimTensor because
// slim::c10::ScalarType has no Half variant — this stub only resolves
// the symbol; confirm that failure mode is acceptable.
int32_t aoti_torch_dtype_float16() {
  return 5;
}

} // extern "C"

0 commit comments

Comments
 (0)