pytorch
diff --git a/backends/mlx/ops.py b/backends/mlx/ops.py
Lines changed: 21 additions & 7 deletions
diff --git a/backends/mlx/runtime/MLXInterpreter.h b/backends/mlx/runtime/MLXInterpreter.h
Lines changed: 11 additions & 0 deletions
diff --git a/backends/mlx/serialization/schema.fbs b/backends/mlx/serialization/schema.fbs
Lines changed: 7 additions & 1 deletion
diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py
Lines changed: 1 addition & 0 deletions
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.java
Lines changed: 10 additions & 2 deletions
diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp
Lines changed: 19 additions & 10 deletions
diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp
Lines changed: 28 additions & 5 deletions
diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h
Lines changed: 13 additions & 4 deletions
diff --git a/extension/tensor/tensor_ptr_maker.cpp b/extension/tensor/tensor_ptr_maker.cpp
Lines changed: 13 additions & 8 deletions
diff --git a/extension/wasm/wasm_bindings.cpp b/extension/wasm/wasm_bindings.cpp
Lines changed: 6 additions & 4 deletions
@@ -50,6 +50,7 @@
     AsStridedNode,
     AsTypeNode,
     Atan2Node,
+    BitwiseInvertNode,
     BroadcastToNode,
     CeilNode,
     ClipNode,
@@ -3066,27 +3067,40 @@ def _where_handler(P: MLXProgramBuilder, n: Node) -> Slot:
 
 @REGISTRY.register(target=[torch.ops.aten.bitwise_not.default])
 def _bitwise_not_handler(P: MLXProgramBuilder, n: Node) -> Slot:
-    """Handle aten.bitwise_not - for boolean tensors, dispatch to logical_not."""
+    """Lower aten.bitwise_not: logical_not for bool, bitwise_invert for int32/int64."""
     args = P.args(n)
     require_args(args, 1, 1, "aten.bitwise_not")
     require_kwargs(P.kwargs(n), set(), "aten.bitwise_not")
     x_meta = n.args[0].meta.get("val")
+    out = P.make_or_get_slot(n)  # reserved before the dtype checks below
 
-    if x_meta is not None and x_meta.dtype == torch.bool:
-        # For boolean tensors, bitwise_not is equivalent to logical_not
-        out = P.make_or_get_slot(n)
+    if x_meta is None or not hasattr(x_meta, "dtype"):  # dtype selects the MLX op
+        raise NotImplementedError(
+            "aten.bitwise_not requires known input dtype metadata for MLX lowering"
+        )
+
+    if x_meta.dtype == torch.bool:  # bool: emitted as LogicalNotNode
         P.emit(
             LogicalNotNode(
                 x=P.slot_to_tid(args[0]),
                 out=P.slot_to_tid(out),
             )
         )
-        return out
+    elif x_meta.dtype in {  # integer dtypes lowered to BitwiseInvertNode
+        torch.int32,
+        torch.int64,
+    }:
+        P.emit(
+            BitwiseInvertNode(
+                x=P.slot_to_tid(args[0]),
+                out=P.slot_to_tid(out),
+            )
+        )
     else:
         raise NotImplementedError(
-            f"aten.bitwise_not is only supported for boolean tensors. "
-            f"Got dtype={x_meta.dtype if x_meta else 'unknown'}"
+            f"aten.bitwise_not on dtype {x_meta.dtype} is not supported for MLX lowering"
         )
+    return out  # reached only after one of the two nodes above was emitted
 
 
 @REGISTRY.register(
 
@@ -1380,6 +1380,13 @@ inline void exec_logical_not(
   st.set_tensor(n.out, logical_not(st.const_tensor_ref(n.x), s));
 }
 
+inline void exec_bitwise_invert(
+    const BitwiseInvertNode& n,
+    ExecutionState& st,
+    StreamOrDevice s) {
+  st.set_tensor(n.out, bitwise_invert(st.const_tensor_ref(n.x), s)); // out = ~x
+}
+
 inline void exec_logical_and(
     const LogicalAndNode& n,
     ExecutionState& st,
@@ -2028,6 +2035,10 @@ class Interpreter {
       case OpCode::LOGICAL_NOT:
         ops::exec_logical_not(std::get<LogicalNotNode>(instr.node), st, s);
         break;
+      case OpCode::BITWISE_INVERT:
+        ops::exec_bitwise_invert(
+            std::get<BitwiseInvertNode>(instr.node), st, s);
+        break;
       case OpCode::LOGICAL_AND:
         ops::exec_logical_and(std::get<LogicalAndNode>(instr.node), st, s);
         break;
 
@@ -562,6 +562,11 @@ table LogicalNotNode {
     out: Tid (required);
 }
 
+table BitwiseInvertNode {  // unary op: out = bitwise_invert(x)
+    x: Tid (required);  // input tensor id
+    out: Tid (required);  // output tensor id
+}
+
 table LogicalAndNode {
     a: Tid (required);
     b: Tid (required);
@@ -1113,7 +1118,8 @@ union OpNode {
     GatherMmNode,
     GatherQmmNode,
     ScanNode,
-    MetalKernelNode
+    MetalKernelNode,
+    BitwiseInvertNode
     // BC: Add new op nodes here (append only)
 }
 
 
@@ -4111,6 +4111,7 @@ def create_model(self) -> nn.Module:
     {"op_name": "abs",        "op_fn": torch.abs},
     {"op_name": "neg",        "op_fn": torch.neg},
     {"op_name": "logical_not","op_fn": torch.logical_not, "shapes": [(2, 3, 4), (10,), (4, 8)], "dtypes": [torch.bool], "input_fn": _bool_input_fn()},
+    {"op_name": "bitwise_not_int", "op_fn": torch.bitwise_not, "shapes": _SHAPES_3, "dtypes": [torch.int32, torch.int64], "input_fn": _int_input_fn()},
     {"op_name": "isnan",      "op_fn": torch.isnan,      "shapes": _SHAPES_3, "dtypes": [torch.float32, torch.float16, torch.bfloat16], "input_fn": _nan_input_fn()},
     # activations
     {"op_name": "relu",    "op_fn": torch.relu,    "shapes": [(2, 3, 4), (10,), (4, 8), (2, 8, 16), (1, 128, 64)], "dtypes": [torch.float32], "input_fn": _input_fn(scale=2, offset=-1)},
 
@@ -12,6 +12,7 @@
 import com.facebook.jni.annotations.DoNotStrip;
 import com.facebook.soloader.nativeloader.NativeLoader;
 import com.facebook.soloader.nativeloader.SystemDelegate;
+import java.io.Closeable;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.concurrent.locks.Lock;
@@ -24,7 +25,7 @@
  * <p>Warning: These APIs are experimental and subject to change without notice
  */
 @Experimental
-public class Module {
+public class Module implements Closeable {
 
   static {
     if (!NativeLoader.isInitialized()) {
@@ -274,12 +275,19 @@ public boolean etdump() {
   public void destroy() {
     if (mLock.tryLock()) {
       try {
-        mHybridData.resetNative();
+        if (mHybridData.isValid()) { // only reset while the hybrid data is still valid
+          mHybridData.resetNative();
+        }
       } finally {
         mLock.unlock();
       }
     } else {
       throw new IllegalStateException("Cannot destroy module while method is executing");
     }
   }
+
+  @Override
+  public void close() { // Closeable entry point
+    destroy(); // delegates fully: throws IllegalStateException when destroy() does
+  }
 }
@@ -138,16 +138,16 @@ Error TextLLMRunner::generate(
         num_prompt_tokens >= 1,
         InvalidArgument,
         "Expected at least 1 prompt token");
-    ET_CHECK_OR_RETURN_ERROR(
-        num_prompt_tokens <= max_seq_len,
-        InvalidArgument,
-        "num_prompt_tokens %d > max_seq_len %" PRId64
-        ", Single prefill chunk too large - please reduce prompt size or increase max_seq_len",
-        num_prompt_tokens,
-        max_seq_len);
-    // For non-sliding-window models, also check that we won't exceed
-    // KV cache capacity. Sliding window models (where max_seq_len <
-    // max_context_len) handle position wrapping internally.
+    // Note: We intentionally do NOT enforce num_prompt_tokens <= max_seq_len
+    // here. TextPrefiller::prefill() supports chunked prefill: when
+    // num_prompt_tokens > max_seq_len it splits the prompt into max_seq_len
+    // chunks and prefills them sequentially. Models that were exported with
+    // max_seq_len < max_context_len (e.g. a 1024 prefill chunk over a 4096 KV
+    // cache) rely on this behavior.
+    // Ensure the prompt fits within total KV cache capacity. For
+    // sliding-window models (where max_seq_len < max_context_len) the model
+    // handles position wrapping internally, so pos_ doesn't represent
+    // consumed capacity and we only need a per-call bound.
     if (max_seq_len >= max_context_len) {
       ET_CHECK_OR_RETURN_ERROR(
           pos_ + num_prompt_tokens < max_context_len,
@@ -158,6 +158,15 @@ Error TextLLMRunner::generate(
           pos_,
           num_prompt_tokens,
           max_context_len);
+    } else {
+      ET_CHECK_OR_RETURN_ERROR(
+          num_prompt_tokens < max_context_len,
+          InvalidArgument,
+          "num_prompt_tokens %d >= max_context_len %" PRId64
+          ", Prompt exceeds KV cache capacity - please reduce prompt size or "
+          "increase max_context_len in your export script",
+          num_prompt_tokens,
+          max_context_len);
     }
 
     // print prompts
 
@@ -10,6 +10,8 @@
 
 #include <numeric>
 
+#include <c10/util/safe_numerics.h>
+
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 
 namespace executorch {
@@ -147,11 +149,26 @@ TensorPtr make_tensor_ptr(
     std::vector<executorch::aten::StridesType> strides,
     executorch::aten::ScalarType type,
     executorch::aten::TensorShapeDynamism dynamism) {
+  auto numel_result = executorch::aten::safe_numel(sizes.data(), sizes.size());
+  ET_CHECK_MSG(
+      numel_result.ok(),
+      "safe_numel failed: %d",
+      static_cast<int>(numel_result.error()));
+  const ssize_t numel = numel_result.get();
+  size_t nbytes;
   ET_CHECK_MSG(
-      data.size() ==
-          executorch::aten::compute_numel(sizes.data(), sizes.size()) *
-              executorch::aten::elementSize(type),
-      "Data size does not match tensor size.");
+      !c10::mul_overflows(
+          static_cast<size_t>(numel),
+          executorch::aten::elementSize(type),
+          &nbytes),
+      "Overflow computing nbytes: numel=%zd element_size=%zu",
+      numel,
+      executorch::aten::elementSize(type));
+  ET_CHECK_MSG(
+      data.size() == nbytes,
+      "Data size (%zu) does not match tensor size (%zu).",
+      data.size(),
+      nbytes);
   auto data_ptr = data.data();
   return make_tensor_ptr(
       std::move(sizes),
@@ -205,7 +222,13 @@ TensorPtr clone_tensor_ptr(
       runtime::canCast(tensor_type, type),
       "Cannot cast tensor type to desired type.");
   const auto tensor_numel = static_cast<size_t>(tensor.numel());
-  std::vector<uint8_t> data(tensor_numel * aten::elementSize(type));
+  size_t clone_nbytes;
+  ET_CHECK_MSG(
+      !c10::mul_overflows(tensor_numel, aten::elementSize(type), &clone_nbytes),
+      "Overflow computing clone nbytes: numel=%zu element_size=%zu",
+      tensor_numel,
+      aten::elementSize(type));
+  std::vector<uint8_t> data(clone_nbytes);
 
   // Create a minimal context for error handling in ET_SWITCH
   struct {
 
@@ -110,9 +110,13 @@ inline TensorPtr make_tensor_ptr(
     executorch::aten::ScalarType type = deduced_type,
     executorch::aten::TensorShapeDynamism dynamism =
         executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  auto numel_result = executorch::aten::safe_numel(sizes.data(), sizes.size());
   ET_CHECK_MSG(
-      data.size() ==
-          executorch::aten::compute_numel(sizes.data(), sizes.size()),
+      numel_result.ok(),
+      "safe_numel failed: %d",
+      static_cast<int>(numel_result.error()));
+  ET_CHECK_MSG(
+      data.size() == static_cast<size_t>(numel_result.get()),
       "Data size does not match tensor size.");
   if (type != deduced_type) {
     ET_CHECK_MSG(
@@ -368,8 +372,13 @@ inline TensorPtr make_tensor_ptr(
   const auto same_rank = sizes.size() == static_cast<size_t>(tensor.dim());
   const auto same_shape = same_rank &&
       std::equal(sizes.begin(), sizes.end(), tensor.sizes().begin());
-  const auto element_count =
-      executorch::aten::compute_numel(sizes.data(), sizes.size());
+  auto element_count_result =
+      executorch::aten::safe_numel(sizes.data(), sizes.size());
+  ET_CHECK_MSG(
+      element_count_result.ok(),
+      "safe_numel failed: %d",
+      static_cast<int>(element_count_result.error()));
+  const auto element_count = element_count_result.get();
   const auto parent_element_count = tensor.numel();
   ET_CHECK_MSG(
       element_count <= parent_element_count,
 
@@ -113,16 +113,21 @@ TensorPtr empty_strided(
     std::vector<executorch::aten::StridesType> strides,
     executorch::aten::ScalarType type,
     executorch::aten::TensorShapeDynamism dynamism) {
-  const auto numel = static_cast<size_t>(
-      executorch::aten::compute_numel(sizes.data(), sizes.size()));
-  const auto elem_size =
-      static_cast<size_t>(executorch::aten::elementSize(type));
-  size_t nbytes = 0;
+  auto numel_result = executorch::aten::safe_numel(sizes.data(), sizes.size());
   ET_CHECK_MSG(
-      !c10::mul_overflows(numel, elem_size, &nbytes),
-      "empty_strided size overflow: numel %zu * element size %zu",
+      numel_result.ok(),
+      "safe_numel failed: %d",
+      static_cast<int>(numel_result.error()));
+  const ssize_t numel = numel_result.get();
+  size_t nbytes;
+  ET_CHECK_MSG(
+      !c10::mul_overflows(
+          static_cast<size_t>(numel),
+          executorch::aten::elementSize(type),
+          &nbytes),
+      "Overflow computing nbytes: numel=%zd element_size=%zu",
       numel,
-      elem_size);
+      executorch::aten::elementSize(type));
   std::vector<uint8_t> data(nbytes);
   return make_tensor_ptr(
       std::move(sizes),
 
@@ -84,20 +84,22 @@ inline void js_array_push(val_array<T>& array, const T& value) {
   _(float, Float)                           \
   _(int64_t, Long)
 
-inline ssize_t compute_expected_numel(
+inline ::executorch::runtime::Result<ssize_t> compute_expected_numel(
     const std::vector<torch::executor::Tensor::SizesType>& sizes) {
-  return executorch::aten::compute_numel(sizes.data(), sizes.size());
+  return executorch::aten::safe_numel(sizes.data(), sizes.size()); // or Error
 }
 
 template <typename T>
 inline void assert_valid_numel(
     const std::vector<T>& data,
     const std::vector<torch::executor::Tensor::SizesType>& sizes) {
   auto computed_numel = compute_expected_numel(sizes);
+  THROW_IF_ERROR( // reject sizes whose element count could not be computed
+      computed_numel.error(), "Invalid tensor sizes: numel computation failed");
   THROW_IF_FALSE(
-      data.size() >= computed_numel,
+      data.size() >= static_cast<size_t>(computed_numel.get()),
       "Required %ld elements, given %ld",
-      computed_numel,
+      computed_numel.get(),
       data.size());
 }