Commit 3dd3158

Update on "Use caching allocator for runner (#15730)"
Summary: We observed that on iOS this improves perf by 6%, because the SDPA op makes temporary allocations. There is no significant difference on Android.

ghstack-source-id: 328001114
exported-using-ghexport

Reviewed By: navsud, derekdixu

Differential Revision: D86120038

[ghstack-poisoned]
2 parents 5b9bf5e + 42830ac commit 3dd3158

14 files changed

Lines changed: 226 additions & 65 deletions
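
Illustration only (not part of the commit, and not the ExecuTorch API): the idea behind a caching allocator is to keep freed blocks keyed by size so that repeated temporary allocations, such as the scratch buffers SDPA requests on every forward call, are served from a cache instead of hitting malloc/free each time. A minimal Python sketch of that concept:

from collections import defaultdict

class CachingAllocator:
    """Hypothetical sketch: reuse freed buffers of the same size."""

    def __init__(self):
        self._free = defaultdict(list)  # size -> list of reusable buffers

    def allocate(self, size: int) -> bytearray:
        # Reuse a cached buffer if one of this size was freed earlier.
        if self._free[size]:
            return self._free[size].pop()
        return bytearray(size)

    def free(self, buf: bytearray) -> None:
        # Keep the buffer for the next allocation of the same size.
        self._free[len(buf)].append(buf)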

backends/cadence/aot/memory_constraints.py

Lines changed: 44 additions & 9 deletions
@@ -452,6 +452,45 @@ def is_cat_along_outermost_dim(
                 return False
         return True
 
+    def _has_duplicate_resolved_sources(
+        self, cat_tensors: Sequence[torch.fx.Node]
+    ) -> bool:
+        """Return True if two cat inputs resolve to the same underlying tensor."""
+        if len(cat_tensors) != len(set(cat_tensors)):
+            return True
+        resolved_sources = set()
+        for arg in cat_tensors:
+            resolved = arg
+            while (
+                info := self.constraint.get_relative_placement_source(resolved)
+            ) is not None:
+                if self.constraint.is_alias_of(info.source, resolved):
+                    resolved = info.source
+                else:
+                    break
+            if id(resolved) in resolved_sources:
+                return True
+            resolved_sources.add(id(resolved))
+        return False
+
+    def _has_unaligned_cat_tensors(
+        self,
+        graph: torch.fx.Graph,
+        node: torch.fx.Node,
+        cat_tensors: Sequence[torch.fx.Node],
+    ) -> bool:
+        """Return True if any non-placeholder cat tensor has a misaligned offset."""
+        if is_node_in_flattened_output(graph, node):
+            return False
+        expected_alignment = 8
+        relative_offsets = get_relative_offsets_of_cat_tensors(cat_tensors)
+        for idx, arg in enumerate(cat_tensors):
+            if not (arg.op == "placeholder") and (
+                relative_offsets[idx] & (expected_alignment - 1) != 0
+            ):
+                return True
+        return False
+
     # If A = cat(B, C), and the concatenation is along the outermost dimension, then
     # we can optimize away this cat operation if (1) B and C are placed contiguously,
     # and (2) the absolute memory location of tensor A is the same as B. This function
@@ -486,21 +525,17 @@ def is_removable_cat_op(
             return False
         # If the same tensor appears multiple times in the cat inputs,
         # we cannot place it at multiple different offsets relative to the output.
-        if len(cat_tensors) != len(set(cat_tensors)):
+        # Also check resolved sources: two different alias nodes may resolve to
+        # the same underlying tensor, which can't be at two offsets.
+        if self._has_duplicate_resolved_sources(cat_tensors):
             return False
 
         # Many ops in HiFi require the input to be aligned to 8-byte boundary.
         # If the cat is not the graph's output, then ensure that the relative
        # offset of any concatenated non-placeholder tensor is a multiple of
         # 8 bytes,
-        if not is_node_in_flattened_output(graph_module.graph, node):
-            expected_alignment = 8
-            relative_offsets = get_relative_offsets_of_cat_tensors(cat_tensors)
-            for idx, arg in enumerate(cat_tensors):
-                if not (arg.op == "placeholder") and (
-                    relative_offsets[idx] & (expected_alignment - 1) != 0
-                ):
-                    return False
+        if self._has_unaligned_cat_tensors(graph_module.graph, node, cat_tensors):
+            return False
 
         return True

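A small sketch (not part of the commit) of the power-of-two alignment test that _has_unaligned_cat_tensors applies to each relative offset: offset & (alignment - 1) is nonzero exactly when the offset is not a multiple of the alignment.

def is_misaligned(offset: int, alignment: int = 8) -> bool:
    # Valid only for power-of-two alignments: the low bits must all be zero.
    return offset & (alignment - 1) != 0

assert is_misaligned(12)        # 12 is not a multiple of 8 -> cat is not removable
assert not is_misaligned(16)    # 16 is a multiple of 8 -> still removable
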
backends/cadence/hifi/operators/op_permute_copy.cpp

Lines changed: 2 additions & 18 deletions
@@ -73,8 +73,7 @@ Tensor& permute_copy_out(
 
   bool optimized = false;
 
-  if (out.scalar_type() == ScalarType::Float ||
-      out.scalar_type() == ScalarType::Char ||
+  if (out.scalar_type() == ScalarType::Char ||
       out.scalar_type() == ScalarType::Byte)
     optimized = true;
 
@@ -101,22 +100,7 @@ Tensor& permute_copy_out(
     p_permute_vec[i] = dims[i];
   }
 
-  if (in_type == ScalarType::Float) {
-    WORD32* p_inp = (WORD32*)in.const_data_ptr<float>();
-    WORD32* p_out = (WORD32*)out.mutable_data_ptr<float>();
-
-    WORD32 ret_val = xa_nn_transpose_32_32(
-        p_out,
-        p_out_shape,
-        p_inp,
-        p_inp_shape,
-        p_permute_vec,
-        num_out_dims,
-        num_inp_dims);
-
-    ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
-
-  } else if (in_type == ScalarType::Char) {
+  if (in_type == ScalarType::Char) {
     WORD8* p_inp = (WORD8*)in.const_data_ptr<char>();
     WORD8* p_out = (WORD8*)out.mutable_data_ptr<char>();

backends/cadence/hifi/operators/op_softmax.cpp

Lines changed: 3 additions & 0 deletions
@@ -68,6 +68,9 @@ Tensor& _softmax_out(
   if (in.dim() > kNnlibMaxDim)
     optimized = false;
 
+  if (dim < in.dim() - 1)
+    optimized = false;
+
   if (optimized) {
     int* p_inp = (int*)in.const_data_ptr<float>();
     int* out_data = (int*)out.mutable_data_ptr<float>();

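For context, an illustration only (assuming dim has already been normalized to a non-negative index): the NNLib path handles softmax over the innermost dimension only, so any other dim now falls through to the portable kernel.

import torch

x = torch.randn(2, 3, 4)
torch.nn.functional.softmax(x, dim=2)  # dim == in.dim() - 1: eligible for the fast path
torch.nn.functional.softmax(x, dim=1)  # dim < in.dim() - 1: portable fallback
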
backends/cadence/hifi/operators/op_where.cpp

Lines changed: 3 additions & 0 deletions
@@ -81,6 +81,9 @@ Tensor& where_self_out(
   if ((broadcast == 1) && (max_dim > kNnlibMaxDim))
     optimized = 0;
 
+  if (cond_is_broadcasted)
+    optimized = 0;
+
   if (optimized) {
     const float* a_data = a.const_data_ptr<float>();
     const float* b_data = b.const_data_ptr<float>();

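Illustration only: a condition tensor that must be broadcast against the inputs now disables the optimized path; the result is still produced, just via the portable kernel.

import torch

cond = torch.tensor([True, False])  # shape (2,), broadcast against (3, 2)
a = torch.randn(3, 2)
b = torch.randn(3, 2)
torch.where(cond, a, b)  # condition is broadcasted -> portable kernel path
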
backends/cadence/utils/facto_util.py

Lines changed: 4 additions & 3 deletions
@@ -249,7 +249,7 @@ def random_size_constraint(deps: object, r: int, d: int) -> int:
         case "permute_copy.default":
             tensor_constraints.extend(
                 [
-                    cp.Dtype.In(lambda deps: [torch.float32, torch.int8, torch.uint8]),
+                    cp.Dtype.In(lambda deps: [torch.float32, torch.int32]),
                     cp.Rank.Le(
                         lambda deps: 5
                     ), # xa_nn_transpose only supports up to 5D
@@ -391,12 +391,13 @@ def random_size_constraint(deps: object, r: int, d: int) -> int:
             tensor_constraints.extend(
                 [
                     cp.Dtype.In(lambda deps: [torch.float32, torch.int32]),
+                    cp.Value.Ge(lambda deps, dtype, struct: 0),
                 ]
             )
         case "div.Tensor_mode" | "minimum.default":
             if index == 0:
                 tensor_constraints = [
-                    cp.Dtype.In(lambda deps: [torch.int64, torch.int32, torch.float32]),
+                    cp.Dtype.In(lambda deps: [torch.int32, torch.float32]),
                     cp.Value.Ge(lambda deps, dtype, struct: -(2**4)),
                     cp.Value.Le(lambda deps, dtype, struct: 2**4),
                     cp.Rank.Ge(lambda deps: 1),
@@ -405,7 +406,7 @@ def random_size_constraint(deps: object, r: int, d: int) -> int:
                 ]
             else:
                 tensor_constraints = [
-                    cp.Dtype.In(lambda deps: [torch.int64, torch.int32, torch.float32]),
+                    cp.Dtype.In(lambda deps: [torch.int32, torch.float32]),
                     cp.Value.Ge(lambda deps, dtype, struct: -(2**4)),
                     cp.Value.Le(lambda deps, dtype, struct: 2**4),
                     cp.Value.Ne(

backends/mlx/ops.py

Lines changed: 26 additions & 0 deletions
@@ -418,6 +418,32 @@ def handler(P: MLXProgramBuilder, n: Node) -> Slot:
     REGISTRY.register(target=[_target])(_make_unary_handler(_node_cls, _op_name))
 
 
+# ---------------------------------------------------------------------------
+# Numerical checks
+# ---------------------------------------------------------------------------
+
+
+@REGISTRY.register(target=[torch.ops.aten.isnan.default])
+def _isnan_handler(P: MLXProgramBuilder, n: Node) -> Slot:
+    """Handle aten.isnan - check for NaN values element-wise.
+
+    isnan(x) is equivalent to x != x (NaN is the only value not equal to itself).
+    """
+    args = P.args(n)
+    require_args(args, 1, 1, "aten.isnan")
+    require_kwargs(P.kwargs(n), set(), "aten.isnan")
+    x = args[0]
+    out = P.make_or_get_slot(n)
+    P.emit(
+        NotEqualNode(
+            a=P.slot_to_tid(x),
+            b=P.slot_to_tid(x),
+            out=P.slot_to_tid(out),
+        )
+    )
+    return out
+
+
 _BINARY_OPS: List[Tuple[List[Any], Any, str, bool]] = [
     (
         [torch.ops.aten.mul.Tensor, torch.ops.aten.mul.Scalar],

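A quick check (illustration only) of the self-inequality identity the handler relies on: NaN is the only floating-point value that compares unequal to itself, so x != x reproduces torch.isnan(x).

import torch

x = torch.tensor([1.0, float("nan"), float("inf")])
assert torch.equal(torch.isnan(x), x != x)  # tensor([False, True, False])
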
backends/mlx/test/test_ops.py

Lines changed: 17 additions & 0 deletions
@@ -4004,6 +4004,22 @@ def fn(shape, dtype):
     return fn
 
 
+def _nan_input_fn(nan_frac: float = 0.3):
+    """Return a callable(shape, dtype) that generates inputs with some NaN values.
+
+    Args:
+        nan_frac: Fraction of elements to set to NaN (default 0.3 = 30%).
+    """
+
+    def fn(shape, dtype):
+        x = torch.randn(shape, dtype=dtype)
+        mask = torch.rand(shape) > (1.0 - nan_frac)
+        x[mask] = float("nan")
+        return (x,)
+
+    return fn
+
+
 # Standard shape and dtype configs used by unary tests.
 _SHAPES_3 = [(16,), (4, 4), (2, 3, 4)]
 _SHAPES_2 = [(16,), (4, 4)]
@@ -4095,6 +4111,7 @@ def create_model(self) -> nn.Module:
     {"op_name": "abs", "op_fn": torch.abs},
     {"op_name": "neg", "op_fn": torch.neg},
     {"op_name": "logical_not","op_fn": torch.logical_not, "shapes": [(2, 3, 4), (10,), (4, 8)], "dtypes": [torch.bool], "input_fn": _bool_input_fn()},
+    {"op_name": "isnan", "op_fn": torch.isnan, "shapes": _SHAPES_3, "dtypes": [torch.float32, torch.float16, torch.bfloat16], "input_fn": _nan_input_fn()},
     # activations
     {"op_name": "relu", "op_fn": torch.relu, "shapes": [(2, 3, 4), (10,), (4, 8), (2, 8, 16), (1, 128, 64)], "dtypes": [torch.float32], "input_fn": _input_fn(scale=2, offset=-1)},
     {"op_name": "sigmoid", "op_fn": torch.sigmoid, "shapes": [(2, 3, 4), (10,), (4, 8), (2, 8, 16), (1, 1, 128)], "dtypes": [torch.float32], "input_fn": _input_fn(scale=2)},

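Usage sketch (illustration only, with the helper above in scope): calling the returned function yields a single-element input tuple with roughly nan_frac of its entries set to NaN.

import torch

(x,) = _nan_input_fn(nan_frac=0.5)((4, 4), torch.float32)
print(torch.isnan(x).float().mean())  # ~0.5 on average
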
backends/nxp/backend/ir/converter/conversion/translator.py

Lines changed: 2 additions & 2 deletions
@@ -601,7 +601,7 @@ def numpy_type_to_tf_lite(numpy_type: np.dtype) -> TensorType: # noqa C901
     elif numpy_type == np.int64:
         return TensorType.INT64
 
-    elif numpy_type == np.string_:
+    elif numpy_type == np.bytes_:
         return TensorType.STRING
 
     elif numpy_type == np.bool_:
@@ -659,7 +659,7 @@ def tf_lite_type_to_numpy(tfl_type: TensorType) -> np.ScalarType: # noqa C901
         return np.dtype(np.int64)
 
     elif tfl_type == TensorType.STRING:
-        return np.dtype(np.string_)
+        return np.dtype(np.bytes_)
 
     elif tfl_type == TensorType.BOOL:
         return np.dtype(np.bool_)

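Background (illustration only): np.string_ was an alias of np.bytes_ in NumPy 1.x and was removed in NumPy 2.0, so the converter now uses the surviving name; both map to the "S" (bytes) dtype used here for TFLite STRING tensors.

import numpy as np

assert np.dtype(np.bytes_) == np.dtype("S")  # the same dtype np.string_ used to alias
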
devtools/scripts/BUCK

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+load("@fbcode_macros//build_defs:build_file_migration.bzl", "non_fbcode_target")
+
+oncall("executorch")
+
+non_fbcode_target(
+    _kind = native.sh_binary,
+    name = "_benchmark_android_sh",
+    main = "benchmark_android.sh",
+)
+
+non_fbcode_target(
+    _kind = native.command_alias,
+    name = "benchmark_android",
+    exe = ":_benchmark_android_sh",
+    args = ["--build-tool", "buck"],
+)
