Skip to content

Commit f1fa6fc

Browse files
Reza Sajadiany and facebook-github-bot
authored and committed
memory planner to allocate element-wise output buffer in place of input (#19067)
Summary: Adds a pass, namely `InPlaceElemWiseLikeOpsPass`, which checks for possible elem-wise ops in the graph without a skip connection from the input. The pass then annotates the output as a new alloc type called `memory.alloc_inplace`. In memory planning, nodes with an output spec type of `alloc_inplace` get their output allocated in place of the same node's input. Differential Revision: D100371295
1 parent d858cd9 commit f1fa6fc

7 files changed

Lines changed: 337 additions & 7 deletions

File tree

exir/emit/_emitter.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1755,6 +1755,10 @@ def call_function( # pyre-fixme[14]
17551755
assert len(args) == 1
17561756
return self._emit_spec(self.node.meta["spec"])
17571757

1758+
elif target == memory.alloc_inplace:
1759+
assert len(args) == 2
1760+
return self._emit_spec(self.node.meta["spec"])
1761+
17581762
elif target == memory.view:
17591763
return self._emit_view(args)
17601764

exir/memory.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,26 @@ def alloc(spec: AllocSpec) -> pytree.PyTree:
3333
return torch.empty(shape, dtype=dtype)
3434

3535

36+
def alloc_inplace(base: torch.Tensor, spec: AllocSpec) -> pytree.PyTree:
    """Allocate an output tensor intended to share memory with ``base``.

    Used by InPlaceElemWiseLikeOpsPass to tell the memory planner that the
    output should be placed at the same offset as the base (input) tensor.
    The base must have allocated_memory >= the output's allocated_memory,
    and the base must be dead after the consuming op.

    At runtime this is indistinguishable from alloc(); the in-place
    placement is resolved entirely at planning time.
    """
    # A list spec describes multiple outputs; allocate each against the
    # same base tensor.
    if isinstance(spec, list):
        return [alloc_inplace(base, sub_spec) for sub_spec in spec]

    tensor_shape, tensor_dtype = spec
    return torch.empty(eval_shape(tensor_shape), dtype=tensor_dtype)
3656
def free(spec: TensorSpec) -> None:
3757
"""
3858
The function is nop. The major purpose is to put it in the Fx IR.

exir/memory_planning.py

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,16 @@ def verify_storage_reuse(
186186
if not allow_lifetime_and_storage_overlap and self.lifetime_overlap(
187187
lhs_spec, rhs_spec
188188
):
189-
raise InternalError(
190-
f"Unexpected storage overlap: {Verifier._debug_message_from_specs(lhs_spec, rhs_spec)}"
189+
# In-place element-wise ops intentionally share storage
190+
# between input and output despite overlapping lifetimes.
191+
is_inplace_pair = (
192+
lhs_spec.inplace_base is rhs_spec
193+
or rhs_spec.inplace_base is lhs_spec
191194
)
195+
if not is_inplace_pair:
196+
raise InternalError(
197+
f"Unexpected storage overlap: {Verifier._debug_message_from_specs(lhs_spec, rhs_spec)}"
198+
)
192199

193200
# Check that each mem_obj_id is consistent with whether the tensors have
194201
# storage overlap
@@ -485,6 +492,7 @@ def collect_specs_from_nodes( # noqa: C901
485492
or node.target
486493
in [
487494
memory.alloc,
495+
memory.alloc_inplace,
488496
memory.view,
489497
operator.getitem,
490498
torch.ops.higher_order.cond,
@@ -838,22 +846,65 @@ def greedy(
838846

839847
sorted_specs.reverse()
840848

849+
deferred_inplace: List[TensorSpec] = []
850+
841851
for spec in sorted_specs:
842-
# Create an entry for this TensorSpec in the result object that we'll be
843-
# returning from this algorithm.
844852
spec_alloc_result = greedy_result.spec_dict.get(spec, SpecAllocResult(0, 0, 0))
845853
if spec.mem_id is None:
846854
spec_alloc_result.mem_id = 1
847855
else:
848856
spec_alloc_result.mem_id = spec.mem_id
849857
greedy_result.spec_dict[spec] = spec_alloc_result
850858
spec.realign(alignment)
859+
860+
if spec.inplace_base is not None:
861+
deferred_inplace.append(spec)
862+
continue
863+
851864
spec2obj[spec] = pick_shared_obj(
852865
shared_objects[spec_alloc_result.mem_id],
853866
spec,
854867
allow_overlapping_allocations,
855868
)
856869

870+
remaining = list(deferred_inplace)
871+
while remaining:
872+
progress = False
873+
next_remaining = []
874+
for spec in remaining:
875+
base = spec.inplace_base
876+
if base not in spec2obj:
877+
next_remaining.append(spec)
878+
continue
879+
progress = True
880+
sobj = spec2obj[base]
881+
882+
base_alloc_result = greedy_result.spec_dict[base]
883+
spec_alloc_result = greedy_result.spec_dict[spec]
884+
spec_alloc_result.mem_id = base_alloc_result.mem_id
885+
886+
base_alloc_offset = None
887+
for alloc_entry in sobj.allocations:
888+
if alloc_entry.spec is base:
889+
base_alloc_offset = alloc_entry.offset
890+
break
891+
assert base_alloc_offset is not None, (
892+
f"Base allocation entry not found in shared object for spec "
893+
f"with allocated_memory={spec.allocated_memory}"
894+
)
895+
sobj.first_used_index = min(sobj.first_used_index, spec.lifetime[0])
896+
sobj.last_used_index = max(sobj.last_used_index, spec.lifetime[1])
897+
sobj.allocations.append(AllocationSpec(base_alloc_offset, spec))
898+
spec2obj[spec] = sobj
899+
if not progress:
900+
unresolved = ", ".join(
901+
f"allocated_memory={s.allocated_memory}" for s in next_remaining
902+
)
903+
raise InternalError(
904+
f"Circular or unresolvable in-place dependency chain: {unresolved}"
905+
)
906+
remaining = next_remaining
907+
857908
if len(shared_objects) == 0:
858909
# Cannot find any tensor in the graph that needs to be allocated.
859910
# Return [0, 0] to be consistent with default behavior of naive.
@@ -1012,6 +1063,12 @@ def _allocate_buf(bufsizes: List[int], mem_id: int, allocated: int) -> int:
10121063
bufsizes = cast(List[int], bufsizes)
10131064

10141065
for spec in specs:
1066+
if spec.inplace_base is not None:
1067+
raise InternalError(
1068+
"The naive memory planning algorithm does not support in-place "
1069+
"element-wise ops (inplace_base). Use the greedy algorithm instead."
1070+
)
1071+
10151072
spec_alloc_result = naive_result.spec_dict.get(spec, SpecAllocResult(0, 0, 0))
10161073
# assume a single memory layer which has mem_id 1
10171074
if spec.mem_id is None:

exir/passes/__init__.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@
7676
"ToDevicePass",
7777
"EdgeToBackendOpsPass",
7878
"MemoryFormatOpsPass",
79+
"InPlaceElemWiseLikeOpsPass",
80+
"ElemWiseInPlaceAwareMemoryPlanningPass",
7981
"MemoryPlanningPass",
8082
"HintBasedSymShapeEvalPass",
8183
"insert_write_back_for_buffers_pass",
@@ -260,6 +262,7 @@ def callWithLoggerEnabled(self, graph_module: torch.fx.GraphModule) -> None:
260262
# we won't see it in the input graph to the to_out_variant pass, unless
261263
# it's retraced after running to_out_variant with the first trace.
262264
memory.alloc,
265+
memory.alloc_inplace,
263266
memory.view,
264267
executorch_call_delegate,
265268
}
@@ -444,6 +447,109 @@ def get_submodule(node: torch.fx.Node) -> torch.fx.GraphModule:
444447
return PassResult(graph_module, True)
445448

446449

450+
class InPlaceElemWiseLikeOpsPass(PassBase):
    """Replace memory.alloc with memory.alloc_inplace for element-wise-like ops.

    For out-variant ops that are element-wise, the output can be allocated in
    the same memory as the input when:
    1. output_bytes <= input_bytes
    2. The input tensor has no other users after this op (dead after consumption)

    This pass replaces the memory.alloc node for the output with
    memory.alloc_inplace(input_node, spec), which signals to the memory planner
    to place the output at the same offset as the input.

    Eligible ops are specified via the constructor's eligible_ops parameter;
    otherwise a default set provided by _default_eligible_ops is used.
    """

    @staticmethod
    def _default_eligible_ops() -> Set[Callable[..., Any]]:
        # Default allow-list of out-variant ops considered element-wise-like.
        return {
            torch.ops.cortex_m.quantize_per_tensor.out,
        }

    def __init__(self, eligible_ops: Optional[Set[Callable[..., Any]]] = None) -> None:
        # eligible_ops: explicit allow-list of op overloads to consider;
        # None selects the built-in default set.
        self._eligible_ops = (
            eligible_ops if eligible_ops is not None else self._default_eligible_ops()
        )

    def _is_eligible(self, target: Any) -> bool:
        # Membership test against the configured allow-list.
        return target in self._eligible_ops

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        """Rewrite eligible alloc nodes to alloc_inplace; returns whether the
        graph was modified."""
        changed = False
        for node in graph_module.graph.nodes:
            # Only eligible out-variant call_function nodes are candidates.
            if node.op != "call_function":
                continue
            if not self._is_eligible(node.target):
                continue
            if not memory_planning._is_out_var_node(node):
                continue

            # Only handle ops with exactly one `out` argument; multi-output
            # ops are skipped.
            out_arg_names = get_out_args_from_opoverload(node.target)
            if len(out_arg_names) != 1:
                continue

            # The out kwarg must currently be a plain memory.alloc node —
            # anything else (already rewritten, view, etc.) is left alone.
            out_alloc_node = node.kwargs.get(out_arg_names[0])
            if out_alloc_node is None or out_alloc_node.target != memory.alloc:
                continue

            # Need a concrete (fake) tensor value to compute the output's
            # byte size for the input-size eligibility check below.
            out_val = out_alloc_node.meta.get("val")
            if out_val is None or not isinstance(out_val, torch.Tensor):
                continue
            out_nbytes = out_val.nelement() * out_val.element_size()

            # Pick the first positional tensor input that (a) is at least as
            # large as the output and (b) has no users other than this node
            # and memory.free markers, i.e. it is dead after this op.
            input_node = None
            for arg in node.args:
                if not isinstance(arg, torch.fx.Node):
                    continue
                in_val = arg.meta.get("val")
                if in_val is None or not isinstance(in_val, torch.Tensor):
                    continue
                if in_val.nelement() * in_val.element_size() < out_nbytes:
                    continue
                # Any other live user means the input's storage cannot be
                # clobbered by the output.
                if any(u != node and u.target != memory.free for u in arg.users):
                    continue
                input_node = arg
                break
            if input_node is None:
                continue

            # Swap memory.alloc(spec) for memory.alloc_inplace(input, spec),
            # preserving the original alloc node's metadata (spec/val/etc.).
            with graph_module.graph.inserting_before(out_alloc_node):
                inplace_node = graph_module.graph.call_function(
                    memory.alloc_inplace,
                    (input_node, out_alloc_node.args[0]),
                )
            inplace_node.meta = out_alloc_node.meta.copy()

            out_alloc_node.replace_all_uses_with(inplace_node)
            # Safe to erase here: the alloc node has no remaining users.
            graph_module.graph.erase_node(out_alloc_node)
            changed = True

        return PassResult(graph_module, changed)
531+
532+
533+
class ElemWiseInPlaceAwareMemoryPlanningPass(MemoryPlanningPass):
    """Memory planning that applies InPlaceElemWiseLikeOpsPass beforehand.

    Running the in-place annotation pass first lets the planner place
    eligible element-wise outputs over their (dead) inputs.
    """

    def __init__(
        self,
        eligible_ops: Optional[Set[Callable[..., Any]]] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        # Annotation pass executed immediately before planning proper.
        self._inplace_pass = InPlaceElemWiseLikeOpsPass(eligible_ops)

    def run(
        self,
        graph_module: torch.fx.GraphModule,
        graph_signature=None,
    ) -> PassResult:
        # First annotate eligible alloc nodes as alloc_inplace, then hand the
        # (possibly rewritten) graph to the regular memory planning run.
        self._inplace_pass(graph_module)
        return super().run(graph_module, graph_signature)
551+
552+
447553
def to_scratch_op_pass(graph_module: torch.fx.GraphModule) -> PassResult:
448554
for node in graph_module.graph.nodes:
449555
if node.op != "call_function":

exir/passes/memory_planning_pass.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import torch
1515
from executorch.exir._warnings import deprecated
1616
from executorch.exir.error import internal_assert
17-
from executorch.exir.memory import alloc
17+
from executorch.exir.memory import alloc, alloc_inplace
1818
from executorch.exir.memory_planning import (
1919
_is_out_var_node,
2020
apply_algo,
@@ -192,6 +192,12 @@ def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None:
192192
if len(out_arg_names) == 1:
193193
out_alloc_node = node.kwargs[out_arg_names[0]]
194194
out_alloc_node.meta["spec"] = node.meta["spec"]
195+
if out_alloc_node.target == alloc_inplace and isinstance(
196+
out_alloc_node.args[0], Node
197+
):
198+
base_spec = out_alloc_node.args[0].meta.get("spec")
199+
if base_spec is not None:
200+
node.meta["spec"].inplace_base = base_spec
195201
continue
196202
specs = get_node_tensor_specs(node)
197203
i = 0
@@ -206,7 +212,7 @@ def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None:
206212
# dont increment i as we dont have a spec for this node
207213
internal_assert(
208214
out_alloc_node.op == "call_function"
209-
and out_alloc_node.target == alloc,
215+
and out_alloc_node.target in (alloc, alloc_inplace),
210216
f"Out-var's node {out_alloc_node} has op {out_alloc_node.op} and target {out_alloc_node.target}",
211217
)
212218
internal_assert(

exir/tensor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,9 @@ def init_mem_planning_fields(self) -> None:
216216
self.mem_id = None
217217
self.mem_obj_id = None
218218
self.mem_offset = None
219+
# Set by InPlaceElemWiseLikeOpsPass: the base TensorSpec whose memory
220+
# this spec should share (output allocated in-place over the input).
221+
self.inplace_base: Optional["TensorSpec"] = None
219222

220223
@property
221224
def dtype(self) -> torch.dtype:

0 commit comments

Comments
 (0)