memory planner to allocate element-wise output buffer in place of input

Reza Sajadiany · facebook-github-bot · commit 30c6c9efbba8 · 2026-04-23T07:02:25.000-07:00
Summary:
Adds a pass, `InPlaceElemWiseLikeOpsPass`, which looks for eligible element-wise ops in the graph whose input has no skip connection (i.e., the input has no users other than the op itself).
The pass then annotates the output with a new alloc type, `memory.alloc_inplace`.
During memory planning, a node whose output is allocated via `alloc_inplace` gets that output placed at the same memory location as the node's input.

Differential Revision: D100371295
diff --git a/exir/capture/_config.py b/exir/capture/_config.py
@@ -61,6 +61,7 @@ class ExecutorchBackendConfig:
     # EdgeProgramManager or can be defined per program.
     memory_planning_pass: Union[PassType, Dict[str, PassType]] = MemoryPlanningPass()
     to_out_var_pass: PassType = ToOutVarPass(ignore_to_out_var_failure=False)
+    inplace_elem_wise_like_ops_pass: Optional[PassType] = None
     dynamic_memory_planning_mode: DynamicMemoryPlanningMode = (
         DynamicMemoryPlanningMode.UPPER_BOUND
     )
diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py
@@ -1755,6 +1755,9 @@ def call_function(  # pyre-fixme[14]
             assert len(args) == 1
             return self._emit_spec(self.node.meta["spec"])
 
+        elif target == memory.alloc_inplace:
+            return self._emit_spec(self.node.meta["spec"])
+
         elif target == memory.view:
             return self._emit_view(args)
 
diff --git a/exir/memory.py b/exir/memory.py
@@ -33,6 +33,26 @@ def alloc(spec: AllocSpec) -> pytree.PyTree:
     return torch.empty(shape, dtype=dtype)
 
 
+def alloc_inplace(base: torch.Tensor, spec: AllocSpec) -> torch.Tensor:
+    """
+    Allocate output tensor in the same memory as base tensor.
+
+    This is used by InPlaceElemWiseLikeOpsPass to signal to the memory planner
+    that the output should share the same memory offset as the base (input)
+    tensor. The base tensor must have allocated_memory >= output's
+    allocated_memory, and the base must be dead after the consuming op.
+
+    At runtime this behaves identically to alloc() — the in-place semantics
+    are resolved at planning time.
+    """
+    if isinstance(spec, list):
+        return [alloc_inplace(base, s) for s in spec]
+
+    shape, dtype = spec
+    shape = eval_shape(shape)
+    return torch.empty(shape, dtype=dtype)
+
+
 def free(spec: TensorSpec) -> None:
     """
     The function is nop. The major purpose is to put it in the Fx IR.
diff --git a/exir/memory_planning.py b/exir/memory_planning.py
@@ -186,9 +186,16 @@ def verify_storage_reuse(
                 if not allow_lifetime_and_storage_overlap and self.lifetime_overlap(
                     lhs_spec, rhs_spec
                 ):
-                    raise InternalError(
-                        f"Unexpected storage overlap: {Verifier._debug_message_from_specs(lhs_spec, rhs_spec)}"
+                    # In-place element-wise ops intentionally share storage
+                    # between input and output despite overlapping lifetimes.
+                    is_inplace_pair = (
+                        getattr(lhs_spec, "inplace_base", None) is rhs_spec
+                        or getattr(rhs_spec, "inplace_base", None) is lhs_spec
                     )
+                    if not is_inplace_pair:
+                        raise InternalError(
+                            f"Unexpected storage overlap: {Verifier._debug_message_from_specs(lhs_spec, rhs_spec)}"
+                        )
 
                 # Check that each mem_obj_id is consistent with whether the tensors have
                 # storage overlap
@@ -485,6 +492,7 @@ def collect_specs_from_nodes(  # noqa: C901
                 or node.target
                 in [
                     memory.alloc,
+                    memory.alloc_inplace,
                     memory.view,
                     operator.getitem,
                     torch.ops.higher_order.cond,
@@ -838,22 +846,46 @@ def greedy(
 
     sorted_specs.reverse()
 
+    deferred_inplace: List[TensorSpec] = []
+
     for spec in sorted_specs:
-        # Create an entry for this TensorSpec in the result object that we'll be
-        # returning from this algorithm.
         spec_alloc_result = greedy_result.spec_dict.get(spec, SpecAllocResult(0, 0, 0))
         if spec.mem_id is None:
             spec_alloc_result.mem_id = 1
         else:
             spec_alloc_result.mem_id = spec.mem_id
         greedy_result.spec_dict[spec] = spec_alloc_result
         spec.realign(alignment)
+
+        if getattr(spec, "inplace_base", None) is not None:
+            deferred_inplace.append(spec)
+            continue
+
         spec2obj[spec] = pick_shared_obj(
             shared_objects[spec_alloc_result.mem_id],
             spec,
             allow_overlapping_allocations,
         )
 
+    for spec in deferred_inplace:
+        base = spec.inplace_base
+        assert base in spec2obj, (
+            f"In-place base spec not found in allocated objects. "
+            f"Base allocated_memory={base.allocated_memory}, "
+            f"spec allocated_memory={spec.allocated_memory}"
+        )
+        sobj = spec2obj[base]
+
+        base_alloc_offset = 0
+        for alloc_entry in sobj.allocations:
+            if alloc_entry.spec is base:
+                base_alloc_offset = alloc_entry.offset
+                break
+        sobj.first_used_index = min(sobj.first_used_index, spec.lifetime[0])
+        sobj.last_used_index = max(sobj.last_used_index, spec.lifetime[1])
+        sobj.allocations.append(AllocationSpec(base_alloc_offset, spec))
+        spec2obj[spec] = sobj
+
     if len(shared_objects) == 0:
         # Cannot find any tensor in the graph that needs to be allocated.
         # Return [0, 0] to be consistent with default behavior of naive.
diff --git a/exir/passes/__init__.py b/exir/passes/__init__.py
@@ -32,6 +32,7 @@
     to_scratch_op,
 )
 from executorch.exir.pass_base import ExportPass
+from executorch.exir.tensor import TensorSpec
 from executorch.exir.pass_manager import PassManager, PassType
 from executorch.exir.passes.const_prop_pass import ConstPropPass
 from executorch.exir.passes.debug_handle_generator_pass import DebugHandleGeneratorPass
@@ -76,6 +77,7 @@
     "ToDevicePass",
     "EdgeToBackendOpsPass",
     "MemoryFormatOpsPass",
+    "InPlaceElemWiseLikeOpsPass",
     "MemoryPlanningPass",
     "HintBasedSymShapeEvalPass",
     "insert_write_back_for_buffers_pass",
@@ -260,6 +262,7 @@ def callWithLoggerEnabled(self, graph_module: torch.fx.GraphModule) -> None:
     # we won't see it in the input graph to the to_out_variant pass, unless
     # it's retraced after running to_out_variant with the first trace.
     memory.alloc,
+    memory.alloc_inplace,
     memory.view,
     executorch_call_delegate,
 }
@@ -444,6 +447,86 @@ def get_submodule(node: torch.fx.Node) -> torch.fx.GraphModule:
         return PassResult(graph_module, True)
 
 
+class InPlaceElemWiseLikeOpsPass(PassBase):
+    """Replace memory.alloc with memory.alloc_inplace for element-wise-like ops.
+
+    For out-variant ops that are element-wise, the output can be allocated in
+    the same memory as the input when:
+      1. output_bytes <= input_bytes
+      2. The input tensor has no users other than this op (dead after consumption)
+
+    This pass replaces the memory.alloc node for the output with
+    memory.alloc_inplace(input_node, spec), which signals to the memory planner
+    to place the output at the same offset as the input.
+
+    Eligible ops are specified via the constructor's eligible_ops parameter.
+    """
+
+    def __init__(
+        self, eligible_ops: Optional[Set[Callable[..., Any]]] = None
+    ) -> None:
+        self._eligible_ops = eligible_ops or set()
+
+    def _is_eligible(self, target: Any) -> bool:
+        return target in self._eligible_ops
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        changed = False
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function":
+                continue
+            if not self._is_eligible(node.target):
+                continue
+            if not memory_planning._is_out_var_node(node):
+                continue
+
+            out_arg_names = get_out_args_from_opoverload(node.target)
+            if len(out_arg_names) != 1:
+                continue
+
+            out_alloc_node = node.kwargs.get(out_arg_names[0])
+            if out_alloc_node is None or out_alloc_node.target != memory.alloc:
+                continue
+
+            input_node = node.args[0]
+            if not isinstance(input_node, torch.fx.Node):
+                continue
+
+            # Compute sizes from FakeTensor metadata (specs may not be set yet
+            # on alloc nodes since _set_alloc_node_spec runs in MemoryPlanningPass).
+            out_val = out_alloc_node.meta.get("val")
+            in_val = input_node.meta.get("val")
+            if out_val is None or in_val is None:
+                continue
+            if not isinstance(out_val, torch.Tensor) or not isinstance(
+                in_val, torch.Tensor
+            ):
+                continue
+
+            out_nbytes = out_val.nelement() * out_val.element_size()
+            in_nbytes = in_val.nelement() * in_val.element_size()
+            if out_nbytes > in_nbytes:
+                continue
+
+            # Input must have no other users besides this node
+            input_users = [u for u in input_node.users if u != node]
+            if len(input_users) > 0:
+                continue
+
+            with graph_module.graph.inserting_before(out_alloc_node):
+                inplace_node = graph_module.graph.call_function(
+                    memory.alloc_inplace,
+                    (input_node, out_alloc_node.args[0]),
+                )
+                inplace_node.meta = out_alloc_node.meta.copy()
+
+            out_alloc_node.replace_all_uses_with(inplace_node)
+            graph_module.graph.erase_node(out_alloc_node)
+            changed = True
+
+        return PassResult(graph_module, changed)
+
+
 def to_scratch_op_pass(graph_module: torch.fx.GraphModule) -> PassResult:
     for node in graph_module.graph.nodes:
         if node.op != "call_function":
diff --git a/exir/passes/memory_planning_pass.py b/exir/passes/memory_planning_pass.py
@@ -14,7 +14,7 @@
 import torch
 from executorch.exir._warnings import deprecated
 from executorch.exir.error import internal_assert
-from executorch.exir.memory import alloc
+from executorch.exir.memory import alloc, alloc_inplace
 from executorch.exir.memory_planning import (
     _is_out_var_node,
     apply_algo,
@@ -192,6 +192,13 @@ def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None:
                     if len(out_arg_names) == 1:
                         out_alloc_node = node.kwargs[out_arg_names[0]]
                         out_alloc_node.meta["spec"] = node.meta["spec"]
+                        if (
+                            out_alloc_node.target == alloc_inplace
+                            and isinstance(out_alloc_node.args[0], Node)
+                        ):
+                            base_spec = out_alloc_node.args[0].meta.get("spec")
+                            if base_spec is not None:
+                                node.meta["spec"].inplace_base = base_spec
                         continue
                     specs = get_node_tensor_specs(node)
                     i = 0
@@ -206,7 +213,7 @@ def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None:
                             # dont increment i as we dont have a spec for this node
                         internal_assert(
                             out_alloc_node.op == "call_function"
-                            and out_alloc_node.target == alloc,
+                            and out_alloc_node.target in (alloc, alloc_inplace),
                             f"Out-var's node {out_alloc_node} has op {out_alloc_node.op} and target {out_alloc_node.target}",
                         )
                         internal_assert(
diff --git a/exir/program/_program.py b/exir/program/_program.py
@@ -821,18 +821,25 @@ def pre_memory_planning_passes(
         raise RuntimeError(
             f"sym_shape_eval_pass must be a dict or a PassBase, got {config.sym_shape_eval_pass}"
         )
+    inplace_pass = (
+        [config.inplace_elem_wise_like_ops_pass]
+        if config.inplace_elem_wise_like_ops_pass is not None
+        else []
+    )
     if config.remove_view_copy:
         return [
             NormalizeViewCopyBasePass(),
             dead_code_elimination_pass,
             ReplaceViewCopyWithViewPass(),
             sym_shape_eval_pass,
             config.to_out_var_pass,
+            *inplace_pass,
         ]
     else:
         return [
             sym_shape_eval_pass,
             config.to_out_var_pass,
+            *inplace_pass,
         ]
 
 
diff --git a/exir/tensor.py b/exir/tensor.py
@@ -216,6 +216,9 @@ def init_mem_planning_fields(self) -> None:
         self.mem_id = None
         self.mem_obj_id = None
         self.mem_offset = None
+        # Set during memory planning for alloc_inplace outputs: the base (input)
+        # TensorSpec whose memory this spec shares (output placed over the input).
+        self.inplace_base: Optional["TensorSpec"] = None
 
     @property
     def dtype(self) -> torch.dtype:

Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,7 @@ class ExecutorchBackendConfig:`
`61`	`61`	`# EdgeProgramManager or can be defined per program.`
`62`	`62`	`memory_planning_pass: Union[PassType, Dict[str, PassType]] = MemoryPlanningPass()`
`63`	`63`	`to_out_var_pass: PassType = ToOutVarPass(ignore_to_out_var_failure=False)`
	`64`	`+ inplace_elem_wise_like_ops_pass: Optional[PassType] = None`
`64`	`65`	`dynamic_memory_planning_mode: DynamicMemoryPlanningMode = (`
`65`	`66`	`DynamicMemoryPlanningMode.UPPER_BOUND`
`66`	`67`	`)`