Gate device copy insertion on device memory planning (pytorch#19961)

kirklandsign · web-flow · commit 7871a9b2a147 · 2026-06-03T04:52:23.000Z
Differential Revision: D107310726
Pull Request resolved: pytorch#19961
diff --git a/exir/passes/propagate_device_pass.py b/exir/passes/propagate_device_pass.py
@@ -165,10 +165,12 @@ def __init__(
         self,
         skip_h2d_for_method_inputs: bool = False,
         skip_d2h_for_method_outputs: bool = False,
+        enable_non_cpu_memory_planning: bool = False,
     ) -> None:
         super().__init__()
         self.skip_h2d_for_method_inputs = skip_h2d_for_method_inputs
         self.skip_d2h_for_method_outputs = skip_d2h_for_method_outputs
+        self.enable_non_cpu_memory_planning = enable_non_cpu_memory_planning
 
     def _is_placeholder(self, node: torch.fx.Node) -> bool:
         """Check if a node is a graph-level input (placeholder)."""
@@ -282,7 +284,7 @@ def _insert_d2h_for_getitem(
             )
         return True
 
-    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:  # noqa: C901
         # Two-pass approach:
         #   Pass 1 – For each delegate with a target_device CompileSpec, insert
         #            H2D copy nodes before delegate inputs and tag the delegate
@@ -313,9 +315,18 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
                 target_device_type, device_index = result
                 device_delegates.add(node)
 
-                changed |= self._insert_h2d_copies(
-                    graph_module, node, target_device_type, device_index
-                )
+                if self.enable_non_cpu_memory_planning:
+                    changed |= self._insert_h2d_copies(
+                        graph_module, node, target_device_type, device_index
+                    )
+                else:
+                    for arg in node.args[1:]:
+                        if isinstance(arg, torch.fx.Node):
+                            changed |= _tag_specs_with_device(
+                                arg.meta.get("spec"),
+                                target_device_type,
+                                device_index,
+                            )
 
                 changed |= _tag_specs_with_device(
                     node.meta.get("spec"),
@@ -337,7 +348,26 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
             if node.op == "call_function" and node.target == operator.getitem:
                 source = node.args[0]
                 if isinstance(source, torch.fx.Node) and source in device_delegates:
-                    changed |= self._insert_d2h_for_getitem(graph_module, node)
+                    if self.enable_non_cpu_memory_planning:
+                        changed |= self._insert_d2h_for_getitem(graph_module, node)
+                    else:
+                        spec = node.meta.get("spec")
+                        source_specs = source.meta.get("spec")
+                        idx = node.args[1]
+                        if (
+                            isinstance(spec, TensorSpec)
+                            and isinstance(source_specs, (tuple, list))
+                            and isinstance(idx, int)
+                            and idx < len(source_specs)
+                        ):
+                            source_spec = source_specs[idx]
+                            if isinstance(source_spec, TensorSpec):
+                                _set_device_on_spec(
+                                    spec,
+                                    source_spec.device,
+                                    source_spec.device_index,
+                                )
+                                changed = True
 
         graph_module.recompile()
         return PassResult(graph_module, changed)
diff --git a/exir/program/_program.py b/exir/program/_program.py
@@ -767,6 +767,7 @@ def edge_to_executorch_passes(
         PropagateDevicePass(
             skip_h2d_for_method_inputs=config.skip_h2d_for_method_inputs,
             skip_d2h_for_method_outputs=config.skip_d2h_for_method_outputs,
+            enable_non_cpu_memory_planning=config.enable_non_cpu_memory_planning,
         ),
         EdgeToBackendOpsPass(),
         RemoveGraphAssertsPass(),
diff --git a/exir/tests/test_propagate_device_pass.py b/exir/tests/test_propagate_device_pass.py
@@ -121,6 +121,7 @@ def _lower_model_to_executorch(
     """Lower model all the way through to_executorch for E2E tests."""
     if et_config is None:
         et_config = ExecutorchBackendConfig(emit_stacktrace=False)
+
     ep = export(model, inputs)
     ep_copied = deepcopy(ep)
 
@@ -314,7 +315,10 @@ def forward(self, a, b):
         inputs = (torch.randn(2, 2), torch.randn(2, 2))
 
         for pipeline, gm in _lower_model_to_executorch(
-            model, inputs, DeviceAwarePartitioner("cuda:0")
+            model,
+            inputs,
+            DeviceAwarePartitioner("cuda:0"),
+            ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
         ):
             with self.subTest(pipeline=pipeline):
                 nodes = _collect_device_copy_nodes(gm)
@@ -371,7 +375,10 @@ def forward(self, a, b):
         inputs = (torch.randn(2, 2), torch.randn(2, 2))
 
         for pipeline, gm in _lower_model_to_executorch(
-            model, inputs, DeviceAwarePartitioner("cuda:0")
+            model,
+            inputs,
+            DeviceAwarePartitioner("cuda:0"),
+            ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
         ):
             with self.subTest(pipeline=pipeline):
                 nodes = _collect_device_copy_nodes(gm)
@@ -445,6 +452,24 @@ def forward(self, a, b):
                     f"[{pipeline}] Unexpected D2H copy nodes when no target_device is set",
                 )
 
+    def test_copy_nodes_require_non_cpu_memory_planning(self):
+        """Lowering without non-CPU memory planning emits no H2D/D2H copy nodes."""
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        for pipeline, gm in _lower_model_to_executorch(
+            model, inputs, DeviceAwarePartitioner("cuda:0")
+        ):
+            with self.subTest(pipeline=pipeline):
+                device_copy_nodes = _collect_device_copy_nodes(gm)
+                self.assertEqual(len(device_copy_nodes.h2d_nodes), 0)
+                self.assertEqual(len(device_copy_nodes.d2h_nodes), 0)
+
         # ---- Integration tests: device consistency after to_executorch ----
 
     def test_device_consistency_cuda_1(self):
@@ -523,7 +548,10 @@ def forward(self, a, b):
         inputs = (torch.randn(2, 2), torch.randn(2, 2))
 
         for pipeline, gm in _lower_model_to_executorch(
-            model, inputs, DeviceAwarePartitioner("cuda:0")
+            model,
+            inputs,
+            DeviceAwarePartitioner("cuda:0"),
+            ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
         ):
             with self.subTest(pipeline=pipeline):
                 for node in gm.graph.nodes: