Add ExecutorchBackendConfig flags for skipping H2D/D2H copies

Gasoonjia · web-flow · commit 6043775338cb · 2026-06-02T09:40:38.000Z
Differential Revision: D99636778
Pull Request resolved: pytorch#19929
diff --git a/exir/capture/_config.py b/exir/capture/_config.py
@@ -123,3 +123,15 @@ class ExecutorchBackendConfig:
     # vs. accelerator memory.  Default False preserves the legacy behavior
     # where all tensors are planned into CPU memory regardless of device.
     enable_non_cpu_memory_planning: bool = False
+
+    # When True, method-level input tensors that feed directly into a device
+    # delegate are NOT wrapped with _h2d_copy; the user must provide tensors
+    # already on the target device (useful when inputs are pre-staged on GPU).
+    # Each such input needs exactly one user or PropagateDevicePass raises.
+    skip_h2d_for_method_inputs: bool = False
+
+    # When True, device delegate outputs that are directly method outputs
+    # are NOT wrapped with _d2h_copy. The method outputs stay on device.
+    # Useful for cross-method GPU pipelines where the next method consumes
+    # GPU tensors directly.
+    skip_d2h_for_method_outputs: bool = False
diff --git a/exir/passes/propagate_device_pass.py b/exir/passes/propagate_device_pass.py
@@ -163,8 +163,12 @@ class PropagateDevicePass(PassBase):
 
     def __init__(
         self,
+        skip_h2d_for_method_inputs: bool = False,
+        skip_d2h_for_method_outputs: bool = False,
     ) -> None:
         super().__init__()
+        self.skip_h2d_for_method_inputs = skip_h2d_for_method_inputs
+        self.skip_d2h_for_method_outputs = skip_d2h_for_method_outputs
 
     def _is_placeholder(self, node: torch.fx.Node) -> bool:
         """Check if a node is a graph-level input (placeholder)."""
@@ -191,6 +195,23 @@ def _insert_h2d_copies(
             if not isinstance(arg_spec, TensorSpec):
                 continue
 
+            if self.skip_h2d_for_method_inputs and self._is_placeholder(arg):
+                # TODO(gasoonjia): support skip_h2d_for_method_inputs for
+                # multiple-user placeholder inputs.
+                if len(arg.users) != 1:
+                    raise RuntimeError(
+                        f"skip_h2d_for_method_inputs=True requires placeholder "
+                        f"'{arg.name}' to have exactly one user, but it has "
+                        f"{len(arg.users)} users. The placeholder is shared by "
+                        f"multiple consumers, so its TensorSpec cannot be safely "
+                        f"mutated in-place to the delegate's device. Either disable "
+                        f"skip_h2d_for_method_inputs, or ensure the placeholder is "
+                        f"used exclusively by this delegate."
+                    )
+                _set_device_on_spec(arg_spec, target_device_type, device_index)
+                changed = True
+                continue
+
             with graph_module.graph.inserting_before(node):
                 h2d_node = graph_module.graph.call_function(
                     torch.ops.et_copy._h2d_copy.default,
@@ -241,6 +262,9 @@ def _insert_d2h_for_getitem(
 
         _set_device_on_spec(spec, source_spec.device, source_spec.device_index)
 
+        if self.skip_d2h_for_method_outputs and self._feeds_directly_to_output(node):
+            return True
+
         with graph_module.graph.inserting_after(node):
             d2h_node = graph_module.graph.call_function(
                 torch.ops.et_copy._d2h_copy.default,
diff --git a/exir/program/_program.py b/exir/program/_program.py
@@ -764,7 +764,10 @@ def edge_to_executorch_passes(
         # there exists an unbacked symint operation.
         *config.passes,
         SpecPropPass(),
-        PropagateDevicePass(),
+        PropagateDevicePass(
+            skip_h2d_for_method_inputs=config.skip_h2d_for_method_inputs,
+            skip_d2h_for_method_outputs=config.skip_d2h_for_method_outputs,
+        ),
         EdgeToBackendOpsPass(),
         RemoveGraphAssertsPass(),
     ] + pre_memory_planning_passes(config, name)
diff --git a/exir/tests/test_propagate_device_pass.py b/exir/tests/test_propagate_device_pass.py