Skip to content

Commit fcccda3

Browse files
authored
Arm backend: Add optional ToDevicePass (#18230)
A pass to move a graph_module to a device correctly. This is needed on models containing ops with "device" kwargs, which are not moved when `model.to(device=...)` is called. Signed-off-by: Erik Lundell <erik.lundell@arm.com>
1 parent c11ba1b commit fcccda3

File tree

6 files changed

+305
-1
lines changed

6 files changed

+305
-1
lines changed

backends/arm/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,17 @@ List of model specific and optional passes:
308308
- backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
309309
- backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
310310

311+
- ToDevicePass
312+
- This is a utility for moving an already-quantized or already-decomposed GraphModule to another device.
313+
- It is intended to be used immediately before rerunning / retracing / `torch.export.export(...)`.
314+
- Functionalities:
315+
- Calls `.to(device)` on the GraphModule and rewrites explicit `device=` kwargs on `call_function` nodes to a user-specified device.
316+
- Useful when manually moving an already-quantized or already-decomposed graph module to another device for validation, since some constant-producing nodes may still carry an export-time device kwarg.
317+
- Example usage:
318+
- `from executorch.exir.passes import ToDevicePass`
319+
- `graph_module = ToDevicePass("cpu")(graph_module).graph_module`
320+
- backends/arm/test/misc/test_post_quant_device_switch.py
321+
311322
## Help & Improvements
312323

313324
If you have problems or questions, or have suggestions for ways to improve the Arm backend, please reach out
Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
# Copyright 2026 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
import copy
7+
from dataclasses import dataclass
8+
from typing import Callable
9+
10+
import pytest
11+
import torch
12+
import torch.nn.functional as F
13+
from executorch.backends.arm.quantizer import (
14+
get_symmetric_quantization_config,
15+
TOSAQuantizer,
16+
)
17+
from executorch.backends.arm.tosa import TosaSpecification
18+
from executorch.exir.passes import ToDevicePass
19+
from torch._subclasses.fake_tensor import FakeTensor
20+
from torchao.quantization.pt2e import move_exported_model_to_eval
21+
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_qat_pt2e
22+
23+
24+
class AddAlpha(torch.nn.Module):
    """Module exercising `torch.add` with a non-default `alpha` kwarg."""

    def forward(self, x, y):
        return torch.add(x, y, alpha=2.0)
27+
28+
29+
class SubAlpha(torch.nn.Module):
    """Module exercising `torch.sub` with a non-default `alpha` kwarg."""

    def forward(self, x, y):
        return torch.sub(x, y, alpha=2.0)
32+
33+
34+
class SliceScatter(torch.nn.Module):
    """Module exercising `torch.slice_scatter` with explicit dim/start/end/step kwargs."""

    def forward(self, x, src):
        return torch.slice_scatter(x, src, dim=1, start=0, end=4, step=2)
37+
38+
39+
class MeanDim(torch.nn.Module):
    """Module exercising the dim-overload of `torch.mean` with `keepdim=True`."""

    def forward(self, x):
        return torch.mean(x, dim=(1,), keepdim=True)
42+
43+
44+
class MeanDefault(torch.nn.Module):
    """Module exercising the default (full-reduction) overload of `torch.mean`."""

    def forward(self, x):
        return torch.mean(x)
47+
48+
49+
class VarCorrection(torch.nn.Module):
    """Module exercising `torch.var` with explicit correction and keepdim kwargs."""

    def forward(self, x):
        return torch.var(x, dim=(2, 3), correction=1, keepdim=True)
52+
53+
54+
class VarDim(torch.nn.Module):
    """Module calling the `aten.var.dim` overload directly (dims, unbiased, keepdim)."""

    def forward(self, x):
        return torch.ops.aten.var.dim(x, [2, 3], 1, True)
57+
58+
59+
class DivTensorMode(torch.nn.Module):
    """Module exercising `torch.div` with `rounding_mode="trunc"`."""

    def forward(self, x, y):
        return torch.div(x, y, rounding_mode="trunc")
62+
63+
64+
class LeakyRelu(torch.nn.Module):
    """Module exercising `F.leaky_relu` with a non-default negative slope."""

    def forward(self, x):
        return F.leaky_relu(x, negative_slope=0.2)
67+
68+
69+
class AvgPool2d(torch.nn.Module):
    """Module exercising `F.avg_pool2d` with explicit kernel/stride/padding kwargs."""

    def forward(self, x):
        return F.avg_pool2d(x, kernel_size=2, stride=1, padding=1)
72+
73+
74+
class LayerNorm(torch.nn.Module):
    """Module wrapping `nn.LayerNorm(4)` without learnable affine parameters."""

    def __init__(self):
        super().__init__()
        self.layer_norm = torch.nn.LayerNorm(4, elementwise_affine=False)

    def forward(self, x):
        return self.layer_norm(x)
81+
82+
83+
class GroupNorm(torch.nn.Module):
    """Module wrapping `nn.GroupNorm(2, 4)` without learnable affine parameters."""

    def __init__(self):
        super().__init__()
        self.group_norm = torch.nn.GroupNorm(2, 4, affine=False)

    def forward(self, x):
        return self.group_norm(x)
90+
91+
92+
@dataclass(frozen=True)
class MetaRetraceCase:
    """One parametrized test case: a module factory, its inputs, and an aten op label."""

    # Human-readable pytest id for the case.
    name: str
    # Zero-arg factory producing a fresh module instance for the case.
    module_factory: Callable[[], torch.nn.Module]
    # Zero-arg factory producing example inputs for export and calibration.
    inputs_factory: Callable[[], tuple[torch.Tensor, ...]]
    # Aten op name associated with the case (not asserted by the test in this view).
    aten_op: str
98+
99+
100+
# Parametrization table: each entry pairs a module with example inputs and the
# aten op it exercises, covering ops that carry kwargs through export.
_TEST_CASES = [
    MetaRetraceCase(
        "add_alpha",
        AddAlpha,
        lambda: (torch.randn(2, 3), torch.randn(2, 3)),
        "aten.add.Tensor",
    ),
    MetaRetraceCase(
        "sub_alpha",
        SubAlpha,
        lambda: (torch.randn(2, 3), torch.randn(2, 3)),
        "aten.sub.Tensor",
    ),
    MetaRetraceCase(
        "slice_scatter",
        SliceScatter,
        lambda: (torch.randn(2, 4), torch.randn(2, 2)),
        "aten.slice_scatter.default",
    ),
    MetaRetraceCase(
        "mean_dim",
        MeanDim,
        lambda: (torch.randn(2, 3, 4),),
        "aten.mean.dim",
    ),
    MetaRetraceCase(
        "mean_default",
        MeanDefault,
        lambda: (torch.randn(2, 3, 4),),
        "aten.mean.default",
    ),
    MetaRetraceCase(
        "var_correction",
        VarCorrection,
        lambda: (torch.randn(2, 3, 4, 4),),
        "aten.var.correction",
    ),
    MetaRetraceCase(
        "var_dim",
        VarDim,
        lambda: (torch.randn(2, 3, 4, 4),),
        "aten.var.dim",
    ),
    MetaRetraceCase(
        "div_tensor_mode",
        DivTensorMode,
        lambda: (torch.randn(2, 3), torch.randn(2, 3) + 1.0),
        "aten.div.Tensor_mode",
    ),
    MetaRetraceCase(
        "leaky_relu",
        LeakyRelu,
        lambda: (torch.randn(2, 3),),
        "aten.leaky_relu.default",
    ),
    MetaRetraceCase(
        "avg_pool2d",
        AvgPool2d,
        lambda: (torch.randn(1, 3, 4, 4),),
        "aten.avg_pool2d.default",
    ),
    MetaRetraceCase(
        "layer_norm",
        LayerNorm,
        lambda: (torch.randn(2, 3, 4),),
        "aten.layer_norm.default",
    ),
    MetaRetraceCase(
        "group_norm",
        GroupNorm,
        lambda: (torch.randn(2, 4, 3, 3),),
        "aten.group_norm.default",
    ),
]
174+
175+
176+
def _make_quantizer() -> TOSAQuantizer:
    """Build a TOSA INT quantizer with a global symmetric per-tensor config."""
    tosa_spec = TosaSpecification.create_from_string("TOSA-1.0+INT")
    quantizer = TOSAQuantizer(tosa_spec)
    symmetric_config = get_symmetric_quantization_config(is_per_channel=False)
    quantizer.set_global(symmetric_config)
    return quantizer
180+
181+
182+
def _iter_fake_tensors(meta_val):
183+
if isinstance(meta_val, FakeTensor):
184+
yield meta_val
185+
return
186+
187+
if isinstance(meta_val, (list, tuple)):
188+
for item in meta_val:
189+
yield from _iter_fake_tensors(item)
190+
191+
192+
def _to_meta_inputs(
193+
example_inputs: tuple[torch.Tensor, ...],
194+
) -> tuple[torch.Tensor, ...]:
195+
return tuple(inp.to(device="meta") for inp in example_inputs)
196+
197+
198+
@pytest.mark.parametrize("case", _TEST_CASES, ids=[case.name for case in _TEST_CASES])
def test_post_quant_device_switch_no_target(case: MetaRetraceCase) -> None:
    """Test that moving a model to another device after quantization works.

    Quantizes the module via PT2E QAT flow, moves it to the meta device with
    ToDevicePass, runs it, retraces it, and checks that every FakeTensor in
    the retraced graph's node metadata lives on the meta device.
    """
    module = case.module_factory().train()
    example_inputs = case.inputs_factory()

    # Quantize module
    exported = torch.export.export(module, example_inputs, strict=True)
    prepared = prepare_qat_pt2e(copy.deepcopy(exported.graph_module), _make_quantizer())
    # One calibration forward pass before converting.
    prepared(*example_inputs)
    prepared = move_exported_model_to_eval(prepared)
    quantized_module = convert_pt2e(prepared)

    # Move and test running the model with other device.
    meta_inputs = _to_meta_inputs(example_inputs)
    meta_module = ToDevicePass("meta")(quantized_module).graph_module
    meta_module(*meta_inputs)

    # Retrace module using meta device to check all fake tensors are moved.
    meta_module = torch.export.export(meta_module, meta_inputs, strict=True)

    # Validate transformation: gather (device, node) for every FakeTensor
    # recorded in the traced nodes' "val" metadata.
    fake_tensor_devices = [
        (str(fake_tensor.device), str(node))
        for node in meta_module.graph.nodes
        for fake_tensor in _iter_fake_tensors(node.meta.get("val"))
    ]

    assert fake_tensor_devices, "Expected traced graph to contain FakeTensor metadata"
    assert all(device == "meta" for device, _ in fake_tensor_devices), (
        "Expected all traced FakeTensors to use the meta device, got "
        f"{fake_tensor_devices}"
    )

backends/arm/test/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ def define_arm_tests():
4848
"misc/test_bn_relu_folding_qat.py",
4949
"misc/test_custom_partition.py",
5050
"misc/test_debug_hook.py",
51+
"misc/test_post_quant_device_switch.py",
5152
# "misc/test_dim_order.py", (TODO - T238390249)
5253
]
5354

exir/passes/BUCK

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ fbcode_target(_kind = runtime.python_library,
3232
":spec_prop_pass",
3333
":sym_shape_eval_pass",
3434
":sym_to_tensor_pass",
35+
":to_device_pass",
3536
":weights_to_outputs_pass",
3637
":reinplace_pass",
3738
"//caffe2:torch",
@@ -92,6 +93,17 @@ fbcode_target(_kind = runtime.python_library,
9293
],
9394
)
9495

96+
# Library for the standalone ToDevicePass (moves a GraphModule and its
# explicit device kwargs to a target device).
fbcode_target(_kind = runtime.python_library,
    name = "to_device_pass",
    srcs = [
        "to_device_pass.py",
    ],
    deps = [
        "//caffe2:torch",
        "//executorch/exir:pass_base",
    ],
)
106+
95107
fbcode_target(_kind = runtime.python_library,
96108
name = "weights_to_outputs_pass",
97109
srcs = [

exir/passes/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Copyright (c) Meta Platforms, Inc. and affiliates.
22
# All rights reserved.
3+
# Copyright 2026 Arm Limited and/or its affiliates.
34
#
45
# This source code is licensed under the BSD-style license found in the
56
# LICENSE file in the root directory of this source tree.
@@ -30,7 +31,6 @@
3031
to_out_variant,
3132
to_scratch_op,
3233
)
33-
3434
from executorch.exir.pass_base import ExportPass
3535
from executorch.exir.pass_manager import PassManager, PassType
3636
from executorch.exir.passes.const_prop_pass import ConstPropPass
@@ -59,6 +59,8 @@
5959
from executorch.exir.passes.spec_prop_pass import SpecPropPass
6060
from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass
6161
from executorch.exir.passes.sym_to_tensor_pass import SymToTensorPass
62+
63+
from executorch.exir.passes.to_device_pass import ToDevicePass
6264
from executorch.exir.passes.weights_to_outputs_pass import weights_to_outputs_pass
6365
from torch import fx
6466
from torch._subclasses import FakeTensor
@@ -71,6 +73,7 @@
7173
"ConstPropPass",
7274
"QuantFusionPass",
7375
"OpReplacePass",
76+
"ToDevicePass",
7477
"EdgeToBackendOpsPass",
7578
"MemoryFormatOpsPass",
7679
"MemoryPlanningPass",

exir/passes/to_device_pass.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Copyright 2026 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
from typing import Set, Type
7+
8+
import torch
9+
from executorch.exir.pass_base import ExportPass, PassResult
10+
11+
12+
class ToDevicePass(ExportPass):
    """Move a GraphModule to a given device.

    Calls `.to(device)` on the module and, additionally, rewrites any
    explicit `device=` kwarg on `call_function` nodes so it points at the
    requested device.
    """

    _passes_required_after: Set[Type[ExportPass]] = set()

    def __init__(self, device: str | torch.device, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Normalize to torch.device so string inputs compare consistently.
        self.device = torch.device(device)

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        # Move parameters/buffers first; explicit node kwargs are handled below.
        graph_module = graph_module.to(self.device)

        rewrote_any_kwarg = False
        for node in graph_module.graph.nodes:
            if node.op != "call_function":
                continue
            if "device" not in node.kwargs:
                continue
            if node.kwargs["device"] == self.device:
                continue
            node.update_kwarg("device", self.device)
            rewrote_any_kwarg = True

        # Recompile only when the graph itself changed; `.to(...)` alone does
        # not require regenerating the forward code.
        if rewrote_any_kwarg:
            graph_module.recompile()

        # The module was moved regardless of kwarg rewrites, so report modified.
        return PassResult(graph_module, True)

    def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult:
        """Reimplement __call__ to avoid the Optional[PassResult] type hint."""
        return self.call(graph_module)

0 commit comments

Comments
 (0)