
Commit 82af6a5

Arm backend: Data layout cast bugfixes

Signed-off-by: Adrian Lundell <adrian.lundell@arm.com>
Change-Id: Iff630a5bd12d05119f14be37f819bdb28586a6d2

1 parent b48d5d2

2 files changed: 62 additions & 10 deletions

backends/arm/_passes/insert_data_layout_casts_pass.py

Lines changed: 29 additions & 8 deletions
@@ -9,7 +9,7 @@
 from executorch.backends.arm._passes import ArmPass
 from executorch.backends.arm.tosa.specification import get_context_spec
 from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.pass_base import ExportPass
+from executorch.exir.pass_base import ExportPass, NodeMetadata


 class InsertDataLayoutCastsPass(ArmPass):
@@ -55,6 +55,7 @@ class InsertDataLayoutCastsPass(ArmPass):
     }

     _int_to_fp_map = {
+        torch.int8: torch.float16,  # This doubles the size after casting, but is very unlikely to occur in practice since int8 is only ever used by LOGICAL_SHIFT and CAST/RESCALE ops in PRO-FP.
         torch.int16: torch.float16,
         torch.int32: torch.float32,
     }
@@ -63,9 +64,15 @@ def call_operator(self, op, args, kwargs, meta):
         if op not in self.targeted_ops:
             return super().call_operator(op, args, kwargs, meta)

-        dtype = args[0].data.dtype
-        spec = get_context_spec()
+        if op in self._concat_ops:
+            # Cast to largest dtype
+            dtypes = [arg.data.dtype for arg in args[0]]
+            dtype_sizes = [dtype.itemsize for dtype in dtypes]
+            dtype = dtypes[dtype_sizes.index(max(dtype_sizes))]
+        else:
+            dtype = args[0].data.dtype

+        spec = get_context_spec()
         dtype_is_integer = not dtype.is_floating_point and dtype != torch.bool
         if dtype_is_integer and not spec.support_integer():
             supported_dtype = self._int_to_fp_map.get(dtype, None)
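
The dtype-selection change above is the core of the bugfix: for concat-style ops the pass previously read only the first input's dtype, whereas it now picks the widest dtype across all inputs. A minimal standalone sketch of that selection logic (the helper name is hypothetical, not part of the patch):

```python
import torch

def widest_dtype(dtypes: list[torch.dtype]) -> torch.dtype:
    # Mirrors the selection in the pass: pick the dtype with the largest
    # element size; ties resolve to the first occurrence, as index() does.
    sizes = [d.itemsize for d in dtypes]
    return dtypes[sizes.index(max(sizes))]

assert widest_dtype([torch.int16, torch.int32]) == torch.int32
assert widest_dtype([torch.int32, torch.int16]) == torch.int32
```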
@@ -93,16 +100,30 @@ def call_operator(self, op, args, kwargs, meta):
             for arg in args[0]:
                 x_casted.append(
                     super().call_operator(
-                        self._cast_op, (arg,), {"dtype": supported_dtype}, meta
+                        self._cast_op,
+                        (arg,),
+                        {"dtype": supported_dtype},
+                        NodeMetadata(arg.node.meta),
+                        updated=True,
                     )
                 )
-            y_casted = super().call_operator(op, (x_casted,), kwargs, meta)
+            y_casted = super().call_operator(
+                op, (x_casted, *args[1:]), kwargs, meta, updated=True
+            )

         else:
             x_casted = super().call_operator(
-                self._cast_op, (args[0],), {"dtype": supported_dtype}, meta
+                self._cast_op,
+                (args[0],),
+                {"dtype": supported_dtype},
+                NodeMetadata(args[0].node.meta),
+                updated=True,
+            )
+            y_casted = super().call_operator(
+                op, (x_casted, *args[1:]), kwargs, meta, updated=True
             )
-            y_casted = super().call_operator(op, (x_casted, *args[1:]), kwargs, meta)

-        y = super().call_operator(self._cast_op, (y_casted,), {"dtype": dtype}, meta)
+        y = super().call_operator(
+            self._cast_op, (y_casted,), {"dtype": dtype}, meta, updated=True
+        )
         return y
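
Besides threading NodeMetadata and updated=True through each newly created node, the hunk above fixes the recreated concat call: the old code rebuilt the op as op(x_casted), dropping trailing positional arguments such as cat's dim, while the new code forwards *args[1:]. A quick eager-mode illustration (plain torch, independent of the pass) of why dropping dim changes the result:

```python
import torch

x = torch.zeros(1, 4)
y = torch.zeros(1, 4)

# With dim=1 the tensors are joined along columns; if dim is dropped,
# cat defaults to dim=0 and the output shape changes.
assert torch.cat([x, y], dim=1).shape == (1, 8)
assert torch.cat([x, y]).shape == (2, 4)
```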

backends/arm/test/passes/test_insert_data_layout_casts_pass.py

Lines changed: 33 additions & 2 deletions
@@ -11,7 +11,7 @@


 def _collect_cast_dtypes(
-    pipeline: PassPipeline[tuple[torch.Tensor]],
+    pipeline: PassPipeline[tuple[torch.Tensor, ...]],
 ) -> list[torch.dtype]:
     exported_program = pipeline.tester.get_artifact(
         StageType.RUN_PASSES
@@ -34,10 +34,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x.view(2, 2)


+class CatModule(torch.nn.Module):
+    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        return torch.cat([x, y], dim=1)
+
+
 def test_insert_data_layout_casts_no_target_view_fp_profile_inserts_casts() -> None:
     test_data = (torch.arange(4, dtype=torch.int32).reshape(1, 4),)

-    pipeline = PassPipeline[tuple[torch.Tensor]](
+    pipeline = PassPipeline[tuple[torch.Tensor, ...]](
         ViewModule(),
         test_data,
         quantize=False,
@@ -78,3 +83,29 @@ def test_insert_data_layout_casts_no_target_view_fp_profile_skips_supported_dtyp
         pass_list=[InsertDataLayoutCastsPass],
     )
     pipeline.run()
+
+
+def test_insert_data_layout_casts_no_target_cat_fp_profile_inserts_casts() -> None:
+    test_data = (
+        torch.arange(4, dtype=torch.int32).reshape(1, 4),
+        torch.arange(4, dtype=torch.int32).reshape(1, 4),
+    )
+
+    pipeline = PassPipeline[tuple[torch.Tensor, ...]](
+        CatModule(),
+        test_data,
+        quantize=False,
+        ops_before_pass={
+            "executorch_exir_dialects_edge__ops_aten_cat_default": 1,
+        },
+        ops_after_pass={
+            "executorch_exir_dialects_edge__ops_aten_cat_default": 1,
+            "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 3,
+        },
+        pass_list=[InsertDataLayoutCastsPass],
+    )
+    pipeline.run()
+
+    cast_dtypes = _collect_cast_dtypes(pipeline)
+    assert cast_dtypes.count(torch.float32) == 2
+    assert cast_dtypes.count(torch.int32) == 1
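
The asserted counts follow from the cast sandwich the pass builds: each int32 input of cat is cast to float32 (two float32 casts), cat runs in float32, and one cast restores the original int32 output dtype, giving the three to_dim_order_copy ops expected above. A hedged eager-mode sketch of the resulting semantics (plain torch, not the pass itself):

```python
import torch

x = torch.arange(4, dtype=torch.int32).reshape(1, 4)
y = torch.arange(4, dtype=torch.int32).reshape(1, 4)

# Two input casts up to float32, cat in float32, one cast back to int32.
out = torch.cat([x.to(torch.float32), y.to(torch.float32)], dim=1).to(torch.int32)
assert torch.equal(out, torch.cat([x, y], dim=1))
```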
