Arm backend: add argmin support and int32 overflow guard to ConvertInt64OutputOpsToInt32Pass (pytorch#19918)

vacu9708 · web-flow · commit e56c7c33ef54 · 2026-06-03T08:45:23.000+02:00
## Summary

Follow-up to pytorch#13803. Two changes to `ConvertInt64OutputOpsToInt32Pass`.

## 1. argmin support

`ConvertInt64OutputOpsToInt32Pass` inserts an `int64 → int32` cast after `aten.argmax` nodes so that the index output (TOSA has no int64) becomes int32 and downstream consumers can be delegated. `aten.argmin` likewise returns int64 but was not handled — the original author explicitly deferred it in the pass docstring as a future extension:

> *"Future extensions may include operators that return int64 outputs by default (e.g., `argmin`) …"*

```mermaid
flowchart LR
    subgraph before["Before"]
        direction LR
        A1["argmin\nint64"]:::cpu --> B1["mul\nint64"]:::blocked --> C1["add\nint64"]:::blocked
    end
    subgraph after["After"]
        direction LR
        A2["argmin\nint64"]:::cpu --> T["to_int32"]:::cpu
        T --> B2["mul\nint32"]:::delegated --> C2["add\nint32"]:::delegated
    end
    before ~~~ after
    classDef cpu fill:#f5c542,stroke:#b8962e,color:#000
    classDef blocked fill:#e05c5c,stroke:#a33,color:#fff
    classDef delegated fill:#4caf7d,stroke:#2d7a54,color:#fff
```

**Changes:** Mirror the existing argmax registration to cover argmin. Rename the cast helper — it operates on the node's output dtype, not the op name, so the old name was misleading once argmin was added.

---

## 2. int32 overflow guard

The pass previously had an open TODO:

```python
# TODO: Add range check based on the input tensor shape before casting the output
```

`argmax`/`argmin` return an index in `[0, size)` where `size` is the number of elements searched. If `size > INT32_MAX`, casting to int32 silently truncates, producing a wrong index with no error.

**Changes:** Add a compile-time shape check (`shape[dim]` or `numel()` for the no-dim form) and an `on_overflow` constructor param (`"raise"` / `"warn"` / `"skip"`, default `"raise"`). A compile-time error is preferable to a silent wrong result at runtime.
---

## Tests

```bash
$ python -m pytest backends/arm/test/passes/test_convert_int64_output_ops_to_int32.py -v
9 passed  # 5 existing + 2 parametrized [argmax]/[argmin] delegation + 4 overflow (raise/warn/skip/invalid)

$ lintrunner backends/arm/_passes/convert_int64_output_ops_to_int32.py \
    backends/arm/test/passes/test_convert_int64_output_ops_to_int32.py
ok No lint issues.
```

The argmax and argmin delegation cases are unified into a single `@pytest.mark.parametrize` test.

Signed-off-by: Youngsik Yang <vacu9708@gmail.com>
diff --git a/backends/arm/_passes/convert_int64_output_ops_to_int32.py b/backends/arm/_passes/convert_int64_output_ops_to_int32.py
@@ -5,7 +5,7 @@
 
 
 import logging
-from typing import Set, Type
+from typing import cast, Literal, Set, Type
 
 import torch
 from executorch.backends.arm._passes import ArmPass
@@ -25,26 +25,54 @@ class ConvertInt64OutputOpsToInt32Pass(ArmPass):
     """Rewrites or removes operations that produce int64 outputs, converting
     them to int32 where possible.
 
-    Currently, this pass handles casting and argmax operators:
+    Currently, this pass handles casting, argmax and argmin operators:
       1. int32 -> int64:
          removes the cast and redirects all uses to the original int32 value.
       2. other types -> int64:
          rewrites the cast to produce int32 instead of int64.
-      3. torch.argmax()
-         insert an int64->int32 cast after the argmax node
+      3. torch.argmax() / torch.argmin()
+         insert an int64->int32 cast after the argmax/argmin node
 
-    Future extensions may include operators that return int64 outputs by default
-    (e.g., `argmin`), rewriting them or inserting an int64 -> int32 cast to yield
-    int32 results.
+    Future extensions may include other operators that return int64 outputs by
+    default, rewriting them or inserting an int64 -> int32 cast to yield int32
+    results.
 
-    Note: Overflow checks are applied selectively in this pass. For operators without
-    such checks, it is the user's responsibility to ensure that values fit within
-    the int32 range.
+    Args:
+        on_overflow: Action when an argmax/argmin index cannot safely fit in
+            int32 (i.e. the reduced dimension has more than INT32_MAX elements).
+            ``"raise"`` (default) raises a ``RuntimeError`` at compile time.
+            ``"warn"`` logs a warning and skips the cast for that node.
+            ``"skip"`` silently skips the cast for that node.
 
     """
 
     _passes_required_after: Set[Type[ExportPass]] = set()
 
+    _INT32_MAX = torch.iinfo(torch.int32).max
+
+    def __init__(
+        self,
+        *args,
+        on_overflow: Literal["raise", "warn", "skip"] = "raise",
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        if on_overflow not in ("raise", "warn", "skip"):
+            raise ValueError(
+                f"on_overflow must be 'raise', 'warn', or 'skip', got {on_overflow!r}"
+            )
+        self.on_overflow = on_overflow
+
+    def _is_int32_range_safe(self, node: torch.fx.Node) -> bool:
+        """Return True if the argmax/argmin index output fits in int32."""
+        input_tensor = get_first_fake_tensor(cast(torch.fx.Node, node.args[0]))
+        dim = node.args[1] if len(node.args) > 1 and node.args[1] is not None else None
+        if dim is None:
+            size = input_tensor.numel()
+        else:
+            size = input_tensor.shape[cast(int, dim)]
+        return size <= self._INT32_MAX
+
     aten_cast_ops = (
         torch.ops.aten.to.dtype,
         torch.ops.aten.to.dtype_layout,
@@ -54,8 +82,11 @@ class ConvertInt64OutputOpsToInt32Pass(ArmPass):
     aten_argmax_ops = (torch.ops.aten.argmax.default,)
     edge_argmax_ops = (exir_ops.edge.aten.argmax.default,)
 
-    aten_ops = aten_cast_ops + aten_argmax_ops
-    edge_ops = edge_cast_ops + edge_argmax_ops
+    aten_argmin_ops = (torch.ops.aten.argmin.default,)
+    edge_argmin_ops = (exir_ops.edge.aten.argmin.default,)
+
+    aten_ops = aten_cast_ops + aten_argmax_ops + aten_argmin_ops
+    edge_ops = edge_cast_ops + edge_argmax_ops + edge_argmin_ops
 
     # dtype is specified in args
     cast_ops_args = (
@@ -104,7 +135,7 @@ def _convert_casting_operators(self, node: torch.fx.Node):
                 f" {input_dtype}->torch.int32 defined in {node.meta.get('stack_trace','[no stack trace found]')}"
             )
 
-    def _convert_argmax_operators(self, node: torch.fx.Node, graph: torch.fx.Graph):
+    def _cast_int64_output_to_int32(self, node: torch.fx.Node, graph: torch.fx.Graph):
         output_tensor = node
         to_copy_op = self._get_decomposition(node.target)
         with graph.inserting_after(node):
@@ -138,9 +169,23 @@ def call(self, graph_module: torch.fx.GraphModule):
 
             if node.target in self.aten_cast_ops + self.edge_cast_ops:
                 self._convert_casting_operators(node)
-            elif node.target in self.aten_argmax_ops + self.edge_argmax_ops:
-                # TODO: Add range check based on the input tensor shape before casting the output
-                self._convert_argmax_operators(node, graph)
+            elif node.target in (
+                self.aten_argmax_ops
+                + self.edge_argmax_ops
+                + self.aten_argmin_ops
+                + self.edge_argmin_ops
+            ):
+                if not self._is_int32_range_safe(node):
+                    msg = (
+                        f"{node.target} reduces over more than {self._INT32_MAX} elements; "
+                        f"the int64 index cannot be safely cast to int32."
+                    )
+                    if self.on_overflow == "raise":
+                        raise RuntimeError(msg)
+                    if self.on_overflow == "warn":
+                        logger.warning(msg)
+                    continue
+                self._cast_int64_output_to_int32(node, graph)
             else:
                 raise RuntimeError(f"Unexpected target {node.target} in {node.name}")
 
diff --git a/backends/arm/test/passes/test_convert_int64_output_ops_to_int32.py b/backends/arm/test/passes/test_convert_int64_output_ops_to_int32.py
@@ -5,12 +5,14 @@
 
 from typing import Callable, Dict, Tuple
 
+import pytest
 import torch
 from executorch.backends.arm._passes import ConvertInt64OutputOpsToInt32Pass
 
 from executorch.backends.arm.test import common
 
 from executorch.backends.arm.test.tester.test_pipeline import TosaPipelineFP
+from torch.fx import Graph, GraphModule
 
 input_t1 = Tuple[torch.Tensor]  # Input x
 
@@ -86,44 +88,86 @@ def test_convert_int64_output_ops_to_int32_tosa_FP_remove_casting(
     pipeline.run()
 
 
-#####################################################
-## Test arange(dtype=int64) -> arange(dtype=int32) ##
-#####################################################
+##########################################################
+## Test argmax/argmin int64 output -> int32 cast       ##
+##########################################################
 
 
-class Int64OutputModel(torch.nn.Module):
+@pytest.mark.parametrize(
+    "arg_op, aten_op_str",
+    [
+        (torch.argmax, "torch.ops.aten.argmax.default"),
+        (torch.argmin, "torch.ops.aten.argmin.default"),
+    ],
+    ids=["argmax", "argmin"],
+)
+def test_convert_int64_output_ops_to_int32_tosa_FP_insert_cast(arg_op, aten_op_str):
+    class ArgOpModel(torch.nn.Module):
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            return (10 * arg_op(x, dim=-1) + 10) + 1.5
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # return torch.argmax(x)  # RuntimeError: Int did not match Long; But this is expected as we expect _argmax_i32 to generate int32 output
-        # return (10 * torch.argmax(x) + 10).to(dtype=torch.int32)  #  [1]. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). (function _resize_output_check)
-        return (10 * torch.argmax(x, dim=-1) + 10) + 1.5
-
-    def get_inputs(self) -> input_t1:
-        return (
-            torch.randint(
-                0,
-                10,
-                (2, 4, 6, 8),
-            ),
-        )
-
-
-def test_convert_int64_output_ops_to_int32_tosa_FP_insert_cast():
-    module = Int64OutputModel()
-    aten_ops_checks = [
-        "torch.ops.aten.argmax.default",
-        "torch.ops.aten.mul.Tensor",
-        "torch.ops.aten.add.Tensor",
-    ]
-    exir_ops_checks = [
-        "executorch_exir_dialects_edge__ops_aten_mul_Tensor",
-        "executorch_exir_dialects_edge__ops_aten_add_Tensor",
-    ]
     pipeline = TosaPipelineFP[input_t1](
-        module,
-        module.get_inputs(),
-        aten_op=aten_ops_checks,
-        exir_op=exir_ops_checks,
+        ArgOpModel(),
+        (torch.randint(0, 10, (2, 4, 6, 8)),),
+        aten_op=[aten_op_str, "torch.ops.aten.mul.Tensor", "torch.ops.aten.add.Tensor"],
+        exir_op=[
+            "executorch_exir_dialects_edge__ops_aten_mul_Tensor",
+            "executorch_exir_dialects_edge__ops_aten_add_Tensor",
+        ],
         transform_passes=[ConvertInt64OutputOpsToInt32Pass()],
     )
     pipeline.run()
+
+
+##############################################################
+## Test on_overflow range check for argmax/argmin           ##
+##############################################################
+
+_OVERFLOW_DIM = torch.iinfo(torch.int32).max + 1
+
+
+def _make_argmax_graph_large_dim() -> GraphModule:
+    """Construct a minimal graph with an argmax over a dimension > INT32_MAX.
+
+    Uses FakeTensorMode so no memory is allocated for the large dimension.
+
+    """
+    from torch._subclasses import FakeTensorMode
+
+    graph = Graph()
+    with FakeTensorMode():
+        fake_input = torch.empty(_OVERFLOW_DIM, dtype=torch.float32)
+        fake_output = torch.empty((), dtype=torch.int64)
+    x = graph.placeholder("x")
+    x.meta["val"] = fake_input
+    out = graph.call_function(torch.ops.aten.argmax.default, (x, 0))
+    out.meta["val"] = fake_output
+    graph.output(out)
+    return GraphModule(torch.nn.Module(), graph)
+
+
+def test_on_overflow_raise():
+    gm = _make_argmax_graph_large_dim()
+    with pytest.raises(RuntimeError, match="cannot be safely cast to int32"):
+        ConvertInt64OutputOpsToInt32Pass(on_overflow="raise").call(gm)
+
+
+def test_on_overflow_warn(caplog):
+    import logging
+
+    gm = _make_argmax_graph_large_dim()
+    with caplog.at_level(logging.WARNING):
+        result = ConvertInt64OutputOpsToInt32Pass(on_overflow="warn").call(gm)
+    assert not result.modified
+    assert "cannot be safely cast to int32" in caplog.text
+
+
+def test_on_overflow_skip():
+    gm = _make_argmax_graph_large_dim()
+    result = ConvertInt64OutputOpsToInt32Pass(on_overflow="skip").call(gm)
+    assert not result.modified
+
+
+def test_on_overflow_invalid():
+    with pytest.raises(ValueError, match="on_overflow must be"):
+        ConvertInt64OutputOpsToInt32Pass(on_overflow="blah")