psiddh
diff --git a/‎backends/arm/TARGETS‎
Lines changed: 12 additions & 0 deletions b/‎backends/arm/TARGETS‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎backends/arm/_passes/convert_int64_output_ops_to_int32.py‎
Lines changed: 61 additions & 16 deletions b/‎backends/arm/_passes/convert_int64_output_ops_to_int32.py‎
Lines changed: 61 additions & 16 deletions
diff --git a/‎backends/arm/_passes/decompose_grouped_conv_pass.py‎
Lines changed: 4 additions & 2 deletions b/‎backends/arm/_passes/decompose_grouped_conv_pass.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎backends/arm/_passes/rewrite_conv_pass.py‎
Lines changed: 5 additions & 5 deletions b/‎backends/arm/_passes/rewrite_conv_pass.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎backends/arm/ethosu/partitioner.py‎
Lines changed: 2 additions & 2 deletions b/‎backends/arm/ethosu/partitioner.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/arm/scripts/generate_grid_sampler_spirv.py‎
Lines changed: 75 additions & 0 deletions b/‎backends/arm/scripts/generate_grid_sampler_spirv.py‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎backends/arm/test/misc/test_custom_shader_payload.py‎
Lines changed: 79 additions & 0 deletions b/‎backends/arm/test/misc/test_custom_shader_payload.py‎
Lines changed: 79 additions & 0 deletions
@@ -87,15 +87,27 @@ runtime.python_library(
     name = "vgf",
     srcs = [
         "vgf/__init__.py",
+        "vgf/_passes/__init__.py",
+        "vgf/_passes/rewrite_grid_sampler_to_tosa_custom.py",
         "vgf/backend.py",
         "vgf/compile_spec.py",
         "vgf/model_converter.py",
         "vgf/partitioner.py",
+        "vgf/shaders/__init__.py",
+        "vgf/shaders/grid_sampler.py",
+    ],
+    resources = [
+        "vgf/shaders/grid_sampler.glsl",
+        "vgf/shaders/grid_sampler.spirv.b64",
     ],
     deps = [
         ":arm_compile_spec",
+        "//caffe2:torch",
+        "//executorch/backends/arm/_passes:passes",
+        "//executorch/backends/arm/tosa/dialect:lib",
         "//executorch/backends/arm/tosa:specification",
         "//executorch/backends/arm/tosa:partitioner",
+        "//executorch/exir:lib",
     ],
 )
 
 
@@ -5,7 +5,7 @@
 
 
 import logging
-from typing import Set, Type
+from typing import cast, Literal, Set, Type
 
 import torch
 from executorch.backends.arm._passes import ArmPass
@@ -25,26 +25,54 @@ class ConvertInt64OutputOpsToInt32Pass(ArmPass):
     """Rewrites or removes operations that produce int64 outputs, converting
     them to int32 where possible.
 
-    Currently, this pass handles casting and argmax operators:
+    Currently, this pass handles casting, argmax and argmin operators:
       1. int32 -> int64:
          removes the cast and redirects all uses to the original int32 value.
       2. other types -> int64:
          rewrites the cast to produce int32 instead of int64.
-      3. torch.argmax()
-         insert an int64->int32 cast after the argmax node
+      3. torch.argmax() / torch.argmin()
+         insert an int64->int32 cast after the argmax/argmin node
 
-    Future extensions may include operators that return int64 outputs by default
-    (e.g., `argmin`), rewriting them or inserting an int64 -> int32 cast to yield
-    int32 results.
+    Future extensions may include other operators that return int64 outputs by
+    default, rewriting them or inserting an int64 -> int32 cast to yield int32
+    results.
 
-    Note: Overflow checks are applied selectively in this pass. For operators without
-    such checks, it is the user's responsibility to ensure that values fit within
-    the int32 range.
+    Args:
+        on_overflow: Action when an argmax/argmin index cannot safely fit in
+            int32 (i.e. the reduced dimension has more than INT32_MAX elements).
+            ``"raise"`` (default) raises a ``RuntimeError`` at compile time.
+            ``"warn"`` logs a warning and skips the cast for that node.
+            ``"skip"`` silently skips the cast for that node.
 
     """
 
     _passes_required_after: Set[Type[ExportPass]] = set()
 
+    _INT32_MAX = torch.iinfo(torch.int32).max
+
+    def __init__(
+        self,
+        *args,
+        on_overflow: Literal["raise", "warn", "skip"] = "raise",
+        **kwargs,
+    ) -> None:
+        """Initialise the pass and record the overflow policy.
+
+        Args:
+            *args: Positional arguments forwarded to the parent initialiser.
+            on_overflow: One of ``"raise"``, ``"warn"`` or ``"skip"``; stored
+                on the instance as ``self.on_overflow``.
+            **kwargs: Keyword arguments forwarded to the parent initialiser.
+
+        Raises:
+            ValueError: If ``on_overflow`` is not one of the accepted values.
+
+        """
+        super().__init__(*args, **kwargs)
+        accepted_policies = ("raise", "warn", "skip")
+        policy_is_valid = on_overflow in accepted_policies
+        if not policy_is_valid:
+            message = (
+                f"on_overflow must be 'raise', 'warn', or 'skip', got {on_overflow!r}"
+            )
+            raise ValueError(message)
+        self.on_overflow = on_overflow
+
+    def _is_int32_range_safe(self, node: torch.fx.Node) -> bool:
+        """Check whether an argmax/argmin index is representable in int32.
+
+        The bound is the size of the reduced dimension when ``dim`` is given,
+        otherwise the total element count of the input tensor.
+        """
+        fake_input = get_first_fake_tensor(cast(torch.fx.Node, node.args[0]))
+        # ``dim`` is the optional second positional argument; a missing value
+        # and an explicit ``None`` both fall back to the total element count.
+        reduce_dim = node.args[1] if len(node.args) > 1 else None
+        if reduce_dim is not None:
+            extent = fake_input.shape[cast(int, reduce_dim)]
+        else:
+            extent = fake_input.numel()
+        return extent <= self._INT32_MAX
+
     aten_cast_ops = (
         torch.ops.aten.to.dtype,
         torch.ops.aten.to.dtype_layout,
@@ -54,8 +82,11 @@ class ConvertInt64OutputOpsToInt32Pass(ArmPass):
     aten_argmax_ops = (torch.ops.aten.argmax.default,)
     edge_argmax_ops = (exir_ops.edge.aten.argmax.default,)
 
-    aten_ops = aten_cast_ops + aten_argmax_ops
-    edge_ops = edge_cast_ops + edge_argmax_ops
+    aten_argmin_ops = (torch.ops.aten.argmin.default,)
+    edge_argmin_ops = (exir_ops.edge.aten.argmin.default,)
+
+    aten_ops = aten_cast_ops + aten_argmax_ops + aten_argmin_ops
+    edge_ops = edge_cast_ops + edge_argmax_ops + edge_argmin_ops
 
     # dtype is specified in args
     cast_ops_args = (
@@ -104,7 +135,7 @@ def _convert_casting_operators(self, node: torch.fx.Node):
                 f" {input_dtype}->torch.int32 defined in {node.meta.get('stack_trace','[no stack trace found]')}"
             )
 
-    def _convert_argmax_operators(self, node: torch.fx.Node, graph: torch.fx.Graph):
+    def _cast_int64_output_to_int32(self, node: torch.fx.Node, graph: torch.fx.Graph):
         output_tensor = node
         to_copy_op = self._get_decomposition(node.target)
         with graph.inserting_after(node):
@@ -138,9 +169,23 @@ def call(self, graph_module: torch.fx.GraphModule):
 
             if node.target in self.aten_cast_ops + self.edge_cast_ops:
                 self._convert_casting_operators(node)
-            elif node.target in self.aten_argmax_ops + self.edge_argmax_ops:
-                # TODO: Add range check based on the input tensor shape before casting the output
-                self._convert_argmax_operators(node, graph)
+            elif node.target in (
+                self.aten_argmax_ops
+                + self.edge_argmax_ops
+                + self.aten_argmin_ops
+                + self.edge_argmin_ops
+            ):
+                if not self._is_int32_range_safe(node):
+                    msg = (
+                        f"{node.target} reduces over more than {self._INT32_MAX} elements; "
+                        f"the int64 index cannot be safely cast to int32."
+                    )
+                    if self.on_overflow == "raise":
+                        raise RuntimeError(msg)
+                    if self.on_overflow == "warn":
+                        logger.warning(msg)
+                    continue
+                self._cast_int64_output_to_int32(node, graph)
             else:
                 raise RuntimeError(f"Unexpected target {node.target} in {node.name}")
 
 
@@ -257,8 +257,10 @@ def call_operator(self, op, args, kwargs, meta):
 
         input_node = args[0]
         if DecomposeGroupedConvPass._is_depthwise_conv(input_node, groups, transposed):
-            # This is a depthwise convolution which is handled elsewhere
-            return super().call_operator(op, args, kwargs, meta)
+            # Conv2D depthwise maps to TOSA DEPTHWISE_CONV2D — handled in RewriteConvPass.
+            # Conv3D has no DEPTHWISE_CONV3D, so fall through and decompose like grouped conv.
+            if len(input_node.data.shape) != 5:
+                return super().call_operator(op, args, kwargs, meta)
 
         weight_node = args[1]
         bias_node = args[2]
 
@@ -129,13 +129,13 @@ def _is_depthwise_conv2d(self, node: torch.fx.Node) -> bool:
 
     def _is_conv3d(self, rank, groups) -> bool:
         if rank == 5:
-            # A Conv3D is considered depthwise if Group == InChannels and
-            # Group * N == OutChannels, where N is a possitive integer.
-            # Currently we do not support depthwise or grouped conv3d.
-            # @TODO Add grouped/depthwise conv3d support or reject in partitioner.
+            # Rank 5 is treated as Conv3D. Grouped and depthwise Conv3D are
+            # expected to be decomposed into groups == 1 convolutions by
+            # DecomposeGroupedConvPass; this guard only catches paths that skip it.
             if groups != 1:
                 raise RuntimeError(
-                    "CONV3D with groups != 1 is not supported in the Arm backend."
+                    "CONV3D with groups != 1 reached unexpectedly; "
+                    "DecomposeGroupedConvPass should have decomposed it first."
                 )
             return True
         return False
 
@@ -5,10 +5,10 @@
 
 from typing import final, Optional, Sequence
 
-import torch
 from executorch.backends.arm.ethosu import EthosUBackend, EthosUCompileSpec
 from executorch.backends.arm.tosa.partitioner import TOSAPartitioner
 from executorch.exir.backend.partitioner import DelegationSpec
+from torch._ops import OpOverload
 from torch.fx.passes.operator_support import OperatorSupportBase
 
 
@@ -33,5 +33,5 @@ def __init__(
         )
         self.additional_checks = additional_checks
         self.tosa_spec = compile_spec.tosa_spec
-        self._custom_partition_ops: set[torch._ops.OpOverload] = set()
+        self._custom_partition_ops: set[OpOverload] = set()
         self.intermediate_path = compile_spec._get_intermediate_path()
@@ -0,0 +1,75 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import base64
+import shutil
+import subprocess  # nosec B404 - required to invoke the shader compiler.
+import tempfile
+from pathlib import Path
+
+
+# Shader assets are resolved relative to this script: two directory levels up
+# from the script file, then into ``vgf/shaders``.
+SHADER_DIR = Path(__file__).resolve().parent.parent.joinpath("vgf", "shaders")
+DEFAULT_SOURCE = SHADER_DIR.joinpath("grid_sampler.glsl")
+DEFAULT_OUTPUT = SHADER_DIR.joinpath("grid_sampler.spirv.b64")
+
+
+def _parse_args() -> argparse.Namespace:
+    """Parse the command-line options for the shader generation script.
+
+    Returns:
+        Namespace exposing ``source``, ``output`` and ``glslc``.
+    """
+    summary = (
+        "Compile the VGF grid_sampler GLSL shader to SPIR-V and write the "
+        "base64-encoded payload consumed by the ExecuTorch custom-shader "
+        "lowering."
+    )
+    cli = argparse.ArgumentParser(description=summary)
+
+    # (flag, keyword arguments) pairs, registered in the order listed here.
+    option_table = (
+        (
+            "--source",
+            {
+                "type": Path,
+                "default": DEFAULT_SOURCE,
+                "help": f"GLSL source file. Defaults to {DEFAULT_SOURCE}",
+            },
+        ),
+        (
+            "--output",
+            {
+                "type": Path,
+                "default": DEFAULT_OUTPUT,
+                "help": f"Base64 SPIR-V output file. Defaults to {DEFAULT_OUTPUT}",
+            },
+        ),
+        (
+            "--glslc",
+            {
+                "default": "glslc",
+                "help": "Path to glslc. Defaults to resolving glslc from PATH.",
+            },
+        ),
+    )
+    for flag, settings in option_table:
+        cli.add_argument(flag, **settings)
+
+    return cli.parse_args()
+
+
+def _resolve_glslc(glslc: str) -> str:
+    """Locate the glslc executable and return its resolved path.
+
+    Args:
+        glslc: Executable name or path, looked up with ``shutil.which``.
+
+    Raises:
+        RuntimeError: If ``shutil.which`` finds no matching executable.
+    """
+    executable = shutil.which(glslc)
+    if executable is not None:
+        return executable
+    raise RuntimeError(
+        f"Could not find {glslc}. Install the Vulkan SDK or pass --glslc."
+    )
+
+
+def _write_base64_spirv(spirv_path: Path, output_path: Path) -> None:
+    """Base64-encode the file at ``spirv_path`` into ``output_path``.
+
+    The output is the ASCII base64 text followed by a single newline.
+    """
+    raw_bytes = spirv_path.read_bytes()
+    payload_text = base64.b64encode(raw_bytes).decode("ascii")
+    output_path.write_text(f"{payload_text}\n", encoding="utf-8")
+
+
+def main() -> None:
+    """Compile the GLSL source with glslc and write the base64 SPIR-V file."""
+    options = _parse_args()
+    compiler = _resolve_glslc(options.glslc)
+
+    # The raw SPIR-V binary is only an intermediate artefact, so it is written
+    # into a temporary directory that is cleaned up when the block exits.
+    with tempfile.TemporaryDirectory() as scratch_dir:
+        binary_path = Path(scratch_dir) / "grid_sampler.spirv"
+        command = [compiler, str(options.source), "-o", str(binary_path)]
+        # check=True raises CalledProcessError if glslc exits non-zero.
+        subprocess.run(  # nosec B603 - glslc path is resolved explicitly.
+            command,
+            check=True,
+        )
+        _write_base64_spirv(binary_path, options.output)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,79 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import base64
+
+import pytest
+from executorch.backends.arm.vgf.shaders.grid_sampler import (
+    build_grid_sampler_2d_payload,
+    decode_payload,
+    encode_payload,
+    GRID_SAMPLER_2D_SHADER_BINARY,
+    GRID_SAMPLER_2D_SHADER_ENTRY_POINT,
+    GRID_SAMPLER_2D_SHADER_LANGUAGE,
+    GRID_SAMPLER_2D_SHADER_SOURCE,
+    GRID_SAMPLER_2D_VK_FORMAT,
+    GRID_SAMPLER_2D_WORKGROUP_SIZES,
+)
+
+
+def test_grid_sampler_2d_custom_shader_payload_no_target_round_trip():
+    """Encode then decode a payload and check every decoded field."""
+    original = build_grid_sampler_2d_payload(
+        interpolation_mode=0,
+        padding_mode=2,
+        align_corners=True,
+    )
+    round_tripped = decode_payload(encode_payload(original))
+
+    assert round_tripped["entry_point"] == GRID_SAMPLER_2D_SHADER_ENTRY_POINT
+    assert round_tripped["workgroup_sizes"] == GRID_SAMPLER_2D_WORKGROUP_SIZES
+    assert round_tripped["shader_language"] == GRID_SAMPLER_2D_SHADER_LANGUAGE
+
+    # The decoded shader must start with the little-endian SPIR-V magic number.
+    shader_header = base64.b64decode(round_tripped["shader_code"])[:4]
+    assert shader_header == b"\x03\x02\x23\x07"
+
+    # Two inputs and one output, each expected on consecutive bindings 0, 1, 2.
+    for binding, resource in enumerate(("input_0", "input_1", "output_0")):
+        assert round_tripped[f"{resource}_type"] == "Tensor"
+        assert round_tripped[f"{resource}_vkformat"] == GRID_SAMPLER_2D_VK_FORMAT
+        assert (
+            round_tripped[f"{resource}_vkdescriptortype"]
+            == "VK_DESCRIPTOR_TYPE_STORAGE_BUFFER"
+        )
+        assert round_tripped[f"{resource}_binding"] == binding
+
+
+def test_grid_sampler_2d_custom_shader_payload_no_target_uses_spirv():
+    """A freshly built payload declares SPIR-V and carries a SPIR-V binary."""
+    built = build_grid_sampler_2d_payload(
+        interpolation_mode=0,
+        padding_mode=0,
+        align_corners=False,
+    )
+
+    # Decode before asserting, so a malformed base64 payload fails first.
+    decoded_shader = base64.b64decode(built["shader_code"])
+    leading_word = decoded_shader[:4]
+
+    assert built["shader_language"] == "SPIR-V"
+    assert leading_word == b"\x03\x02\x23\x07"
+
+
+def test_grid_sampler_2d_custom_shader_payload_no_target_has_shader_resources():
+    """The shader module exposes the expected resource file names."""
+    expected_source_name = "grid_sampler.glsl"
+    expected_binary_name = "grid_sampler.spirv.b64"
+
+    assert GRID_SAMPLER_2D_SHADER_SOURCE == expected_source_name
+    assert GRID_SAMPLER_2D_SHADER_BINARY == expected_binary_name
+
+
+def test_grid_sampler_2d_custom_shader_payload_no_target_rejects_bad_modes():
+    """Out-of-range interpolation and padding modes raise ``ValueError``."""
+    # (expected message fragment, mode arguments) checked in this order.
+    invalid_cases = (
+        (
+            "Unsupported interpolation_mode",
+            {"interpolation_mode": 99, "padding_mode": 0},
+        ),
+        (
+            "Unsupported padding_mode",
+            {"interpolation_mode": 0, "padding_mode": 99},
+        ),
+    )
+    for expected_message, modes in invalid_cases:
+        with pytest.raises(ValueError, match=expected_message):
+            build_grid_sampler_2d_payload(align_corners=False, **modes)