Skip to content

Commit 8e18287

Browse files
ssjia (SS-JIA)
authored and committed
[ET-VK] Implement aten.pixel_shuffle.default op
The previous commit on this stack added the fused `q8ta_pixel_shuffle` custom op and, to make pattern matching easier, added `aten.pixel_shuffle.default` to the partitioner's `ops_not_to_decompose` list. That change had a side effect: any non-quantized model that uses `aten.pixel_shuffle.default` now reaches the Vulkan backend with the op intact, but the backend had no implementation registered for it, so those models fail to lower. This commit adds a layout- and dtype-agnostic implementation of `aten.pixel_shuffle.default` so existing models keep working. The implementation rearranges `(N, C*r*r, H, W)` -> `(N, C, H*r, W*r)`, where output element `(n, c, h_out, w_out)` reads from input element `(n, c*r*r + (h_out%r)*r + (w_out%r), h_out/r, w_out/r)`. Two compute shaders are added because the work-assignment paradigm differs between storage types: - `pixel_shuffle_buffer.glsl` assigns one thread per output element and uses `linear_idx_to_tensor_idx` against the output `BufferMetadata`, which makes it agnostic to the underlying `dim_order`. - `pixel_shuffle_texture.glsl` assigns one thread per output texel and uses `TextureMetadata` plus `indexing.glslh` helpers so the same shader handles channels-, width-, and height-packed layouts. The texture shader uses the `safe_idx` / `safe_set` if/else helpers everywhere a UBO-backed `ivec4` is indexed by a spec-constant-derived value, to avoid the Adreno 740 SPIR-V compiler crash on `ubo_struct.sizes[spec_const]` when the spec const resolves to 1 or 2. The buffer shader does not dynamically index any UBO `ivec4`. Op registration: `register_pixel_shuffle()` in `op_registry.py` uses `ANY_STORAGE`, `FP_T`, and `supports_resize=True`, so the partitioner accepts both storage types and both fp32/fp16, across all packed layouts. Differential Revision: [D104462059](https://our.internmc.facebook.com/intern/diff/D104462059/) ghstack-source-id: 379519849 Pull Request resolved: #19404
1 parent 1ee58ed commit 8e18287

8 files changed

Lines changed: 355 additions & 0 deletions

File tree

backends/vulkan/op_registry.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1505,6 +1505,20 @@ def register_upsample_cpp_ops():
15051505
)
15061506

15071507

1508+
# =============================================================================
1509+
# PixelShuffle.cpp
1510+
# =============================================================================
1511+
1512+
1513+
@update_features(exir_ops.edge.aten.pixel_shuffle.default)
def register_pixel_shuffle():
    """Declare backend support for aten.pixel_shuffle.default.

    The op is supported for any storage type (buffer or texture) and the
    floating-point dtypes, and participates in dynamic-shape resizing.
    """
    features = OpFeatures(
        inputs_storage=utils.ANY_STORAGE,
        inputs_dtypes=utils.FP_T,
        supports_resize=True,
    )
    return features
1520+
1521+
15081522
# =============================================================================
15091523
# GridPriors.cpp
15101524
# =============================================================================

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
# Ops that the partitioner asks ExecuTorch not to decompose, so they reach
# the Vulkan backend intact instead of as a decomposed subgraph. Each op
# listed here needs a corresponding implementation registered in the backend.
ops_not_to_decompose = [
    torch.ops.aten.hardswish.default,
    torch.ops.aten.upsample_nearest2d.vec,
    torch.ops.aten.pixel_shuffle.default,
]
5253

5354
logger: logging.Logger = logging.getLogger("")
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
${define_required_extensions(STORAGE, DTYPE)}
12+
13+
#define PRECISION ${PRECISION}
14+
15+
#define T ${buffer_scalar_type(DTYPE)}
16+
17+
${define_active_storage_type(STORAGE)}
18+
19+
#extension GL_EXT_control_flow_attributes : require
20+
21+
layout(std430) buffer;
22+
23+
#include "indexing.glslh"
24+
25+
${layout_declare_tensor(B, "w", "t_outp", DTYPE, STORAGE)}
26+
${layout_declare_tensor(B, "r", "t_inp", DTYPE, STORAGE)}
27+
28+
${layout_declare_ubo(B, "BufferMetadata", "outp")}
29+
${layout_declare_ubo(B, "BufferMetadata", "inp")}
30+
31+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
32+
33+
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
34+
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
35+
${layout_declare_spec_const(C, "int", "upscale_factor", "1")}
36+
37+
/*
 * pixel_shuffle: rearranges (N, C*r*r, H, W) -> (N, C, H*r, W*r).
 *
 * For output element at NCHW index (n, c, h_out, w_out):
 *   h_in = h_out / r
 *   w_in = w_out / r
 *   c_in = c * r * r + (h_out % r) * r + (w_out % r)
 *
 * One thread is assigned per output element. The output tensor index is
 * recovered from the flat buffer index via the output BufferMetadata, which
 * keeps the shader agnostic to the underlying dim_order.
 */
void main() {
  const uint outp_bufi = gl_GlobalInvocationID.x;
  // Guard against the extra threads dispatched to round out the workgroup.
  if (outp_bufi >= numel(outp)) {
    return;
  }

  TensorIndex outp_tidx = linear_idx_to_tensor_idx(outp, outp_bufi);

  const int r = upscale_factor;

  // Tensor indices are stored in WHCN order:
  // data[0][0] = W, data[0][1] = H, data[0][2] = C, data[0][3] = N.
  const uint w_out = idx_at(outp_tidx, 0);
  const uint h_out = idx_at(outp_tidx, 1);
  const uint c_out = idx_at(outp_tidx, 2);

  // Inverse of the pixel-shuffle mapping: find the input element that
  // supplies this output element.
  const uint w_in = w_out / uint(r);
  const uint h_in = h_out / uint(r);
  const uint c_in = c_out * uint(r) * uint(r) +
      (h_out % uint(r)) * uint(r) + (w_out % uint(r));

  // Start from the output index so the batch (N) component carries over.
  TensorIndex inp_tidx = outp_tidx;
  inp_tidx.data[0][0] = w_in;
  inp_tidx.data[0][1] = h_in;
  inp_tidx.data[0][2] = c_in;

  const uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx);

  t_outp[outp_bufi] = t_inp[inp_bufi];
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Codegen config for the buffer-storage pixel_shuffle shader.
# Generates one variant per DTYPE (fp16 and fp32).
pixel_shuffle_buffer:
  parameter_names_with_default_values:
    DTYPE: float
    STORAGE: buffer
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
  shader_variants:
    - NAME: pixel_shuffle_buffer
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
${define_required_extensions(STORAGE, DTYPE)}
12+
13+
#define PRECISION ${PRECISION}
14+
15+
#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
16+
#define T ${texel_load_component_type(DTYPE, STORAGE)}
17+
18+
${define_active_storage_type(STORAGE)}
19+
20+
#extension GL_EXT_control_flow_attributes : require
21+
22+
layout(std430) buffer;
23+
24+
#include "indexing.glslh"
25+
26+
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
27+
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
28+
29+
${layout_declare_ubo(B, "TextureMetadata", "outp")}
30+
${layout_declare_ubo(B, "TextureMetadata", "inp")}
31+
32+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
33+
34+
${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
35+
${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")}
36+
${layout_declare_spec_const(C, "int", "upscale_factor", "1")}
37+
38+
// Packed dim of the output texture, derived from the layout spec constant;
// resolved at pipeline-compile time.
const int out_packed_dim = get_packed_dim(out_layout);

/*
 * pixel_shuffle: rearranges (N, C*r*r, H, W) -> (N, C, H*r, W*r).
 *
 * For output element at NCHW index (n, c, h_out, w_out):
 *   w_in = w_out / r
 *   h_in = h_out / r
 *   c_in = c * r * r + (h_out % r) * r + (w_out % r)
 *
 * Each thread writes one output texel of 4 components along the packed dim.
 * Each component may map to a different input texel, so we resolve per-
 * component and use texelFetch on the input.
 */
void main() {
  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);

  if (out_of_bounds(out_pos, outp)) {
    return;
  }

  TensorIndex4D out_tidx =
      texture_pos_to_tensor4d_idx_simple(outp, out_pos, out_layout);

  // safe_idx() avoids dynamic UBO-vector indexing, which crashes Adreno 740.
  // The output may not span a full block of 4 along the packed dim if the
  // packed-dim size is not a multiple of 4, so clamp the loop.
  const int limit = min(
      4,
      safe_idx(outp.sizes, out_packed_dim) -
          safe_idx(out_tidx.data, out_packed_dim));

  const int r = upscale_factor;

  VEC4_T out_texel = VEC4_T(0);
  for (int comp = 0; comp < 4; comp++) {
    if (comp >= limit) {
      break;
    }

    // Build the per-component output tensor index. tidx.data is a local
    // ivec4 in WHCN order ([0]=W, [1]=H, [2]=C, [3]=N), so dynamic indexing
    // here is safe (not UBO-backed).
    TensorIndex4D out_tidx_c = out_tidx;
    safe_set(
        out_tidx_c.data,
        out_packed_dim,
        safe_idx(out_tidx.data, out_packed_dim) + comp);

    const int w_out = out_tidx_c.data.x;
    const int h_out = out_tidx_c.data.y;
    const int c_out = out_tidx_c.data.z;

    // Inverse pixel-shuffle mapping for this output element.
    const int w_in = w_out / r;
    const int h_in = h_out / r;
    const int c_in = c_out * r * r + (h_out % r) * r + (w_out % r);

    // The batch index (.w) carries over unchanged.
    TensorIndex4D in_tidx;
    in_tidx.data = ivec4(w_in, h_in, c_in, out_tidx_c.data.w);

    // Resolve which input texel, and which component within it, holds the
    // source element; then fetch just that component.
    TextureElementIndex in_elem =
        tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx, in_layout);
    VEC4_T in_texel = texelFetch(t_in, in_elem.pos, 0);
    out_texel[comp] = in_texel[in_elem.comp];
  }

  // Components at and beyond `limit` remain 0.
  imageStore(t_out, out_pos, out_texel);
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Codegen config for the texture-storage pixel_shuffle shader.
# Generates one variant per DTYPE (fp16 and fp32).
pixel_shuffle_texture:
  parameter_names_with_default_values:
    DTYPE: float
    STORAGE: texture3d
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
  shader_variants:
    - NAME: pixel_shuffle_texture3d
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
10+
11+
#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
12+
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
13+
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
14+
15+
namespace vkcompute {
16+
17+
/*
 * Dynamic-shape resize hook: given input sized (N, C*r*r, H, W), resizes
 * the output tensor to (N, C, H*r, W*r).
 */
void resize_pixel_shuffle_node(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  const ValueRef out = args.at(0).refs.at(0);
  const ValueRef in = args.at(1).refs.at(0);
  const ValueRef upscale_factor_ref = resize_args.at(0);

  const int64_t factor = graph->extract_scalar<int64_t>(upscale_factor_ref);

  const std::vector<int64_t> input_sizes = graph->sizes_of(in);
  const int64_t rank = static_cast<int64_t>(input_sizes.size());
  // The shuffle needs at least trailing (C, H, W) dims to be defined.
  VK_CHECK_COND(rank >= 3);

  // Redistribute r*r channel groups into the two spatial dims.
  std::vector<int64_t> new_sizes(input_sizes);
  new_sizes.at(rank - 3) /= factor * factor;
  new_sizes.at(rank - 2) *= factor;
  new_sizes.at(rank - 1) *= factor;

  graph->virtual_resize(out, new_sizes);
}
38+
39+
/*
 * Adds a dispatch node that computes aten.pixel_shuffle.default, mapping
 * input (N, C*r*r, H, W) to output (N, C, H*r, W*r).
 */
void add_pixel_shuffle_node(
    ComputeGraph& graph,
    const ValueRef in,
    const ValueRef upscale_factor_ref,
    const ValueRef out) {
  // Validate the upscale factor and that the channel dim divides evenly
  // into r*r groups.
  const int64_t upscale = graph.extract_scalar<int64_t>(upscale_factor_ref);
  VK_CHECK_COND(upscale >= 1);

  const std::vector<int64_t> input_sizes = graph.sizes_of(in);
  const int64_t rank = static_cast<int64_t>(input_sizes.size());
  VK_CHECK_COND(rank >= 3);
  VK_CHECK_COND(input_sizes.at(rank - 3) % (upscale * upscale) == 0);

  // Select the shader variant matching the output's storage type and dtype.
  std::string shader_name = "pixel_shuffle";
  shader_name.reserve(kShaderNameReserve);
  add_storage_type_suffix(shader_name, graph.storage_type_of(out));
  add_dtype_suffix(shader_name, graph.dtype_of(out));

  // Tensor metadata UBOs; order matches the shader declarations
  // (output first, then input).
  vkapi::ParamsBindList param_buffers = {
      graph.meta_ubo(out), graph.meta_ubo(in)};

  // Spec constants: output layout, input layout, upscale factor — order
  // must match the shader's spec-constant declarations.
  vkapi::SpecVarList spec_vars = {
      graph.hashed_layout_of(out),
      graph.hashed_layout_of(in),
      static_cast<int32_t>(upscale)};

  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
      graph,
      VK_KERNEL_FROM_STR(shader_name),
      default_pick_global_wg_size,
      default_pick_local_wg_size,
      // Inputs and Outputs
      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
      // Shader params buffers
      param_buffers,
      // Push Constants
      {},
      // Specialization Constants
      spec_vars,
      // Resize Args
      {upscale_factor_ref},
      // Resizing Logic
      resize_pixel_shuffle_node));
}
82+
83+
/*
 * Operator entry point for aten.pixel_shuffle.default.
 *
 * args layout: {input tensor, upscale_factor scalar, output tensor}.
 */
void pixel_shuffle(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  // Use bounds-checked .at() for consistency with resize_pixel_shuffle_node
  // and to fail loudly on a malformed argument list instead of invoking UB.
  const ValueRef in = args.at(0);
  const ValueRef upscale_factor_ref = args.at(1);
  const ValueRef out = args.at(2);
  add_pixel_shuffle_node(graph, in, upscale_factor_ref, out);
}
89+
90+
// Bind the graph-builder entry point to the ATen operator name so the
// Vulkan delegate dispatches aten.pixel_shuffle.default to it.
REGISTER_OPERATORS {
  VK_REGISTER_OP(aten.pixel_shuffle.default, pixel_shuffle);
}
93+
94+
} // namespace vkcompute

backends/vulkan/test/op_tests/cases.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -834,6 +834,29 @@ def get_upsample_bilinear2d_inputs():
834834
return VkTestSuite(inputs_list)
835835

836836

837+
@register_test_suite("aten.pixel_shuffle.default")
def get_pixel_shuffle_inputs():
    """Test cases for aten.pixel_shuffle.default across storage types,
    packed layouts, and fp dtypes."""
    # Each case is (input shape (N, C*r*r, H, W), upscale_factor r).
    cases = [
        ((1, 4, 2, 2), 2),
        ((1, 9, 3, 3), 3),
        ((1, 16, 2, 2), 4),
        ((2, 4, 3, 5), 2),
        ((1, 8, 4, 4), 2),
        ((1, 12, 3, 4), 2),
    ]
    suite = VkTestSuite(cases)
    # Exercise both storage types and all three packed layouts.
    suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
    suite.layouts = [
        "utils::kChannelsPacked",
        "utils::kWidthPacked",
        "utils::kHeightPacked",
    ]
    suite.dtypes = ["at::kFloat", "at::kHalf"]
    return suite
858+
859+
837860
@register_test_suite(["aten.full.default", "aten.full_like.default"])
838861
def get_full_inputs():
839862
test_suite = VkTestSuite(

0 commit comments

Comments
 (0)