Skip to content

Commit 8e18287

Browse files
ssjia (SS-JIA)
authored and committed
[ET-VK] Implement aten.pixel_shuffle.default op
The previous commit on this stack added the fused `q8ta_pixel_shuffle` custom op and, to make pattern matching easier, added `aten.pixel_shuffle.default` to the partitioner's `ops_not_to_decompose` list. That change had a side effect: any non-quantized model that uses `aten.pixel_shuffle.default` now reaches the Vulkan backend with the op intact, but the backend had no implementation registered for it, so those models fail to lower. This commit adds a layout- and dtype-agnostic implementation of `aten.pixel_shuffle.default` so existing models keep working. The implementation rearranges `(N, C*r*r, H, W)` -> `(N, C, H*r, W*r)`, where output element `(n, c, h_out, w_out)` reads from input element `(n, c*r*r + (h_out%r)*r + (w_out%r), h_out/r, w_out/r)`. Two compute shaders are added because the work-assignment paradigm differs between storage types: - `pixel_shuffle_buffer.glsl` assigns one thread per output element and uses `linear_idx_to_tensor_idx` against the output `BufferMetadata`, which makes it agnostic to the underlying `dim_order`. - `pixel_shuffle_texture.glsl` assigns one thread per output texel and uses `TextureMetadata` plus `indexing.glslh` helpers so the same shader handles channels-, width-, and height-packed layouts. The texture shader uses the `safe_idx` / `safe_set` if/else helpers everywhere a UBO-backed `ivec4` is indexed by a spec-constant-derived value, to avoid the Adreno 740 SPIR-V compiler crash on `ubo_struct.sizes[spec_const]` when the spec const resolves to 1 or 2. The buffer shader does not dynamically index any UBO `ivec4`. Op registration: `register_pixel_shuffle()` in `op_registry.py` uses `ANY_STORAGE`, `FP_T`, and `supports_resize=True`, so the partitioner accepts both storage types and both fp32/fp16, across all packed layouts. Differential Revision: [D104462059](https://our.internmc.facebook.com/intern/diff/D104462059/) ghstack-source-id: 379519849 Pull Request resolved: #19404
1 parent 1ee58ed commit 8e18287

8 files changed

Lines changed: 355 additions & 0 deletions

File tree

backends/vulkan/op_registry.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1505,6 +1505,20 @@ def register_upsample_cpp_ops():
15051505
)
15061506

15071507

1508+
# =============================================================================
1509+
# PixelShuffle.cpp
1510+
# =============================================================================
1511+
1512+
1513+
@update_features(exir_ops.edge.aten.pixel_shuffle.default)
def register_pixel_shuffle():
    """Declare backend support for aten.pixel_shuffle.default.

    The op is supported for any storage type (buffer or texture) and the
    floating-point dtypes, and participates in dynamic-shape resizing.
    """
    features = OpFeatures(
        inputs_storage=utils.ANY_STORAGE,
        inputs_dtypes=utils.FP_T,
        supports_resize=True,
    )
    return features
1520+
1521+
15081522
# =============================================================================
15091523
# GridPriors.cpp
15101524
# =============================================================================

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
# Ops that the partitioner asks ExecuTorch not to decompose, so they reach
# the Vulkan backend intact instead of as a decomposed subgraph. Each op
# listed here needs a corresponding implementation registered in the backend.
ops_not_to_decompose = [
    torch.ops.aten.hardswish.default,
    torch.ops.aten.upsample_nearest2d.vec,
    torch.ops.aten.pixel_shuffle.default,
]
5253

5354
logger: logging.Logger = logging.getLogger("")
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
${define_required_extensions(STORAGE, DTYPE)}
12+
13+
#define PRECISION ${PRECISION}
14+
15+
#define T ${buffer_scalar_type(DTYPE)}
16+
17+
${define_active_storage_type(STORAGE)}
18+
19+
#extension GL_EXT_control_flow_attributes : require
20+
21+
layout(std430) buffer;
22+
23+
#include "indexing.glslh"
24+
25+
${layout_declare_tensor(B, "w", "t_outp", DTYPE, STORAGE)}
26+
${layout_declare_tensor(B, "r", "t_inp", DTYPE, STORAGE)}
27+
28+
${layout_declare_ubo(B, "BufferMetadata", "outp")}
29+
${layout_declare_ubo(B, "BufferMetadata", "inp")}
30+
31+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
32+
33+
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
34+
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
35+
${layout_declare_spec_const(C, "int", "upscale_factor", "1")}
36+
37+
/*
 * pixel_shuffle: rearranges (N, C*r*r, H, W) -> (N, C, H*r, W*r).
 *
 * For output element at NCHW index (n, c, h_out, w_out):
 *   h_in = h_out / r
 *   w_in = w_out / r
 *   c_in = c * r * r + (h_out % r) * r + (w_out % r)
 *
 * One thread is assigned per output element. The output tensor index is
 * recovered from the flat buffer index via the output BufferMetadata, which
 * keeps the shader agnostic to the underlying dim_order.
 */
void main() {
  const uint outp_bufi = gl_GlobalInvocationID.x;
  // Guard against the extra threads dispatched to round out the workgroup.
  if (outp_bufi >= numel(outp)) {
    return;
  }

  TensorIndex outp_tidx = linear_idx_to_tensor_idx(outp, outp_bufi);

  const int r = upscale_factor;

  // Tensor indices are stored in WHCN order:
  // data[0][0] = W, data[0][1] = H, data[0][2] = C, data[0][3] = N.
  const uint w_out = idx_at(outp_tidx, 0);
  const uint h_out = idx_at(outp_tidx, 1);
  const uint c_out = idx_at(outp_tidx, 2);

  // Inverse of the pixel-shuffle mapping: find the input element that
  // supplies this output element.
  const uint w_in = w_out / uint(r);
  const uint h_in = h_out / uint(r);
  const uint c_in = c_out * uint(r) * uint(r) +
      (h_out % uint(r)) * uint(r) + (w_out % uint(r));

  // Start from the output index so the batch (N) component carries over.
  TensorIndex inp_tidx = outp_tidx;
  inp_tidx.data[0][0] = w_in;
  inp_tidx.data[0][1] = h_in;
  inp_tidx.data[0][2] = c_in;

  const uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx);

  t_outp[outp_bufi] = t_inp[inp_bufi];
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Codegen config for the buffer-storage pixel_shuffle shader.
# Generates one variant per DTYPE (fp16 and fp32).
pixel_shuffle_buffer:
  parameter_names_with_default_values:
    DTYPE: float
    STORAGE: buffer
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
  shader_variants:
    - NAME: pixel_shuffle_buffer
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
${define_required_extensions(STORAGE, DTYPE)}
12+
13+
#define PRECISION ${PRECISION}
14+
15+
#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
16+
#define T ${texel_load_component_type(DTYPE, STORAGE)}
17+
18+
${define_active_storage_type(STORAGE)}
19+
20+
#extension GL_EXT_control_flow_attributes : require
21+
22+
layout(std430) buffer;
23+
24+
#include "indexing.glslh"
25+
26+
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
27+
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
28+
29+
${layout_declare_ubo(B, "TextureMetadata", "outp")}
30+
${layout_declare_ubo(B, "TextureMetadata", "inp")}
31+
32+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
33+
34+
${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
35+
${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")}
36+
${layout_declare_spec_const(C, "int", "upscale_factor", "1")}
37+
38+
// Packed dim of the output texture, derived from the layout spec constant;
// resolved at pipeline-compile time.
const int out_packed_dim = get_packed_dim(out_layout);

/*
 * pixel_shuffle: rearranges (N, C*r*r, H, W) -> (N, C, H*r, W*r).
 *
 * For output element at NCHW index (n, c, h_out, w_out):
 *   w_in = w_out / r
 *   h_in = h_out / r
 *   c_in = c * r * r + (h_out % r) * r + (w_out % r)
 *
 * Each thread writes one output texel of 4 components along the packed dim.
 * Each component may map to a different input texel, so we resolve per-
 * component and use texelFetch on the input.
 */
void main() {
  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);

  if (out_of_bounds(out_pos, outp)) {
    return;
  }

  TensorIndex4D out_tidx =
      texture_pos_to_tensor4d_idx_simple(outp, out_pos, out_layout);

  // safe_idx() avoids dynamic UBO-vector indexing, which crashes Adreno 740.
  // The output may not span a full block of 4 along the packed dim if the
  // packed-dim size is not a multiple of 4, so clamp the loop.
  const int limit = min(
      4,
      safe_idx(outp.sizes, out_packed_dim) -
          safe_idx(out_tidx.data, out_packed_dim));

  const int r = upscale_factor;

  VEC4_T out_texel = VEC4_T(0);
  for (int comp = 0; comp < 4; comp++) {
    if (comp >= limit) {
      break;
    }

    // Build the per-component output tensor index. tidx.data is a local
    // ivec4 in WHCN order ([0]=W, [1]=H, [2]=C, [3]=N), so dynamic indexing
    // here is safe (not UBO-backed).
    TensorIndex4D out_tidx_c = out_tidx;
    safe_set(
        out_tidx_c.data,
        out_packed_dim,
        safe_idx(out_tidx.data, out_packed_dim) + comp);

    const int w_out = out_tidx_c.data.x;
    const int h_out = out_tidx_c.data.y;
    const int c_out = out_tidx_c.data.z;

    // Inverse pixel-shuffle mapping for this output element.
    const int w_in = w_out / r;
    const int h_in = h_out / r;
    const int c_in = c_out * r * r + (h_out % r) * r + (w_out % r);

    // The batch index (.w) carries over unchanged.
    TensorIndex4D in_tidx;
    in_tidx.data = ivec4(w_in, h_in, c_in, out_tidx_c.data.w);

    // Resolve which input texel, and which component within it, holds the
    // source element; then fetch just that component.
    TextureElementIndex in_elem =
        tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx, in_layout);
    VEC4_T in_texel = texelFetch(t_in, in_elem.pos, 0);
    out_texel[comp] = in_texel[in_elem.comp];
  }

  // Components at and beyond `limit` remain 0.
  imageStore(t_out, out_pos, out_texel);
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Codegen config for the texture-storage pixel_shuffle shader.
# Generates one variant per DTYPE (fp16 and fp32).
pixel_shuffle_texture:
  parameter_names_with_default_values:
    DTYPE: float
    STORAGE: texture3d
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
  shader_variants:
    - NAME: pixel_shuffle_texture3d
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
10+
11+
#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
12+
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
13+
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
14+
15+
namespace vkcompute {
16+
17+
/*
 * Dynamic-shape resize hook: given input sized (N, C*r*r, H, W), resizes
 * the output tensor to (N, C, H*r, W*r).
 */
void resize_pixel_shuffle_node(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  const ValueRef out = args.at(0).refs.at(0);
  const ValueRef in = args.at(1).refs.at(0);
  const ValueRef upscale_factor_ref = resize_args.at(0);

  const int64_t factor = graph->extract_scalar<int64_t>(upscale_factor_ref);

  const std::vector<int64_t> input_sizes = graph->sizes_of(in);
  const int64_t rank = static_cast<int64_t>(input_sizes.size());
  // The shuffle needs at least trailing (C, H, W) dims to be defined.
  VK_CHECK_COND(rank >= 3);

  // Redistribute r*r channel groups into the two spatial dims.
  std::vector<int64_t> new_sizes(input_sizes);
  new_sizes.at(rank - 3) /= factor * factor;
  new_sizes.at(rank - 2) *= factor;
  new_sizes.at(rank - 1) *= factor;

  graph->virtual_resize(out, new_sizes);
}
38+
39+
/*
 * Adds a dispatch node that computes aten.pixel_shuffle.default, mapping
 * input (N, C*r*r, H, W) to output (N, C, H*r, W*r).
 */
void add_pixel_shuffle_node(
    ComputeGraph& graph,
    const ValueRef in,
    const ValueRef upscale_factor_ref,
    const ValueRef out) {
  // Validate the upscale factor and that the channel dim divides evenly
  // into r*r groups.
  const int64_t upscale = graph.extract_scalar<int64_t>(upscale_factor_ref);
  VK_CHECK_COND(upscale >= 1);

  const std::vector<int64_t> input_sizes = graph.sizes_of(in);
  const int64_t rank = static_cast<int64_t>(input_sizes.size());
  VK_CHECK_COND(rank >= 3);
  VK_CHECK_COND(input_sizes.at(rank - 3) % (upscale * upscale) == 0);

  // Select the shader variant matching the output's storage type and dtype.
  std::string shader_name = "pixel_shuffle";
  shader_name.reserve(kShaderNameReserve);
  add_storage_type_suffix(shader_name, graph.storage_type_of(out));
  add_dtype_suffix(shader_name, graph.dtype_of(out));

  // Tensor metadata UBOs; order matches the shader declarations
  // (output first, then input).
  vkapi::ParamsBindList param_buffers = {
      graph.meta_ubo(out), graph.meta_ubo(in)};

  // Spec constants: output layout, input layout, upscale factor — order
  // must match the shader's spec-constant declarations.
  vkapi::SpecVarList spec_vars = {
      graph.hashed_layout_of(out),
      graph.hashed_layout_of(in),
      static_cast<int32_t>(upscale)};

  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
      graph,
      VK_KERNEL_FROM_STR(shader_name),
      default_pick_global_wg_size,
      default_pick_local_wg_size,
      // Inputs and Outputs
      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
      // Shader params buffers
      param_buffers,
      // Push Constants
      {},
      // Specialization Constants
      spec_vars,
      // Resize Args
      {upscale_factor_ref},
      // Resizing Logic
      resize_pixel_shuffle_node));
}
82+
83+
/*
 * Operator entry point for aten.pixel_shuffle.default.
 *
 * args layout: {input tensor, upscale_factor scalar, output tensor}.
 */
void pixel_shuffle(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  // Use bounds-checked .at() for consistency with resize_pixel_shuffle_node
  // and to fail loudly on a malformed argument list instead of invoking UB.
  const ValueRef in = args.at(0);
  const ValueRef upscale_factor_ref = args.at(1);
  const ValueRef out = args.at(2);
  add_pixel_shuffle_node(graph, in, upscale_factor_ref, out);
}
89+
90+
// Bind the graph-builder entry point to the ATen operator name so the
// Vulkan delegate dispatches aten.pixel_shuffle.default to it.
REGISTER_OPERATORS {
  VK_REGISTER_OP(aten.pixel_shuffle.default, pixel_shuffle);
}
93+
94+
} // namespace vkcompute

backends/vulkan/test/op_tests/cases.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -834,6 +834,29 @@ def get_upsample_bilinear2d_inputs():
834834
return VkTestSuite(inputs_list)
835835

836836

837+
@register_test_suite("aten.pixel_shuffle.default")
def get_pixel_shuffle_inputs():
    """Test cases for aten.pixel_shuffle.default across storage types,
    packed layouts, and fp dtypes."""
    # Each case is (input shape (N, C*r*r, H, W), upscale_factor r).
    cases = [
        ((1, 4, 2, 2), 2),
        ((1, 9, 3, 3), 3),
        ((1, 16, 2, 2), 4),
        ((2, 4, 3, 5), 2),
        ((1, 8, 4, 4), 2),
        ((1, 12, 3, 4), 2),
    ]
    suite = VkTestSuite(cases)
    # Exercise both storage types and all three packed layouts.
    suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
    suite.layouts = [
        "utils::kChannelsPacked",
        "utils::kWidthPacked",
        "utils::kHeightPacked",
    ]
    suite.dtypes = ["at::kFloat", "at::kHalf"]
    return suite
858+
859+
837860
@register_test_suite(["aten.full.default", "aten.full_like.default"])
838861
def get_full_inputs():
839862
test_suite = VkTestSuite(

0 commit comments

Comments
 (0)