Skip to content

Commit d7e20b5

Browse files
ssjiaSS-JIA
authored and committed
[ET-VK] Modernize permute op with safe indexing and unified dispatch
Modernize the permute operator to follow current best practices, fixing an Adreno 740 driver crash caused by dynamic UBO indexing in the texture shader.

Texture shader changes:
- Replace old indexing_utils.h with indexing.glslh
- Use TextureMetadata UBOs instead of push constant sizes
- Use texture_pos_to_tensor4d_idx_simple() and related helpers
- Replace permute_dims[out_packed_dim] with safe_idx() to avoid dynamic indexing of push constant with spec-const-derived index
- Use TextureElementIndex pattern for the slow path

C++ dispatch changes:
- Merge add_permute_node() and add_permute_buffer_node() into a single unified function using graph.meta_ubo() and conditional logic
- Remove unused channel_info computation
- Move WHCNPermuteDims struct into anonymous namespace
- Guard texture path with VK_CHECK_COND(permute_ndim <= 4)

Differential Revision: [D98220451](https://our.internmc.facebook.com/intern/diff/D98220451/)

ghstack-source-id: 357844381

Pull Request resolved: #18511
1 parent 2aa02bf commit d7e20b5

2 files changed

Lines changed: 110 additions & 182 deletions

File tree

backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl

Lines changed: 50 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -9,97 +9,97 @@
99
#version 450 core
1010

1111
${define_required_extensions("texture3d", DTYPE)}
12-
${define_explicit_type_extensions(DTYPE)}
1312

1413
#define PRECISION ${PRECISION}
1514

16-
#define VEC4_T ${texel_type(DTYPE)}
17-
#define T ${buffer_scalar_type(DTYPE)}
15+
#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
16+
#define T ${texel_load_component_type(DTYPE, "texture3d")}
1817

1918
${define_active_storage_type("texture3d")}
2019

20+
#extension GL_EXT_control_flow_attributes : require
21+
2122
layout(std430) buffer;
2223

23-
#include "indexing_utils.h"
24+
#include "common.glslh"
25+
#include "indexing.glslh"
2426

2527
${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
2628
${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
2729

30+
${layout_declare_ubo(B, "TextureMetadata", "outp")}
31+
${layout_declare_ubo(B, "TextureMetadata", "inp")}
32+
2833
layout(push_constant) uniform restrict Block {
29-
ivec4 out_sizes;
30-
ivec4 in_sizes;
31-
ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j
34+
ivec4 permute_dims;
3235
};
3336

34-
${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
35-
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
36-
const lowp int out_packed_dim = unhash_packed_dim(out_layout);
37-
38-
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
39-
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
40-
const lowp int in_packed_dim = unhash_packed_dim(in_layout);
37+
${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
38+
${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")}
39+
const int out_packed_dim = get_packed_dim(out_layout);
40+
const int in_packed_dim = get_packed_dim(in_layout);
4141

4242
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4343

44-
// Convert output tensor index to input tensor index based on permutation
44+
// Convert output tensor index to input tensor index based on permutation.
45+
// permute_dims[i] = j means output dim i comes from input dim j.
46+
// We write: in_tidx[permute_dims.{x,y,z,w}] = out_tidx.{x,y,z,w}
47+
// This uses literal component access on the push constant (safe) and dynamic
48+
// indexing into the local in_tidx variable (also safe).
4549
ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
4650
ivec4 in_tidx;
47-
48-
// Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i]
4951
in_tidx[permute_dims.x] = out_tidx.x;
5052
in_tidx[permute_dims.y] = out_tidx.y;
5153
in_tidx[permute_dims.z] = out_tidx.z;
5254
in_tidx[permute_dims.w] = out_tidx.w;
53-
5455
return in_tidx;
5556
}
5657

57-
// Check if we can use the fast path where texels from the input tensor can be
58-
// copied directly into the output tensor. This occurs when the packed dimension
59-
// is preserved in the permutation, i.e. reading a texel from the output tensor
60-
// produces 4 texels along the same dimension as reading a texel from the input
61-
// tensor.
62-
bool can_use_fast_path() {
63-
// Fast path is possible when the packed dimension is preserved in the permutation
64-
// This means permute_dims[out_packed_dim] == in_packed_dim
65-
return permute_dims[out_packed_dim] == in_packed_dim;
66-
}
67-
6858
void main() {
69-
const ivec3 lpos = ivec3(gl_GlobalInvocationID);
70-
ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim);
59+
const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
7160

72-
if (any(greaterThanEqual(out_tidx, out_sizes))) {
61+
if (out_of_bounds(out_pos, outp)) {
7362
return;
7463
}
7564

76-
if (can_use_fast_path()) {
65+
TensorIndex4D out_tidx =
66+
texture_pos_to_tensor4d_idx_simple(outp, out_pos, out_layout);
67+
68+
// Check if packed dimension is preserved in the permutation. Use safe_idx
69+
// to avoid dynamic indexing of push constant with spec-const-derived index.
70+
const bool fast_path =
71+
safe_idx(permute_dims, out_packed_dim) == in_packed_dim;
72+
73+
if (fast_path) {
7774
// Fast path: packed dimension is preserved, so we can copy texels directly
78-
ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
79-
ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
80-
VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
75+
ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data);
76+
TensorIndex4D in_tidx;
77+
in_tidx.data = in_tidx_data;
8178

82-
write_texel_lpos(t_out, lpos, in_texel, out_axis_map);
83-
}
84-
else {
79+
ivec3 in_pos =
80+
tensor4d_idx_to_texel_pos_simple(inp, in_tidx, in_layout);
81+
VEC4_T in_texel = texelFetch(t_in, in_pos, 0);
82+
83+
imageStore(t_out, out_pos, in_texel);
84+
} else {
8585
// Slow path: packed dimension is not preserved, so each element of the
86-
// output texel may be "sourced" from a different texel in the input tensor.
87-
// Therefore each output texel element is processed individually.
86+
// output texel may come from a different texel in the input tensor.
8887
VEC4_T out_texel = VEC4_T(0);
8988

90-
for (int texel_i = 0; texel_i < 4; ++texel_i) {
91-
ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
92-
ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
93-
int element_idx = in_tidx[in_packed_dim] % 4;
89+
for (int comp = 0; comp < 4; comp++) {
90+
ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data);
91+
TensorIndex4D in_tidx;
92+
in_tidx.data = in_tidx_data;
9493

95-
VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
96-
T selected_value = T(in_texel[element_idx]);
94+
TextureElementIndex in_elem =
95+
tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx, in_layout);
9796

98-
out_texel[texel_i] = selected_value;
97+
VEC4_T in_texel = texelFetch(t_in, in_elem.pos, 0);
98+
out_texel[comp] = in_texel[in_elem.comp];
9999

100-
out_tidx[out_packed_dim]++;
100+
out_tidx.data[out_packed_dim]++;
101101
}
102102

103-
write_texel_lpos(t_out, lpos, out_texel, out_axis_map);
103+
imageStore(t_out, out_pos, out_texel);
104104
}
105105
}

0 commit comments

Comments
 (0)