#version 450 core

// Codegen-required extensions for texture3d storage with this DTYPE.
${define_required_extensions("texture3d", DTYPE)}

#define PRECISION ${PRECISION}

// Texel vector type and its scalar component type for texture3d access.
#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
#define T ${texel_load_component_type(DTYPE, "texture3d")}

${define_active_storage_type("texture3d")}

// Needed for control-flow attributes such as [[unroll]].
#extension GL_EXT_control_flow_attributes : require

layout(std430) buffer;

#include "common.glslh"
#include "indexing.glslh"

${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}

// Sizes/strides metadata for the output and input textures.
${layout_declare_ubo(B, "TextureMetadata", "outp")}
${layout_declare_ubo(B, "TextureMetadata", "inp")}

layout(push_constant) uniform restrict Block {
  // Permutation mapping: permute_dims[i] = j means output dim i comes from
  // input dim j.
  ivec4 permute_dims;
};

${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")}
const int out_packed_dim = get_packed_dim(out_layout);
const int in_packed_dim = get_packed_dim(in_layout);

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
// Map an output tensor index to the corresponding input tensor index.
// permute_dims[i] = j means output dim i is sourced from input dim j, so the
// components are scattered: result[permute_dims.{x,y,z,w}] = out_tidx.{x,y,z,w}.
// Component access on the push constant stays literal (safe), while dynamic
// indexing is confined to the local variable (also safe).
ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
  ivec4 result;
  result[permute_dims.x] = out_tidx.x;
  result[permute_dims.y] = out_tidx.y;
  result[permute_dims.z] = out_tidx.z;
  result[permute_dims.w] = out_tidx.w;
  return result;
}
// Permute the input tensor into the output tensor. Each invocation computes
// one output texel at out_pos.
void main() {
  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);

  if (out_of_bounds(out_pos, outp)) {
    return;
  }

  TensorIndex4D out_tidx =
      texture_pos_to_tensor4d_idx_simple(outp, out_pos, out_layout);

  // The fast path applies when the packed dimension is preserved by the
  // permutation (permute_dims[out_packed_dim] == in_packed_dim), i.e. an
  // output texel's 4 elements lie along the same input dimension as an input
  // texel's. safe_idx avoids dynamically indexing a push constant with a
  // spec-constant-derived index.
  const bool fast_path =
      safe_idx(permute_dims, out_packed_dim) == in_packed_dim;

  if (fast_path) {
    // Fast path: packed dimension is preserved, so we can copy texels directly.
    ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data);
    TensorIndex4D in_tidx;
    in_tidx.data = in_tidx_data;

    ivec3 in_pos =
        tensor4d_idx_to_texel_pos_simple(inp, in_tidx, in_layout);
    VEC4_T in_texel = texelFetch(t_in, in_pos, 0);

    imageStore(t_out, out_pos, in_texel);
  } else {
    // Slow path: packed dimension is not preserved, so each element of the
    // output texel may come from a different texel in the input tensor and
    // must be gathered individually.
    VEC4_T out_texel = VEC4_T(0);

    // Fixed trip count of 4 — GL_EXT_control_flow_attributes is required
    // above, so request unrolling explicitly.
    [[unroll]] for (int comp = 0; comp < 4; comp++) {
      ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data);
      TensorIndex4D in_tidx;
      in_tidx.data = in_tidx_data;

      TextureElementIndex in_elem =
          tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx, in_layout);

      VEC4_T in_texel = texelFetch(t_in, in_elem.pos, 0);
      out_texel[comp] = in_texel[in_elem.comp];

      // Step to the next element along the output's packed dimension.
      out_tidx.data[out_packed_dim]++;
    }

    imageStore(t_out, out_pos, out_texel);
  }
}
0 commit comments