// Source: executorch/backends/vulkan/runtime/graph/ops/impl/Copy.cpp at baa239b1b2fbdd0ff9f173c0935472c5b67e2660 (pytorch/executorch, GitHub)
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

using utils::ivec3;
using utils::ivec4;
using utils::uvec3;

// Appends a dispatch of the `copy_offset` shader to the graph's execute nodes,
// with `out` bound for writing and `in` bound for reading.
//
// The shader variant is selected from the dtype and storage type of `out`, and
// the workgroup sizes are derived from `out`.
//
// - `range`, `src_offset`, `dst_offset`: handed to the shader as three push
//   constants, in that order.
// - `calc_out_pos_using_src_chnl` / `calc_in_pos_using_dst_chnl`: folded into a
//   single integer specialization constant (see below). If both are set, the
//   first one wins.
void add_copy_offset_node(
    ComputeGraph& graph,
    const ValueRef in,
    const ivec3& range,
    const ivec4& src_offset,
    const ivec4& dst_offset,
    const ValueRef out,
    bool calc_out_pos_using_src_chnl,
    bool calc_in_pos_using_dst_chnl) {
  // NOTE(review): `t_in` is not otherwise used in this function. The lookup is
  // kept so behavior is unchanged in case get_tensor() validates the reference;
  // confirm before dropping it.
  vTensorPtr t_in = graph.get_tensor(in);
  vTensorPtr t_out = graph.get_tensor(out);

  std::string kernel_name = "copy_offset";
  kernel_name.reserve(kShaderNameReserve);
  add_dtype_suffix(kernel_name, *t_out);
  add_storage_type_suffix(kernel_name, *t_out);

  // Selector passed as the third specialization constant:
  //   1 -> calc_out_pos_using_src_chnl
  //   2 -> calc_in_pos_using_dst_chnl (only when the first flag is not set)
  //   0 -> neither flag set
  const int pos_calc_mode = calc_out_pos_using_src_chnl
      ? 1
      : (calc_in_pos_using_dst_chnl ? 2 : 0);

  // The kernel is looked up exactly once, directly at the point of use.
  graph.execute_nodes().emplace_back(new DispatchNode(
      graph,
      VK_KERNEL_FROM_STR(kernel_name),
      graph.create_global_wg_size(out),
      graph.create_local_wg_size(out),
      // Inputs and Outputs
      {
          {out, vkapi::kWrite},
          {in, vkapi::kRead},
      },
      // Parameter buffers
      {},
      // Specialization Constants
      {graph.hashed_layout_of(out), graph.hashed_layout_of(in), pos_calc_mode},
      nullptr,
      {},
      // Push constants. NOTE(review): the third argument (sizeof(ivec4)) is
      // presumably the size reserved for each entry in the push constant
      // block, so the ivec3 `range` occupies an ivec4-sized slot; confirm
      // against PushConstantDataInfo.
      {
          PushConstantDataInfo(&range, sizeof(range), sizeof(ivec4)),
          PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)),
          PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)),
      }));
}

// Appends a dispatch of the `copy_packed_dim_offset` shader to the graph's
// execute nodes. `out` is bound for both writing and reading, `in` for reading.
//
// - `range`: copy extent. It is widened to an ivec4 whose last element is the
//   batch size of `in` before being pushed to the shader.
// - `src_offset` / `dst_offset`: pushed to the shader unchanged. Their component
//   along the packed dimension is also inspected here to decide whether one
//   extra texel of work is required.
// - `repeat`: forwarded as a specialization constant. When false, the packed
//   dimension must be width or height, and the dispatch size / range may be
//   grown by one along the packed dimension.
//
// Fails a VK_CHECK_COND if the two tensors do not share a packed dimension, or
// if `repeat` is false and the packed dimension is neither width nor height.
void add_copy_packed_dim_offset_node(
    ComputeGraph& graph,
    const ValueRef in,
    const ivec3& range,
    const ivec4& src_offset,
    const ivec4& dst_offset,
    const ValueRef out,
    bool repeat) {
  vTensorPtr t_in = graph.get_tensor(in);
  vTensorPtr t_out = graph.get_tensor(out);

  // Check the packed dimension is same for both tensors
  VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
  if (!repeat) {
    // For non repeat copy also check if the packed dimension is Width or
    // Height, since the function does not support channel packing. The
    // same-packed-dim requirement has already been enforced above, so it is
    // not re-tested here.
    VK_CHECK_COND(
        check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
        check_packed_dim_is(*t_in, WHCN::kHeightDim));
  }

  std::string kernel_name = "copy_packed_dim_offset";
  kernel_name.reserve(kShaderNameReserve);
  add_dtype_suffix(kernel_name, *t_out);

  // A copy of range with the last element set to batch size of the input tensor
  ivec4 final_range = {
      range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
  ivec3 global_wg_size = t_out->logical_limits();

  if (!repeat) {
    const auto packed_dim = t_in->packed_dim();
    // The starting lane (0-3) within a texel where this tensor will start
    // copying from
    const auto src_lane_offset = src_offset[packed_dim] & 0x3;
    // The starting lane (0-3) within a texel where this tensor will start
    // copying to
    const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;

    // The total packed texels this tensor will be copied from.
    // The first texel of tensor data in the packed dimension will be copied
    // from the remaining lanes of the current source texel, hence
    // (4 - src_lane_offset) is added to the tensor size in the packed dimension.
    // NOTE(review): the source-side size is derived from t_out and the
    // destination-side size (below) from t_in; this pairing is preserved from
    // the original code. Confirm it is intended.
    const auto src_packed_size = utils::div_up_4(
        (4 - src_lane_offset) +
        dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));

    // The total packed texels this tensor will be copied to.
    // The first texel of tensor data in the packed dimension will be copied to
    // the remaining lanes from the previous write, hence (4 - dst_lane_offset)
    // is added to the tensor size in the packed dimension.
    const auto dst_packed_size = utils::div_up_4(
        (4 - dst_lane_offset) +
        dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));

    // If the starting src offset is not texel aligned, and the total packed
    // texels is greater than the texel range
    const bool has_additional_src_work =
        src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
    // If the starting dst offset is not texel aligned, and the total packed
    // texels is greater than the texel range
    const bool has_additional_dst_work =
        dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];

    if (has_additional_src_work || has_additional_dst_work) {
      // One more texel of work is needed along the packed dimension: grow both
      // the global work group size and the range handed to the shader.
      global_wg_size[packed_dim]++;
      final_range[packed_dim]++;
    }
  }

  // The kernel is looked up exactly once, directly at the point of use.
  graph.execute_nodes().emplace_back(new DispatchNode(
      graph,
      VK_KERNEL_FROM_STR(kernel_name),
      global_wg_size,
      graph.create_local_wg_size(global_wg_size),
      // Inputs and Outputs
      {
          {out, vkapi::MemoryAccessType::WRITE},
          {out, vkapi::MemoryAccessType::READ},
          {in, vkapi::MemoryAccessType::READ},
      },
      // Parameter buffers
      {},
      // Specialization Constants
      {graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat},
      nullptr,
      {},
      // Push constants: range, source offset, destination offset
      {
          PushConstantDataInfo(
              &final_range, sizeof(final_range), sizeof(ivec4)),
          PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)),
          PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)),
      }));
}

// Appends one `copy_channel_offset` dispatch per batch of `in` to the graph's
// execute nodes. Each dispatch covers the full width and height of `in` and the
// destination texels (along z) touched by channels
// [dst_channel_offset, dst_channel_offset + channel_range) of `out`.
//
// Preconditions (each enforced with VK_CHECK_COND):
// - `in` and `out` are both channels-packed and have at least 3 dimensions.
// - `channel_range`, `src_channel_offset` and `dst_channel_offset` are
//   non-negative.
// - src_channel_offset + channel_range fits in the channel size of `in`, and
//   dst_channel_offset + channel_range fits in the channel size of `out`.
//
// `out` is bound for both writing and reading, `in` for reading.
void add_copy_channel_offset_node(
    ComputeGraph& graph,
    const ValueRef in,
    int32_t channel_range,
    int32_t src_channel_offset,
    int32_t dst_channel_offset,
    const ValueRef out) {
  vTensorPtr t_in = graph.get_tensor(in);
  vTensorPtr t_out = graph.get_tensor(out);

  // Likely need to prepad these numbers.
  const std::vector<int64_t> in_sizes = t_in->sizes();
  const std::vector<int64_t> out_sizes = t_out->sizes();

  VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
  VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim));

  // NOTE: This function should be able to support 1d and 2d tensors when
  // range=1, src_offset=dst_offset=1.
  VK_CHECK_COND(t_in->dim() >= 3, "Src dim should be at least 3");
  VK_CHECK_COND(t_out->dim() >= 3, "Dst dim should be at least 3");

  // Reject negative arguments first so the bounds checks below only ever see
  // non-negative operands and report a meaningful error.
  VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative");
  VK_CHECK_COND(
      src_channel_offset >= 0, "Src channel offset must be non-negative");
  VK_CHECK_COND(
      dst_channel_offset >= 0, "Dst channel offset must be non-negative");

  // Sum in 64 bits so that offset + range cannot overflow int32_t.
  const int64_t src_channel_end =
      static_cast<int64_t>(src_channel_offset) + channel_range;
  const int64_t dst_channel_end =
      static_cast<int64_t>(dst_channel_offset) + channel_range;

  VK_CHECK_COND(
      dim_at<kChannel4D>(in_sizes) >= src_channel_end,
      "Src channel (",
      src_channel_offset,
      ") and range (",
      channel_range,
      ") should be less than or equal to input tensor's channel size (",
      dim_at<kChannel4D>(in_sizes),
      ")");

  VK_CHECK_COND(
      dim_at<kChannel4D>(out_sizes) >= dst_channel_end,
      "Dst channel (",
      dst_channel_offset,
      ") and range (",
      channel_range,
      ") should be less than or equal to output tensor's channel size (",
      dim_at<kChannel4D>(out_sizes),
      ")");

  std::string kernel_name = "copy_channel_offset";
  kernel_name.reserve(kShaderNameReserve);
  add_dtype_suffix(kernel_name, *t_out);

  const int32_t out_channels = dim_at<kChannel4D>(out_sizes);
  // Number of z texels one batch of `out` occupies (4 channels per texel).
  const auto out_texels_per_batch = utils::div_up_4(out_channels);
  const auto num_batches = dim_at<kBatch4D>(in_sizes);

  // Everything below up to the loop is identical for every batch, so it is
  // computed once.

  // Mapping the tensor NCHW coordinates into texture XYZ coordinates
  const int32_t dst_first_z = dst_channel_offset / 4;
  const int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4;

  // We copy the entire width and height dimension. For the channel dimension,
  // we use the z-dimension of the global_size to specify the texture range.
  // The shader combines the global invocation id and the dst_offset to get
  // the actual coordinate.
  const uvec3 global_size{
      utils::safe_downcast<uint32_t>(dim_at<kWidth4D>(in_sizes)),
      utils::safe_downcast<uint32_t>(dim_at<kHeight4D>(in_sizes)),
      utils::safe_downcast<uint32_t>(dst_last_z - dst_first_z + 1)};
  const uvec3 local_size = graph.create_local_wg_size(global_size);

  const utils::ivec4 range_params = {
      static_cast<int>(global_size[0]),
      static_cast<int>(global_size[1]),
      static_cast<int>(global_size[2]),
      channel_range};

  // Copy one batch at a time.
  for (int batch_idx = 0; batch_idx < num_batches; batch_idx++) {
    // Only the z offset changes per batch: it is shifted by the number of z
    // texels occupied by the preceding batches of the output tensor.
    const ivec4 offset_params = {
        0,
        0,
        dst_first_z + batch_idx * out_texels_per_batch,
        dst_channel_offset};

    graph.execute_nodes().emplace_back(new DispatchNode(
        graph,
        VK_KERNEL_FROM_STR(kernel_name),
        global_size,
        local_size,
        // Inputs and Outputs
        {
            {out, vkapi::MemoryAccessType::WRITE},
            {out, vkapi::MemoryAccessType::READ},
            {in, vkapi::MemoryAccessType::READ},
        },
        // Parameter buffers
        {},
        // Specialization Constants
        {graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
        nullptr,
        {},
        // Push constants: out sizes, in sizes, range, dst offset, src channel
        // offset
        {graph.sizes_pc_of(out),
         graph.sizes_pc_of(in),
         PushConstantDataInfo(&range_params, sizeof(range_params)),
         PushConstantDataInfo(&offset_params, sizeof(offset_params)),
         PushConstantDataInfo(
             &src_channel_offset, sizeof(src_channel_offset))}));
  }
}

// Convenience overload: reads the range and the source/destination offsets
// from int-list values held by the graph, then forwards to the ivec-based
// add_copy_offset_node with both position-calculation flags turned off.
void add_copy_offset_node(
    ComputeGraph& graph,
    ValueRef in,
    ValueRef range_ref,
    ValueRef src_offset_ref,
    ValueRef dst_offset_ref,
    ValueRef out) {
  const ivec3 copy_extent = utils::make_ivec3(*graph.get_int_list(range_ref));
  const ivec3 src_xyz = utils::make_ivec3(*graph.get_int_list(src_offset_ref));
  const ivec3 dst_xyz = utils::make_ivec3(*graph.get_int_list(dst_offset_ref));

  // The callee takes four-component offsets; the fourth component is zero.
  const ivec4 src_offset_vec = {src_xyz[0], src_xyz[1], src_xyz[2], 0};
  const ivec4 dst_offset_vec = {dst_xyz[0], dst_xyz[1], dst_xyz[2], 0};

  add_copy_offset_node(
      graph,
      in,
      copy_extent,
      src_offset_vec,
      dst_offset_vec,
      out,
      /*calc_out_pos_using_src_chnl=*/false,
      /*calc_in_pos_using_dst_chnl=*/false);
}

// Operator entry point for copy_offset.
// Expected argument order: in, range, src_offset, dst_offset, out.
void copy_offset(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  const ValueRef in = args[0];
  const ValueRef range_ref = args[1];
  const ValueRef src_offset_ref = args[2];
  const ValueRef dst_offset_ref = args[3];
  const ValueRef out = args[4];

  add_copy_offset_node(
      graph, in, range_ref, src_offset_ref, dst_offset_ref, out);
}

// Operator entry point for copy_channel_offset.
// Expected argument order: in, channel_range, src_channel_offset,
// dst_channel_offset, out.
//
// The three scalars are read from the graph as int64_t, but
// add_copy_channel_offset_node takes int32_t parameters. They are narrowed
// explicitly with utils::safe_downcast rather than through a silent implicit
// conversion. NOTE(review): safe_downcast is expected to fail on out-of-range
// values instead of truncating; confirm against its definition.
void copy_channel_offset(
    ComputeGraph& graph,
    const std::vector<ValueRef>& args) {
  ValueRef in = args[0];
  ValueRef channel_range_ref = args[1];
  ValueRef src_channel_offset_ref = args[2];
  ValueRef dst_channel_offset_ref = args[3];
  ValueRef out = args[4];

  const int32_t channel_range = utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(channel_range_ref));
  const int32_t src_channel_offset = utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(src_channel_offset_ref));
  const int32_t dst_channel_offset = utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(dst_channel_offset_ref));

  add_copy_channel_offset_node(
      graph, in, channel_range, src_channel_offset, dst_channel_offset, out);
}

// Associates each operator name with the entry point in this file that
// implements it: etvk.copy_offset -> copy_offset and
// etvk.copy_channel_offset -> copy_channel_offset.
REGISTER_OPERATORS {
  VK_REGISTER_OP(etvk.copy_offset, copy_offset);
  VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset);
}

} // namespace vkcompute