Skip to content

Commit 3eca3e2

Browse files
author
ssjia
committed
Update base for Update on "[ET-VK][conv1d] Implement height-packed depthwise conv1d operator"
Implement a depthwise conv1d operator using height-packed layout where channels are the packed dimension (WHCN dim 1). Depthwise conv applies a separate filter to each channel independently (groups=C), so 4 channels can be processed in parallel using element-wise vec4 FMA over kernel positions.

Thread mapping: X=C/4, Y=L_out, Z=N. Each thread computes one output texel (4 channels at one spatial position). The inner loop iterates over kernel positions K with bounds-checked input access for padding. Weight [C,1,K] is prepacked as channels-packed so each vec4 load gives 4 channels' weights at one kernel position.

Supports both buffer and texture3d storage, fp32/fp16, optional bias, and arbitrary stride/padding/dilation. Registered as et_vk.conv1d_dw.default (standalone custom op).

Performance on Adreno 750 (S24):
- [1,128,4096] K=31 buffer f16: 231 GFLOP/s
- [1,128,4096] K=31 buffer f32: 155 GFLOP/s
- [1,512,2048] K=5 buffer f32: 66 GFLOP/s

Differential Revision: [D97344091](https://our.internmc.facebook.com/intern/diff/D97344091/) [ghstack-poisoned]
1 parent b8ba505 commit 3eca3e2

2 files changed

Lines changed: 41 additions & 9 deletions

File tree

backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,14 @@ $if HAS_BIAS:
5656
int weight_B;
5757
float alpha;
5858
float beta;
59+
float output_min;
60+
float output_max;
5961
};
6062
$else:
6163
layout(push_constant) uniform restrict Block {
6264
int weight_B;
65+
float output_min;
66+
float output_max;
6367
};
6468

6569
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
@@ -190,5 +194,13 @@ void main() {
190194
}
191195
#endif
192196

197+
// Apply activation clamp
198+
[[unroll]] for (int m = 0; m < TILE_M; ++m) {
199+
[[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
200+
out_tile.data[m][n4] =
201+
clamp(out_tile.data[m][n4], VEC4_T(output_min), VEC4_T(output_max));
202+
}
203+
}
204+
193205
store_output_tile_with_checks(out_tile, n4_start, m_start, b, N4, M);
194206
}

backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
1818

19+
#include <limits>
20+
1921
namespace vkcompute {
2022

2123
// Minimum number of thread groups to target for good GPU occupancy.
@@ -117,11 +119,15 @@ void resize_conv1d_pw_node(
117119

118120
// Push-constant payload for the no-bias path: add_conv1d_pw_node pushes this
// struct only when has_bias is false. The member order (weight_B, output_min,
// output_max) is the same as the shader's non-bias push_constant Block.
struct Conv1dPWIntParams final {
119121
int32_t weight_B;
122+
// Lower clamp bound the shader applies to each output texel before storing.
float output_min;
123+
// Upper clamp bound the shader applies to each output texel before storing.
float output_max;
120124
};
121125

122126
// Push-constant payload for the bias path: add_conv1d_pw_node pushes this
// struct (and nothing else) when has_bias is true.
//
// NOTE(review): the HAS_BIAS push_constant Block in conv1d_pw.glsl declares
// `int weight_B` ahead of alpha/beta/output_min/output_max, but this struct has
// no weight_B member, and Conv1dPWIntParams is no longer pushed in front of it.
// As written the shader would appear to read alpha's bits as weight_B and every
// later field shifted by 4 bytes -- confirm the layout; a leading
// `int32_t weight_B` here (plus the matching initializer) is the likely fix.
struct Conv1dPWBiasParams final {
123127
float alpha;
124128
float beta;
129+
// Lower clamp bound the shader applies to each output texel before storing.
float output_min;
130+
// Upper clamp bound the shader applies to each output texel before storing.
float output_max;
125131
};
126132

127133
vkapi::ShaderInfo pick_conv1d_pw_shader(
@@ -181,7 +187,9 @@ void add_conv1d_pw_node(
181187
const ValueRef in,
182188
const ValueRef weight_data,
183189
const ValueRef bias,
184-
const ValueRef out) {
190+
const ValueRef out,
191+
const float output_min = std::numeric_limits<float>::lowest(),
192+
const float output_max = std::numeric_limits<float>::max()) {
185193
VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kHeightDim);
186194
VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kHeightDim);
187195

@@ -199,20 +207,21 @@ void add_conv1d_pw_node(
199207
ValueRef C_out_ref = graph.add_scalar(C_out);
200208
ValueRef has_bias_ref = graph.add_scalar(has_bias);
201209

202-
Conv1dPWIntParams int_params{1};
203-
Conv1dPWBiasParams bias_params{1.0f, 1.0f};
210+
Conv1dPWIntParams int_params{1, output_min, output_max};
211+
Conv1dPWBiasParams bias_params{1.0f, 1.0f, output_min, output_max};
204212

205213
std::vector<ValueRef> read_inputs = {in, packed_weight};
206214
if (has_bias) {
207215
read_inputs.push_back(packed_bias);
208216
}
209217

210-
std::vector<PushConstantDataInfo> push_constants = {
211-
PushConstantDataInfo(&int_params, sizeof(Conv1dPWIntParams)),
212-
};
218+
std::vector<PushConstantDataInfo> push_constants;
213219
if (has_bias) {
214220
push_constants.push_back(
215221
PushConstantDataInfo(&bias_params, sizeof(Conv1dPWBiasParams)));
222+
} else {
223+
push_constants.push_back(
224+
PushConstantDataInfo(&int_params, sizeof(Conv1dPWIntParams)));
216225
}
217226

218227
vkapi::ParamsBindList shader_params = {
@@ -240,20 +249,31 @@ void add_conv1d_pw_node(
240249
resize_conv1d_pw_node));
241250
}
242251

252+
// Args: in, weight, bias, stride, padding, dilation, groups,
253+
// output_min, output_max, out
254+
// output_min and output_max may be kDummyValueRef (no clamp).
243255
// Entry point for the pointwise conv1d operator. Reads the tensors from args,
// rejects anything that is not kernel_size == 1 / groups == 1, resolves the
// optional clamp bounds, then delegates to add_conv1d_pw_node.
void conv1d_pw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
244-
// args: in, weight, bias, stride, padding, dilation, groups, out
245256
ValueRef in = args[0];
246257
ValueRef weight = args[1];
247258
ValueRef bias = args[2];
248-
ValueRef out = args[7];
259+
ValueRef out = args[9];
249260

250261
const std::vector<int64_t> weight_sizes = graph.sizes_of(weight);
251262
VK_CHECK_COND(
252263
weight_sizes.at(2) == 1, "conv1d_pw only supports kernel_size=1");
253264
VK_CHECK_COND(
254265
graph.get_int(args[6]) == 1, "conv1d_pw only supports groups=1");
255266

256-
add_conv1d_pw_node(graph, in, weight, bias, out);
267+
// Start from the lowest / largest finite float values; these are the bounds
// used when the caller does not supply output_min / output_max.
float output_min = std::numeric_limits<float>::lowest();
268+
float output_max = std::numeric_limits<float>::max();
269+
// args[7] is output_min and args[8] is output_max; each overrides its default
// only when the ValueRef is valid.
if (is_valid(args[7])) {
270+
output_min = graph.extract_scalar<float>(args[7]);
271+
}
272+
if (is_valid(args[8])) {
273+
output_max = graph.extract_scalar<float>(args[8]);
274+
}
275+
276+
add_conv1d_pw_node(graph, in, weight, bias, out, output_min, output_max);
257277
}
258278

259279
REGISTER_OPERATORS {

0 commit comments

Comments
 (0)