Update on "[ET-VK][conv1d] Implement height-packed depthwise conv1d operator"

ssjia · ssjia · commit af5aaa517b17 · 2026-03-19T15:48:44.000-07:00
Implement a depthwise conv1d operator using a height-packed layout, where channels are the packed dimension (WHCN dim 1). Depthwise conv applies a separate filter to each channel independently (groups=C), so 4 channels can be processed in parallel using element-wise vec4 FMA over kernel positions.

Thread mapping: X=C/4, Y=L_out, Z=N. Each thread computes one output texel (4 channels at one spatial position). The inner loop iterates over the K kernel positions, with bounds-checked input access to handle padding. The weight [C,1,K] is prepacked as channels-packed so that each vec4 load gives 4 channels' weights at one kernel position.

Supports both buffer and texture3d storage, fp32/fp16, optional bias, and arbitrary stride/padding/dilation. Registered as et_vk.conv1d_dw.default (standalone custom op).

Performance on Adreno 750 (S24):
- [1,128,4096] K=31 buffer f16: 231 GFLOP/s
- [1,128,4096] K=31 buffer f32: 155 GFLOP/s
- [1,512,2048] K=5 buffer f32: 66 GFLOP/s

Differential Revision: [D97344091](https://our.internmc.facebook.com/intern/diff/D97344091/)

[ghstack-poisoned]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl
@@ -39,6 +39,8 @@ layout(push_constant) uniform restrict Block {
   int stride;
   int padding;
   int dilation;
+  float output_min;
+  float output_max;
 };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
@@ -86,6 +88,8 @@ void main() {
 #endif
 #endif
 
+  sum = clamp(sum, VEC4_T(output_min), VEC4_T(output_max));
+
 #ifdef BUFFER
   t_out[(n * L_out + l_out) * C4 + c4] = sum;
 #else
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl
@@ -56,10 +56,14 @@ $if HAS_BIAS:
     int weight_B;
     float alpha;
     float beta;
+    float output_min;
+    float output_max;
   };
 $else:
   layout(push_constant) uniform restrict Block {
     int weight_B;
+    float output_min;
+    float output_max;
   };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
@@ -190,5 +194,13 @@ void main() {
   }
 #endif
 
+  // Apply activation clamp
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
+      out_tile.data[m][n4] =
+          clamp(out_tile.data[m][n4], VEC4_T(output_min), VEC4_T(output_max));
+    }
+  }
+
   store_output_tile_with_checks(out_tile, n4_start, m_start, b, N4, M);
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp
@@ -16,6 +16,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
+#include <limits>
+
 namespace vkcompute {
 
 void resize_conv1d_dw_node(
@@ -48,6 +50,11 @@ struct Conv1dDWParams final {
   int32_t dilation;
 };
 
+struct Conv1dDWClampParams final {
+  float output_min;
+  float output_max;
+};
+
 utils::uvec3 pick_conv1d_dw_global_wg_size(
     ComputeGraph* graph,
     const vkapi::ShaderInfo& shader,
@@ -74,7 +81,9 @@ void add_conv1d_dw_node(
     const ValueRef stride_ref,
     const ValueRef padding_ref,
     const ValueRef dilation_ref,
-    const ValueRef out) {
+    const ValueRef out,
+    const float output_min = std::numeric_limits<float>::lowest(),
+    const float output_max = std::numeric_limits<float>::max()) {
   VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kHeightDim);
   VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kHeightDim);
 
@@ -103,6 +112,11 @@ void add_conv1d_dw_node(
       utils::safe_downcast<int32_t>(dilation_val),
   };
 
+  Conv1dDWClampParams clamp_params{
+      output_min,
+      output_max,
+  };
+
   std::string kernel_name = has_bias ? "conv1d_dw_bias" : "conv1d_dw";
   kernel_name.reserve(kShaderNameReserve);
   add_storage_type_suffix(kernel_name, storage_type);
@@ -123,7 +137,8 @@ void add_conv1d_dw_node(
       // Shader params buffers
       {graph.sizes_ubo(in), graph.sizes_ubo(out)},
       // Push Constants
-      {PushConstantDataInfo(&params, sizeof(Conv1dDWParams))},
+      {PushConstantDataInfo(&params, sizeof(Conv1dDWParams)),
+       PushConstantDataInfo(&clamp_params, sizeof(Conv1dDWClampParams))},
       // Specialization Constants
       {},
       // Resize Args
@@ -132,17 +147,38 @@ void add_conv1d_dw_node(
       resize_conv1d_dw_node));
 }
 
+// Args: in, weight, bias, stride, padding, dilation, groups,
+//       output_min, output_max, out
+// output_min and output_max may be kDummyValueRef (no clamp).
 void conv1d_dw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  // args: in, weight, bias, stride, padding, dilation, groups, out
   ValueRef in = args[0];
   ValueRef weight = args[1];
   ValueRef bias = args[2];
   ValueRef stride = args[3];
   ValueRef padding = args[4];
   ValueRef dilation = args[5];
-  ValueRef out = args[7];
+  ValueRef out = args[9];
 
-  add_conv1d_dw_node(graph, in, weight, bias, stride, padding, dilation, out);
+  float output_min = std::numeric_limits<float>::lowest();
+  float output_max = std::numeric_limits<float>::max();
+  if (is_valid(args[7])) {
+    output_min = graph.extract_scalar<float>(args[7]);
+  }
+  if (is_valid(args[8])) {
+    output_max = graph.extract_scalar<float>(args[8]);
+  }
+
+  add_conv1d_dw_node(
+      graph,
+      in,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      out,
+      output_min,
+      output_max);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp
@@ -16,6 +16,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
+#include <limits>
+
 namespace vkcompute {
 
 // Minimum number of thread groups to target for good GPU occupancy.
@@ -117,11 +119,15 @@ void resize_conv1d_pw_node(
 
 struct Conv1dPWIntParams final {
   int32_t weight_B;
+  float output_min;
+  float output_max;
 };
 
 struct Conv1dPWBiasParams final {
   float alpha;
   float beta;
+  float output_min;
+  float output_max;
 };
 
 vkapi::ShaderInfo pick_conv1d_pw_shader(
@@ -181,7 +187,9 @@ void add_conv1d_pw_node(
     const ValueRef in,
     const ValueRef weight_data,
     const ValueRef bias,
-    const ValueRef out) {
+    const ValueRef out,
+    const float output_min = std::numeric_limits<float>::lowest(),
+    const float output_max = std::numeric_limits<float>::max()) {
   VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kHeightDim);
   VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kHeightDim);
 
@@ -199,20 +207,21 @@ void add_conv1d_pw_node(
   ValueRef C_out_ref = graph.add_scalar(C_out);
   ValueRef has_bias_ref = graph.add_scalar(has_bias);
 
-  Conv1dPWIntParams int_params{1};
-  Conv1dPWBiasParams bias_params{1.0f, 1.0f};
+  Conv1dPWIntParams int_params{1, output_min, output_max};
+  Conv1dPWBiasParams bias_params{1.0f, 1.0f, output_min, output_max};
 
   std::vector<ValueRef> read_inputs = {in, packed_weight};
   if (has_bias) {
     read_inputs.push_back(packed_bias);
   }
 
-  std::vector<PushConstantDataInfo> push_constants = {
-      PushConstantDataInfo(&int_params, sizeof(Conv1dPWIntParams)),
-  };
+  std::vector<PushConstantDataInfo> push_constants;
   if (has_bias) {
     push_constants.push_back(
         PushConstantDataInfo(&bias_params, sizeof(Conv1dPWBiasParams)));
+  } else {
+    push_constants.push_back(
+        PushConstantDataInfo(&int_params, sizeof(Conv1dPWIntParams)));
   }
 
   vkapi::ParamsBindList shader_params = {
@@ -240,20 +249,31 @@ void add_conv1d_pw_node(
       resize_conv1d_pw_node));
 }
 
+// Args: in, weight, bias, stride, padding, dilation, groups,
+//       output_min, output_max, out
+// output_min and output_max may be kDummyValueRef (no clamp).
 void conv1d_pw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  // args: in, weight, bias, stride, padding, dilation, groups, out
   ValueRef in = args[0];
   ValueRef weight = args[1];
   ValueRef bias = args[2];
-  ValueRef out = args[7];
+  ValueRef out = args[9];
 
   const std::vector<int64_t> weight_sizes = graph.sizes_of(weight);
   VK_CHECK_COND(
       weight_sizes.at(2) == 1, "conv1d_pw only supports kernel_size=1");
   VK_CHECK_COND(
       graph.get_int(args[6]) == 1, "conv1d_pw only supports groups=1");
 
-  add_conv1d_pw_node(graph, in, weight, bias, out);
+  float output_min = std::numeric_limits<float>::lowest();
+  float output_max = std::numeric_limits<float>::max();
+  if (is_valid(args[7])) {
+    output_min = graph.extract_scalar<float>(args[7]);
+  }
+  if (is_valid(args[8])) {
+    output_max = graph.extract_scalar<float>(args[8]);
+  }
+
+  add_conv1d_pw_node(graph, in, weight, bias, out, output_min, output_max);
 }
 
 REGISTER_OPERATORS {