Skip to content

Commit 02faae5

Browse files
committed
[ET-VK] Address coopmat dispatch review feedback
Three correctness fixes flagged on PR #19009. 1. The linear_coopmat / matmul_coopmat dispatch gate previously only checked `M >= 64`. The gates in `Linear.cpp` and `Matmul.cpp` are now tightened to require `M % TILE_M == 0 && N % TILE_N == 0 && K % TILE_K == 0`, so misaligned shapes correctly fall back to the tiled shader. 2. The bias path in `linear_coopmat.glsl` previously read the just-written output buffer back, added bias, and wrote it again. Bias is now folded into the fp32 accumulator before `coopMatStore`, so the output binding becomes `w` (write-only) instead of `rw`. 3. In both `linear_coopmat.glsl` and `matmul_coopmat.glsl`, the fp16 shared-memory staging now uses `packFloat2x16` on the `f16vec2` halves directly, replacing `packHalf2x16(vec2(...))` and avoiding the fp16 -> fp32 -> fp16 round trip.
1 parent b26728a commit 02faae5

6 files changed

Lines changed: 281 additions & 140 deletions

File tree

backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.glsl

Lines changed: 41 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@
2323
* Output is always fp32 (fp32 accumulator -> fp32 store) when DTYPE=float,
2424
* or fp16 when DTYPE=half.
2525
*
26-
* Optional bias: when HAS_BIAS is defined, bias is added post-store via
27-
* read-modify-write on the output buffer (one pass over the tile).
26+
* Optional bias: when HAS_BIAS is defined, bias is staged once into shared
27+
* memory and broadcast into each accumulator tile (stride-0 coopMatLoad)
28+
* before the store, so t_output is write-only.
2829
*/
2930

3031
#version 450 core
@@ -51,10 +52,7 @@ layout(std430) buffer;
5152
#include "common.glslh"
5253

5354
// Bindings: output(0), mat1(1), weight_packed(2), [bias(3)]
54-
$if HAS_BIAS:
55-
${layout_declare_tensor(B, "rw", "t_output", DTYPE, "buffer", is_scalar_array=True)}
56-
$else:
57-
${layout_declare_tensor(B, "w", "t_output", DTYPE, "buffer", is_scalar_array=True)}
55+
${layout_declare_tensor(B, "w", "t_output", DTYPE, "buffer", is_scalar_array=True)}
5856
${layout_declare_tensor(B, "r", "t_mat1", DTYPE, "buffer", is_scalar_array=False)}
5957
${layout_declare_tensor(B, "r", "t_weight_packed", DTYPE, "buffer", is_scalar_array=False)}
6058
$if HAS_BIAS:
@@ -94,6 +92,12 @@ const uint B_STRIDE_VEC4 = (TILE_N + FP16_PER_VEC4) / FP16_PER_VEC4; // 9
9492
shared uvec4 Ash[TILE_M * A_STRIDE_VEC4]; // 5KB
9593
shared uvec4 Bsh[TILE_K * B_STRIDE_VEC4]; // 4.5KB
9694

95+
#ifdef HAS_BIAS
96+
// fp32 staging buffer so coopMatLoad can broadcast directly into the
97+
// fp32 accumulator coopmat without a type conversion at the load.
98+
shared float bias_sh[TILE_N]; // 256B
99+
#endif
100+
97101
// Accumulator tiles (fp32)
98102
coopmat<float, gl_ScopeSubgroup, lM, lN, gl_MatrixUseAccumulator> result[C_ROWS][C_COLS];
99103

@@ -146,8 +150,8 @@ void main() {
146150
f16vec4 v0 = t_mat1[row * K4 + k_hv4];
147151
f16vec4 v1 = t_mat1[row * K4 + k_hv4 + 1];
148152
Ash[a_row_offset * A_STRIDE_VEC4 + a_col] = uvec4(
149-
packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)),
150-
packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw)));
153+
packFloat2x16(v0.xy), packFloat2x16(v0.zw),
154+
packFloat2x16(v1.xy), packFloat2x16(v1.zw));
151155
#else
152156
uint k_vec4 = k_elem / 4;
153157
vec4 v0 = t_mat1[row * K4 + k_vec4];
@@ -173,8 +177,8 @@ void main() {
173177
f16vec4 v0 = t_weight_packed[(k4 * N4 + n4_0) * 4u + dk];
174178
f16vec4 v1 = t_weight_packed[(k4 * N4 + n4_0 + 1u) * 4u + dk];
175179
Bsh[b_row_offset * B_STRIDE_VEC4 + b_col] = uvec4(
176-
packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)),
177-
packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw)));
180+
packFloat2x16(v0.xy), packFloat2x16(v0.zw),
181+
packFloat2x16(v1.xy), packFloat2x16(v1.zw));
178182
#else
179183
vec4 v0 = t_weight_packed[(k4 * N4 + n4_0) * 4u + dk];
180184
vec4 v1 = t_weight_packed[(k4 * N4 + n4_0 + 1u) * 4u + dk];
@@ -218,11 +222,37 @@ void main() {
218222
barrier();
219223
}
220224

221-
// --- Store result ---
225+
#ifdef HAS_BIAS
226+
// Stage one TILE_N-wide row of bias into shared memory. The C++ dispatch
227+
// gate ensures N % TILE_N == 0, so no per-element bounds check is needed.
228+
{
229+
const uint tile_n_start = TILE_N * tileID.x;
230+
for (uint t = gl_LocalInvocationID.x; t < TILE_N; t += INVOCATIONS) {
231+
bias_sh[t] = float(t_bias[tile_n_start + t]);
232+
}
233+
}
234+
memoryBarrierShared();
235+
barrier();
236+
#endif
237+
238+
// --- Store result (with bias folded in pre-store, if present) ---
222239
[[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
223240
[[unroll]] for (uint j = 0; j < C_COLS; ++j) {
224241
uint gi = TILE_M * tileID.y + lM * (C_ROWS * warpInTile.y + i);
225242
uint gj = TILE_N * tileID.x + lN * (C_COLS * warpInTile.x + j);
243+
244+
#ifdef HAS_BIAS
245+
// Stride-0 row-major load broadcasts lN bias values across all
246+
// lM rows of the accumulator tile.
247+
uint local_n = lN * (C_COLS * warpInTile.x + j);
248+
coopmat<float, gl_ScopeSubgroup, lM, lN, gl_MatrixUseAccumulator> bias_tile;
249+
coopMatLoad(
250+
bias_tile, bias_sh,
251+
local_n, /*stride=*/0u,
252+
gl_CooperativeMatrixLayoutRowMajor);
253+
result[i][j] += bias_tile;
254+
#endif
255+
226256
#ifdef IS_FP16_INPUT
227257
coopmat<float16_t, gl_ScopeSubgroup, lM, lN, gl_MatrixUseAccumulator> out_tile =
228258
coopmat<float16_t, gl_ScopeSubgroup, lM, lN, gl_MatrixUseAccumulator>(result[i][j]);
@@ -238,24 +268,4 @@ void main() {
238268
#endif
239269
}
240270
}
241-
242-
#ifdef HAS_BIAS
243-
// Add bias via read-modify-write on the output buffer.
244-
// barrier() ensures all coopMatStore writes within this workgroup are visible.
245-
barrier();
246-
247-
const uint tile_m_start = TILE_M * tileID.y;
248-
const uint tile_n_start = TILE_N * tileID.x;
249-
// 64x64 tile = 4096 elements, 256 threads -> 16 elements per thread
250-
for (uint idx = gl_LocalInvocationID.x; idx < TILE_M * TILE_N; idx += INVOCATIONS) {
251-
uint local_m = idx / TILE_N;
252-
uint local_n = idx % TILE_N;
253-
uint gm = tile_m_start + local_m;
254-
uint gn = tile_n_start + local_n;
255-
if (gm < M && gn < N) {
256-
uint out_idx = gm * N + gn;
257-
t_output[out_idx] = t_output[out_idx] + t_bias[gn];
258-
}
259-
}
260-
#endif
261271
}

backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.glsl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,8 @@ void main() {
134134
f16vec4 v0 = t_mat1[row * K4 + k_hv4];
135135
f16vec4 v1 = t_mat1[row * K4 + k_hv4 + 1];
136136
Ash[a_row_offset * A_STRIDE_VEC4 + a_col] = uvec4(
137-
packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)),
138-
packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw)));
137+
packFloat2x16(v0.xy), packFloat2x16(v0.zw),
138+
packFloat2x16(v1.xy), packFloat2x16(v1.zw));
139139
#else
140140
// fp32 inputs: load two vec4 (8 fp32), convert to 8 fp16
141141
uint k_vec4 = k_elem / 4;
@@ -157,8 +157,8 @@ void main() {
157157
f16vec4 v0 = t_mat2[row * N4 + n_hv4];
158158
f16vec4 v1 = t_mat2[row * N4 + n_hv4 + 1];
159159
Bsh[b_row_offset * B_STRIDE_VEC4 + b_col] = uvec4(
160-
packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)),
161-
packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw)));
160+
packFloat2x16(v0.xy), packFloat2x16(v0.zw),
161+
packFloat2x16(v1.xy), packFloat2x16(v1.zw));
162162
#else
163163
uint n_vec4 = n_elem / 4;
164164
vec4 v0 = t_mat2[row * N4 + n_vec4];

backends/vulkan/runtime/graph/ops/impl/Linear.cpp

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ void add_linear_tiled_node(
243243

244244
static constexpr uint32_t kLinearCoopMatTileM = 64;
245245
static constexpr uint32_t kLinearCoopMatTileN = 64;
246+
static constexpr uint32_t kLinearCoopMatTileK = 32;
246247
static constexpr uint32_t kLinearCoopMatInvocations = 256; // 4 subgroups x 64
247248

248249
vkapi::ShaderInfo pick_linear_coopmat_shader(
@@ -251,8 +252,7 @@ vkapi::ShaderInfo pick_linear_coopmat_shader(
251252
const std::vector<ValueRef>& resize_args) {
252253
const ValueRef out = args.at(0).refs.at(0);
253254
bool has_bias = graph->get_bool(resize_args.at(1));
254-
std::string kernel_name =
255-
has_bias ? "linear_coopmat_bias" : "linear_coopmat";
255+
std::string kernel_name = has_bias ? "linear_coopmat_bias" : "linear_coopmat";
256256
kernel_name.reserve(kShaderNameReserve);
257257
add_dtype_suffix(kernel_name, graph->dtype_of(out));
258258
return VK_KERNEL_FROM_STR(kernel_name);
@@ -342,27 +342,38 @@ void linear_packed_weight(
342342
ValueRef out = args.at(3);
343343

344344
bool has_bias = graph.val_is_not_none(bias);
345-
// Coopmat shader assumes M is a multiple of TILE_M (64) because the store
346-
// does not bounds-check. Fall back to the tiled shader otherwise.
347-
// TODO: remove this guard once the coopmat shader gains partial-tile
348-
// bounds checking.
345+
// Coopmat shader has no partial-tile / K-tail handling: the store overruns
346+
// unless M and N are multiples of the output tile, and the K-loop reads past
347+
// the end unless K is a multiple of TILE_K. Fall back to the tiled shader
348+
// when alignment is not met.
349+
// TODO: remove this guard once the coopmat shader gains partial-tile +
350+
// K-tail bounds checking.
349351
auto input_sizes = graph.sizes_of(input);
350-
int64_t M = input_sizes.size() >= 2
351-
? input_sizes.at(input_sizes.size() - 2)
352-
: 1;
352+
auto out_sizes_vec = graph.sizes_of(out);
353+
int64_t M =
354+
input_sizes.size() >= 2 ? input_sizes.at(input_sizes.size() - 2) : 1;
355+
int64_t K = input_sizes.back();
356+
int64_t N = out_sizes_vec.back();
353357
bool use_coopmat =
354358
graph.context()->adapter_ptr()->supports_cooperative_matrix() &&
355359
graph.storage_type_of(out) == utils::kBuffer &&
356-
M >= 64;
360+
M % kLinearCoopMatTileM == 0 && N % kLinearCoopMatTileN == 0 &&
361+
K % kLinearCoopMatTileK == 0;
357362

358363
ValueRef packed_weight = prepack_fp_linear_weight(
359-
graph, weight_data, /*is_transposed=*/true, /*B=*/1,
364+
graph,
365+
weight_data,
366+
/*is_transposed=*/true,
367+
/*B=*/1,
360368
/*force_buffer=*/use_coopmat);
361369

362370
ValueRef packed_bias = kDummyValueRef;
363371
if (has_bias) {
364372
packed_bias = prepack_standard(
365-
graph, bias, graph.storage_type_of(out), utils::kWidthPacked,
373+
graph,
374+
bias,
375+
graph.storage_type_of(out),
376+
utils::kWidthPacked,
366377
/*passthrough=*/use_coopmat);
367378
}
368379

backends/vulkan/runtime/graph/ops/impl/Matmul.cpp

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ void resize_matmul_tiled_node(
2929

3030
static constexpr uint32_t kCoopMatTileM = 64;
3131
static constexpr uint32_t kCoopMatTileN = 64;
32+
static constexpr uint32_t kCoopMatTileK = 32;
3233
static constexpr uint32_t kCoopMatInvocations = 256; // 4 subgroups × 64
3334

3435
vkapi::ShaderInfo pick_matmul_coopmat_shader(
@@ -275,18 +276,35 @@ void matmul_tiled(ComputeGraph& graph, const std::vector<ValueRef>& args) {
275276
ValueRef mat2 = args[1];
276277
ValueRef out = args[2];
277278

279+
// Coopmat path requires M%TILE_M==0, N%TILE_N==0, K%TILE_K==0 — the shader
280+
// has no partial-tile or K-tail handling.
281+
auto mat1_sizes = graph.sizes_of(mat1);
282+
int64_t M = mat1_sizes.at(mat1_sizes.size() - 2);
283+
int64_t K = mat1_sizes.back();
284+
int64_t N = graph.sizes_of(out).back();
285+
const bool coopmat_aligned = M % kCoopMatTileM == 0 &&
286+
N % kCoopMatTileN == 0 && K % kCoopMatTileK == 0;
287+
278288
if (graph.val_is_tref(mat2)) {
279289
auto mat2_sizes = graph.sizes_of(mat2);
280290
int64_t B = mat2_sizes.size() >= 3 ? mat2_sizes.at(0) : 1;
281291
bool use_coopmat =
282292
graph.context()->adapter_ptr()->supports_cooperative_matrix() &&
283-
graph.storage_type_of(out) == utils::kBuffer;
293+
graph.storage_type_of(out) == utils::kBuffer && coopmat_aligned;
284294
ValueRef packed = prepack_fp_linear_weight(
285-
graph, mat2, /*is_transposed=*/false, B,
295+
graph,
296+
mat2,
297+
/*is_transposed=*/false,
298+
B,
286299
/*force_buffer=*/use_coopmat);
287300
if (use_coopmat) {
288301
add_linear_coopmat_node(
289-
graph, mat1, packed, kDummyValueRef, false, out,
302+
graph,
303+
mat1,
304+
packed,
305+
kDummyValueRef,
306+
false,
307+
out,
290308
utils::safe_downcast<int32_t>(B));
291309
} else {
292310
add_linear_tiled_node(
@@ -300,7 +318,7 @@ void matmul_tiled(ComputeGraph& graph, const std::vector<ValueRef>& args) {
300318
}
301319
} else if (
302320
graph.context()->adapter_ptr()->supports_cooperative_matrix() &&
303-
graph.storage_type_of(out) == utils::kBuffer) {
321+
graph.storage_type_of(out) == utils::kBuffer && coopmat_aligned) {
304322
add_matmul_coopmat_node(graph, mat1, mat2, out);
305323
} else {
306324
add_matmul_tiled_node(graph, mat1, mat2, out);

0 commit comments

Comments
 (0)