opencl: GDN sv128 — stage k/q/g through __local (4× fewer global reads)

wanghqc · wanghqc · commit a7e73050cfe0 · 2026-05-30T19:51:56.000-07:00
In the sv128 specialization, the 4 columns in each workgroup share the
same head, so all four read identical k/q/g vectors. Each thread was
fetching 4 strided floats each of k and q from global memory; with 4
columns per workgroup, every element was fetched 4 times per workgroup.
The Adreno L1 absorbs some of this, but explicit __local staging removes
the redundancy and frees L1 footprint for the state column reads, which
are unique per column.

The 128 threads cooperatively load the 128 floats of k and of q (1
element of each per thread, fully coalesced), plus exp(g) in the kda
path. After a single barrier, each lane's 4 reads hit __local instead of
global. v[col] is per-column, so it stays a direct global read. Results
are bit-exact in test-backend-ops -o GATED_DELTA_NET (8/8 OpenCL cases
OK; the head_size=128 case hits the new path).

The perf change is within noise in a single benchmarking session —
Adreno X2 throttles quickly under sustained load, and cross-config
absolute numbers drift by ~30% on tg (token generation) across
back-to-back -r 2 runs. Will re-bench from cold and update
[[gdn_opencl_wip]] when there's a clean number.
diff --git a/ggml/src/ggml-opencl/kernels/gated_delta_net.cl b/ggml/src/ggml-opencl/kernels/gated_delta_net.cl
@@ -206,6 +206,22 @@ kernel void kernel_gated_delta_net_f32_sv128(
     const float beta_val = ((global const float *)b_base)[hb];
     global const float * g_d = (global const float *)g_base + hb * (ulong)neg0;
 
+    // The 4 cols in this workgroup share the same head, so they all need the
+    // same k[i] and q[i] values. Stage them through __local once (each thread
+    // loads 1 element) so each lane's 4 reads hit __local instead of global —
+    // 4× fewer global k/q reads per workgroup. Same trick for g[i] in the
+    // kda path. v[col] is per-column so stays as a direct global read.
+    __local float k_loc[GDN_SV];
+    __local float q_loc[GDN_SV];
+    __local float g_loc[GDN_SV];  // unused / dead in scalar-g path
+
+    k_loc[lid] = k_d[lid];
+    q_loc[lid] = q_d[lid];
+    if (kda) {
+        g_loc[lid] = exp(g_d[lid]);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
     float s_shard[GDN_RPL];
     float k_reg  [GDN_RPL];
     float q_reg  [GDN_RPL];
@@ -215,14 +231,14 @@ kernel void kernel_gated_delta_net_f32_sv128(
     for (int r = 0; r < GDN_RPL; r++) {
         const int i = r * GDN_LPC + lane;
         s_shard[r] = s_in[i];
-        k_reg[r]   = k_d[i];
-        q_reg[r]   = q_d[i];
+        k_reg[r]   = k_loc[i];
+        q_reg[r]   = q_loc[i];
     }
 
     if (kda) {
         #pragma unroll
         for (int r = 0; r < GDN_RPL; r++) {
-            g_exp[r] = exp(g_d[r * GDN_LPC + lane]);
+            g_exp[r] = g_loc[r * GDN_LPC + lane];
         }
     } else {
         const float gv = exp(g_d[0]);