Skip to content

Commit a7e7305

Browse files
committed
opencl: GDN sv128 — stage k/q/g through __local (4× fewer global reads)
In the sv128 specialization, the 4 columns in each workgroup share the same head, so all four read identical k/q/g vectors. Previously each thread fetched 4 strided floats each of k and q from global memory; with 4 cols per WG that is 4× redundancy across the workgroup. The Adreno L1 absorbs some of this, but explicit __local staging removes the redundancy and frees L1 footprint for the state column reads, which are unique per column. Now the 128 threads cooperatively load the 128 floats of k and of q (1 element per thread, fully coalesced), plus exp(g) in the kda path; after one barrier, each lane's 4 reads hit __local instead of global. v[col] is per-column, so it stays a direct global read. Results are bit-exact in test-backend-ops -o GATED_DELTA_NET (8/8 OpenCL cases OK; the head_size=128 case hits the new path). Any perf change is within the noise of a single benchmarking session: Adreno X2 throttles quickly under sustained load, and absolute numbers across configs drift by ~30% on tg between back-to-back -r 2 runs. Will re-bench from cold and update [[gdn_opencl_wip]] when there's a clean number.
1 parent 0d4ac15 commit a7e7305

1 file changed

Lines changed: 19 additions & 3 deletions

File tree

ggml/src/ggml-opencl/kernels/gated_delta_net.cl

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,22 @@ kernel void kernel_gated_delta_net_f32_sv128(
206206
const float beta_val = ((global const float *)b_base)[hb];
207207
global const float * g_d = (global const float *)g_base + hb * (ulong)neg0;
208208

209+
// The 4 cols in this workgroup share the same head, so they all need the
210+
// same k[i] and q[i] values. Stage them through __local once (each thread
211+
// loads 1 element) so each lane's 4 reads hit __local instead of global —
212+
// 4× fewer global k/q reads per workgroup. Same trick for exp(g[i]) in the
213+
// kda path. v[col] is per-column so stays as a direct global read.
214+
__local float k_loc[GDN_SV];
215+
__local float q_loc[GDN_SV];
216+
__local float g_loc[GDN_SV]; // unused / dead in scalar-g path
217+
218+
k_loc[lid] = k_d[lid];
219+
q_loc[lid] = q_d[lid];
220+
if (kda) {
221+
g_loc[lid] = exp(g_d[lid]);
222+
}
223+
barrier(CLK_LOCAL_MEM_FENCE);
224+
209225
float s_shard[GDN_RPL];
210226
float k_reg [GDN_RPL];
211227
float q_reg [GDN_RPL];
@@ -215,14 +231,14 @@ kernel void kernel_gated_delta_net_f32_sv128(
215231
for (int r = 0; r < GDN_RPL; r++) {
216232
const int i = r * GDN_LPC + lane;
217233
s_shard[r] = s_in[i];
218-
k_reg[r] = k_d[i];
219-
q_reg[r] = q_d[i];
234+
k_reg[r] = k_loc[i];
235+
q_reg[r] = q_loc[i];
220236
}
221237

222238
if (kda) {
223239
#pragma unroll
224240
for (int r = 0; r < GDN_RPL; r++) {
225-
g_exp[r] = exp(g_d[r * GDN_LPC + lane]);
241+
g_exp[r] = g_loc[r * GDN_LPC + lane];
226242
}
227243
} else {
228244
const float gv = exp(g_d[0]);

0 commit comments

Comments
 (0)