opencl: GDN n_tokens>1 (chunked / prefill) — fused GDN op covers prefill too

wanghqc · wanghqc · commit db6e75449240 · 2026-05-30T19:51:56.000-07:00
Extends both kernel variants (generic + sv128) with a t-loop so a single
GATED_DELTA_NET dispatch can handle the full ubatch instead of stopping
at n_tokens==1. supports_op now returns true for any n_tokens; the
graph builder picks build_delta_net_fused over build_delta_net_chunking
once cparams.fused_gdn_ch is enabled, so the chunked-primitive "soup"
(~260 tiny mul/add/concat/solve_tri/repeat dispatches per layer per
token) collapses to one fused kernel.

Kernel changes:
- sv128: t-loop iterates over n_tokens with state kept in private
  registers (s_shard[GDN_RPL]) across iterations. attn_data advances
  by S_v*H per token to match the [S_v,H,n_tokens,n_seqs] output layout.
- Generic: same t-loop pattern; copies state into out buffer once then
  updates in place across tokens.
- Removed the __local k/q/g cache from the sv128 hot loop. For n_tokens=1
  it bought 4× fewer global reads at the cost of one barrier; for
  n_tokens>1 the two per-iter barriers compound and dominate (~75
  cycles each × 2 × n_tokens), making decode slightly worse and
  prefill slower than the chunked-primitive baseline. k/q/g are now
  read directly from global memory on every iter — the 4 cols sharing
  a head only touch ~4 L1 cache lines per (r, token), which the Adreno
  L1 absorbs.

Dispatch changes:
- Pass nbq2/nbk2/nbv2/nbb2/nbg2 + n_tokens.
- Adds GGML_OPENCL_DISABLE_GDN_CH=1 env override so A/B benches against
  the chunked-primitive path don't need a rebuild.

Correctness: test-backend-ops -o GATED_DELTA_NET reports 28/28 OpenCL
cases OK. Multi-token cases with head_size in {16,32,64} hit the
generic fallback; head_size=128 hits the sv128 path; both n_tokens=1
and n_tokens up to 256 are covered.

Perf on Adreno X2-90 / Qwen3.6-35B-A3B-MXFP4, ngl=99, fa=0, -r 2:
  tg128 @ d=16384:  11.85 ± 0.17  (was 11.82 ± 0.05 with sv128+cache)
  pp4096:           ~190           (chunked-primitive baseline ~187 at
                                    the same thermal point; +5% peak,
                                    wash in steady state)

Qwen3.6-35B-A3B is GEMM-bound at prefill (MoE 38% + dense 24%); the
GDN soup is ~25% of the rest, so this op-fuse is roughly neutral for
this model. For Qwen3-Next / kimi-linear style models where MoE isn't
the dominant cost, the fused path should win more clearly, so the
fused chunked path (cparams.fused_gdn_ch) stays enabled by default and
those models get it for free.
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -5918,17 +5918,22 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_SSM_CONV:
             return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
         case GGML_OP_GATED_DELTA_NET: {
-            // f32 only; autoregressive (n_tokens == 1) only — prefill keeps the
-            // chunked path. (cparams.fused_gdn_ch then auto-disables on the
-            // chunked-graph reservation; fused_gdn_ar stays enabled.)
-            // GGML_OPENCL_DISABLE_GDN=1 forces CPU fallback for A/B benching.
-            static const bool gdn_disabled = getenv("GGML_OPENCL_DISABLE_GDN") != nullptr;
+            // f32 only. Both autoregressive (n_tokens==1) and chunked
+            // (n_tokens>1) — the sv128 kernel handles both via an internal
+            // t-loop. Other s_v sizes use the (slow) generic fallback that
+            // also handles both, so test-backend-ops correctness still holds.
+            // GGML_OPENCL_DISABLE_GDN=1 forces CPU fallback for A/B benching;
+            // GGML_OPENCL_DISABLE_GDN_CH=1 disables only the chunked path
+            // (keeps autoregressive on the GPU).
+            static const bool gdn_disabled    = getenv("GGML_OPENCL_DISABLE_GDN")    != nullptr;
+            static const bool gdn_ch_disabled = getenv("GGML_OPENCL_DISABLE_GDN_CH") != nullptr;
             if (gdn_disabled) return false;
             const ggml_tensor * v = op->src[2];
             for (int i = 0; i < 6; ++i) {
                 if (op->src[i]->type != GGML_TYPE_F32) return false;
             }
-            return op->type == GGML_TYPE_F32 && v->ne[2] == 1 && v->ne[0] >= 1;
+            if (gdn_ch_disabled && v->ne[2] > 1) return false;
+            return op->type == GGML_TYPE_F32 && v->ne[0] >= 1;
         }
         case GGML_OP_CONCAT:
             return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
@@ -10488,7 +10493,6 @@ static void ggml_cl_gated_delta_net(ggml_backend_t backend, const ggml_tensor *
 
     GGML_ASSERT(q && k && v && g && beta && state && dst);
     GGML_ASSERT(q->extra && k->extra && v->extra && g->extra && beta->extra && state->extra && dst->extra);
-    GGML_ASSERT(v->ne[2] == 1); // autoregressive only (see ggml_backend_opencl_device_supports_op)
 
     ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
@@ -10508,19 +10512,22 @@ static void ggml_cl_gated_delta_net(ggml_backend_t backend, const ggml_tensor *
     cl_ulong s_off = es->offset + state->view_offs;
     cl_ulong d_off = ed->offset + dst->view_offs;
 
-    const int s_v    = (int)v->ne[0];
-    const int H      = (int)v->ne[1];
-    const int n_seqs = (int)v->ne[3];
-    const int neq1   = (int)q->ne[1];
-    const int nek1   = (int)k->ne[1];
-    const int neq3   = (int)q->ne[3];
-    const int nek3   = (int)k->ne[3];
-    const int neg0   = (int)g->ne[0];
-    const int kda    = (neg0 == s_v) ? 1 : 0;
-
-    cl_ulong nbq1 = q->nb[1], nbq3 = q->nb[3];
-    cl_ulong nbk1 = k->nb[1], nbk3 = k->nb[3];
-    cl_ulong nbv1 = v->nb[1], nbv3 = v->nb[3];
+    const int s_v      = (int)v->ne[0];
+    const int H        = (int)v->ne[1];
+    const int n_tokens = (int)v->ne[2];
+    const int n_seqs   = (int)v->ne[3];
+    const int neq1     = (int)q->ne[1];
+    const int nek1     = (int)k->ne[1];
+    const int neq3     = (int)q->ne[3];
+    const int nek3     = (int)k->ne[3];
+    const int neg0     = (int)g->ne[0];
+    const int kda      = (neg0 == s_v) ? 1 : 0;
+
+    cl_ulong nbq1 = q->nb[1],    nbq2 = q->nb[2],    nbq3 = q->nb[3];
+    cl_ulong nbk1 = k->nb[1],    nbk2 = k->nb[2],    nbk3 = k->nb[3];
+    cl_ulong nbv1 = v->nb[1],    nbv2 = v->nb[2],    nbv3 = v->nb[3];
+    cl_ulong nbb1 = beta->nb[1], nbb2 = beta->nb[2], nbb3 = beta->nb[3];
+    cl_ulong nbg1 = g->nb[1],    nbg2 = g->nb[2],    nbg3 = g->nb[3];
 
     const bool use_sv128 = (s_v == 128) && (backend_ctx->kernel_gated_delta_net_f32_sv128 != nullptr);
 
@@ -10544,11 +10551,20 @@ static void ggml_cl_gated_delta_net(ggml_backend_t backend, const ggml_tensor *
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_mem),   &ed->data_device));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &d_off));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbq1));
+    CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbq2));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbq3));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbk1));
+    CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbk2));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbk3));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbv1));
+    CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbv2));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbv3));
+    CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbb1));
+    CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbb2));
+    CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbb3));
+    CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbg1));
+    CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbg2));
+    CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbg3));
     if (!use_sv128) {
         // generic kernel takes s_v as the next int arg
         CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),  &s_v));
@@ -10558,6 +10574,7 @@ static void ggml_cl_gated_delta_net(ggml_backend_t backend, const ggml_tensor *
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &neq3));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &nek3));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &H));
+    CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &n_tokens));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &n_seqs));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &kda));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &neg0));
diff --git a/ggml/src/ggml-opencl/kernels/gated_delta_net.cl b/ggml/src/ggml-opencl/kernels/gated_delta_net.cl
@@ -43,13 +43,16 @@ kernel void kernel_gated_delta_net_f32(
     global char * b_base,    ulong b_off,
     global char * s_base,    ulong s_off,
     global char * dst_base,  ulong dst_off,
-    ulong nbq1, ulong nbq3,
-    ulong nbk1, ulong nbk3,
-    ulong nbv1, ulong nbv3,
+    ulong nbq1, ulong nbq2, ulong nbq3,
+    ulong nbk1, ulong nbk2, ulong nbk3,
+    ulong nbv1, ulong nbv2, ulong nbv3,
+    ulong nbb1, ulong nbb2, ulong nbb3,
+    ulong nbg1, ulong nbg2, ulong nbg3,
     int s_v,
     int neq1, int nek1,
     int neq3, int nek3,
     int H,
+    int n_tokens,
     int n_seqs,
     int kda,
     int neg0
@@ -78,40 +81,58 @@ kernel void kernel_gated_delta_net_f32(
     s_base   += s_off;
     dst_base += dst_off;
 
-    const ulong attn_elems = (ulong)s_v * H * n_seqs;
-    global float * attn_out  = (global float *)dst_base;
-    global float * state_out = (global float *)dst_base + attn_elems;
+    const ulong attn_elems = (ulong)s_v * H * (ulong)n_tokens * n_seqs;
+    global float * attn_out_base  = (global float *)dst_base;
+    global float * state_out_base = (global float *)dst_base + attn_elems;
 
     global const float * s_in  = (global const float *)s_base + ((ulong)iv3 * H + iv1) * s_v * s_v + (ulong)j * s_v;
-    global       float * s_out = state_out                    + ((ulong)iv3 * H + iv1) * s_v * s_v + (ulong)j * s_v;
-
-    global const float * q_d = (global const float *)(q_base + (ulong)iq3*nbq3 + (ulong)iq1*nbq1);
-    global const float * k_d = (global const float *)(k_base + (ulong)ik3*nbk3 + (ulong)ik1*nbk1);
-    global const float * v_d = (global const float *)(v_base + (ulong)iv3*nbv3 + (ulong)iv1*nbv1);
-    const ulong hb = ((ulong)iv3*H + iv1);
-    const float beta = ((global const float *)b_base)[hb];
-    global const float * g_d = (global const float *)g_base + hb * (ulong)neg0;
-
-    if (kda) {
-        for (int i = 0; i < s_v; ++i) s_out[i] = s_in[i] * exp(g_d[i]);
-    } else {
-        const float gd = exp(g_d[0]);
-        for (int i = 0; i < s_v; ++i) s_out[i] = s_in[i] * gd;
-    }
+    global       float * s_out = state_out_base               + ((ulong)iv3 * H + iv1) * s_v * s_v + (ulong)j * s_v;
+
+    // For n_tokens == 1, the state column is copied/updated in-place in global
+    // (preserves the original kernel's behavior). For n_tokens > 1, we keep
+    // s_out in global throughout but the columns are touched once per token.
+    // The naive kernel is slow for prefill; the sv128 specialization is the
+    // fast path for the only s_v we ship today (Qwen3-Next family).
+    // Initialize new state by copying input state into output state buffer.
+    for (int i = 0; i < s_v; ++i) s_out[i] = s_in[i];
+
+    global char * q_hd = q_base + (ulong)iq3*nbq3 + (ulong)iq1*nbq1;
+    global char * k_hd = k_base + (ulong)ik3*nbk3 + (ulong)ik1*nbk1;
+    global char * v_hd = v_base + (ulong)iv3*nbv3 + (ulong)iv1*nbv1;
+    global char * b_hd = b_base + (ulong)iv3 * nbb3 + (ulong)iv1 * nbb1;
+    global char * g_hd = g_base + (ulong)iv3 * nbg3 + (ulong)iv1 * nbg1;
+
+    global float * attn_data = attn_out_base + ((ulong)iv3 * (ulong)n_tokens * H + iv1) * s_v;
+
+    for (int t = 0; t < n_tokens; t++) {
+        global const float * q_d = (global const float *)(q_hd + (ulong)t * nbq2);
+        global const float * k_d = (global const float *)(k_hd + (ulong)t * nbk2);
+        global const float * v_d = (global const float *)(v_hd + (ulong)t * nbv2);
+        const float beta         = *(global const float *)(b_hd + (ulong)t * nbb2);
+        global const float * g_d = (global const float *)(g_hd + (ulong)t * nbg2);
+
+        if (kda) {
+            for (int i = 0; i < s_v; ++i) s_out[i] *= exp(g_d[i]);
+        } else {
+            const float gd = exp(g_d[0]);
+            for (int i = 0; i < s_v; ++i) s_out[i] *= gd;
+        }
 
-    float kv = 0.0f;
-    for (int i = 0; i < s_v; ++i) kv = mad(s_out[i], k_d[i], kv);
+        float kv = 0.0f;
+        for (int i = 0; i < s_v; ++i) kv = mad(s_out[i], k_d[i], kv);
 
-    const float delta = (v_d[j] - kv) * beta;
+        const float delta = (v_d[j] - kv) * beta;
 
-    float o = 0.0f;
-    for (int i = 0; i < s_v; ++i) {
-        const float sij = mad(k_d[i], delta, s_out[i]);
-        s_out[i] = sij;
-        o = mad(sij, q_d[i], o);
-    }
+        float o = 0.0f;
+        for (int i = 0; i < s_v; ++i) {
+            const float sij = mad(k_d[i], delta, s_out[i]);
+            s_out[i] = sij;
+            o = mad(sij, q_d[i], o);
+        }
 
-    attn_out[((ulong)iv3*H + iv1) * s_v + j] = o * scale;
+        attn_data[j] = o * scale;
+        attn_data += (ulong)s_v * H;
+    }
 }
 
 // ============================================================================
@@ -156,12 +177,15 @@ kernel void kernel_gated_delta_net_f32_sv128(
     global char * b_base,    ulong b_off,
     global char * s_base,    ulong s_off,
     global char * dst_base,  ulong dst_off,
-    ulong nbq1, ulong nbq3,
-    ulong nbk1, ulong nbk3,
-    ulong nbv1, ulong nbv3,
+    ulong nbq1, ulong nbq2, ulong nbq3,
+    ulong nbk1, ulong nbk2, ulong nbk3,
+    ulong nbv1, ulong nbv2, ulong nbv3,
+    ulong nbb1, ulong nbb2, ulong nbb3,
+    ulong nbg1, ulong nbg2, ulong nbg3,
     int neq1, int nek1,
     int neq3, int nek3,
     int H,
+    int n_tokens,
     int n_seqs,
     int kda,
     int neg0
@@ -192,82 +216,92 @@ kernel void kernel_gated_delta_net_f32_sv128(
     s_base   += s_off;
     dst_base += dst_off;
 
-    const ulong attn_elems = (ulong)GDN_SV * H * n_seqs;
-    global float * attn_out  = (global float *)dst_base;
-    global float * state_out = (global float *)dst_base + attn_elems;
+    // Output layout: [ attn (S_v * H * n_tokens * n_seqs) | new_state (S_v * S_v * H * n_seqs) ]
+    const ulong attn_elems = (ulong)GDN_SV * H * (ulong)n_tokens * n_seqs;
+    global float * attn_out_base  = (global float *)dst_base;
+    global float * state_out_base = (global float *)dst_base + attn_elems;
 
     global const float * s_in  = (global const float *)s_base + ((ulong)iv3 * H + iv1) * GDN_SV * GDN_SV + (ulong)col * GDN_SV;
-    global       float * s_out = state_out                    + ((ulong)iv3 * H + iv1) * GDN_SV * GDN_SV + (ulong)col * GDN_SV;
-
-    global const float * q_d = (global const float *)(q_base + (ulong)iq3*nbq3 + (ulong)iq1*nbq1);
-    global const float * k_d = (global const float *)(k_base + (ulong)ik3*nbk3 + (ulong)ik1*nbk1);
-    global const float * v_d = (global const float *)(v_base + (ulong)iv3*nbv3 + (ulong)iv1*nbv1);
-    const ulong hb = (ulong)iv3 * H + iv1;
-    const float beta_val = ((global const float *)b_base)[hb];
-    global const float * g_d = (global const float *)g_base + hb * (ulong)neg0;
-
-    // The 4 cols in this workgroup share the same head, so they all need the
-    // same k[i] and q[i] values. Stage them through __local once (each thread
-    // loads 1 element) so each lane's 4 reads hit __local instead of global —
-    // 4× fewer global k/q reads per workgroup. Same trick for g[i] in the
-    // kda path. v[col] is per-column so stays as a direct global read.
-    __local float k_loc[GDN_SV];
-    __local float q_loc[GDN_SV];
-    __local float g_loc[GDN_SV];  // unused / dead in scalar-g path
-
-    k_loc[lid] = k_d[lid];
-    q_loc[lid] = q_d[lid];
-    if (kda) {
-        g_loc[lid] = exp(g_d[lid]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
+    global       float * s_out = state_out_base               + ((ulong)iv3 * H + iv1) * GDN_SV * GDN_SV + (ulong)col * GDN_SV;
 
-    float s_shard[GDN_RPL];
-    float k_reg  [GDN_RPL];
-    float q_reg  [GDN_RPL];
-    float g_exp  [GDN_RPL];
+    // Per-head per-seq base pointers; per-token offsets applied inside the t-loop.
+    global char * q_hd = q_base + (ulong)iq3*nbq3 + (ulong)iq1*nbq1;
+    global char * k_hd = k_base + (ulong)ik3*nbk3 + (ulong)ik1*nbk1;
+    global char * v_hd = v_base + (ulong)iv3*nbv3 + (ulong)iv1*nbv1;
+    global char * b_hd = b_base + (ulong)iv3*nbb3 + (ulong)iv1*nbb1;
+    global char * g_hd = g_base + (ulong)iv3*nbg3 + (ulong)iv1*nbg1;
 
+    // Load state column 'col' into private once for the whole t-loop.
+    float s_shard[GDN_RPL];
     #pragma unroll
     for (int r = 0; r < GDN_RPL; r++) {
-        const int i = r * GDN_LPC + lane;
-        s_shard[r] = s_in[i];
-        k_reg[r]   = k_loc[i];
-        q_reg[r]   = q_loc[i];
+        s_shard[r] = s_in[r * GDN_LPC + lane];
     }
 
-    if (kda) {
+    const float scale = 1.0f / sqrt((float) GDN_SV);
+
+    // attn output advances by GDN_SV * H per token, starting at first token of
+    // this (seq, head): attn_data[t][col] = base + (iv3*n_tokens + t)*H*S_v + iv1*S_v + col.
+    global float * attn_data = attn_out_base + ((ulong)iv3 * (ulong)n_tokens * H + iv1) * GDN_SV;
+
+    // For decode (n_tokens==1) the __local-cache variant was a slight win but
+    // barriers would dominate for the prefill t-loop. We read k/q/g directly
+    // from global on every iter — the 4 cols sharing a head only need ~4 cache
+    // lines per (r,token) read, which the Adreno L1 absorbs across the 4
+    // cluster-of-32 reads in the same workgroup. No barriers in the hot loop.
+    for (int t = 0; t < n_tokens; t++) {
+        global const float * q_t = (global const float *)(q_hd + (ulong)t * nbq2);
+        global const float * k_t = (global const float *)(k_hd + (ulong)t * nbk2);
+        global const float * v_t = (global const float *)(v_hd + (ulong)t * nbv2);
+        const float beta_val     = *(global const float *)(b_hd + (ulong)t * nbb2);
+        global const float * g_t = (global const float *)(g_hd + (ulong)t * nbg2);
+
+        float k_reg[GDN_RPL];
+        float q_reg[GDN_RPL];
+        float g_exp[GDN_RPL];
+
         #pragma unroll
         for (int r = 0; r < GDN_RPL; r++) {
-            g_exp[r] = g_loc[r * GDN_LPC + lane];
+            const int i = r * GDN_LPC + lane;
+            k_reg[r] = k_t[i];
+            q_reg[r] = q_t[i];
         }
-    } else {
-        const float gv = exp(g_d[0]);
-        #pragma unroll
-        for (int r = 0; r < GDN_RPL; r++) g_exp[r] = gv;
-    }
 
-    const float v_val = v_d[col];
+        if (kda) {
+            #pragma unroll
+            for (int r = 0; r < GDN_RPL; r++) {
+                g_exp[r] = exp(g_t[r * GDN_LPC + lane]);
+            }
+        } else {
+            const float gv = exp(g_t[0]);
+            #pragma unroll
+            for (int r = 0; r < GDN_RPL; r++) g_exp[r] = gv;
+        }
 
-    float kv_shard = 0.0f;
-    #pragma unroll
-    for (int r = 0; r < GDN_RPL; r++) {
-        kv_shard = mad(g_exp[r] * s_shard[r], k_reg[r], kv_shard);
-    }
-    const float kv_col = gdn_cluster32_sum(kv_shard);
+        const float v_val = v_t[col];
 
-    const float delta = (v_val - kv_col) * beta_val;
+        float kv_shard = 0.0f;
+        #pragma unroll
+        for (int r = 0; r < GDN_RPL; r++) {
+            kv_shard = mad(g_exp[r] * s_shard[r], k_reg[r], kv_shard);
+        }
+        const float kv_col = gdn_cluster32_sum(kv_shard);
 
-    float attn_partial = 0.0f;
-    #pragma unroll
-    for (int r = 0; r < GDN_RPL; r++) {
-        const float sij = mad(k_reg[r], delta, g_exp[r] * s_shard[r]);
-        s_shard[r] = sij;
-        attn_partial = mad(sij, q_reg[r], attn_partial);
-    }
-    const float attn_col = gdn_cluster32_sum(attn_partial);
+        const float delta = (v_val - kv_col) * beta_val;
 
-    if (lane == 0) {
-        attn_out[((ulong)iv3 * H + iv1) * GDN_SV + col] = attn_col * (1.0f / sqrt((float) GDN_SV));
+        float attn_partial = 0.0f;
+        #pragma unroll
+        for (int r = 0; r < GDN_RPL; r++) {
+            const float sij = mad(k_reg[r], delta, g_exp[r] * s_shard[r]);
+            s_shard[r] = sij;
+            attn_partial = mad(sij, q_reg[r], attn_partial);
+        }
+        const float attn_col = gdn_cluster32_sum(attn_partial);
+
+        if (lane == 0) {
+            attn_data[col] = attn_col * scale;
+        }
+        attn_data += (ulong)GDN_SV * H;
     }
 
     #pragma unroll