opencl: GDN SV=128 cluster-of-32 specialization (Qwen3.6-A3B / Qwen3-Next)

wanghqc · wanghqc · commit 0d4ac15c2dfb · 2026-05-30T19:51:56.000-07:00
The naive kernel_gated_delta_net_f32 — one thread per (column j, head, seq)
— was 1.76× slower than the CPU GDN fallback at tg128 on Qwen3.6-35B-A3B
(11.28 vs 14.83 t/s). Each thread did 4 sequential length-S_v inner loops
with no SIMD use, and the state vector was read/written through global on
every step. The "fix the dispatch and ship the naive port" plan from the
prior session didn't move tg because the kernel itself is the bottleneck.

This commit ports the Vulkan cluster-of-32 design to OpenCL for the S_v=128
case (Qwen3-Next family head_v_dim — covers both Qwen3.6-35B-A3B and the
qwen3next 7B/80B). Layout:

- 128-lane workgroup (qcom_reqd_sub_group_size("full")) = 1 full Adreno
  subgroup, with 4 columns processed per workgroup, 32 lanes per column.
- Each lane keeps ROWS_PER_LANE = 4 floats of state in private registers
  across the (decay, kv, outer-product, attn) chain — eliminates the
  per-step global state read/write traffic of the naive kernel.
- kv and attn cluster-of-32 reductions via sub_group_shuffle_xor tree
  (mask=1,2,4,8,16); XOR with mask<32 never crosses the 32-lane cluster
  boundary inside a 128-wide subgroup, so the four columns in the
  workgroup reduce independently without barriers or __local mem.
- Grid = (H, n_seqs, S_v / 4).
- The handle is created best-effort: clCreateKernel for the _sv128 entry
  is tolerated to fail (no subgroup_shuffle on the device); dispatch
  falls back to the naive kernel when the handle is NULL or S_v != 128.

Correctness: test-backend-ops -o GATED_DELTA_NET reports 8/8 OpenCL
cases OK (head_size=128 hits the new path; head_size=16/32/64 fall back
to the generic kernel; multi-token cases stay "not supported").

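Also adds GGML_OPENCL_DISABLE_GDN=1 env var on the supports_op gate so
A/B benches against the CPU fallback don't require a rebuild. The gate only
checks that the variable is set, so any value (including 0) disables the op;
unset it to re-enable.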

Measured on Adreno X2-90 / Qwen3.6-35B-A3B-MXFP4, ngl=99, fa=0, -r 1:
                       naive    sv128   CPU-fallback
  tg128 @ d=0          11.28    19.84    12.17   (+76% vs naive,  +63% vs CPU)
  tg128 @ d=16384       7.55    11.05     8.30   (+46% vs naive,  +33% vs CPU)
  pp256 @ d=0         176.93   184.66   175.19
  pp256 @ d=16384     110.60   113.56   111.95

The prior session's "moe_histogram -54" block was a misdiagnosis: ne20
in the dispatch is n_expert_used (=8 for Qwen3.6), not n_experts (=256),
so local size 64*8=512 fits the 1024 device max. The naive kernel ran
fine end-to-end; it was just slow.
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -769,6 +769,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_conv_2d_f16_f32;
     cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4;
     cl_kernel kernel_gated_delta_net_f32;
+    cl_kernel kernel_gated_delta_net_f32_sv128 = nullptr;
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns;
     cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns;
@@ -2749,6 +2750,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
             build_program_from_source(backend_ctx, kernel_src.c_str(), compile_opts);
 
         CL_CHECK((backend_ctx->kernel_gated_delta_net_f32 = clCreateKernel(prog, "kernel_gated_delta_net_f32", &err), err));
+        // Specialized SV=128 (Qwen3-Next / Qwen3.6-A3B): cluster-of-32 reduction
+        // per column, 128-lane workgroup. Created best-effort — may be absent if
+        // the device lacks cl_*_subgroup_shuffle. ggml_cl_gated_delta_net falls
+        // back to the generic kernel when this handle is NULL.
+        cl_int err_sv128 = CL_SUCCESS;
+        backend_ctx->kernel_gated_delta_net_f32_sv128 =
+            clCreateKernel(prog, "kernel_gated_delta_net_f32_sv128", &err_sv128);
+        if (err_sv128 != CL_SUCCESS) {
+            backend_ctx->kernel_gated_delta_net_f32_sv128 = nullptr;
+        }
         CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");
     }
@@ -5910,6 +5921,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             // f32 only; autoregressive (n_tokens == 1) only — prefill keeps the
             // chunked path. (cparams.fused_gdn_ch then auto-disables on the
             // chunked-graph reservation; fused_gdn_ar stays enabled.)
+            // GGML_OPENCL_DISABLE_GDN=1 forces CPU fallback for A/B benching.
+            static const bool gdn_disabled = getenv("GGML_OPENCL_DISABLE_GDN") != nullptr;
+            if (gdn_disabled) return false;
             const ggml_tensor * v = op->src[2];
             for (int i = 0; i < 6; ++i) {
                 if (op->src[i]->type != GGML_TYPE_F32) return false;
@@ -10508,7 +10522,11 @@ static void ggml_cl_gated_delta_net(ggml_backend_t backend, const ggml_tensor *
     cl_ulong nbk1 = k->nb[1], nbk3 = k->nb[3];
     cl_ulong nbv1 = v->nb[1], nbv3 = v->nb[3];
 
-    cl_kernel kernel = backend_ctx->kernel_gated_delta_net_f32;
+    const bool use_sv128 = (s_v == 128) && (backend_ctx->kernel_gated_delta_net_f32_sv128 != nullptr);
+
+    cl_kernel kernel = use_sv128
+        ? backend_ctx->kernel_gated_delta_net_f32_sv128
+        : backend_ctx->kernel_gated_delta_net_f32;
 
     int i = 0;
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_mem),   &eq->data_device));
@@ -10531,7 +10549,10 @@ static void ggml_cl_gated_delta_net(ggml_backend_t backend, const ggml_tensor *
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbk3));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbv1));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(cl_ulong), &nbv3));
-    CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &s_v));
+    if (!use_sv128) {
+        // generic kernel takes s_v as the next int arg
+        CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),  &s_v));
+    }
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &neq1));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &nek1));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &neq3));
@@ -10541,9 +10562,18 @@ static void ggml_cl_gated_delta_net(ggml_backend_t backend, const ggml_tensor *
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &kda));
     CL_CHECK(clSetKernelArg(kernel, i++, sizeof(int),      &neg0));
 
-    // one thread per (column j, head, seq); driver picks the workgroup size
-    size_t global_work_size[] = { (size_t)s_v * H * n_seqs, 1, 1 };
-    backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, NULL, dst);
+    if (use_sv128) {
+        // 128-thread workgroup = 1 full subgroup; cluster of 32 lanes per col;
+        // 4 cols per workgroup; grid = (H, n_seqs, s_v / 4).
+        const int cols_per_wg = 4;
+        size_t global_work_size[] = { (size_t)H * 128, (size_t)n_seqs, (size_t)(s_v / cols_per_wg) };
+        size_t local_work_size[]  = { 128, 1, 1 };
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    } else {
+        // one thread per (column j, head, seq); driver picks the workgroup size
+        size_t global_work_size[] = { (size_t)s_v * H * n_seqs, 1, 1 };
+        backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, NULL, dst);
+    }
 }
 
 static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
diff --git a/ggml/src/ggml-opencl/kernels/gated_delta_net.cl b/ggml/src/ggml-opencl/kernels/gated_delta_net.cl
@@ -1,13 +1,10 @@
 // Gated DeltaNet (Qwen3-Next / KDA linear attention) fused op — autoregressive
 // (n_tokens == 1) case only. Reference: ggml/src/ggml-cpu/ops.cpp
-// ggml_compute_forward_gated_delta_net_f32, ggml/src/ggml-cuda/gated_delta_net.cu.
+// ggml_compute_forward_gated_delta_net_f32, ggml/src/ggml-cuda/gated_delta_net.cu,
+// ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp.
 //
-// One thread per (column j, head h, sequence s). Thread owns column j of the
-// per-head state matrix S, stored transposed in the output buffer's state
-// region as state_out[(h_seq)*S_v*S_v + j*S_v + i] = S[i][j] — i.e. the
-// contiguous run state_out[j*S_v .. j*S_v+S_v-1]. The state is read/written
-// directly in global memory (this op is memory-bound; no benefit from caching
-// the full column in private, which overflows the Adreno register file).
+// State layout (matches Vulkan / CPU): state[(h_seq)*S_v*S_v + j*S_v + i] = S[i][j]
+// i.e. each column j is contiguous along i.
 //
 // Single step (n_tokens == 1):
 //   copy:    S_out[i][j] = S_in[i][j]
@@ -17,6 +14,27 @@
 //   S_out[i][j] += k[i] * delta[j]
 //   out[j] = (sum_i S_out[i][j] * q[i]) * scale
 
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+// Subgroup-shuffle detection: HAS_SUBGROUP_SHUFFLE gates the S_v=128 kernel, which calls sub_group_shuffle_xor.
+#ifdef cl_khr_subgroup_shuffle
+#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle : enable
+#define HAS_SUBGROUP_SHUFFLE 1
+#elif defined(cl_qcom_subgroup_shuffle) // NOTE(review): confirm this extension exposes sub_group_shuffle_xor under that exact name/signature
+#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
+#define HAS_SUBGROUP_SHUFFLE 1
+#endif
+// REQD_SUBGROUP_SIZE_128 requests a "full" subgroup when the Qualcomm extension exists; otherwise it is empty and the subgroup size is not pinned.
+#if defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#else
+#define REQD_SUBGROUP_SIZE_128
+#endif
+
+// ============================================================================
+// Generic fallback: one thread per (column j, head h, sequence s). Used when
+// the S_v=128 specialization is not applicable.
+// ============================================================================
 kernel void kernel_gated_delta_net_f32(
     global char * q_base,    ulong q_off,
     global char * k_base,    ulong k_off,
@@ -25,25 +43,23 @@ kernel void kernel_gated_delta_net_f32(
     global char * b_base,    ulong b_off,
     global char * s_base,    ulong s_off,
     global char * dst_base,  ulong dst_off,
-    // q/k/v strides in bytes ("contiguous rows": nb?0 == sizeof(float)).
-    // nb?1 = head stride, nb?3 = seq stride (nb?2 = token stride, unused: n_tokens == 1)
     ulong nbq1, ulong nbq3,
     ulong nbk1, ulong nbk3,
     ulong nbv1, ulong nbv3,
-    int s_v,                   // S_v = state dim
-    int neq1, int nek1,        // q/k head counts (<= H)
-    int neq3, int nek3,        // q/k seq counts  (<= n_seqs)
-    int H,                     // = src_v->ne[1]   (== n_heads_v)
+    int s_v,
+    int neq1, int nek1,
+    int neq3, int nek3,
+    int H,
     int n_seqs,
-    int kda,                   // 1 if g per-element ([S_v,...]), 0 if scalar ([1,...])
-    int neg0                   // g->ne[0]  (== S_v if kda else 1)
+    int kda,
+    int neg0
 ) {
-    const int gid = get_global_id(0);       // flattened (column j, head, seq)
+    const int gid = get_global_id(0);
     if (gid >= s_v * H * n_seqs) return;
-    const int j   = gid % s_v;              // column owned by this thread
-    const int hs  = gid / s_v;              // flattened (head, seq)
-    const int iv1 = hs % H;                 // head index   (0..H-1)
-    const int iv3 = hs / H;                 // sequence     (0..n_seqs-1)
+    const int j   = gid % s_v;
+    const int hs  = gid / s_v;
+    const int iv1 = hs % H;
+    const int iv3 = hs / H;
 
     const int rq3 = n_seqs / neq3;
     const int rk3 = n_seqs / nek3;
@@ -62,44 +78,186 @@ kernel void kernel_gated_delta_net_f32(
     s_base   += s_off;
     dst_base += dst_off;
 
-    // output: [ attn (S_v*H*1*n_seqs) | new_states (S_v*S_v*H*n_seqs) ]
-    const ulong attn_elems = (ulong)s_v * H * n_seqs;   // n_tokens == 1
+    const ulong attn_elems = (ulong)s_v * H * n_seqs;
     global float * attn_out  = (global float *)dst_base;
     global float * state_out = (global float *)dst_base + attn_elems;
 
-    // input/output state column j (contiguous run [j*s_v ..]) for this (head,seq)
     global const float * s_in  = (global const float *)s_base + ((ulong)iv3 * H + iv1) * s_v * s_v + (ulong)j * s_v;
     global       float * s_out = state_out                    + ((ulong)iv3 * H + iv1) * s_v * s_v + (ulong)j * s_v;
 
-    global const float * q_d = (global const float *)(q_base + (ulong)iq3*nbq3 + (ulong)iq1*nbq1);  // t == 0
+    global const float * q_d = (global const float *)(q_base + (ulong)iq3*nbq3 + (ulong)iq1*nbq1);
     global const float * k_d = (global const float *)(k_base + (ulong)ik3*nbk3 + (ulong)ik1*nbk1);
     global const float * v_d = (global const float *)(v_base + (ulong)iv3*nbv3 + (ulong)iv1*nbv1);
-    const ulong hb = ((ulong)iv3*H + iv1);                              // t == 0
+    const ulong hb = ((ulong)iv3*H + iv1);
     const float beta = ((global const float *)b_base)[hb];
     global const float * g_d = (global const float *)g_base + hb * (ulong)neg0;
 
-    // copy + decay
     if (kda) {
         for (int i = 0; i < s_v; ++i) s_out[i] = s_in[i] * exp(g_d[i]);
     } else {
         const float gd = exp(g_d[0]);
         for (int i = 0; i < s_v; ++i) s_out[i] = s_in[i] * gd;
     }
 
-    // kv[j] = sum_i S[i][j] * k[i]
     float kv = 0.0f;
     for (int i = 0; i < s_v; ++i) kv = mad(s_out[i], k_d[i], kv);
 
     const float delta = (v_d[j] - kv) * beta;
 
-    // outer product + output: S[i][j] += k[i]*delta ; out[j] = sum_i S[i][j]*q[i]
     float o = 0.0f;
     for (int i = 0; i < s_v; ++i) {
         const float sij = mad(k_d[i], delta, s_out[i]);
         s_out[i] = sij;
         o = mad(sij, q_d[i], o);
     }
 
-    // attn layout: [S_v, H, 1, n_seqs]
     attn_out[((ulong)iv3*H + iv1) * s_v + j] = o * scale;
 }
+
+// ============================================================================
+// S_v=128 specialization (Qwen3-Next / Qwen3.6-A3B).
+//
+// Layout per workgroup (1 full Adreno subgroup of 128 lanes):
+//   lane           = lid % 32       — row-lane within column (0..31)
+//   col_in_wg      = lid / 32       — column within workgroup (0..3)
+//   COLS_PER_WG    = 4              — 4 columns processed per workgroup
+//   LANES_PER_COL  = 32             — 32 lanes cooperate per column
+//   ROWS_PER_LANE  = 4              — each lane owns 4 rows of state in private
+//
+// Grid: (head_id, seq_id, col_block) with col_block in [0 .. 128/4 = 32).
+//   col = col_block * COLS_PER_WG + col_in_wg
+//
+// kv/attn reductions are cluster-of-32 sums via sub_group_shuffle_xor — each
+// 32-lane cluster within the 128-wide subgroup reduces independently because
+// XOR with mask < 32 never crosses cluster boundaries.
+// ============================================================================
+#if defined(HAS_SUBGROUP_SHUFFLE) && defined(cl_qcom_reqd_sub_group_size)
+// Built only when the subgroup width can be pinned: the 32-lane XOR tree is undefined on subgroups narrower than 32 lanes.
+#define GDN_SV    128   // state dimension S_v handled by this specialization
+#define GDN_LPC   32    // lanes cooperating on one state column
+#define GDN_CPWG  4     // state columns per 128-lane workgroup
+#define GDN_RPL   4     // state rows per lane (GDN_SV / GDN_LPC)
+
+inline float gdn_cluster32_sum(float v) { // XOR-butterfly sum over the 32 lanes sharing the caller's upper lane-id bits; assumes every XOR partner is inside the subgroup (size >= 32)
+    v += sub_group_shuffle_xor(v,  1); // add partner lane^1  -> sums over groups of 2
+    v += sub_group_shuffle_xor(v,  2); // add partner lane^2  -> groups of 4
+    v += sub_group_shuffle_xor(v,  4); // add partner lane^4  -> groups of 8
+    v += sub_group_shuffle_xor(v,  8); // add partner lane^8  -> groups of 16
+    v += sub_group_shuffle_xor(v, 16); // add partner lane^16 -> groups of 32
+    return v; // every lane of the 32-lane group now holds the same total
+}
+
+REQD_SUBGROUP_SIZE_128 // Qualcomm "full" subgroup attribute when available, otherwise expands to nothing
+kernel void kernel_gated_delta_net_f32_sv128( // single-token gated delta net step for S_v == 128; host launches 128 work-items per group over groups (H, n_seqs, 32)
+    global char * q_base,    ulong q_off, // q: buffer base and byte offset
+    global char * k_base,    ulong k_off, // k: buffer base and byte offset
+    global char * v_base,    ulong v_off, // v: buffer base and byte offset
+    global char * g_base,    ulong g_off, // gate g: buffer base and byte offset
+    global char * b_base,    ulong b_off, // beta: buffer base and byte offset
+    global char * s_base,    ulong s_off, // input state: buffer base and byte offset
+    global char * dst_base,  ulong dst_off, // output (attention values followed by new state): base and byte offset
+    ulong nbq1, ulong nbq3, // q byte strides: head (nbq1), sequence (nbq3)
+    ulong nbk1, ulong nbk3, // k byte strides: head (nbk1), sequence (nbk3)
+    ulong nbv1, ulong nbv3, // v byte strides: head (nbv1), sequence (nbv3)
+    int neq1, int nek1, // q/k head counts; the v head index wraps modulo these
+    int neq3, int nek3, // q/k sequence counts; sequences map by the ratio n_seqs / ne?3
+    int H, // number of v heads
+    int n_seqs, // number of sequences
+    int kda, // nonzero: one gate value per state row; zero: one scalar gate per (seq, head)
+    int neg0 // floats of g per (seq, head)
+) {
+    const int lid       = get_local_id(0); // 0..127 with the host's local size of 128
+    const int lane      = lid & (GDN_LPC - 1); // row-lane within the column, 0..31
+    const int col_in_wg = lid >> 5; // column slot within the workgroup (shift by 5 == log2(GDN_LPC))
+
+    const int head_id   = get_group_id(0);
+    const int seq_id    = get_group_id(1);
+    const int col_block = get_group_id(2);
+    const int col       = col_block * GDN_CPWG + col_in_wg; // state column owned by this 32-lane cluster
+    // q/k indices derived from the v head/sequence: heads wrap with %, sequences map by integer ratio.
+    const int iv1 = head_id;
+    const int iv3 = seq_id;
+    const int rq3 = n_seqs / neq3;
+    const int rk3 = n_seqs / nek3;
+    const int iq1 = iv1 % neq1;
+    const int ik1 = iv1 % nek1;
+    const int iq3 = iv3 / rq3;
+    const int ik3 = iv3 / rk3;
+    // Apply the byte offsets to the char base pointers.
+    q_base   += q_off;
+    k_base   += k_off;
+    v_base   += v_off;
+    g_base   += g_off;
+    b_base   += b_off;
+    s_base   += s_off;
+    dst_base += dst_off;
+    // dst holds the attention output (GDN_SV * H * n_seqs floats) followed by the updated states.
+    const ulong attn_elems = (ulong)GDN_SV * H * n_seqs;
+    global float * attn_out  = (global float *)dst_base;
+    global float * state_out = (global float *)dst_base + attn_elems;
+    // Column `col` of this (seq, head) state matrix is a contiguous run of GDN_SV floats.
+    global const float * s_in  = (global const float *)s_base + ((ulong)iv3 * H + iv1) * GDN_SV * GDN_SV + (ulong)col * GDN_SV;
+    global       float * s_out = state_out                    + ((ulong)iv3 * H + iv1) * GDN_SV * GDN_SV + (ulong)col * GDN_SV;
+    // Per-(seq, head) input rows; nb?3 / nb?1 are byte strides for the sequence / head index.
+    global const float * q_d = (global const float *)(q_base + (ulong)iq3*nbq3 + (ulong)iq1*nbq1);
+    global const float * k_d = (global const float *)(k_base + (ulong)ik3*nbk3 + (ulong)ik1*nbk1);
+    global const float * v_d = (global const float *)(v_base + (ulong)iv3*nbv3 + (ulong)iv1*nbv1);
+    const ulong hb = (ulong)iv3 * H + iv1; // flattened (seq, head) index
+    const float beta_val = ((global const float *)b_base)[hb];
+    global const float * g_d = (global const float *)g_base + hb * (ulong)neg0;
+    // Private copies: element r of each array corresponds to state row r * GDN_LPC + lane.
+    float s_shard[GDN_RPL];
+    float k_reg  [GDN_RPL];
+    float q_reg  [GDN_RPL];
+    float g_exp  [GDN_RPL];
+
+    #pragma unroll
+    for (int r = 0; r < GDN_RPL; r++) {
+        const int i = r * GDN_LPC + lane; // row index; consecutive lanes read consecutive floats
+        s_shard[r] = s_in[i];
+        k_reg[r]   = k_d[i];
+        q_reg[r]   = q_d[i];
+    }
+    // Decay factors exp(g): per row when kda != 0, otherwise one scalar shared by all rows.
+    if (kda) {
+        #pragma unroll
+        for (int r = 0; r < GDN_RPL; r++) {
+            g_exp[r] = exp(g_d[r * GDN_LPC + lane]);
+        }
+    } else {
+        const float gv = exp(g_d[0]);
+        #pragma unroll
+        for (int r = 0; r < GDN_RPL; r++) g_exp[r] = gv;
+    }
+
+    const float v_val = v_d[col]; // v element for this column
+    // kv = sum_i decayed S[i][col] * k[i]: per-lane partial, then summed across the 32-lane cluster.
+    float kv_shard = 0.0f;
+    #pragma unroll
+    for (int r = 0; r < GDN_RPL; r++) {
+        kv_shard = mad(g_exp[r] * s_shard[r], k_reg[r], kv_shard);
+    }
+    const float kv_col = gdn_cluster32_sum(kv_shard);
+
+    const float delta = (v_val - kv_col) * beta_val;
+    // S[i][col] = decayed S[i][col] + k[i] * delta, accumulating the q-weighted partial sum for the output.
+    float attn_partial = 0.0f;
+    #pragma unroll
+    for (int r = 0; r < GDN_RPL; r++) {
+        const float sij = mad(k_reg[r], delta, g_exp[r] * s_shard[r]);
+        s_shard[r] = sij;
+        attn_partial = mad(sij, q_reg[r], attn_partial);
+    }
+    const float attn_col = gdn_cluster32_sum(attn_partial);
+    // After the reduction each lane of the cluster has attn_col; only lane 0 writes it, scaled by 1/sqrt(GDN_SV).
+    if (lane == 0) {
+        attn_out[((ulong)iv3 * H + iv1) * GDN_SV + col] = attn_col * (1.0f / sqrt((float) GDN_SV));
+    }
+    // Write the updated state rows back to the output state region.
+    #pragma unroll
+    for (int r = 0; r < GDN_RPL; r++) {
+        s_out[r * GDN_LPC + lane] = s_shard[r];
+    }
+}
+
+#endif // HAS_SUBGROUP_SHUFFLE