From 289b9dcbb3782746f9518a589a3c2ed71438edc3 Mon Sep 17 00:00:00 2001 From: TheTom Date: Fri, 1 May 2026 09:03:23 -0500 Subject: [PATCH 1/2] sparse V: tile-level skip for CUDA tile FA (opt-in) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the per-element zero from the original PR #98 (which never saved compute since the V matmul still ran with val=0) with a tile-uniform skip of the entire V matmul block when no position contributes meaningfully. Per-warp shfl reduction of the per-thread max softmax probability, then a per-block fan-out via shared memory. Branch on the result so all threads take the same path — no warp divergence (was the cause of the April 24 revert in commit f2dc968 on the VEC kernel). Off by default. Opt in at build time: cmake -DGGML_CUDA_FLAGS=-DGGML_CUDA_TURBO_SPARSE_V_TILE # threshold defaults to 0.001f; override with # -DGGML_CUDA_FLAGS='-DGGML_CUDA_TURBO_SPARSE_V_TILE -DGGML_CUDA_TURBO_SPARSE_V_THRESHOLD=0.0001f' Mirror of vllm-project/vllm#41422 which validated this pattern on AMD MI300X: +7.13% decode @ 32K with PPL bit-identical and NIAH all-pass on Qwen3-8B. Default-off path is byte-identical to upstream (verified on M5 Max Metal: Qwen2.5-7B Q8_0 sym turbo3 PPL 6.6594, exact match). Co-Authored-By: Claude Opus 4.7 (1M context) --- ggml/src/ggml-cuda/fattn-tile.cuh | 47 +++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh index be4f3281a5b..baa290ee393 100644 --- a/ggml/src/ggml-cuda/fattn-tile.cuh +++ b/ggml/src/ggml-cuda/fattn-tile.cuh @@ -2,6 +2,14 @@ #include "fattn-common.cuh" #include "fattn-wmma-f16.cuh" +// Tile-level sparse V skip (TurboQuant). Off by default; opt in by defining +// GGML_CUDA_TURBO_SPARSE_V_TILE at build time. Threshold defaults to 0.001 +// (matches vllm-project/vllm#41422 — bit-identical PPL + NIAH all-pass on +// Qwen3-8B at 32K). 
Override with -DGGML_CUDA_TURBO_SPARSE_V_THRESHOLD=. +#if defined(GGML_CUDA_TURBO_SPARSE_V_TILE) && !defined(GGML_CUDA_TURBO_SPARSE_V_THRESHOLD) +#define GGML_CUDA_TURBO_SPARSE_V_THRESHOLD 0.001f +#endif + // nbatch_fa == number of KQ rows to process per iteration // nbatch_K == number of K columns to load in parallel for KQ calculation @@ -642,6 +650,19 @@ static __device__ __forceinline__ void flash_attn_tile_iter( KQ_max_new[0] = warp_reduce_max(KQ_max_new[0]); } + // Tile-level sparse V skip (TurboQuant): cheaply track the max softmax + // probability seen across this FA tile so we can skip the whole V matmul + // block when no position contributes meaningfully. Opt-in via build flag + // -DGGML_CUDA_TURBO_SPARSE_V_TILE -DGGML_CUDA_TURBO_SPARSE_V_THRESHOLD=0.001f. + // The decision is block-uniform (all threads take the same branch via + // shared-memory reduction), avoiding the per-lane warp divergence that + // motivated the earlier April 24 revert (commit f2dc968). + // Mirror of vllm-project/vllm#41422 which validated +7.13% decode @ 32K + // on AMD MI300X with PPL bit-identical and NIAH all-pass. +#ifdef GGML_CUDA_TURBO_SPARSE_V_TILE + float thread_max_val = 0.0f; +#endif + // Calculate KQ softmax, write to shared KQ buffer, re-scale VKQ accumulators: #pragma unroll for (int jc0 = 0; jc0 < cpw; jc0 += KQ_cs) { @@ -665,6 +686,9 @@ static __device__ __forceinline__ void flash_attn_tile_iter( expf(KQ_acc[(i0/(np*warp_size))*cpw + jc] - KQ_max[jc]) : 0.0f; KQ_sum_add += val; tmp[i0/(np*warp_size)][jc1] = val; +#ifdef GGML_CUDA_TURBO_SPARSE_V_TILE + thread_max_val = fmaxf(thread_max_val, val); +#endif } KQ_sum[jc] = KQ_sum[jc]*KQ_max_scale + KQ_sum_add; @@ -693,12 +717,32 @@ static __device__ __forceinline__ void flash_attn_tile_iter( } } +#ifdef GGML_CUDA_TURBO_SPARSE_V_TILE + // Block-level reduction of thread_max_val. 
Per-warp shfl reduce, write to + // shared, syncthreads, fan back out — leaves block_max_val identical and + // warp-uniform across all threads. Cost: one __syncthreads + cheap fp ops. + thread_max_val = warp_reduce_max(thread_max_val); + __shared__ float sparse_v_warp_max[nwarps]; + if (threadIdx.x == 0) sparse_v_warp_max[threadIdx.y] = thread_max_val; + __syncthreads(); + float block_max_val = 0.0f; +#pragma unroll + for (int w = 0; w < nwarps; ++w) { + block_max_val = fmaxf(block_max_val, sparse_v_warp_max[w]); + } + constexpr float sparse_v_threshold = GGML_CUDA_TURBO_SPARSE_V_THRESHOLD; + const bool skip_v_matmul = block_max_val < sparse_v_threshold; +#endif + // VKQ = V @ KQ matrix multiplication: static_assert(DV <= DKQ, "bad DV"); static_assert(DV % nbatch_K == 0 || (nbatch_K % 3 == 0 && DV % (nbatch_K*2/3) == 0), "bad nbatch_K"); constexpr int nbatch_V = (DV % nbatch_K == 0 ? nbatch_K : nbatch_K*2/3) * nbatch_fa / DV; // Number of V columns that fit in SRAM for K. static_assert(nbatch_fa % nbatch_V == 0, "bad nbatch_V"); static_assert(nbatch_V % np == 0, "bad nbatch_V"); +#ifdef GGML_CUDA_TURBO_SPARSE_V_TILE + if (!skip_v_matmul) { +#endif #pragma unroll for (int k0 = 0; k0 < nbatch_fa; k0 += nbatch_V) { flash_attn_tile_load_tile @@ -769,6 +813,9 @@ static __device__ __forceinline__ void flash_attn_tile_iter( __syncthreads(); } +#ifdef GGML_CUDA_TURBO_SPARSE_V_TILE + } // close if (!skip_v_matmul) +#endif } template // D == head size From a07ab093c254443ff15f4b6c2cef4788ca1dd5ba Mon Sep 17 00:00:00 2001 From: TheTom Date: Fri, 1 May 2026 09:27:53 -0500 Subject: [PATCH 2/2] sparse V VEC: warp-uniform skip via warp_reduce_max (opt-in) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the vllm-project/vllm#41422 design pattern to llama.cpp's CUDA fattn-vec kernel: replace the per-lane sparse V skip (which had warp divergence on turbo paths and was compile-time gated off for turbo via PR #115's `if constexpr 
(!V_is_turbo)`) with a warp-uniform skip via `warp_reduce_max`. All lanes
branch on the same value so there's no warp divergence regardless of V type.

Off by default. Opt in at build time:

    cmake -DGGML_CUDA_FLAGS=-DGGML_CUDA_TURBO_SPARSE_V_VEC

Threshold defaults to 0.001f (matches vLLM PR #41422). Override with
-DGGML_CUDA_TURBO_SPARSE_V_VEC_THRESHOLD=.

Default-off path is byte-identical (verified on M5 Max Metal: Qwen2.5-7B
Q8_0 sym turbo3 PPL 6.6594, exact match with PR #115 baseline).

Pairs with the prior commit's tile-kernel sparse V skip (off-by-default
opt-in via GGML_CUDA_TURBO_SPARSE_V_TILE) — that one targets fattn-tile
prefill, this one targets fattn-vec decode (the actual hot path on
single-token generation where the vLLM win was measured).

NO MERGE — testing branch only. AMD MI300X HIP validation pending.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 ggml/src/ggml-cuda/fattn-vec.cuh | 58 +++++++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh
index 1548277afdf..422c70f7204 100644
--- a/ggml/src/ggml-cuda/fattn-vec.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec.cuh
@@ -1,6 +1,15 @@
 #include "common.cuh"
 #include "fattn-common.cuh"
 
+// Tile-uniform sparse V skip in the VEC kernel (TurboQuant). Off by default;
+// opt in by defining GGML_CUDA_TURBO_SPARSE_V_VEC at build time. Threshold
+// defaults to 0.001 (matches vllm-project/vllm#41422 — bit-identical PPL +
+// NIAH all-pass on Qwen3-8B at 32K). Override with
+// -DGGML_CUDA_TURBO_SPARSE_V_VEC_THRESHOLD=.
+#if defined(GGML_CUDA_TURBO_SPARSE_V_VEC) && !defined(GGML_CUDA_TURBO_SPARSE_V_VEC_THRESHOLD) +#define GGML_CUDA_TURBO_SPARSE_V_VEC_THRESHOLD 0.001f +#endif + static int ggml_cuda_fattn_vec_get_nthreads_host(const int cc) { return 128; GGML_UNUSED(cc); @@ -412,12 +421,33 @@ static __global__ void flash_attn_ext_vec( } } - // Sparse V: skip V dequant if all attention weights for this position are negligible. - // For turbo types, the check is compiled out: at typical decode context lengths - // (< ~4K tokens) with threshold 1e-6, no positions are ever skipped, so the - // per-position branch is pure overhead (misprediction + comparison cost). This - // also dodges the warp-divergence regression on turbo paths that motivated the - // April 24 revert (commit f2dc968). + // Sparse V skip — two strategies: + // + // GGML_CUDA_TURBO_SPARSE_V_VEC (opt-in, default off): + // Warp-uniform skip via warp_reduce_max — all lanes branch on + // the same value so no warp divergence. Works on every V type + // including turbo. Threshold defaults to 0.001 (matches the + // vllm-project/vllm#41422 design that validated +7.13% decode + // at 32K on AMD MI300X with PPL bit-identical and NIAH all- + // pass). + // + // default (signalnine PR #115): + // Per-lane skip with `if constexpr (!V_is_turbo)` compile-time + // gate. Compiled out for turbo to dodge the warp-divergence + // regression that motivated the April 24 revert (commit + // f2dc968). Kept as the default while the warp-uniform variant + // is bench-validated cross-platform. 
+#ifdef GGML_CUDA_TURBO_SPARSE_V_VEC
+    {
+        float my_kq_max = 0.0f;
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            my_kq_max = fmaxf(fmaxf(my_kq_max, __half2float(__low2half(KQ_k[j]))), __half2float(__high2half(KQ_k[j])));
+        }
+        const float warp_max = warp_reduce_max(my_kq_max);
+        if (warp_max < (float) GGML_CUDA_TURBO_SPARSE_V_VEC_THRESHOLD) continue;
+    }
+#else
     if constexpr (!V_is_turbo) {
         bool dominated = true;
 #pragma unroll
@@ -426,6 +456,7 @@
         }
         if (dominated) { continue; }
     }
+#endif
 
 #pragma unroll
     for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
@@ -461,8 +492,18 @@
         }
     }
 
-    // Sparse V: skip V dequant if all attention weights for this position are negligible.
-    // Compiled out for turbo types — see half2 path comment above.
+    // Sparse V skip — see half2-path comment above. Same two strategies.
+#ifdef GGML_CUDA_TURBO_SPARSE_V_VEC
+    {
+        float my_kq_max = 0.0f;
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            my_kq_max = fmaxf(my_kq_max, KQ_k[j]);
+        }
+        const float warp_max = warp_reduce_max(my_kq_max);
+        if (warp_max < (float) GGML_CUDA_TURBO_SPARSE_V_VEC_THRESHOLD) continue;
+    }
+#else
     if constexpr (!V_is_turbo) {
         bool dominated = true;
 #pragma unroll
@@ -471,6 +512,7 @@
         }
         if (dominated) { continue; }
     }
+#endif
 
     // Turbo V path: precompute scaled centroids once per block to eliminate
     // per-element norm multiply. centroid[idx]*norm is computed 8/4/16 times