cuda: disable sparse V skip (warp divergence regression)

TheTom · TheTom · commit f2dc968bdd44 · 2026-04-24T08:40:56.000-05:00
Per-lane branching in the VEC FA kernel causes warp divergence that costs more than the skipped V dequants save. With the sparse V skip enabled, benchmarks show a 0.3% to 2.8% slowdown on RTX 3090/4090 across all context lengths. The Metal path is unaffected: the skip remains enabled there, where it gives a +4% to +23% speedup. TODO: revisit with a warp-level ballot skip (__ballot_sync + early exit when the entire warp is below threshold). Benchmark data: @sztlink (Qwen3-30B-A3B Q4_K_M, CUDA SM86/SM89)
diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh
@@ -333,6 +333,10 @@ static __global__ void flash_attn_ext_vec(
             }
 
             // Sparse V: skip V dequant if all attention weights for this position are negligible
+            // Disabled — per-lane branching causes warp divergence that costs more than the
+            // skipped dequants save (-0.3% to -2.8% on RTX 3090/4090).
+            // TODO: revisit with warp-level ballot skip.
+#if 0
             {
                 bool dominated = true;
 #pragma unroll
@@ -341,6 +345,7 @@ static __global__ void flash_attn_ext_vec(
                 }
                 if (dominated) { continue; }
             }
+#endif
 
 #pragma unroll
             for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
@@ -373,6 +378,10 @@ static __global__ void flash_attn_ext_vec(
             }
 
             // Sparse V: skip V dequant if all attention weights for this position are negligible
+            // Disabled — per-lane branching causes warp divergence that costs more than the
+            // skipped dequants save (-0.3% to -2.8% on RTX 3090/4090).
+            // TODO: revisit with warp-level ballot skip.
+#if 0
             {
                 bool dominated = true;
 #pragma unroll
@@ -381,6 +390,7 @@ static __global__ void flash_attn_ext_vec(
                 }
                 if (dominated) { continue; }
             }
+#endif
 
 #pragma unroll
             for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {

Original file line number	Diff line number	Diff line change
`@@ -333,6 +333,10 @@ static __global__ void flash_attn_ext_vec(`
`333`	`333`	`}`
`334`	`334`
`335`	`335`	`// Sparse V: skip V dequant if all attention weights for this position are negligible`
	`336`	`+ // Disabled — per-lane branching causes warp divergence that costs more than the`
	`337`	`+ // skipped dequants save (-0.3% to -2.8% on RTX 3090/4090).`
	`338`	`+ // TODO: revisit with warp-level ballot skip.`
	`339`	`+#if 0`
`336`	`340`	`{`
`337`	`341`	`bool dominated = true;`
`338`	`342`	`#pragma unroll`
`@@ -341,6 +345,7 @@ static __global__ void flash_attn_ext_vec(`
`341`	`345`	`}`
`342`	`346`	`if (dominated) { continue; }`
`343`	`347`	`}`
	`348`	`+#endif`
`344`	`349`
`345`	`350`	`#pragma unroll`
`346`	`351`	`for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {`
`@@ -373,6 +378,10 @@ static __global__ void flash_attn_ext_vec(`
`373`	`378`	`}`
`374`	`379`
`375`	`380`	`// Sparse V: skip V dequant if all attention weights for this position are negligible`
	`381`	`+ // Disabled — per-lane branching causes warp divergence that costs more than the`
	`382`	`+ // skipped dequants save (-0.3% to -2.8% on RTX 3090/4090).`
	`383`	`+ // TODO: revisit with warp-level ballot skip.`
	`384`	`+#if 0`
`376`	`385`	`{`
`377`	`386`	`bool dominated = true;`
`378`	`387`	`#pragma unroll`
`@@ -381,6 +390,7 @@ static __global__ void flash_attn_ext_vec(`
`381`	`390`	`}`
`382`	`391`	`if (dominated) { continue; }`
`383`	`392`	`}`
	`393`	`+#endif`
`384`	`394`
`385`	`395`	`#pragma unroll`
`386`	`396`	`for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {`