ggml-cpu: simplify flash-attention sinks application (apply sinks in the first KV chunk instead of in the partials reduction)

am17an · am17an · commit 8c19a4239f01 · 2026-02-02T15:38:44.000+01:00
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
@@ -8239,15 +8239,16 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
             }
         }
 
-        // sinks - skip when writing partials, reduce function will apply once
-        if (sinks && !write_partials) {
+        // sinks - apply only on the first kv-chunk
+        if (sinks && ic_start == 0) {
             const float s = ((float *)((char *) sinks->data))[h];
 
             float ms = 1.0f;
             float vs = 1.0f;
 
             if (s > M) {
                 ms = expf(M - s);
+                M = s;
                 ggml_vec_scale_f32(DV, VKQ32, ms);
             } else {
                 vs = expf(s - M);
@@ -8564,7 +8565,6 @@ static void ggml_flash_attn_ext_reduce_partials(
     const ggml_tensor * q = dst->src[0];
     const ggml_tensor * k = dst->src[1];
     const ggml_tensor * v = dst->src[2];
-    const ggml_tensor * sinks = dst->src[4];
 
     const int64_t DK = k->ne[0];
     const int64_t DV = v->ne[0];
@@ -8616,20 +8616,6 @@ static void ggml_flash_attn_ext_reduce_partials(
             M_final = M_new;
         }
 
-        // Apply sinks once after combining all chunks
-        if (sinks) {
-            const float s = ((float *) sinks->data)[q_head];
-
-            if (s > M_final) {
-                const float ms = expf(M_final - s);
-                ggml_vec_scale_f32(DV, VKQ_final, ms);
-                S_final = S_final * ms + 1.0f;
-                M_final = s;
-            } else {
-                S_final = S_final + expf(s - M_final);
-            }
-        }
-
         // Normalize and write to output
         if (S_final != 0.0f) {
             const float S_inv = 1.0f / S_final;