Fix SWA KVarN audit findings: OOB shared memory, force-materialize crash, ring sizing, Vulkan

Anbeeld · Anbeeld · commit a620cbd481d1 · 2026-06-17T19:23:26.000+02:00
Four fixes from a code audit of commit e211e4d: 1. CUDA WHT out-of-bounds (critical): kvarn_materialize_swa_kernel re-implemented the 128-dim inverse WHT inline, running the butterfly on all 128 threads without the threadIdx.x < 64 guard. Threads 64-127 read/wrote sh[128..255] on a float[128] shared array — UB that produced correct results in sh[0..127] (only threads 0-63 touch those) but could corrupt the shared memory of neighboring blocks under high occupancy. Replaced with kvarn_wht_128(sh), the guarded store-path WHT. Since H_128 is symmetric and kvarn_wht_128 applies the 1/sqrt(128) normalization, the normalized transform is its own inverse, so the forward WHT is the correct inverse. 2. force-materialize null mat_idxs (major): self_kvarn_mat_idxs_swa was only built under !kvarn_force_materialize_enabled(), but the non-rotated (force-materialize) path still calls get_k/get_v -> materialize(swa=true, mat_idxs=nullptr), which dereferences indices->type and crashes. It is now built whenever the SWA cache is KVarN, independent of force-materialize. 3. SWA ring undersized (major): n_groups_per_stream = ceil(kv_size/128) was too small — the metadata window of kv_size cells spans ceil(kv_size/128)+1 tiles (a sliding window is rarely tile-aligned), so the oldest in-window tile's record slot collided with a newer tile, silently zeroing it. It is now ceil(kv_size/128)+2 for SWA, with a backstop assert documenting the invariant. 4. Vulkan SWA path (gap): kvarn_store.comp and kvarn_materialize.comp had no SWA support (linear group decode, group==0 sink, no swa push-constant). Vulkan advertises kvarn_native_ops, so SWA KVarN layers could offload to Vulkan and run the non-SWA shaders on absolute-position indices -> silent garbage. Added the swa push-constant, ring slot math, per-cell position decode, and empty-cell zeroing to both shaders, mirroring CPU/CUDA. Host dispatch reads op_params[4] (store) and op_params[6] (materialize) and asserts single-stream for SWA.
Verified: test-kvarn green (CPU+CUDA SWA parity, GPU SWA path now uses guarded WHT); llama-perplexity KLD on Gemma 4 31B Q5/16k/kvarn4 = 0.7296 (statistically identical to pre-fix 0.7305 — fixes resolve latent bugs without changing validated quality); GGML_KVARN_FORCE_MATERIALIZE=1 smoke on Gemma 4 31B generates coherent text (no crash). The Vulkan changes are untested: the Vulkan backend is not compiled in this CUDA-only build, so the shader and host-dispatch changes have not been built or run.
diff --git a/ggml/src/ggml-cuda/kvarn.cu b/ggml/src/ggml-cuda/kvarn.cu
@@ -1335,21 +1335,16 @@ static __global__ void kvarn_materialize_swa_kernel(
         return;
     }
 
-    // inverse WHT (128-dim) via shared-memory butterfly; mirrors kvarn_wht_128
+    // inverse WHT (128-dim): reuse the store path's shared-memory butterfly. It
+    // guards the butterfly to lanes < 64 (each lane handles one pair) and applies
+    // the 1/sqrt(128) normalization. Running the butterfly unguarded over all 128
+    // lanes (as an earlier inline version did) makes lanes 64..127 read/write
+    // sh[128..255] out of bounds on this float[128] array.
     __shared__ float sh[KVAR_N_DIM];
     sh[dim] = rotated;
-    __syncthreads();
-    for (int stride = 1; stride < KVAR_N_DIM; stride *= 2) {
-        const int j = (dim / stride) * (2 * stride) + (dim % stride);
-        const float a = sh[j];
-        const float b = sh[j + stride];
-        sh[j] = a + b;
-        sh[j + stride] = a - b;
-        __syncthreads();
-    }
-    const float out_val = sh[dim] * 0.08838834764831845f;
+    kvarn_wht_128(sh);
     half * out = dst + ((int64_t) cell * n_heads + head) * KVAR_N_DIM;
-    out[dim] = __float2half_rn(out_val);
+    out[dim] = __float2half_rn(sh[dim]);
 }
 
 template<int BITS, bool VALUE>
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1198,6 +1198,7 @@ struct vk_op_kvarn_store_push_constants {
     uint32_t bits;
     uint32_t iterations;
     uint32_t value;
+    uint32_t swa; // SWA sliding-window ring store (absolute-position indices, no sink)
 };
 static_assert(sizeof(vk_op_kvarn_store_push_constants) <= 128, "sizeof(vk_op_kvarn_store_push_constants) must be <= 128");
 
@@ -1212,6 +1213,7 @@ struct vk_op_kvarn_materialize_push_constants {
     uint32_t value;
     uint32_t n_indices;
     uint32_t emit_rotated;
+    uint32_t swa; // SWA sliding-window ring materialize (indices carry per-cell positions)
 };
 static_assert(sizeof(vk_op_kvarn_materialize_push_constants) <= 128, "sizeof(vk_op_kvarn_materialize_push_constants) must be <= 128");
 
@@ -9158,9 +9160,13 @@ static void ggml_vk_kvarn_store(ggml_backend_vk_context * ctx, vk_context& subct
     const int bits = ggml_get_op_params_i32(dst, 0);
     const int iterations = ggml_get_op_params_i32(dst, 1);
     const bool value = ggml_get_op_params_i32(dst, 2) != 0;
+    const bool swa = ggml_get_op_params_i32(dst, 4) != 0; // KVAR_N_OP_PARAM_STORE_SWA
     GGML_ASSERT(ggml_vk_kvarn_valid_bits(bits));
     const int n_stream = (int) (stage->ne[2] / 384);
     const int groups_per_stream = (int) (records->ne[2] / n_stream);
+    if (swa) {
+        GGML_ASSERT(n_stream == 1 && "SWA KVarN ring requires a single stream");
+    }
 
     vk_op_kvarn_store_push_constants pc = {
         (uint32_t) current->ne[1],
@@ -9171,6 +9177,7 @@ static void ggml_vk_kvarn_store(ggml_backend_vk_context * ctx, vk_context& subct
         (uint32_t) bits,
         (uint32_t) iterations,
         value ? 1u : 0u,
+        swa ? 1u : 0u,
     };
 
     const vk_subbuffer current_buf = ggml_vk_tensor_subbuffer(ctx, current);
@@ -9201,9 +9208,13 @@ static void ggml_vk_kvarn_materialize(ggml_backend_vk_context * ctx, vk_context&
     const int stream_start = ggml_get_op_params_i32(dst, 2);
     const int n_stream = ggml_get_op_params_i32(dst, 3);
     const bool emit_rotated = ggml_get_op_params_i32(dst, 5) != 0;
+    const bool swa = ggml_get_op_params_i32(dst, 6) != 0; // KVAR_N_OP_PARAM_MAT_SWA
     GGML_ASSERT(ggml_vk_kvarn_valid_bits(bits));
     const int n_total_stream = (int) (stage->ne[2] / 384);
     const int groups_per_stream = (int) (records->ne[2] / n_total_stream);
+    if (swa) {
+        GGML_ASSERT(n_stream == 1 && "SWA KVarN ring materialize requires a single stream");
+    }
 
     vk_op_kvarn_materialize_push_constants pc = {
         (uint32_t) dst->ne[1],
@@ -9216,6 +9227,7 @@ static void ggml_vk_kvarn_materialize(ggml_backend_vk_context * ctx, vk_context&
         value ? 1u : 0u,
         (uint32_t) indices->ne[0],
         emit_rotated ? 1u : 0u,
+        swa ? 1u : 0u,
     };
 
     const vk_subbuffer records_buf = ggml_vk_tensor_subbuffer(ctx, records);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/kvarn_materialize.comp b/ggml/src/ggml-vulkan/vulkan-shaders/kvarn_materialize.comp
@@ -13,6 +13,7 @@ layout(push_constant) uniform parameter {
     uint value;
     uint n_indices;
     uint emit_rotated;
+    uint swa; // SWA sliding-window ring materialize (indices carry per-cell positions)
 } p;
 
 layout(binding = 0, std430) readonly buffer Records { uint data_records[]; };
@@ -59,10 +60,18 @@ float load_axis_value(uint record_base, uint payload_words, uint axis, uint inde
 uint compute_live_group(uint stream, uint lane) {
     uint live_group = 0;
     for (uint i = lane; i < p.n_indices; i += KVAR_N_DIM) {
-        const uint group_global = read_index_low(i) / KVAR_N_DIM;
-        const uint idx_stream = group_global / p.groups_per_stream;
-        if (idx_stream == stream) {
-            live_group = max(live_group, group_global - stream * p.groups_per_stream);
+        if (p.swa != 0u) {
+            // SWA ring: indices carry absolute positions; negative marks empty cells
+            const int v = int(read_index_low(i));
+            if (v >= 0) {
+                live_group = max(live_group, uint(v) / KVAR_N_DIM);
+            }
+        } else {
+            const uint group_global = read_index_low(i) / KVAR_N_DIM;
+            const uint idx_stream = group_global / p.groups_per_stream;
+            if (idx_stream == stream) {
+                live_group = max(live_group, group_global - stream * p.groups_per_stream);
+            }
         }
     }
 
@@ -111,16 +120,40 @@ void main() {
     const uint stream = p.stream_start + out_stream;
 
     const uint live_group = compute_live_group(stream, lane);
-    const uint group = token / KVAR_N_DIM;
-    const uint pos = token % KVAR_N_DIM;
     const uint stage_base = stream * KVAR_N_DIM * KVAR_N_STAGE_GROUPS;
 
+    uint group;
+    uint pos;
+    bool from_stage;
+    bool from_record;
+    if (p.swa != 0u) {
+        // SWA ring: one absolute position per output cell; negative => empty cell.
+        const int abs_pos = int(read_index_low(token));
+        if (abs_pos < 0) {
+            sh_wht[lane] = 0.0;
+            barrier();
+            store_dst_pair(out_stream, token, head, lane);
+            return;
+        }
+        group = uint(abs_pos) / KVAR_N_DIM;
+        pos = uint(abs_pos) % KVAR_N_DIM;
+        from_stage  = (group + 1u >= live_group) && (group <= live_group);
+        from_record = (!from_stage) && (group < live_group) && (live_group - group < p.groups_per_stream);
+    } else {
+        group = token / KVAR_N_DIM;
+        pos = token % KVAR_N_DIM;
+        from_stage  = group == 0u || (group > 0u && group <= live_group && group + 1u >= live_group);
+        from_record = (!from_stage) && group < live_group && group < p.groups_per_stream;
+    }
+
     float x = 0.0;
-    if (group == 0 || (group > 0 && group <= live_group && group + 1 >= live_group)) {
-        const uint stage_pos = stage_base + (group == 0 ? pos : KVAR_N_DIM + (((group - 1) & 1) * KVAR_N_DIM) + pos);
+    if (from_stage) {
+        const uint stage_slot = p.swa != 0u ? (group % KVAR_N_STAGE_GROUPS) : (group == 0u ? 0u : 1u + ((group - 1u) & 1u));
+        const uint stage_pos = stage_base + stage_slot * KVAR_N_DIM + pos;
         x = load_stage_value(stage_pos, head, lane);
-    } else if (group < live_group && group < p.groups_per_stream) {
-        const uint record_group = stream * p.groups_per_stream + group;
+    } else if (from_record) {
+        const uint ring = p.swa != 0u ? (group % p.groups_per_stream) : group;
+        const uint record_group = stream * p.groups_per_stream + ring;
         const uint record_base = (record_group * p.n_heads + head) * p.record_words;
         const uint row = p.value != 0 ? pos : lane;
         const uint col = p.value != 0 ? lane : pos;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/kvarn_store.comp b/ggml/src/ggml-vulkan/vulkan-shaders/kvarn_store.comp
@@ -11,6 +11,7 @@ layout(push_constant) uniform parameter {
     uint bits;
     uint iterations;
     uint value;
+    uint swa; // SWA sliding-window ring store (absolute-position indices, no group-0 sink)
 } p;
 
 layout(binding = 0, std430) readonly buffer Current { float data_current[]; };
@@ -71,7 +72,9 @@ void wht_128(uint lane) {
 float stage_matrix_value(uint head, uint stage_base, uint stage_group, uint row, uint col) {
     const uint token = p.value != 0 ? row : col;
     const uint dim = p.value != 0 ? col : row;
-    const uint stage_pos = stage_base + KVAR_N_DIM + (((stage_group - 1) & 1) * KVAR_N_DIM) + token;
+    // SWA: 3-deep ping-pong over absolute tiles; non-SWA: tile 0 sink + slots 1/2.
+    const uint stage_slot = p.swa != 0u ? (stage_group % KVAR_N_STAGE_GROUPS) : (1u + ((stage_group - 1u) & 1u));
+    const uint stage_pos = stage_base + stage_slot * KVAR_N_DIM + token;
     return load_stage_value(stage_pos, head, dim);
 }
 
@@ -240,24 +243,28 @@ void main() {
     for (uint token = 0; token < p.n_tokens; ++token) {
         const uint idx = read_index_low(token);
         const uint group_global = idx / KVAR_N_DIM;
-        const uint stream = group_global / p.groups_per_stream;
-        const uint group = group_global - stream * p.groups_per_stream;
         const uint pos = idx % KVAR_N_DIM;
-        if (stream >= p.n_stream || group >= p.groups_per_stream) {
+        // SWA: idx is the absolute token position; records form a ring (single
+        // stream) and there is no permanent group-0 sink.
+        const uint stream = p.swa != 0u ? 0u : group_global / p.groups_per_stream;
+        const uint group = p.swa != 0u ? group_global : group_global - stream * p.groups_per_stream;
+        if (stream >= p.n_stream || (p.swa == 0u && group >= p.groups_per_stream)) {
             return;
         }
 
         const uint stage_base = stream * KVAR_N_DIM * KVAR_N_STAGE_GROUPS;
-        if (group > 2 && pos == 0) {
-            const uint flush_group = group - 2;
-            const uint flush_record_group = stream * p.groups_per_stream + flush_group;
+        if (pos == 0 && (p.swa != 0u ? group >= 2u : group > 2u)) {
+            const uint flush_group = group - 2u;
+            const uint flush_ring = p.swa != 0u ? flush_group % p.groups_per_stream : flush_group;
+            const uint flush_record_group = stream * p.groups_per_stream + flush_ring;
             const uint record_base = (flush_record_group * p.n_heads + head) * p.record_words;
             quantize_stage(head, stage_base, flush_group, record_base, lane);
         }
 
         sh_wht[lane] = data_current[(token * p.n_heads + head) * KVAR_N_DIM + lane];
         wht_128(lane);
-        const uint stage_pos = stage_base + (group == 0 ? pos : KVAR_N_DIM + (((group - 1) & 1) * KVAR_N_DIM) + pos);
+        const uint stage_slot = p.swa != 0u ? (group % KVAR_N_STAGE_GROUPS) : (group == 0u ? 0u : 1u + ((group - 1u) & 1u));
+        const uint stage_pos = stage_base + stage_slot * KVAR_N_DIM + pos;
         store_stage_pair(stage_pos, head, lane);
         barrier();
     }
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
@@ -2891,13 +2891,15 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
 
     inp->self_k_rot_swa = mctx_cur->get_swa()->build_input_k_rot(ctx0);
     inp->self_v_rot_swa = mctx_cur->get_swa()->build_input_v_rot(ctx0);
-    if (!kvarn_force_materialize_enabled()) {
-        if (const auto * kvarn_swa = dynamic_cast<const llama_kv_cache_kvarn_context *>(mctx_cur->get_swa())) {
-            inp->self_kvarn_mat_idxs_swa = kvarn_swa->build_input_kvarn_mat_idxs(ctx0);
-            // make the materialize indices available to the context at graph build time
-            // (get_k/get_v/materialize run during build, before set_input populates them)
-            const_cast<llama_kv_cache_kvarn_context *>(kvarn_swa)->set_mat_idxs(inp->self_kvarn_mat_idxs_swa);
-        }
+    // SWA KVarN materialize needs per-cell positions on BOTH the rotated and the
+    // force-materialize (non-rotated) paths, so build them whenever the SWA cache
+    // is a KVarN cache — independent of kvarn_force_materialize_enabled(). Omitting
+    // them under force-materialize left mat_idxs null and crashed in materialize().
+    if (const auto * kvarn_swa = dynamic_cast<const llama_kv_cache_kvarn_context *>(mctx_cur->get_swa())) {
+        inp->self_kvarn_mat_idxs_swa = kvarn_swa->build_input_kvarn_mat_idxs(ctx0);
+        // make the materialize indices available to the context at graph build time
+        // (get_k/get_v/materialize run during build, before set_input populates them)
+        const_cast<llama_kv_cache_kvarn_context *>(kvarn_swa)->set_mat_idxs(inp->self_kvarn_mat_idxs_swa);
     }
 
     return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
diff --git a/src/llama-kv-cache-kvarn.cpp b/src/llama-kv-cache-kvarn.cpp
@@ -414,7 +414,12 @@ llama_kv_cache_kvarn::llama_kv_cache_kvarn(
     hparams(hparams),
     params(params),
     n_stream(unified ? 1u : n_seq_max),
-    n_groups_per_stream((kv_size + KVAR_N_GROUP - 1) / KVAR_N_GROUP),
+    // SWA: the metadata window of up to kv_size cells spans kv_size/128 + 1 tiles
+    // (a sliding window is rarely tile-aligned), so the record ring needs 2 extra
+    // slots over the non-SWA count to represent the oldest in-window tile without
+    // a slot collision and to keep (live_group - group) < groups_per_stream.
+    n_groups_per_stream(((kv_size + KVAR_N_GROUP - 1) / KVAR_N_GROUP) +
+        ((n_swa > 0 && swa_type != LLAMA_SWA_TYPE_NONE) ? 2u : 0u)),
     swa(n_swa > 0 && swa_type != LLAMA_SWA_TYPE_NONE),
     metadata(std::make_unique<llama_kv_cache>(
         model,
@@ -437,6 +442,11 @@ llama_kv_cache_kvarn::llama_kv_cache_kvarn(
     GGML_ASSERT(swa || kv_size % KVAR_N_GROUP == 0);
     if (swa) {
         GGML_ASSERT(n_stream == 1 && "SWA KVarN ring requires a unified (single-stream) cache");
+        // Backstop for the ring-size invariant above: the record ring must have
+        // more slots than ceil(kv_size / KVAR_N_GROUP) (what this assert checks)
+        // so the oldest in-window tile still materializes from records.
+        GGML_ASSERT(n_groups_per_stream > (kv_size + KVAR_N_GROUP - 1) / KVAR_N_GROUP &&
+            "SWA KVarN record ring is too small for the sliding window");
     }
 
     struct buft_comparator {