Fix KVarN workspace store and compact checkpoints

Anbeeld · Anbeeld · commit 4caa0a4641aa · 2026-06-15T19:24:38.000+02:00
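Choose the source of each flush-group element per token instead of per group: a token is read from the workspace only when its local position lies inside the current ubatch range. Flush groups that are split across a ubatch boundary therefore no longer pull stale stage tails in the CUDA KVarN store path.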

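Serialize KVarN partial checkpoints as stage-only overlays while preserving full record serialization for portable/full cache state, matching the compact partial-state shape used by normal quant caches. This bumps the KVarN state version to 4 and writes a state-kind field after the saved stream list; when reading, states with a version below 4 are treated as full-record states.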
diff --git a/ggml/src/ggml-cuda/kvarn.cu b/ggml/src/ggml-cuda/kvarn.cu
@@ -1140,15 +1140,14 @@ static __global__ void kvarn_store_workspace_flush_kernel(
 
     const int flush_start = flush_group * KVAR_N_DIM;
     const int stage_base = stream * KVAR_N_DIM * KVAR_N_STAGE_GROUPS;
-    const bool from_workspace = flush_start >= start_local && flush_start + KVAR_N_DIM <= end_local;
     float * tile = shared;
     for (int i = threadIdx.x; i < KVAR_N_TILE_VALUES; i += blockDim.x) {
         const int row = i / KVAR_N_DIM;
         const int col = i % KVAR_N_DIM;
         const int token = value ? row : col;
         const int dim = value ? col : row;
-        if (from_workspace) {
-            const int local_pos = flush_start + token;
+        const int local_pos = flush_start + token;
+        if (local_pos >= start_local && local_pos < end_local) {
             const int src_token = token_base + local_pos - start_local;
             tile[i] = __half2float(workspace[((int64_t) src_token * n_heads + head) * KVAR_N_DIM + dim]);
         } else {
diff --git a/src/llama-kv-cache-kvarn.cpp b/src/llama-kv-cache-kvarn.cpp
@@ -19,7 +19,9 @@ namespace {
 constexpr uint32_t KVAR_N_GROUP = 128;
 constexpr uint32_t KVAR_N_STAGE_GROUPS = 3;
 constexpr uint32_t KVAR_N_STATE_MAGIC = 0x4e52564b; // "KVRN"
-constexpr uint32_t KVAR_N_STATE_VERSION = 3;
+constexpr uint32_t KVAR_N_STATE_VERSION = 4;
+constexpr uint32_t KVAR_N_STATE_RECORDS_FULL = 0;
+constexpr uint32_t KVAR_N_STATE_STAGE_ONLY_PARTIAL = 1;
 
 bool kvarn_backend_supports_native_ops(ggml_backend_dev_t dev) {
     if (dev == nullptr || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
@@ -694,6 +696,7 @@ bool llama_kv_cache_kvarn::apply_pending_stream_copies(llama_context * lctx) {
 
 void llama_kv_cache_kvarn::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
     metadata->state_write(io, seq_id, flags);
+    const bool partial_state = (flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) != 0 && seq_id >= 0;
 
     std::vector<uint32_t> saved_streams;
     if (seq_id == -1) {
@@ -718,6 +721,8 @@ void llama_kv_cache_kvarn::state_write(llama_io_write_i & io, llama_seq_id seq_i
     for (const uint32_t stream : saved_streams) {
         io.write(&stream, sizeof(stream));
     }
+    const uint32_t state_kind = partial_state ? KVAR_N_STATE_STAGE_ONLY_PARTIAL : KVAR_N_STATE_RECORDS_FULL;
+    io.write(&state_kind, sizeof(state_kind));
 
     // n_groups_used is single-valued across all saved streams. This is correct
     // because when seq_id >= 0, saved_streams has exactly 1 entry (the stream
@@ -743,10 +748,12 @@ void llama_kv_cache_kvarn::state_write(llama_io_write_i & io, llama_seq_id seq_i
         for (const uint32_t stream : saved_streams) {
             io.write(&stream, sizeof(stream));
 
-            const size_t k_records_used = n_groups_used * layer.k_records_stream[stream]->nb[2];
-            const size_t v_records_used = n_groups_used * layer.v_records_stream[stream]->nb[2];
-            write_kvarn_tensor_slice(io, layer.k_records_stream[stream], 0, k_records_used);
-            write_kvarn_tensor_slice(io, layer.v_records_stream[stream], 0, v_records_used);
+            if (state_kind == KVAR_N_STATE_RECORDS_FULL) {
+                const size_t k_records_used = n_groups_used * layer.k_records_stream[stream]->nb[2];
+                const size_t v_records_used = n_groups_used * layer.v_records_stream[stream]->nb[2];
+                write_kvarn_tensor_slice(io, layer.k_records_stream[stream], 0, k_records_used);
+                write_kvarn_tensor_slice(io, layer.v_records_stream[stream], 0, v_records_used);
+            }
             write_kvarn_tensor(io, layer.k_stage_stream[stream]);
             write_kvarn_tensor(io, layer.v_stage_stream[stream]);
         }
@@ -798,6 +805,14 @@ void llama_kv_cache_kvarn::state_read(llama_io_read_i & io, llama_seq_id seq_id,
         }
     }
 
+    uint32_t state_kind = KVAR_N_STATE_RECORDS_FULL;
+    if (version >= 4) {
+        io.read(&state_kind, sizeof(state_kind));
+        if (state_kind != KVAR_N_STATE_RECORDS_FULL && state_kind != KVAR_N_STATE_STAGE_ONLY_PARTIAL) {
+            throw std::runtime_error("invalid KVarN cache state kind");
+        }
+    }
+
     uint32_t n_groups_used = n_groups_per_stream;
     if (version >= 3) {
         io.read(&n_groups_used, sizeof(n_groups_used));
@@ -810,6 +825,11 @@ void llama_kv_cache_kvarn::state_read(llama_io_read_i & io, llama_seq_id seq_id,
     if (seq_id != -1 && seq_stream >= n_stream) {
         throw std::runtime_error("invalid KVarN sequence stream");
     }
+    if (state_kind == KVAR_N_STATE_STAGE_ONLY_PARTIAL) {
+        if (seq_id < 0) {
+            throw std::runtime_error("KVarN stage-only state requires a destination sequence");
+        }
+    }
 
     for (const auto & layer : layers) {
         uint32_t il;
@@ -833,11 +853,13 @@ void llama_kv_cache_kvarn::state_read(llama_io_read_i & io, llama_seq_id seq_id,
                 const size_t k_records_total = n_groups_per_stream * layer.k_records_stream[stream_dst]->nb[2];
                 const size_t v_records_total = n_groups_per_stream * layer.v_records_stream[stream_dst]->nb[2];
 
-                read_kvarn_tensor_slice(io, layer.k_records_stream[stream_dst], 0, k_records_used);
-                zero_kvarn_tensor_range(layer.k_records_stream[stream_dst], k_records_used, k_records_total - k_records_used);
+                if (state_kind == KVAR_N_STATE_RECORDS_FULL) {
+                    read_kvarn_tensor_slice(io, layer.k_records_stream[stream_dst], 0, k_records_used);
+                    zero_kvarn_tensor_range(layer.k_records_stream[stream_dst], k_records_used, k_records_total - k_records_used);
 
-                read_kvarn_tensor_slice(io, layer.v_records_stream[stream_dst], 0, v_records_used);
-                zero_kvarn_tensor_range(layer.v_records_stream[stream_dst], v_records_used, v_records_total - v_records_used);
+                    read_kvarn_tensor_slice(io, layer.v_records_stream[stream_dst], 0, v_records_used);
+                    zero_kvarn_tensor_range(layer.v_records_stream[stream_dst], v_records_used, v_records_total - v_records_used);
+                }
 
                 read_kvarn_tensor(io, layer.k_stage_stream[stream_dst]);
                 read_kvarn_tensor(io, layer.v_stage_stream[stream_dst]);
diff --git a/tests/test-dflash-plumbing.cpp b/tests/test-dflash-plumbing.cpp
@@ -2620,17 +2620,22 @@ int main(int argc, char ** argv) {
     ok &= expect(kv_cache_kvarn_cpp.find("GGML_ABORT(\"KVarN does not support position shifts\")") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("GGML_ABORT(\"KVarN does not support position division\")") != std::string::npos,
         "KVarN seq_add/seq_div must fail fast instead of logging and continuing");
-    ok &= expect(kv_cache_kvarn_cpp.find("constexpr uint32_t KVAR_N_STATE_VERSION = 3") != std::string::npos &&
+    ok &= expect(kv_cache_kvarn_cpp.find("constexpr uint32_t KVAR_N_STATE_VERSION = 4") != std::string::npos &&
+                 kv_cache_kvarn_cpp.find("KVAR_N_STATE_RECORDS_FULL") != std::string::npos &&
+                 kv_cache_kvarn_cpp.find("KVAR_N_STATE_STAGE_ONLY_PARTIAL") != std::string::npos &&
+                 kv_cache_kvarn_cpp.find("const uint32_t state_kind = partial_state ? KVAR_N_STATE_STAGE_ONLY_PARTIAL : KVAR_N_STATE_RECORDS_FULL;") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("saved_streams") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("metadata->get_stream_for_seq(seq_id)") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("layer.k_records_stream[stream]") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("layer.v_stage_stream[stream]") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("n_groups_used") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("write_kvarn_tensor_slice") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("read_kvarn_tensor_slice") != std::string::npos &&
+                 kv_cache_kvarn_cpp.find("if (state_kind == KVAR_N_STATE_RECORDS_FULL)") != std::string::npos &&
+                 kv_cache_kvarn_cpp.find("if (state_kind == KVAR_N_STATE_STAGE_ONLY_PARTIAL)") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("ggml_backend_tensor_memset") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("const uint64_t size64 = size") != std::string::npos,
-        "KVarN sequence state must serialize stream-scoped tensors with group-range compression");
+        "KVarN sequence state must serialize full records compactly and partial checkpoints as stage-only overlays");
     ok &= expect(kv_cache_kvarn_cpp.find("pending_stream_copies") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("llama_synchronize(lctx)") != std::string::npos &&
                  kv_cache_kvarn_cpp.find("copy_kvarn_stream") != std::string::npos,
diff --git a/tests/test-kvarn.cpp b/tests/test-kvarn.cpp
@@ -661,6 +661,16 @@ static void test_store_legacy_parity_gpu() {
         }
     }
 
+    for (int bits : { 2, 3, 4, 5, 6, 8 }) {
+        for (bool value : { false, true }) {
+            const std::vector<uint8_t> modern = test_store_records(
+                    backend, bits, value, false, 1, 2, 512, 200, false, true);
+            const std::vector<uint8_t> legacy = test_store_records(
+                    backend, bits, value, true, 1, 2, 512, 200, false, true);
+            require(modern == legacy, "KVarN CUDA split workspace store records differ from legacy path");
+        }
+    }
+
     for (int bits : { 2, 3, 4, 5, 6, 8 }) {
         for (bool value : { false, true }) {
             const std::vector<uint8_t> modern = test_store_records(