enable graph reuse

am17an · am17an · commit 415c6d073156 · 2026-06-12T12:45:31.000+08:00
diff --git a/scripts/gen-chat-inline-templates.py b/scripts/gen-chat-inline-templates.py
@@ -84,17 +84,22 @@ def main() -> None:
         arch, rel = match.group(1), match.group(2)
         # read the template verbatim (no newline translation) so the embedded
         # string is a byte-for-byte copy of the source .jinja file
-        content = (repo_root / rel).read_text(encoding="utf-8", newline="")
+        # Path.read_text() only gained its newline parameter in Python 3.13
+        with open(repo_root / rel, encoding="utf-8", newline="") as f:
+            content = f.read()
         entries.append((arch, rel, content))
 
     text = render(entries)
 
     output = Path(args.output)
     # write only when the content changes to avoid spurious rebuilds
-    if output.exists() and output.read_text(encoding="utf-8", newline="") == text:
-        return
+    if output.exists():
+        with open(output, encoding="utf-8", newline="") as f:
+            if f.read() == text:
+                return
     output.parent.mkdir(parents=True, exist_ok=True)
-    output.write_text(text, encoding="utf-8", newline="")
+    with open(output, "w", encoding="utf-8", newline="") as f:
+        f.write(text)
 
 
 if __name__ == "__main__":
diff --git a/src/llama-kv-cache-dsv4.cpp b/src/llama-kv-cache-dsv4.cpp
@@ -17,6 +17,7 @@
 
 static constexpr uint32_t DSV4_CSA_RATIO = 4;
 static constexpr uint32_t DSV4_HCA_RATIO = 128;
+static constexpr uint32_t DSV4_CSA_GRAPH_RAW_BUCKET = DSV4_HCA_RATIO;
 
 static constexpr uint32_t DSV4_STATE_MAGIC         = 0x34565344; // DSV4
 static constexpr uint32_t DSV4_STATE_VERSION       = 1;
@@ -226,6 +227,7 @@ static llama_kv_cache_dsv4_context::comp_plan dsv4_build_comp_plan(
     };
 
     std::vector<persist_row> persist_rows;
+    llama_pos max_pos = -1;
 
     // For the overlap compressor, build_overlap_compressed_kv_from_state() consumes
     // state_read_idxs as two contiguous halves: the first ratio*n_blocks entries are
@@ -272,6 +274,7 @@ static llama_kv_cache_dsv4_context::comp_plan dsv4_build_comp_plan(
         }
 
         const llama_seq_id seq_id = ubatch.seq_id[i][0];
+        max_pos = std::max(max_pos, pos);
 
         const int64_t stream_off = n_stream > 1 ? (int64_t) seq_id*state_size : 0;
 
@@ -323,6 +326,36 @@ static llama_kv_cache_dsv4_context::comp_plan dsv4_build_comp_plan(
         }
     }
 
+    if (ratio == DSV4_CSA_RATIO && plan.state_write_idxs.empty() && !plan.state_idxs.empty()) {
+        assert(kv_size > 0);
+
+        uint32_t i = 0;
+        while (i < ubatch.n_tokens && ubatch.pos[i] < 0) {
+            ++i;
+        }
+        assert(i < ubatch.n_tokens);
+
+        const llama_pos    pos    = ubatch.pos[i];
+        const llama_seq_id seq_id = ubatch.seq_id[i][0];
+        const int64_t cache_off = n_stream > 1 && seq_id >= 0 ? (int64_t) seq_id*kv_size : 0;
+        const int32_t source_idx = state_source_idx(seq_id, pos);
+
+        plan.state_write_idxs.push_back(cache_off + kv_size - 1);
+        plan.state_write_pos .push_back(0);
+        plan.state_write_end .push_back(-1);
+
+        if (overlap) {
+            for (uint32_t j = 0; j < ratio; ++j) {
+                overlap_prev_reads.push_back(source_idx);
+                overlap_cur_reads .push_back(source_idx);
+            }
+        } else {
+            for (uint32_t j = 0; j < ratio; ++j) {
+                plan.state_read_idxs.push_back(source_idx);
+            }
+        }
+    }
+
     if (overlap) {
         // [ all blocks' prev-window indices | all blocks' cur-window indices ]
         plan.state_read_idxs.reserve(overlap_prev_reads.size() + overlap_cur_reads.size());
@@ -332,6 +365,19 @@ static llama_kv_cache_dsv4_context::comp_plan dsv4_build_comp_plan(
                 overlap_cur_reads.begin(), overlap_cur_reads.end());
     }
 
+    if (ratio == DSV4_CSA_RATIO && max_pos >= 0) {
+        const int64_t raw_bucket = DSV4_CSA_GRAPH_RAW_BUCKET;
+        const int64_t pos_p1     = max_pos + 1;
+        int64_t n_raw_buckets    = (pos_p1 + raw_bucket - 1)/raw_bucket;
+        if (pos_p1 % raw_bucket == 0) {
+            ++n_raw_buckets;
+        }
+
+        const int64_t bucketed_tokens = n_raw_buckets * raw_bucket;
+        const int64_t bucketed_n_kv   = (bucketed_tokens + ratio - 1)/ratio;
+        plan.n_kv = std::min<int64_t>(kv_size, std::max<int64_t>(plan.n_kv, bucketed_n_kv));
+    }
+
     std::sort(persist_rows.begin(), persist_rows.end(),
             [](const persist_row & a, const persist_row & b) {
                 return a.dst < b.dst;
diff --git a/src/llama-kv-cache-dsv4.h b/src/llama-kv-cache-dsv4.h
@@ -175,6 +175,7 @@ class llama_kv_cache_dsv4_context : public llama_memory_context_i {
         std::vector<int32_t> state_read_idxs;
 
         // Final compressed-cache row ids written by state-backed commits.
+        // A non-boundary CSA/LID decode step can target a masked scratch row.
         std::vector<int64_t> state_write_idxs;
 
         // RoPE positions for state-backed commits.
@@ -186,7 +187,8 @@ class llama_kv_cache_dsv4_context : public llama_memory_context_i {
         // Number of completed compressed rows visible for each query token.
         std::vector<int32_t> n_visible;
 
-        // Maximum compressed rows visible to this ubatch.
+        // Graph-width for compressed rows. This can be larger than n_visible
+        // so masked padding rows do not force a new graph at every CSA block.
         int64_t n_kv = 0;
     };
 
diff --git a/src/models/deepseek-v4.cpp b/src/models/deepseek-v4.cpp
@@ -894,31 +894,29 @@ ggml_tensor * llama_model_deepseek4::graph::build_attention(
         csa_state_score = ggml_add(ctx0, csa_state_score, csa_ape_rows);
         cb(csa_state_score, "csa_state_score_ape", il);
 
-        ggml_tensor * csa_state_dep = nullptr;
-        if (inp_dsv4->get_csa().state_write_idxs) {
-            ggml_tensor * csa_source_kv = ggml_concat(ctx0,
-                    inp_dsv4->mctx->get_csa_state()->get_kv(ctx0, il), csa_state_kv, 1);
-            ggml_tensor * csa_source_score = ggml_concat(ctx0,
-                    inp_dsv4->mctx->get_csa_state()->get_score(ctx0, il), csa_state_score, 1);
-
-            ggml_tensor * kv_comp_csa_state = build_overlap_compressed_kv_from_state(
-                    csa_source_kv,
-                    csa_source_score,
-                    inp_dsv4->get_csa().state_read_idxs,
-                    inp_dsv4->get_csa().state_write_pos,
-                    layer.attn_comp_norm,
-                    DSV4_CSA_RATIO,
-                    n_embd_head,
-                    "csa_state_compress",
-                    il);
-
-            ggml_build_forward_expand(gf, inp_dsv4->mctx->get_csa()->cpy_k(ctx0,
-                        kv_comp_csa_state, inp_dsv4->get_csa().state_write_idxs, il));
-            csa_state_dep = kv_comp_csa_state;
-        }
+        GGML_ASSERT(inp_dsv4->get_csa().state_write_idxs);
+
+        ggml_tensor * csa_source_kv = ggml_concat(ctx0,
+                inp_dsv4->mctx->get_csa_state()->get_kv(ctx0, il), csa_state_kv, 1);
+        ggml_tensor * csa_source_score = ggml_concat(ctx0,
+                inp_dsv4->mctx->get_csa_state()->get_score(ctx0, il), csa_state_score, 1);
+
+        ggml_tensor * kv_comp_csa_state = build_overlap_compressed_kv_from_state(
+                csa_source_kv,
+                csa_source_score,
+                inp_dsv4->get_csa().state_read_idxs,
+                inp_dsv4->get_csa().state_write_pos,
+                layer.attn_comp_norm,
+                DSV4_CSA_RATIO,
+                n_embd_head,
+                "csa_state_compress",
+                il);
 
-        csa_state_kv    = dsv4_with_zero_dep(ctx0, csa_state_kv,    csa_state_dep);
-        csa_state_score = dsv4_with_zero_dep(ctx0, csa_state_score, csa_state_dep);
+        ggml_build_forward_expand(gf, inp_dsv4->mctx->get_csa()->cpy_k(ctx0,
+                    kv_comp_csa_state, inp_dsv4->get_csa().state_write_idxs, il));
+
+        csa_state_kv    = dsv4_with_zero_dep(ctx0, csa_state_kv,    kv_comp_csa_state);
+        csa_state_score = dsv4_with_zero_dep(ctx0, csa_state_score, kv_comp_csa_state);
 
         ggml_tensor * csa_persist_kv = ggml_get_rows(ctx0, csa_state_kv, inp_dsv4->get_csa().state_persist_src_idxs);
         ggml_tensor * csa_persist_score = ggml_get_rows(ctx0, csa_state_score, inp_dsv4->get_csa().state_persist_src_idxs);
@@ -946,36 +944,34 @@ ggml_tensor * llama_model_deepseek4::graph::build_attention(
         lid_state_score = ggml_add(ctx0, lid_state_score, lid_ape_rows);
         cb(lid_state_score, "lid_state_score_ape", il);
 
-        ggml_tensor * lid_state_dep = nullptr;
-        if (inp_dsv4->get_lid().state_write_idxs) {
-            ggml_tensor * lid_source_kv = ggml_concat(ctx0,
-                    inp_dsv4->mctx->get_lid_state()->get_kv(ctx0, il), lid_state_kv, 1);
-            ggml_tensor * lid_source_score = ggml_concat(ctx0,
-                    inp_dsv4->mctx->get_lid_state()->get_score(ctx0, il), lid_state_score, 1);
-
-            ggml_tensor * kv_comp_lid_state = build_overlap_compressed_kv_from_state(
-                    lid_source_kv,
-                    lid_source_score,
-                    inp_dsv4->get_lid().state_read_idxs,
-                    inp_dsv4->get_lid().state_write_pos,
-                    layer.indexer_comp_norm,
-                    DSV4_CSA_RATIO,
-                    hparams.indexer_head_size,
-                    "lid_state_compress",
-                    il);
-
-            if (inp_dsv4->get_lid().k_rot) {
-                kv_comp_lid_state = ggml_mul_mat(ctx0, inp_dsv4->get_lid().k_rot, kv_comp_lid_state);
-                cb(kv_comp_lid_state, "lid_state_compress_rot", il);
-            }
+        GGML_ASSERT(inp_dsv4->get_lid().state_write_idxs);
+
+        ggml_tensor * lid_source_kv = ggml_concat(ctx0,
+                inp_dsv4->mctx->get_lid_state()->get_kv(ctx0, il), lid_state_kv, 1);
+        ggml_tensor * lid_source_score = ggml_concat(ctx0,
+                inp_dsv4->mctx->get_lid_state()->get_score(ctx0, il), lid_state_score, 1);
+
+        ggml_tensor * kv_comp_lid_state = build_overlap_compressed_kv_from_state(
+                lid_source_kv,
+                lid_source_score,
+                inp_dsv4->get_lid().state_read_idxs,
+                inp_dsv4->get_lid().state_write_pos,
+                layer.indexer_comp_norm,
+                DSV4_CSA_RATIO,
+                hparams.indexer_head_size,
+                "lid_state_compress",
+                il);
 
-            ggml_build_forward_expand(gf, inp_dsv4->mctx->get_lid()->cpy_k(ctx0,
-                        kv_comp_lid_state, inp_dsv4->get_lid().state_write_idxs, il));
-            lid_state_dep = kv_comp_lid_state;
+        if (inp_dsv4->get_lid().k_rot) {
+            kv_comp_lid_state = ggml_mul_mat(ctx0, inp_dsv4->get_lid().k_rot, kv_comp_lid_state);
+            cb(kv_comp_lid_state, "lid_state_compress_rot", il);
         }
 
-        lid_state_kv    = dsv4_with_zero_dep(ctx0, lid_state_kv,    lid_state_dep);
-        lid_state_score = dsv4_with_zero_dep(ctx0, lid_state_score, lid_state_dep);
+        ggml_build_forward_expand(gf, inp_dsv4->mctx->get_lid()->cpy_k(ctx0,
+                    kv_comp_lid_state, inp_dsv4->get_lid().state_write_idxs, il));
+
+        lid_state_kv    = dsv4_with_zero_dep(ctx0, lid_state_kv,    kv_comp_lid_state);
+        lid_state_score = dsv4_with_zero_dep(ctx0, lid_state_score, kv_comp_lid_state);
 
         ggml_tensor * lid_persist_kv = ggml_get_rows(ctx0, lid_state_kv, inp_dsv4->get_lid().state_persist_src_idxs);
         ggml_tensor * lid_persist_score = ggml_get_rows(ctx0, lid_state_score, inp_dsv4->get_lid().state_persist_src_idxs);