fix(qwen35moe): size KV reservation by n_full + cache-type, not n_layer×f16 (#454)

dusterbloom · web-flow · commit cd8b0654bbd7 · 2026-06-26T15:27:40.000+02:00
Only n_full = n_layer/full_attention_interval layers carry a KV cache (the rest are O(1)-state SSM/DeltaNet); honoring that plus the resolved q4_0 cache type cuts the placement reservation ~14x (25 -> 1.76 GiB @131k), keeping experts all-hot at deep context instead of forcing the slow hybrid spec path. Extract a shared kv_reservation_bytes_per_token() helper (one source of truth for qwen35 + qwen35moe) and add a unit test pinning n_full + cache-type vs the old form.
diff --git a/server/src/kv_quant.h b/server/src/kv_quant.h
@@ -1,5 +1,6 @@
 #pragma once
 #include "ggml.h"
+#include <cstdint>
 #include <string>
 
 namespace dflash {
@@ -27,4 +28,21 @@ bool is_supported_kv_pair(ggml_type k, ggml_type v);
 // and calls std::abort(). Returns the resolved pair via out params.
 void resolve_kv_types(ggml_type & k_out, ggml_type & v_out);
 
+// Bytes of KV cache to reserve per token for a hybrid attention/SSM model; the
+// shared formula behind qwen35 (dense) and qwen35moe expert-placement budgeting.
+// Only the full-attention layers carry a KV cache — the rest are O(1)-state
+// SSM/DeltaNet — so count n_full = n_layer / full_attention_interval (integer
+// division) and size rows by the resolved cache types (q4_0 ≪ f16). n_layer or a
+// hardcoded f16 over-reserves KV (the old load_dynamic_placement qwen35moe bug).
+inline uint64_t kv_reservation_bytes_per_token(
+        int n_layer, int full_attention_interval, int n_head_kv,
+        ggml_type kv_k, int n_embd_head_k,
+        ggml_type kv_v, int n_embd_head_v) {
+    const int n_full = (full_attention_interval > 0)  // interval <= 0: no division,
+        ? (n_layer / full_attention_interval) : n_layer;  // count every layer instead
+    return (uint64_t)n_full * (uint64_t)n_head_kv *
+        (uint64_t)(ggml_row_size(kv_k, n_embd_head_k) +  // K row bytes for one head
+                   ggml_row_size(kv_v, n_embd_head_v));  // V row bytes for one head
+}
+
 }  // namespace dflash
diff --git a/server/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
@@ -156,12 +156,12 @@ KvFlashAutoBudget Qwen35Backend::make_kvflash_budget(const TargetWeights & w,
                                                      int64_t gpu_free) const {
     ggml_type kv_k = GGML_TYPE_Q8_0, kv_v = GGML_TYPE_Q8_0;
     dflash::resolve_kv_types(kv_k, kv_v);
-    const int n_full = w.n_layer / w.full_attention_interval;
     KvFlashAutoBudget b;
     b.free_bytes      = gpu_free;
-    b.bytes_per_token = (int64_t)n_full * w.n_head_kv *
-        (int64_t)(ggml_row_size(kv_k, w.n_embd_head_k) +
-                  ggml_row_size(kv_v, w.n_embd_head_v));
+    // Single source of truth with the qwen35moe placement path — see kv_quant.h.
+    b.bytes_per_token = (int64_t)dflash::kv_reservation_bytes_per_token(
+        w.n_layer, w.full_attention_interval, w.n_head_kv,
+        kv_k, w.n_embd_head_k, kv_v, w.n_embd_head_v);
     b.reserve_bytes   = (int64_t)(1.5 * 1073741824.0) +
         (kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0));
     return b;
diff --git a/server/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp
@@ -11,6 +11,7 @@
 #include "dflash_draft_graph.h"
 #include "dflash_feature_ring.h"
 #include "graph_builders.h"
+#include "kv_quant.h"
 
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -2163,9 +2164,16 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
     int max_context = ctx_env ? std::atoi(ctx_env) : cfg_.device.max_ctx;
     if (max_context <= 0) max_context = 8192;
 
-    // KV cache: n_layer × 2 (K+V) × n_head_kv × head_dim × sizeof(fp16) × max_context
-    const uint64_t kv_bytes_per_tok = (uint64_t)w.n_layer * 2 *
-        (uint64_t)w.n_head_kv * (uint64_t)w.n_embd_head_k * 2;
+    // KV reservation: only full-attention layers carry KV (the rest are O(1)-state
+    // SSM/DeltaNet), and honor the resolved cache type. The old n_layer ×
+    // hardcoded-f16 form over-reserved ~14× at deep ctx (25 GiB vs 1.76 GiB @131K),
+    // forcing experts cold → the slow hybrid spec path. Shared helper / single
+    // source of truth with the dense backend — see kv_quant.h.
+    ggml_type kv_k_t = GGML_TYPE_Q8_0, kv_v_t = GGML_TYPE_Q8_0;
+    dflash::resolve_kv_types(kv_k_t, kv_v_t);
+    const uint64_t kv_bytes_per_tok = dflash::kv_reservation_bytes_per_token(
+        w.n_layer, w.full_attention_interval, w.n_head_kv,
+        kv_k_t, w.n_embd_head_k, kv_v_t, w.n_embd_head_v);
     // Size the reservation with the SAME inputs runtime uses (scorer policy +
     // VRAM budget); the bare-max_context call took the no-budget fallback
     // (max_ctx/2) and over-reserved KV, starving experts of hot placement.
diff --git a/server/test/test_kv_quant.cpp b/server/test/test_kv_quant.cpp
@@ -158,6 +158,37 @@ static void t4_tq3_shorthand() {
     std::puts("T4 PASS");
 }
 
+// ─── T5: hybrid KV reservation uses n_full + cache-type (not n_layer × f16) ──
+// The old Qwen35MoeBackend::load_dynamic_placement formula reserved n_layer(40)
+// × hardcoded f16, over-reserving ~14× and forcing experts cold. The shared
+// helper must count only full-attention layers (n_full) and honor the cache type.
+static void t5_kv_reservation() {
+    // Qwen3.6-35B-A3B geometry: 40 layers, full_attention_interval=4 → n_full=10.
+    const int n_layer = 40, interval = 4, n_head_kv = 10, dk = 128, dv = 128;
+    const uint64_t got = dflash::kv_reservation_bytes_per_token(
+        n_layer, interval, n_head_kv, GGML_TYPE_Q4_0, dk, GGML_TYPE_Q4_0, dv);
+    const uint64_t expect = (uint64_t)(n_layer / interval) * (uint64_t)n_head_kv *
+        (uint64_t)(ggml_row_size(GGML_TYPE_Q4_0, dk) + ggml_row_size(GGML_TYPE_Q4_0, dv));
+    assert(got == expect);
+
+    // Regression guard vs the old form: n_layer × 2 (K+V) × n_head_kv × dk × 2 B (f16).
+    const uint64_t buggy = (uint64_t)n_layer * 2 * (uint64_t)n_head_kv * (uint64_t)dk * 2;
+    assert(got < buggy / 3);  // 4× fewer layers and q4_0 < f16 → well over 3× smaller
+
+    // Cache type is honored: the q4_0 reservation is strictly smaller than f16.
+    const uint64_t f16 = dflash::kv_reservation_bytes_per_token(
+        n_layer, interval, n_head_kv, GGML_TYPE_F16, dk, GGML_TYPE_F16, dv);
+    assert(got < f16);
+
+    // full_attention_interval=0 guard: no div-by-zero, falls back to n_layer.
+    const uint64_t fallback = dflash::kv_reservation_bytes_per_token(
+        n_layer, 0, n_head_kv, GGML_TYPE_Q4_0, dk, GGML_TYPE_Q4_0, dv);
+    assert(fallback == (uint64_t)n_layer * (uint64_t)n_head_kv *
+        (uint64_t)(ggml_row_size(GGML_TYPE_Q4_0, dk) + ggml_row_size(GGML_TYPE_Q4_0, dv)));
+
+    std::puts("T5 PASS");
+}
+
 // ─── main ────────────────────────────────────────────────────────────────────
 
 int main() {
@@ -167,6 +198,7 @@ int main() {
     t2_resolve_kv_types();
     t3_is_supported_kv_pair();
     t4_tq3_shorthand();
+    t5_kv_reservation();
 
     std::puts("ALL TESTS PASS");
     return 0;