Skip to content

Commit cd8b065

Browse files
authored
fix(qwen35moe): size KV reservation by n_full + cache-type, not n_layer×f16 (#454)
Only n_full = n_layer/full_attention_interval layers carry a KV cache (the rest are O(1)-state SSM/DeltaNet); honoring that plus the resolved q4_0 cache type cuts the placement reservation ~14x (25 -> 1.76 GiB @131k), keeping experts all-hot at deep context instead of forcing the slow hybrid spec path. Extract a shared kv_reservation_bytes_per_token() helper (one source of truth for qwen35 + qwen35moe) and add a unit test pinning n_full + cache-type vs the old form.
1 parent 55a205d commit cd8b065

4 files changed

Lines changed: 65 additions & 7 deletions

File tree

server/src/kv_quant.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22
#include "ggml.h"
3+
#include <cstdint>
34
#include <string>
45

56
namespace dflash {
@@ -27,4 +28,21 @@ bool is_supported_kv_pair(ggml_type k, ggml_type v);
2728
// and calls std::abort(). Returns the resolved pair via out params.
2829
void resolve_kv_types(ggml_type & k_out, ggml_type & v_out);
2930

31+
// KV reservation bytes per token for a hybrid attention/SSM model. Single source
32+
// of truth for both qwen35 (dense) and qwen35moe expert-placement budgeting.
33+
// Only the full-attention layers carry a KV cache — the rest are O(1)-state
34+
// SSM/DeltaNet — so count n_full = n_layer / full_attention_interval, and honor
35+
// the resolved cache element type (q4_0 ≪ f16). Using n_layer or a hardcoded f16
36+
// over-reserves KV and falsely forces experts cold (the qwen35moe:2595 bug).
37+
inline uint64_t kv_reservation_bytes_per_token(
38+
int n_layer, int full_attention_interval, int n_head_kv,
39+
ggml_type kv_k, int n_embd_head_k,
40+
ggml_type kv_v, int n_embd_head_v) {
41+
const int n_full = (full_attention_interval > 0)
42+
? (n_layer / full_attention_interval) : n_layer;
43+
return (uint64_t)n_full * (uint64_t)n_head_kv *
44+
(uint64_t)(ggml_row_size(kv_k, n_embd_head_k) +
45+
ggml_row_size(kv_v, n_embd_head_v));
46+
}
47+
3048
} // namespace dflash

server/src/qwen35/qwen35_backend.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -156,12 +156,12 @@ KvFlashAutoBudget Qwen35Backend::make_kvflash_budget(const TargetWeights & w,
156156
int64_t gpu_free) const {
157157
ggml_type kv_k = GGML_TYPE_Q8_0, kv_v = GGML_TYPE_Q8_0;
158158
dflash::resolve_kv_types(kv_k, kv_v);
159-
const int n_full = w.n_layer / w.full_attention_interval;
160159
KvFlashAutoBudget b;
161160
b.free_bytes = gpu_free;
162-
b.bytes_per_token = (int64_t)n_full * w.n_head_kv *
163-
(int64_t)(ggml_row_size(kv_k, w.n_embd_head_k) +
164-
ggml_row_size(kv_v, w.n_embd_head_v));
161+
// Single source of truth with the qwen35moe placement path — see kv_quant.h.
162+
b.bytes_per_token = (int64_t)dflash::kv_reservation_bytes_per_token(
163+
w.n_layer, w.full_attention_interval, w.n_head_kv,
164+
kv_k, w.n_embd_head_k, kv_v, w.n_embd_head_v);
165165
b.reserve_bytes = (int64_t)(1.5 * 1073741824.0) +
166166
(kvflash_drafter_path_.empty() ? 0 : (int64_t)(1.7 * 1073741824.0));
167167
return b;

server/src/qwen35moe/qwen35moe_backend.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "dflash_draft_graph.h"
1212
#include "dflash_feature_ring.h"
1313
#include "graph_builders.h"
14+
#include "kv_quant.h"
1415

1516
#include "ggml-alloc.h"
1617
#include "ggml-backend.h"
@@ -2163,9 +2164,16 @@ bool Qwen35MoeBackend::load_dynamic_placement(const char * hotness_path,
21632164
int max_context = ctx_env ? std::atoi(ctx_env) : cfg_.device.max_ctx;
21642165
if (max_context <= 0) max_context = 8192;
21652166

2166-
// KV cache: n_layer × 2 (K+V) × n_head_kv × head_dim × sizeof(fp16) × max_context
2167-
const uint64_t kv_bytes_per_tok = (uint64_t)w.n_layer * 2 *
2168-
(uint64_t)w.n_head_kv * (uint64_t)w.n_embd_head_k * 2;
2167+
// KV reservation: only full-attention layers carry KV (the rest are O(1)-state
2168+
// SSM/DeltaNet), and honor the resolved cache type. The old n_layer × hardcoded
2169+
// -f16 form over-reserved ~14× at deep ctx (25 GiB vs 1.76 GiB @131K), forcing
2170+
// experts cold → the slow hybrid spec path. Shared helper / single source of
2171+
// truth with the dense backend — see kv_quant.h.
2172+
ggml_type kv_k_t = GGML_TYPE_Q8_0, kv_v_t = GGML_TYPE_Q8_0;
2173+
dflash::resolve_kv_types(kv_k_t, kv_v_t);
2174+
const uint64_t kv_bytes_per_tok = dflash::kv_reservation_bytes_per_token(
2175+
w.n_layer, w.full_attention_interval, w.n_head_kv,
2176+
kv_k_t, w.n_embd_head_k, kv_v_t, w.n_embd_head_v);
21692177
// Size the reservation with the SAME inputs runtime uses (scorer policy +
21702178
// VRAM budget); the bare-max_context call took the no-budget fallback
21712179
// (max_ctx/2) and over-reserved KV, starving experts of hot placement.

server/test/test_kv_quant.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,37 @@ static void t4_tq3_shorthand() {
158158
std::puts("T4 PASS");
159159
}
160160

161+
// ─── T5: hybrid KV reservation uses n_full + cache-type (not n_layer × f16) ──
162+
// The bug at qwen35moe_backend.cpp:2595 reserved n_layer(40) × hardcoded f16,
163+
// over-reserving ~14× for a q4_0 cache (~7.5× for q8_0) and forcing experts cold. The shared helper must count
164+
// only full-attention layers (n_full) and honor the resolved cache type.
165+
static void t5_kv_reservation() {
166+
// Qwen3.6-35B-A3B geometry: 40 layers, full_attention_interval=4 → n_full=10.
167+
const int n_layer = 40, interval = 4, n_head_kv = 10, dk = 128, dv = 128;
168+
const uint64_t got = dflash::kv_reservation_bytes_per_token(
169+
n_layer, interval, n_head_kv, GGML_TYPE_Q4_0, dk, GGML_TYPE_Q4_0, dv);
170+
const uint64_t expect = (uint64_t)(n_layer / interval) * (uint64_t)n_head_kv *
171+
(uint64_t)(ggml_row_size(GGML_TYPE_Q4_0, dk) + ggml_row_size(GGML_TYPE_Q4_0, dv));
172+
assert(got == expect);
173+
174+
// Regression guard vs the old bug (n_layer × hardcoded f16 = 2 B/elem):
175+
const uint64_t buggy = (uint64_t)n_layer * 2 * (uint64_t)n_head_kv * (uint64_t)dk * 2;
176+
assert(got < buggy / 3); // ÷4 (layers) × (q4_0 < f16) → well over 3× smaller
177+
178+
// cache-type is honored: q4_0 reservation is strictly smaller than f16.
179+
const uint64_t f16 = dflash::kv_reservation_bytes_per_token(
180+
n_layer, interval, n_head_kv, GGML_TYPE_F16, dk, GGML_TYPE_F16, dv);
181+
assert(got < f16);
182+
183+
// full_attention_interval=0 guard: no div-by-zero, falls back to n_layer.
184+
const uint64_t fallback = dflash::kv_reservation_bytes_per_token(
185+
n_layer, 0, n_head_kv, GGML_TYPE_Q4_0, dk, GGML_TYPE_Q4_0, dv);
186+
assert(fallback == (uint64_t)n_layer * (uint64_t)n_head_kv *
187+
(uint64_t)(ggml_row_size(GGML_TYPE_Q4_0, dk) + ggml_row_size(GGML_TYPE_Q4_0, dv)));
188+
189+
std::puts("T5 PASS");
190+
}
191+
161192
// ─── main ────────────────────────────────────────────────────────────────────
162193

163194
int main() {
@@ -167,6 +198,7 @@ int main() {
167198
t2_resolve_kv_types();
168199
t3_is_supported_kv_pair();
169200
t4_tq3_shorthand();
201+
t5_kv_reservation();
170202

171203
std::puts("ALL TESTS PASS");
172204
return 0;

0 commit comments

Comments
 (0)