From ce5e8dc21ac55df8b9f8a8f1cc28f86493778d93 Mon Sep 17 00:00:00 2001 From: Javier Pazo Date: Sat, 9 May 2026 11:57:10 +0200 Subject: [PATCH] fix(dflash): derive n_target_layers fallback in gguf_draft_loader Follow-up to merged #79 ("read model params from GGUF at runtime, support any qwen35 size"). #79 covers the target loader and the common drafter fields, but the fallback chain in gguf_draft_loader still requires the legacy `dflash.n_target_layers` key to be present. Drafters published with the new metadata key naming (`dflash-draft.dflash.target_layer_ids` plus `n_target_features`) hit the path where the legacy key is missing and the loader fails. Concrete case: the published Q8 GGUF drafter for Qwen3.6-27B-DFlash. This change derives `n_target_layers` in two steps: 1. If `target_layer_ids` is present, use its length. 2. Otherwise, if `n_target_features` and `n_embd` are both present, use `n_target_features / n_embd` (with a sanity check that the division is exact). If neither is available, the loader still fails with the same honest error as before. The legacy key path is untouched. Validation (RTX 6000 Ada sm_89, Qwen3.6-27B Heretic Q4_K_M target, Q8 GGUF drafter via the new metadata): Loaded `SWA layers: 4/5`, decode 21.06 tok/s, no fallback chain errors during init. Verification vs existing community PRs: COMP-COMPL with #79 (merged 2026-05-03). #79 covered target loader and drafter fields generically. This PR is a small follow-up for the case where only the new metadata is present on the drafter side. Author: Javier Pazo --- dflash/src/gguf_draft_loader.cpp | 76 ++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 24 deletions(-) diff --git a/dflash/src/gguf_draft_loader.cpp b/dflash/src/gguf_draft_loader.cpp index ee317913..e339b957 100644 --- a/dflash/src/gguf_draft_loader.cpp +++ b/dflash/src/gguf_draft_loader.cpp @@ -6,12 +6,10 @@ // types — ggml's ggml_mul_mat handles Q8_0 × F32 dequantization transparently. 
// // GGUF arch: "qwen35-dflash-draft" (from convert_dflash_to_gguf.py / -// quantize_draft_q8.py) or the newer "dflash-draft" export. Tensor naming -// convention: +// quantize_draft_q8.py). Tensor naming convention: // -// dflash.fc.weight / dflash_fc.weight [5*hidden, hidden] Q8_0 / F16 -// dflash.hidden_norm.weight / -// dflash_hidden_norm.weight [hidden] F32 +// dflash.fc.weight [5*hidden, hidden] Q8_0 / F16 +// dflash.hidden_norm.weight [hidden] F32 // output_norm.weight [hidden] F32 // blk..attn_norm.weight [hidden] F32 // blk..ffn_norm.weight [hidden] F32 @@ -108,6 +106,14 @@ uint32_t get_u32_or(const gguf_context * g, const char * key, uint32_t fallback) return gguf_get_val_u32(g, id); } +int count_swa_layers(const DraftWeights & w) { + int n_swa = 0; + for (const DraftLayer & layer : w.layers) { + if (layer.is_swa) n_swa++; + } + return n_swa; +} + } // namespace bool load_draft_gguf(const std::string & path, @@ -126,6 +132,7 @@ bool load_draft_gguf(const std::string & path, } // Validate arch + std::string arch_s; { int64_t arch_id = gguf_find_key(gctx, "general.architecture"); if (arch_id < 0) { @@ -134,8 +141,8 @@ bool load_draft_gguf(const std::string & path, return false; } const char * arch = gguf_get_val_str(gctx, arch_id); - if (std::string(arch) != "qwen35-dflash-draft" && - std::string(arch) != "dflash-draft") { + arch_s = arch; + if (arch_s != "qwen35-dflash-draft" && arch_s != "dflash-draft") { set_last_error(std::string("unexpected draft arch: ") + arch + " (expected qwen35-dflash-draft or dflash-draft)"); gguf_free(gctx); @@ -144,8 +151,7 @@ bool load_draft_gguf(const std::string & path, } // Read dimensions from GGUF metadata - int64_t arch_id = gguf_find_key(gctx, "general.architecture"); - const char * A = gguf_get_val_str(gctx, arch_id); + const char * A = arch_s.c_str(); char key[128]; auto read_u32 = [&](const char * suffix, uint32_t fallback) -> uint32_t { @@ -162,16 +168,17 @@ bool load_draft_gguf(const std::string & path, const 
uint32_t block_sz = read_u32("dflash.block_size", 0); uint32_t n_tgt_lay = read_u32("dflash.n_target_layers", 0); if (n_tgt_lay == 0) { - const uint32_t n_tgt_feat = read_u32("dflash.n_target_features", 0); - if (n_tgt_feat != 0 && n_embd != 0 && (n_tgt_feat % n_embd) == 0) { - n_tgt_lay = n_tgt_feat / n_embd; + std::snprintf(key, sizeof(key), "%s.%s", A, "dflash.target_layer_ids"); + const int64_t target_ids_id = gguf_find_key(gctx, key); + if (target_ids_id >= 0 && + gguf_get_kv_type(gctx, target_ids_id) == GGUF_TYPE_ARRAY) { + n_tgt_lay = (uint32_t)gguf_get_arr_n(gctx, target_ids_id); } } - if (n_tgt_lay == 0) { - std::snprintf(key, sizeof(key), "%s.%s", A, "dflash.target_layer_ids"); - int64_t id = gguf_find_key(gctx, key); - if (id >= 0) { - n_tgt_lay = (uint32_t)gguf_get_arr_n(gctx, id); + if (n_tgt_lay == 0 && n_embd != 0) { + const uint32_t n_target_features = read_u32("dflash.n_target_features", 0); + if (n_target_features != 0 && (n_target_features % n_embd) == 0) { + n_tgt_lay = n_target_features / n_embd; } } @@ -240,17 +247,17 @@ bool load_draft_gguf(const std::string & path, auto g = [&](const char * name) -> ggml_tensor * { return ggml_get_tensor(meta_ctx, name); }; - - auto first = [](ggml_tensor * a, ggml_tensor * b) -> ggml_tensor * { - return a ? 
a : b; + auto g_any = [&](const char * a, const char * b) -> ggml_tensor * { + if (ggml_tensor * t = g(a)) return t; + return g(b); }; - out.fc = first(g("dflash.fc.weight"), g("dflash_fc.weight")); - out.hidden_norm = first(g("dflash.hidden_norm.weight"), g("dflash_hidden_norm.weight")); + out.fc = g_any("dflash.fc.weight", "dflash_fc.weight"); + out.hidden_norm = g_any("dflash.hidden_norm.weight", "dflash_hidden_norm.weight"); out.out_norm = g("output_norm.weight"); if (!out.fc || !out.hidden_norm || !out.out_norm) { set_last_error("draft GGUF: missing top-level tensors " - "(dflash fc / dflash hidden norm / output_norm)"); + "(dflash.fc|dflash_fc / dflash.hidden_norm|dflash_hidden_norm / output_norm)"); gguf_free(gctx); return false; } @@ -263,7 +270,8 @@ bool load_draft_gguf(const std::string & path, }; DraftLayer & L = out.layers[il]; L.attn_norm = fnd("attn_norm.weight"); - L.ffn_norm = first(fnd("ffn_norm.weight"), fnd("post_attention_norm.weight")); + L.ffn_norm = fnd("ffn_norm.weight"); + if (!L.ffn_norm) L.ffn_norm = fnd("post_attention_norm.weight"); L.wq = fnd("attn_q.weight"); L.wk = fnd("attn_k.weight"); L.wv = fnd("attn_v.weight"); @@ -283,6 +291,26 @@ bool load_draft_gguf(const std::string & path, } } + // GGUF Qwen3.6 drafters carry SWA metadata emitted by the converter: + // dflash-draft.attention.sliding_window = 2048 + // dflash-draft.attention.sliding_window_pattern = [true,true,true,true,false] + out.swa_window = (int)read_u32("attention.sliding_window", 0); + std::snprintf(key, sizeof(key), "%s.%s", A, "attention.sliding_window_pattern"); + int64_t swp_id = gguf_find_key(gctx, key); + if (swp_id >= 0 && gguf_get_kv_type(gctx, swp_id) == GGUF_TYPE_ARRAY && + gguf_get_arr_type(gctx, swp_id) == GGUF_TYPE_BOOL) { + const size_t n = gguf_get_arr_n(gctx, swp_id); + const bool * pattern = static_cast<const bool *>(gguf_get_arr_data(gctx, swp_id)); + for (size_t il = 0; il < n && il < out.layers.size(); il++) { + out.layers[il].is_swa = pattern[il]; + } + } + 
const int n_swa = count_swa_layers(out); + if (n_swa > 0) { + std::fprintf(stderr, "[draft GGUF] SWA layers: %d/%d (window=%d)\n", + n_swa, out.n_layer, out.swa_window); + } + // ── 3. Allocate CUDA buffer for all tensors ────────────────────────── out.buf = ggml_backend_alloc_ctx_tensors(meta_ctx, backend); if (!out.buf) {