From ce5e8dc21ac55df8b9f8a8f1cc28f86493778d93 Mon Sep 17 00:00:00 2001 From: Javier Pazo Date: Sat, 9 May 2026 11:57:10 +0200 Subject: [PATCH] fix(dflash): derive n_target_layers fallback in gguf_draft_loader Follow-up to merged #79 ("read model params from GGUF at runtime, support any qwen35 size"). #79 covers the target loader and the common drafter fields, but the fallback chain in gguf_draft_loader still requires the legacy `dflash.n_target_layers` key to be present. Drafters published with the new metadata key naming (`dflash-draft.dflash.target_layer_ids` plus `n_target_features`) hit the path where the legacy key is missing and the loader fails. Concrete case: the published Q8 GGUF drafter for Qwen3.6-27B-DFlash. This change derives `n_target_layers` in two steps: 1. If `target_layer_ids` is present, use its length. 2. Otherwise, if `n_target_features` and `n_embd` are both present, use `n_target_features / n_embd` (with a sanity check that the division is exact). If neither is available, the loader still fails with the same honest error as before. The legacy key path is untouched. Validation (RTX 6000 Ada sm_89, Qwen3.6-27B Heretic Q4_K_M target, Q8 GGUF drafter via the new metadata): Loaded `SWA layers: 4/5`, decode 21.06 tok/s, no fallback chain errors during init. Verification vs existing community PRs: COMP-COMPL with #79 (merged 2026-05-03). #79 covered target loader and drafter fields generically. This PR is a small follow-up for the case where only the new metadata is present on the drafter side. Author: Javier Pazo --- dflash/src/gguf_draft_loader.cpp | 76 ++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 24 deletions(-) diff --git a/dflash/src/gguf_draft_loader.cpp b/dflash/src/gguf_draft_loader.cpp index ee317913..e339b957 100644 --- a/dflash/src/gguf_draft_loader.cpp +++ b/dflash/src/gguf_draft_loader.cpp @@ -6,12 +6,10 @@ // types — ggml's ggml_mul_mat handles Q8_0 × F32 dequantization transparently. 
// // GGUF arch: "qwen35-dflash-draft" (from convert_dflash_to_gguf.py / -// quantize_draft_q8.py) or the newer "dflash-draft" export. Tensor naming -// convention: +// quantize_draft_q8.py). Tensor naming convention: // -// dflash.fc.weight / dflash_fc.weight [5*hidden, hidden] Q8_0 / F16 -// dflash.hidden_norm.weight / -// dflash_hidden_norm.weight [hidden] F32 +// dflash.fc.weight [5*hidden, hidden] Q8_0 / F16 +// dflash.hidden_norm.weight [hidden] F32 // output_norm.weight [hidden] F32 // blk..attn_norm.weight [hidden] F32 // blk..ffn_norm.weight [hidden] F32 @@ -108,6 +106,14 @@ uint32_t get_u32_or(const gguf_context * g, const char * key, uint32_t fallback) return gguf_get_val_u32(g, id); } +int count_swa_layers(const DraftWeights & w) { + int n_swa = 0; + for (const DraftLayer & layer : w.layers) { + if (layer.is_swa) n_swa++; + } + return n_swa; +} + } // namespace bool load_draft_gguf(const std::string & path, @@ -126,6 +132,7 @@ bool load_draft_gguf(const std::string & path, } // Validate arch + std::string arch_s; { int64_t arch_id = gguf_find_key(gctx, "general.architecture"); if (arch_id < 0) { @@ -134,8 +141,8 @@ bool load_draft_gguf(const std::string & path, return false; } const char * arch = gguf_get_val_str(gctx, arch_id); - if (std::string(arch) != "qwen35-dflash-draft" && - std::string(arch) != "dflash-draft") { + arch_s = arch; + if (arch_s != "qwen35-dflash-draft" && arch_s != "dflash-draft") { set_last_error(std::string("unexpected draft arch: ") + arch + " (expected qwen35-dflash-draft or dflash-draft)"); gguf_free(gctx); @@ -144,8 +151,7 @@ bool load_draft_gguf(const std::string & path, } // Read dimensions from GGUF metadata - int64_t arch_id = gguf_find_key(gctx, "general.architecture"); - const char * A = gguf_get_val_str(gctx, arch_id); + const char * A = arch_s.c_str(); char key[128]; auto read_u32 = [&](const char * suffix, uint32_t fallback) -> uint32_t { @@ -162,16 +168,17 @@ bool load_draft_gguf(const std::string & path, const 
uint32_t block_sz = read_u32("dflash.block_size", 0); uint32_t n_tgt_lay = read_u32("dflash.n_target_layers", 0); if (n_tgt_lay == 0) { - const uint32_t n_tgt_feat = read_u32("dflash.n_target_features", 0); - if (n_tgt_feat != 0 && n_embd != 0 && (n_tgt_feat % n_embd) == 0) { - n_tgt_lay = n_tgt_feat / n_embd; + std::snprintf(key, sizeof(key), "%s.%s", A, "dflash.target_layer_ids"); + const int64_t target_ids_id = gguf_find_key(gctx, key); + if (target_ids_id >= 0 && + gguf_get_kv_type(gctx, target_ids_id) == GGUF_TYPE_ARRAY) { + n_tgt_lay = (uint32_t)gguf_get_arr_n(gctx, target_ids_id); } } - if (n_tgt_lay == 0) { - std::snprintf(key, sizeof(key), "%s.%s", A, "dflash.target_layer_ids"); - int64_t id = gguf_find_key(gctx, key); - if (id >= 0) { - n_tgt_lay = (uint32_t)gguf_get_arr_n(gctx, id); + if (n_tgt_lay == 0 && n_embd != 0) { + const uint32_t n_target_features = read_u32("dflash.n_target_features", 0); + if (n_target_features != 0 && (n_target_features % n_embd) == 0) { + n_tgt_lay = n_target_features / n_embd; } } @@ -240,17 +247,17 @@ bool load_draft_gguf(const std::string & path, auto g = [&](const char * name) -> ggml_tensor * { return ggml_get_tensor(meta_ctx, name); }; - - auto first = [](ggml_tensor * a, ggml_tensor * b) -> ggml_tensor * { - return a ? 
a : b; + auto g_any = [&](const char * a, const char * b) -> ggml_tensor * { + if (ggml_tensor * t = g(a)) return t; + return g(b); }; - out.fc = first(g("dflash.fc.weight"), g("dflash_fc.weight")); - out.hidden_norm = first(g("dflash.hidden_norm.weight"), g("dflash_hidden_norm.weight")); + out.fc = g_any("dflash.fc.weight", "dflash_fc.weight"); + out.hidden_norm = g_any("dflash.hidden_norm.weight", "dflash_hidden_norm.weight"); out.out_norm = g("output_norm.weight"); if (!out.fc || !out.hidden_norm || !out.out_norm) { set_last_error("draft GGUF: missing top-level tensors " - "(dflash fc / dflash hidden norm / output_norm)"); + "(dflash.fc|dflash_fc / dflash.hidden_norm|dflash_hidden_norm / output_norm)"); gguf_free(gctx); return false; } @@ -263,7 +270,8 @@ bool load_draft_gguf(const std::string & path, }; DraftLayer & L = out.layers[il]; L.attn_norm = fnd("attn_norm.weight"); - L.ffn_norm = first(fnd("ffn_norm.weight"), fnd("post_attention_norm.weight")); + L.ffn_norm = fnd("ffn_norm.weight"); + if (!L.ffn_norm) L.ffn_norm = fnd("post_attention_norm.weight"); L.wq = fnd("attn_q.weight"); L.wk = fnd("attn_k.weight"); L.wv = fnd("attn_v.weight"); @@ -283,6 +291,26 @@ bool load_draft_gguf(const std::string & path, } } + // GGUF Qwen3.6 drafters carry SWA metadata emitted by the converter: + // dflash-draft.attention.sliding_window = 2048 + // dflash-draft.attention.sliding_window_pattern = [true,true,true,true,false] + out.swa_window = (int)read_u32("attention.sliding_window", 0); + std::snprintf(key, sizeof(key), "%s.%s", A, "attention.sliding_window_pattern"); + int64_t swp_id = gguf_find_key(gctx, key); + if (swp_id >= 0 && gguf_get_kv_type(gctx, swp_id) == GGUF_TYPE_ARRAY && + gguf_get_arr_type(gctx, swp_id) == GGUF_TYPE_BOOL) { + const size_t n = gguf_get_arr_n(gctx, swp_id); + const bool * pattern = static_cast<const bool *>(gguf_get_arr_data(gctx, swp_id)); + for (size_t il = 0; il < n && il < out.layers.size(); il++) { + out.layers[il].is_swa = pattern[il]; + } + } + 
const int n_swa = count_swa_layers(out); + if (n_swa > 0) { + std::fprintf(stderr, "[draft GGUF] SWA layers: %d/%d (window=%d)\n", + n_swa, out.n_layer, out.swa_window); + } + // ── 3. Allocate CUDA buffer for all tensors ────────────────────────── out.buf = ggml_backend_alloc_ctx_tensors(meta_ctx, backend); if (!out.buf) {