dflash: remove redundant logic & correct bias naming

ruixiang63 · ruixiang63 · commit e344c4a71736 · 2026-04-24T17:43:24.000Z
diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
@@ -232,21 +232,6 @@ int main(int argc, char ** argv) {
 
     const auto t_dec_start = ggml_time_us();
 
-    // Hybrid targets (e.g. Qwen3.5) have recurrent layers that cannot be partially rolled back via seq_rm. 
-    // For them, snapshot the target state before verify and, on rejection, restore it and replay only the accepted tokens to ensure correctness 
-    // This is not efficient because the target model may run twice, but it is required in current llama.cpp design
-    const bool use_state_snapshot = params.speculative.dflash && llama_model_is_hybrid(model_tgt);
-    if (params.speculative.dflash) {
-        LOG_INF("%s: DFlash target=%s, using %s rollback path\n", __func__,
-                llama_model_is_hybrid(model_tgt) ? "hybrid" : "pure-attention",
-                use_state_snapshot ? "snapshot+restore" : "seq_rm");
-    }
-    std::vector<uint8_t> state_snap;
-    if (use_state_snapshot) {
-        const size_t sz = llama_state_seq_get_size(ctx_tgt, 0);
-        state_snap.resize(sz);
-    }
-
     while (true) {
         // generate or reuse draft tokens
         //
@@ -294,17 +279,6 @@ int main(int argc, char ** argv) {
 
         GGML_ASSERT(n_draft > 0);
 
-        // snapshot target state for potential rollback (hybrid/recurrent targets only)
-        const int         n_past_before = n_past;
-        const llama_token id_last_saved = id_last;
-        if (use_state_snapshot) {
-            const size_t sz = llama_state_seq_get_size(ctx_tgt, 0);
-            if (sz > state_snap.size()) {
-                state_snap.resize(sz);
-            }
-            llama_state_seq_get_data(ctx_tgt, state_snap.data(), sz, 0);
-        }
-
         // always have a token to evaluate from before - id_last
         common_batch_clear(batch_tgt);
         common_batch_add  (batch_tgt, id_last, n_past++, { 0 }, true);
@@ -403,21 +377,6 @@ int main(int argc, char ** argv) {
         draft.clear();
 
         {
-        // const bool had_rejection = ids.size() < draft.size() + 1;
-
-        // if (use_state_snapshot && had_rejection) {
-        //     // Restore snapshot and replay the committed prefix (id_last + accepted drafts) so target state exactly
-        //     LOG_DBG("DFlash rollback: restore target state and replay %zu tokens\n", ids.size());
-        //     llama_state_seq_set_data(ctx_tgt, state_snap.data(), state_snap.size(), 0);
-        //     common_batch_clear(batch_tgt);
-        //     common_batch_add(batch_tgt, id_last_saved, n_past_before, { 0 }, true);
-        //     for (size_t i = 0; i + 1 < ids.size(); ++i) {
-        //         common_batch_add(batch_tgt, ids[i], n_past_before + 1 + i, { 0 }, true);
-        //     }
-        //     if (batch_tgt.n_tokens > 0) {
-        //         llama_decode(ctx_tgt, batch_tgt);
-        //     }
-        // } else {
             LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
 
             llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_past, -1);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -7382,10 +7382,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
-                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED);
-                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_k_gqa},          TENSOR_NOT_REQUIRED);
-                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_v_gqa},          TENSOR_NOT_REQUIRED);
-                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},                TENSOR_NOT_REQUIRED);
+                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED);
+                        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_k_gqa},          TENSOR_NOT_REQUIRED);
+                        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_v_gqa},          TENSOR_NOT_REQUIRED);
+                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},                TENSOR_NOT_REQUIRED);
 
                         layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                         layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
diff --git a/src/models/dflash.cpp b/src/models/dflash.cpp
@@ -67,25 +67,25 @@ llm_build_dflash_decode::llm_build_dflash_decode(const llama_model & model, cons
 
         // Q from noise only
         ggml_tensor * Qcur = build_lora_mm(layer.wq, noise_norm);
-        if (layer.bq) { Qcur = ggml_add(ctx0, Qcur, layer.bq); }
+        if (layer.wq_b) { Qcur = ggml_add(ctx0, Qcur, layer.wq_b); }
         cb(Qcur, "Qcur", il);
 
         // K = concat(k_proj(target_ctx), k_proj(noise))
         ggml_tensor * K_tgt   = build_lora_mm(layer.wk, target_ctx);
         ggml_tensor * K_noise = build_lora_mm(layer.wk, noise_norm);
-        if (layer.bk) {
-            K_tgt   = ggml_add(ctx0, K_tgt,   layer.bk);
-            K_noise = ggml_add(ctx0, K_noise, layer.bk);
+        if (layer.wk_b) {
+            K_tgt   = ggml_add(ctx0, K_tgt,   layer.wk_b);
+            K_noise = ggml_add(ctx0, K_noise, layer.wk_b);
         }
         ggml_tensor * Kcur = ggml_concat(ctx0, K_tgt, K_noise, 1);
         cb(Kcur, "Kcur", il);
 
         // V = concat(v_proj(target_ctx), v_proj(noise))
         ggml_tensor * V_tgt   = build_lora_mm(layer.wv, target_ctx);
         ggml_tensor * V_noise = build_lora_mm(layer.wv, noise_norm);
-        if (layer.bv) {
-            V_tgt   = ggml_add(ctx0, V_tgt,   layer.bv);
-            V_noise = ggml_add(ctx0, V_noise, layer.bv);
+        if (layer.wv_b) {
+            V_tgt   = ggml_add(ctx0, V_tgt,   layer.wv_b);
+            V_noise = ggml_add(ctx0, V_noise, layer.wv_b);
         }
         ggml_tensor * Vcur = ggml_concat(ctx0, V_tgt, V_noise, 1);
         cb(Vcur, "Vcur", il);
@@ -123,7 +123,7 @@ llm_build_dflash_decode::llm_build_dflash_decode(const llama_model & model, cons
         cb(cur, "kqv_out", il);
 
         cur = build_lora_mm(layer.wo, cur);
-        if (layer.bo) { cur = ggml_add(ctx0, cur, layer.bo); }
+        if (layer.wo_b) { cur = ggml_add(ctx0, cur, layer.wo_b); }
         cur = ggml_add(ctx0, cur, inpL);
         cb(cur, "attn_res", il);