gemma4: save/restore target_feat in prefix cache snapshot

howard0su · Copilot · howard0su · commit bedd3c67f2e9 · 2026-05-21T09:13:52.000+08:00
Match Qwen35's approach: save target_feat (the BF16 feature ring buffer) and
last_tok as part of the KV snapshot. On restore, target_feat is copied back
to the GPU before the delta prefill and the feature mirror resync.

Previously, only the K/V tensors were snapshotted. After a restore, the feature
mirror still contained stale data from the previous request's decode phase, so
the draft model made poor predictions and the speculative-decoding acceptance
rate was roughly halved (52% → 24%).

With this fix, the full feature state is restored correctly, and the
subsequent draft_feature_mirror_sync_tail call ensures the mirror matches it.
The "snapshot saved" log line also moves from stdout to stderr.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/dflash/src/gemma4/gemma4_backend.cpp b/dflash/src/gemma4/gemma4_backend.cpp
@@ -633,8 +633,15 @@ GenerateResult Gemma4Backend::restore_and_generate(int slot,
         }
     }
 
+    // Restore target_feat from snapshot
+    if (snap.feat_snap && cache_.target_feat) {
+        const size_t feat_nbytes = ggml_nbytes(snap.feat_snap);
+        ggml_backend_tensor_set(cache_.target_feat, snap.feat_snap->data, 0, feat_nbytes);
+    }
+
     const int snap_pos = snap.cur_pos;
     cache_.cur_pos = snap_pos;
+    cache_.last_tok = snap.last_tok;
 
     // Set up sampler
     sampler_ = req.sampler;
@@ -766,8 +773,9 @@ bool Gemma4Backend::snapshot_save(int slot) {
     if (needs_alloc) {
         free_gemma4_snapshot(snap);
 
+        const int n_feat_tensors = (cache_.target_feat && cache_.target_feat_cap > 0) ? 1 : 0;
         ggml_init_params ip{};
-        ip.mem_size = ggml_tensor_overhead() * (size_t)(n_layer * 2 + 4) + 4096;
+        ip.mem_size = ggml_tensor_overhead() * (size_t)(n_layer * 2 + n_feat_tensors + 4) + 4096;
         ip.no_alloc = true;
         snap.ctx = ggml_init(ip);
         if (!snap.ctx) return false;
@@ -787,10 +795,21 @@ bool Gemma4Backend::snapshot_save(int slot) {
             }
         }
 
+        // target_feat: save min(snap_pos, target_feat_cap) positions
+        snap.feat_snap = nullptr;
+        snap.feat_cap  = 0;
+        if (cache_.target_feat && cache_.target_feat_cap > 0) {
+            const int feat_len = std::min(snap_pos, cache_.target_feat_cap);
+            snap.feat_snap = ggml_new_tensor_2d(snap.ctx, cache_.target_feat->type,
+                                                 cache_.target_feat->ne[0], feat_len);
+            snap.feat_cap = cache_.target_feat_cap;
+        }
+
         snap.buf = ggml_backend_alloc_ctx_tensors(snap.ctx, snap_backend_);
         if (!snap.buf) {
             ggml_free(snap.ctx); snap.ctx = nullptr;
             snap.k_snap.clear(); snap.v_snap.clear();
+            snap.feat_snap = nullptr;
             return false;
         }
     }
@@ -820,9 +839,15 @@ bool Gemma4Backend::snapshot_save(int slot) {
         }
     }
     snap.cur_pos = snap_pos;
+    snap.last_tok = cache_.last_tok;
 
-    std::printf("[gemma4] snapshot saved slot=%d pos=%d\n", slot, snap.cur_pos);
-    std::fflush(stdout);
+    // target_feat: copy min(snap_pos, cap) positions from GPU to snapshot
+    if (snap.feat_snap && cache_.target_feat) {
+        const size_t feat_nbytes = ggml_nbytes(snap.feat_snap);
+        ggml_backend_tensor_get(cache_.target_feat, snap.feat_snap->data, 0, feat_nbytes);
+    }
+
+    std::fprintf(stderr, "[gemma4] snapshot saved slot=%d pos=%d\n", slot, snap.cur_pos);
     return true;
 }
 
diff --git a/dflash/src/gemma4/gemma4_internal.h b/dflash/src/gemma4/gemma4_internal.h
@@ -193,8 +193,11 @@ bool  create_gemma4_target_feat(ggml_backend_t backend, Gemma4Cache & cache,
 // Snapshot
 struct Gemma4Snapshot {
     int cur_pos = 0;
+    int32_t last_tok = -1;
     std::vector<ggml_tensor *> k_snap;
     std::vector<ggml_tensor *> v_snap;
+    ggml_tensor *             feat_snap = nullptr;  // [fc_in, feat_len]
+    int                       feat_cap  = 0;
     ggml_context *        ctx = nullptr;
     ggml_backend_buffer_t buf = nullptr;
 };
diff --git a/dflash/src/gemma4/gemma4_loader.cpp b/dflash/src/gemma4/gemma4_loader.cpp
@@ -478,7 +478,10 @@ void free_gemma4_snapshot(Gemma4Snapshot & s) {
     if (s.buf) { ggml_backend_buffer_free(s.buf); s.buf = nullptr; }
     if (s.ctx) { ggml_free(s.ctx); s.ctx = nullptr; }
     s.k_snap.clear(); s.v_snap.clear();
-    s.cur_pos = 0;
+    s.feat_snap = nullptr;
+    s.feat_cap  = 0;
+    s.cur_pos   = 0;
+    s.last_tok  = -1;
 }
 
 }  // namespace dflash27b