@@ -232,21 +232,6 @@ int main(int argc, char ** argv) {
232232
233233 const auto t_dec_start = ggml_time_us ();
234234
235- // Hybrid targets (e.g. Qwen3.5) have recurrent layers that cannot be partially rolled back via seq_rm.
236- // For them, snapshot the target state before verify and, on rejection, restore it and replay only the accepted tokens to ensure correctness
237- // This is not efficient because the target model may run twice, but it is required in current llama.cpp design
238- const bool use_state_snapshot = params.speculative .dflash && llama_model_is_hybrid (model_tgt);
239- if (params.speculative .dflash ) {
240- LOG_INF (" %s: DFlash target=%s, using %s rollback path\n " , __func__,
241- llama_model_is_hybrid (model_tgt) ? " hybrid" : " pure-attention" ,
242- use_state_snapshot ? " snapshot+restore" : " seq_rm" );
243- }
244- std::vector<uint8_t > state_snap;
245- if (use_state_snapshot) {
246- const size_t sz = llama_state_seq_get_size (ctx_tgt, 0 );
247- state_snap.resize (sz);
248- }
249-
250235 while (true ) {
251236 // generate or reuse draft tokens
252237 //
@@ -294,17 +279,6 @@ int main(int argc, char ** argv) {
294279
295280 GGML_ASSERT (n_draft > 0 );
296281
297- // snapshot target state for potential rollback (hybrid/recurrent targets only)
298- const int n_past_before = n_past;
299- const llama_token id_last_saved = id_last;
300- if (use_state_snapshot) {
301- const size_t sz = llama_state_seq_get_size (ctx_tgt, 0 );
302- if (sz > state_snap.size ()) {
303- state_snap.resize (sz);
304- }
305- llama_state_seq_get_data (ctx_tgt, state_snap.data (), sz, 0 );
306- }
307-
308282 // always have a token to evaluate from before - id_last
309283 common_batch_clear (batch_tgt);
310284 common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true );
@@ -403,21 +377,6 @@ int main(int argc, char ** argv) {
403377 draft.clear ();
404378
405379 {
406- // const bool had_rejection = ids.size() < draft.size() + 1;
407-
408- // if (use_state_snapshot && had_rejection) {
409- // // Restore snapshot and replay the committed prefix (id_last + accepted drafts) so target state exactly
410- // LOG_DBG("DFlash rollback: restore target state and replay %zu tokens\n", ids.size());
411- // llama_state_seq_set_data(ctx_tgt, state_snap.data(), state_snap.size(), 0);
412- // common_batch_clear(batch_tgt);
413- // common_batch_add(batch_tgt, id_last_saved, n_past_before, { 0 }, true);
414- // for (size_t i = 0; i + 1 < ids.size(); ++i) {
415- // common_batch_add(batch_tgt, ids[i], n_past_before + 1 + i, { 0 }, true);
416- // }
417- // if (batch_tgt.n_tokens > 0) {
418- // llama_decode(ctx_tgt, batch_tgt);
419- // }
420- // } else {
421380 LOG_DBG (" clear kv cache from any extra tokens, n_past = %d\n " , n_past);
422381
423382 llama_memory_seq_rm (llama_get_memory (ctx_tgt), 0 , n_past, -1 );
0 commit comments