ggml-org
diff --git a/common/speculative.cpp b/common/speculative.cpp
Lines changed: 448 additions & 151 deletions
diff --git a/common/speculative.h b/common/speculative.h
Lines changed: 22 additions & 5 deletions
diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
Lines changed: 4 additions & 2 deletions
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
Lines changed: 4 additions & 7 deletions
diff --git a/src/llama-batch.h b/src/llama-batch.h
Lines changed: 1 addition & 2 deletions
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
Lines changed: 3 additions & 9 deletions
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
Lines changed: 7 additions & 15 deletions
diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp
Lines changed: 14 additions & 54 deletions
diff --git a/src/llama-kv-cache-iswa.h b/src/llama-kv-cache-iswa.h
Lines changed: 5 additions & 14 deletions
@@ -26,12 +26,28 @@ void common_speculative_free(common_speculative * spec);
 
 // optionally call once at the beginning of a new generation
 void common_speculative_begin(common_speculative * spec, const llama_tokens & prompt);
+// starts a new generation while preserving at most the retained common prefix that is
+// still valid in both the target and draft contexts
+void common_speculative_begin(
+        common_speculative * spec,
+        const llama_tokens & prompt,
+        llama_pos            retained_prefix_len);
+
+llama_pos common_speculative_get_committed_prefix_len(
+        const common_speculative * spec);
+
+void common_speculative_invalidate_retained_state(
+        common_speculative * spec);
 
-void common_speculative_set_prompt_hidden_states(
+// supplies the token/hidden-state source used by the next MTP first pass; start_pos
+// is the target-context position of source_tokens[0]
+void common_speculative_set_first_pass_source(
         common_speculative * spec,
-              const float * hidden_states,
-                    int32_t n_tokens,
-                    int32_t n_embd);
+        const llama_tokens & source_tokens,
+        const float *        hidden_states,
+        int32_t              n_tokens,
+        int32_t              n_embd,
+        llama_pos            start_pos);
 
 // sample up to n_draft tokens and add them to the batch using the draft model
 llama_tokens common_speculative_draft(
@@ -40,7 +56,8 @@ llama_tokens common_speculative_draft(
                      const llama_tokens & prompt,
                             llama_token   id_last);
 
-// informs the speculative decoder that n_accepted tokens were accepted by the target model
+// informs the speculative decoder that n_accepted tokens were accepted by the target model;
+// batch_idxs maps the frontier token and accepted draft tokens back to verifier output rows
 void common_speculative_accept(common_speculative * spec, uint16_t n_accepted, const std::vector<int32_t> & batch_idxs);
 
 // print statistics about the speculative decoding
 
@@ -163,11 +163,13 @@ int main(int argc, char ** argv) {
             std::memcpy(prompt_hidden.data() + i*llama_model_n_embd(model_tgt), hidden,
                     llama_model_n_embd(model_tgt)*sizeof(float));
         }
-        common_speculative_set_prompt_hidden_states(
+        common_speculative_set_first_pass_source(
                 spec,
+                prompt_tgt,
                 prompt_hidden.data(),
                 prompt_tgt.size(),
-                llama_model_n_embd(model_tgt));
+                llama_model_n_embd(model_tgt),
+                0);
     }
 
     llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
 
@@ -28,8 +28,7 @@ bool llama_batch_allocr::init(
         const llama_memory_i * memory,
         uint32_t n_embd,
         uint32_t n_seq_max,
-        bool output_all,
-        bool allow_non_contiguous_pos) {
+        bool output_all) {
     clear();
 
     batch = batch_inp;
@@ -314,11 +313,9 @@ bool llama_batch_allocr::init(
                 }
             }
 
-            if (!allow_non_contiguous_pos) {
-                if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
-                    LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
-                    return false;
-                }
+            if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
+                LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
+                return false;
             }
         }
     }
 
@@ -81,8 +81,7 @@ class llama_batch_allocr {
             const llama_memory_i * memory,
             uint32_t n_embd,
             uint32_t n_seq_max,
-            bool output_all,
-            bool allow_non_contiguous_pos = false);
+            bool output_all);
 
     const llama_batch & get_batch() const;
 
 
@@ -4,7 +4,6 @@
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-io.h"
-#include "llama-kv-cache-iswa.h"
 #include "llama-memory.h"
 #include "llama-mmap.h"
 #include "llama-model.h"
@@ -1589,10 +1588,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
         }
     }
 
-    const llama_memory_i * memory_for_batch = memory.get();
-    const bool allow_non_contiguous_pos = false;
-
-    if (!balloc->init(batch_inp, vocab, memory_for_batch, n_embd, n_seq_max, output_all, allow_non_contiguous_pos)) {
+    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
@@ -1748,10 +1744,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
             t_embd = res->get_embd_pooled();
         }
 
-        const bool mtp_skip_output = false;
-
         // extract logits
-        if (!mtp_skip_output && logits.data && t_logits && n_outputs > 0 && needs_raw_logits(ubatch, sampling.samplers)) {
+        if (logits.data && t_logits && n_outputs > 0 && needs_raw_logits(ubatch, sampling.samplers)) {
             ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
             GGML_ASSERT(backend_res != nullptr);
             GGML_ASSERT(logits.data != nullptr);
@@ -1766,7 +1760,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
         }
 
         // extract embeddings
-        if (!mtp_skip_output && embd.data && t_embd && n_outputs > 0) {
+        if (embd.data && t_embd && n_outputs > 0) {
             ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
             GGML_ASSERT(backend_embd != nullptr);
 
 
@@ -502,15 +502,14 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
 }
 
 void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
-    // base tensors may not be allocated if the graph uses only SWA layers
+    // In single-layer ISWA graphs, one branch can be pruned and never get a backend buffer.
     if (self_k_idxs && self_k_idxs->buffer) {
         mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
         mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
 
         mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
     }
 
-    // swa tensors may not be allocated if the graph uses only base layers
     if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
         mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
         mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
@@ -534,21 +533,14 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
 
     bool res = true;
 
-    // base tensors may not be allocated if the graph uses only SWA layers
-    if (self_k_idxs && self_k_idxs->buffer) {
-        res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
-      //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
-
-        res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
-    }
+    res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+  //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
-    // swa tensors may not be allocated if the graph uses only base layers
-    if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
-        res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
-      //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+    res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
+  //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
-        res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
-    }
+    res &= can_reuse_kq_mask(self_kq_mask,     mctx->get_base(), params.ubatch, params.cparams);
+    res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(),  params.ubatch, params.cparams);
 
     return res;
 }
 
@@ -209,38 +209,6 @@ llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & ba
     return std::make_unique<llama_kv_cache_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }
 
-llama_memory_context_ptr llama_kv_cache_iswa::init_batch_with_sinfos(
-        llama_batch_allocr & balloc,
-        uint32_t n_ubatch,
-        const llama_kv_cache::slot_info_vec_t & sinfos,
-        bool is_inplace_update) {
-    if (sinfos.empty()) {
-        return std::make_unique<llama_kv_cache_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-    }
-
-    balloc.split_reset();
-
-    std::vector<llama_ubatch> ubatches;
-    const uint32_t n_stream = kv_base->get_n_stream();
-    while (true) {
-        auto ubatch = n_stream == 1 ? balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true);
-        if (ubatch.n_tokens == 0) {
-            break;
-        }
-        ubatches.push_back(std::move(ubatch)); // NOLINT
-    }
-
-    if (ubatches.size() != sinfos.size()) {
-        return std::make_unique<llama_kv_cache_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-    }
-
-    auto sinfos_base = sinfos;
-    auto sinfos_swa  = sinfos;
-
-    return std::make_unique<llama_kv_cache_iswa_context>(
-            this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches), is_inplace_update);
-}
-
 llama_memory_context_ptr llama_kv_cache_iswa::init_full() {
     return std::make_unique<llama_kv_cache_iswa_context>(this);
 }
@@ -279,6 +247,20 @@ llama_kv_cache * llama_kv_cache_iswa::get_swa() const {
     return kv_swa.get();
 }
 
+void llama_kv_cache_iswa::set_swa_reuse_guard(llama_pos query_pos) { // sets the reuse guard on the SWA cache only
+    kv_base->clear_swa_reuse_guard(); // the base cache's guard is cleared rather than set
+    kv_swa->set_swa_reuse_guard(query_pos); // only the SWA cache receives query_pos
+}
+
+void llama_kv_cache_iswa::clear_swa_reuse_guard() { // clears the reuse guard on both underlying caches
+    kv_base->clear_swa_reuse_guard(); // base cache
+    kv_swa->clear_swa_reuse_guard(); // SWA cache
+}
+
+bool llama_kv_cache_iswa::consume_swa_reuse_guard_block_prepare() { // forwards to the SWA cache; the base cache is not queried
+    return kv_swa->consume_swa_reuse_guard_block_prepare(); // NOTE(review): meaning of the returned flag is defined by llama_kv_cache, not visible here
+}
+
 //
 // llama_kv_cache_iswa_context
 //
@@ -313,19 +295,6 @@ llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
     status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
 }
 
-llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
-        llama_kv_cache_iswa * kv,
-        slot_info_vec_t sinfos_base,
-        slot_info_vec_t sinfos_swa,
-        std::vector<llama_ubatch> ubatches,
-        bool is_inplace_update) :
-    ubatches(std::move(ubatches)),
-    // note: here we copy the ubatches. not sure if this is ideal
-    ctx_base(new llama_kv_cache_context(kv->get_base(), std::move(sinfos_base), this->ubatches, is_inplace_update)),
-    ctx_swa (new llama_kv_cache_context(kv->get_swa (), std::move(sinfos_swa),  this->ubatches, is_inplace_update)),
-    status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
-}
-
 llama_kv_cache_iswa_context:: ~llama_kv_cache_iswa_context() = default;
 
 bool llama_kv_cache_iswa_context::next() {
@@ -373,12 +342,3 @@ const llama_kv_cache_context * llama_kv_cache_iswa_context::get_swa()  const {
 
     return static_cast<const llama_kv_cache_context *>(ctx_swa.get());
 }
-
-void llama_kv_cache_iswa_context::set_inplace(bool value) {
-    auto * base = const_cast<llama_kv_cache_context *>(
-            static_cast<const llama_kv_cache_context *>(ctx_base.get()));
-    auto * swa  = const_cast<llama_kv_cache_context *>(
-            static_cast<const llama_kv_cache_context *>(ctx_swa.get()));
-    if (base) { base->set_inplace(value); }
-    if (swa)  { swa ->set_inplace(value); }
-}
@@ -39,12 +39,6 @@ class llama_kv_cache_iswa : public llama_memory_i {
             uint32_t n_ubatch,
             bool embd_all) override;
 
-    llama_memory_context_ptr init_batch_with_sinfos(
-            llama_batch_allocr & balloc,
-            uint32_t n_ubatch,
-            const llama_kv_cache::slot_info_vec_t & sinfos,
-            bool is_inplace_update);
-
     llama_memory_context_ptr init_full() override;
 
     llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
@@ -76,6 +70,11 @@ class llama_kv_cache_iswa : public llama_memory_i {
     llama_kv_cache * get_base() const;
     llama_kv_cache * get_swa () const;
 
+    void set_swa_reuse_guard(llama_pos query_pos); // sets the guard on the SWA cache; the base cache's guard is cleared
+    void clear_swa_reuse_guard(); // clears the guard on both the base and SWA caches
+
+    bool consume_swa_reuse_guard_block_prepare(); // returns the SWA cache's result; the base cache is not queried
+
 private:
     const llama_hparams & hparams;
 
@@ -108,12 +107,6 @@ class llama_kv_cache_iswa_context : public llama_memory_context_i {
             slot_info_vec_t sinfos_base,
             slot_info_vec_t sinfos_swa,
             std::vector<llama_ubatch> ubatches);
-    llama_kv_cache_iswa_context(
-            llama_kv_cache_iswa * kv,
-            slot_info_vec_t sinfos_base,
-            slot_info_vec_t sinfos_swa,
-            std::vector<llama_ubatch> ubatches,
-            bool is_inplace_update);
 
     virtual ~llama_kv_cache_iswa_context();
 
@@ -134,8 +127,6 @@ class llama_kv_cache_iswa_context : public llama_memory_context_i {
     const llama_kv_cache_context * get_base() const;
     const llama_kv_cache_context * get_swa()  const;
 
-    void set_inplace(bool value);
-
 private:
     //llama_kv_cache_iswa * kv;
Original file line number	Diff line number	Diff line change
`@@ -28,8 +28,7 @@ bool llama_batch_allocr::init(`
`28`	`28`	`const llama_memory_i * memory,`
`29`	`29`	`uint32_t n_embd,`
`30`	`30`	`uint32_t n_seq_max,`
`31`		`- bool output_all,`
`32`		`- bool allow_non_contiguous_pos) {`
	`31`	`+ bool output_all) {`
`33`	`32`	`clear();`
`34`	`33`
`35`	`34`	`batch = batch_inp;`
`@@ -314,11 +313,9 @@ bool llama_batch_allocr::init(`
`314`	`313`	`}`
`315`	`314`	`}`
`316`	`315`
`317`		`- if (!allow_non_contiguous_pos) {`
`318`		`- if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {`
`319`		`- LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);`
`320`		`- return false;`
`321`		`- }`
	`316`	`+ if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {`
	`317`	`+ LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);`
	`318`	`+ return false;`
`322`	`319`	`}`
`323`	`320`	`}`
`324`	`321`	`}`