spec : introduce common_speculative_process()

ggerganov · ggerganov · commit db8e326913e3 · 2026-05-09T17:12:24.000+03:00
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -149,6 +149,8 @@ struct common_speculative_impl {
 
     virtual void begin(llama_seq_id seq_id, const llama_tokens & prompt) = 0;
 
+    virtual bool process(const llama_batch & batch) = 0;
+
     virtual void draft(common_speculative_draft_params_vec & dparams) = 0;
 
     virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0;
@@ -223,6 +225,20 @@ struct common_speculative_state_draft : public common_speculative_impl {
         // noop
     }
 
+    // decode the batch on the draft context
+    // returns true on success, false (after logging) when llama_decode() returns non-zero
+    bool process(const llama_batch & batch) override {
+        const int status = llama_decode(params.ctx_dft, batch);
+        if (status == 0) {
+            return true;
+        }
+
+        // any non-zero status is reported as a failure to the caller
+        LOG_ERR("%s: failed to decode draft batch, ret = %d\n", __func__, status);
+
+        return false;
+    }
+
     void draft(common_speculative_draft_params_vec & dparams) override {
         auto & ctx_dft = params.ctx_dft;
 
@@ -345,6 +361,11 @@ struct common_speculative_state_eagle3 : public common_speculative_impl {
         // noop
     }
 
+    bool process(const llama_batch & /*batch*/) override {
+        // TODO: implement - placeholder for now: the batch is ignored and success is reported
+        return true;
+    }
+
     void draft(common_speculative_draft_params_vec & /*dparams*/) override {
         // TODO: implement
     }
@@ -372,6 +393,11 @@ struct common_speculative_state_ngram_simple : public common_speculative_impl {
         // noop
     }
 
+    bool process(const llama_batch & /*batch*/) override {
+        // TODO: implement - placeholder for now: the batch is ignored and success is reported
+        return true;
+    }
+
     void draft(common_speculative_draft_params_vec & dparams) override {
         assert(dparams.size() == n_seq);
 
@@ -413,6 +439,11 @@ struct common_speculative_state_ngram_map_k : public common_speculative_impl {
         common_ngram_map_begin(config[seq_id], prompt);
     }
 
+    bool process(const llama_batch & /*batch*/) override {
+        // TODO: implement - placeholder for now: the batch is ignored and success is reported
+        return true;
+    }
+
     void draft(common_speculative_draft_params_vec & dparams) override {
         assert(dparams.size() == n_seq);
 
@@ -559,6 +590,11 @@ struct common_speculative_state_ngram_mod : public common_speculative_impl {
         sinfo.n_draft_last = result.size();
     }
 
+    bool process(const llama_batch & /*batch*/) override {
+        // TODO: implement - placeholder for now: the batch is ignored and success is reported
+        return true;
+    }
+
     void draft(common_speculative_draft_params_vec & dparams) override {
         assert(dparams.size() == n_seq);
 
@@ -706,6 +742,11 @@ struct common_speculative_state_ngram_cache : public common_speculative_impl {
         }
     }
 
+    bool process(const llama_batch & /*batch*/) override {
+        // TODO: implement - placeholder for now: the batch is ignored and success is reported
+        return true;
+    }
+
     void draft(common_speculative_draft_params_vec & dparams) override {
         assert(dparams.size() == n_seq);
 
@@ -937,6 +978,20 @@ void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, co
     }
 }
 
+// process the batch with each implementation in order - stops at the first one that fails
+bool common_speculative_process(common_speculative * spec, const llama_batch & batch) {
+    if (spec == nullptr) {
+        return true; // no speculative context - nothing to process
+    }
+
+    for (auto & impl : spec->impls) {
+        if (!impl->process(batch)) {
+            return false;
+        }
+    }
+    return true;
+}
+
 void common_speculative_draft(common_speculative * spec) {
     if (spec == nullptr) {
         return;
diff --git a/common/speculative.h b/common/speculative.h
@@ -22,6 +22,7 @@ struct common_speculative_draft_params {
     // this flag is used to chain the drafts through all the available implementations
     // after the first successful draft from an implementation, we set it
     //   to false to prevent further drafts for that sequence
+    // at the end of the draft() call, all drafting flags will be reset to false
     bool drafting = false;
 
     // overrides individual configurations (-1 disabled)
@@ -43,8 +44,8 @@ common_speculative_draft_params & common_speculative_get_draft_params(common_spe
 // optionally call once at the beginning of a new generation
 void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, const llama_tokens & prompt);
 
-// TODO: implement [TAG_COMMON_SPECULATIVE_PROCESS]
-//bool common_speculative_process(common_speculative * spec, const llama_batch & batch);
+// process the batch and update the internal state of the speculative context
+bool common_speculative_process(common_speculative * spec, const llama_batch & batch);
 
 // generate drafts for the sequences specified with `common_speculative_get_draft_params`
 void common_speculative_draft(common_speculative * spec);
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
@@ -2676,6 +2676,7 @@ struct server_context_impl {
                         if (ctx_dft) {
                             // TODO: in the future, figure out how to infuse target embeddings to the images
                             //       for now, we skip this for simplicity
+                            //       maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above?
                             res = input_tokens.process_chunk(ctx_dft.get(), mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
                             if (res != 0) {
                                 GGML_ABORT("failed to process multi-modal data on draft context\n");
@@ -2925,36 +2926,44 @@ struct server_context_impl {
             // | Eagle3      | yes          |
             // | DFlash      | yes          | https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4405406982
             //
-            // TODO: move to `common_speculative_process(spec, batch, ...)` [TAG_COMMON_SPECULATIVE_PROCESS]
-            if (ctx_dft) {
-                // TODO: update as needed for MTP, Eagle3, etc.
-                const bool need_tgt_embd = false;
-
-                if (need_tgt_embd) {
-                    llama_synchronize(ctx_tgt);
-                }
-
-                // the logic here varies depending on the speculative decoding method
-                //  - some draft contexts require embeddings from the target context, others don't
-                //  - some draft contexts involve an encoder step to transform the target embeddings to draft embeddings
-                // TODO: extract this in a function ?
-                {
-                    // TODO: hook the embeddings from the last target batch here
-                    if (llama_model_has_encoder(model_dft.get())) {
-                        //llama_encode(ctx_dft, ...);
-
-                        GGML_ABORT("not implemented yet\n");
-                    }
-
-                    const int ret = llama_decode(ctx_dft.get(), batch_view);
-
-                    if (ret != 0) {
-                        SRV_ERR("failed to decode draft batch, ret = %d\n", ret);
-
-                        // TODO: handle error
-                        break;
-                    }
-                }
+            // note: this logic has moved to `common_speculative_process()`
+            //       keeping the sketch here for a bit, until the logic is finalized
+            //
+            //if (ctx_dft) {
+            //    // TODO: update as needed for MTP, Eagle3, etc.
+            //    const bool need_tgt_embd = false;
+
+            //    if (need_tgt_embd) {
+            //        llama_synchronize(ctx_tgt);
+            //    }
+
+            //    // the logic here varies depending on the speculative decoding method
+            //    //  - some draft contexts require embeddings from the target context, others don't
+            //    //  - some draft contexts involve an encoder step to transform the target embeddings to draft embeddings
+            //    // TODO: extract this in a function ?
+            //    {
+            //        // TODO: hook the embeddings from the last target batch here
+            //        if (llama_model_has_encoder(model_dft.get())) {
+            //            //llama_encode(ctx_dft, ...);
+
+            //            GGML_ABORT("not implemented yet\n");
+            //        }
+
+            //        const int ret = llama_decode(ctx_dft.get(), batch_view);
+
+            //        if (ret != 0) {
+            //            SRV_ERR("failed to decode draft batch, ret = %d\n", ret);
+
+            //            // TODO: handle error
+            //            break;
+            //        }
+            //    }
+            //}
+            if (!common_speculative_process(spec.get(), batch_view)) {
+                SRV_ERR("%s", "failed to process speculative batch\n");
+
+                // TODO: handle error
+                break;
             }
 
             // move the head of the batch forward with the number of tokens we just processed