srogmann
diff --git a/common/speculative.cpp b/common/speculative.cpp
Lines changed: 2 additions & 144 deletions
diff --git a/common/speculative.h b/common/speculative.h
Lines changed: 4 additions & 42 deletions
@@ -1081,7 +1081,8 @@ common_speculative * common_speculative_init(
     }
 
     auto * result = new common_speculative {
-        /* .impls = */ std::move(impls)
+        /* .impls     = */ std::move(impls),
+        /* .curr_impl = */ nullptr,
     };
 
     return result;
@@ -1187,146 +1188,3 @@ void common_speculative_print_stats(const common_speculative * spec) {
                 str_perf.c_str());
     }
 }
-
-struct common_speculative_session::impl {
-    common_params_speculative params;
-
-    common_speculative * spec = nullptr;
-
-    bool has_partial = false;
-
-    llama_tokens draft;
-
-    impl(
-        const common_params_speculative & params,
-              llama_context             * ctx_tgt) : params(params) {
-        spec = common_speculative_init(this->params, ctx_tgt);
-    }
-
-    void begin(const llama_tokens & prompt_history) const {
-        common_speculative_begin(spec, prompt_history);
-    }
-
-    bool generate_draft(
-               const llama_tokens & tokens,
-                     llama_token    id_last,
-               const int            n_draft_max) {
-        GGML_ASSERT(spec);
-
-        if (n_draft_max == 0) {
-            this->clear();
-            return false;
-        }
-
-        if (has_partial) {
-            if (draft.empty()) {
-                this->clear();
-            }
-
-            LOG_DBG("%s: reuse shortened draft, #tokens=%zu, id_last=%d, size=%zu\n", __func__, tokens.size(), id_last, draft.size());
-
-            return false;
-        }
-
-        // call the speculative implementation to create a draft
-        draft = common_speculative_draft(spec, params, tokens, id_last);
-        LOG_DBG("draft: id_last=%d, #draft=%zu\n", id_last, draft.size());
-
-        if (draft.empty()) {
-            this->clear();
-            return false;
-        }
-
-        if (draft.size() > (size_t) n_draft_max) {
-            LOG_WRN("draft size %d exceeds max %d, truncating\n", (int) draft.size(), n_draft_max);
-            draft.resize(n_draft_max);
-        }
-
-        if (draft.size() < (size_t) params.n_min) {
-            LOG_DBG("ignoring small draft: %d < %d\n", (int) draft.size(), params.n_min);
-            this->clear();
-            return false;
-        }
-
-        return true;
-    }
-
-    bool accept(llama_tokens ids) {
-        LOG_WRN("%s: n_draft=%zu, ids.size=%zu\n", __func__, draft.size(), ids.size());
-
-        has_partial = false;
-
-        if (ids.size() < draft.size() + 1) {
-            // the main model rejected some tokens
-            if (params.use_checkpoints) {
-                // shorten the draft to the number of accepted tokens
-                draft.resize(ids.size() - 1);
-
-                has_partial = true;
-
-                return false;
-            }
-
-            LOG_DBG("%s: partial acceptance: %zu < %zu\n", __func__, draft.size(), draft.size());
-        }
-
-        draft = std::move(ids);
-
-        common_speculative_accept(spec, draft.size());
-
-        return true;
-    }
-
-    void print_stats() const {
-        GGML_ASSERT(spec);
-
-        common_speculative_print_stats(spec);
-    }
-
-    void clear() {
-        GGML_ASSERT(spec);
-
-        has_partial = false;
-        draft.clear();
-    }
-};
-
-common_speculative_session::common_speculative_session(
-        const common_params_speculative & params,
-              llama_context             * ctx_tgt) : pimpl(new impl{params, ctx_tgt}) {
-}
-
-common_speculative_session::~common_speculative_session() {
-    common_speculative_free(pimpl->spec);
-}
-
-bool common_speculative_session::fail() const {
-    return pimpl->spec == nullptr;
-}
-
-void common_speculative_session::begin(const llama_tokens & prompt_history) {
-    pimpl->begin(prompt_history);
-}
-
-bool common_speculative_session::generate_draft(
-       const llama_tokens & prompt,
-             llama_token    id_last,
-             int            n_draft_max) {
-    return pimpl->generate_draft(prompt, id_last, n_draft_max);
-}
-
-bool common_speculative_session::accept(llama_tokens ids) {
-    return pimpl->accept(std::move(ids));
-}
-
-const llama_tokens & common_speculative_session::get_draft() const {
-    return pimpl->draft;
-}
-
-void common_speculative_session::print_stats() const {
-    pimpl->print_stats();
-}
-
-void common_speculative_session::clear() {
-    pimpl->clear();
-}
@@ -3,15 +3,6 @@
 #include "llama.h"
 #include "common.h"
 
-// common/speculative.h has two interfaces:
-//
-// 1) struct common_speculative with init, begin, draft, accept and print_stats
-//    Simple interface, see examples/speculative/speculative.cpp
-//
-// 2) struct common_speculative_session with struct common_speculative_callback
-//    Complex interface which supports checkpoints, see tools/server/server-context.cpp
-//
-
 struct common_speculative;
 
 // comma separated list of all types
@@ -55,37 +46,8 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);
 
-// speculative decoding which may use checkpoints to rewind in tokens history
-struct common_speculative_session {
-    common_speculative_session(
-            const common_params_speculative & params,
-                  llama_context             * ctx_tgt);
-
-    ~common_speculative_session();
-
-    // no implementations available
-    bool fail() const;
-
-    // call once at the beginning of a new generation
-    // some spec implementations use the prompt history to initialize lookup maps
-    void begin(const llama_tokens & prompt_history);
-
-    // do speculative decoding to compute a draft of tokens
-    bool generate_draft(
-            const llama_tokens & prompt,
-                  llama_token    id_last,
-                  int            n_draft_max);
-
-    // check if and how far the current draft is accepted
-    bool accept(llama_tokens ids);
-
-    const llama_tokens & get_draft() const;
-
-    void print_stats() const;
-
-    void clear();
-
-    private:
-        struct impl;
-        std::unique_ptr<impl> pimpl;
+struct common_speculative_deleter { // deleter functor: releases the object via common_speculative_free
+    void operator()(common_speculative * spec) { common_speculative_free(spec); }
 };
+
+using common_speculative_ptr = std::unique_ptr<common_speculative, common_speculative_deleter>; // owning handle