
Commit febf7fb

Merge pull request #503 from janhq/update-dev-from-master-2026-05-02-01-01
Sync master with upstream release b8999
2 parents: 9dbd161 + b97ebdc

209 files changed: 10270 additions & 8873 deletions

Note: this is a large commit, so only part of the diff is shown below.

.github/workflows/python-type-check.yml

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ jobs:
         uses: actions/setup-python@v6
         with:
           python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt ty==0.0.26
+          pip-install: -r requirements/requirements-all.txt ty==0.0.33
       # - name: Type-check with Pyright
       #   uses: jakebailey/pyright-action@v2
       #     with:

common/arg.cpp

Lines changed: 1 addition & 1 deletion
@@ -3499,7 +3499,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_N_MIN"));

     add_opt(common_arg(
-        {"--spec--draft-p-split", "--draft-p-split"}, "P",
+        {"--spec-draft-p-split", "--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.draft.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.draft.p_split = std::stof(value);

common/hf-cache.cpp

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ static fs::path get_cache_directory() {
 #ifndef _WIN32
     const struct passwd * pw = getpwuid(getuid());

-    if (pw->pw_dir && *pw->pw_dir) {
+    if (pw && pw->pw_dir && *pw->pw_dir) {
         return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
     }
 #endif
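
The fix guards against getpwuid() returning NULL, which can happen when the calling UID has no passwd entry (minimal containers, stripped-down chroots). A small standalone sketch of the same defensive pattern; the $HOME fallback and the helper name are illustrative additions, not part of the patch:

#include <pwd.h>
#include <unistd.h>

#include <cstdlib>
#include <string>

// Sketch only: resolve the user's home directory without trusting getpwuid().
static std::string home_directory() {
    const struct passwd * pw = getpwuid(getuid());
    if (pw && pw->pw_dir && *pw->pw_dir) {
        return pw->pw_dir;   // normal case: passwd entry with a home directory
    }
    const char * home = std::getenv("HOME");
    if (home && *home) {
        return home;         // fallback: environment variable (illustrative)
    }
    return {};               // caller decides what to do when nothing is known
}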

common/speculative.cpp

Lines changed: 44 additions & 55 deletions
@@ -167,16 +167,14 @@ struct common_speculative_checkpoint {
     size_t size() const {
         return data.size();
     }
-
-    size_t ckpt_size = 0;
 };

 struct common_speculative_state_draft : public common_speculative_state {
     llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
     llama_context * ctx_dft;

     bool use_ckpt = false;
-    struct common_speculative_checkpoint ckpt;
+    common_speculative_checkpoint ckpt;

     common_sampler * smpl;

@@ -249,26 +247,16 @@ struct common_speculative_state_draft : public common_speculative_state {
         llama_batch_free(batch);
     }

-    void begin(const llama_tokens & prompt) override {
-        if (use_ckpt && ckpt.size() > 0) {
-            // delete checkpoint
-            LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
-                __func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
-            ckpt.pos_min = 0;
-            ckpt.pos_max = 0;
-            ckpt.n_tokens = 0;
-            ckpt.ckpt_size = 0;
-            ckpt.data.clear();
-        }
+    void begin(const llama_tokens & /*prompt*/) override {
     }

-    size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
+    size_t create_checkpoint(int n_tokens_prompt) {
         int slot_id = 0;
         const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

         ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
         ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
-        ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
+        ckpt.n_tokens = n_tokens_prompt;
         ckpt.data.resize(checkpoint_size);

         const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
@@ -281,13 +269,13 @@ struct common_speculative_state_draft : public common_speculative_state {
         return n;
     }

-    size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
+    size_t restore_checkpoint() {
         int slot_id = 0;
         LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
         const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-        if (n != ckpt_size_part_expected) {
-            GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
-                __func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
+        if (n != ckpt.size()) {
+            GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu",
+                __func__, ckpt.pos_min, ckpt.pos_max, ckpt.size());
         }
         llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);

@@ -346,35 +334,45 @@ struct common_speculative_state_draft : public common_speculative_state {

         const int i_start = std::max<int>(0, (int) prompt_cur.size() - n_ctx);

+        if (use_ckpt && i_start > 0) {
+            LOG_WRN("%s: context shift is not supported with checkpoint-based contexts - skipping\n", __func__);
+            return;
+        }
+
         // reuse as much as possible from the old draft context
         // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
         for (int i = 0; i < (int) prompt_dft.size(); ++i) {
             int cur = 0;
             while (i_start + cur < (int) prompt_cur.size() &&
-                   i + cur < (int) prompt_dft.size() &&
-                   prompt_cur[i_start + cur] == prompt_dft[i + cur]) {
+                    i + cur < (int) prompt_dft.size() &&
+                    prompt_cur[i_start + cur] == prompt_dft[i + cur]) {
                 cur++;
             }

             if ((cur >= 256 || n_ctx >= (int) prompt_cur.size()) && cur > reuse_n) {
                 reuse_i = i;
                 reuse_n = cur;
             }
+
+            if (use_ckpt) {
+                break;
+            }
         }

         LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
             __func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
-        if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) {
-            LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
-                __func__, reuse_i, reuse_n);
+        if (use_ckpt && ckpt.n_tokens > reuse_n) {
+            LOG_DBG("%s: checkpoint (n_tokens = %d) is outdated -> delete it\n", __func__, (int) ckpt.n_tokens);
+
             reuse_i = 0;
             reuse_n = 0;
+
+            ckpt = {};
         }

         result.clear();
         result.reserve(sparams.n_max);

-        bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
         if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
             llama_memory_clear(mem_dft, false);
             prompt_dft.clear();
@@ -393,50 +391,38 @@ struct common_speculative_state_draft : public common_speculative_state {
                 return;
             }

-            bool do_restore = false;
-            if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
-                // This can happen after a partial acceptance (speculative decoding with checkpoints)
-                LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
-                    __func__, prompt_dft.size(), prompt_cur.size());
-                prompt_dft.resize(prompt_cur.size());
-                do_restore = true;
-            }
-
             if (reuse_i > 0) {
+                GGML_ASSERT(!use_ckpt);
+
                 bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
                 if (!is_removed) {
                     LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
+                    return;
                 }
                 llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);

                 prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
             }

-            if (reuse_n < (int) prompt_dft.size() || do_restore) {
+            if (reuse_n < (int) prompt_dft.size()) {
                 if (use_ckpt) {
-                    if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
-                        LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
-                            __func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
+                    if (ckpt.n_tokens > 0) {
+                        LOG_DBG("%s: restoring checkpoint, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size());
+                        restore_checkpoint();
+                        reuse_n = ckpt.n_tokens;
+                        prompt_dft.resize(reuse_n);
                     }
-                    draft_restore_checkpoint(ckpt.ckpt_size);
-                    reuse_n = ckpt.n_tokens;
-                    prompt_dft.resize(reuse_n);
-                    needs_ckpt = false;
                 } else {
-                    bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
+                    const bool is_removed = llama_memory_seq_rm(mem_dft, 0, reuse_n, -1);
                     if (!is_removed) {
-                        LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
-                            __func__, reuse_n, prompt_dft.size());
+                        LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size());
+                        return;
                     }
                     prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
                 }
             }
         }

-        if (needs_ckpt) {
-            ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
-        }
-
         // prepare a batch to evaluate any new tokens in the prompt
         common_batch_clear(batch);

@@ -450,12 +436,17 @@ struct common_speculative_state_draft : public common_speculative_state {
         // we should rarely end-up here during normal decoding
         if (batch.n_tokens > 0) {
             //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
+            LOG_DBG("%s: draft prompt batch: %d tokens\n", __func__, batch.n_tokens);

             int ret = llama_decode(ctx_dft, batch);
             if (ret != 0 && ret != 1) {
                 LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
                     __func__, ret, prompt_cur.size());
             }
+
+            if (use_ckpt) {
+                create_checkpoint(prompt_dft.size());
+            }
         }

         const llama_pos n_past = prompt_dft.size();
@@ -784,17 +775,15 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
     }

     void accept(uint16_t n_accepted) override {
-        if (verbose) {
-            LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
-        }
-
         // compute acceptance fraction if we have a recorded draft length
         if (n_draft_last > 0) {
             const double f_acc = (double)n_accepted / (double)n_draft_last;
             if (f_acc < 0.5) {
                 n_low++;
                 if (n_low >= 3) {
-                    LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
+                    if (verbose) {
+                        LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
+                    }

                     mod.reset();
                     n_low = 0;
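
Net effect of these hunks: the separate ckpt_size bookkeeping field is gone (ckpt.data.size() is authoritative), the checkpoint is captured right after the draft prompt batch is decoded, and it is either restored when the draft context has to be rolled back or discarded with ckpt = {} once it no longer covers the reusable prefix. A compressed sketch of that ordering, using the llama_state_seq_*_ext calls visible above; the buffer type, helper names, and hard-coded sequence 0 are assumptions for illustration:

#include <cstdint>
#include <vector>

#include "llama.h"

// Sketch only: checkpoint lifecycle for a draft context on slot/sequence 0.
struct draft_ckpt {
    llama_pos            pos_max  = 0;
    int64_t              n_tokens = 0;
    std::vector<uint8_t> data;                 // assumed buffer type
};

// 1) right after decoding the draft prompt batch: snapshot the sequence state
static void ckpt_save(llama_context * ctx, draft_ckpt & c, int64_t n_tokens_prompt) {
    const size_t need = llama_state_seq_get_size_ext(ctx, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
    c.data.resize(need);
    c.pos_max  = llama_memory_seq_pos_max(llama_get_memory(ctx), 0);
    c.n_tokens = n_tokens_prompt;
    llama_state_seq_get_data_ext(ctx, c.data.data(), need, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
}

// 2) on rollback: re-inject the snapshot and drop everything decoded past it
static void ckpt_restore(llama_context * ctx, draft_ckpt & c) {
    llama_state_seq_set_data_ext(ctx, c.data.data(), c.data.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
    llama_memory_seq_rm(llama_get_memory(ctx), 0, c.pos_max + 1, -1);
}

// 3) when the checkpoint no longer covers the reusable prefix, discard it
//    (in the patch: if (use_ckpt && ckpt.n_tokens > reuse_n) { ...; ckpt = {}; })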

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
@@ -6658,7 +6658,7 @@ def _xlmroberta_set_vocab(self) -> None:

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size  # ty: ignore[invalid-assignment]
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         if isinstance(tokenizer, SentencePieceProcessor):
             for token_id in range(tokenizer.vocab_size()):

examples/speculative/speculative.cpp

Lines changed: 18 additions & 9 deletions
@@ -110,13 +110,21 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    if (
-        llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
-        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
-        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
-        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
-    ) {
-        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
+        (llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
+        LOG_ERR("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
+            __func__,
+            llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
+            llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
+        return 1;
+    }
+
+    if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
+        (llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
+        LOG_ERR("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
+            __func__,
+            llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
+            llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
         return 1;
     }

@@ -137,11 +145,12 @@ int main(int argc, char ** argv) {
     for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
         const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
         const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
+
         if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
             LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
             LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
-                common_token_to_piece(ctx_tgt, i).c_str(),
-                common_token_to_piece(ctx_dft, i).c_str());
+                common_token_to_piece(vocab_tgt, i).c_str(),
+                common_token_to_piece(vocab_dft, i).c_str());
             return 1;
         }
     }
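
The check is split so that BOS and EOS ids are only compared when the respective token is actually added by the vocab: draft/target pairs that both skip BOS (or EOS) no longer fail just because their placeholder ids differ. A standalone sketch of the relaxed rule, using only the llama_vocab getters that appear above; the helper name is hypothetical:

#include "llama.h"

// Sketch only: a special token is compatible if both vocabs agree on whether it
// is added, and, when it is added, on its id.
static bool special_token_compatible(const llama_vocab * tgt, const llama_vocab * dft,
                                     bool        (*get_add)(const llama_vocab *),
                                     llama_token (*get_id) (const llama_vocab *)) {
    if (get_add(tgt) != get_add(dft)) {
        return false;  // one side adds the token, the other does not
    }
    if (get_add(tgt) && get_id(tgt) != get_id(dft)) {
        return false;  // both add it, but with different ids
    }
    return true;       // either neither adds it, or both agree on the id
}

// usage:
//   special_token_compatible(vocab_tgt, vocab_dft, llama_vocab_get_add_bos, llama_vocab_bos)
//   special_token_compatible(vocab_tgt, vocab_dft, llama_vocab_get_add_eos, llama_vocab_eos)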

ggml/src/ggml-backend-meta.cpp

Lines changed: 1 addition & 1 deletion
@@ -2100,8 +2100,8 @@ static const ggml_backend_i ggml_backend_meta_i = {
     /* .free = */ ggml_backend_meta_free,
     /* .set_tensor_async = */ ggml_backend_meta_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_meta_get_tensor_async,
-    /* .get_tensor_2d_async = */ nullptr,
     /* .set_tensor_2d_async = */ nullptr,
+    /* .get_tensor_2d_async = */ nullptr,
     /* .cpy_tensor_async = */ nullptr,
     /* .synchronize = */ ggml_backend_meta_synchronize,
     /* .graph_plan_create = */ nullptr,
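
This hunk, and the matching ones in the blas, cann, and cpu backends below, reorders the two 2D-async entries so that the initializer list follows the field order of the interface struct (set before get). With positional aggregate initialization the /* .name = */ comments are decoration only; each value binds to the next declared field, so listing the entries in the wrong order silently wires both pointers to the wrong slots. A self-contained illustration with a stand-in struct (not the real ggml_backend_i):

#include <cstdio>

typedef void (*tensor_2d_fn)(void);

static void set_2d_impl(void) { std::puts("set_tensor_2d_async"); }
static void get_2d_impl(void) { std::puts("get_tensor_2d_async"); }

struct backend_iface_stub {
    tensor_2d_fn set_tensor_2d_async;   // declared first
    tensor_2d_fn get_tensor_2d_async;   // declared second
};

// correct: initializer order matches declaration order
static const backend_iface_stub iface = {
    /* .set_tensor_2d_async = */ set_2d_impl,
    /* .get_tensor_2d_async = */ get_2d_impl,
};

int main(void) {
    iface.set_tensor_2d_async();   // prints "set_tensor_2d_async"
    iface.get_tensor_2d_async();   // prints "get_tensor_2d_async"
    // had the two initializers been swapped, these calls would be crossed,
    // with no compiler diagnostic about the mismatched comments
    return 0;
}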

ggml/src/ggml-blas/ggml-blas.cpp

Lines changed: 2 additions & 2 deletions
@@ -262,9 +262,9 @@ static struct ggml_backend_i blas_backend_i = {
     /* .get_name = */ ggml_backend_blas_get_name,
     /* .free = */ ggml_backend_blas_free,
     /* .set_tensor_async = */ NULL,
-    /* .get_tensor_2d_async = */ NULL,
-    /* .set_tensor_2d_async = */ NULL,
     /* .get_tensor_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 1 addition & 1 deletion
@@ -2746,8 +2746,8 @@ static const ggml_backend_i ggml_backend_cann_interface = {
     /* .free = */ ggml_backend_cann_free,
     /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
-    /* .get_tensor_2d_async = */ NULL,
     /* .set_tensor_2d_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
     /* .synchronize = */ ggml_backend_cann_synchronize,
     /* .graph_plan_create = */ NULL,

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 1 addition & 1 deletion
@@ -195,8 +195,8 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .free = */ ggml_backend_cpu_free,
     /* .set_tensor_async = */ NULL,
     /* .get_tensor_async = */ NULL,
-    /* .get_tensor_2d_async = */ NULL,
     /* .set_tensor_2d_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
