
Commit febf7fb

Merge pull request #503 from janhq/update-dev-from-master-2026-05-02-01-01
Sync master with upstream release b8999
2 parents: 9dbd161 + b97ebdc

209 files changed: 10270 additions & 8873 deletions

Note: this is a large commit, so only part of the diff is shown below.

.github/workflows/python-type-check.yml

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ jobs:
         uses: actions/setup-python@v6
         with:
           python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt ty==0.0.26
+          pip-install: -r requirements/requirements-all.txt ty==0.0.33
       # - name: Type-check with Pyright
       #   uses: jakebailey/pyright-action@v2
       #     with:

common/arg.cpp

Lines changed: 1 addition & 1 deletion
@@ -3499,7 +3499,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_N_MIN"));

     add_opt(common_arg(
-        {"--spec--draft-p-split", "--draft-p-split"}, "P",
+        {"--spec-draft-p-split", "--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.draft.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.draft.p_split = std::stof(value);

common/hf-cache.cpp

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ static fs::path get_cache_directory() {
 #ifndef _WIN32
     const struct passwd * pw = getpwuid(getuid());

-    if (pw->pw_dir && *pw->pw_dir) {
+    if (pw && pw->pw_dir && *pw->pw_dir) {
         return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
     }
 #endif
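
The fix guards against getpwuid() returning NULL, which can happen when the calling UID has no passwd entry (minimal containers, stripped-down chroots). A small standalone sketch of the same defensive pattern; the $HOME fallback and the helper name are illustrative additions, not part of the patch:

#include <pwd.h>
#include <unistd.h>

#include <cstdlib>
#include <string>

// Sketch only: resolve the user's home directory without trusting getpwuid().
static std::string home_directory() {
    const struct passwd * pw = getpwuid(getuid());
    if (pw && pw->pw_dir && *pw->pw_dir) {
        return pw->pw_dir;   // normal case: passwd entry with a home directory
    }
    const char * home = std::getenv("HOME");
    if (home && *home) {
        return home;         // fallback: environment variable (illustrative)
    }
    return {};               // caller decides what to do when nothing is known
}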

common/speculative.cpp

Lines changed: 44 additions & 55 deletions
@@ -167,16 +167,14 @@ struct common_speculative_checkpoint {
     size_t size() const {
         return data.size();
     }
-
-    size_t ckpt_size = 0;
 };

 struct common_speculative_state_draft : public common_speculative_state {
     llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
     llama_context * ctx_dft;

     bool use_ckpt = false;
-    struct common_speculative_checkpoint ckpt;
+    common_speculative_checkpoint ckpt;

     common_sampler * smpl;

@@ -249,26 +247,16 @@ struct common_speculative_state_draft : public common_speculative_state {
         llama_batch_free(batch);
     }

-    void begin(const llama_tokens & prompt) override {
-        if (use_ckpt && ckpt.size() > 0) {
-            // delete checkpoint
-            LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
-                __func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
-            ckpt.pos_min = 0;
-            ckpt.pos_max = 0;
-            ckpt.n_tokens = 0;
-            ckpt.ckpt_size = 0;
-            ckpt.data.clear();
-        }
+    void begin(const llama_tokens & /*prompt*/) override {
     }

-    size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
+    size_t create_checkpoint(int n_tokens_prompt) {
         int slot_id = 0;
         const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

         ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
         ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
-        ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
+        ckpt.n_tokens = n_tokens_prompt;
         ckpt.data.resize(checkpoint_size);

         const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
@@ -281,13 +269,13 @@ struct common_speculative_state_draft : public common_speculative_state {
         return n;
     }

-    size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
+    size_t restore_checkpoint() {
         int slot_id = 0;
         LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
         const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-        if (n != ckpt_size_part_expected) {
-            GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
-                __func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
+        if (n != ckpt.size()) {
+            GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu",
+                __func__, ckpt.pos_min, ckpt.pos_max, ckpt.size());
         }
         llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);

@@ -346,35 +334,45 @@ struct common_speculative_state_draft : public common_speculative_state {

         const int i_start = std::max<int>(0, (int) prompt_cur.size() - n_ctx);

+        if (use_ckpt && i_start > 0) {
+            LOG_WRN("%s: context shift is not supported with checkpoint-based contexts - skipping\n", __func__);
+            return;
+        }
+
         // reuse as much as possible from the old draft context
         // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
         for (int i = 0; i < (int) prompt_dft.size(); ++i) {
             int cur = 0;
             while (i_start + cur < (int) prompt_cur.size() &&
-                   i + cur < (int) prompt_dft.size() &&
-                   prompt_cur[i_start + cur] == prompt_dft[i + cur]) {
+                    i + cur < (int) prompt_dft.size() &&
+                    prompt_cur[i_start + cur] == prompt_dft[i + cur]) {
                 cur++;
             }

             if ((cur >= 256 || n_ctx >= (int) prompt_cur.size()) && cur > reuse_n) {
                 reuse_i = i;
                 reuse_n = cur;
             }
+
+            if (use_ckpt) {
+                break;
+            }
         }

         LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
             __func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
-        if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) {
-            LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
-                __func__, reuse_i, reuse_n);
+        if (use_ckpt && ckpt.n_tokens > reuse_n) {
+            LOG_DBG("%s: checkpoint (n_tokens = %d) is outdated -> delete it\n", __func__, (int) ckpt.n_tokens);
+
             reuse_i = 0;
             reuse_n = 0;
+
+            ckpt = {};
         }

         result.clear();
         result.reserve(sparams.n_max);

-        bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
         if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
             llama_memory_clear(mem_dft, false);
             prompt_dft.clear();
@@ -393,50 +391,38 @@ struct common_speculative_state_draft : public common_speculative_state {
                 return;
             }

-            bool do_restore = false;
-            if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
-                // This can happen after a partial acceptance (speculative decoding with checkpoints)
-                LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
-                    __func__, prompt_dft.size(), prompt_cur.size());
-                prompt_dft.resize(prompt_cur.size());
-                do_restore = true;
-            }
-
             if (reuse_i > 0) {
+                GGML_ASSERT(!use_ckpt);
+
                 bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
                 if (!is_removed) {
                     LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
+                    return;
                 }
                 llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);

                 prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
             }

-            if (reuse_n < (int) prompt_dft.size() || do_restore) {
+            if (reuse_n < (int) prompt_dft.size()) {
                 if (use_ckpt) {
-                    if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
-                        LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
-                            __func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
+                    if (ckpt.n_tokens > 0) {
+                        LOG_DBG("%s: restoring checkpoint, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size());
+                        restore_checkpoint();
+                        reuse_n = ckpt.n_tokens;
+                        prompt_dft.resize(reuse_n);
                     }
-                    draft_restore_checkpoint(ckpt.ckpt_size);
-                    reuse_n = ckpt.n_tokens;
-                    prompt_dft.resize(reuse_n);
-                    needs_ckpt = false;
                 } else {
-                    bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
+                    const bool is_removed = llama_memory_seq_rm(mem_dft, 0, reuse_n, -1);
                     if (!is_removed) {
-                        LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
-                            __func__, reuse_n, prompt_dft.size());
+                        LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size());
+                        return;
                     }
                     prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
                 }
             }
         }

-        if (needs_ckpt) {
-            ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
-        }
-
         // prepare a batch to evaluate any new tokens in the prompt
         common_batch_clear(batch);

@@ -450,12 +436,17 @@ struct common_speculative_state_draft : public common_speculative_state {
         // we should rarely end-up here during normal decoding
         if (batch.n_tokens > 0) {
             //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
+            LOG_DBG("%s: draft prompt batch: %d tokens\n", __func__, batch.n_tokens);

             int ret = llama_decode(ctx_dft, batch);
             if (ret != 0 && ret != 1) {
                 LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
                     __func__, ret, prompt_cur.size());
             }
+
+            if (use_ckpt) {
+                create_checkpoint(prompt_dft.size());
+            }
         }

         const llama_pos n_past = prompt_dft.size();
@@ -784,17 +775,15 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
     }

     void accept(uint16_t n_accepted) override {
-        if (verbose) {
-            LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
-        }
-
         // compute acceptance fraction if we have a recorded draft length
         if (n_draft_last > 0) {
             const double f_acc = (double)n_accepted / (double)n_draft_last;
             if (f_acc < 0.5) {
                 n_low++;
                 if (n_low >= 3) {
-                    LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
+                    if (verbose) {
+                        LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
+                    }

                     mod.reset();
                     n_low = 0;
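
Net effect of these hunks: the separate ckpt_size bookkeeping field is gone (ckpt.data.size() is authoritative), the checkpoint is captured right after the draft prompt batch is decoded, and it is either restored when the draft context has to be rolled back or discarded with ckpt = {} once it no longer covers the reusable prefix. A compressed sketch of that ordering, using the llama_state_seq_*_ext calls visible above; the buffer type, helper names, and hard-coded sequence 0 are assumptions for illustration:

#include <cstdint>
#include <vector>

#include "llama.h"

// Sketch only: checkpoint lifecycle for a draft context on slot/sequence 0.
struct draft_ckpt {
    llama_pos            pos_max  = 0;
    int64_t              n_tokens = 0;
    std::vector<uint8_t> data;                 // assumed buffer type
};

// 1) right after decoding the draft prompt batch: snapshot the sequence state
static void ckpt_save(llama_context * ctx, draft_ckpt & c, int64_t n_tokens_prompt) {
    const size_t need = llama_state_seq_get_size_ext(ctx, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
    c.data.resize(need);
    c.pos_max  = llama_memory_seq_pos_max(llama_get_memory(ctx), 0);
    c.n_tokens = n_tokens_prompt;
    llama_state_seq_get_data_ext(ctx, c.data.data(), need, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
}

// 2) on rollback: re-inject the snapshot and drop everything decoded past it
static void ckpt_restore(llama_context * ctx, draft_ckpt & c) {
    llama_state_seq_set_data_ext(ctx, c.data.data(), c.data.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
    llama_memory_seq_rm(llama_get_memory(ctx), 0, c.pos_max + 1, -1);
}

// 3) when the checkpoint no longer covers the reusable prefix, discard it
//    (in the patch: if (use_ckpt && ckpt.n_tokens > reuse_n) { ...; ckpt = {}; })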

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
@@ -6658,7 +6658,7 @@ def _xlmroberta_set_vocab(self) -> None:

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size  # ty: ignore[invalid-assignment]
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         if isinstance(tokenizer, SentencePieceProcessor):
             for token_id in range(tokenizer.vocab_size()):

examples/speculative/speculative.cpp

Lines changed: 18 additions & 9 deletions
@@ -110,13 +110,21 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    if (
-        llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
-        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
-        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
-        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
-    ) {
-        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
+        (llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
+        LOG_ERR("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
+            __func__,
+            llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
+            llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
+        return 1;
+    }
+
+    if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
+        (llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
+        LOG_ERR("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
+            __func__,
+            llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
+            llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
         return 1;
     }

@@ -137,11 +145,12 @@ int main(int argc, char ** argv) {
     for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
         const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
         const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
+
         if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
             LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
             LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
-                common_token_to_piece(ctx_tgt, i).c_str(),
-                common_token_to_piece(ctx_dft, i).c_str());
+                common_token_to_piece(vocab_tgt, i).c_str(),
+                common_token_to_piece(vocab_dft, i).c_str());
             return 1;
         }
     }
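
The check is split so that BOS and EOS ids are only compared when the respective token is actually added by the vocab: draft/target pairs that both skip BOS (or EOS) no longer fail just because their placeholder ids differ. A standalone sketch of the relaxed rule, using only the llama_vocab getters that appear above; the helper name is hypothetical:

#include "llama.h"

// Sketch only: a special token is compatible if both vocabs agree on whether it
// is added, and, when it is added, on its id.
static bool special_token_compatible(const llama_vocab * tgt, const llama_vocab * dft,
                                     bool        (*get_add)(const llama_vocab *),
                                     llama_token (*get_id) (const llama_vocab *)) {
    if (get_add(tgt) != get_add(dft)) {
        return false;  // one side adds the token, the other does not
    }
    if (get_add(tgt) && get_id(tgt) != get_id(dft)) {
        return false;  // both add it, but with different ids
    }
    return true;       // either neither adds it, or both agree on the id
}

// usage:
//   special_token_compatible(vocab_tgt, vocab_dft, llama_vocab_get_add_bos, llama_vocab_bos)
//   special_token_compatible(vocab_tgt, vocab_dft, llama_vocab_get_add_eos, llama_vocab_eos)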

ggml/src/ggml-backend-meta.cpp

Lines changed: 1 addition & 1 deletion
@@ -2100,8 +2100,8 @@ static const ggml_backend_i ggml_backend_meta_i = {
     /* .free = */ ggml_backend_meta_free,
     /* .set_tensor_async = */ ggml_backend_meta_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_meta_get_tensor_async,
-    /* .get_tensor_2d_async = */ nullptr,
     /* .set_tensor_2d_async = */ nullptr,
+    /* .get_tensor_2d_async = */ nullptr,
     /* .cpy_tensor_async = */ nullptr,
     /* .synchronize = */ ggml_backend_meta_synchronize,
     /* .graph_plan_create = */ nullptr,
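
This hunk, and the matching ones in the blas, cann, and cpu backends below, reorders the two 2D-async entries so that the initializer list follows the field order of the interface struct (set before get). With positional aggregate initialization the /* .name = */ comments are decoration only; each value binds to the next declared field, so listing the entries in the wrong order silently wires both pointers to the wrong slots. A self-contained illustration with a stand-in struct (not the real ggml_backend_i):

#include <cstdio>

typedef void (*tensor_2d_fn)(void);

static void set_2d_impl(void) { std::puts("set_tensor_2d_async"); }
static void get_2d_impl(void) { std::puts("get_tensor_2d_async"); }

struct backend_iface_stub {
    tensor_2d_fn set_tensor_2d_async;   // declared first
    tensor_2d_fn get_tensor_2d_async;   // declared second
};

// correct: initializer order matches declaration order
static const backend_iface_stub iface = {
    /* .set_tensor_2d_async = */ set_2d_impl,
    /* .get_tensor_2d_async = */ get_2d_impl,
};

int main(void) {
    iface.set_tensor_2d_async();   // prints "set_tensor_2d_async"
    iface.get_tensor_2d_async();   // prints "get_tensor_2d_async"
    // had the two initializers been swapped, these calls would be crossed,
    // with no compiler diagnostic about the mismatched comments
    return 0;
}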

ggml/src/ggml-blas/ggml-blas.cpp

Lines changed: 2 additions & 2 deletions
@@ -262,9 +262,9 @@ static struct ggml_backend_i blas_backend_i = {
     /* .get_name = */ ggml_backend_blas_get_name,
     /* .free = */ ggml_backend_blas_free,
     /* .set_tensor_async = */ NULL,
-    /* .get_tensor_2d_async = */ NULL,
-    /* .set_tensor_2d_async = */ NULL,
     /* .get_tensor_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 1 addition & 1 deletion
@@ -2746,8 +2746,8 @@ static const ggml_backend_i ggml_backend_cann_interface = {
     /* .free = */ ggml_backend_cann_free,
     /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
-    /* .get_tensor_2d_async = */ NULL,
     /* .set_tensor_2d_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
     /* .synchronize = */ ggml_backend_cann_synchronize,
     /* .graph_plan_create = */ NULL,

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 1 addition & 1 deletion
@@ -195,8 +195,8 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .free = */ ggml_backend_cpu_free,
     /* .set_tensor_async = */ NULL,
     /* .get_tensor_async = */ NULL,
-    /* .get_tensor_2d_async = */ NULL,
     /* .set_tensor_2d_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
