Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-type-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: "3.11"
pip-install: -r requirements/requirements-all.txt ty==0.0.26
pip-install: -r requirements/requirements-all.txt ty==0.0.33
# - name: Type-check with Pyright
# uses: jakebailey/pyright-action@v2
# with:
Expand Down
2 changes: 1 addition & 1 deletion common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3499,7 +3499,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_N_MIN"));

add_opt(common_arg(
{"--spec--draft-p-split", "--draft-p-split"}, "P",
{"--spec-draft-p-split", "--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.draft.p_split),
[](common_params & params, const std::string & value) {
params.speculative.draft.p_split = std::stof(value);
Expand Down
2 changes: 1 addition & 1 deletion common/hf-cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ static fs::path get_cache_directory() {
#ifndef _WIN32
const struct passwd * pw = getpwuid(getuid());

if (pw->pw_dir && *pw->pw_dir) {
if (pw && pw->pw_dir && *pw->pw_dir) {
return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
}
#endif
Expand Down
99 changes: 44 additions & 55 deletions common/speculative.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,16 +167,14 @@ struct common_speculative_checkpoint {
size_t size() const {
return data.size();
}

size_t ckpt_size = 0;
};

struct common_speculative_state_draft : public common_speculative_state {
llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
llama_context * ctx_dft;

bool use_ckpt = false;
struct common_speculative_checkpoint ckpt;
common_speculative_checkpoint ckpt;

common_sampler * smpl;

Expand Down Expand Up @@ -249,26 +247,16 @@ struct common_speculative_state_draft : public common_speculative_state {
llama_batch_free(batch);
}

void begin(const llama_tokens & prompt) override {
if (use_ckpt && ckpt.size() > 0) {
// delete checkpoint
LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
__func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
ckpt.pos_min = 0;
ckpt.pos_max = 0;
ckpt.n_tokens = 0;
ckpt.ckpt_size = 0;
ckpt.data.clear();
}
void begin(const llama_tokens & /*prompt*/) override {
}

size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
size_t create_checkpoint(int n_tokens_prompt) {
int slot_id = 0;
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
ckpt.n_tokens = n_tokens_prompt;
ckpt.data.resize(checkpoint_size);

const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
Expand All @@ -281,13 +269,13 @@ struct common_speculative_state_draft : public common_speculative_state {
return n;
}

size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
size_t restore_checkpoint() {
int slot_id = 0;
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != ckpt_size_part_expected) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
if (n != ckpt.size()) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size());
}
llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);

Expand Down Expand Up @@ -346,35 +334,45 @@ struct common_speculative_state_draft : public common_speculative_state {

const int i_start = std::max<int>(0, (int) prompt_cur.size() - n_ctx);

if (use_ckpt && i_start > 0) {
LOG_WRN("%s: context shift is not supported with checkpoint-based contexts - skipping\n", __func__);
return;
}

// reuse as much as possible from the old draft context
// ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
for (int i = 0; i < (int) prompt_dft.size(); ++i) {
int cur = 0;
while (i_start + cur < (int) prompt_cur.size() &&
i + cur < (int) prompt_dft.size() &&
prompt_cur[i_start + cur] == prompt_dft[i + cur]) {
i + cur < (int) prompt_dft.size() &&
prompt_cur[i_start + cur] == prompt_dft[i + cur]) {
cur++;
}

if ((cur >= 256 || n_ctx >= (int) prompt_cur.size()) && cur > reuse_n) {
reuse_i = i;
reuse_n = cur;
}

if (use_ckpt) {
break;
}
}

LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
__func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) {
LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
__func__, reuse_i, reuse_n);
if (use_ckpt && ckpt.n_tokens > reuse_n) {
LOG_DBG("%s: checkpoint (n_tokens = %d) is outdated -> delete it\n", __func__, (int) ckpt.n_tokens);

reuse_i = 0;
reuse_n = 0;

ckpt = {};
}

result.clear();
result.reserve(sparams.n_max);

bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
llama_memory_clear(mem_dft, false);
prompt_dft.clear();
Expand All @@ -393,50 +391,38 @@ struct common_speculative_state_draft : public common_speculative_state {
return;
}

bool do_restore = false;
if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
// This can happen after a partial acceptance (speculative decoding with checkpoints)
LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
__func__, prompt_dft.size(), prompt_cur.size());
prompt_dft.resize(prompt_cur.size());
do_restore = true;
}

if (reuse_i > 0) {
GGML_ASSERT(!use_ckpt);

bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
return;
}
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);

prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
}

if (reuse_n < (int) prompt_dft.size() || do_restore) {
if (reuse_n < (int) prompt_dft.size()) {
if (use_ckpt) {
if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
__func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
if (ckpt.n_tokens > 0) {
LOG_DBG("%s: restoring checkpoint, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size());
restore_checkpoint();
reuse_n = ckpt.n_tokens;
prompt_dft.resize(reuse_n);
}
draft_restore_checkpoint(ckpt.ckpt_size);
reuse_n = ckpt.n_tokens;
prompt_dft.resize(reuse_n);
needs_ckpt = false;
} else {
bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
const bool is_removed = llama_memory_seq_rm(mem_dft, 0, reuse_n, -1);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
__func__, reuse_n, prompt_dft.size());
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size());
return;
}
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
}
}
}

if (needs_ckpt) {
ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
}

// prepare a batch to evaluate any new tokens in the prompt
common_batch_clear(batch);

Expand All @@ -450,12 +436,17 @@ struct common_speculative_state_draft : public common_speculative_state {
// we should rarely end-up here during normal decoding
if (batch.n_tokens > 0) {
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
LOG_DBG("%s: draft prompt batch: %d tokens\n", __func__, batch.n_tokens);

int ret = llama_decode(ctx_dft, batch);
if (ret != 0 && ret != 1) {
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
__func__, ret, prompt_cur.size());
}

if (use_ckpt) {
create_checkpoint(prompt_dft.size());
}
}

const llama_pos n_past = prompt_dft.size();
Expand Down Expand Up @@ -784,17 +775,15 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
}

void accept(uint16_t n_accepted) override {
if (verbose) {
LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
}

// compute acceptance fraction if we have a recorded draft length
if (n_draft_last > 0) {
const double f_acc = (double)n_accepted / (double)n_draft_last;
if (f_acc < 0.5) {
n_low++;
if (n_low >= 3) {
LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
if (verbose) {
LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
}

mod.reset();
n_low = 0;
Expand Down
2 changes: 1 addition & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6658,7 +6658,7 @@ def _xlmroberta_set_vocab(self) -> None:

tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size # ty: ignore[invalid-assignment]
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

if isinstance(tokenizer, SentencePieceProcessor):
for token_id in range(tokenizer.vocab_size()):
Expand Down
27 changes: 18 additions & 9 deletions examples/speculative/speculative.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,21 @@ int main(int argc, char ** argv) {
return 1;
}

if (
llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
) {
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
(llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
LOG_ERR("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
__func__,
llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
return 1;
}

if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
(llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
LOG_ERR("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
__func__,
llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
return 1;
}

Expand All @@ -137,11 +145,12 @@ int main(int argc, char ** argv) {
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);

if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
common_token_to_piece(ctx_tgt, i).c_str(),
common_token_to_piece(ctx_dft, i).c_str());
common_token_to_piece(vocab_tgt, i).c_str(),
common_token_to_piece(vocab_dft, i).c_str());
return 1;
}
}
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-backend-meta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2100,8 +2100,8 @@ static const ggml_backend_i ggml_backend_meta_i = {
/* .free = */ ggml_backend_meta_free,
/* .set_tensor_async = */ ggml_backend_meta_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_meta_get_tensor_async,
/* .get_tensor_2d_async = */ nullptr,
/* .set_tensor_2d_async = */ nullptr,
/* .get_tensor_2d_async = */ nullptr,
/* .cpy_tensor_async = */ nullptr,
/* .synchronize = */ ggml_backend_meta_synchronize,
/* .graph_plan_create = */ nullptr,
Expand Down
4 changes: 2 additions & 2 deletions ggml/src/ggml-blas/ggml-blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,9 @@ static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name,
/* .free = */ ggml_backend_blas_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-cann/ggml-cann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2746,8 +2746,8 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .free = */ ggml_backend_cann_free,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cann_synchronize,
/* .graph_plan_create = */ NULL,
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-cpu/ggml-cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .free = */ ggml_backend_cpu_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
Expand Down
Loading
Loading