Skip to content

Commit 39ee9a9

Browse files
committed
chore(llama.cpp): bump to 1ec7ba0c14f33f17e980daeeda5f35b225d41994
Picks up the upstream `spec : parallel drafting support` change (ggml-org/llama.cpp#22838), which reshapes the speculative-decoding API and `server_context_impl`. Adapt the grpc-server wrapper accordingly:

* `common_params_speculative::type` (single enum) became `types` (`std::vector<common_speculative_type>`). Update both the "default to draft when a draft model is set" branch and the `spec_type`/`speculative_type` option parser. The parser now also tolerates comma-separated lists, mirroring the upstream `common_speculative_types_from_names` semantics.
* `common_params_speculative_draft::n_ctx` is gone (draft now shares the target context size). Keep the `draft_ctx_size` option name for backward compatibility and ignore the value rather than failing.
* `server_context_impl::model` was renamed to `model_tgt`; update the two reranker / model-metadata call sites.

Replaces #9763. Builds cleanly under the linux/amd64 cpu-llama-cpp target locally.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent 621c612 commit 39ee9a9

2 files changed

Lines changed: 26 additions & 11 deletions

File tree

backend/cpp/llama-cpp/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11

2-
LLAMA_VERSION?=389ff61d77b5c71cec0cf92fe4e5d01ace80b797
2+
LLAMA_VERSION?=1ec7ba0c14f33f17e980daeeda5f35b225d41994
33
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
44

55
CMAKE_ARGS?=

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -444,8 +444,10 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
444444
if (!request->draftmodel().empty()) {
445445
params.speculative.draft.mparams.path = request->draftmodel();
446446
// Default to draft type if a draft model is set but no explicit type
447-
if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) {
448-
params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT;
447+
const bool no_spec_type = params.speculative.types.empty() ||
448+
(params.speculative.types.size() == 1 && params.speculative.types[0] == COMMON_SPECULATIVE_TYPE_NONE);
449+
if (no_spec_type) {
450+
params.speculative.types = { COMMON_SPECULATIVE_TYPE_DRAFT };
449451
}
450452
}
451453

@@ -673,9 +675,22 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
673675
}
674676
// Speculative decoding options
675677
} else if (!strcmp(optname, "spec_type") || !strcmp(optname, "speculative_type")) {
676-
auto type = common_speculative_type_from_name(optval_str);
677-
if (type != COMMON_SPECULATIVE_TYPE_COUNT) {
678-
params.speculative.type = type;
678+
// Upstream switched to a vector of types (comma-separated for multi-type
679+
// chaining via common_speculative_types_from_names). We keep accepting a
680+
// single value here, but also tolerate comma-separated lists.
681+
std::vector<std::string> names;
682+
std::string item;
683+
for (char c : optval_str) {
684+
if (c == ',') {
685+
if (!item.empty()) { names.push_back(item); item.clear(); }
686+
} else {
687+
item.push_back(c);
688+
}
689+
}
690+
if (!item.empty()) names.push_back(item);
691+
auto parsed = common_speculative_types_from_names(names);
692+
if (!parsed.empty()) {
693+
params.speculative.types = parsed;
679694
}
680695
} else if (!strcmp(optname, "spec_n_max") || !strcmp(optname, "draft_max")) {
681696
if (optval != NULL) {
@@ -710,9 +725,9 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
710725
try { params.speculative.draft.n_gpu_layers = std::stoi(optval_str); } catch (...) {}
711726
}
712727
} else if (!strcmp(optname, "draft_ctx_size")) {
713-
if (optval != NULL) {
714-
try { params.speculative.draft.n_ctx = std::stoi(optval_str); } catch (...) {}
715-
}
728+
// The draft context size is no longer a separate field upstream: the draft
729+
// shares the target context size. Accept the option for backward
730+
// compatibility but silently ignore it.
716731
}
717732
}
718733

@@ -2704,7 +2719,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
27042719

27052720
tasks.reserve(documents.size());
27062721
for (size_t i = 0; i < documents.size(); i++) {
2707-
auto tmp = format_prompt_rerank(ctx_server.impl->model, ctx_server.impl->vocab, ctx_server.impl->mctx, request->query(), documents[i]);
2722+
auto tmp = format_prompt_rerank(ctx_server.impl->model_tgt, ctx_server.impl->vocab, ctx_server.impl->mctx, request->query(), documents[i]);
27082723
server_task task = server_task(SERVER_TASK_TYPE_RERANK);
27092724
task.id = rd.queue_tasks.get_new_id();
27102725
task.index = i;
@@ -2882,7 +2897,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
28822897
// Get template source and reconstruct a common_chat_template for analysis
28832898
std::string tmpl_src = common_chat_templates_source(ctx_server.impl->chat_params.tmpls.get());
28842899
if (!tmpl_src.empty()) {
2885-
const auto * vocab = llama_model_get_vocab(ctx_server.impl->model);
2900+
const auto * vocab = llama_model_get_vocab(ctx_server.impl->model_tgt);
28862901
std::string token_bos, token_eos;
28872902
if (vocab) {
28882903
auto bos_id = llama_vocab_bos(vocab);

0 commit comments

Comments
 (0)