diff --git a/backend/cpp/turboquant/Makefile b/backend/cpp/turboquant/Makefile index 7d0abf0e2e5a..cd3d1ab809da 100644 --- a/backend/cpp/turboquant/Makefile +++ b/backend/cpp/turboquant/Makefile @@ -1,7 +1,7 @@ # Pinned to the HEAD of feature/turboquant-kv-cache on https://github.com/TheTom/llama-cpp-turboquant. # Auto-bumped nightly by .github/workflows/bump_deps.yaml. -TURBOQUANT_VERSION?=45f8a066ed5f5bb38c695cec532f6cef9f4efa9d +TURBOQUANT_VERSION?=627ebbc6e27727bd4f65422d8aa60b13404993c8 LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant CMAKE_ARGS?= diff --git a/backend/cpp/turboquant/patch-grpc-server.sh b/backend/cpp/turboquant/patch-grpc-server.sh index 5b534ece10de..c3dd967a078d 100755 --- a/backend/cpp/turboquant/patch-grpc-server.sh +++ b/backend/cpp/turboquant/patch-grpc-server.sh @@ -1,13 +1,22 @@ #!/bin/bash -# Augment the shared backend/cpp/llama-cpp/grpc-server.cpp allow-list of KV-cache -# types so the gRPC `LoadModel` call accepts the TurboQuant-specific -# `turbo2` / `turbo3` / `turbo4` cache types. +# Patch the shared backend/cpp/llama-cpp/grpc-server.cpp *copy* used by the +# turboquant build to account for two gaps between upstream and the fork: # -# We do this on the *copy* sitting in turboquant--build/, never on the -# original under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps -# compiling against vanilla upstream which does not know about GGML_TYPE_TURBO*. +# 1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the +# fork-specific `turbo2` / `turbo3` / `turbo4` cache types. +# 2. Replace `get_media_marker()` (added upstream in ggml-org/llama.cpp#21962, +# server-side random per-instance marker) with the legacy "<__media__>" +# literal. The fork branched before that PR, so server-common.cpp has no +# get_media_marker symbol. 
The fork's mtmd_default_marker() still returns +# "<__media__>", and Go-side tooling falls back to that sentinel when the +# backend does not expose media_marker, so substituting the literal keeps +# behavior identical on the turboquant path. # -# Idempotent: skips the insertion if the marker is already present (so re-runs +# We patch the *copy* sitting in turboquant--build/, never the original +# under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps compiling +# against vanilla upstream. +# +# Idempotent: each patch step first checks whether it is still needed (so re-runs # of the same build dir don't double-insert). set -euo pipefail @@ -25,33 +34,47 @@ if [[ ! -f "$SRC" ]]; then fi if grep -q 'GGML_TYPE_TURBO2_0' "$SRC"; then - echo "==> $SRC already has TurboQuant cache types, skipping" - exit 0 -fi + echo "==> $SRC already has TurboQuant cache types, skipping KV allow-list patch" +else + echo "==> patching $SRC to allow turbo2/turbo3/turbo4 KV-cache types" -echo "==> patching $SRC to allow turbo2/turbo3/turbo4 KV-cache types" - -# Insert the three TURBO entries right after the first ` GGML_TYPE_Q5_1,` -# line (the kv_cache_types[] allow-list). Using awk because the builder image -# does not ship python3, and GNU sed's multi-line `a\` quoting is awkward. -awk ' - /^ GGML_TYPE_Q5_1,$/ && !done { - print - print " // turboquant fork extras — added by patch-grpc-server.sh" - print " GGML_TYPE_TURBO2_0," - print " GGML_TYPE_TURBO3_0," - print " GGML_TYPE_TURBO4_0," - done = 1 - next - } - { print } - END { - if (!done) { - print "patch-grpc-server.sh: anchor ` GGML_TYPE_Q5_1,` not found" > "/dev/stderr" - exit 1 + # Insert the three TURBO entries right after the first ` GGML_TYPE_Q5_1,` + # line (the kv_cache_types[] allow-list). Using awk because the builder image + # does not ship python3, and GNU sed's multi-line `a\` quoting is awkward.
+ awk ' + /^ GGML_TYPE_Q5_1,$/ && !done { + print + print " // turboquant fork extras — added by patch-grpc-server.sh" + print " GGML_TYPE_TURBO2_0," + print " GGML_TYPE_TURBO3_0," + print " GGML_TYPE_TURBO4_0," + done = 1 + next } - } -' "$SRC" > "$SRC.tmp" -mv "$SRC.tmp" "$SRC" + { print } + END { + if (!done) { + print "patch-grpc-server.sh: anchor ` GGML_TYPE_Q5_1,` not found" > "/dev/stderr" + exit 1 + } + } + ' "$SRC" > "$SRC.tmp" + mv "$SRC.tmp" "$SRC" + + echo "==> KV allow-list patch OK" +fi + +if grep -q 'get_media_marker()' "$SRC"; then + echo "==> patching $SRC to replace get_media_marker() with legacy \"<__media__>\" literal" + # Only one call site today (ModelMetadata), but replace all occurrences to + # stay robust if upstream adds more. Use a temp file to avoid relying on + # sed -i portability (the builder image uses GNU sed, but keeping this + # consistent with the awk block above). + sed 's/get_media_marker()/"<__media__>"/g' "$SRC" > "$SRC.tmp" + mv "$SRC.tmp" "$SRC" + echo "==> get_media_marker() substitution OK" +else + echo "==> $SRC has no get_media_marker() call, skipping media-marker patch" +fi -echo "==> patched OK" +echo "==> all patches applied" diff --git a/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch b/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch deleted file mode 100644 index 0f1feed88b3c..000000000000 --- a/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch +++ /dev/null @@ -1,83 +0,0 @@ -From 660600081fb7b9b769ded5c805a2d39a419f0a0d Mon Sep 17 00:00:00 2001 -From: Yuri Khrustalev -Date: Wed, 8 Apr 2026 11:12:15 -0400 -Subject: [PATCH] server: respect the ignore eos flag (#21203) - ---- - tools/server/server-context.cpp | 3 +++ - tools/server/server-context.h | 3 +++ - tools/server/server-task.cpp | 3 ++- - tools/server/server-task.h | 1 + - 4 files changed, 9 insertions(+), 1 deletion(-) - -diff --git a/tools/server/server-context.cpp 
b/tools/server/server-context.cpp -index 9d3ac538..b31981c5 100644 ---- a/tools/server/server-context.cpp -+++ b/tools/server/server-context.cpp -@@ -3033,6 +3033,8 @@ server_context_meta server_context::get_meta() const { - /* fim_rep_token */ llama_vocab_fim_rep(impl->vocab), - /* fim_sep_token */ llama_vocab_fim_sep(impl->vocab), - -+ /* logit_bias_eog */ impl->params_base.sampling.logit_bias_eog, -+ - /* model_vocab_type */ llama_vocab_type(impl->vocab), - /* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab), - /* model_n_ctx_train */ llama_model_n_ctx_train(impl->model), -@@ -3117,6 +3119,7 @@ std::unique_ptr server_routes::handle_completions_impl( - ctx_server.vocab, - params, - meta->slot_n_ctx, -+ meta->logit_bias_eog, - data); - task.id_slot = json_value(data, "id_slot", -1); - -diff --git a/tools/server/server-context.h b/tools/server/server-context.h -index d7ce8735..6ea9afc0 100644 ---- a/tools/server/server-context.h -+++ b/tools/server/server-context.h -@@ -39,6 +39,9 @@ struct server_context_meta { - llama_token fim_rep_token; - llama_token fim_sep_token; - -+ // sampling -+ std::vector logit_bias_eog; -+ - // model meta - enum llama_vocab_type model_vocab_type; - int32_t model_vocab_n_tokens; -diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp -index 4cc87bc5..856b3f0e 100644 ---- a/tools/server/server-task.cpp -+++ b/tools/server/server-task.cpp -@@ -239,6 +239,7 @@ task_params server_task::params_from_json_cmpl( - const llama_vocab * vocab, - const common_params & params_base, - const int n_ctx_slot, -+ const std::vector & logit_bias_eog, - const json & data) { - task_params params; - -@@ -562,7 +563,7 @@ task_params server_task::params_from_json_cmpl( - if (params.sampling.ignore_eos) { - params.sampling.logit_bias.insert( - params.sampling.logit_bias.end(), -- defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end()); -+ logit_bias_eog.begin(), logit_bias_eog.end()); - } - } - -diff --git 
a/tools/server/server-task.h b/tools/server/server-task.h -index d855bf08..243e47a8 100644 ---- a/tools/server/server-task.h -+++ b/tools/server/server-task.h -@@ -209,6 +209,7 @@ struct server_task { - const llama_vocab * vocab, - const common_params & params_base, - const int n_ctx_slot, -+ const std::vector & logit_bias_eog, - const json & data); - - // utility function --- -2.43.0 -