From 5ac191d2e2649650c4f2a41ebde7b216149a64de Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 19 Apr 2026 08:13:23 +0000 Subject: [PATCH 1/2] fix(turboquant): drop ignore-eos patch, bump fork to b8967-627ebbc The upstream PR #21203 (server: respect the ignore_eos flag) has been merged into the TheTom/llama-cpp-turboquant feature/turboquant-kv-cache branch. With the fix now in-tree, 0001-server-respect-the-ignore-eos-flag.patch no longer applies (git apply sees its additions already present) and the nightly turboquant bump fails. Retire the patch and bump the pin to the first fork revision that carries the merged fix (tag feature-turboquant-kv-cache-b8967-627ebbc). This matches the contract in apply-patches.sh: drop patches once the fork catches up. --- backend/cpp/turboquant/Makefile | 2 +- ...1-server-respect-the-ignore-eos-flag.patch | 83 ------------------- 2 files changed, 1 insertion(+), 84 deletions(-) delete mode 100644 backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch diff --git a/backend/cpp/turboquant/Makefile b/backend/cpp/turboquant/Makefile index 7d0abf0e2e5a..cd3d1ab809da 100644 --- a/backend/cpp/turboquant/Makefile +++ b/backend/cpp/turboquant/Makefile @@ -1,7 +1,7 @@ # Pinned to the HEAD of feature/turboquant-kv-cache on https://github.com/TheTom/llama-cpp-turboquant. # Auto-bumped nightly by .github/workflows/bump_deps.yaml. 
-TURBOQUANT_VERSION?=45f8a066ed5f5bb38c695cec532f6cef9f4efa9d +TURBOQUANT_VERSION?=627ebbc6e27727bd4f65422d8aa60b13404993c8 LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant CMAKE_ARGS?= diff --git a/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch b/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch deleted file mode 100644 index 0f1feed88b3c..000000000000 --- a/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch +++ /dev/null @@ -1,83 +0,0 @@ -From 660600081fb7b9b769ded5c805a2d39a419f0a0d Mon Sep 17 00:00:00 2001 -From: Yuri Khrustalev -Date: Wed, 8 Apr 2026 11:12:15 -0400 -Subject: [PATCH] server: respect the ignore eos flag (#21203) - ---- - tools/server/server-context.cpp | 3 +++ - tools/server/server-context.h | 3 +++ - tools/server/server-task.cpp | 3 ++- - tools/server/server-task.h | 1 + - 4 files changed, 9 insertions(+), 1 deletion(-) - -diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp -index 9d3ac538..b31981c5 100644 ---- a/tools/server/server-context.cpp -+++ b/tools/server/server-context.cpp -@@ -3033,6 +3033,8 @@ server_context_meta server_context::get_meta() const { - /* fim_rep_token */ llama_vocab_fim_rep(impl->vocab), - /* fim_sep_token */ llama_vocab_fim_sep(impl->vocab), - -+ /* logit_bias_eog */ impl->params_base.sampling.logit_bias_eog, -+ - /* model_vocab_type */ llama_vocab_type(impl->vocab), - /* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab), - /* model_n_ctx_train */ llama_model_n_ctx_train(impl->model), -@@ -3117,6 +3119,7 @@ std::unique_ptr server_routes::handle_completions_impl( - ctx_server.vocab, - params, - meta->slot_n_ctx, -+ meta->logit_bias_eog, - data); - task.id_slot = json_value(data, "id_slot", -1); - -diff --git a/tools/server/server-context.h b/tools/server/server-context.h -index d7ce8735..6ea9afc0 100644 ---- a/tools/server/server-context.h -+++ b/tools/server/server-context.h -@@ -39,6 +39,9 
@@ struct server_context_meta { - llama_token fim_rep_token; - llama_token fim_sep_token; - -+ // sampling -+ std::vector logit_bias_eog; -+ - // model meta - enum llama_vocab_type model_vocab_type; - int32_t model_vocab_n_tokens; -diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp -index 4cc87bc5..856b3f0e 100644 ---- a/tools/server/server-task.cpp -+++ b/tools/server/server-task.cpp -@@ -239,6 +239,7 @@ task_params server_task::params_from_json_cmpl( - const llama_vocab * vocab, - const common_params & params_base, - const int n_ctx_slot, -+ const std::vector & logit_bias_eog, - const json & data) { - task_params params; - -@@ -562,7 +563,7 @@ task_params server_task::params_from_json_cmpl( - if (params.sampling.ignore_eos) { - params.sampling.logit_bias.insert( - params.sampling.logit_bias.end(), -- defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end()); -+ logit_bias_eog.begin(), logit_bias_eog.end()); - } - } - -diff --git a/tools/server/server-task.h b/tools/server/server-task.h -index d855bf08..243e47a8 100644 ---- a/tools/server/server-task.h -+++ b/tools/server/server-task.h -@@ -209,6 +209,7 @@ struct server_task { - const llama_vocab * vocab, - const common_params & params_base, - const int n_ctx_slot, -+ const std::vector & logit_bias_eog, - const json & data); - - // utility function --- -2.43.0 - From 838c510be178db49be2d0594b4625cbdf1acfb37 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 19 Apr 2026 16:46:53 +0000 Subject: [PATCH 2/2] fix(turboquant): patch out get_media_marker() call in grpc-server copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI turboquant docker build was failing with: grpc-server.cpp:2825:40: error: use of undeclared identifier 'get_media_marker' The call was added by 7809c5f5 (PR #9412) to propagate the mtmd random per-server media marker upstream landed in ggml-org/llama.cpp#21962. 
The TheTom/llama-cpp-turboquant fork branched before that PR, so its server-common.cpp has no such symbol. Extend patch-grpc-server.sh to substitute get_media_marker() with the legacy "<__media__>" literal in the build-time grpc-server.cpp copy under turboquant-<flavor>-build/. The fork's mtmd_default_marker() returns exactly that string, and the Go layer falls back to the same sentinel when media_marker is empty, so behavior on the turboquant path is unchanged. Patched copy only — the shared source under backend/cpp/llama-cpp/ keeps compiling against vanilla upstream. Verified by running `make docker-build-turboquant` locally end-to-end: all five flavors (avx, avx2, avx512, fallback, grpc+rpc-server) now compile past the previous failure and the image tags successfully. --- backend/cpp/turboquant/patch-grpc-server.sh | 91 +++++++++++++-------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/backend/cpp/turboquant/patch-grpc-server.sh b/backend/cpp/turboquant/patch-grpc-server.sh index 5b534ece10de..c3dd967a078d 100755 --- a/backend/cpp/turboquant/patch-grpc-server.sh +++ b/backend/cpp/turboquant/patch-grpc-server.sh @@ -1,13 +1,22 @@ #!/bin/bash -# Augment the shared backend/cpp/llama-cpp/grpc-server.cpp allow-list of KV-cache -# types so the gRPC `LoadModel` call accepts the TurboQuant-specific -# `turbo2` / `turbo3` / `turbo4` cache types. +# Patch the shared backend/cpp/llama-cpp/grpc-server.cpp *copy* used by the +# turboquant build to account for two gaps between upstream and the fork: # -# We do this on the *copy* sitting in turboquant-<flavor>-build/, never on the -# original under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps -# compiling against vanilla upstream which does not know about GGML_TYPE_TURBO*. +# 1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the +# fork-specific `turbo2` / `turbo3` / `turbo4` cache types. +# 2. 
Replace `get_media_marker()` (added upstream in ggml-org/llama.cpp#21962, +# server-side random per-instance marker) with the legacy "<__media__>" +# literal. The fork branched before that PR, so server-common.cpp has no +# get_media_marker symbol. The fork's mtmd_default_marker() still returns +# "<__media__>", and Go-side tooling falls back to that sentinel when the +# backend does not expose media_marker, so substituting the literal keeps +# behavior identical on the turboquant path. # -# Idempotent: skips the insertion if the marker is already present (so re-runs +# We patch the *copy* sitting in turboquant-<flavor>-build/, never the original +# under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps compiling +# against vanilla upstream. +# +# Idempotent: skips each insertion if its marker is already present (so re-runs # of the same build dir don't double-insert). set -euo pipefail @@ -25,33 +34,47 @@ if [[ ! -f "$SRC" ]]; then fi if grep -q 'GGML_TYPE_TURBO2_0' "$SRC"; then - echo "==> $SRC already has TurboQuant cache types, skipping" - exit 0 -fi + echo "==> $SRC already has TurboQuant cache types, skipping KV allow-list patch" +else + echo "==> patching $SRC to allow turbo2/turbo3/turbo4 KV-cache types" -echo "==> patching $SRC to allow turbo2/turbo3/turbo4 KV-cache types" - -# Insert the three TURBO entries right after the first ` GGML_TYPE_Q5_1,` -# line (the kv_cache_types[] allow-list). Using awk because the builder image -# does not ship python3, and GNU sed's multi-line `a\` quoting is awkward. 
-awk ' - /^ GGML_TYPE_Q5_1,$/ && !done { - print - print " // turboquant fork extras — added by patch-grpc-server.sh" - print " GGML_TYPE_TURBO2_0," - print " GGML_TYPE_TURBO3_0," - print " GGML_TYPE_TURBO4_0," - done = 1 - next - } - { print } - END { - if (!done) { - print "patch-grpc-server.sh: anchor ` GGML_TYPE_Q5_1,` not found" > "/dev/stderr" - exit 1 + # Insert the three TURBO entries right after the first ` GGML_TYPE_Q5_1,` + # line (the kv_cache_types[] allow-list). Using awk because the builder image + # does not ship python3, and GNU sed's multi-line `a\` quoting is awkward. + awk ' + /^ GGML_TYPE_Q5_1,$/ && !done { + print + print " // turboquant fork extras — added by patch-grpc-server.sh" + print " GGML_TYPE_TURBO2_0," + print " GGML_TYPE_TURBO3_0," + print " GGML_TYPE_TURBO4_0," + done = 1 + next } - } -' "$SRC" > "$SRC.tmp" -mv "$SRC.tmp" "$SRC" + { print } + END { + if (!done) { + print "patch-grpc-server.sh: anchor ` GGML_TYPE_Q5_1,` not found" > "/dev/stderr" + exit 1 + } + } + ' "$SRC" > "$SRC.tmp" + mv "$SRC.tmp" "$SRC" + + echo "==> KV allow-list patch OK" +fi + +if grep -q 'get_media_marker()' "$SRC"; then + echo "==> patching $SRC to replace get_media_marker() with legacy \"<__media__>\" literal" + # Only one call site today (ModelMetadata), but replace all occurrences to + # stay robust if upstream adds more. Use a temp file to avoid relying on + # sed -i portability (the builder image uses GNU sed, but keeping this + # consistent with the awk block above). + sed 's/get_media_marker()/"<__media__>"/g' "$SRC" > "$SRC.tmp" + mv "$SRC.tmp" "$SRC" + echo "==> get_media_marker() substitution OK" +else + echo "==> $SRC has no get_media_marker() call, skipping media-marker patch" +fi -echo "==> patched OK" +echo "==> all patches applied"