diff --git a/backend/cpp/turboquant/Makefile b/backend/cpp/turboquant/Makefile
index 7d0abf0e2e5a..cd3d1ab809da 100644
--- a/backend/cpp/turboquant/Makefile
+++ b/backend/cpp/turboquant/Makefile
@@ -1,7 +1,7 @@
 
 # Pinned to the HEAD of feature/turboquant-kv-cache on https://github.com/TheTom/llama-cpp-turboquant.
 # Auto-bumped nightly by .github/workflows/bump_deps.yaml.
-TURBOQUANT_VERSION?=45f8a066ed5f5bb38c695cec532f6cef9f4efa9d
+TURBOQUANT_VERSION?=627ebbc6e27727bd4f65422d8aa60b13404993c8
 LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant
 
 CMAKE_ARGS?=
diff --git a/backend/cpp/turboquant/patch-grpc-server.sh b/backend/cpp/turboquant/patch-grpc-server.sh
index 5b534ece10de..c3dd967a078d 100755
--- a/backend/cpp/turboquant/patch-grpc-server.sh
+++ b/backend/cpp/turboquant/patch-grpc-server.sh
@@ -1,13 +1,22 @@
 #!/bin/bash
-# Augment the shared backend/cpp/llama-cpp/grpc-server.cpp allow-list of KV-cache
-# types so the gRPC `LoadModel` call accepts the TurboQuant-specific
-# `turbo2` / `turbo3` / `turbo4` cache types.
+# Patch the shared backend/cpp/llama-cpp/grpc-server.cpp *copy* used by the
+# turboquant build to account for two gaps between upstream and the fork:
 #
-# We do this on the *copy* sitting in turboquant-<flavor>-build/, never on the
-# original under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps
-# compiling against vanilla upstream which does not know about GGML_TYPE_TURBO*.
+#   1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the
+#      fork-specific `turbo2` / `turbo3` / `turbo4` cache types.
+#   2. Replace `get_media_marker()` (added upstream in ggml-org/llama.cpp#21962,
+#      server-side random per-instance marker) with the legacy "<__media__>"
+#      literal. The fork branched before that PR, so server-common.cpp has no
+#      get_media_marker symbol. The fork's mtmd_default_marker() still returns
+#      "<__media__>", and Go-side tooling falls back to that sentinel when the
+#      backend does not expose media_marker, so substituting the literal keeps
+#      behavior identical on the turboquant path.
 #
-# Idempotent: skips the insertion if the marker is already present (so re-runs
+# We patch the *copy* sitting in turboquant-<flavor>-build/, never the original
+# under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps compiling
+# against vanilla upstream.
+#
+# Idempotent: each step is skipped when there is nothing left to patch (so re-runs
 # of the same build dir don't double-insert).
 
 set -euo pipefail
@@ -25,33 +34,47 @@ if [[ ! -f "$SRC" ]]; then
 fi
 
 if grep -q 'GGML_TYPE_TURBO2_0' "$SRC"; then
-    echo "==> $SRC already has TurboQuant cache types, skipping"
-    exit 0
-fi
+    echo "==> $SRC already has TurboQuant cache types, skipping KV allow-list patch"
+else
+    echo "==> patching $SRC to allow turbo2/turbo3/turbo4 KV-cache types"
 
-echo "==> patching $SRC to allow turbo2/turbo3/turbo4 KV-cache types"
-
-# Insert the three TURBO entries right after the first `    GGML_TYPE_Q5_1,`
-# line (the kv_cache_types[] allow-list). Using awk because the builder image
-# does not ship python3, and GNU sed's multi-line `a\` quoting is awkward.
-awk '
-    /^    GGML_TYPE_Q5_1,$/ && !done {
-        print
-        print "    // turboquant fork extras — added by patch-grpc-server.sh"
-        print "    GGML_TYPE_TURBO2_0,"
-        print "    GGML_TYPE_TURBO3_0,"
-        print "    GGML_TYPE_TURBO4_0,"
-        done = 1
-        next
-    }
-    { print }
-    END {
-        if (!done) {
-            print "patch-grpc-server.sh: anchor `    GGML_TYPE_Q5_1,` not found" > "/dev/stderr"
-            exit 1
+    # Insert the three TURBO entries right after the first `    GGML_TYPE_Q5_1,`
+    # line (the kv_cache_types[] allow-list). Using awk because the builder image
+    # does not ship python3, and GNU sed's multi-line `a\` quoting is awkward.
+    awk '
+        /^    GGML_TYPE_Q5_1,$/ && !done {
+            print
+            print "    // turboquant fork extras — added by patch-grpc-server.sh"
+            print "    GGML_TYPE_TURBO2_0,"
+            print "    GGML_TYPE_TURBO3_0,"
+            print "    GGML_TYPE_TURBO4_0,"
+            done = 1
+            next
         }
-    }
-' "$SRC" > "$SRC.tmp"
-mv "$SRC.tmp" "$SRC"
+        { print }
+        END {
+            if (!done) {
+                print "patch-grpc-server.sh: anchor `    GGML_TYPE_Q5_1,` not found" > "/dev/stderr"
+                exit 1
+            }
+        }
+    ' "$SRC" > "$SRC.tmp"
+    mv "$SRC.tmp" "$SRC"
+
+    echo "==> KV allow-list patch OK"
+fi
+
+if grep -q 'get_media_marker()' "$SRC"; then
+    echo "==> patching $SRC to replace get_media_marker() with legacy \"<__media__>\" literal"
+    # Only one call site today (ModelMetadata), but replace all occurrences to
+    # stay robust if upstream adds more. Write to a temp file and mv it into
+    # place instead of using sed -i: the builder image uses GNU sed, but this
+    # avoids sed -i portability issues and mirrors the awk block above.
+    sed 's/get_media_marker()/"<__media__>"/g' "$SRC" > "$SRC.tmp"
+    mv "$SRC.tmp" "$SRC"
+    echo "==> get_media_marker() substitution OK"
+else
+    echo "==> $SRC has no get_media_marker() call, skipping media-marker patch"
+fi
 
-echo "==> patched OK"
+echo "==> all patches applied"
diff --git a/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch b/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch
deleted file mode 100644
index 0f1feed88b3c..000000000000
--- a/backend/cpp/turboquant/patches/0001-server-respect-the-ignore-eos-flag.patch
+++ /dev/null
@@ -1,83 +0,0 @@
-From 660600081fb7b9b769ded5c805a2d39a419f0a0d Mon Sep 17 00:00:00 2001
-From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
-Date: Wed, 8 Apr 2026 11:12:15 -0400
-Subject: [PATCH] server: respect the ignore eos flag (#21203)
-
----
- tools/server/server-context.cpp | 3 +++
- tools/server/server-context.h   | 3 +++
- tools/server/server-task.cpp    | 3 ++-
- tools/server/server-task.h      | 1 +
- 4 files changed, 9 insertions(+), 1 deletion(-)
-
-diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
-index 9d3ac538..b31981c5 100644
---- a/tools/server/server-context.cpp
-+++ b/tools/server/server-context.cpp
-@@ -3033,6 +3033,8 @@ server_context_meta server_context::get_meta() const {
-         /* fim_rep_token          */ llama_vocab_fim_rep(impl->vocab),
-         /* fim_sep_token          */ llama_vocab_fim_sep(impl->vocab),
- 
-+        /* logit_bias_eog         */ impl->params_base.sampling.logit_bias_eog,
-+
-         /* model_vocab_type       */ llama_vocab_type(impl->vocab),
-         /* model_vocab_n_tokens   */ llama_vocab_n_tokens(impl->vocab),
-         /* model_n_ctx_train      */ llama_model_n_ctx_train(impl->model),
-@@ -3117,6 +3119,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
-                     ctx_server.vocab,
-                     params,
-                     meta->slot_n_ctx,
-+                    meta->logit_bias_eog,
-                     data);
-             task.id_slot = json_value(data, "id_slot", -1);
- 
-diff --git a/tools/server/server-context.h b/tools/server/server-context.h
-index d7ce8735..6ea9afc0 100644
---- a/tools/server/server-context.h
-+++ b/tools/server/server-context.h
-@@ -39,6 +39,9 @@ struct server_context_meta {
-     llama_token fim_rep_token;
-     llama_token fim_sep_token;
- 
-+    // sampling
-+    std::vector<llama_logit_bias> logit_bias_eog;
-+
-     // model meta
-     enum llama_vocab_type model_vocab_type;
-     int32_t model_vocab_n_tokens;
-diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
-index 4cc87bc5..856b3f0e 100644
---- a/tools/server/server-task.cpp
-+++ b/tools/server/server-task.cpp
-@@ -239,6 +239,7 @@ task_params server_task::params_from_json_cmpl(
-         const llama_vocab * vocab,
-         const common_params & params_base,
-         const int n_ctx_slot,
-+        const std::vector<llama_logit_bias> & logit_bias_eog,
-         const json & data) {
-     task_params params;
- 
-@@ -562,7 +563,7 @@ task_params server_task::params_from_json_cmpl(
-         if (params.sampling.ignore_eos) {
-             params.sampling.logit_bias.insert(
-                     params.sampling.logit_bias.end(),
--                    defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end());
-+                    logit_bias_eog.begin(), logit_bias_eog.end());
-         }
-     }
- 
-diff --git a/tools/server/server-task.h b/tools/server/server-task.h
-index d855bf08..243e47a8 100644
---- a/tools/server/server-task.h
-+++ b/tools/server/server-task.h
-@@ -209,6 +209,7 @@ struct server_task {
-         const llama_vocab * vocab,
-         const common_params & params_base,
-         const int n_ctx_slot,
-+        const std::vector<llama_logit_bias> & logit_bias_eog,
-         const json & data);
- 
-     // utility function
--- 
-2.43.0
-