From 60499061334888bdd0c903226db42d0ec2f11264 Mon Sep 17 00:00:00 2001
From: Winston Ma <winstonma@ymail.com>
Date: Sun, 17 May 2026 01:57:35 +0800
Subject: [PATCH 1/3] vulkan: removed duplicate #include <memory> in headers
 (#23144)

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 1 -
 1 file changed, 1 deletion(-)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index d29a4bab2e2..a296d0ab446 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -49,7 +49,6 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
 #include <map>
 #include <set>
 #include <unordered_map>
-#include <memory>
 #include <mutex>
 #include <future>
 #include <thread>

From 64b38b561b987679c4e1c6231f93860d3eec2638 Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Sat, 16 May 2026 21:21:06 +0200
Subject: [PATCH 2/3] server: skip device enumeration in router mode to avoid
 creating CUDA primary context (#23137)

---
 common/common.cpp       | 18 +++++++++++-------
 common/common.h         |  2 +-
 tools/server/server.cpp |  6 ++++--
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 8b6d182f549..9cf11ea9f5f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -373,7 +373,7 @@ void common_init() {
     llama_log_set(common_log_default_callback, NULL);
 }
 
-void common_params_print_info(const common_params & params) {
+void common_params_print_info(const common_params & params, bool print_devices) {
 #ifdef NDEBUG
     const char * build_type = "";
 #else
@@ -382,12 +382,16 @@ void common_params_print_info(const common_params & params) {
     LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
 
     LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
-    LOG_INF("device_info:\n");
-    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-        auto * dev = ggml_backend_dev_get(i);
-        size_t free, total;
-        ggml_backend_dev_memory(dev, &free, &total);
-        LOG_INF("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+
+    // device enumeration creates a primary context on CUDA backends; skip it when the caller does not own any device
+    if (print_devices) {
+        LOG_INF("device_info:\n");
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            auto * dev = ggml_backend_dev_get(i);
+            size_t free, total;
+            ggml_backend_dev_memory(dev, &free, &total);
+            LOG_INF("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+        }
     }
     LOG_INF("%s\n", common_params_get_system_info(params).c_str());
 }
diff --git a/common/common.h b/common/common.h
index 4cca9d71568..514bab11942 100644
--- a/common/common.h
+++ b/common/common.h
@@ -708,7 +708,7 @@ struct common_params {
 // initializes the logging system and prints info about the build
 void common_init();
 
-void common_params_print_info(const common_params & params);
+void common_params_print_info(const common_params & params, bool print_devices = true);
 std::string common_params_get_system_info(const common_params & params);
 
 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index a232550789c..c82f1179431 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -86,7 +86,10 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    common_params_print_info(params);
+    // the router server never loads a model and must not touch the GPU;
+    // skip device enumeration so that no CUDA primary context is created
+    const bool is_router_server = params.model.path.empty();
+    common_params_print_info(params, !is_router_server);
 
     // validate batch size for embeddings
     // embeddings require all tokens to be processed in a single ubatch
@@ -126,7 +129,6 @@ int main(int argc, char ** argv) {
     server_routes routes(params, ctx_server);
     server_tools tools;
 
-    bool is_router_server = params.model.path.empty();
     std::optional<server_models_routes> models_routes{};
     if (is_router_server) {
         // setup server instances manager

From b64739ea393b3c9d07cc9907e0a611f707838051 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Sat, 16 May 2026 23:42:16 +0200
Subject: [PATCH 3/3] server: (router) alloc tmp buffer on heap (#23159)

---
 tools/server/server-models.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 433d2d8f04e..6c6fed52d58 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -798,9 +798,10 @@ void server_models::load(const std::string & name) {
         std::thread log_thread([&]() {
             // read stdout/stderr and forward to main server log
             // also handle status report from child process
+            std::vector<char> vec_buf(128 * 1024); // large buffer for storing info
+            char * buffer = vec_buf.data();
             if (stdout_file) {
-                char buffer[128 * 1024]; // large buffer for storing info
-                while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
+                while (fgets(buffer, vec_buf.size(), stdout_file) != nullptr) {
                     LOG("[%5d] %s", port, buffer);
                     std::string str(buffer);
                     if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {