Skip to content

Commit 64b38b5

Browse files
server: skip device enumeration in router mode to avoid creating CUDA primary context (ggml-org#23137)
1 parent 6049906 commit 64b38b5

3 files changed

Lines changed: 16 additions & 10 deletions

File tree

common/common.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,7 @@ void common_init() {
373373
llama_log_set(common_log_default_callback, NULL);
374374
}
375375

376-
void common_params_print_info(const common_params & params) {
376+
void common_params_print_info(const common_params & params, bool print_devices) {
377377
#ifdef NDEBUG
378378
const char * build_type = "";
379379
#else
@@ -382,12 +382,16 @@ void common_params_print_info(const common_params & params) {
382382
LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
383383

384384
LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
385-
LOG_INF("device_info:\n");
386-
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
387-
auto * dev = ggml_backend_dev_get(i);
388-
size_t free, total;
389-
ggml_backend_dev_memory(dev, &free, &total);
390-
LOG_INF(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
385+
386+
// device enumeration creates a primary context on CUDA backends, skip it when the caller does not own any device
387+
if (print_devices) {
388+
LOG_INF("device_info:\n");
389+
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
390+
auto * dev = ggml_backend_dev_get(i);
391+
size_t free, total;
392+
ggml_backend_dev_memory(dev, &free, &total);
393+
LOG_INF(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
394+
}
391395
}
392396
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
393397
}

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ struct common_params {
708708
// initializes the logging system and prints info about the build
709709
void common_init();
710710

711-
void common_params_print_info(const common_params & params);
711+
void common_params_print_info(const common_params & params, bool print_devices = true);
712712
std::string common_params_get_system_info(const common_params & params);
713713

714714
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);

tools/server/server.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,10 @@ int main(int argc, char ** argv) {
8686
llama_backend_init();
8787
llama_numa_init(params.numa);
8888

89-
common_params_print_info(params);
89+
// router server never loads a model and must not touch the GPU
90+
// skip device enumeration so the CUDA primary context stays uncreated
91+
const bool is_router_server = params.model.path.empty();
92+
common_params_print_info(params, !is_router_server);
9093

9194
// validate batch size for embeddings
9295
// embeddings require all tokens to be processed in a single ubatch
@@ -126,7 +129,6 @@ int main(int argc, char ** argv) {
126129
server_routes routes(params, ctx_server);
127130
server_tools tools;
128131

129-
bool is_router_server = params.model.path.empty();
130132
std::optional<server_models_routes> models_routes{};
131133
if (is_router_server) {
132134
// setup server instances manager

0 commit comments

Comments
 (0)