Skip to content

Commit 935a340

Browse files
authored
server: implement /models?reload=1 (ggml-org#21848)
1 parent d8794ee commit 935a340

5 files changed

Lines changed: 313 additions & 88 deletions

File tree

tools/server/README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1646,7 +1646,11 @@ Listing all models in cache. The model metadata will also include a field to ind
16461646
}
16471647
```
16481648

1649-
Note: For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
1649+
Note:
1650+
1. For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
1651+
2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows:
1652+
- If a running model has been updated in or removed from the source, it will be unloaded
1653+
- If a model is not running, it will be added, updated, or removed according to the source
16501654

16511655
The `status` object can be:
16521656

tools/server/server-models.cpp

Lines changed: 247 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -243,9 +243,8 @@ void server_models::add_model(server_model_meta && meta) {
243243
};
244244
}
245245

246-
// TODO: allow refreshing cached model list
247246
void server_models::load_models() {
248-
// loading models from 3 sources:
247+
// Phase 1: load presets from all sources — pure I/O, no lock needed
249248
// 1. cached models
250249
common_presets cached_models = ctx_preset.load_from_cache();
251250
SRV_INF("Loaded %zu cached model presets\n", cached_models.size());
@@ -270,112 +269,266 @@ void server_models::load_models() {
270269

271270
// note: if a model exists in both cached and local, local takes precedence
272271
common_presets final_presets;
273-
for (const auto & [name, preset] : cached_models) {
274-
final_presets[name] = preset;
275-
}
276-
for (const auto & [name, preset] : local_models) {
277-
final_presets[name] = preset;
278-
}
279-
280-
// process custom presets from INI
272+
for (const auto & [name, preset] : cached_models) final_presets[name] = preset;
273+
for (const auto & [name, preset] : local_models) final_presets[name] = preset;
281274
for (const auto & [name, custom] : custom_presets) {
282275
if (final_presets.find(name) != final_presets.end()) {
283-
// apply custom config if exists
284-
common_preset & target = final_presets[name];
285-
target.merge(custom);
276+
final_presets[name].merge(custom);
286277
} else {
287-
// otherwise add directly
288278
final_presets[name] = custom;
289279
}
290280
}
291-
292-
// server base preset from CLI args take highest precedence
281+
// server base preset from CLI args takes highest precedence
293282
for (auto & [name, preset] : final_presets) {
294283
preset.merge(base_preset);
295284
}
296285

297-
// convert presets to server_model_meta and add to mapping
298-
for (const auto & preset : final_presets) {
299-
server_model_meta meta{
300-
/* preset */ preset.second,
301-
/* name */ preset.first,
302-
/* aliases */ {},
303-
/* tags */ {},
304-
/* port */ 0,
305-
/* status */ SERVER_MODEL_STATUS_UNLOADED,
306-
/* last_used */ 0,
307-
/* args */ std::vector<std::string>(),
308-
/* exit_code */ 0,
309-
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
310-
};
311-
add_model(std::move(meta));
312-
}
313-
314-
// log available models
315-
{
316-
std::unordered_set<std::string> custom_names;
317-
for (const auto & [name, preset] : custom_presets) {
318-
custom_names.insert(name);
319-
}
320-
auto join_set = [](const std::set<std::string> & s) {
321-
std::string result;
322-
for (const auto & v : s) {
323-
if (!result.empty()) {
324-
result += ", ";
325-
}
326-
result += v;
327-
}
328-
return result;
329-
};
330-
286+
// Helpers that read `mapping` — must be called while holding the lock.
287+
std::unordered_set<std::string> custom_names;
288+
for (const auto & [name, preset] : custom_presets) custom_names.insert(name);
289+
auto join_set = [](const std::set<std::string> & s) {
290+
std::string result;
291+
for (const auto & v : s) {
292+
if (!result.empty()) result += ", ";
293+
result += v;
294+
}
295+
return result;
296+
};
297+
auto log_available_models = [&]() {
331298
SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
332299
for (const auto & [name, inst] : mapping) {
333300
bool has_custom = custom_names.find(name) != custom_names.end();
334301
std::string info;
335-
if (!inst.meta.aliases.empty()) {
336-
info += " (aliases: " + join_set(inst.meta.aliases) + ")";
302+
if (!inst.meta.aliases.empty()) info += " (aliases: " + join_set(inst.meta.aliases) + ")";
303+
if (!inst.meta.tags.empty()) info += " [tags: " + join_set(inst.meta.tags) + "]";
304+
SRV_INF(" %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str());
305+
}
306+
};
307+
auto apply_stop_timeout = [&]() {
308+
for (auto & [name, inst] : mapping) {
309+
std::string val;
310+
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) {
311+
try {
312+
inst.meta.stop_timeout = std::stoi(val);
313+
} catch (...) {
314+
SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n",
315+
val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT);
316+
inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT;
317+
}
337318
}
338-
if (!inst.meta.tags.empty()) {
339-
info += " [tags: " + join_set(inst.meta.tags) + "]";
319+
}
320+
};
321+
// update_args() injects HOST/PORT/ALIAS, so strip them before comparing presets
322+
auto preset_options_for_compare = [](common_preset p) {
323+
p.unset_option("LLAMA_ARG_HOST");
324+
p.unset_option("LLAMA_ARG_PORT");
325+
p.unset_option("LLAMA_ARG_ALIAS");
326+
return p.options;
327+
};
328+
329+
// Phase 2: acquire the lock once for all mapping mutations.
330+
// We temporarily release it only when calling functions that acquire it internally
331+
// (unload, load) or when joining threads (the monitoring thread calls update_status
332+
// which locks the mutex, so joining while holding it would deadlock).
333+
std::unique_lock<std::mutex> lk(mutex);
334+
bool is_first_load = mapping.empty();
335+
336+
if (is_first_load) {
337+
// FIRST LOAD: add all models, then unlock for autoloading
338+
for (const auto & [name, preset] : final_presets) {
339+
server_model_meta meta{
340+
/* preset */ preset,
341+
/* name */ name,
342+
/* aliases */ {},
343+
/* tags */ {},
344+
/* port */ 0,
345+
/* status */ SERVER_MODEL_STATUS_UNLOADED,
346+
/* last_used */ 0,
347+
/* args */ std::vector<std::string>(),
348+
/* exit_code */ 0,
349+
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
350+
};
351+
add_model(std::move(meta));
352+
}
353+
apply_stop_timeout();
354+
log_available_models();
355+
356+
std::vector<std::string> models_to_load;
357+
for (const auto & [name, inst] : mapping) {
358+
std::string val;
359+
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && common_arg_utils::is_truthy(val)) {
360+
models_to_load.push_back(name);
340361
}
341-
SRV_INF(" %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str());
342362
}
343-
}
363+
if ((int)models_to_load.size() > base_params.models_max) {
364+
throw std::runtime_error(string_format(
365+
"number of models to load on startup (%zu) exceeds models_max (%d)",
366+
models_to_load.size(), base_params.models_max));
367+
}
344368

345-
// handle custom stop-timeout option
346-
for (auto & [name, inst] : mapping) {
347-
std::string val;
348-
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) {
349-
try {
350-
inst.meta.stop_timeout = std::stoi(val);
351-
} catch (...) {
352-
SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n",
353-
val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT);
354-
inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT;
369+
lk.unlock();
370+
for (const auto & name : models_to_load) {
371+
SRV_INF("(startup) loading model %s\n", name.c_str());
372+
load(name);
373+
}
374+
} else {
375+
// RELOAD: diff the new preset list against the current mapping and reconcile
376+
is_reloading = true;
377+
378+
// find running models whose source was removed or whose preset changed
379+
std::vector<std::string> to_unload;
380+
for (const auto & [name, inst] : mapping) {
381+
if (!inst.meta.is_running()) continue;
382+
auto it = final_presets.find(name);
383+
if (it == final_presets.end()) {
384+
to_unload.push_back(name); // removed from source
385+
} else if (preset_options_for_compare(inst.meta.preset) != preset_options_for_compare(it->second)) {
386+
to_unload.push_back(name); // preset changed
355387
}
356388
}
357-
}
358389

359-
// load any autoload models
360-
std::vector<std::string> models_to_load;
361-
for (const auto & [name, inst] : mapping) {
362-
std::string val;
363-
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) {
364-
if (common_arg_utils::is_truthy(val)) {
365-
models_to_load.push_back(name);
390+
// unload() acquires the lock internally, so release before each call
391+
for (const auto & name : to_unload) {
392+
SRV_INF("(reload) unloading model name=%s (source updated or removed)\n", name.c_str());
393+
lk.unlock();
394+
unload(name);
395+
lk.lock();
396+
}
397+
398+
// wait for all targeted models to reach UNLOADED; cv.wait handles unlock/relock
399+
cv.wait(lk, [&]() {
400+
for (const auto & name : to_unload) {
401+
auto it = mapping.find(name);
402+
if (it != mapping.end() && it->second.meta.is_running()) return false;
403+
}
404+
return true;
405+
});
406+
407+
// collect all threads to join in one pass while the lock is held:
408+
// - monitoring threads from just-unloaded models (to_unload)
409+
// - threads of already-UNLOADED models that are being removed from source
410+
std::vector<std::thread> threads_to_join;
411+
for (const auto & name : to_unload) {
412+
auto it = mapping.find(name);
413+
if (it != mapping.end() && it->second.th.joinable()) {
414+
threads_to_join.push_back(std::move(it->second.th));
366415
}
367416
}
368-
}
369-
if ((int)models_to_load.size() > base_params.models_max) {
370-
throw std::runtime_error(string_format(
371-
"number of models to load on startup (%zu) exceeds models_max (%d)",
372-
models_to_load.size(),
373-
base_params.models_max
374-
));
375-
}
376-
for (const auto & name : models_to_load) {
377-
SRV_INF("(startup) loading model %s\n", name.c_str());
378-
load(name);
417+
for (auto & [name, inst] : mapping) {
418+
if (final_presets.find(name) == final_presets.end() && !inst.meta.is_running() && inst.th.joinable()) {
419+
threads_to_join.push_back(std::move(inst.th));
420+
}
421+
}
422+
423+
// join outside the lock — monitoring thread calls update_status (needs lock)
424+
lk.unlock();
425+
for (auto & th : threads_to_join) th.join();
426+
lk.lock();
427+
428+
// erase models no longer in any source
429+
for (auto it = mapping.begin(); it != mapping.end(); ) {
430+
if (final_presets.find(it->first) == final_presets.end()) {
431+
SRV_INF("(reload) removing model name=%s (no longer in source)\n", it->first.c_str());
432+
GGML_ASSERT(!it->second.th.joinable()); // must have been joined above
433+
it = mapping.erase(it);
434+
} else {
435+
++it;
436+
}
437+
}
438+
439+
// update presets for non-running models still in source
440+
for (auto & [name, inst] : mapping) {
441+
if (inst.meta.is_running()) continue;
442+
auto it = final_presets.find(name);
443+
if (it == final_presets.end()) continue; // erased above
444+
445+
inst.meta.preset = it->second;
446+
447+
// re-parse aliases, then validate against other models
448+
std::set<std::string> new_aliases;
449+
std::string alias_str;
450+
if (inst.meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) {
451+
for (auto & alias : string_split<std::string>(alias_str, ',')) {
452+
alias = string_strip(alias);
453+
if (!alias.empty()) new_aliases.insert(alias);
454+
}
455+
}
456+
inst.meta.aliases.clear();
457+
for (const auto & alias : new_aliases) {
458+
bool conflict = false;
459+
for (const auto & [other_name, other_inst] : mapping) {
460+
if (other_name == name) continue;
461+
if (other_name == alias || other_inst.meta.aliases.count(alias)) {
462+
SRV_WRN("(reload) alias '%s' for model '%s' conflicts with model '%s', skipping\n",
463+
alias.c_str(), name.c_str(), other_name.c_str());
464+
conflict = true;
465+
break;
466+
}
467+
}
468+
if (!conflict) inst.meta.aliases.insert(alias);
469+
}
470+
471+
// re-parse tags
472+
inst.meta.tags.clear();
473+
std::string tags_str;
474+
if (inst.meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) {
475+
for (auto & tag : string_split<std::string>(tags_str, ',')) {
476+
tag = string_strip(tag);
477+
if (!tag.empty()) inst.meta.tags.insert(tag);
478+
}
479+
}
480+
481+
inst.meta.exit_code = 0; // clear failed state so the model can be reloaded
482+
inst.meta.update_args(ctx_preset, bin_path);
483+
}
484+
485+
// add models that are new in this reload
486+
std::vector<std::string> newly_added;
487+
for (const auto & [name, preset] : final_presets) {
488+
if (mapping.find(name) == mapping.end()) {
489+
server_model_meta meta{
490+
/* preset */ preset,
491+
/* name */ name,
492+
/* aliases */ {},
493+
/* tags */ {},
494+
/* port */ 0,
495+
/* status */ SERVER_MODEL_STATUS_UNLOADED,
496+
/* last_used */ 0,
497+
/* args */ std::vector<std::string>(),
498+
/* exit_code */ 0,
499+
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
500+
};
501+
add_model(std::move(meta));
502+
newly_added.push_back(name);
503+
}
504+
}
505+
506+
apply_stop_timeout();
507+
508+
// clear reload flag before unlocking for autoload — load() blocks on !is_reloading,
509+
// so clearing it here (while still locked) prevents a deadlock in the autoload calls below
510+
is_reloading = false;
511+
cv.notify_all();
512+
513+
log_available_models();
514+
515+
// collect autoload candidates while still under the lock
516+
std::vector<std::string> to_autoload;
517+
for (const auto & name : newly_added) {
518+
auto it = mapping.find(name);
519+
if (it != mapping.end()) {
520+
std::string val;
521+
if (it->second.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && common_arg_utils::is_truthy(val)) {
522+
to_autoload.push_back(name);
523+
}
524+
}
525+
}
526+
527+
lk.unlock();
528+
for (const auto & name : to_autoload) {
529+
SRV_INF("(reload) loading new model %s\n", name.c_str());
530+
load(name);
531+
}
379532
}
380533
}
381534

@@ -536,7 +689,10 @@ void server_models::load(const std::string & name) {
536689
}
537690
unload_lru();
538691

539-
std::lock_guard<std::mutex> lk(mutex);
692+
std::unique_lock<std::mutex> lk(mutex);
693+
// edge case: block until any in-progress reload has finished so we always load
694+
// against the freshest preset and a consistent mapping state
695+
cv.wait(lk, [this]() { return !is_reloading; });
540696

541697
auto meta = mapping[name].meta;
542698
if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
@@ -993,7 +1149,11 @@ void server_models_routes::init_routes() {
9931149
return res;
9941150
};
9951151

996-
this->get_router_models = [this](const server_http_req &) {
1152+
this->get_router_models = [this](const server_http_req & req) {
1153+
bool reload = !req.get_param("reload", "").empty();
1154+
if (reload) {
1155+
models.load_models();
1156+
}
9971157
auto res = std::make_unique<server_http_res>();
9981158
json models_json = json::array();
9991159
auto all_models = models.get_all_meta();

0 commit comments

Comments
 (0)