@@ -243,9 +243,8 @@ void server_models::add_model(server_model_meta && meta) {
243243 };
244244}
245245
246- // TODO: allow refreshing cached model list
247246void server_models::load_models () {
248- // loading models from 3 sources:
247+ // Phase 1: load presets from all sources — pure I/O, no lock needed
249248 // 1. cached models
250249 common_presets cached_models = ctx_preset.load_from_cache ();
251250 SRV_INF (" Loaded %zu cached model presets\n " , cached_models.size ());
@@ -270,112 +269,266 @@ void server_models::load_models() {
270269
271270 // note: if a model exists in both cached and local, local takes precedence
272271 common_presets final_presets;
273- for (const auto & [name, preset] : cached_models) {
274- final_presets[name] = preset;
275- }
276- for (const auto & [name, preset] : local_models) {
277- final_presets[name] = preset;
278- }
279-
280- // process custom presets from INI
272+ for (const auto & [name, preset] : cached_models) final_presets[name] = preset;
273+ for (const auto & [name, preset] : local_models) final_presets[name] = preset;
281274 for (const auto & [name, custom] : custom_presets) {
282275 if (final_presets.find (name) != final_presets.end ()) {
283- // apply custom config if exists
284- common_preset & target = final_presets[name];
285- target.merge (custom);
276+ final_presets[name].merge (custom);
286277 } else {
287- // otherwise add directly
288278 final_presets[name] = custom;
289279 }
290280 }
291-
292- // server base preset from CLI args take highest precedence
281+ // server base preset from CLI args takes highest precedence
293282 for (auto & [name, preset] : final_presets) {
294283 preset.merge (base_preset);
295284 }
296285
297- // convert presets to server_model_meta and add to mapping
298- for (const auto & preset : final_presets) {
299- server_model_meta meta{
300- /* preset */ preset.second ,
301- /* name */ preset.first ,
302- /* aliases */ {},
303- /* tags */ {},
304- /* port */ 0 ,
305- /* status */ SERVER_MODEL_STATUS_UNLOADED ,
306- /* last_used */ 0 ,
307- /* args */ std::vector<std::string>(),
308- /* exit_code */ 0 ,
309- /* stop_timeout */ DEFAULT_STOP_TIMEOUT ,
310- };
311- add_model (std::move (meta));
312- }
313-
314- // log available models
315- {
316- std::unordered_set<std::string> custom_names;
317- for (const auto & [name, preset] : custom_presets) {
318- custom_names.insert (name);
319- }
320- auto join_set = [](const std::set<std::string> & s) {
321- std::string result;
322- for (const auto & v : s) {
323- if (!result.empty ()) {
324- result += " , " ;
325- }
326- result += v;
327- }
328- return result;
329- };
330-
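286+ // Helpers used in phase 2. log_available_models reads `mapping` and apply_stop_timeout writes it, so both must be called while holding the lock; the other helpers do not touch `mapping`.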
287+ std::unordered_set<std::string> custom_names;
288+ for (const auto & [name, preset] : custom_presets) custom_names.insert (name);
289+ auto join_set = [](const std::set<std::string> & s) {
290+ std::string result;
291+ for (const auto & v : s) {
292+ if (!result.empty ()) result += " , " ;
293+ result += v;
294+ }
295+ return result;
296+ };
297+ auto log_available_models = [&]() {
331298 SRV_INF (" Available models (%zu) (*: custom preset)\n " , mapping.size ());
332299 for (const auto & [name, inst] : mapping) {
333300 bool has_custom = custom_names.find (name) != custom_names.end ();
334301 std::string info;
335- if (!inst.meta .aliases .empty ()) {
336- info += " (aliases: " + join_set (inst.meta .aliases ) + " )" ;
302+ if (!inst.meta .aliases .empty ()) info += " (aliases: " + join_set (inst.meta .aliases ) + " )" ;
303+ if (!inst.meta .tags .empty ()) info += " [tags: " + join_set (inst.meta .tags ) + " ]" ;
304+ SRV_INF (" %c %s%s\n " , has_custom ? ' *' : ' ' , name.c_str (), info.c_str ());
305+ }
306+ };
307+ auto apply_stop_timeout = [&]() {
308+ for (auto & [name, inst] : mapping) {
309+ std::string val;
310+ if (inst.meta .preset .get_option (COMMON_ARG_PRESET_STOP_TIMEOUT , val)) {
311+ try {
312+ inst.meta .stop_timeout = std::stoi (val);
313+ } catch (...) {
314+ SRV_WRN (" invalid stop-timeout value '%s' for model '%s', using default %d seconds\n " ,
315+ val.c_str (), name.c_str (), DEFAULT_STOP_TIMEOUT );
316+ inst.meta .stop_timeout = DEFAULT_STOP_TIMEOUT ;
317+ }
337318 }
338- if (!inst.meta .tags .empty ()) {
339- info += " [tags: " + join_set (inst.meta .tags ) + " ]" ;
319+ }
320+ };
321+ // update_args() injects HOST/PORT/ALIAS, so strip them before comparing presets
322+ auto preset_options_for_compare = [](common_preset p) {
323+ p.unset_option (" LLAMA_ARG_HOST" );
324+ p.unset_option (" LLAMA_ARG_PORT" );
325+ p.unset_option (" LLAMA_ARG_ALIAS" );
326+ return p.options ;
327+ };
328+
329+ // Phase 2: acquire the lock once for all mapping mutations.
330+ // We temporarily release it only when calling functions that acquire it internally
331+ // (unload, load) or when joining threads (the monitoring thread calls update_status
332+ // which locks the mutex, so joining while holding it would deadlock).
333+ std::unique_lock<std::mutex> lk (mutex);
334+ bool is_first_load = mapping.empty ();
335+
336+ if (is_first_load) {
337+ // FIRST LOAD: add all models, then unlock for autoloading
338+ for (const auto & [name, preset] : final_presets) {
339+ server_model_meta meta{
340+ /* preset */ preset,
341+ /* name */ name,
342+ /* aliases */ {},
343+ /* tags */ {},
344+ /* port */ 0 ,
345+ /* status */ SERVER_MODEL_STATUS_UNLOADED ,
346+ /* last_used */ 0 ,
347+ /* args */ std::vector<std::string>(),
348+ /* exit_code */ 0 ,
349+ /* stop_timeout */ DEFAULT_STOP_TIMEOUT ,
350+ };
351+ add_model (std::move (meta));
352+ }
353+ apply_stop_timeout ();
354+ log_available_models ();
355+
356+ std::vector<std::string> models_to_load;
357+ for (const auto & [name, inst] : mapping) {
358+ std::string val;
359+ if (inst.meta .preset .get_option (COMMON_ARG_PRESET_LOAD_ON_STARTUP , val) && common_arg_utils::is_truthy (val)) {
360+ models_to_load.push_back (name);
340361 }
341- SRV_INF (" %c %s%s\n " , has_custom ? ' *' : ' ' , name.c_str (), info.c_str ());
342362 }
343- }
363+ if ((int )models_to_load.size () > base_params.models_max ) {
364+ throw std::runtime_error (string_format (
365+ " number of models to load on startup (%zu) exceeds models_max (%d)" ,
366+ models_to_load.size (), base_params.models_max ));
367+ }
344368
345- // handle custom stop-timeout option
346- for (auto & [name, inst] : mapping) {
347- std::string val;
348- if (inst.meta .preset .get_option (COMMON_ARG_PRESET_STOP_TIMEOUT , val)) {
349- try {
350- inst.meta .stop_timeout = std::stoi (val);
351- } catch (...) {
352- SRV_WRN (" invalid stop-timeout value '%s' for model '%s', using default %d seconds\n " ,
353- val.c_str (), name.c_str (), DEFAULT_STOP_TIMEOUT );
354- inst.meta .stop_timeout = DEFAULT_STOP_TIMEOUT ;
369+ lk.unlock ();
370+ for (const auto & name : models_to_load) {
371+ SRV_INF (" (startup) loading model %s\n " , name.c_str ());
372+ load (name);
373+ }
374+ } else {
375+ // RELOAD: diff the new preset list against the current mapping and reconcile
376+ is_reloading = true ;
377+
378+ // find running models whose source was removed or whose preset changed
379+ std::vector<std::string> to_unload;
380+ for (const auto & [name, inst] : mapping) {
381+ if (!inst.meta .is_running ()) continue ;
382+ auto it = final_presets.find (name);
383+ if (it == final_presets.end ()) {
384+ to_unload.push_back (name); // removed from source
385+ } else if (preset_options_for_compare (inst.meta .preset ) != preset_options_for_compare (it->second )) {
386+ to_unload.push_back (name); // preset changed
355387 }
356388 }
357- }
358389
359- // load any autoload models
360- std::vector<std::string> models_to_load;
361- for (const auto & [name, inst] : mapping) {
362- std::string val;
363- if (inst.meta .preset .get_option (COMMON_ARG_PRESET_LOAD_ON_STARTUP , val)) {
364- if (common_arg_utils::is_truthy (val)) {
365- models_to_load.push_back (name);
390+ // unload() acquires the lock internally, so release before each call
391+ for (const auto & name : to_unload) {
392+ SRV_INF (" (reload) unloading model name=%s (source updated or removed)\n " , name.c_str ());
393+ lk.unlock ();
394+ unload (name);
395+ lk.lock ();
396+ }
397+
398+ // wait for all targeted models to reach UNLOADED; cv.wait handles unlock/relock
399+ cv.wait (lk, [&]() {
400+ for (const auto & name : to_unload) {
401+ auto it = mapping.find (name);
402+ if (it != mapping.end () && it->second .meta .is_running ()) return false ;
403+ }
404+ return true ;
405+ });
406+
407+ // collect all threads to join in one pass while the lock is held:
408+ // - monitoring threads from just-unloaded models (to_unload)
409+ // - threads of already-UNLOADED models that are being removed from source
410+ std::vector<std::thread> threads_to_join;
411+ for (const auto & name : to_unload) {
412+ auto it = mapping.find (name);
413+ if (it != mapping.end () && it->second .th .joinable ()) {
414+ threads_to_join.push_back (std::move (it->second .th ));
366415 }
367416 }
368- }
369- if ((int )models_to_load.size () > base_params.models_max ) {
370- throw std::runtime_error (string_format (
371- " number of models to load on startup (%zu) exceeds models_max (%d)" ,
372- models_to_load.size (),
373- base_params.models_max
374- ));
375- }
376- for (const auto & name : models_to_load) {
377- SRV_INF (" (startup) loading model %s\n " , name.c_str ());
378- load (name);
417+ for (auto & [name, inst] : mapping) {
418+ if (final_presets.find (name) == final_presets.end () && !inst.meta .is_running () && inst.th .joinable ()) {
419+ threads_to_join.push_back (std::move (inst.th ));
420+ }
421+ }
422+
423+ // join outside the lock — monitoring thread calls update_status (needs lock)
424+ lk.unlock ();
425+ for (auto & th : threads_to_join) th.join ();
426+ lk.lock ();
427+
428+ // erase models no longer in any source
429+ for (auto it = mapping.begin (); it != mapping.end (); ) {
430+ if (final_presets.find (it->first ) == final_presets.end ()) {
431+ SRV_INF (" (reload) removing model name=%s (no longer in source)\n " , it->first .c_str ());
432+ GGML_ASSERT (!it->second .th .joinable ()); // must have been joined above
433+ it = mapping.erase (it);
434+ } else {
435+ ++it;
436+ }
437+ }
438+
439+ // update presets for non-running models still in source
440+ for (auto & [name, inst] : mapping) {
441+ if (inst.meta .is_running ()) continue ;
442+ auto it = final_presets.find (name);
443+ if (it == final_presets.end ()) continue ; // erased above
444+
445+ inst.meta .preset = it->second ;
446+
447+ // re-parse aliases, then validate against other models
448+ std::set<std::string> new_aliases;
449+ std::string alias_str;
450+ if (inst.meta .preset .get_option (" LLAMA_ARG_ALIAS" , alias_str) && !alias_str.empty ()) {
451+ for (auto & alias : string_split<std::string>(alias_str, ' ,' )) {
452+ alias = string_strip (alias);
453+ if (!alias.empty ()) new_aliases.insert (alias);
454+ }
455+ }
456+ inst.meta .aliases .clear ();
457+ for (const auto & alias : new_aliases) {
458+ bool conflict = false ;
459+ for (const auto & [other_name, other_inst] : mapping) {
460+ if (other_name == name) continue ;
461+ if (other_name == alias || other_inst.meta .aliases .count (alias)) {
462+ SRV_WRN (" (reload) alias '%s' for model '%s' conflicts with model '%s', skipping\n " ,
463+ alias.c_str (), name.c_str (), other_name.c_str ());
464+ conflict = true ;
465+ break ;
466+ }
467+ }
468+ if (!conflict) inst.meta .aliases .insert (alias);
469+ }
470+
471+ // re-parse tags
472+ inst.meta .tags .clear ();
473+ std::string tags_str;
474+ if (inst.meta .preset .get_option (" LLAMA_ARG_TAGS" , tags_str) && !tags_str.empty ()) {
475+ for (auto & tag : string_split<std::string>(tags_str, ' ,' )) {
476+ tag = string_strip (tag);
477+ if (!tag.empty ()) inst.meta .tags .insert (tag);
478+ }
479+ }
480+
481+ inst.meta .exit_code = 0 ; // clear failed state so the model can be reloaded
482+ inst.meta .update_args (ctx_preset, bin_path);
483+ }
484+
485+ // add models that are new in this reload
486+ std::vector<std::string> newly_added;
487+ for (const auto & [name, preset] : final_presets) {
488+ if (mapping.find (name) == mapping.end ()) {
489+ server_model_meta meta{
490+ /* preset */ preset,
491+ /* name */ name,
492+ /* aliases */ {},
493+ /* tags */ {},
494+ /* port */ 0 ,
495+ /* status */ SERVER_MODEL_STATUS_UNLOADED ,
496+ /* last_used */ 0 ,
497+ /* args */ std::vector<std::string>(),
498+ /* exit_code */ 0 ,
499+ /* stop_timeout */ DEFAULT_STOP_TIMEOUT ,
500+ };
501+ add_model (std::move (meta));
502+ newly_added.push_back (name);
503+ }
504+ }
505+
506+ apply_stop_timeout ();
507+
508+ // clear reload flag before unlocking for autoload — load() blocks on !is_reloading,
509+ // so clearing it here (while still locked) prevents a deadlock in the autoload calls below
510+ is_reloading = false ;
511+ cv.notify_all ();
512+
513+ log_available_models ();
514+
515+ // collect autoload candidates while still under the lock
516+ std::vector<std::string> to_autoload;
517+ for (const auto & name : newly_added) {
518+ auto it = mapping.find (name);
519+ if (it != mapping.end ()) {
520+ std::string val;
521+ if (it->second .meta .preset .get_option (COMMON_ARG_PRESET_LOAD_ON_STARTUP , val) && common_arg_utils::is_truthy (val)) {
522+ to_autoload.push_back (name);
523+ }
524+ }
525+ }
526+
527+ lk.unlock ();
528+ for (const auto & name : to_autoload) {
529+ SRV_INF (" (reload) loading new model %s\n " , name.c_str ());
530+ load (name);
531+ }
379532 }
380533}
381534
@@ -536,7 +689,10 @@ void server_models::load(const std::string & name) {
536689 }
537690 unload_lru ();
538691
539- std::lock_guard<std::mutex> lk (mutex);
692+ std::unique_lock<std::mutex> lk (mutex);
693+ // edge case: block until any in-progress reload has finished so we always load
694+ // against the freshest preset and a consistent mapping state
695+ cv.wait (lk, [this ]() { return !is_reloading; });
540696
541697 auto meta = mapping[name].meta ;
542698 if (meta.status != SERVER_MODEL_STATUS_UNLOADED ) {
@@ -993,7 +1149,11 @@ void server_models_routes::init_routes() {
9931149 return res;
9941150 };
9951151
996- this ->get_router_models = [this ](const server_http_req &) {
1152+ this ->get_router_models = [this ](const server_http_req & req) {
1153+ bool reload = !req.get_param (" reload" , " " ).empty ();
1154+ if (reload) {
1155+ models.load_models ();
1156+ }
9971157 auto res = std::make_unique<server_http_res>();
9981158 json models_json = json::array ();
9991159 auto all_models = models.get_all_meta ();
0 commit comments