@@ -833,6 +833,8 @@ struct server_context_impl {
833833
834834 bool sleeping = false ;
835835
836+ int64_t t_last_load_progress_ms = 0 ;
837+
836838 void destroy () {
837839 spec.reset ();
838840 ctx_dft.reset ();
@@ -863,6 +865,30 @@ struct server_context_impl {
863865 sleeping = new_state;
864866 }
865867
// Model-load progress hook: forwards throttled progress samples to the
// registered state callback.
//
// progress  - load progress as reported by the loader; a value >= 1.0 is
//             treated as the final sample.
// user_data - must point to the owning server_context_impl (asserted non-null).
//
// Always returns true.
// NOTE(review): presumably a true return tells the loader to keep going rather
// than abort - confirm against the loader's progress-callback contract.
868+ static bool load_progress_callback (float progress, void * user_data) {
869+ auto * ctx = static_cast <server_context_impl *>(user_data);
870+ GGML_ASSERT (ctx);
871+ // always emit the first and final sample; throttle the rest to one per 200ms
872+ {
873+ auto & t_last = ctx->t_last_load_progress_ms ;
874+ const int64_t t_now = ggml_time_ms ();
// t_last == 0 doubles as the "nothing emitted yet" marker; load_model()
// resets it to 0 before attaching this callback
875+ const bool first = t_last == 0 ;
876+ const bool done = progress >= 1 .0f ;
877+ const bool throttled = !first && !done && (t_now - t_last) < 200 ;
878+ if (throttled) {
// skip this sample without touching the timestamp
879+ return true ;
880+ }
// the timestamp advances even when no state callback is registered
881+ t_last = t_now;
882+ }
// report the sample as a LOADING state event carrying a stage/value payload
883+ if (ctx->callback_state ) {
884+ ctx->callback_state (SERVER_STATE_LOADING , {
885+ {" stage" , " text_model" },
886+ {" value" , progress},
887+ });
888+ }
889+ return true ;
890+ }
891+
866892 // load the model and initialize llama_context
867893 // this may also be called to resume from sleeping state
868894 bool load_model (common_params & params) {
@@ -916,6 +942,10 @@ struct server_context_impl {
916942
917943 // optionally reserve VRAM for the draft / MTP context before fitting the target model
918944 if (params_base.fit_params ) {
945+ if (callback_state) {
946+ callback_state (SERVER_STATE_LOADING , {{" stage" , " fit_params" }});
947+ }
948+
919949 const bool spec_mtp = std::find (params_base.speculative .types .begin (),
920950 params_base.speculative .types .end (),
921951 COMMON_SPECULATIVE_TYPE_DRAFT_MTP ) != params_base.speculative .types .end ();
@@ -991,6 +1021,13 @@ struct server_context_impl {
9911021 }
9921022 }
9931023
1024+ // attach a progress callback
1025+ {
1026+ t_last_load_progress_ms = 0 ;
1027+ params_base.load_progress_callback = load_progress_callback;
1028+ params_base.load_progress_callback_user_data = this ;
1029+ }
1030+
9941031 llama_init = common_init_from_params (params_base);
9951032
9961033 model_tgt = llama_init->model ();
@@ -1008,6 +1045,10 @@ struct server_context_impl {
10081045 add_bos_token = llama_vocab_get_add_bos (vocab);
10091046
10101047 if (params_base.speculative .has_dft ()) {
1048+ if (callback_state) {
1049+ callback_state (SERVER_STATE_LOADING , {{" stage" , " spec_model" }});
1050+ }
1051+
10111052 // TODO speculative: move to common/speculative.cpp?
10121053 const auto & params_spec = params_base.speculative .draft ;
10131054
@@ -1079,6 +1120,10 @@ struct server_context_impl {
10791120 }
10801121
10811122 if (has_mmproj) {
1123+ if (callback_state) {
1124+ callback_state (SERVER_STATE_LOADING , {{" stage" , " mmproj_model" }});
1125+ }
1126+
10821127 if (!is_resume) {
10831128 mtmd_helper_log_set (common_log_default_callback, nullptr );
10841129 }
@@ -1259,6 +1304,10 @@ struct server_context_impl {
12591304 return init ();
12601305 }
12611306
1307+ if (callback_state) {
1308+ callback_state (SERVER_STATE_READY , {});
1309+ }
1310+
12621311 return true ;
12631312 }
12641313
@@ -1335,6 +1384,9 @@ struct server_context_impl {
13351384 const bool enable_thinking = params_base.enable_reasoning != 0 && template_supports_thinking;
13361385 SRV_INF (" %s: chat template, thinking = %d\n " , __func__, enable_thinking);
13371386
1387+ // IMPORTANT: chat_params is reused across sleeping / resuming states.
1388+ // Never store llama_context/llama_model pointers in chat_params,
1389+ // as they may be invalidated after sleeping.
13381390 chat_params = {
13391391 /* use_jinja */ params_base.use_jinja ,
13401392 /* prefill_assistant */ params_base.prefill_assistant ,
@@ -3734,7 +3786,10 @@ struct server_res_generator : server_http_res {
// Registers the server state callback and hooks it up to sleep notifications.
//
// The callback is moved into impl->callback_state, then a handler is installed
// on the task queue via on_sleeping_state(). The handler captures `this`.
// NOTE(review): that capture assumes this server_context outlives the
// handler registration - confirm.
37343786void server_context::set_state_callback (server_state_callback_t callback) {
37353787 impl->callback_state = std::move (callback);
37363788 impl->queue_tasks .on_sleeping_state ([this ](bool sleeping) {
3737- impl->callback_state (sleeping ? SERVER_STATE_SLEEPING : SERVER_STATE_READY , {});
// only the transition into sleep is reported from here, with an empty payload
// NOTE(review): this call is not guarded by a check on callback_state, unlike
// the call sites in load_model() - confirm the callback is never empty here
3789+ if (sleeping) {
3790+ impl->callback_state (SERVER_STATE_SLEEPING , {});
3791+ }
3792+ // for sleeping == false, event is emitted by load_model()
37383793 });
37393794}
37403795
0 commit comments