@@ -189,9 +189,10 @@ struct server_slot {
189189 // stats
190190 size_t n_sent_text = 0 ; // number of sent text character
191191
192- int64_t t_print_last = 0 ;
193192 int64_t t_start_process_prompt;
194193 int64_t t_start_generation;
194+ int64_t t_print_last = 0 ;
195+ int32_t n_decoded_last = 0 ;
195196
196197 double t_prompt_processing = 0.0 ; // ms
197198 double t_token_generation = 0.0 ; // ms
@@ -470,11 +471,13 @@ struct server_slot {
470471 return ;
471472 }
472473
473- t_print_last = t_now;
474+ const double n_gen_second = 1e3 / (t_token_generation) * (n_decoded);
475+ const double n_gen_second_win = 1e6 / (t_now - t_print_last) * (n_decoded - n_decoded_last);
474476
475- const double n_gen_second = 1e3 / t_token_generation * n_decoded;
477+ t_print_last = t_now;
478+ n_decoded_last = n_decoded;
476479
477- SLT_INF (*this , " n_decoded = %6d, tg = %6.2f t/s\n " , n_decoded, n_gen_second);
480+ SLT_INF (*this , " n_decoded = %6d, tg = %6.2f t/s, tg_3s = %6.2f t/s \n " , n_decoded, n_gen_second, n_gen_second_win );
478481 }
479482
480483 void print_timings_pp () const {
@@ -3038,8 +3041,8 @@ struct server_context_impl {
30383041 }
30393042 }
30403043
3041- const int64_t t_current = ggml_time_us ();
3042- slot.t_prompt_processing = (t_current - slot.t_start_process_prompt ) / 1e3 ;
3044+ const int64_t t_now = ggml_time_us ();
3045+ slot.t_prompt_processing = (t_now - slot.t_start_process_prompt ) / 1e3 ;
30433046 slot.print_timings_pp ();
30443047
30453048 // truncate any tokens that are beyond n_past for this slot
@@ -3447,17 +3450,19 @@ struct server_context_impl {
34473450 common_sampler_accept (slot.smpl .get (), id, true );
34483451
34493452 // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
3450- const int64_t t_current = ggml_time_us ();
3453+ const int64_t t_now = ggml_time_us ();
34513454
34523455 slot.n_decoded += 1 ;
34533456
34543457 if (slot.n_decoded == 1 ) {
3455- slot.t_start_generation = t_current;
3458+ slot.t_start_generation = t_now;
3459+ slot.t_print_last = t_now;
3460+ slot.n_decoded_last = 0 ;
34563461 slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt ) / 1e3 ;
34573462 metrics.on_prompt_eval (slot);
34583463 }
34593464
3460- slot.t_token_generation = std::max<int64_t >(1 , t_current - slot.t_start_generation ) / 1e3 ;
3465+ slot.t_token_generation = std::max<int64_t >(1 , t_now - slot.t_start_generation ) / 1e3 ;
34613466
34623467 completion_token_output result;
34633468 result.tok = id;
@@ -3551,11 +3556,11 @@ struct server_context_impl {
35513556 slot.spec_draft = std::move (accepted);
35523557 }
35533558
3554- const int64_t t_current = ggml_time_us ();
3559+ const int64_t t_now = ggml_time_us ();
35553560
35563561 const auto ids = std::move (slot.spec_draft );
35573562
3558- slot.t_token_generation = std::max<int64_t >(1 , t_current - slot.t_start_generation ) / 1e3 ;
3563+ slot.t_token_generation = std::max<int64_t >(1 , t_now - slot.t_start_generation ) / 1e3 ;
35593564
35603565 // update how many tokens out of those tested were accepted
35613566 slot.n_draft_accepted += ids.size () - 1 ;
0 commit comments