server : add last-5-seconds generation speed display (ggml-org#24291)

akx · ggerganov · papamoose · commit 58e7106391a8 · 2026-06-26T20:43:19.000-06:00
* server : add last-5-seconds generation speed display

* cont : clean-up

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
@@ -189,9 +189,10 @@ struct server_slot {
     // stats
     size_t n_sent_text = 0; // number of sent text character
 
-    int64_t t_print_last = 0;
     int64_t t_start_process_prompt;
     int64_t t_start_generation;
+    int64_t t_print_last = 0;
+    int32_t n_decoded_last = 0;
 
     double t_prompt_processing = 0.0; // ms
     double t_token_generation = 0.0;  // ms
@@ -470,11 +471,13 @@ struct server_slot {
             return;
         }
 
-        t_print_last = t_now;
+        const double n_gen_second     = 1e3 / (t_token_generation)   * (n_decoded);
+        const double n_gen_second_win = 1e6 / (t_now - t_print_last) * (n_decoded - n_decoded_last);
 
-        const double n_gen_second = 1e3 / t_token_generation * n_decoded;
+        t_print_last = t_now;
+        n_decoded_last = n_decoded;
 
-        SLT_INF(*this, "n_decoded = %6d, tg = %6.2f t/s\n", n_decoded, n_gen_second);
+        SLT_INF(*this, "n_decoded = %6d, tg = %6.2f t/s, tg_3s = %6.2f t/s\n", n_decoded, n_gen_second, n_gen_second_win);
     }
 
     void print_timings_pp() const {
@@ -3038,8 +3041,8 @@ struct server_context_impl {
                         }
                     }
 
-                    const int64_t t_current = ggml_time_us();
-                    slot.t_prompt_processing = (t_current - slot.t_start_process_prompt) / 1e3;
+                    const int64_t t_now = ggml_time_us();
+                    slot.t_prompt_processing = (t_now - slot.t_start_process_prompt) / 1e3;
                     slot.print_timings_pp();
 
                     // truncate any tokens that are beyond n_past for this slot
@@ -3447,17 +3450,19 @@ struct server_context_impl {
                 common_sampler_accept(slot.smpl.get(), id, true);
 
                 // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
-                const int64_t t_current = ggml_time_us();
+                const int64_t t_now = ggml_time_us();
 
                 slot.n_decoded += 1;
 
                 if (slot.n_decoded == 1) {
-                    slot.t_start_generation = t_current;
+                    slot.t_start_generation = t_now;
+                    slot.t_print_last = t_now;
+                    slot.n_decoded_last = 0;
                     slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
                     metrics.on_prompt_eval(slot);
                 }
 
-                slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
+                slot.t_token_generation = std::max<int64_t>(1, t_now - slot.t_start_generation) / 1e3;
 
                 completion_token_output result;
                 result.tok          = id;
@@ -3551,11 +3556,11 @@ struct server_context_impl {
                     slot.spec_draft = std::move(accepted);
                 }
 
-                const int64_t t_current = ggml_time_us();
+                const int64_t t_now = ggml_time_us();
 
                 const auto ids = std::move(slot.spec_draft);
 
-                slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
+                slot.t_token_generation = std::max<int64_t>(1, t_now - slot.t_start_generation) / 1e3;
 
                 // update how many tokens out of those tested were accepted
                 slot.n_draft_accepted += ids.size() - 1;