Skip to content

Commit 58e7106

Browse files
akx, ggerganov
authored and committed
server : add last-5-seconds generation speed display (ggml-org#24291)
* server : add last-5-seconds generation speed display
* cont : clean-up

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent e47ca54 commit 58e7106

1 file changed

Lines changed: 16 additions & 11 deletions

File tree

tools/server/server-context.cpp

Lines changed: 16 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -189,9 +189,10 @@ struct server_slot {
189189
// stats
190190
size_t n_sent_text = 0; // number of sent text character
191191

192-
int64_t t_print_last = 0;
193192
int64_t t_start_process_prompt;
194193
int64_t t_start_generation;
194+
int64_t t_print_last = 0;
195+
int32_t n_decoded_last = 0;
195196

196197
double t_prompt_processing = 0.0; // ms
197198
double t_token_generation = 0.0; // ms
@@ -470,11 +471,13 @@ struct server_slot {
470471
return;
471472
}
472473

473-
t_print_last = t_now;
474+
const double n_gen_second = 1e3 / (t_token_generation) * (n_decoded);
475+
const double n_gen_second_win = 1e6 / (t_now - t_print_last) * (n_decoded - n_decoded_last);
474476

475-
const double n_gen_second = 1e3 / t_token_generation * n_decoded;
477+
t_print_last = t_now;
478+
n_decoded_last = n_decoded;
476479

477-
SLT_INF(*this, "n_decoded = %6d, tg = %6.2f t/s\n", n_decoded, n_gen_second);
480+
SLT_INF(*this, "n_decoded = %6d, tg = %6.2f t/s, tg_3s = %6.2f t/s\n", n_decoded, n_gen_second, n_gen_second_win);
478481
}
479482

480483
void print_timings_pp() const {
@@ -3038,8 +3041,8 @@ struct server_context_impl {
30383041
}
30393042
}
30403043

3041-
const int64_t t_current = ggml_time_us();
3042-
slot.t_prompt_processing = (t_current - slot.t_start_process_prompt) / 1e3;
3044+
const int64_t t_now = ggml_time_us();
3045+
slot.t_prompt_processing = (t_now - slot.t_start_process_prompt) / 1e3;
30433046
slot.print_timings_pp();
30443047

30453048
// truncate any tokens that are beyond n_past for this slot
@@ -3447,17 +3450,19 @@ struct server_context_impl {
34473450
common_sampler_accept(slot.smpl.get(), id, true);
34483451

34493452
// here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
3450-
const int64_t t_current = ggml_time_us();
3453+
const int64_t t_now = ggml_time_us();
34513454

34523455
slot.n_decoded += 1;
34533456

34543457
if (slot.n_decoded == 1) {
3455-
slot.t_start_generation = t_current;
3458+
slot.t_start_generation = t_now;
3459+
slot.t_print_last = t_now;
3460+
slot.n_decoded_last = 0;
34563461
slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
34573462
metrics.on_prompt_eval(slot);
34583463
}
34593464

3460-
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
3465+
slot.t_token_generation = std::max<int64_t>(1, t_now - slot.t_start_generation) / 1e3;
34613466

34623467
completion_token_output result;
34633468
result.tok = id;
@@ -3551,11 +3556,11 @@ struct server_context_impl {
35513556
slot.spec_draft = std::move(accepted);
35523557
}
35533558

3554-
const int64_t t_current = ggml_time_us();
3559+
const int64_t t_now = ggml_time_us();
35553560

35563561
const auto ids = std::move(slot.spec_draft);
35573562

3558-
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
3563+
slot.t_token_generation = std::max<int64_t>(1, t_now - slot.t_start_generation) / 1e3;
35593564

35603565
// update how many tokens out of those tested were accepted
35613566
slot.n_draft_accepted += ids.size() - 1;

0 commit comments

Comments
 (0)