Skip to content

Commit d735725

Browse files
committed
server: expose speculative decoding counters in Prometheus metrics
Adds two new counters to the /metrics endpoint: `llamacpp:spec_tokens_drafted_total` and `llamacpp:spec_tokens_accepted_total`. These are accumulated via server_metrics::on_prediction() using the per-slot n_draft_total and n_draft_accepted fields already tracked during speculative decoding. The acceptance rate can be derived as spec_tokens_accepted_total / spec_tokens_drafted_total.
1 parent fcae601 commit d735725

3 files changed

Lines changed: 22 additions & 0 deletions

File tree

tools/server/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,6 +1064,8 @@ In *router mode* the query param `?model={model_id}` has to be set. This endpoin
10641064
| `llamacpp:n_tokens_max` | Counter | High watermark of the context size observed. |
10651065
| `llamacpp:n_decode_total` | Counter | Total number of llama_decode() calls. |
10661066
| `llamacpp:n_busy_slots_per_decode` | Gauge | Average number of busy slots per llama_decode() call. |
1067+
| `llamacpp:spec_tokens_drafted_total` | Counter | Number of speculative draft tokens generated. |
1068+
| `llamacpp:spec_tokens_accepted_total` | Counter | Number of speculative draft tokens accepted. |
10671069

10681070
### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
10691071

tools/server/server-context.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,9 @@ struct server_metrics {
564564
uint64_t n_decode_total = 0;
565565
uint64_t n_busy_slots_total = 0;
566566

567+
uint64_t n_spec_tokens_drafted_total = 0;
568+
uint64_t n_spec_tokens_accepted_total = 0;
569+
567570
void init() {
568571
t_start = ggml_time_us();
569572
}
@@ -582,6 +585,9 @@ struct server_metrics {
582585
n_tokens_predicted += slot.n_decoded;
583586
t_tokens_generation += slot.t_token_generation;
584587
t_tokens_generation_total += slot.t_token_generation;
588+
589+
n_spec_tokens_drafted_total += slot.n_draft_total;
590+
n_spec_tokens_accepted_total += slot.n_draft_accepted;
585591
}
586592

587593
void on_decoded(const std::vector<server_slot> & slots) {
@@ -2001,6 +2007,9 @@ struct server_context_impl {
20012007
res->n_decode_total = metrics.n_decode_total;
20022008
res->n_busy_slots_total = metrics.n_busy_slots_total;
20032009

2010+
res->n_spec_tokens_drafted_total = metrics.n_spec_tokens_drafted_total;
2011+
res->n_spec_tokens_accepted_total = metrics.n_spec_tokens_accepted_total;
2012+
20042013
if (task.metrics_reset_bucket) {
20052014
metrics.reset_bucket();
20062015
}
@@ -3713,6 +3722,14 @@ void server_routes::init_routes() {
37133722
{"name", "n_tokens_max"},
37143723
{"help", "Largest observed n_tokens."},
37153724
{"value", res_task->n_tokens_max}
3725+
}, {
3726+
{"name", "spec_tokens_drafted_total"},
3727+
{"help", "Number of speculative draft tokens generated."},
3728+
{"value", (uint64_t) res_task->n_spec_tokens_drafted_total}
3729+
}, {
3730+
{"name", "spec_tokens_accepted_total"},
3731+
{"help", "Number of speculative draft tokens accepted."},
3732+
{"value", (uint64_t) res_task->n_spec_tokens_accepted_total}
37163733
}}},
37173734
{"gauge", {{
37183735
{"name", "prompt_tokens_seconds"},

tools/server/server-task.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,9 @@ struct server_task_result_metrics : server_task_result {
526526
uint64_t n_decode_total = 0;
527527
uint64_t n_busy_slots_total = 0;
528528

529+
uint64_t n_spec_tokens_drafted_total = 0;
530+
uint64_t n_spec_tokens_accepted_total = 0;
531+
529532
// while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy
530533
// therefore, we use json to temporarily store the slot.to_json() result
531534
json slots_data = json::array();

0 commit comments

Comments
 (0)