server: expose speculative decoding counters in Prometheus metrics

boxcee · boxcee · commit d7357254cb06 · 2026-05-19T10:47:34.000+02:00
Adds two new counters to the /metrics endpoint:
- llamacpp:spec_tokens_drafted_total
- llamacpp:spec_tokens_accepted_total

These are accumulated via server_metrics::on_prediction() using the
per-slot n_draft_total and n_draft_accepted fields already tracked
during speculative decoding. The acceptance rate can be derived as
llamacpp:spec_tokens_accepted_total / llamacpp:spec_tokens_drafted_total.
diff --git a/tools/server/README.md b/tools/server/README.md
@@ -1064,6 +1064,8 @@ In *router mode* the query param `?model={model_id}` has to be set. This endpoin
 | `llamacpp:n_tokens_max` | Counter | High watermark of the context size observed. |
 | `llamacpp:n_decode_total` | Counter | Total Number of llama_decode() calls. |
 | `llamacpp:n_busy_slots_per_decode` | Gauge | Average number of busy slots per llama_decode() call. |
+| `llamacpp:spec_tokens_drafted_total` | Counter | Number of speculative draft tokens generated. |
+| `llamacpp:spec_tokens_accepted_total` | Counter | Number of speculative draft tokens accepted. |
 
 ### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
@@ -564,6 +564,9 @@ struct server_metrics {
     uint64_t n_decode_total     = 0;
     uint64_t n_busy_slots_total = 0;
 
+    uint64_t n_spec_tokens_drafted_total  = 0;
+    uint64_t n_spec_tokens_accepted_total = 0;
+
     void init() {
         t_start = ggml_time_us();
     }
@@ -582,6 +585,9 @@ struct server_metrics {
         n_tokens_predicted         += slot.n_decoded;
         t_tokens_generation        += slot.t_token_generation;
         t_tokens_generation_total  += slot.t_token_generation;
+
+        n_spec_tokens_drafted_total  += slot.n_draft_total;
+        n_spec_tokens_accepted_total += slot.n_draft_accepted;
     }
 
     void on_decoded(const std::vector<server_slot> & slots) {
@@ -2001,6 +2007,9 @@ struct server_context_impl {
                     res->n_decode_total          = metrics.n_decode_total;
                     res->n_busy_slots_total      = metrics.n_busy_slots_total;
 
+                    res->n_spec_tokens_drafted_total  = metrics.n_spec_tokens_drafted_total;
+                    res->n_spec_tokens_accepted_total = metrics.n_spec_tokens_accepted_total;
+
                     if (task.metrics_reset_bucket) {
                         metrics.reset_bucket();
                     }
@@ -3713,6 +3722,14 @@ void server_routes::init_routes() {
                     {"name",  "n_tokens_max"},
                     {"help",  "Largest observed n_tokens."},
                     {"value",  res_task->n_tokens_max}
+            }, {
+                    {"name",  "spec_tokens_drafted_total"},
+                    {"help",  "Number of speculative draft tokens generated."},
+                    {"value",  (uint64_t) res_task->n_spec_tokens_drafted_total}
+            }, {
+                    {"name",  "spec_tokens_accepted_total"},
+                    {"help",  "Number of speculative draft tokens accepted."},
+                    {"value",  (uint64_t) res_task->n_spec_tokens_accepted_total}
             }}},
             {"gauge", {{
                     {"name",  "prompt_tokens_seconds"},
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
@@ -526,6 +526,9 @@ struct server_task_result_metrics : server_task_result {
     uint64_t n_decode_total     = 0;
     uint64_t n_busy_slots_total = 0;
 
+    uint64_t n_spec_tokens_drafted_total  = 0;
+    uint64_t n_spec_tokens_accepted_total = 0;
+
     // while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy
     // therefore, we use json to temporarily store the slot.to_json() result
     json slots_data = json::array();