Skip to content

Commit d9f3278

Browse files
authored
examples/llm_server: log per-turn serving metrics (#20455)
Add lightweight turn-level observability to the example LLM server harness. The C++ worker now reports warm-resume accounting, prefill/decode duration, total worker time, and token rates in the terminal JSONL done message. The Python WorkerClient and SessionRuntime propagate those fields, and ServingChat emits one structured INFO log line per completed turn. This is intended for local-agent demos and evals where we need to understand whether warm resume is firing and where time is spent, without changing the OpenAI-compatible response payload.
1 parent aada6d7 commit d9f3278

6 files changed

Lines changed: 90 additions & 0 deletions

File tree

examples/llm_server/cpp/worker_loop.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545
// "reused_prompt_tokens": int, "prefilled_prompt_tokens": int,
4646
// "session_reset_reason": str
4747
// (new|exact_prefix|mismatch|dirty|equal),
48+
// "prefill_ms": float, "decode_ms": float, "total_ms": float,
49+
// "prefill_tok_s": float, "decode_tok_s": float,
4850
// "generated_token_ids"?: [int,...]} // omitted if stop-trimmed
4951
// open/close/reset: {"opened"|"closed"|"reset": true, "session_id": str}
5052
// error: {"error": str, "code"?: str} // capacity_exhausted |
@@ -59,6 +61,7 @@
5961
#include <pytorch/tokenizers/tokenizer.h>
6062

6163
#include <algorithm>
64+
#include <chrono>
6265
#include <cstdint>
6366
#include <iostream>
6467
#include <iterator>
@@ -108,6 +111,7 @@ inline void worker_handle_request(
108111
const std::unordered_map<std::string, int64_t>& metadata,
109112
const nlohmann::json& req,
110113
const std::vector<uint64_t>& prompt_prefix_ids = {}) {
114+
const auto request_start = std::chrono::steady_clock::now();
111115
LLMSession& session = *st.session;
112116
int64_t max_new = req.value("max_new_tokens", static_cast<int64_t>(-1));
113117
const float temperature = req.value("temperature", 0.0f);
@@ -203,6 +207,7 @@ inline void worker_handle_request(
203207

204208
SamplingConfig sampling;
205209
sampling.temperature = temperature;
210+
const auto prefill_start = std::chrono::steady_clock::now();
206211
if (session.prefill_tokens(to_prefill, &sampling) !=
207212
::executorch::runtime::Error::Ok) {
208213
st.dirty = true; // state may be partially mutated; force a reset next time
@@ -212,6 +217,7 @@ inline void worker_handle_request(
212217
// suffix, or the whole prompt). Keep the invariant
213218
// resident.size()==position().
214219
st.resident_token_ids = ids;
220+
const auto decode_start = std::chrono::steady_clock::now();
215221

216222
std::string buf; // bytes not yet forming a complete UTF-8 prefix
217223
std::string pending; // complete-UTF-8 text held back for stop-string matching
@@ -299,6 +305,25 @@ inline void worker_handle_request(
299305
st.resident_token_ids.end() - num_generated,
300306
st.resident_token_ids.end());
301307
}
308+
const auto request_end = std::chrono::steady_clock::now();
309+
const double prefill_ms =
310+
std::chrono::duration<double, std::milli>(decode_start - prefill_start)
311+
.count();
312+
const double decode_ms =
313+
std::chrono::duration<double, std::milli>(request_end - decode_start)
314+
.count();
315+
const double total_ms =
316+
std::chrono::duration<double, std::milli>(request_end - request_start)
317+
.count();
318+
done["prefill_ms"] = prefill_ms;
319+
done["decode_ms"] = decode_ms;
320+
done["total_ms"] = total_ms;
321+
done["prefill_tok_s"] = prefill_ms > 0.0
322+
? (static_cast<double>(prefilled) * 1000.0 / prefill_ms)
323+
: 0.0;
324+
done["decode_tok_s"] = decode_ms > 0.0
325+
? (static_cast<double>(num_generated) * 1000.0 / decode_ms)
326+
: 0.0;
302327
worker_emit(done);
303328
}
304329

examples/llm_server/python/serving_chat.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,29 @@ def _extract_tools(self, req: ChatCompletionRequest, text: str):
184184
text = parsed.normal_text
185185
return None, self._visible_content(text)
186186

187+
@staticmethod
188+
def _log_generation_stats(
189+
session_id: Optional[str], stats: GenStats, finish: str
190+
) -> None:
191+
logger.info(
192+
"llm_turn_stats session_id=%s reason=%s prompt_tokens=%d "
193+
"reused_prompt_tokens=%d prefilled_prompt_tokens=%d "
194+
"completion_tokens=%d prefill_ms=%.1f prefill_tok_s=%.1f "
195+
"decode_ms=%.1f decode_tok_s=%.1f total_ms=%.1f finish=%s",
196+
session_id or "<scratch>",
197+
stats.session_reset_reason,
198+
stats.prompt_tokens,
199+
stats.reused_prompt_tokens,
200+
stats.prefilled_prompt_tokens,
201+
stats.completion_tokens,
202+
stats.prefill_ms,
203+
stats.prefill_tok_s,
204+
stats.decode_ms,
205+
stats.decode_tok_s,
206+
stats.total_ms,
207+
finish,
208+
)
209+
187210
async def _clean(
188211
self, stream: AsyncIterator[str], stops: list[str], on_stop=None
189212
) -> AsyncIterator[str]:
@@ -480,6 +503,7 @@ async def _complete(
480503
finish = self._finish_reason(
481504
req, stats.completion_tokens, tool_calls, stopped, stats.finish_reason
482505
)
506+
self._log_generation_stats(req.session_id, stats, finish)
483507
return ChatCompletionResponse(
484508
model=self._model_id,
485509
choices=[
@@ -623,6 +647,7 @@ def chunk(delta: DeltaMessage, finish=None) -> str:
623647
stopped=stop_hit[0],
624648
worker_finish=stats.finish_reason,
625649
)
650+
self._log_generation_stats(req.session_id, stats, finish)
626651
yield chunk(DeltaMessage(), finish=finish)
627652
if req.stream_options and req.stream_options.include_usage:
628653
usage_chunk = ChatCompletionChunk(

examples/llm_server/python/session_runtime.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ class GenStats:
7171
reused_prompt_tokens: int = 0
7272
prefilled_prompt_tokens: int = 0
7373
session_reset_reason: Optional[str] = None
74+
prefill_ms: float = 0.0
75+
decode_ms: float = 0.0
76+
total_ms: float = 0.0
77+
prefill_tok_s: float = 0.0
78+
decode_tok_s: float = 0.0
7479
# Exact token ids generated this turn, for an adapter's transcript
7580
# store. Empty when the worker doesn't report them (e.g. a stop-trimmed turn).
7681
generated_token_ids: list = field(default_factory=list)
@@ -115,6 +120,11 @@ def stats_cb(self, s) -> None:
115120
self._stats.reused_prompt_tokens = getattr(s, "reused_prompt_tokens", 0)
116121
self._stats.prefilled_prompt_tokens = getattr(s, "prefilled_prompt_tokens", 0)
117122
self._stats.session_reset_reason = getattr(s, "session_reset_reason", None)
123+
self._stats.prefill_ms = getattr(s, "prefill_ms", 0.0)
124+
self._stats.decode_ms = getattr(s, "decode_ms", 0.0)
125+
self._stats.total_ms = getattr(s, "total_ms", 0.0)
126+
self._stats.prefill_tok_s = getattr(s, "prefill_tok_s", 0.0)
127+
self._stats.decode_tok_s = getattr(s, "decode_tok_s", 0.0)
118128
self._stats.generated_token_ids = getattr(s, "generated_token_ids", [])
119129

120130
def run(self) -> None:

examples/llm_server/python/tests/test_session_runtime.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ class S:
9696
num_prompt_tokens = 3
9797
num_generated_tokens = 2
9898
finish_reason = "stop"
99+
prefill_ms = 4.0
100+
decode_ms = 5.0
101+
total_ms = 10.0
102+
prefill_tok_s = 750.0
103+
decode_tok_s = 400.0
99104
generated_token_ids = [10, 11]
100105

101106
stats_callback(S())
@@ -110,6 +115,11 @@ async def scenario():
110115
assert "".join(out) == "Hello world"
111116
assert stats.completion_tokens == 2
112117
assert stats.finish_reason == "stop"
118+
assert stats.prefill_ms == 4.0
119+
assert stats.decode_ms == 5.0
120+
assert stats.total_ms == 10.0
121+
assert stats.prefill_tok_s == 750.0
122+
assert stats.decode_tok_s == 400.0
113123
assert stats.generated_token_ids == [10, 11]
114124

115125

examples/llm_server/python/tests/test_worker_client.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,11 @@ def test_generate_parses_warm_resume_metrics():
211211
"reused_prompt_tokens": 90,
212212
"prefilled_prompt_tokens": 10,
213213
"session_reset_reason": "exact_prefix",
214+
"prefill_ms": 12.5,
215+
"decode_ms": 25.0,
216+
"total_ms": 40.0,
217+
"prefill_tok_s": 800.0,
218+
"decode_tok_s": 40.0,
214219
},
215220
)
216221
)
@@ -222,6 +227,11 @@ def test_generate_parses_warm_resume_metrics():
222227
assert st.reused_prompt_tokens == 90
223228
assert st.prefilled_prompt_tokens == 10
224229
assert st.session_reset_reason == "exact_prefix"
230+
assert st.prefill_ms == 12.5
231+
assert st.decode_ms == 25.0
232+
assert st.total_ms == 40.0
233+
assert st.prefill_tok_s == 800.0
234+
assert st.decode_tok_s == 40.0
225235

226236

227237
def test_spawn_worker_waits_for_ready():

examples/llm_server/python/worker_client.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ class WorkerStats:
5353
reused_prompt_tokens: int = 0
5454
prefilled_prompt_tokens: int = 0
5555
session_reset_reason: Optional[str] = None
56+
prefill_ms: float = 0.0
57+
decode_ms: float = 0.0
58+
total_ms: float = 0.0
59+
prefill_tok_s: float = 0.0
60+
decode_tok_s: float = 0.0
5661
# The exact (non-terminal) token ids generated this turn. The control plane
5762
# stores these per session and splices them back as an `ids` prompt segment
5863
# next turn, so a prior assistant span is an exact token extension instead of
@@ -167,6 +172,11 @@ def _on_done(msg: dict, stats_callback) -> None:
167172
reused_prompt_tokens=msg.get("reused_prompt_tokens", 0),
168173
prefilled_prompt_tokens=msg.get("prefilled_prompt_tokens", 0),
169174
session_reset_reason=reason,
175+
prefill_ms=msg.get("prefill_ms", 0.0),
176+
decode_ms=msg.get("decode_ms", 0.0),
177+
total_ms=msg.get("total_ms", 0.0),
178+
prefill_tok_s=msg.get("prefill_tok_s", 0.0),
179+
decode_tok_s=msg.get("decode_tok_s", 0.0),
170180
generated_token_ids=msg.get("generated_token_ids", []),
171181
)
172182
)

0 commit comments

Comments
 (0)