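Add usage.prompt_tokens_details.cached_tokens for prefix caching

Also included in this change:
- benchmark_chat_completion.py forwards optional per-row ``tools`` and
  ``tool_choice`` fields to ``/v1/chat/completions`` and reports
  ``cached_tokens`` / ``cache_hit_rate`` in its traces and summaries.
- scheduler.py passes the popped sequence (``seq``) to ``block_trie.match``
  instead of the ``migration_waiting`` list.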

lvhan028 · lvhan028 · commit ebe59b20f8ee · 2026-06-10T13:35:24.000Z
diff --git a/benchmark/benchmark_chat_completion.py b/benchmark/benchmark_chat_completion.py
@@ -2,7 +2,8 @@
 
 This script focuses on eval-style JSONL dumps where each row contains OpenAI
 chat ``messages``, or a string/list ``prompt`` (e.g. dapo-math-17k). List-type
-``prompt`` values are treated as message lists. It records streaming latency traces,
+``prompt`` values are treated as message lists. Optional per-row ``tools`` and
+``tool_choice`` fields are forwarded to ``/v1/chat/completions``. The script records streaming latency traces,
 aggregates TTFT/ITL/TPOT metrics, and writes table plus report artifacts for concurrency/RPS sweeps.
 
 Generation options include ``--output-tokens`` (``max_completion_tokens``),
@@ -42,6 +43,8 @@ class BenchmarkRequest:
     messages: list[dict[str, Any]] = field(default_factory=list)
     input_ids: list[int] | None = None
     image_data: Any = None
+    tools: list[dict[str, Any]] | None = None
+    tool_choice: Any | None = None
 
 
 @dataclass
@@ -73,6 +76,7 @@ class RequestTrace:
     chunk_times: list[float] = field(default_factory=list)
     prompt_tokens: int = 0
     completion_tokens: int = 0
+    cached_tokens: int = 0
     usage_available: bool = False
     generated_text: str = ''
     reasoning_text: str = ''
@@ -205,6 +209,21 @@ def _extract_messages(row: dict[str, Any]) -> list[dict[str, Any]]:
     raise ValueError('row must contain messages or prompt')
 
 
+def _extract_tools(row: dict[str, Any]) -> list[dict[str, Any]] | None:
+    tools = row.get('tools')
+    if not tools:
+        return None
+    if not isinstance(tools, list):
+        raise ValueError('tools must be a list when present')
+    return tools
+
+
+def _extract_tool_choice(row: dict[str, Any]) -> Any | None:
+    if 'tool_choice' not in row:
+        return None
+    return row['tool_choice']
+
+
 def _normalize_row(
     row: dict[str, Any],
     dataset: str,
@@ -213,23 +232,35 @@ def _normalize_row(
 ) -> BenchmarkRequest:
     request_id = str(row.get('id', f'{dataset}-{row_index}'))
     messages = _extract_messages(row)
+    tools = _extract_tools(row)
+    tool_choice = _extract_tool_choice(row)
 
     if tokenizer is not None:
-        prompt_str = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True,
-        )
+        template_kwargs: dict[str, Any] = {
+            'tokenize': False,
+            'add_generation_prompt': True,
+        }
+        if tools is not None:
+            template_kwargs['tools'] = tools
+        prompt_str = tokenizer.apply_chat_template(messages, **template_kwargs)
         return BenchmarkRequest(
             dataset=dataset,
             id=request_id,
             input_ids=tokenizer.encode(prompt_str, add_special_tokens=False),
             image_data=row.get('image_data'),
+            tools=tools,
+            tool_choice=tool_choice,
         )
 
     if not messages:
         raise ValueError(f'row {row_index} in {dataset} has invalid messages')
-    return BenchmarkRequest(dataset=dataset, id=request_id, messages=messages)
+    return BenchmarkRequest(
+        dataset=dataset,
+        id=request_id,
+        messages=messages,
+        tools=tools,
+        tool_choice=tool_choice,
+    )
 
 
 def _read_raw_rows(
@@ -326,6 +357,13 @@ def parse_sse_line(line: bytes | str) -> SSEEvent:
     )
 
 
+def _cached_tokens_from_usage(usage: dict[str, Any] | None) -> int:
+    if not usage:
+        return 0
+    details = usage.get('prompt_tokens_details') or {}
+    return int(details.get('cached_tokens', 0) or 0)
+
+
 def build_payload(
     request: BenchmarkRequest,
     model: str,
@@ -372,6 +410,10 @@ def build_payload(
         payload['logprobs'] = True
         if top_logprobs is not None:
             payload['top_logprobs'] = top_logprobs
+    if request.tools:
+        payload['tools'] = request.tools
+    if request.tool_choice is not None:
+        payload['tool_choice'] = request.tool_choice
     if extra_body:
         payload.update(extra_body)
     return payload
@@ -482,6 +524,7 @@ async def request_chat_completion(
                         trace.completion_tokens = int(
                             event.usage.get('completion_tokens', trace.completion_tokens) or 0
                         )
+                        trace.cached_tokens = _cached_tokens_from_usage(event.usage)
                     if event.routed_experts and shared_store is not None:
                         try:
                             await fetch_routed_experts(shared_store, event.routed_experts)
@@ -632,6 +675,7 @@ def aggregate_traces(traces: Sequence[RequestTrace]) -> list[dict[str, Any]]:
         duration = max(end - start, 0.0)
         total_input = sum(trace.prompt_tokens for trace in completed)
         total_output = sum(trace.completion_tokens for trace in completed)
+        total_cached = sum(trace.cached_tokens for trace in completed)
         itls = [itl for trace in completed for itl in trace.itls_s]
 
         summary: dict[str, Any] = {
@@ -646,6 +690,8 @@ def aggregate_traces(traces: Sequence[RequestTrace]) -> list[dict[str, Any]]:
             'duration_s': duration,
             'total_input_tokens': total_input,
             'total_output_tokens': total_output,
+            'total_cached_tokens': total_cached,
+            'cache_hit_rate': total_cached / total_input if total_input > 0 else 0.0,
             'request_throughput_req_s': len(completed) / duration if duration > 0 else 0.0,
             'input_throughput_tok_s': total_input / duration if duration > 0 else 0.0,
             'output_throughput_tok_s': total_output / duration if duration > 0 else 0.0,
@@ -691,6 +737,7 @@ def _write_requests_csv(path: Path, rows: Sequence[dict[str, Any]]) -> None:
         'e2e_latency_s',
         'prompt_tokens',
         'completion_tokens',
+        'cached_tokens',
         'usage_available',
         'finish_reason',
         'error',
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
@@ -536,6 +536,7 @@ class Response:
     last_hidden_state: torch.Tensor = None
     index: int = 0
     routed_experts: Any = None
+    cached_tokens: int = 0
 
     def __str__(self):
         return f'text={self.text}\n{self._format_none_text_fields()}'
@@ -651,6 +652,7 @@ class RequestMetrics:
     token_timestamp: float = 0.0
     engine_events: list[EngineEvent] = field(default_factory=list)
     spec_info: dict[str, Any] | None = None
+    cached_tokens: int = 0
 
 
 @dataclass
@@ -674,6 +676,7 @@ class EngineOutput:
     cache_block_ids: list[int] | None = None
     req_metrics: RequestMetrics | None = None
     routed_experts: torch.Tensor = None
+    cached_tokens: int = 0
 
 
 @dataclass
diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py
@@ -271,6 +271,26 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0):
                 buckets=build_1_2_5_buckets(max_model_len),
                 labelnames=labelnames).labels(*labelvalues)
 
+        self.histogram_num_cached_tokens_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:request_cached_tokens',
+                documentation='Number of prefix-cached input tokens per request.',
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames).labels(*labelvalues)
+
+        self.histogram_cache_hit_ratio_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:request_cache_hit_ratio',
+                documentation='Prefix cache hit ratio (cached_tokens / prompt_tokens) per request.',
+                buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
+                labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_cached_tokens_total = \
+            prometheus_client.Counter(
+                name='lmdeploy:cached_tokens_total',
+                documentation='Total prefix-cached input tokens served.',
+                labelnames=labelnames).labels(*labelvalues)
+
         self.histogram_iteration_tokens = \
             prometheus_client.Histogram(
                 name='lmdeploy:iteration_tokens_total',
@@ -385,6 +405,10 @@ def record_finish(self, stats: RequestStats) -> None:
         self.histogram_decode_time_request.observe(stats.decode_time_interval)
         self.histogram_num_prompt_tokens_request.observe(stats.prompt_tokens)
         self.histogram_num_generation_tokens_request.observe(stats.generation_tokens)
+        self.histogram_num_cached_tokens_request.observe(stats.cached_tokens)
+        if stats.prompt_tokens > 0:
+            self.histogram_cache_hit_ratio_request.observe(stats.cached_tokens / stats.prompt_tokens)
+        self.counter_cached_tokens_total.inc(stats.cached_tokens)
 
     @staticmethod
     def _get_counter_value(counter) -> float:
diff --git a/lmdeploy/metrics/metrics_processor.py b/lmdeploy/metrics/metrics_processor.py
@@ -51,6 +51,8 @@ async def _run_metrics_handler(self):
                 outputs, req_stats, iteration_stats, specdecode_stats = update_data
 
                 # update request stats
+                if outputs:
+                    req_stats.cached_tokens = outputs.cached_tokens
                 if outputs and outputs.req_metrics:
                     # when users visit "/abort_request" endpoint, `req_metrics` might be None
                     req_stats.update_from_events(outputs.req_metrics.engine_events)
diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py
@@ -124,6 +124,7 @@ def __init__(self, arrival_time: float = None, prompt_tokens: int = 0):
         self.prompt_tokens = prompt_tokens
 
         self.generation_tokens: int = 0
+        self.cached_tokens: int = 0
         self.queued_time: float = 0.0
         self.scheduled_time: float = 0.0
         self.first_token_time: float = 0.0
diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py
@@ -224,6 +224,7 @@ async def async_stream_infer(self,
 
             cache_block_ids = resp.data.get('cache_block_ids', None) if resp.data else None
             req_metrics = resp.data.get('req_metrics', None) if resp.data else None
+            cached_tokens = req_metrics.cached_tokens if req_metrics is not None else 0
             logprobs = resp.data.pop('logprobs', None) if resp.data else None
 
             if resp.type == ResponseType.SUCCESS:
@@ -234,6 +235,7 @@ async def async_stream_infer(self,
                                    token_ids[output_offset:].tolist(),
                                    cache_block_ids=cache_block_ids,
                                    req_metrics=req_metrics,
+                                   cached_tokens=cached_tokens,
                                    logprobs=logprobs)
                 output_offset = len(token_ids)
             elif resp.type in (ResponseType.FINISH, ResponseType.CANCEL):
@@ -258,6 +260,7 @@ async def async_stream_infer(self,
                                    logits=logits,
                                    cache_block_ids=cache_block_ids,
                                    req_metrics=req_metrics,
+                                   cached_tokens=cached_tokens,
                                    routed_experts=routed_experts,
                                    logprobs=logprobs)
                 break
diff --git a/lmdeploy/pytorch/engine/engine_loop.py b/lmdeploy/pytorch/engine/engine_loop.py
@@ -351,7 +351,10 @@ def __get_logprobs(batched_outputs: 'BatchedOutputs'):
             if num_draft_tokens is not None and model_inputs is None and self.config.enable_metrics:
                 num_accepted_tokens = (batched_outputs.next_token_ids[idx] > -1).sum() - 1
                 spec_info = dict(num_draft_tokens=num_draft_tokens, num_accepted_tokens=num_accepted_tokens.item())
-            req_metrics = RequestMetrics(new_token_timestamp, msg.engine_events, spec_info=spec_info)
+            req_metrics = RequestMetrics(new_token_timestamp,
+                                         msg.engine_events,
+                                         spec_info=spec_info,
+                                         cached_tokens=msg.prefix_cache_hit_tokens)
             out = InferOutput(session_id=session_id,
                               resp=msg.resp,
                               finish=finish,
diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py
@@ -657,6 +657,9 @@ class SchedulerSequence:
     # mrope
     history_mrope_pos_ids: HistoryMropePosIds = field(default_factory=HistoryMropePosIds)
 
+    # prefix caching
+    prefix_cache_hit_tokens: int = 0
+
     def __post_init__(self):
         """Post init."""
         self._seq_meta: SequenceMeta = self.session.seq_meta
diff --git a/lmdeploy/pytorch/paging/block_trie.py b/lmdeploy/pytorch/paging/block_trie.py
@@ -83,6 +83,7 @@ def get_root(self, adapter_name: str):
     def match(self, seq: SchedulerSequence):
         """Match sequence and cache."""
         if not self.enable:
+            seq.prefix_cache_hit_tokens = 0
             return
 
         block_size = self.block_size
@@ -124,6 +125,7 @@ def __match_success(node: Node):
         # record prefix hit
         self.stats.num_query_tokens += seq.num_all_ids - init_num_matched
         self.stats.num_hit_tokens += num_matched - init_num_matched
+        seq.prefix_cache_hit_tokens = num_matched - init_num_matched
 
         seq.logical_blocks.last_shared_node = curr
 
diff --git a/lmdeploy/pytorch/paging/scheduler.py b/lmdeploy/pytorch/paging/scheduler.py
@@ -159,7 +159,7 @@ def _reorder_migrating():
         max_batches = self.scheduler_config.max_batches - self.num_ready() - self.num_running()
         while len(migration_waiting) > 0 and len(migration_ready) < max_batches:
             seq = migration_waiting.pop(0)
-            self.block_trie.match(migration_waiting)
+            self.block_trie.match(seq)
             if not __evict_for_seq(seq, migration_waiting):
                 break
 
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
@@ -54,6 +54,7 @@ class GenOut:
     last_hidden_state: Any = None
     cache_block_ids: list[int] | None = None  # for disaggregation
     routed_experts: Any = None  # for RL router replay
+    cached_tokens: int = 0
 
     def to_response(self, index: int = 0) -> Response:
         """Convert GenOut to Response object.
@@ -70,6 +71,7 @@ def to_response(self, index: int = 0) -> Response:
                         last_hidden_state=self.last_hidden_state,
                         logits=self.logits,
                         routed_experts=self.routed_experts,
+                        cached_tokens=self.cached_tokens,
                         index=index)
 
 
@@ -647,6 +649,7 @@ def is_error(status):
             response = ''
             response_chunks = []
             finish_reason = None
+            cached_tokens = 0
             async with self.safe_run(handle,
                                      session=session,
                                      **prompt_input,
@@ -664,6 +667,7 @@ def is_error(status):
                 outputs = EngineOutput(ResponseType.INTERNAL_ENGINE_ERROR, [])
 
                 async for outputs in gen:
+                    cached_tokens = outputs.cached_tokens
                     iteration_stats = IterationStats()  # per-iteration stats
                     specdecode_stats = SpeculativeDecodingStats(
                         self.num_spec_token) if self.num_spec_token > 0 else None
@@ -699,7 +703,8 @@ def is_error(status):
                                  finish_reason,
                                  token_ids=res,
                                  routed_experts=outputs.routed_experts,
-                                 cache_block_ids=outputs.cache_block_ids)
+                                 cache_block_ids=outputs.cache_block_ids,
+                                 cached_tokens=cached_tokens)
                     if outputs.logprobs is not None:
                         out.logprobs = (outputs.logprobs[:-hit_stop_token] if hit_stop_token else outputs.logprobs)
                     if outputs.last_hidden_state is not None:
@@ -752,7 +757,8 @@ def is_error(status):
                                  logits=logits,
                                  last_hidden_state=last_hidden_state,
                                  routed_experts=routed_experts,
-                                 cache_block_ids=outputs.cache_block_ids)
+                                 cache_block_ids=outputs.cache_block_ids,
+                                 cached_tokens=cached_tokens)
                     # Note: We remove the session step update here. Let the caller(e.g., pipeline.chat) take care of it.
                 else:
                     logger.error(f'session {session_id} finished, {outputs.status}, '
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py
diff --git a/tests/test_lmdeploy/serve/openai/test_usage_info.py b/tests/test_lmdeploy/serve/openai/test_usage_info.py
diff --git a/tests/test_lmdeploy/test_prefix_cache_hit_tokens.py b/tests/test_lmdeploy/test_prefix_cache_hit_tokens.py