@@ -789,6 +789,7 @@ def _end_transfer_and_maybe_terminate(self, request: LlmRequest):
789789 response = request .create_response (False , self .dist .rank )
790790 if response :
791791 response .result .cached_tokens = request .cached_tokens
792+ self ._maybe_attach_ctx_usage (request , response )
792793 # Buffer the response instead of enqueueing immediately.
793794 # With ADP, _enqueue_responses does a tp_gather collective.
794795 # Calling it here would deadlock because only the owning DP
@@ -4321,6 +4322,15 @@ def fail_request(message: str) -> bool:
43214322 cum_log_probs [seq_slot , :beam_width ].copy_ (values )
43224323 return True
43234324
4325+ @staticmethod
4326+ def _maybe_attach_ctx_usage (request : LlmRequest , response ):
4327+ """Surface gen-first ctx usage (delivered via the KV-transfer aux
4328+ buffer in RxSession.unpack_aux) onto the response so the postprocessor
4329+ adopts the context-side prompt/cached token accounting."""
4330+ disagg_params = request .py_disaggregated_params
4331+ if disagg_params is not None and disagg_params .ctx_usage is not None :
4332+ response .result .ctx_usage = disagg_params .ctx_usage
4333+
43244334 def _maybe_prepend_logprobs_and_logits (self , req , beam_width ):
43254335 """Prepend logprobs and generation logits for first_gen_tokens
43264336 if transferred from prefill."""
@@ -4980,6 +4990,7 @@ def _emit_first_token_responses(self, prev_scheduled_requests):
49804990 if response is None :
49814991 continue
49824992 response .result .cached_tokens = request .cached_tokens
4993+ self ._maybe_attach_ctx_usage (request , response )
49834994 if logits_snapshot is not None :
49844995 response .result .generation_logits = logits_snapshot
49854996 new_responses .append ((request .py_request_id , response ))
@@ -5067,6 +5078,7 @@ def _handle_responses(self, emit_first_iter: bool = True):
50675078 if response :
50685079 request_done = request .is_finished
50695080 response .result .cached_tokens = request .cached_tokens
5081+ self ._maybe_attach_ctx_usage (request , response )
50705082 response .result .per_pos_drafted = request .py_per_pos_drafted
50715083 response .result .per_pos_accepted = request .py_per_pos_accepted
50725084 new_responses .append ((req_id , response ))
0 commit comments