|
27 | 27 | FinishReason, InflightBatchingStats, |
28 | 28 | IterationStats, KvCacheStats, |
29 | 29 | RequestStage, RequestStats, |
30 | | - SpecDecodingStats, |
| 30 | + RequestType, SpecDecodingStats, |
31 | 31 | StaticBatchingStats) |
32 | 32 | from tensorrt_llm.bindings.internal.batch_manager import (LlmRequestType, |
33 | 33 | ReqIdsSet) |
@@ -1198,6 +1198,108 @@ def _update_iter_stats(self, stats, iter_latency_ms, num_completed_requests, |
1198 | 1198 | # Calculate draft overhead |
1199 | 1199 | stats.specdec_stats.draft_overhead = 0.0 if iter_latency_ms <= 0.0 else float( |
1200 | 1200 | draft_latency_ms) / float(iter_latency_ms) |
| 1201 | + |
| 1202 | + # Extra per-iteration request-aggregate counters attached to |
| 1203 | + # inflight_batching_stats. These complement the existing |
| 1204 | + # num_context_requests / num_gen_requests / num_ctx_tokens / |
| 1205 | + # num_paused_requests members with token-weighted counts and |
| 1206 | + # queue/paused KV accounting. |
| 1207 | + |
| 1208 | + # Tokens read from prior state (prefix-cache hits and |
| 1209 | + # previously-chunked tokens) summed across scheduled context |
| 1210 | + # requests; complements num_ctx_tokens (tokens computed this |
| 1211 | + # iteration). Read from py_last_context_chunk, a Python-side |
| 1212 | + # cache set by _update_request_states before state mutation — it |
| 1213 | + # stays valid after the request transitions to |
| 1214 | + # GENERATION_IN_PROGRESS, unlike the C++ getContextChunkSize() / |
| 1215 | + # getContextCurrentPosition() accessors that would raise |
| 1216 | + # RuntimeError on a mutated request. |
| 1217 | + num_ctx_kv_tokens = 0 |
| 1218 | + for req in scheduled_batch.context_requests: |
| 1219 | + if getattr(req, "is_attention_dp_dummy", False): |
| 1220 | + continue |
| 1221 | + last_chunk = getattr(req, "py_last_context_chunk", None) |
| 1222 | + if last_chunk is not None and last_chunk[0] is not None: |
| 1223 | + start, _end = last_chunk |
| 1224 | + num_ctx_kv_tokens += start |
| 1225 | + else: |
| 1226 | + try: |
| 1227 | + num_ctx_kv_tokens += \ |
| 1228 | + req.context_current_position |
| 1229 | + except RuntimeError: |
| 1230 | + pass |
| 1231 | + |
| 1232 | + # Total KV context length (prompt + tokens generated so far) |
| 1233 | + # summed across scheduled generation requests. |
| 1234 | + num_gen_kv_tokens = 0 |
| 1235 | + for req in scheduled_batch.generation_requests: |
| 1236 | + if getattr(req, "is_attention_dp_dummy", False): |
| 1237 | + continue |
| 1238 | + try: |
| 1239 | + num_gen_kv_tokens += req.get_num_tokens(0) |
| 1240 | + except RuntimeError: |
| 1241 | + pass |
| 1242 | + |
| 1243 | + # Normal requests waiting in the executor_request_queue that have |
| 1244 | + # never been scheduled. Excludes non-normal control items |
| 1245 | + # (shutdown/cancel) and items with a missing payload. Each queued |
| 1246 | + # item is a RequestQueueItem wrapping an ExecutorRequest |
| 1247 | + # (tle::Request). Requests are routed by request_type: |
| 1248 | + # - CONTEXT_AND_GENERATION (default) and CONTEXT_ONLY |
| 1249 | + # (disagg-prefill side) -> queued-context counters. |
| 1250 | + # - GENERATION_ONLY (disagg-decode side, awaiting KV transfer |
| 1251 | + # before they can start decoding) -> queued-gen counters. |
| 1252 | + # On a non-disagg engine all items land in the context counters; |
| 1253 | + # on a disagg-decode engine all items land in the gen counters. |
| 1254 | + num_queued_context_requests = 0 |
| 1255 | + num_queued_ctx_tokens = 0 |
| 1256 | + num_queued_gen_requests = 0 |
| 1257 | + num_queued_gen_kv_tokens = 0 |
| 1258 | + for item in list(self.executor_request_queue.get_request_queue().queue): |
| 1259 | + if not item.is_normal_request: |
| 1260 | + continue |
| 1261 | + if item.request is None: |
| 1262 | + continue |
| 1263 | + try: |
| 1264 | + token_count = len(item.request.input_token_ids) |
| 1265 | + except (AttributeError, TypeError) as e: |
| 1266 | + # Unusual request shape with no usable token payload; |
| 1267 | + # exclude from all queued counters so downstream consumers |
| 1268 | + # see consistent per-request averages. Not expected on the |
| 1269 | + # current API (ExecutorRequest construction requires a |
| 1270 | + # non-empty input_token_ids), but logged so that future API drift |
| 1271 | + # surfaces instead of being silently dropped. |
| 1272 | + logger.warning(f"Excluding queued item {item.id} from queued " |
| 1273 | + f"counters: input_token_ids not readable " |
| 1274 | + f"({type(e).__name__})") |
| 1275 | + continue |
| 1276 | + if item.request.request_type == RequestType.REQUEST_TYPE_GENERATION_ONLY: |
| 1277 | + num_queued_gen_requests += 1 |
| 1278 | + num_queued_gen_kv_tokens += token_count |
| 1279 | + else: |
| 1280 | + num_queued_context_requests += 1 |
| 1281 | + num_queued_ctx_tokens += token_count |
| 1282 | + |
| 1283 | + # Total KV context length summed across paused (preempted-decode) |
| 1284 | + # requests, i.e. requests that were decoding but got evicted back to |
| 1285 | + # the waiting pool for this iteration. |
| 1286 | + num_paused_kv_tokens = 0 |
| 1287 | + for req in scheduled_batch.paused_requests: |
| 1288 | + if getattr(req, "is_attention_dp_dummy", False): |
| 1289 | + continue |
| 1290 | + try: |
| 1291 | + num_paused_kv_tokens += req.get_num_tokens(0) |
| 1292 | + except RuntimeError: |
| 1293 | + pass |
| 1294 | + |
| 1295 | + stats.inflight_batching_stats.num_ctx_kv_tokens = num_ctx_kv_tokens |
| 1296 | + stats.inflight_batching_stats.num_gen_kv_tokens = num_gen_kv_tokens |
| 1297 | + stats.inflight_batching_stats.num_queued_context_requests = num_queued_context_requests |
| 1298 | + stats.inflight_batching_stats.num_queued_ctx_tokens = num_queued_ctx_tokens |
| 1299 | + stats.inflight_batching_stats.num_queued_gen_requests = num_queued_gen_requests |
| 1300 | + stats.inflight_batching_stats.num_queued_gen_kv_tokens = num_queued_gen_kv_tokens |
| 1301 | + stats.inflight_batching_stats.num_paused_kv_tokens = num_paused_kv_tokens |
| 1302 | + |
1201 | 1303 | return stats |
1202 | 1304 |
|
1203 | 1305 | def _append_iter_stats(self, |
|
0 commit comments