perf(router): sweep output-token deltas once per print interval

sufubao · sufubao · commit 3f607ea9f9df · 2026-05-09T11:30:59.000+08:00
Move the per-running-req shm_cur_output_len delta tracking from the
router tick (~33 Hz) into SystemStatusReporter.maybe_print, which only
runs once per log_stats_interval (>= 5s). The reporter now owns the
per-req snapshot dict and exposes discard_req(req) for tail settlement
when a req leaves the running batch, so the router loop's hot path no
longer walks the batch every schedule cycle. Output TPS accuracy is
unchanged: still based on real shm_cur_output_len deltas, with tail
tokens settled at completion.
diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py
@@ -67,9 +67,6 @@ def __init__(self, args: StartArgs):
         # 初始化 radix_cache_client 用于读取 prompt cache 的管理信息
         self.radix_cache_client = None
         self.status_reporter = None
-        # Track shm_cur_output_len per running request to compute per-tick deltas
-        # for accurate output TPS regardless of router schedule interval.
-        self._req_last_output_len: Dict[int, int] = {}
 
         # 共享变量，用于存储router端调度分析得到的机器负载信息
         self.shared_token_load = TokenLoad(f"{get_unique_server_name()}_shared_token_load", self.dp_size_in_node)
@@ -249,18 +246,8 @@ async def loop_for_fwd(
             await self._step()
             counter_count += 1
             if self.running_batch is not None:
-                # Count output tokens via per-request shm_cur_output_len deltas, since the
-                # router loop runs on schedule_time_interval and len(reqs) is not a per-step
-                # token count.
-                new_output_tokens = 0
-                for req in self.running_batch.reqs:
-                    cur_out_len = req.shm_cur_output_len
-                    prev_out_len = self._req_last_output_len.get(req.request_id, 0)
-                    if cur_out_len > prev_out_len:
-                        new_output_tokens += cur_out_len - prev_out_len
-                        self._req_last_output_len[req.request_id] = cur_out_len
-                if new_output_tokens:
-                    self.status_reporter.count_output_tokens(new_output_tokens)
+                # Output-token counting is done in bulk at the print-window boundary
+                # inside SystemStatusReporter.maybe_print, so the router tick stays cheap.
                 if counter_count % 100 == 0:
                     self.metric_client.gauge_set("lightllm_batch_pause_size", self._get_paused_req_num())
                 # pd decode mode need to update token_load more frequently
@@ -357,19 +344,16 @@ def _filter_reqs_from_running_batch(self):
             for req in self.running_batch.reqs:
                 if not req.shm_infer_released:
                     continue
-                # Settle any output-token delta produced after the last router tick
-                # so windowed TPS does not lose the request's tail tokens.
-                cur_out_len = req.shm_cur_output_len
-                prev_out_len = self._req_last_output_len.pop(req.request_id, 0)
-                if cur_out_len > prev_out_len:
-                    self.status_reporter.count_output_tokens(cur_out_len - prev_out_len)
+                # Settle any output-token tail produced after the last window boundary,
+                # so windowed TPS does not lose the req's last tokens.
+                self.status_reporter.discard_req(req)
                 # Aborted/disconnected requests can leave a partial output_len that
                 # would bias the EMA toward shorter generations; skip them.
                 if req.is_aborted:
                     continue
                 self.status_reporter.on_request_completed(
                     input_len=req.input_len,
-                    output_len=cur_out_len,
+                    output_len=req.shm_cur_output_len,
                     cache_len=req.prompt_cache_len,
                     mtp_accepted=req.mtp_accepted_token_num,
                 )
diff --git a/lightllm/server/router/stats.py b/lightllm/server/router/stats.py
@@ -1,5 +1,6 @@
 import time
 import logging
+from typing import Dict
 from lightllm.server.core.objs import StartArgs
 from lightllm.utils.log_utils import init_system_status_logger
 
@@ -31,13 +32,23 @@ def __init__(self, args, max_total_token_num, dp_size_in_node):
         self.global_mtp_output_total = 0
         self.global_mtp_accepted_total = 0
 
+        # Per-req shm_cur_output_len snapshot at the previous window boundary,
+        # used to compute the windowed output-token count without per-tick scans.
+        self._req_last_output_len: Dict[int, int] = {}
+
     def count_prompt_tokens(self, num_tokens: int):
         if self.enabled:
             self.prompt_tokens += num_tokens
 
-    def count_output_tokens(self, num_tokens: int):
-        if self.enabled:
-            self.output_tokens += num_tokens
+    def discard_req(self, req):
+        """Settle a finished/aborted req's tail output tokens (those produced after the last
+        window-boundary sweep) and drop its tracking entry."""
+        if not self.enabled:
+            return
+        cur_out_len = req.shm_cur_output_len
+        prev_out_len = self._req_last_output_len.pop(req.request_id, 0)
+        if cur_out_len > prev_out_len:
+            self.output_tokens += cur_out_len - prev_out_len
 
     def on_request_completed(self, input_len: int, output_len: int, cache_len: int, mtp_accepted: int):
         if self.enabled:
@@ -64,6 +75,17 @@ def maybe_print(
         if elapsed < self.interval:
             return
 
+        # Single bulk sweep at the window boundary: account for output tokens produced
+        # by every still-running req since the previous boundary, and refresh their
+        # snapshots. Reqs that finished in this window already settled via discard_req.
+        if running_batch is not None:
+            for req in running_batch.reqs:
+                cur_out_len = req.shm_cur_output_len
+                prev_out_len = self._req_last_output_len.get(req.request_id, 0)
+                if cur_out_len > prev_out_len:
+                    self.output_tokens += cur_out_len - prev_out_len
+                    self._req_last_output_len[req.request_id] = cur_out_len
+
         total_tps = (self.prompt_tokens + self.output_tokens) / elapsed
         input_tps = self.prompt_tokens / elapsed
         output_tps = self.output_tokens / elapsed