Commit 39c5a53

new log
1 parent a087340 commit 39c5a53

8 files changed: 180 additions & 47 deletions

lightllm/common/basemodel/cuda_graph.py

Lines changed: 13 additions & 2 deletions
```diff
@@ -3,6 +3,7 @@
 import copy
 import bisect
 from typing import Optional
+from tqdm import tqdm
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.distributed import dist_group_manager, lightllm_capture_graph, CustomProcessGroup
@@ -196,7 +197,11 @@ def warmup(self, model):
         model: TpPartBaseModel = model

         # decode cuda graph init
-        for batch_size in self.cuda_graph_batch_sizes[::-1]:
+        progress_bar = tqdm(self.cuda_graph_batch_sizes[::-1], desc="Capturing CUDA graphs")
+        for batch_size in progress_bar:
+            avail_mem, _ = torch.cuda.mem_get_info()
+            avail_mem_gb = avail_mem / (1024 ** 3)
+            progress_bar.set_description(f"Capturing CUDA graphs - Batch: {batch_size}, AvailMem: {avail_mem_gb:.2f}GB")
             seq_len = 2
             total_token_num = batch_size * seq_len
             max_len_in_batch = self.graph_max_len_in_batch
@@ -251,7 +256,13 @@ def warmup_overlap(self, model):

         model: TpPartBaseModel = model

-        for batch_size in self.cuda_graph_batch_sizes[::-1]:
+        progress_bar = tqdm(self.cuda_graph_batch_sizes[::-1], desc="Capturing overlap CUDA graphs")
+        for batch_size in progress_bar:
+            avail_mem, _ = torch.cuda.mem_get_info()
+            avail_mem_gb = avail_mem / (1024 ** 3)
+            progress_bar.set_description(
+                f"Capturing overlap CUDA graphs - Batch: {batch_size}, AvailMem: {avail_mem_gb:.2f}GB"
+            )
             decode_batches = []
             for micro_batch_index in [0, 1]:
                 # dummy decoding, capture the cudagraph
```
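
The progress-bar pattern above is easy to lift out: a tqdm bar whose description is refreshed each iteration with the currently free device memory. A minimal standalone sketch, assuming a CUDA-capable torch build (the batch sizes here are placeholders for `self.cuda_graph_batch_sizes`):

```python
import torch
from tqdm import tqdm

batch_sizes = [64, 32, 16, 8, 4, 2, 1]  # placeholder for self.cuda_graph_batch_sizes

progress_bar = tqdm(batch_sizes, desc="Capturing CUDA graphs")
for batch_size in progress_bar:
    # torch.cuda.mem_get_info() returns (free_bytes, total_bytes) for the current device
    avail_mem, _ = torch.cuda.mem_get_info()
    avail_mem_gb = avail_mem / (1024 ** 3)
    progress_bar.set_description(
        f"Capturing CUDA graphs - Batch: {batch_size}, AvailMem: {avail_mem_gb:.2f}GB"
    )
    # ... capture the CUDA graph for this batch size ...
```

Since each captured graph holds on to memory, the free-memory readout should shrink as the loop progresses, which makes capture-time memory regressions visible directly in the bar.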

lightllm/common/triton_utils/autotuner.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -215,7 +215,7 @@ def _try_load_cache(self, static_key):

         cache_file = os.path.join(self.cache_dir, KernelConfigs.get_config_file_name(static_key))
         if os.path.exists(cache_file):
-            logger.info(f"Loading cached configs for {self.kernel_name} - {static_key}")
+            logger.info(f"Loading cached configs for {self.kernel_name} - {dict(static_key)}")
             with open(cache_file, "rb") as f:
                 self.cached_configs[static_key] = orjson.loads(f.read())
                 return True
@@ -353,9 +353,9 @@ def _autotune(self, args, kwargs, static_key, run_key, rank_id, world_size):
                     option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS | orjson.OPT_NON_STR_KEYS,
                 )
             )
-            logger.info(f"Saved configs for {self.kernel_name} - {_static_key}")
+            logger.info(f"Saved configs for {self.kernel_name} - {dict(_static_key)}")

-        logger.info(f"rank {rank_id} tuning {self.kernel_name} _static_key {static_key} finished")
+        logger.info(f"rank {rank_id} tuning {self.kernel_name} _static_key {dict(static_key)} finished")

     def _mutate_args_clone(self, args, kwargs):
         origin_list = []
```
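
The only change here is how the cache key renders in the log line. Wrapping it in `dict(...)` suggests `static_key` is a hashable sequence of `(name, value)` pairs, which prints far more readably as a dict; that shape is an assumption for this illustration:

```python
# Assumed shape: a hashable tuple of (name, value) pairs used as a cache key.
static_key = (("dtype", "float16"), ("head_dim", 128), ("is_causal", True))

print(f"configs - {static_key}")
# configs - (('dtype', 'float16'), ('head_dim', 128), ('is_causal', True))

print(f"configs - {dict(static_key)}")
# configs - {'dtype': 'float16', 'head_dim': 128, 'is_causal': True}
```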

lightllm/server/detokenization/manager.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -52,7 +52,7 @@ def _add_new_group_req_index(self, recv_obj: GroupReqIndexes):
             req.link_prompt_ids_shm_array()
             req.link_logprobs_shm_array()

-            logger.info(
+            logger.debug(
                 f"detokenization recv req id {req.request_id} " f"cost time {time.time() - recv_obj.time_mark} s"
             )

@@ -160,7 +160,7 @@ def remove_finished_reqs(self):

         for decode_req in finished_reqs:
             decode_req.req.can_released_mark = True
-            logger.info(f"detoken release req id {decode_req.req.request_id}")
+            logger.debug(f"detoken release req id {decode_req.req.request_id}")
             self.shm_req_manager.put_back_req_obj(decode_req.req)
             self.req_id_to_out.pop(decode_req.request_id, None)
         return
```

lightllm/server/httpserver/manager.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -644,7 +644,7 @@ async def _wait_to_token_package(
                 (out_token_counter - sum(sub_req_id_to_mtp_accepted_token_num.values())), 1
             )
             format_start_time = datetime.datetime.fromtimestamp(start_time).strftime("%Y-%m-%d %H:%M:%S")
-            logger.info(
+            logger.debug(
                 f"X-Request-Id:{x_request_id} "
                 f"X-Session-Id:{x_session_id} start_time:{format_start_time} "
                 f"lightllm_req_id:{group_request_id} first_token_cost:{first_token_cost_ms}ms "
@@ -732,8 +732,8 @@ async def recycle_resource_loop(self):
                 if req_status is None:
                     continue

-                logger.info(
-                    f"left req id {req_status.group_req_objs.group_req_id}"
+                logger.debug(
+                    f"left req id {req_status.group_req_objs.group_req_id} "
                     f"can release {req_status.group_req_objs.shm_req_objs[0].can_released_mark} "
                     f"refcount {req_status.group_req_objs.shm_req_objs[0].ref_count}"
                 )
```

lightllm/server/router/batch.py

Lines changed: 2 additions & 6 deletions
```diff
@@ -3,7 +3,6 @@
 from typing import Dict, List, Optional, Tuple, Union
 from lightllm.server.core.objs import ShmReqManager, Req
 from lightllm.utils.log_utils import init_logger
-from .stats import RouterStatics

 logger = init_logger(__name__)

@@ -50,14 +49,11 @@ def get_all_dp_req_num(self) -> List[int]:
             all_dp_req_num[req.sample_params.suggested_dp_index] += 1
         return all_dp_req_num

-    def filter_out_finished_req(self, shm_req_manager: ShmReqManager, router_statics: RouterStatics):
+    def filter_out_finished_req(self, shm_req_manager: ShmReqManager):
         unfinished_req_ids = []
         for req in self.reqs:
             if req.shm_infer_released:
-                logger.info(f"router release req id {req.request_id}")
-                if not req.is_aborted:
-                    router_statics.update(req.candetoken_out_len)
-
+                logger.debug(f"router release req id {req.request_id}")
                 shm_req_manager.put_back_req_obj(req)
                 req = None
             else:
```

lightllm/server/router/manager.py

Lines changed: 32 additions & 28 deletions
```diff
@@ -16,6 +16,7 @@
 from .batch import Batch, Req
 from .model_infer.model_rpc import start_model_process, ModelRpcClient
 from .req_queue import build_req_queue
+from .stats import SystemStatusReporter
 from lightllm.server.core.objs.io_objs import (
     GroupReqIndexes,
     AbortedReqCmd,
@@ -25,7 +26,7 @@
 from .dynamic_prompt.radix_cache import RadixCacheReadOnlyClient
 from lightllm.server.multi_level_kv_cache.cpu_cache_client import CpuKvCacheClient
 from lightllm.server.core.objs.shm_objs_io_buffer import ShmObjsIOBuffer
-from lightllm.utils.log_utils import init_logger, log_time_ready
+from lightllm.utils.log_utils import init_logger
 from lightllm.server.router.token_load import TokenLoad
 from lightllm.server.metrics.manager import MetricClient
 from lightllm.common.basemodel.infer_lock import g_router_lock
@@ -65,6 +66,7 @@ def __init__(self, args: StartArgs):
         self.read_only_statics_mem_manager = ReadOnlyStaticsMemoryManager()
         # init radix_cache_client for reading prompt cache management info
         self.radix_cache_client = None
+        self.status_reporter = None

         # shared variable storing machine-load info produced by router-side scheduling analysis
         self.shared_token_load = TokenLoad(f"{get_unique_server_name()}_shared_token_load", self.dp_size_in_node)
@@ -194,6 +196,11 @@ async def wait_to_model_ready(self):
         )
         self.req_queue = build_req_queue(self.args, self, self.dp_size_in_node)
         logger.info(f"use req queue {self.req_queue.__class__.__name__}")
+        self.status_reporter = SystemStatusReporter(
+            args=self.args,
+            max_total_token_num=self.max_total_token_num,
+            dp_size_in_node=self.dp_size_in_node,
+        )

         if self.args.run_mode == "prefill":
             # start the prefill kv move manager process
@@ -239,26 +246,11 @@ async def loop_for_fwd(
             await self._step()
             counter_count += 1
             if self.running_batch is not None:
+                # Count output tokens (each running req produces ~1 token per decode step)
+                self.status_reporter.count_output_tokens(len(self.running_batch.reqs))
                 if counter_count % 100 == 0:
                     for dp_index in range(self.dp_size_in_node):
-                        token_ratio1 = self.get_used_tokens(dp_index) / self.max_total_token_num
-                        token_ratio2 = (
-                            self.max_total_token_num
-                            - self.read_only_statics_mem_manager.get_unrefed_token_num(dp_index)
-                        ) / self.max_total_token_num
-                        d_i = dp_index
-                        frozen_token_num = self.shared_token_load.get_frozened_token_count(d_i)
-                        estimated_peak_token_count = self.shared_token_load.get_estimated_peak_token_count(d_i)
-                        paused_req_num = self._get_paused_req_num_in_dp_index(dp_index=d_i)
-                        logger.debug(
-                            f"dp_i {d_i} current batch size: {len(self.running_batch.reqs)} \n"
-                            f"dp_i {d_i} paused req num: {paused_req_num} \n"
-                            f"dp_i {d_i} frozen token num: {frozen_token_num} \n"
-                            f"dp_i {d_i} estimated_peak_token_count: {estimated_peak_token_count} \n"
-                            f"dp_i {d_i} token used ratio: {token_ratio1} not contain prompt cache tree unrefed token\n"
-                            f"dp_i {d_i} token used ratio: {token_ratio2} contain prompt cache tree unrefed token"
-                        )
-                        logger.debug(self.router_statics.log_str())
+                        paused_req_num = self._get_paused_req_num_in_dp_index(dp_index=dp_index)
                         self.metric_client.gauge_set("lightllm_batch_pause_size", paused_req_num)
             # pd decode mode need to update token_load more frequently
             self.req_queue.update_token_load(self.running_batch, force_update=self.is_pd_decode_mode)
@@ -278,13 +270,15 @@ async def loop_for_fwd(
                 self.metric_client.gauge_set("lightllm_batch_pause_size", 0.0)
                 self.metric_client.gauge_set("lightllm_queue_size", 0.0)
                 self.metric_client.gauge_set("lightllm_batch_current_max_tokens", 0.0)
-                # print once every 60 s
-                if log_time_ready("frozen_info", 60):
-                    for dp_i in range(self.dp_size_in_node):
-                        frozen_token_num = self.shared_token_load.get_frozened_token_count(dp_i)
-                        estimated_peak_token_count = self.shared_token_load.get_estimated_peak_token_count(dp_i)
-                        logger.debug(f"dp_i {dp_i} frozen token num: {frozen_token_num} \n")
-                        logger.debug(f"dp_i {dp_i} estimated_peak_token_count: {estimated_peak_token_count} \n")
+
+            self.status_reporter.maybe_print(
+                running_batch=self.running_batch,
+                req_queue=self.req_queue,
+                read_only_statics_mem_manager=self.read_only_statics_mem_manager,
+                paused_req_num=self._get_paused_req_num(),
+                radix_cache_client=self.radix_cache_client,
+                disable_dynamic_prompt_cache=self.args.disable_dynamic_prompt_cache,
+            )

             await asyncio.sleep(self._get_schedule_time_interval())

@@ -314,6 +308,7 @@ async def _step(self):

     async def _add_batch(self, batch: Batch):
         # add the new requests
+        self.status_reporter.count_prompt_tokens(batch.input_tokens())
         reqs = [r.to_router_rpc_obj() for r in batch.reqs]
         while not self.shm_reqs_io_buffer.is_empty():
             await asyncio.sleep(0.02)
@@ -350,7 +345,16 @@ def _add_new_batch_to_running_batch(self, new_batch: Batch):

     def _filter_reqs_from_running_batch(self):
         if self.running_batch is not None:
-            self.running_batch.filter_out_finished_req(self.shm_req_manager, self.router_statics)
+            # Capture finished req stats before filtering
+            for req in self.running_batch.reqs:
+                if req.shm_infer_released:
+                    self.status_reporter.on_request_completed(
+                        input_len=req.input_len,
+                        output_len=req.shm_cur_output_len,
+                        cache_len=req.prompt_cache_len,
+                        mtp_accepted=req.mtp_accepted_token_num,
+                    )
+            self.running_batch.filter_out_finished_req(self.shm_req_manager)
             if self.running_batch.is_clear():
                 self.running_batch = None
             return
@@ -422,7 +426,7 @@ def _add_req(self, group_req_indexes: GroupReqIndexes):
             req._router_stop_str_matched = False
             req_group.append(req)

-            logger.info(f"router recive req id {req.request_id} cost time {time.time() - req.start_time} s")
+            logger.debug(f"router recive req id {req.request_id} cost time {time.time() - req.start_time} s")
         self.req_queue.extend(req_group)
         self.send_to_detokenization.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
         return
```
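
Note the accounting hooks: `count_prompt_tokens` fires once per admitted batch with that batch's input tokens, while `count_output_tokens` fires every decode step with the running-request count, on the approximation that each running request emits about one token per step. Per-request stats are captured in `_filter_reqs_from_running_batch` before `filter_out_finished_req` runs, since filtering hands the request object back to the shared-memory manager. A toy illustration of the resulting window math (all numbers invented):

```python
# 50 decode steps with 32 running requests over a 10 s reporting window
output_tokens = 0
for _ in range(50):        # each iteration models one decode step
    output_tokens += 32    # ~1 new token per running request
elapsed = 10.0
print(f"out {output_tokens / elapsed:.1f} tok/s")  # -> out 160.0 tok/s
```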

lightllm/server/router/stats.py

Lines changed: 109 additions & 2 deletions
```diff
@@ -1,7 +1,114 @@
-from lightllm.utils.log_utils import init_logger
+import time
+import logging
 from lightllm.server.core.objs import StartArgs
+from lightllm.utils.log_utils import init_system_status_logger

-logger = init_logger(__name__)
+
+class SystemStatusReporter:
+    def __init__(self, args, max_total_token_num, dp_size_in_node):
+        self.enabled = not args.disable_log_stats
+        self.interval = max(5, args.log_stats_interval)
+        if args.log_stats_interval < 5:
+            logging.getLogger(__name__).warning(
+                f"log_stats_interval={args.log_stats_interval}s is below minimum, using 5s"
+            )
+        self.max_total_token_num = max_total_token_num
+        self.dp_size_in_node = dp_size_in_node
+        self.status_logger = init_system_status_logger("router")
+
+        # Accumulation counters (reset each interval)
+        self.last_print_time = time.time()
+        self.prompt_tokens = 0
+        self.output_tokens = 0
+
+        # Global counters (never reset, for lifetime stats)
+        self.global_input_total = 0
+        self.global_cache_total = 0
+        self.global_mtp_output_total = 0
+        self.global_mtp_accepted_total = 0
+
+    def count_prompt_tokens(self, num_tokens: int):
+        if self.enabled:
+            self.prompt_tokens += num_tokens
+
+    def count_output_tokens(self, num_tokens: int):
+        if self.enabled:
+            self.output_tokens += num_tokens
+
+    def on_request_completed(self, input_len: int, output_len: int, cache_len: int, mtp_accepted: int):
+        if self.enabled:
+            self.global_input_total += input_len
+            self.global_cache_total += cache_len
+            self.global_mtp_output_total += output_len
+            self.global_mtp_accepted_total += mtp_accepted
+
+    def maybe_print(
+        self,
+        running_batch,
+        req_queue,
+        read_only_statics_mem_manager,
+        paused_req_num=0,
+        radix_cache_client=None,
+        disable_dynamic_prompt_cache=False,
+    ):
+        if not self.enabled:
+            return
+        now = time.time()
+        elapsed = now - self.last_print_time
+        if elapsed < self.interval:
+            return
+
+        total_tps = (self.prompt_tokens + self.output_tokens) / elapsed
+        input_tps = self.prompt_tokens / elapsed
+        output_tps = self.output_tokens / elapsed
+
+        running = len(running_batch.reqs) if running_batch else 0
+        queued = req_queue.get_wait_req_num()
+
+        # Memory utilization (average across dp)
+        # kv_used: physical KV memory usage (includes prefix cache tree occupancy)
+        # kv_used_no_cache: effective usage excluding unrefed prefix cache tokens
+        kv_used_list = []
+        kv_used_no_cache_list = []
+        for dp_i in range(self.dp_size_in_node):
+            unrefed = read_only_statics_mem_manager.get_unrefed_token_num(dp_i)
+            used = self.max_total_token_num - unrefed
+            kv_used_list.append(used / self.max_total_token_num)
+            if not disable_dynamic_prompt_cache and radix_cache_client is not None:
+                cache_unrefed = radix_cache_client.get_unrefed_tokens_num(dp_i)
+                kv_used_no_cache_list.append((used - cache_unrefed) / self.max_total_token_num)
+            else:
+                kv_used_no_cache_list.append(used / self.max_total_token_num)
+        avg_kv_used = sum(kv_used_list) / len(kv_used_list)
+        avg_kv_used_no_cache = sum(kv_used_no_cache_list) / len(kv_used_no_cache_list)
+
+        # Global prefix cache hit rate
+        cache_hit_rate = (
+            (self.global_cache_total / self.global_input_total * 100) if self.global_input_total > 0 else 0.0
+        )
+
+        kv_pct = avg_kv_used * 100
+        kv_pct_no_cache = avg_kv_used_no_cache * 100
+
+        # Avg MTP accepted length (only shown when MTP is active)
+        mtp_suffix = ""
+        if self.global_mtp_accepted_total > 0:
+            decode_steps = self.global_mtp_output_total - self.global_mtp_accepted_total
+            avg_mtp_len = self.global_mtp_output_total / max(decode_steps, 1)
+            mtp_suffix = f" | MTP {avg_mtp_len:.2f}"
+
+        self.status_logger.info(
+            f"Throughput {total_tps:>7.1f} tok/s (in {input_tps:.1f}, out {output_tps:.1f}) | "
+            f"Reqs {running} run, {queued} wait, {paused_req_num} pause | "
+            f"KV Cache {kv_pct:.1f}% (active {kv_pct_no_cache:.1f}%) | "
+            f"Prefix Hit {cache_hit_rate:.1f}%"
+            f"{mtp_suffix}"
+        )
+
+        # Reset windowed counters
+        self.prompt_tokens = 0
+        self.output_tokens = 0
+        self.last_print_time = now


 class RouterStatics:
```
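
A sketch of how the reporter is driven, with the MTP arithmetic worked through (hypothetical numbers; `SimpleNamespace` stands in for the real `StartArgs`, and the lightllm package must be importable):

```python
from types import SimpleNamespace
from lightllm.server.router.stats import SystemStatusReporter

args = SimpleNamespace(disable_log_stats=False, log_stats_interval=10)
reporter = SystemStatusReporter(args, max_total_token_num=200_000, dp_size_in_node=1)

reporter.count_prompt_tokens(4096)   # on batch admission
reporter.count_output_tokens(32)     # once per decode step

# On release of a request that read 3072 of its 4096 prompt tokens from the
# prefix cache and generated 1200 tokens, 200 of them MTP-accepted:
reporter.on_request_completed(input_len=4096, output_len=1200, cache_len=3072, mtp_accepted=200)

# Prefix hit rate: 3072 / 4096 = 75.0%
# MTP: decode_steps = 1200 - 200 = 1000, avg accepted length = 1200 / 1000 = 1.20
```

The MTP figure treats each decode step as one base token plus any accepted draft tokens, so `output_len - mtp_accepted` recovers the step count and the ratio is tokens emitted per step.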

lightllm/utils/log_utils.py

Lines changed: 16 additions & 1 deletion
```diff
@@ -10,7 +10,9 @@
 _FORMAT = "%(levelname)s %(asctime)s [%(filename)s:%(lineno)d] %(message)s"
 _DATE_FORMAT = "%m-%d %H:%M:%S"

-_LOG_LEVEL = os.environ.get("LIGHTLLM_LOG_LEVEL", "debug")
+_STATUS_FORMAT = "%(levelname)s [%(asctime)s] %(message)s"
+
+_LOG_LEVEL = os.environ.get("LIGHTLLM_LOG_LEVEL", "info")
 _LOG_LEVEL = getattr(logging, _LOG_LEVEL.upper(), 0)
 _LOG_DIR = os.environ.get("LIGHTLLM_LOG_DIR", None)

@@ -96,6 +98,19 @@ def init_logger(name: str):
     return logger


+def init_system_status_logger(name: str):
+    logger = logging.getLogger(f"lightllm.status.{name}")
+    if not logger.handlers:
+        logger.setLevel(logging.INFO)
+        handler = logging.StreamHandler(sys.stdout)
+        handler.flush = sys.stdout.flush
+        fmt = logging.Formatter(_STATUS_FORMAT, datefmt=_DATE_FORMAT)
+        handler.setFormatter(fmt)
+        logger.addHandler(handler)
+        logger.propagate = False
+    return logger
+
+
 _log_time_mark_dict = {}
```
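
A quick way to see what the new status logger emits; a minimal sketch, assuming the lightllm package is on the path (the message text is just a stand-in for what `SystemStatusReporter` produces):

```python
from lightllm.utils.log_utils import init_system_status_logger

status_logger = init_system_status_logger("router")
status_logger.info("Throughput   123.4 tok/s (in 100.0, out 23.4) | Reqs 8 run, 0 wait, 0 pause")
# Per _STATUS_FORMAT, this renders as:
# INFO [09-14 12:00:00] Throughput   123.4 tok/s (in 100.0, out 23.4) | Reqs 8 run, 0 wait, 0 pause
```

Because `propagate` is False and the handler writes straight to stdout, these status lines bypass whatever handlers the regular `init_logger` configures.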
