@@ -318,6 +318,27 @@ def launch_server(
318318 asyncio .run (server (host , port , sockets = [s ]))
319319
320320
async def _grpc_iteration_stats_loop(llm, metrics_collector) -> None:
    """Periodically poll the engine for iteration statistics and export them.

    Drains ``llm.get_stats_async`` roughly once per second, keeps only the
    most recent snapshot from each drain, and hands it to the Prometheus
    metrics collector (KV cache utilization, hit rate, etc.).

    This is the gRPC-mode counterpart of OpenAIServer's
    _iteration_stats_collector_loop; it runs as a standalone background task
    because there is no HTTP framework hosting periodic work in gRPC mode.
    """
    while True:
        try:
            newest = None
            # Drain everything currently queued; only the last snapshot
            # matters for gauge-style metrics.
            async for snapshot in llm.get_stats_async(timeout=0.5):
                newest = snapshot
            if newest is not None:
                metrics_collector.log_iteration_stats(newest)
        except asyncio.CancelledError:
            # Propagate cancellation so the owning task shuts down cleanly.
            raise
        except Exception as e:
            # Best-effort: stats collection must never take down the server.
            logger.debug(f"Iteration stats collection error: {e}")
        await asyncio.sleep(1.0)
340+
341+
321342def launch_grpc_server (host : str ,
322343 port : int ,
323344 llm_args : dict ,
@@ -342,9 +363,11 @@ def launch_grpc_server(host: str,
342363 except ImportError :
343364 REFLECTION_AVAILABLE = False
344365
366+ from tensorrt_llm ._utils import set_prometheus_multiproc_dir
345367 from tensorrt_llm .grpc import trtllm_service_pb2 , trtllm_service_pb2_grpc
346368 from tensorrt_llm .grpc .grpc_request_manager import GrpcRequestManager
347369 from tensorrt_llm .grpc .grpc_servicer import TrtllmServiceServicer
370+ from tensorrt_llm .metrics .collector import MetricsCollector
348371
349372 async def serve_grpc_async ():
350373 logger .info ("Initializing TensorRT-LLM gRPC server..." )
@@ -369,8 +392,17 @@ async def serve_grpc_async():
369392
370393 logger .info ("Model loaded successfully" )
371394
372- # Create request manager
373- request_manager = GrpcRequestManager (llm )
395+ # Initialize prometheus metrics for gRPC mode
396+ set_prometheus_multiproc_dir ()
397+ metrics_collector = MetricsCollector ({
398+ "model_name" : str (model_path ),
399+ "engine_type" : "grpc" ,
400+ })
401+ logger .info ("Prometheus metrics collector initialized for gRPC mode" )
402+
403+ # Create request manager with metrics support
404+ request_manager = GrpcRequestManager (
405+ llm , metrics_collector = metrics_collector )
374406
375407 # Create servicer
376408 servicer = TrtllmServiceServicer (request_manager , model_path = model_path )
@@ -409,6 +441,12 @@ async def serve_grpc_async():
409441 logger .info (f"TensorRT-LLM gRPC server started on { address } " )
410442 logger .info ("Server is ready to accept requests" )
411443
444+ # Start background iteration stats collector (KV cache metrics, etc.)
445+ iteration_stats_task = asyncio .create_task (
446+ _grpc_iteration_stats_loop (llm , metrics_collector ))
447+ logger .info (
448+ "Started background iteration stats collector for gRPC mode" )
449+
412450 # Handle shutdown signals
413451 loop = asyncio .get_running_loop ()
414452 stop_event = asyncio .Event ()
@@ -426,6 +464,11 @@ def signal_handler():
426464 except KeyboardInterrupt :
427465 logger .info ("Interrupted by user" )
428466 finally :
467+ iteration_stats_task .cancel ()
468+ try :
469+ await iteration_stats_task
470+ except asyncio .CancelledError :
471+ pass
429472 logger .info ("Shutting down TensorRT-LLM gRPC server..." )
430473
431474 # Stop gRPC server
0 commit comments