Skip to content

Commit afd335b

Browse files
ConnorLi96
authored and committed
feat: add Prometheus metrics collection for gRPC server mode
gRPC mode previously had no Prometheus metrics instrumentation, unlike the OpenAI-compatible HTTP server. This adds a MetricsCollector to the gRPC launch path and a background iteration-stats loop that mirrors the HTTP server's _iteration_stats_collector_loop, exposing KV-cache utilization, hit rate, and per-request latency/throughput metrics.
1 parent 64b5c79 commit afd335b

2 files changed

Lines changed: 52 additions & 3 deletions

File tree

tensorrt_llm/commands/serve.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,27 @@ def launch_server(
318318
asyncio.run(server(host, port, sockets=[s]))
319319

320320

321+
async def _grpc_iteration_stats_loop(llm, metrics_collector) -> None:
322+
"""Background task that periodically collects engine iteration stats
323+
(KV cache utilization, hit rate, etc.) and logs them to Prometheus.
324+
325+
Mirrors the _iteration_stats_collector_loop in OpenAIServer but runs
326+
independently since there is no HTTP framework in gRPC mode.
327+
"""
328+
while True:
329+
try:
330+
latest_stat = None
331+
async for stat in llm.get_stats_async(timeout=0.5):
332+
latest_stat = stat
333+
if latest_stat is not None:
334+
metrics_collector.log_iteration_stats(latest_stat)
335+
except asyncio.CancelledError:
336+
raise
337+
except Exception as e:
338+
logger.debug(f"Iteration stats collection error: {e}")
339+
await asyncio.sleep(1.0)
340+
341+
321342
def launch_grpc_server(host: str,
322343
port: int,
323344
llm_args: dict,
@@ -342,9 +363,11 @@ def launch_grpc_server(host: str,
342363
except ImportError:
343364
REFLECTION_AVAILABLE = False
344365

366+
from tensorrt_llm._utils import set_prometheus_multiproc_dir
345367
from tensorrt_llm.grpc import trtllm_service_pb2, trtllm_service_pb2_grpc
346368
from tensorrt_llm.grpc.grpc_request_manager import GrpcRequestManager
347369
from tensorrt_llm.grpc.grpc_servicer import TrtllmServiceServicer
370+
from tensorrt_llm.metrics.collector import MetricsCollector
348371

349372
async def serve_grpc_async():
350373
logger.info("Initializing TensorRT-LLM gRPC server...")
@@ -369,8 +392,17 @@ async def serve_grpc_async():
369392

370393
logger.info("Model loaded successfully")
371394

372-
# Create request manager
373-
request_manager = GrpcRequestManager(llm)
395+
# Initialize prometheus metrics for gRPC mode
396+
set_prometheus_multiproc_dir()
397+
metrics_collector = MetricsCollector({
398+
"model_name": str(model_path),
399+
"engine_type": "grpc",
400+
})
401+
logger.info("Prometheus metrics collector initialized for gRPC mode")
402+
403+
# Create request manager with metrics support
404+
request_manager = GrpcRequestManager(
405+
llm, metrics_collector=metrics_collector)
374406

375407
# Create servicer
376408
servicer = TrtllmServiceServicer(request_manager, model_path=model_path)
@@ -409,6 +441,12 @@ async def serve_grpc_async():
409441
logger.info(f"TensorRT-LLM gRPC server started on {address}")
410442
logger.info("Server is ready to accept requests")
411443

444+
# Start background iteration stats collector (KV cache metrics, etc.)
445+
iteration_stats_task = asyncio.create_task(
446+
_grpc_iteration_stats_loop(llm, metrics_collector))
447+
logger.info(
448+
"Started background iteration stats collector for gRPC mode")
449+
412450
# Handle shutdown signals
413451
loop = asyncio.get_running_loop()
414452
stop_event = asyncio.Event()
@@ -426,6 +464,11 @@ def signal_handler():
426464
except KeyboardInterrupt:
427465
logger.info("Interrupted by user")
428466
finally:
467+
iteration_stats_task.cancel()
468+
try:
469+
await iteration_stats_task
470+
except asyncio.CancelledError:
471+
pass
429472
logger.info("Shutting down TensorRT-LLM gRPC server...")
430473

431474
# Stop gRPC server

tensorrt_llm/grpc/grpc_request_manager.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,18 +47,21 @@ class GrpcRequestManager:
4747
- Submit requests to LLM.generate_async()
4848
- Stream token IDs (not text) back to gRPC clients
4949
- Handle abort/cancel operations
50+
- Record per-request prometheus metrics when metrics_collector is provided
5051
5152
This is modeled after vLLM's GrpcRequestManager but adapted for TensorRT-LLM's
5253
GenerationResult async iterator pattern.
5354
"""
5455

55-
def __init__(self, llm: Any):
56+
def __init__(self, llm: Any, metrics_collector=None):
5657
"""Initialize the request manager.
5758
5859
Args:
5960
llm: The TensorRT-LLM LLM instance (tensorrt_llm.LLM or tensorrt_llm._tensorrt_engine.LLM)
61+
metrics_collector: Optional MetricsCollector for prometheus metrics
6062
"""
6163
self.llm = llm
64+
self._metrics_collector = metrics_collector
6265
# Track active requests: request_id -> GenerationResult
6366
self._rid_to_result: Dict[str, GenerationResult] = {}
6467

@@ -119,6 +122,9 @@ async def generate(
119122
yield result
120123

121124
if result.finished:
125+
if self._metrics_collector and result.metrics_dict:
126+
self._metrics_collector.log_request_metrics_dict(
127+
result.metrics_dict)
122128
break
123129

124130
except asyncio.CancelledError:

0 commit comments

Comments
 (0)