@@ -66,7 +66,7 @@ def collect(self):
6666 Metric: Prometheus Metric objects that are not excluded.
6767 """
6868 for metric in self .base_registry .collect ():
69- if not any (name .startswith (metric . name ) for name in self .exclude_names ):
69+ if not any (metric . name .startswith (name ) for name in self .exclude_names ):
7070 yield metric
7171
7272
@@ -84,11 +84,15 @@ def get_filtered_metrics() -> str:
8484 multiprocess .MultiProcessCollector (base_registry )
8585
8686 filtered_registry = CollectorRegistry ()
87- # 注册一个新的colletor,过滤gauge指标
88- filtered_registry .register (SimpleCollector (base_registry , EXCLUDE_LABELS ))
87+ # 动态获取需要排除的 gauge 指标列表
88+ exclude_labels = main_process_metrics .get_excluded_metrics ()
89+ # 注册一个新的collector,过滤gauge指标
90+ filtered_registry .register (SimpleCollector (base_registry , exclude_labels ))
8991
9092 # 将gauge指标重新注册到filtered_registry中,从内存中读取
9193 main_process_metrics .re_register_gauge (filtered_registry )
94+ # 将speculative中的gauge指标也重新注册
95+ main_process_metrics .re_register_speculative_gauge (filtered_registry )
9296
9397 return generate_latest (filtered_registry ).decode ("utf-8" )
9498
@@ -196,7 +200,7 @@ class MetricsManager:
196200 "type" : Gauge ,
197201 "name" : "fastdeploy:num_requests_running" ,
198202 "description" : "Number of requests currently running" ,
199- "kwargs" : {"multiprocess_mode" : "sum" },
203+ "kwargs" : {},
200204 },
201205 "num_requests_waiting" : {
202206 "type" : Gauge ,
@@ -626,19 +630,22 @@ def __init__(self):
626630 # 在模块加载,指标注册先设置Prometheus环境变量
627631 setup_multiprocess_prometheus ()
628632
629- # 动态创建所有指标
633+ # 动态创建所有非 gauge 型指标
630634 for metric_name , config in self .METRICS .items ():
631635 setattr (
632636 self ,
633637 metric_name ,
634638 config ["type" ](config ["name" ], config ["description" ], ** config ["kwargs" ]),
635639 )
636- # 动态创建所有指标
640+ # 动态创建所有 gauge 型指标,统一配置 multiprocess_mode 为 livesum
637641 for metric_name , config in self .GAUGE_METRICS .items ():
642+ kwargs = config ["kwargs" ].copy ()
643+ if "multiprocess_mode" not in kwargs :
644+ kwargs ["multiprocess_mode" ] = "livesum"
638645 setattr (
639646 self ,
640647 metric_name ,
641- config ["type" ](config ["name" ], config ["description" ], ** config [ " kwargs" ] ),
648+ config ["type" ](config ["name" ], config ["description" ], ** kwargs ),
642649 )
643650 # 动态创建server metrics
644651 for metric_name , config in self .SERVER_METRICS .items ():
@@ -696,17 +703,22 @@ def _init_speculative_metrics(self, speculative_method, num_speculative_tokens):
696703 Gauge (
697704 f"{ config ['name' ]} _{ i } " ,
698705 f"{ config ['description' ]} (head { i } )" ,
706+ multiprocess_mode = "livesum" ,
699707 )
700708 )
701709 setattr (self , metric_name , gauges )
702710 else :
711+ # For Gauge metrics, automatically add multiprocess_mode="livesum"
712+ kwargs = config ["kwargs" ].copy ()
713+ if config ["type" ] == Gauge and "multiprocess_mode" not in kwargs :
714+ kwargs ["multiprocess_mode" ] = "livesum"
703715 setattr (
704716 self ,
705717 metric_name ,
706718 config ["type" ](
707719 config ["name" ],
708720 config ["description" ],
709- ** config [ " kwargs" ] ,
721+ ** kwargs ,
710722 ),
711723 )
712724
@@ -767,6 +779,19 @@ def register_speculative_metrics(self, registry: CollectorRegistry):
767779 else :
768780 registry .register (getattr (self , metric_name ))
769781
def re_register_speculative_gauge(self, registry: CollectorRegistry):
    """Register this instance's speculative-decoding gauges with ``registry``.

    Only entries of ``self.SPECULATIVE_METRICS`` are considered. The
    ``spec_decode_draft_single_head_acceptance_rate`` entry is stored on the
    instance as an iterable of gauges, so each element is registered
    individually; any other entry is registered only when its configured
    ``"type"`` is ``Gauge``.

    Args:
        registry: The collector registry that receives the gauges.

    Returns:
        None. The call is a no-op when the instance has no
        ``spec_decode_draft_acceptance_rate`` attribute, which is used here
        as the marker that speculative metrics were initialized in this
        process.
    """
    if not hasattr(self, "spec_decode_draft_acceptance_rate"):
        # Speculative metrics were never created on this instance.
        return

    single_head_key = "spec_decode_draft_single_head_acceptance_rate"
    for attr_name, spec in self.SPECULATIVE_METRICS.items():
        if attr_name == single_head_key:
            # One gauge per head: the attribute holds a collection of gauges.
            collectors = getattr(self, attr_name)
        elif spec["type"] == Gauge:
            collectors = (getattr(self, attr_name),)
        else:
            # Non-gauge speculative metrics are not re-registered here.
            continue
        for collector in collectors:
            registry.register(collector)
794+
770795 def re_register_gauge (self , registry : CollectorRegistry ):
771796 """Re-register gauge to the specified registry"""
772797 for metric_name in self .GAUGE_METRICS :
@@ -790,16 +815,19 @@ def register_all(self, registry: CollectorRegistry):
790815 if hasattr (main_process_metrics , "spec_decode_draft_acceptance_rate" ):
791816 self .register_speculative_metrics (registry )
792817
def get_excluded_metrics(self) -> Set[str]:
    """Return the metric names that should be excluded.

    The result contains the ``"name"`` of every entry in
    ``self.GAUGE_METRICS``. When the instance exposes
    ``SPECULATIVE_METRICS``, the names of its entries whose ``"type"`` is
    ``Gauge`` or ``list[Gauge]`` are included as well.

    Returns:
        A new set of metric name strings.
    """
    names = set()
    for gauge_cfg in self.GAUGE_METRICS.values():
        names.add(gauge_cfg["name"])

    # Speculative metrics contribute only their gauge-typed entries.
    if hasattr(self, "SPECULATIVE_METRICS"):
        names.update(
            spec_cfg["name"]
            for spec_cfg in self.SPECULATIVE_METRICS.values()
            if spec_cfg["type"] in (Gauge, list[Gauge])
        )
    return names
797827
798828
799829main_process_metrics = MetricsManager ()
800830
801831# 由于zmq指标记录比较耗时,默认不开启,通过DEBUG参数开启
802832if envs .FD_DEBUG :
803833 main_process_metrics .init_zmq_metrics ()
804-
805- EXCLUDE_LABELS = MetricsManager .get_excluded_metrics ()
0 commit comments