Skip to content

Commit fe64ad9

Browse files
committed
[BugFix] Separate prometheus multiproc dir for single-server multi-dp services
1 parent cbb0811 commit fe64ad9

2 files changed

Lines changed: 15 additions & 0 deletions

File tree

fastdeploy/engine/common_engine.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
)
6666
from fastdeploy.inter_communicator.fmq import FMQ
6767
from fastdeploy.metrics.metrics import main_process_metrics
68+
from fastdeploy.metrics.prometheus_multiprocess_setup import setup_dp_prometheus_dir
6869
from fastdeploy.model_executor.guided_decoding import schema_checker
6970
from fastdeploy.plugins.token_processor import load_token_processor_plugins
7071
from fastdeploy.spec_decode import SpecMethod
@@ -139,6 +140,10 @@ def __init__(self, cfg: FDConfig, start_queue=True, use_async_llm=False):
139140
self.cfg = cfg
140141
self.use_async_llm = use_async_llm
141142

143+
# Give each DP rank its own PROMETHEUS_MULTIPROC_DIR so that Counter/Histogram metrics from different DP ranks are not mixed together
144+
if not envs.FD_ENABLE_MULTI_API_SERVER and self.cfg.parallel_config.data_parallel_size > 1:
145+
setup_dp_prometheus_dir(self.cfg.parallel_config.local_data_parallel_id)
146+
142147
if self.cfg.parallel_config.data_parallel_size > 1:
143148
self.llm_logger = get_logger(
144149
"fastdeploy", f"fastdeploy_dprank{self.cfg.parallel_config.local_data_parallel_id}.log"

fastdeploy/metrics/prometheus_multiprocess_setup.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,13 @@ def setup_multiprocess_prometheus():
4444
"will properly handle cleanup."
4545
)
4646
return os.environ["PROMETHEUS_MULTIPROC_DIR"]
47+
48+
49+
def setup_dp_prometheus_dir(dp_id):
    """Redirect PROMETHEUS_MULTIPROC_DIR to a directory dedicated to one DP rank.

    The current value of ``PROMETHEUS_MULTIPROC_DIR`` is taken as the base path.
    A sibling directory named ``<base>_dp<dp_id>`` is created if needed and then
    exported through ``os.environ`` in place of the base path.

    Args:
        dp_id: Identifier of the data-parallel rank; it is appended to the base
            directory name as the ``_dp<dp_id>`` suffix.

    Returns:
        The per-DP directory path that was exported, or ``None`` when
        ``PROMETHEUS_MULTIPROC_DIR`` is unset or empty (nothing is changed in
        that case).

    Raises:
        OSError: If the per-DP directory cannot be created, e.g. because a
            regular file already occupies that path.
    """
    prom_dir = os.environ.get("PROMETHEUS_MULTIPROC_DIR")
    if not prom_dir:
        # Nothing to derive a per-DP directory from; leave the environment as is.
        return None

    # Drop trailing separators first: for "/tmp/prom/" basename() would be empty
    # and the result would be the nested "/tmp/prom/_dp0" instead of the sibling
    # "/tmp/prom_dp0". Fall back to the raw value when it is only separators.
    base_dir = prom_dir.rstrip(os.sep) or prom_dir
    prom_dir_dp = os.path.join(
        os.path.dirname(base_dir),
        f"{os.path.basename(base_dir)}_dp{dp_id}",
    )

    # exist_ok=True already tolerates an existing directory, so no separate
    # existence check is needed; a non-directory at this path raises instead of
    # being exported silently.
    os.makedirs(prom_dir_dp, exist_ok=True)
    os.environ["PROMETHEUS_MULTIPROC_DIR"] = prom_dir_dp
    llm_logger.info(f"Set PROMETHEUS_MULTIPROC_DIR for DP {dp_id}: {prom_dir_dp}")
    return prom_dir_dp

0 commit comments

Comments
 (0)