|
1 | 1 | """ |
2 | | -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. |
| 2 | +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. |
3 | 3 | # |
4 | | -# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
5 | 5 | # you may not use this file except in compliance with the License. |
6 | 6 | # You may obtain a copy of the License at |
7 | 7 | # |
|
20 | 20 |
|
21 | 21 | from fastdeploy.utils import llm_logger |
22 | 22 |
|
# PROMETHEUS_MULTIPROC_DIR as recorded by setup_multiprocess_prometheus(),
# i.e. before any per-DP "dp{i}" suffix is appended. Stays None until that
# function has run in this process.
_original_prom_dir = None


def get_original_prom_dir():
    """Return the PROMETHEUS_MULTIPROC_DIR before any dp suffix was appended.

    Returns:
        The directory path recorded by ``setup_multiprocess_prometheus()``,
        or ``None`` if that function has not been called yet.
    """
    return _original_prom_dir
| 29 | + |
23 | 30 |
|
def setup_multiprocess_prometheus():
    """Prepare the Prometheus multiprocess directory and remember its path.

    If ``PROMETHEUS_MULTIPROC_DIR`` is not set, a fresh
    ``/tmp/prom_main_<uuid>`` directory is created and exported through
    ``os.environ``. If the variable is already set, that directory is used
    as-is (it is created when missing, but its contents are not cleaned).

    In both cases the chosen path is stored in the module-level
    ``_original_prom_dir`` so it can be read back later.

    Returns:
        The path of the Prometheus multiprocess directory.
    """
    global _original_prom_dir

    if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
        prom_dir = f"/tmp/prom_main_{uuid.uuid4()}"
        # A freshly generated UUID path is not expected to exist; the removal
        # is purely defensive so the directory always starts out empty.
        if os.path.exists(prom_dir):
            shutil.rmtree(prom_dir, ignore_errors=True)
        os.makedirs(prom_dir, exist_ok=True)
        os.environ["PROMETHEUS_MULTIPROC_DIR"] = prom_dir
        _original_prom_dir = prom_dir
        llm_logger.info(f"PROMETHEUS_MULTIPROC_DIR is set to be {prom_dir}")
        return prom_dir

    # The variable was already set: keep the directory and whatever it
    # contains, only making sure it exists.
    user_dir = os.environ["PROMETHEUS_MULTIPROC_DIR"]
    _original_prom_dir = user_dir
    os.makedirs(user_dir, exist_ok=True)
    llm_logger.info(f"PROMETHEUS_MULTIPROC_DIR is set to {user_dir}")
    return user_dir
| 50 | + |
| 51 | + |
def setup_dp_prometheus_dir(dp_id, base_dir, env_dict=None):
    """Set up an isolated PROMETHEUS_MULTIPROC_DIR subdirectory for a DP rank.

    For DP0: moves existing .db files from base_dir into dp0/ and updates env.
    mmap writes remain valid after rename on the same filesystem.
    For DP1+: creates dp{i}/ subdirectory and updates env. Fork triggers PID
    change → prometheus_client reset → new .db files in the subdirectory.

    Args:
        dp_id: Data parallel rank id.
        base_dir: Original PROMETHEUS_MULTIPROC_DIR (before any dp suffix).
        env_dict: If provided, write to this dict instead of os.environ.

    Returns:
        None. The resulting path is published through ``env_dict`` or
        ``os.environ`` under the ``PROMETHEUS_MULTIPROC_DIR`` key.
    """
    prom_dir_dp = os.path.join(base_dir, f"dp{dp_id}")
    # makedirs also creates base_dir when it is missing, so base_dir is
    # guaranteed to exist for the listing below.
    os.makedirs(prom_dir_dp, exist_ok=True)

    if dp_id == 0:
        # Only regular *.db files directly inside base_dir are moved; the
        # dp{i}/ subdirectories themselves are skipped by the isfile check.
        for fname in os.listdir(base_dir):
            src = os.path.join(base_dir, fname)
            if fname.endswith(".db") and os.path.isfile(src):
                os.rename(src, os.path.join(prom_dir_dp, fname))
                llm_logger.info(f"Moved {src} -> {prom_dir_dp}")

    target = env_dict if env_dict is not None else os.environ
    target["PROMETHEUS_MULTIPROC_DIR"] = prom_dir_dp
    llm_logger.info(f"Set PROMETHEUS_MULTIPROC_DIR for DP {dp_id}: {prom_dir_dp}")
0 commit comments