|
13 | 13 | # See the License for the specific language governing permissions and |
14 | 14 | # limitations under the License. |
15 | 15 |
|
16 | | -import math |
17 | 16 | import os |
18 | 17 | import subprocess |
19 | 18 | import time |
20 | 19 | from importlib import resources |
21 | 20 | from typing import Any, Callable |
22 | | -from urllib.parse import urlparse |
23 | 21 |
|
24 | 22 | import ray |
25 | 23 | import torch |
|
32 | 30 | from transfer_queue.metadata import KVBatchMeta |
33 | 31 | from transfer_queue.sampler import * # noqa: F401 |
34 | 32 | from transfer_queue.sampler import BaseSampler |
35 | | -from transfer_queue.storage.simple_storage import SimpleStorageUnit |
36 | | -from transfer_queue.utils.common import get_placement_group |
| 33 | +from transfer_queue.storage.bootstrap import StorageBootstrapProvider |
37 | 34 | from transfer_queue.utils.logging_utils import get_logger |
38 | | -from transfer_queue.utils.yuanrong_utils import ( |
39 | | - cleanup_yuanrong_resources, |
40 | | - initialize_yuanrong_backend, |
41 | | -) |
| 35 | +from transfer_queue.utils.yuanrong_utils import cleanup_yuanrong_resources |
42 | 36 | from transfer_queue.utils.zmq_utils import process_zmq_server_info |
43 | 37 |
|
44 | 38 | logger = get_logger(__name__) |
@@ -70,125 +64,23 @@ def _maybe_create_tq_client(conf: DictConfig | None = None) -> TransferQueueClie |
70 | 64 | return _TQ_CLIENT |
71 | 65 |
|
72 | 66 |
|
73 | | -# TODO(hz): Adopt registry pattern to manage storage backends for better scalability. |
74 | 67 | def _maybe_create_tq_storage(conf: DictConfig) -> DictConfig: |
75 | 68 | global _TQ_STORAGE |
76 | 69 |
|
77 | 70 | if _TQ_STORAGE is None: |
78 | 71 | _TQ_STORAGE = {} |
79 | | - if conf.backend.storage_backend == "SimpleStorage": |
80 | | - # initialize SimpleStorageUnit |
81 | | - simple_storage_handles = {} |
82 | | - num_data_storage_units = conf.backend.SimpleStorage.num_data_storage_units |
83 | | - total_storage_size = conf.backend.SimpleStorage.total_storage_size |
84 | | - storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) |
85 | | - |
86 | | - for storage_unit_rank in range(num_data_storage_units): |
87 | | - storage_node = SimpleStorageUnit.options( # type: ignore[attr-defined] |
88 | | - placement_group=storage_placement_group, |
89 | | - placement_group_bundle_index=storage_unit_rank, |
90 | | - name=f"TransferQueueStorageUnit#{storage_unit_rank}", |
91 | | - ).remote( |
92 | | - storage_unit_size=math.ceil(total_storage_size / num_data_storage_units), |
93 | | - ) |
94 | | - simple_storage_handles[f"TransferQueueStorageUnit#{storage_unit_rank}"] = storage_node |
95 | | - logger.info(f"TransferQueueStorageUnit#{storage_unit_rank} has been created.") |
96 | | - |
97 | | - storage_zmq_info = process_zmq_server_info(simple_storage_handles) |
98 | | - backend_name = conf.backend.storage_backend |
99 | | - conf.backend[backend_name].zmq_info = storage_zmq_info |
100 | | - _TQ_STORAGE["SimpleStorage"] = simple_storage_handles |
101 | | - if conf.backend.storage_backend == "MooncakeStore": |
102 | | - if conf.backend.MooncakeStore.auto_init: |
103 | | - # Try to kill existing mooncake_master processes before starting a new one to avoid potential conflicts |
104 | | - check = subprocess.run(["pgrep", "-f", "mooncake_master"], stdout=subprocess.PIPE, text=True) |
105 | | - if check.returncode == 0: |
106 | | - pids = check.stdout.strip().replace("\n", ", ") |
107 | | - logger.info(f"Find existing mooncake_master (PID: {pids}), try to kill first...") |
108 | | - |
109 | | - result = os.system('pkill -f "[m]ooncake_master"') |
110 | | - if result == 0: |
111 | | - logger.info("Successfully killed existing mooncake_master processes.") |
112 | | - else: |
113 | | - raise RuntimeError(f"Failed to kill existing mooncake_master processes (exit code: {result}).") |
114 | | - |
115 | | - # process metadata_server |
116 | | - metadata_server_raw_address = conf.backend.MooncakeStore.metadata_server |
117 | | - if "://" not in metadata_server_raw_address: |
118 | | - metadata_server_raw_address = "//" + metadata_server_raw_address |
119 | | - |
120 | | - metadata_server_parsed = urlparse(metadata_server_raw_address) |
121 | | - |
122 | | - if not metadata_server_parsed.hostname or metadata_server_parsed.port is None: |
123 | | - raise ValueError( |
124 | | - f"Invalid metadata_server '{conf.backend.MooncakeStore.metadata_server}'. " |
125 | | - f"Host and port are required (e.g., host:port)." |
126 | | - ) |
127 | | - |
128 | | - metadata_server_host = metadata_server_parsed.hostname |
129 | | - metadata_server_port = str(metadata_server_parsed.port) |
130 | | - |
131 | | - # process master_server |
132 | | - master_server_raw_address = conf.backend.MooncakeStore.master_server_address |
133 | | - if "://" not in master_server_raw_address: |
134 | | - master_server_raw_address = "//" + master_server_raw_address |
135 | | - |
136 | | - master_server_parsed = urlparse(master_server_raw_address) |
137 | | - |
138 | | - if not master_server_parsed.hostname or master_server_parsed.port is None: |
139 | | - raise ValueError( |
140 | | - f"Invalid master_server_address '{conf.backend.MooncakeStore.master_server_address}'. " |
141 | | - f"Host and port are required (e.g., host:port)." |
142 | | - ) |
143 | | - |
144 | | - master_server_port = str(master_server_parsed.port) |
145 | | - |
146 | | - cmd = [ |
147 | | - "mooncake_master", |
148 | | - "-client_ttl=30", |
149 | | - "-default_kv_lease_ttl=999999", |
150 | | - "-default_kv_soft_pin_ttl=999999", |
151 | | - "--eviction_high_watermark_ratio=1.0", |
152 | | - "--eviction_ratio=0.0", |
153 | | - "--enable_http_metadata_server=true", |
154 | | - "--allow_evict_soft_pinned_objects=false", |
155 | | - f"--http_metadata_server_host={metadata_server_host}", |
156 | | - f"--http_metadata_server_port={metadata_server_port}", |
157 | | - f"--rpc_port={master_server_port}", |
158 | | - ] |
159 | | - |
160 | | - log_file_path = "/tmp/mooncake_master.log" |
161 | | - with open(log_file_path, "w") as log_file: |
162 | | - process = subprocess.Popen( |
163 | | - cmd, |
164 | | - stdout=log_file, |
165 | | - stderr=subprocess.STDOUT, |
166 | | - text=True, |
167 | | - bufsize=1, |
168 | | - universal_newlines=True, |
169 | | - start_new_session=True, |
170 | | - ) |
171 | | - time.sleep(3) |
172 | | - |
173 | | - if process.poll() is None: |
174 | | - logger.info( |
175 | | - f"mooncake_master started, PID: {process.pid}. Logs are at: {os.path.abspath(log_file_path)}" |
176 | | - ) |
177 | | - else: |
178 | | - error_msg = "" |
179 | | - try: |
180 | | - with open(log_file_path) as f: |
181 | | - error_msg = f.read() |
182 | | - except Exception as e: |
183 | | - error_msg = f"Failed to read log file: {e}" |
184 | | - |
185 | | - raise RuntimeError( |
186 | | - f"mooncake_master exited with error. Check {log_file_path} for detailed logs. " |
187 | | - f"Output:\n{error_msg}" |
188 | | - ) |
189 | | - _TQ_STORAGE["MooncakeStore"] = process |
190 | | - if conf.backend.storage_backend == "Yuanrong" and conf.backend.Yuanrong.auto_init: |
191 | | - _TQ_STORAGE["Yuanrong"] = initialize_yuanrong_backend(conf) |
| 72 | + backend_name = conf.backend.storage_backend |
| 73 | + provider_fn = StorageBootstrapProvider.get_provider(backend_name) |
| 74 | + if provider_fn is not None: |
| 75 | + backend_resources = provider_fn(conf) |
| 76 | + if backend_resources is not None: |
| 77 | + _TQ_STORAGE[backend_name] = backend_resources |
| 78 | + else: |
| 79 | + logger.error(f"Not found available {backend_name} storage resources, please check the config.") |
| 80 | + else: |
| 81 | + logger.error( |
| 82 | + f"Storage backend {backend_name} not registered. Please add it to the StorageBootstrapProvider." |
| 83 | + ) |
192 | 84 | return conf |
193 | 85 |
|
194 | 86 |
|
|
0 commit comments