|
1 | 1 | import os |
| 2 | +import socket |
2 | 3 | import sys |
3 | 4 | import threading |
4 | 5 | from functools import reduce |
|
19 | 20 | from .logger import get_logger |
20 | 21 |
|
21 | 22 |
|
22 | | -HF_PATCH_MODULES_CACHE_PREFIX = "modules_pid_" |
| 23 | +HF_PATCH_MODULES_CACHE_PREFIX = "modules_cache" |
23 | 24 |
|
24 | 25 | logger = get_logger() |
25 | 26 | XTUNER_DETERMINISTIC = os.getenv("XTUNER_DETERMINISTIC") == "true" |
@@ -134,18 +135,21 @@ def is_hf_model_path(path: str | Path) -> tuple[bool, Exception | None]: |
134 | 135 |
|
135 | 136 |
|
136 | 137 | def monkey_patch_hf_modules_cache(): |
137 | | - # 如果在hf中tokenizer、config等使用remote_code,例如 `AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)`, |
138 | | - # 会将hf_model_path 拷贝到 HF_MODULES_CACHE 中。 如果单机八卡机器上多个进程同时读写此目录,会导致冲突。 |
139 | | - # 因此需要将 HF_MODULES_CACHE 设置为当前进程的临时目录。 |
140 | | - modules_cache = os.path.join(constants.HF_HOME, f"{HF_PATCH_MODULES_CACHE_PREFIX}{os.getpid()}") |
| 138 | + # When remote code is used in HF for the tokenizer, config, etc., e.g., `AutoConfig.from_pretrained(hf_model_path, |
| 139 | + # trust_remote_code=True)`, the hf_model_path is copied into HF_MODULES_CACHE. If multiple processes read and write |
| 140 | + # this directory simultaneously, they conflict. Therefore, we set HF_MODULES_CACHE to a directory under HF_HOME |
| 141 | + # that is unique to the current process (identified by hostname and pid). |
| 142 | + hostname = socket.gethostname() |
| 143 | + pid = os.getpid() |
| 144 | + modules_cache = os.path.join(constants.HF_HOME, f"{HF_PATCH_MODULES_CACHE_PREFIX}_{hostname}_{pid}") |
141 | 145 | os.environ["HF_MODULES_CACHE"] = modules_cache |
142 | 146 | transformers.utils.hub.HF_MODULES_CACHE = modules_cache |
143 | | - # 在 import 时刻,Python 会在 dynamic_module_utils 模块的命名空间中创建一个新的名字 HF_MODULES_CACHE, |
144 | | - # 并将其绑定到 transformers.utils.HF_MODULES_CACHE 当时所指向的对象。 |
145 | | - # 因此,需要将 transformers.dynamic_module_utils.HF_MODULES_CACHE 也设置为新的 modules_cache。 |
| 147 | + # At import time, Python creates a new name HF_MODULES_CACHE in the dynamic_module_utils module's namespace, |
| 148 | + # binding it to the object that transformers.utils.HF_MODULES_CACHE pointed to at that moment. |
| 149 | + # Therefore, we need to set transformers.dynamic_module_utils.HF_MODULES_CACHE to the new modules_cache as well. |
146 | 150 | transformers.dynamic_module_utils.HF_MODULES_CACHE = modules_cache |
147 | 151 | transformers.utils.HF_MODULES_CACHE = modules_cache |
148 | | - logger.info(f"set HF_MODULES_CACHE to {modules_cache} for current process {os.getpid()}") |
| 152 | + logger.info(f"set HF_MODULES_CACHE to {modules_cache} for current process (hostname={hostname}, pid={pid})") |
149 | 153 |
|
150 | 154 |
|
151 | 155 | class FunctionEnum(StrEnum): |
|
0 commit comments