Skip to content

Commit 714483a

Browse files
authored
[Fix] Fix hf cache monkey patch failure when HF_CACHE set to shared storage (#1673)
[Fix] Fix `monkey_patch_hf_modules_cache` failure when HF_CACHE set to shared storage
1 parent b8854c0 commit 714483a

1 file changed

Lines changed: 13 additions & 9 deletions

File tree

xtuner/v1/utils/misc.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import socket
23
import sys
34
import threading
45
from functools import reduce
@@ -19,7 +20,7 @@
1920
from .logger import get_logger
2021

2122

22-
HF_PATCH_MODULES_CACHE_PREFIX = "modules_pid_"
23+
HF_PATCH_MODULES_CACHE_PREFIX = "modules_cache"
2324

2425
logger = get_logger()
2526
XTUNER_DETERMINISTIC = os.getenv("XTUNER_DETERMINISTIC") == "true"
@@ -134,18 +135,21 @@ def is_hf_model_path(path: str | Path) -> tuple[bool, Exception | None]:
134135

135136

136137
def monkey_patch_hf_modules_cache():
137-
# 如果在hf中tokenizer、config等使用remote_code,例如 `AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)`,
138-
# 会将hf_model_path 拷贝到 HF_MODULES_CACHE 中。 如果单机八卡机器上多个进程同时读写此目录,会导致冲突。
139-
# 因此需要将 HF_MODULES_CACHE 设置为当前进程的临时目录。
140-
modules_cache = os.path.join(constants.HF_HOME, f"{HF_PATCH_MODULES_CACHE_PREFIX}{os.getpid()}")
138+
# When using remote_code in HF for tokenizer, config, etc., e.g., `AutoConfig.from_pretrained(hf_model_path,
139+
# trust_remote_code=True)`, the hf_model_path will be copied to HF_MODULES_CACHE. If multiple processes read/write
140+
# this directory simultaneously, it will cause conflicts. Therefore, we need to set HF_MODULES_CACHE to a unique
141+
# temporary directory for the current process, identified by hostname + pid (a pid alone can collide across nodes when HF_HOME is on shared storage).
142+
hostname = socket.gethostname()
143+
pid = os.getpid()
144+
modules_cache = os.path.join(constants.HF_HOME, f"{HF_PATCH_MODULES_CACHE_PREFIX}_{hostname}_{pid}")
141145
os.environ["HF_MODULES_CACHE"] = modules_cache
142146
transformers.utils.hub.HF_MODULES_CACHE = modules_cache
143-
# import 时刻,Python 会在 dynamic_module_utils 模块的命名空间中创建一个新的名字 HF_MODULES_CACHE
144-
# 并将其绑定到 transformers.utils.HF_MODULES_CACHE 当时所指向的对象。
145-
# 因此,需要将 transformers.dynamic_module_utils.HF_MODULES_CACHE 也设置为新的 modules_cache
147+
# At import time, Python creates a new name HF_MODULES_CACHE in the dynamic_module_utils module's namespace,
148+
# binding it to the object that transformers.utils.HF_MODULES_CACHE pointed to at that moment.
149+
# Therefore, we need to set transformers.dynamic_module_utils.HF_MODULES_CACHE to the new modules_cache as well.
146150
transformers.dynamic_module_utils.HF_MODULES_CACHE = modules_cache
147151
transformers.utils.HF_MODULES_CACHE = modules_cache
148-
logger.info(f"set HF_MODULES_CACHE to {modules_cache} for current process {os.getpid()}")
152+
logger.info(f"set HF_MODULES_CACHE to {modules_cache} for current process (hostname={hostname}, pid={pid})")
149153

150154

151155
class FunctionEnum(StrEnum):

0 commit comments

Comments
 (0)