Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions deepmd/pd/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def get_trainer(
# Initialize DDP
world_size = dist.get_world_size()
if world_size > 1:
assert paddle.version.nccl() != "0"
assert not paddle.core.is_compiled_with_nccl() or paddle.version.nccl() != "0"
fleet.init(is_collective=True)

def prepare_trainer_input_single(
Expand Down Expand Up @@ -214,7 +214,7 @@ def get_compute_device(self) -> str:

def get_ngpus(self) -> int:
"""Get the number of GPUs."""
return paddle.device.cuda.device_count()
return paddle.device.device_count()

def get_backend_info(self) -> dict:
"""Get backend information."""
Expand Down
4 changes: 2 additions & 2 deletions deepmd/pd/utils/auto_batch_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def is_gpu_available(self) -> bool:
bool
True if GPU is available
"""
return paddle.device.cuda.device_count() > 0
return paddle.device.device_count() > 0

def is_oom_error(self, e: Exception) -> bool:
"""Check if the exception is an OOM error.
Expand All @@ -51,6 +51,6 @@ def is_oom_error(self, e: Exception) -> bool:
# (the meaningless error message should be considered as a bug in cusolver)
if isinstance(e, MemoryError) and ("ResourceExhaustedError" in e.args[0]):
# Release all unoccupied cached memory
paddle.device.cuda.empty_cache()
paddle.device.empty_cache()
return True
return False
4 changes: 2 additions & 2 deletions deepmd/pd/utils/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@
# Make sure DDP uses correct device if applicable
LOCAL_RANK = int(os.environ.get("PADDLE_LOCAL_RANK", 0))

if os.environ.get("DEVICE") == "cpu" or paddle.device.cuda.device_count() <= 0:
if os.environ.get("DEVICE") == "cpu" or paddle.device.device_count() <= 0:
DEVICE = "cpu"
else:
DEVICE = f"gpu:{LOCAL_RANK}"
DEVICE = paddle.device.get_device()
Comment thread
HydrogenSulfate marked this conversation as resolved.

paddle.device.set_device(DEVICE)

Expand Down
15 changes: 14 additions & 1 deletion deepmd/pd/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
annotations,
)

import warnings
from contextlib import (
contextmanager,
)
Expand Down Expand Up @@ -345,8 +346,20 @@ def get_generator(
generator = paddle.framework.core.default_cuda_generator(
int(DEVICE.split("gpu:")[1])
)
elif DEVICE == "xpu":
generator = paddle.framework.core.default_xpu_generator(0)
elif DEVICE.startswith("xpu:"):
generator = paddle.framework.core.default_xpu_generator(
int(DEVICE.split("xpu:")[1])
)
else:
raise ValueError("DEVICE should be cpu or gpu or gpu:x")
# Return None for compatibility across different devices
Comment thread
HydrogenSulfate marked this conversation as resolved.
warnings.warn(
f"DEVICE is {DEVICE}, which is not supported. Returning None.",
category=UserWarning,
)
return None
# raise ValueError("DEVICE should be cpu or gpu or gpu:x or xpu or xpu:x")
Comment thread
HydrogenSulfate marked this conversation as resolved.
generator.manual_seed(seed)
return generator
else:
Expand Down
2 changes: 1 addition & 1 deletion source/tests/pd/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
@pytest.fixture(scope="package", autouse=True)
def clear_cuda_memory(request):
yield
paddle.device.cuda.empty_cache()
paddle.device.empty_cache()
Loading