Skip to content

Commit e5baf69

Browse files
authored
feat: add device name display (for example: A100 not just cuda) (#5146)
feat: add device name display (for example: A100 not just cuda) (#5146) <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Automatic GPU device detection added for PyTorch, TensorFlow, and Paddle; detected GPU names are now shown in system diagnostics when available. * Detection is conditional and safe: if no GPU is present or detection fails, diagnostics simply omit the device name without causing errors. <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 parent 567c5ba commit e5baf69

4 files changed

Lines changed: 51 additions & 0 deletions

File tree

deepmd/pd/entrypoints/main.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,20 @@ def get_backend_info(self) -> dict:
224224
**op_info,
225225
}
226226

227+
def get_device_name(self) -> str | None:
    """Report the name of the first CUDA GPU visible to Paddle.

    Returns
    -------
    str or None
        The GPU device name when Paddle was built with CUDA support and
        at least one GPU is present; otherwise None.
    """
    # Guard clause: a CPU-only Paddle build can never expose a GPU name.
    if not paddle.device.is_compiled_with_cuda():
        return None
    cuda_mod = paddle.device.cuda
    # Compiled-with-CUDA does not guarantee a physical device is attached.
    return cuda_mod.get_device_name() if cuda_mod.device_count() > 0 else None
240+
227241

228242
def train(
229243
input_file: str,

deepmd/pt/entrypoints/main.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,18 @@ def get_backend_info(self) -> dict:
252252
**op_info,
253253
}
254254

255+
def get_device_name(self) -> str | None:
256+
"""Use PyTorch's current device name as the device identifier.
257+
258+
Returns
259+
-------
260+
str or None
261+
The device name if available, otherwise None.
262+
"""
263+
if torch.cuda.is_available():
264+
return torch.cuda.get_device_name(torch.cuda.current_device())
265+
return None
266+
255267

256268
def train(
257269
input_file: str,

deepmd/tf/train/run_options.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,24 @@ def get_backend_info(self) -> dict:
7373
"build with TF lib": GLOBAL_CONFIG["tf_libs"].replace(";", "\n"),
7474
}
7575

76+
def get_device_name(self) -> str | None:
    """Look up the name of the first visible GPU through TensorFlow.

    Returns
    -------
    str or None
        The device name (e.g., NVIDIA A100) if available, otherwise None.
    """
    try:
        visible_gpus = tf.config.get_visible_devices("GPU")
        if not visible_gpus:
            return None
        details = tf.config.experimental.get_device_details(visible_gpus[0])
        # "device_name" may be absent from the details dict; .get keeps that safe.
        return details.get("device_name")
    except (AttributeError, RuntimeError):
        # Experimental API may not exist or may fail in some TF versions;
        # treat detection failure as "no name available" rather than erroring.
        return None
93+
7694

7795
class RunOptions:
7896
"""Class with info on how to run training (cluster, MPI and GPU config).

deepmd/utils/summary.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ def __call__(self) -> None:
7474
"computing device": self.get_compute_device(),
7575
}
7676
)
77+
device_name = self.get_device_name()
78+
if device_name:
79+
build_info["Device Name"] = device_name
7780
if self.is_built_with_cuda():
7881
env_value = os.environ.get("CUDA_VISIBLE_DEVICES", "unset")
7982
build_info["CUDA_VISIBLE_DEVICES"] = env_value
@@ -123,6 +126,10 @@ def get_compute_device(self) -> str:
123126
def get_ngpus(self) -> int:
124127
"""Get the number of GPUs."""
125128

129+
@abstractmethod
130+
def get_device_name(self) -> str | None:
131+
"""Get the device name (e.g., NVIDIA A800-SXM4-80GB) if available."""
132+
126133
def get_backend_info(self) -> dict:
127134
"""Get backend information."""
128135
return {}

0 commit comments

Comments
 (0)