Skip to content

Commit d99468f

Browse files
committed
arch: Refactor get_gpu_info
1 parent 89b3acb commit d99468f

2 files changed

Lines changed: 443 additions & 94 deletions

File tree

devito/arch/archinfo.py

Lines changed: 78 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import os
66
import re
77
import sys
8-
from collections import defaultdict
98
from contextlib import suppress
109
from functools import cached_property
1110
from pathlib import Path
@@ -20,14 +19,17 @@
2019
from devito.tools import all_equal, as_tuple, memoized_func
2120
from devito.warnings import warn
2221

22+
from .commands import lscpu, lshw, lspci, nvidia_smi, proc_cpuinfo, rocm_smi, sycl_ls
23+
24+
2325
__all__ = [ # noqa: RUF022
2426
'platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_visible_devices',
2527
'get_nvidia_cc', 'get_cuda_path', 'get_cuda_version', 'get_hip_path',
2628
'check_cuda_runtime', 'load_cudart', 'get_m1_llvm_path', 'get_advisor_path',
2729
'Platform', 'Cpu64', 'Intel64', 'IntelSkylake', 'Amd', 'Arm', 'Power',
2830
'Device',
2931
'NvidiaDevice', 'AmdDevice', 'IntelDevice',
30-
'old_get_cpu_info',
32+
'old_get_cpu_info', 'old_get_gpu_info',
3133
# Brand-agnostic
3234
'ANYCPU', 'ANYGPU',
3335
# Intel CPUs
@@ -171,97 +173,6 @@ def get_cpu_brand():
171173
cpu_info['physical'] = physical
172174
return cpu_info
173175

174-
def text2dict(text):
    """Parse ``key: value`` lines of `text` into a `dict`.

    Each line is split at its first colon; the key and the value are stripped
    of surrounding whitespace. Lines that contain no colon at all (blank
    lines, banners, ...) are skipped rather than raising. If a key occurs
    more than once, the last occurrence wins.

    Parameters
    ----------
    text : str
        Multi-line text such as the output of `lscpu` or one stanza of
        `/proc/cpuinfo`.

    Returns
    -------
    dict
        Mapping from stripped key to stripped value, both `str`.
    """
    parsed = {}
    for line in text.splitlines():
        key, sep, value = line.partition(':')
        if not sep:
            # No colon on this line, so there is no key/value pair to record
            continue
        parsed[key.strip()] = value.strip()
    return parsed
179-
180-
def cast2numeric(adict):
    """Convert the values of `adict` to numbers where possible, in place.

    Falsy values (e.g. the empty string) are replaced by ``None``. Every
    other value is tried as an `int` first and then as a `float`; the first
    conversion that succeeds replaces the value. Values for which both
    conversions raise ``ValueError`` are left untouched.

    The same `dict` object that was passed in is returned.
    """
    for key in adict:
        raw = adict[key]
        if not raw:
            adict[key] = None
            continue
        for convert in (int, float):
            try:
                adict[key] = convert(raw)
            except ValueError:
                # Not representable with this type, try the next one
                continue
            break
    return adict
191-
192-
def set2range(aset):
    """Collapse a singleton set to its only element.

    A set holding exactly one item is reduced to that item (which is removed
    from `aset` in the process). Any other set, including an empty one, is
    handed back unchanged.
    """
    return aset.pop() if len(aset) == 1 else aset
198-
199-
@memoized_func
def proc_cpuinfo():
    """Create a `dict` containing the information in `/proc/cpuinfo`.

    The file is split into blank-line-separated stanzas. Each stanza is
    parsed with `text2dict` and `cast2numeric`, and the values seen for each
    key across all stanzas are merged with `set2range`: a key whose value is
    the same in every stanza maps to that single value, otherwise it maps to
    the set of distinct values.

    The ``'cpu MHz'`` entry is dropped, and a ``'command'`` entry records how
    the data was obtained (or that the file was not found, in which case a
    warning is also emitted and no other entries are present).
    """
    # Obtain CPU info as text
    try:
        with open('/proc/cpuinfo') as f:
            lines = f.read()
        command = 'cat /proc/cpuinfo'
    except FileNotFoundError:
        lines = ''
        command = '`/proc/cpuinfo` not found'
        warn(f'File {command}')

    # One parsed dict per blank-line-separated stanza
    info = [
        cast2numeric(text2dict(hwt))
        for hwt in lines.strip().split('\n\n')
    ]

    # Gather every distinct value observed for each key
    variations = defaultdict(set)
    for hwt in info:
        for k, v in hwt.items():
            variations[k].add(v)

    final = {k: set2range(v) for k, v in variations.items()}

    # `cpu MHz` is a "live" value so ignore it
    final.pop('cpu MHz', None)

    final['command'] = command

    return final
237-
238-
@memoized_func
def lscpu():
    """Create a `dict` containing the information from `lscpu`.

    `lscpu -J` is tried first; each entry of the ``'lscpu'`` list in its JSON
    output contributes one item, keyed by the entry's ``'field'`` (with
    trailing colons removed) and valued by its ``'data'``. If that fails for
    any reason, plain `lscpu` is run and parsed with `text2dict`. If that
    fails too, a warning is emitted.

    The result always has a ``'command'`` entry naming the command that
    produced it (or stating that `lscpu` was not found), and is passed
    through `cast2numeric` before being returned.
    """
    # Use `lscpu -J` if available (not available prior to v2.30, circa 2017)
    try:
        ret = run(['lscpu', '-J'], capture_output=True, text=True)
        info = {
            x['field'].rstrip(':'): x['data']
            for x in json.loads(ret.stdout)['lscpu']
        }
        info['command'] = 'lscpu -J'
    except Exception:
        # Deliberately best-effort: any failure falls back to plain `lscpu`
        info = None

    # Use `lscpu` if `-J` argument is not available
    if info is None:
        try:
            ret = run(['lscpu'], capture_output=True, text=True)
            info = text2dict(ret.stdout)
            info['command'] = 'lscpu'
        except Exception:
            msg = '`lscpu` not found'
            warn(f'Command {msg}')
            info = {'command': msg}

    return cast2numeric(info)
265176

266177
@memoized_func
267178
def get_cpu_info():
@@ -330,10 +241,19 @@ def get_cpu_info():
330241
cpu_info[procinfo.pop('command')] = procinfo
331242
cpu_info[lscpuinfo.pop('command')] = lscpuinfo
332243

244+
# This might actually be a bad idea...
245+
# ~ # Like gpu_info attach callbacks for memory status
246+
# ~ # NOTE: from the psutil docs:
247+
# ~ # - The sum of used and available does not necessarily equal total.
248+
# ~ # - free doesn’t reflect the actual memory available (use available instead)
249+
# ~ cpu_info['mem.free'] = lambda: psutil.virtual_memory().available
250+
# ~ cpu_info['mem.used'] = lambda: psutil.virtual_memory().used
251+
# ~ cpu_info['mem.total'] = psutil.virtual_memory().total
333252
return cpu_info
334253

254+
335255
@memoized_func
336-
def get_gpu_info():
256+
def old_get_gpu_info():
337257
"""Attempt GPU info autodetection."""
338258

339259
# Filter out virtual GPUs from a list of GPU dictionaries
@@ -657,6 +577,70 @@ def parse_product_arch():
657577
return None
658578

659579

580+
def homogenise_gpus(gpu_infos):
    """Collapse a list of per-card GPU descriptions into a single dict.

    The cards are compared with their ``'physicalid'`` entry ignored, since
    physical IDs may differ between otherwise identical cards.

    Parameters
    ----------
    gpu_infos : list of dict
        One dictionary per detected card. The dictionaries are not modified.

    Returns
    -------
    dict
        - ``{}`` if `gpu_infos` is empty (or ``None``);
        - the common description (without ``'physicalid'``) plus an
          ``'ncards'`` entry holding the number of cards, if all cards are
          equal;
        - ``{'ncards': <number of cards>}`` only, after emitting a warning,
          if the cards differ.
    """
    if not gpu_infos:
        return {}

    # Compare copies stripped of the physical ID; copying leaves the caller's
    # dictionaries intact
    cards = [
        {k: v for k, v in gpu_info.items() if k != 'physicalid'}
        for gpu_info in gpu_infos
    ]
    ncards = len(cards)

    if not all_equal(cards):
        warning('Different models of graphics cards detected')
        return {'ncards': ncards}

    homogeneous = cards[0]
    homogeneous['ncards'] = ncards
    return homogeneous
601+
602+
603+
@memoized_func
def get_gpu_info():
    """Attempt GPU info autodetection.

    Probe for GPU information in the following order, stopping at the first
    probe that reports at least one card:
    1. `nvidia-smi`, nvidia cards only
    2. `rocm-smi`, AMD cards only
    3. `sycl-ls`, Intel cards only
    4. `lshw`
    5. `lspci`, more readable but less detailed than `lshw`

    A probe that raises ``OSError`` is skipped (presumably the underlying
    tool is unavailable -- confirm against `.commands`).

    Returns
    -------
    dict
        The output of `homogenise_gpus` for the first successful probe:

        - ``{}`` if every probe raised ``OSError`` or reported no card;
        - ``{'ncards': n}`` if cards of different models were detected;
        - otherwise the description shared by all cards plus ``'ncards'``.
          The ``'physicalid'`` entry is removed by `homogenise_gpus`; the
          remaining keys are expected to include ``'product'``, ``'vendor'``
          and ``'architecture'`` (fallback 'unspecified') -- these come from
          the probes in `.commands`, TODO confirm.
    """
    gpu_info = {}
    for call in (nvidia_smi, rocm_smi, sycl_ls, lshw, lspci):
        try:
            gpu_info = homogenise_gpus(call())
        except OSError:
            # This probe could not be run, move on to the next one
            continue
        if gpu_info:
            # Something was detected, no need to consult the remaining probes
            break

    # TODO: attach per-vendor ('NVIDIA', 'AMD', 'INTEL') callbacks to retrieve
    # instantaneous memory info. Unsure whether this is used or even used to
    # work! When adding them, look the vendor up with `gpu_info.get('vendor')`
    # since the key is absent when no GPU, or a mix of models, was detected.

    return gpu_info
642+
643+
660644
def get_visible_devices():
661645
device_vars = (
662646
'CUDA_VISIBLE_DEVICES',

0 commit comments

Comments
 (0)