Skip to content

Commit 98d6328

Browse files
committed
arch: Refactor UUID processing and enhance tests
1 parent 7223f18 commit 98d6328

2 files changed

Lines changed: 39 additions & 35 deletions

File tree

devito/arch/archinfo.py

Lines changed: 19 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
from packaging.version import InvalidVersion, parse
1717

1818
from devito.logger import warning
19-
from devito.tools import all_equal, as_tuple, memoized_func
19+
from devito.tools import all_equal, as_tuple, frozendict, memoized_func
2020

2121
__all__ = [ # noqa: RUF022
2222
'platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_visible_devices',
@@ -493,37 +493,31 @@ def parse_product_arch():
493493
return None
494494

495495

496-
def _resolve_uuids_to_indices(uuids):
496+
@memoized_func
497+
def _get_uuid_to_index_map():
497498
"""
498-
Map GPU UUID/unique-ID strings to integer device indices.
499+
Build a frozen mapping from GPU UUID/unique-ID strings to integer device indices.
499500
"""
500501
# (command, pattern) where group(1)=index, group(2)=uuid
501502
# nvidia-smi -L output: "GPU 0: <name> (UUID: GPU-xxxx-...)"
502503
# rocm-smi --showuniqueid output: "GPU[0] : Unique ID: 0x<hex>"
503504
queries = [
504-
(['nvidia-smi', '-L'], r'GPU\s+(\d+):.*\(UUID:\s*([\w-]+)\)'),
505+
(['nvidia-smi', '-L'], r'GPU\s+(\d+):.*\(UUID:\s*([\w-]+)\)'),
505506
(['rocm-smi', '--showuniqueid'], r'GPU\[(\d+)\].*Unique ID:\s*([\w]+)'),
506507
]
508+
mapper = {}
507509
for cmd, pattern in queries:
508510
try:
509511
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
510512
raw = proc.stdout.read().decode()
511513
except OSError:
512-
# Command not available
513514
continue
514515

515-
uuid_to_index = {m.group(2): int(m.group(1))
516-
for line in raw.splitlines()
517-
if (m := re.match(pattern, line))}
518-
if not uuid_to_index:
519-
continue
516+
for line in raw.splitlines():
517+
if m := re.match(pattern, line):
518+
mapper[m.group(2)] = int(m.group(1))
520519

521-
try:
522-
return tuple(uuid_to_index[u] for u in uuids)
523-
except KeyError:
524-
continue
525-
526-
return None
520+
return frozendict(mapper)
527521

528522

529523
def get_visible_devices():
@@ -556,13 +550,17 @@ def get_visible_devices():
556550
return v, ids
557551

558552
# Try UUID → device index resolution
559-
ids = _resolve_uuids_to_indices(entries)
560-
if ids is not None:
553+
mapper = _get_uuid_to_index_map()
554+
try:
555+
ids = tuple(mapper[u] for u in entries)
561556
return v, ids
557+
except KeyError:
558+
pass
562559

563-
raise RuntimeError(
564-
f"Cannot resolve device specifiers in {v}={os.environ[v]!r}."
565-
)
560+
warning("Unresolvable visible devices environment variables encountered:"
561+
f" {v}={os.environ[v]} ignored.")
562+
563+
return None, None
566564

567565
return None, None
568566

tests/test_gpu_common.py

Lines changed: 20 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -128,26 +128,32 @@ def test_no_visible_devices(self, env_variables):
128128
_ = argmap._physical_deviceid
129129

130130
def test_visible_devices_uuid(self):
131-
# Query GPU 0's UUID independently of _resolve_uuids_to_indices
132-
try:
133-
proc = Popen(['nvidia-smi', '-L'], stdout=PIPE, stderr=DEVNULL)
134-
output = proc.stdout.read().decode()
135-
except OSError:
136-
pytest.skip("nvidia-smi not available")
137-
138-
uuid = None
139-
for line in output.splitlines():
140-
m = re.match(r'GPU\s+0:.*\(UUID:\s*([\w-]+)\)', line)
141-
if m:
142-
uuid = m.group(1)
131+
# Query GPU 0's UUID independently of _get_uuid_to_index_map
132+
probes = [
133+
(['nvidia-smi', '-L'], r'GPU\s+0:.*\(UUID:\s*([\w-]+)\)', 'CUDA_VISIBLE_DEVICES'),
134+
(['rocm-smi', '--showuniqueid'], r'GPU\[0\].*Unique ID:\s*([\w]+)', 'ROCR_VISIBLE_DEVICES'),
135+
]
136+
uuid = env_var = None
137+
for cmd, pattern, var in probes:
138+
try:
139+
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
140+
output = proc.stdout.read().decode()
141+
except OSError:
142+
continue
143+
for line in output.splitlines():
144+
m = re.match(pattern, line)
145+
if m:
146+
uuid, env_var = m.group(1), var
147+
break
148+
if uuid is not None:
143149
break
144150

145151
if uuid is None:
146-
pytest.skip("No GPU 0 UUID found in nvidia-smi output")
152+
pytest.skip("No GPU 0 UUID found via nvidia-smi or rocm-smi")
147153

148154
grid = Grid(shape=(10, 10))
149155
u = Function(name='u', grid=grid)
150-
with switchenv({'CUDA_VISIBLE_DEVICES': uuid}):
156+
with switchenv({env_var: uuid}):
151157
op = Operator(Eq(u, u+1))
152158
argmap = op.arguments()
153159
assert argmap._physical_deviceid == 0

0 commit comments

Comments
 (0)