Skip to content

Commit 7223f18

Browse files
committed
arch: Enhance visible devices handling to permit UUIDs
1 parent 68e2f74 commit 7223f18

2 files changed

Lines changed: 107 additions & 9 deletions

File tree

devito/arch/archinfo.py

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -493,24 +493,77 @@ def parse_product_arch():
493493
return None
494494

495495

496+
def _resolve_uuids_to_indices(uuids):
    """
    Map GPU UUID/unique-ID strings to integer device indices.

    The vendor tools ``nvidia-smi`` and ``rocm-smi`` are queried in turn. The
    first tool whose listing contains *every* requested identifier decides
    the result.

    Parameters
    ----------
    uuids : iterable of str
        Device identifiers, spelled exactly as the vendor tool prints them.

    Returns
    -------
    tuple of int or None
        The device indices, in the same order as ``uuids``. None if no tool
        could be run, or if no tool lists all of the identifiers.
    """
    # Materialise once: the identifiers are looked up again for each tool,
    # which would silently exhaust a one-shot iterable after the first try
    uuids = tuple(uuids)

    # (command, pattern) where group(1)=index, group(2)=uuid
    # nvidia-smi -L output: "GPU 0: <name> (UUID: GPU-xxxx-...)"
    # rocm-smi --showuniqueid output: "GPU[0] : Unique ID: 0x<hex>"
    queries = [
        (['nvidia-smi', '-L'], r'GPU\s+(\d+):.*\(UUID:\s*([\w-]+)\)'),
        (['rocm-smi', '--showuniqueid'], r'GPU\[(\d+)\].*Unique ID:\s*([\w]+)'),
    ]
    for cmd, pattern in queries:
        try:
            # The context manager closes the pipe and waits for the child, so
            # neither a file descriptor nor a zombie process is left behind
            with Popen(cmd, stdout=PIPE, stderr=DEVNULL) as proc:
                raw = proc.communicate()[0].decode(errors='replace')
        except OSError:
            # Command not available
            continue

        uuid_to_index = {m.group(2): int(m.group(1))
                         for line in raw.splitlines()
                         if (m := re.match(pattern, line))}
        if not uuid_to_index:
            # The tool ran but listed no devices in the expected format
            continue

        try:
            return tuple(uuid_to_index[u] for u in uuids)
        except KeyError:
            # At least one identifier is unknown to this tool; try the next
            continue

    return None
527+
528+
496529
def get_visible_devices():
    """
    Return the environment variable that restricts the visible GPUs, together
    with the device indices it exposes.

    ``CUDA_VISIBLE_DEVICES``, ``ROCR_VISIBLE_DEVICES`` and
    ``HIP_VISIBLE_DEVICES`` are inspected in this order; the first one that is
    set decides the outcome.

    Returns
    -------
    tuple
        ``(name, ids)``, where ``name`` is the environment variable and
        ``ids`` a tuple of integer device indices; ``(None, None)`` if none
        of the variables is set.

    Raises
    ------
    RuntimeError
        If the variable exposes no devices (empty value, ``NoDevFiles``, or a
        negative index in first position), or if its entries can be
        interpreted neither as integers nor as device UUIDs.
    """
    device_vars = (
        'CUDA_VISIBLE_DEVICES',
        'ROCR_VISIBLE_DEVICES',
        'HIP_VISIBLE_DEVICES'
    )
    for v in device_vars:
        try:
            raw = os.environ[v]
        except KeyError:
            # Environment variable not set
            continue

        val = raw.strip()

        errmsg = f"{v}={raw!r} exposes no GPU devices."

        # Empty string or known "no devices" sentinels
        if not val or val.upper() in ('NODEVFILES',):
            raise RuntimeError(errmsg)

        entries = [e.strip() for e in val.split(',')]

        # Try integer parsing first
        try:
            ids = tuple(int(i) for i in entries)
        except ValueError:
            ids = None

        if ids is not None:
            # A negative index (e.g. -1) is invalid and terminates the list:
            # only the devices listed before it are exposed, so "0,2,-1,1"
            # yields (0, 2) while "-1" or "-1,0" expose nothing at all
            cutoff = next((n for n, i in enumerate(ids) if i < 0), len(ids))
            ids = ids[:cutoff]
            if not ids:
                raise RuntimeError(errmsg)

            return v, ids

        # Try UUID → device index resolution
        ids = _resolve_uuids_to_indices(entries)
        if ids is not None:
            return v, ids

        raise RuntimeError(
            f"Cannot resolve device specifiers in {v}={raw!r}."
        )

    return None, None
515568

516569

tests/test_gpu_common.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import re
2+
from subprocess import DEVNULL, PIPE, Popen
3+
14
import cloudpickle as pickle
25
import numpy as np
36
import pytest
@@ -107,6 +110,48 @@ def test_visible_devices(self, env_variables):
107110
# Default physical deviceid expected to be 0
108111
assert argmap2._physical_deviceid == 0
109112

113+
@pytest.mark.parametrize('env_variables', [
    {"CUDA_VISIBLE_DEVICES": "-1"},
    {"CUDA_VISIBLE_DEVICES": ""},
    {"CUDA_VISIBLE_DEVICES": "NoDevFiles"},
    {"ROCR_VISIBLE_DEVICES": "-1"},
])
def test_no_visible_devices(self, env_variables):
    """
    Reading ``_physical_deviceid`` must raise a RuntimeError when the
    environment exposes no devices.
    """
    f = Function(name='u', grid=Grid(shape=(10, 10)))

    # Operator and arguments are built inside the patched environment; only
    # the attribute access itself is expected to raise
    with switchenv(env_variables):
        arguments = Operator(Eq(f, f+1)).arguments()
        with pytest.raises(RuntimeError):
            _ = arguments._physical_deviceid
129+
130+
def test_visible_devices_uuid(self):
    """
    Setting CUDA_VISIBLE_DEVICES to the UUID of GPU 0 must resolve to
    physical device 0. Skipped when nvidia-smi is missing or lists no GPU 0.
    """
    # Query GPU 0's UUID independently of _resolve_uuids_to_indices
    try:
        # The context manager closes the pipe and waits for the child, so the
        # test leaves no open file descriptor or zombie process behind
        with Popen(['nvidia-smi', '-L'], stdout=PIPE, stderr=DEVNULL) as proc:
            output = proc.communicate()[0].decode()
    except OSError:
        pytest.skip("nvidia-smi not available")

    uuid = None
    for line in output.splitlines():
        m = re.match(r'GPU\s+0:.*\(UUID:\s*([\w-]+)\)', line)
        if m:
            uuid = m.group(1)
            break

    if uuid is None:
        pytest.skip("No GPU 0 UUID found in nvidia-smi output")

    grid = Grid(shape=(10, 10))
    u = Function(name='u', grid=grid)
    with switchenv({'CUDA_VISIBLE_DEVICES': uuid}):
        op = Operator(Eq(u, u+1))
        argmap = op.arguments()
        assert argmap._physical_deviceid == 0
154+
110155
@pytest.mark.parallel(mode=2)
111156
@pytest.mark.parametrize('visible_devices', [
112157
"1,2", "1,0", "0,2,3",

0 commit comments

Comments
 (0)