@@ -493,24 +493,77 @@ def parse_product_arch():
493493 return None
494494
495495
def _resolve_uuids_to_indices(uuids):
    """
    Map GPU UUID/unique-ID strings to integer device indices.

    Each known vendor tool is queried in turn and its listing is parsed
    into a ``{identifier: index}`` table. The first tool whose table
    contains *every* requested identifier wins.

    :param uuids: iterable of identifier strings, spelled exactly as the
        vendor tool prints them (lookup is an exact string match).
    :returns: tuple of integer device indices in the same order as
        ``uuids``, or ``None`` if no tool is available or no single tool
        knows all of the identifiers.
    """
    # Materialise once: the identifiers are checked against each tool in
    # turn, which would exhaust a one-shot iterator after the first pass.
    wanted = tuple(uuids)

    # (command, pattern) where group(1)=index, group(2)=uuid
    # nvidia-smi -L output: "GPU 0: <name> (UUID: GPU-xxxx-...)"
    # rocm-smi --showuniqueid output: "GPU[0] : Unique ID: 0x<hex>"
    queries = [
        (['nvidia-smi', '-L'],
         r'GPU\s+(\d+):.*\(UUID:\s*([\w-]+)\)'),
        (['rocm-smi', '--showuniqueid'],
         r'GPU\[(\d+)\].*Unique ID:\s*([\w]+)'),
    ]
    for cmd, pattern in queries:
        try:
            # The context manager closes the pipe and reaps the child, so
            # no zombie process or open descriptor is left behind.
            with Popen(cmd, stdout=PIPE, stderr=DEVNULL) as proc:
                out, _ = proc.communicate()
        except OSError:
            # Command not available
            continue

        # Tool output is only pattern-matched; never fail on odd bytes.
        raw = out.decode(errors='replace')

        uuid_to_index = {m.group(2): int(m.group(1))
                         for line in raw.splitlines()
                         if (m := re.match(pattern, line))}
        if not uuid_to_index:
            # Tool ran but listed no devices (or failed); try the next one.
            continue

        try:
            return tuple(uuid_to_index[u] for u in wanted)
        except KeyError:
            # At least one identifier is unknown to this tool.
            continue

    return None
527+
528+
def get_visible_devices():
    """
    Determine which GPU devices are exposed through the environment.

    The variables ``CUDA_VISIBLE_DEVICES``, ``ROCR_VISIBLE_DEVICES`` and
    ``HIP_VISIBLE_DEVICES`` are inspected in that order; the first one
    that is set decides the result.

    :returns: ``(variable_name, indices)`` where ``indices`` is a tuple of
        integer device indices, or ``(None, None)`` if none of the
        variables is set.
    :raises RuntimeError: if the variable is set but exposes no devices
        (empty value, ``NoDevFiles``, or a leading negative index), or if
        its entries are neither integers nor identifiers that
        ``_resolve_uuids_to_indices`` can resolve.
    """
    device_vars = (
        'CUDA_VISIBLE_DEVICES',
        'ROCR_VISIBLE_DEVICES',
        'HIP_VISIBLE_DEVICES'
    )
    for v in device_vars:
        raw = os.environ.get(v)
        if raw is None:
            # Environment variable not set
            continue

        val = raw.strip()

        errmsg = f"{v}={raw!r} exposes no GPU devices."

        # Empty string or known "no devices" sentinels
        if not val or val.upper() in ('NODEVFILES',):
            raise RuntimeError(errmsg)

        entries = [e.strip() for e in val.split(',')]

        # Try integer parsing first
        try:
            ids = tuple(int(i) for i in entries)
        except ValueError:
            ids = None

        if ids is not None:
            # An invalid (negative) index terminates the list: only the
            # indices preceding it are visible, e.g. "0,2,-1,1" -> (0, 2).
            cut = next((k for k, i in enumerate(ids) if i < 0), len(ids))
            ids = ids[:cut]
            if not ids:
                # e.g. "-1" or "-1,0": nothing precedes the sentinel
                raise RuntimeError(errmsg)

            return v, ids

        # Try UUID -> device index resolution
        ids = _resolve_uuids_to_indices(entries)
        if ids is not None:
            return v, ids

        raise RuntimeError(
            f"Cannot resolve device specifiers in {v}={raw!r}."
        )

    return None, None
515568
516569
0 commit comments