@@ -24,26 +24,42 @@ def __init__(
2424 use_locked_clocks = False
2525 ):
2626 """Create object to control device using NVML."""
27+ # We set these first as __del__ checks these
28+ # and this __init__ may raise exceptions midway
29+ self .pwr_limit_default = None
30+ self .modified_clocks = False
31+
2732 pynvml .nvmlInit ()
2833
29- if sum (x is not None for x in [device_id , device_uuid , device_pci_bus ]) != 1 :
30- raise ValueError ("invalid device: specify either the index, the UUID, or the PCI-bus" )
31- elif device_id is not None :
34+ if device_id is not None :
3235 self .dev = pynvml .nvmlDeviceGetHandleByIndex (device_id )
3336 elif device_uuid is not None :
3437 self .dev = pynvml .nvmlDeviceGetHandleByUUID (device_uuid )
3538 elif device_pci_bus is not None :
36- self .dev = pynvml .nvmlDeviceGetHandleByPciBusId_v2 (device_pci_bus )
39+ self .dev = pynvml .nvmlDeviceGetHandleByPciBusId (device_pci_bus )
3740
3841 self .id = pynvml .nvmlDeviceGetIndex (self .dev )
42+ self .uuid = pynvml .nvmlDeviceGetUUID (self .dev )
43+ self .pci_bus = pynvml .nvmlDeviceGetPciInfo_v3 (self .dev ).busId
3944 self .nvidia_smi = nvidia_smi_fallback or "nvidia-smi"
4045
46+ if device_id is not None and self .id != device_id :
47+ raise ValueError (f"NVML device ID does not match requested device: { device_id } != { self .id } " )
48+
49+ # Some backends have UUID starting with "GPU-"
50+ if device_uuid is not None and self .uuid .removeprefix ("GPU-" ) != device_uuid .removeprefix ("GPU-" ):
51+ raise ValueError (f"NVML device UUID does not match requested device: { device_uuid } != { self .uuid } " )
52+
53+ # lstrip is needed since some backends use leading zeros
54+ if device_pci_bus is not None and self .pci_bus .lstrip ("0" ) != device_pci_bus .lstrip ("0" ):
55+ raise ValueError (f"NVML device PCI-bus does not match requested device: { device_pci_bus } != { self .pci_bus } " )
56+
4157 try :
4258 self .pwr_limit_default = pynvml .nvmlDeviceGetPowerManagementLimit (self .dev )
4359 self .pwr_constraints = pynvml .nvmlDeviceGetPowerManagementLimitConstraints (self .dev )
4460 except pynvml .NVMLError_NotSupported :
45- self .pwr_limit_default = None
4661 # inverted range to make all range checks fail
62+ self .pwr_limit_default = None
4763 self .pwr_constraints = [1 , 0 ]
4864
4965 try :
@@ -58,7 +74,6 @@ def __init__(
5874 self ._auto_boost = None
5975
6076 # try to initialize application clocks
61- self .modified_clocks = False
6277 try :
6378 if not use_locked_clocks :
6479 self .gr_clock_default = pynvml .nvmlDeviceGetDefaultApplicationsClock (
@@ -287,6 +302,11 @@ def pwr_usage(self):
287302 NVML_FI_DEV_POWER_INSTANT = 186
288303 return pynvml .nvmlDeviceGetFieldValues (self .dev , [NVML_FI_DEV_POWER_INSTANT ])[0 ].value .uiVal
289304
305+ def energy_usage (self ):
306+ """Return total energy usage since bootup in millijoules."""
307+ NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 83
308+ return pynvml .nvmlDeviceGetFieldValues (self .dev , [NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION ])[0 ].value .ullVal
309+
290310 def gr_voltage (self ):
291311 """Return current graphics voltage in millivolts."""
292312 args = ["nvidia-smi" , "-i" , str (self .id ), "-q" , "-d" , "VOLTAGE" ]
@@ -335,7 +355,7 @@ class NVMLObserver(BenchmarkObserver):
335355 def __init__ (
336356 self ,
337357 observables ,
338- device = 0 ,
358+ device = None ,
339359 save_all = False ,
340360 nvidia_smi_fallback = None ,
341361 use_locked_clocks = False ,
@@ -374,6 +394,8 @@ def __init__(
374394
375395 self .record_gr_voltage = False
376396 self .t0 = 0
397+ self .initial_energy_reading = None
398+
377399 if "gr_voltage" in observables :
378400 self .record_gr_voltage = True
379401 self .gr_voltage_readings = []
@@ -386,26 +408,34 @@ def __init__(
386408 self .iteration = {obs : [] for obs in self .during_obs }
387409
388410 def register_device (self , dev ):
411+ env = getattr (dev , "env" , dict ())
412+ uuid = env .get ("uuid" )
413+ pci_bus = env .get ("pci_bus_id" )
414+
389415 if self .device is not None :
390416 self .nvml = nvml (device_id = self .device , ** self .nvml_kwargs )
417+ elif uuid is not None and pci_bus is not None :
418+ self .nvml = nvml (device_uuid = uuid , device_pci_bus = pci_bus , ** self .nvml_kwargs )
419+ elif uuid is not None :
420+ self .nvml = nvml (device_uuid = uuid , ** self .nvml_kwargs )
421+ elif pci_bus is not None :
422+ self .nvml = nvml (device_pci_bus = pci_bus , ** self .nvml_kwargs )
391423 else :
392- env = getattr (dev , "env" , dict ())
393- uuid = env .get ("uuid" )
394- pci_bus = env .get ("pci_bus_id" )
395-
396- if uuid is not None :
397- self .nvml = nvml (device_uuid = uuid , ** self .nvml_kwargs )
398- elif pci_bus is not None :
399- self .nvml = nvml (device_pci_bus = pci_bus , ** self .nvml_kwargs )
400- else :
401- raise ValueError ("failed to detect NVIDIA device: no UUID or PCI-bus-id in environment" )
402-
403-
424+ raise ValueError ("failed to detect NVIDIA device: no UUID or PCI-bus-id in environment" )
404425
405426 def read_power (self ):
406427 """ Return power in Watt """
407428 return self .nvml .pwr_usage () / 1e3
408429
430+ def read_energy (self ):
431+ """ Return energy usage in Joule relative to the initial energy reading """
432+ now = self .nvml .energy_usage ()
433+
434+ if self .initial_energy_reading is None :
435+ self .initial_energy_reading = now
436+
437+ return (now - self .initial_energy_reading ) / 1e3
438+
409439 def before_start (self ):
410440 # clear results of the observables for next measurement
411441 self .iteration = {obs : [] for obs in self .during_obs }
@@ -530,3 +560,4 @@ def get_idle_power(device, n=5, sleep_s=0.1):
530560 time .sleep (sleep_s )
531561 readings .append (d .pwr_usage ())
532562 return np .mean (readings ) * 1e-3 # Watt
563+
0 commit comments