33import numpy as np
44import time
55
6+ # Trapz was renamed to trapezoid in Numpy 2.0
7+ try :
8+ from numpy import trapezoid
9+ except ImportError :
10+ from numpy import trapz as trapezoid
11+
612from uuid import UUID
7- from kernel_tuner .observers import BenchmarkObserver
13+ from kernel_tuner .observers import BenchmarkObserver , ContinuousObserver
814
915logger = logging .getLogger (__name__ )
1016
@@ -65,14 +71,157 @@ def _find_device_by_bdf(devices, pci_domain, pci_bus, pci_device):
6571 return result
6672
6773
class AMDDevice:
    """
    Thin wrapper around an AMD SMI device handle.

    Groups the `amdsmi` queries used by the observers in this file behind
    small methods and papers over differences in the data returned by
    different ROCm versions / GPU generations.
    """

    def __init__(self, device):
        """
        :param device: Device handle that is passed to every `amdsmi` call.
        """
        self.device = device

    def total_energy_usage(self):
        """
        Returns the raw energy counter reading of the device.

        :returns: The dict produced by `amdsmi.amdsmi_get_energy_count`, which
            is guaranteed to contain an "energy_accumulator" entry.
        :raises RuntimeError: If the reading contains neither an
            "energy_accumulator" nor a "power" entry.
        """
        result = amdsmi.amdsmi_get_energy_count(self.device)

        # This field changed name in rocm 6.4; expose it under the new name
        # so callers only ever have to look at "energy_accumulator".
        if "energy_accumulator" not in result:
            if "power" not in result:
                raise RuntimeError(f"invalid result from amdsmi_get_energy_count: {result}")
            result["energy_accumulator"] = result["power"]

        return result

    def current_power_usage(self):
        """
        Returns the socket power reported by `amdsmi.amdsmi_get_power_info`.

        :raises RuntimeError: If the reading contains neither a
            "current_socket_power" nor an "average_socket_power" entry.
        """
        info = amdsmi.amdsmi_get_power_info(self.device)

        # Newer MI300+ cards report "current_socket_power", older cards
        # report "average_socket_power"; prefer the former when present.
        for field in ("current_socket_power", "average_socket_power"):
            if field in info:
                return info[field]

        raise RuntimeError(f"invalid result from amdsmi_get_power_info: {info}")

    def core_voltage(self):
        """ Returns current core (VDDGFX) voltage in Volt. """
        milli_volt = amdsmi.amdsmi_get_gpu_volt_metric(
            self.device,
            amdsmi.AmdSmiVoltageType.VDDGFX,
            amdsmi.AmdSmiVoltageMetric.CURRENT,
        )

        # milli * 1e-3 -> volt
        return milli_volt * 1e-3

    def temperature(self):
        """ Returns current hotspot temperature in Celsius. """
        return self._temperature(amdsmi.AmdSmiTemperatureType.HOTSPOT)

    def mem_temperature(self):
        """ Returns current memory (VRAM) temperature in Celsius. """
        return self._temperature(amdsmi.AmdSmiTemperatureType.VRAM)

    def core_freq(self):
        """ Returns current core clock frequency in Hz. """
        return self._clock_freq(amdsmi.AmdSmiClkType.GFX)

    def mem_freq(self):
        """ Returns current memory clock frequency in Hz. """
        return self._clock_freq(amdsmi.AmdSmiClkType.MEM)

    def core_activity(self):
        """ Returns core usage as percentage (0-100), or NaN if unavailable. """
        return self._activity("gfx_activity")

    def mem_activity(self):
        """ Returns memory usage as percentage (0-100), or NaN if unavailable. """
        return self._activity("umc_activity")

    def _temperature(self, sensor):
        """ Returns the current reading of the given temperature sensor type. """
        return amdsmi.amdsmi_get_temp_metric(
            self.device,
            sensor,
            amdsmi.AmdSmiTemperatureMetric.CURRENT,
        )

    def _clock_freq(self, clock_type):
        """ Returns the currently selected frequency of the given clock type. """
        info = amdsmi.amdsmi_get_clk_freq(self.device, clock_type)
        # "current" is an index into the list of frequencies
        return info["frequency"][info["current"]]

    def _activity(self, field):
        """ Returns one field of the GPU activity reading, NaN when it is a string. """
        value = amdsmi.amdsmi_get_gpu_activity(self.device)[field]
        # Result is "N/A" on error, return NaN instead
        return float("nan") if isinstance(value, str) else value
158+
# Names of the observables that AMDSMIObserver accepts (without result prefix).
SUPPORTED_OBSERVABLES = [
    "energy", "power",
    "core_freq", "mem_freq",
    "temperature", "mem_temperature",
    "core_voltage",
    "core_activity", "mem_activity",
]
75170
class AMDSMIContinuousObserver(ContinuousObserver):
    """
    Continuous variant of the AMD SMI observer.

    Forwards the observer callbacks to `parent`, but only starts the parent's
    measurement after a short warm-up period has passed. The energy reported
    by the parent covers the whole measured window, so it is rescaled to the
    energy of a single kernel execution in `get_results`.
    """

    def __init__(self, parent, continuous_duration=1.0):
        """
        :param parent: Observer that performs the actual measurements. It must
            provide `before_start`, `after_start`, `during`, `after_finish`,
            `get_results` and `field_name`.
        :param continuous_duration: Duration of the continuous benchmark in
            seconds. The warm-up is at most 0.1s and at most half of this.
        """
        self.parent = parent
        self.continuous_duration = continuous_duration
        self.warmup_time = min(0.1, continuous_duration / 2)
        # NOTE(review): presumably assigned by the benchmark runner before
        # `get_results` is called; `get_results` reads `results["time"]`.
        self.results = None

        # Initialise the measurement state here so that `after_finish` and
        # `get_results` are safe to call even if `after_start` never ran.
        self.warmup_completed = False
        self.start_time = None
        self.end_time = None

    def before_start(self):
        self.parent.before_start()

    def after_start(self):
        self.warmup_completed = False
        self.end_time = None
        # Until the warm-up is completed, `start_time` holds the moment at
        # which the warm-up ends.
        self.start_time = time.perf_counter() + self.warmup_time

    def during(self):
        now = time.perf_counter()

        if not self.warmup_completed:
            if now < self.start_time:
                return

            # Only call `after_start` once warmup time has passed
            self.start_time = now
            self.warmup_completed = True
            self.parent.after_start()

        self.parent.during()

    def after_finish(self):
        if self.warmup_completed:
            # Remember when the measured window closes, so that the elapsed
            # time matches the window over which the parent measured energy.
            self.end_time = time.perf_counter()
            self.parent.after_finish()

    def get_results(self):
        """
        Returns the parent's results with the energy rescaled per kernel.

        :returns: Empty dict if the warm-up never completed (nothing was
            measured), otherwise the dict from `parent.get_results()`.
        """
        if not self.warmup_completed:
            return dict()

        # Fall back to "now" if `after_finish` was not called
        end_time = self.end_time if self.end_time is not None else time.perf_counter()
        elapsed_sec = end_time - self.start_time
        # NOTE(review): assumes results["time"] is in milliseconds -- confirm
        time_sec = self.results["time"] * 1e-3
        ratio = time_sec / elapsed_sec

        # Get results from the parent
        results = self.parent.get_results()

        # The energy field measures the energy over the entire
        # measured window. However, we want the average
        # energy usage _per_ kernel. To fix this, we multiply
        # by the ratio of time per kernel to elapsed time
        energy_field = self.parent.field_name("energy")

        if energy_field in results:
            results[energy_field] = results[energy_field] * ratio

        return results
76225
77226class AMDSMIObserver (BenchmarkObserver ):
78227 """
@@ -81,7 +230,7 @@ class AMDSMIObserver(BenchmarkObserver):
81230 and core voltage (`core_voltage`).
82231 """
83232
def __init__(self, observables=["energy"], *, device_id=None, prefix="amdsmi", use_continuous_observer=True, continuous_duration=1.0):
    """
    Initialize the AMDSMIObserver.

    :param observables: Names of the quantities to observe. Each name must be
        listed in SUPPORTED_OBSERVABLES. Defaults to ["energy"].
    :param device_id: Specific AMD device index. If None, auto-detection is used.
    :param prefix: Prefix used for name in the metrics. Defaults to "amdsmi".
    :param use_continuous_observer: Whether a continuous observer should be
        used for this observer. Defaults to True.
    :param continuous_duration: Duration in seconds used for the continuous
        observer. Defaults to 1.0.
    :raises ValueError: If an observable is not in SUPPORTED_OBSERVABLES.
    """
    # Validate the observables requested by the caller. They must not be
    # overwritten with SUPPORTED_OBSERVABLES here: that would silently
    # sample everything and make this check unreachable.
    for obs in observables:
        if obs not in SUPPORTED_OBSERVABLES:
            raise ValueError(f"Observable {obs} not supported: {SUPPORTED_OBSERVABLES}")

    # The (mutable) default list is only read, never modified
    self.observables = set(observables)
    self.prefix = prefix
    self.device_id = device_id
    self.device = None
    self.use_continuous_observer = use_continuous_observer
    self.continuous_duration = continuous_duration
    # One list of per-iteration values for each (prefixed) result field
    self.results_per_iteration = {self.field_name(k): [] for k in self.observables}
103255
104256 def register_device (self , dev ):
105257 amdsmi .amdsmi_init ()
@@ -154,90 +306,97 @@ def register_device(self, dev):
154306 f"invalid AMD SMI device_id { self .device_id } , found { len (devices )} devices"
155307 )
156308
157- self .device = devices [self .device_id ]
309+ self .device = AMDDevice (devices [self .device_id ])
310+
311+ if self .use_continuous_observer :
312+ self .continuous_observer = AMDSMIContinuousObserver (self , continuous_duration = self .continuous_duration )
158313
def after_start(self):
    """Open a measurement window: reset the samples and take the first one."""
    # Fresh sample storage, one empty list per result field
    self.sample_timestamps = []
    self.sample_values = dict((field, []) for field in self.results_per_iteration)

    # Energy counter reading at the start of the window
    self.energy_after_start = self.device.total_energy_usage()

    self.sample_metrics()
164319
def during(self):
    """Take one more sample of the observed metrics."""
    self.sample_metrics()
def field_name(self, name):
    """Return the result key for `name`: "<prefix>_<name>", or `name` if no prefix is set."""
    return f"{self.prefix}_{name}" if self.prefix else name
def store_sample(self, name, value):
    """Append `value` to the samples collected for observable `name`."""
    key = self.field_name(name)
    samples = self.sample_values[key]
    samples.append(value)
331+
def sample_metrics(self):
    """Record a timestamp and one reading of every requested sampled observable."""
    self.sample_timestamps.append(time.perf_counter())

    # Each of these observables is read through the device method of the
    # same name; the order of the readings is kept fixed.
    sampled = (
        "core_voltage",
        "core_freq",
        "mem_freq",
        "temperature",
        "mem_temperature",
        "core_activity",
        "mem_activity",
    )

    for name in sampled:
        if name in self.observables:
            read = getattr(self.device, name)
            self.store_sample(name, read())
197355
def after_finish(self):
    """
    Close the measurement window and store the results of this iteration.

    Energy and power are derived from the difference between the energy
    counter readings taken in `after_start` and here. For all other
    observables, the time-weighted average of the collected samples is
    stored. One value per field is appended to `results_per_iteration`.
    """
    before = self.energy_after_start
    after = self.device.total_energy_usage()
    self.sample_metrics()

    # NOTE(review): the differences are taken in uint64, presumably so that
    # a wrapped-around counter still gives the right difference -- confirm.
    diff = np.uint64(after["energy_accumulator"]) - np.uint64(before["energy_accumulator"])
    elapsed_ns = np.uint64(after["timestamp"]) - np.uint64(before["timestamp"])
    resolution = before["counter_resolution"]
    energy_uj = float(diff) * float(resolution)

    # Energy is an exception as it does not need integration over time
    if "energy" in self.observables:
        # microJ * 1e-6 -> J
        self.results_per_iteration[self.field_name("energy")].append(energy_uj * 1e-6)

    if "power" in self.observables:
        if elapsed_ns > 0:
            # microJ / ns * 1e3 -> W
            power = energy_uj / float(elapsed_ns) * 1e3
        else:
            # No time passed between the two readings, power is undefined
            power = float("nan")

        self.results_per_iteration[self.field_name("power")].append(power)

    # normalize timestamps to [0, 1] such that integral (trapezoid) is the mean
    xs = np.array(self.sample_timestamps)
    span = xs.max() - xs.min() if xs.size else 0.0

    if span > 0:
        xs = (xs - xs.min()) / span

    for key, values in self.sample_values.items():
        # Could not sample, skip field
        if not values:
            continue

        # If all values are the same, take that value directly.
        # This preserves that value bitwise exactly and prevents
        # rounding errors that occur in trapezoid
        first = values[0]
        if all(v == first for v in values):
            result = first
        elif span > 0:
            result = trapezoid(values, x=xs)
        else:
            # All samples were taken at the same instant, so there is no
            # interval to integrate over: fall back to the plain mean
            # instead of dividing by zero.
            result = sum(values) / len(values)

        self.results_per_iteration[key].append(result)
def get_results(self):
    """
    Return the average of each result field over all iterations since the
    previous call, and clear the stored per-iteration values.
    """
    per_iteration = self.results_per_iteration
    averages = {}

    # Iterate over a snapshot of the keys since the entries are reassigned
    for field in list(per_iteration):
        averages[field] = np.average(per_iteration[field])
        # Start collecting from scratch for the next configuration
        per_iteration[field] = []

    return averages
402+
0 commit comments