Skip to content

Commit 1f2111f

Browse files
committed
Add AMDSMIContinuousObserver
1 parent 8756e6d commit 1f2111f

1 file changed

Lines changed: 222 additions & 63 deletions

File tree

kernel_tuner/observers/amd.py

Lines changed: 222 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,14 @@
33
import numpy as np
44
import time
55

6+
# Trapz was renamed to trapezoid in Numpy 2.0
7+
try:
8+
from numpy import trapezoid
9+
except ImportError:
10+
from numpy import trapz as trapezoid
11+
612
from uuid import UUID
7-
from kernel_tuner.observers import BenchmarkObserver
13+
from kernel_tuner.observers import BenchmarkObserver, ContinuousObserver
814

915
logger = logging.getLogger(__name__)
1016

@@ -65,14 +71,157 @@ def _find_device_by_bdf(devices, pci_domain, pci_bus, pci_device):
6571
return result
6672

6773

74+
class AMDDevice:
    """Thin wrapper around an AMD SMI device handle.

    Every public method issues one ``amdsmi`` query against the wrapped
    handle and returns the value of interest.
    """

    def __init__(self, device):
        """Store the AMD SMI device handle used by all queries."""
        self.device = device

    def total_energy_usage(self):
        """Return the raw energy counter reading of the device.

        :returns: The dictionary produced by ``amdsmi_get_energy_count``. It
            always contains an ``energy_accumulator`` entry; when the library
            only reports the counter under ``power``, that value is copied
            into ``energy_accumulator``.
        :raises RuntimeError: If neither ``energy_accumulator`` nor ``power``
            is present in the reading.
        """
        result = amdsmi.amdsmi_get_energy_count(self.device)

        # This field changed name in rocm 6.4
        if "energy_accumulator" not in result:
            if "power" not in result:
                raise RuntimeError(f"invalid result from amdsmi_get_energy_count: {result}")
            result["energy_accumulator"] = result["power"]

        return result

    def current_power_usage(self):
        """Return the current socket power reported by AMD SMI.

        ``current_socket_power`` (newer MI300+ cards) is preferred over
        ``average_socket_power`` (older cards). A field whose value is a
        string placeholder is skipped so that the other field can be used;
        if every available field is a placeholder, NaN is returned.

        NOTE(review): the unit is whatever ``amdsmi_get_power_info`` reports
        (presumably Watt) -- confirm against the AMD SMI documentation.

        :raises RuntimeError: If neither field is present in the reading.
        """
        info = amdsmi.amdsmi_get_power_info(self.device)

        # Ordered by preference: newer MI300+ cards first, older cards second.
        fields = ("current_socket_power", "average_socket_power")
        readings = [info[name] for name in fields if name in info]

        if not readings:
            raise RuntimeError(f"invalid result from amdsmi_get_power_info: {info}")

        for value in readings:
            # Unsupported fields show up as a string such as "N/A" (the same
            # placeholder handled for the activity metrics); skip those.
            if not isinstance(value, str):
                return value

        return float("nan")

    def core_voltage(self):
        """Return the current graphics core (VDDGFX) voltage in Volt."""
        milli_volt = amdsmi.amdsmi_get_gpu_volt_metric(
            self.device,
            amdsmi.AmdSmiVoltageType.VDDGFX,
            amdsmi.AmdSmiVoltageMetric.CURRENT,
        )

        # milli * 1e-3 -> volt
        return milli_volt * 1e-3

    def temperature(self):
        """Return the current hotspot temperature in degrees Celsius."""
        return self._temperature(amdsmi.AmdSmiTemperatureType.HOTSPOT)

    def mem_temperature(self):
        """Return the current VRAM temperature in degrees Celsius."""
        return self._temperature(amdsmi.AmdSmiTemperatureType.VRAM)

    def core_freq(self):
        """Return the current core (GFX) clock frequency in Hz."""
        return self._clock_freq(amdsmi.AmdSmiClkType.GFX)

    def mem_freq(self):
        """Return the current memory (MEM) clock frequency in Hz."""
        return self._clock_freq(amdsmi.AmdSmiClkType.MEM)

    def core_activity(self):
        """Return core usage as a percentage (0-100), or NaN if unavailable."""
        return self._activity("gfx_activity")

    def mem_activity(self):
        """Return memory usage as a percentage (0-100), or NaN if unavailable."""
        return self._activity("umc_activity")

    def _temperature(self, sensor):
        """Query the current temperature metric of the given sensor type."""
        return amdsmi.amdsmi_get_temp_metric(
            self.device,
            sensor,
            amdsmi.AmdSmiTemperatureMetric.CURRENT,
        )

    def _clock_freq(self, clock_type):
        """Look up the currently selected frequency of the given clock type."""
        obj = amdsmi.amdsmi_get_clk_freq(self.device, clock_type)
        # "current" is the index of the active entry in the "frequency" list
        return obj["frequency"][obj["current"]]

    def _activity(self, field):
        """Read one field of the GPU activity report, mapping errors to NaN."""
        result = amdsmi.amdsmi_get_gpu_activity(self.device)[field]
        # Result is "N/A" on error, return NaN instead
        return float("nan") if isinstance(result, str) else result
157+
158+
68159
# Names accepted by AMDSMIObserver's ``observables`` argument.
SUPPORTED_OBSERVABLES = [
    # derived from the energy counter
    "energy",
    "power",
    # sampled repeatedly while the kernel runs
    "core_freq",
    "mem_freq",
    "temperature",
    "mem_temperature",
    "core_voltage",
    "core_activity",
    "mem_activity",
]
75170

171+
class AMDSMIContinuousObserver(ContinuousObserver):
    """Continuous-benchmarking front end for an ``AMDSMIObserver``.

    All measurements are delegated to ``parent``. The parent's
    ``after_start`` is postponed until ``warmup_time`` seconds have passed,
    and the energy reported by the parent is rescaled from the whole
    measured window to a single kernel execution.
    """

    def __init__(self, parent, continuous_duration=1.0):
        """
        :param parent: The observer that performs the actual measurements.
        :param continuous_duration: Duration in seconds of the continuous
            benchmark. The warmup period is half of it, capped at 0.1 s.
        """
        self.parent = parent
        self.continuous_duration = continuous_duration
        self.warmup_time = min(0.1, continuous_duration / 2)

        # Read in get_results() as ``self.results["time"]``; presumably
        # assigned by the benchmarking framework beforehand -- TODO confirm.
        self.results = None

        # Define the measurement state up front, so that after_finish() and
        # get_results() are safe to call even if after_start() never ran.
        self.warmup_completed = False
        self.start_time = None
        self.end_time = None

    def before_start(self):
        """Forward to the parent observer."""
        self.parent.before_start()

    def after_start(self):
        """Begin the warmup period; measuring starts once it has passed."""
        self.warmup_completed = False
        self.end_time = None
        # Until warmup completes, start_time holds the warmup deadline.
        self.start_time = time.perf_counter() + self.warmup_time

    def during(self):
        """Start the parent once warmup is over, then sample on every call."""
        now = time.perf_counter()

        if not self.warmup_completed:
            if now < self.start_time:
                return

            # Only call `after_start` once warmup time has passed
            self.start_time = now
            self.warmup_completed = True
            self.parent.after_start()

        self.parent.during()

    def after_finish(self):
        """Stop measuring; a no-op when warmup never completed."""
        if self.warmup_completed:
            # Take the end time here rather than in get_results(), so that
            # the elapsed time matches the window measured by the parent.
            self.end_time = time.perf_counter()
            self.parent.after_finish()

    def get_results(self):
        """Return the parent's results with energy scaled to one kernel.

        :returns: An empty dict when warmup never completed, otherwise the
            parent's results.
        """
        if not self.warmup_completed:
            return dict()

        # Fall back to "now" if after_finish() has not been called.
        end_time = self.end_time if self.end_time is not None else time.perf_counter()
        elapsed_sec = end_time - self.start_time

        # The 1e-3 factor converts to seconds, i.e. results["time"] is
        # presumably in milliseconds -- TODO confirm.
        time_sec = self.results["time"] * 1e-3
        ratio = time_sec / elapsed_sec

        # Get results from the parent
        results = self.parent.get_results()

        # The energy field measures the energy over the entire
        # continuous duration. However, we want the average
        # energy usage _per_ kernel. To fix this, we multiply
        # by the ratio of time per kernel to elapsed time.
        energy_field = self.parent.field_name("energy")

        if energy_field in results:
            results[energy_field] = results[energy_field] * ratio

        return results
224+
76225

77226
class AMDSMIObserver(BenchmarkObserver):
78227
"""
@@ -81,7 +230,7 @@ class AMDSMIObserver(BenchmarkObserver):
81230
and core voltage (`core_voltage`).
82231
"""
83232

84-
def __init__(self, observables=["energy"], *, device_id=None, prefix="amdsmi"):
233+
def __init__(self, observables=None, *, device_id=None, prefix="amdsmi", use_continuous_observer=True, continuous_duration=1.0):
    """
    Initialize the AMDSMIObserver.

    :param observables: Names of the quantities to record; each one must be
        listed in SUPPORTED_OBSERVABLES. Defaults to ["energy"].
    :param device_id: Specific AMD device index. If None, auto-detection is used.
    :param prefix: Prefix used for name in the metrics. Defaults to "amdsmi".
    :param use_continuous_observer: If true, an AMDSMIContinuousObserver is
        created for this observer when the device is registered.
    :param continuous_duration: Duration in seconds handed to the
        AMDSMIContinuousObserver.
    :raises ValueError: If an entry of ``observables`` is not supported.
    """
    # None stands in for the default so no mutable list is shared across calls.
    if observables is None:
        observables = ["energy"]

    # Validate what the caller asked for. The requested names must not be
    # replaced by SUPPORTED_OBSERVABLES here, otherwise the argument is
    # ignored and every observable is always recorded.
    for obs in observables:
        if obs not in SUPPORTED_OBSERVABLES:
            raise ValueError(f"Observable {obs} not supported: {SUPPORTED_OBSERVABLES}")

    self.observables = set(observables)
    self.prefix = prefix
    self.device_id = device_id
    self.device = None
    self.use_continuous_observer = use_continuous_observer
    self.continuous_duration = continuous_duration

    # One list of per-iteration values for every reported field name
    self.results_per_iteration = {self.field_name(k): [] for k in self.observables}
103255

104256
def register_device(self, dev):
105257
amdsmi.amdsmi_init()
@@ -154,90 +306,97 @@ def register_device(self, dev):
154306
f"invalid AMD SMI device_id {self.device_id}, found {len(devices)} devices"
155307
)
156308

157-
self.device = devices[self.device_id]
309+
self.device = AMDDevice(devices[self.device_id])
310+
311+
if self.use_continuous_observer:
312+
self.continuous_observer = AMDSMIContinuousObserver(self, continuous_duration=self.continuous_duration)
158313

159314
def after_start(self):
    """Read the starting energy counter and open a new series of samples."""
    self.energy_after_start = self.device.total_energy_usage()

    # Fresh buffers for this run: the sample times, plus one list of
    # sampled values per reported field name.
    self.sample_timestamps = []
    fresh_values = {}
    for field in self.results_per_iteration:
        fresh_values[field] = []
    self.sample_values = fresh_values

    # Take the first sample right away
    self.sample_metrics()
164319

165320
def during(self):
    """Take one more sample of the observed metrics."""
    self.sample_metrics()
168322

169-
if "core_voltage" in self.observables:
170-
milli_volt = amdsmi.amdsmi_get_gpu_volt_metric(
171-
self.device,
172-
amdsmi.AmdSmiVoltageType.VDDGFX,
173-
amdsmi.AmdSmiVoltageMetric.CURRENT,
174-
)
323+
def field_name(self, name):
    """Return the key under which observable ``name`` is reported.

    The key is ``<prefix>_<name>``, or just ``name`` when the prefix is empty.
    """
    return f"{self.prefix}_{name}" if self.prefix else name
175328

176-
# milli * 1-e3 -> volt
177-
self.during_results["core_voltage"].append(milli_volt * 1e-3)
329+
def store_sample(self, name, value):
    """Append ``value`` to the samples collected for observable ``name``."""
    key = self.field_name(name)
    self.sample_values[key].append(value)
331+
332+
def sample_metrics(self):
    """Record a timestamp and one sample of every requested sampled metric."""
    self.sample_timestamps.append(time.perf_counter())

    # Each of these observables is read through the device method of the
    # same name. The order matches the order in which they are queried.
    sampled = (
        "core_voltage",
        "core_freq",
        "mem_freq",
        "temperature",
        "mem_temperature",
        "core_activity",
        "mem_activity",
    )

    for name in sampled:
        if name in self.observables:
            self.store_sample(name, getattr(self.device, name)())
197355

198356
def after_finish(self):
    """Close the current run and append one value per observed field.

    Energy and power come from the difference of the two energy counter
    readings. Every sampled metric is reduced to its time-weighted mean.
    """
    start = self.energy_after_start
    end = self.device.total_energy_usage()
    self.sample_metrics()

    # Counter arithmetic is done on unsigned 64-bit values
    ticks = np.uint64(end["energy_accumulator"]) - np.uint64(start["energy_accumulator"])
    elapsed_ns = np.uint64(end["timestamp"]) - np.uint64(start["timestamp"])
    energy_uj = float(ticks) * float(start["counter_resolution"])

    # Energy is an exception as it does not need integration over time
    if "energy" in self.observables:
        # microJ * 1e-6 -> J
        energy_key = self.field_name("energy")
        self.results_per_iteration[energy_key].append(energy_uj * 1e-6)

    if "power" in self.observables:
        # microJ / ns * 1e3 -> W (assuming the timestamps are in
        # nanoseconds, as the variable name suggests -- TODO confirm)
        power_key = self.field_name("power")
        self.results_per_iteration[power_key].append(energy_uj / elapsed_ns * 1e3)

    # normalize timestamps to [0, 1] such that integral (trapezoid) is the mean
    stamps = np.array(self.sample_timestamps)
    xs = (stamps - stamps.min()) / (stamps.max() - stamps.min())

    for key, values in self.sample_values.items():
        # Could not sample, skip field
        if not values:
            continue

        # If all values are the same, take that value directly.
        # This preserves that value bitwise exactly and prevents
        # rounding errors that occur in trapezoid
        first = values[0]
        is_constant = all(v == first for v in values)
        mean = first if is_constant else trapezoid(values, x=xs)

        self.results_per_iteration[key].append(mean)
234392

235-
# Reset to empty
236-
self.iteration_results[key] = []
393+
def get_results(self):
    """Return the average of each field over the stored iterations.

    The per-iteration values are cleared afterwards.
    """
    per_iteration = self.results_per_iteration
    averages = {}

    # Iterate over a snapshot of the keys, since entries are replaced below
    for field in tuple(per_iteration):
        # Take average and reset!
        averages[field] = np.average(per_iteration[field])
        per_iteration[field] = []

    return averages
402+

0 commit comments

Comments
 (0)