Skip to content

Commit 2266f15

Browse files
committed
add nvidia_smi averaging function
1 parent 5891888 commit 2266f15

1 file changed

Lines changed: 51 additions & 0 deletions

File tree

sams/sampler/NvidiaSMI.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
import re
6363
import subprocess
6464
import threading
65+
import time
6566

6667
import sams.base
6768

@@ -126,6 +127,10 @@ def stopped(self):
126127
class Sampler(sams.base.Sampler):
127128
def __init__(self, id, outQueue, config):
128129
super(Sampler, self).__init__(id, outQueue, config)
130+
self._start_time = time.time()
131+
self._last_sample_time = dict()
132+
self._average_values = dict()
133+
self._last_averaged_values = dict()
129134
self.processes = {}
130135
self.sampler_interval = self.config.get([self.id, "sampler_interval"], 60)
131136
self.gpu_index_environment = self.config.get(
@@ -147,6 +152,14 @@ def __init__(self, id, outQueue, config):
147152
"utilization.memory",
148153
],
149154
)
155+
self.metrics_to_average = self.config.get(
156+
[self.id, "metrics_to_average"],
157+
[
158+
"power.draw",
159+
"utilization.gpu",
160+
"utilization.memory",
161+
],
162+
)
150163

151164
self.smi = None
152165
if self.gpu_index_environment in os.environ:
@@ -172,11 +185,49 @@ def sample(self):
172185
logger.debug(data)
173186
index = data["index"]
174187
del data["index"]
188+
self.compute_sample_averages(data, index)
175189
entry = {index: data}
176190
most_recent_sample.append(self._storage_wrapping(entry))
177191
self.store(entry)
178192
self._most_recent_sample = most_recent_sample
179193

194+
def compute_sample_averages(self, data, index):
195+
""" Computes averages of selected measurements by
196+
means of trapezoidal quadrature, approximating
197+
that the time this function is called is the actual
198+
time of sampling. This is not completely correct but simplifies
199+
the implementation.
200+
"""
201+
sample_time = time.time()
202+
if index not in self._last_sample_time:
203+
# Keep it simple by approximating sampling time
204+
self._last_sample_time[index] = self._start_time
205+
self._average_values[index] = dict()
206+
self._last_averaged_values[index] = dict()
207+
for key in data:
208+
if key.replace('_', '.') in self.metrics_to_average:
209+
# Initialize trapezoidal integral at 0.
210+
self._average_values[index][key] = 0.
211+
self._last_averaged_values[index][key] = 0.
212+
elapsed_time = sample_time - self._last_sample_time[index]
213+
total_elapsed_time = sample_time - self._start_time
214+
average_values = self._average_values[index]
215+
last_averaged_values = self._last_averaged_values[index]
216+
self._last_sample_time[index] = sample_time
217+
for key, item in data.items():
218+
if key.replace('_', '.') in self.metrics_to_average:
219+
# Trapezoidal quadrature
220+
weighted_item = (
221+
0.5 * (float(item) + float(last_averaged_values[key])) * elapsed_time)
222+
last_averaged_values[key] = item
223+
previous_integral = average_values[key] * (total_elapsed_time - elapsed_time)
224+
new_integral = previous_integral + weighted_item
225+
average_values[key] = new_integral / total_elapsed_time
226+
227+
for key, item in average_values.items():
228+
data[key + '_average'] = item
229+
data['elapsed_time'] = total_elapsed_time
230+
180231
def final_data(self):
181232
if self.smi:
182233
self.smi.stop()

0 commit comments

Comments
 (0)