-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathget_benchmark_results.py
More file actions
executable file
·143 lines (104 loc) · 3.81 KB
/
get_benchmark_results.py
File metadata and controls
executable file
·143 lines (104 loc) · 3.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
import argparse
from datetime import datetime
import requests
import os
import sys
from termcolor import cprint
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import LogLocator
import sharedconfig
def build_results_dataframe_from_azml():
    """Fetch completed benchmark runs from Azure ML and build a results table.

    Looks up the workspace/experiment named in ``sharedconfig``, keeps only
    runs whose status is "Completed", collects each run's public tags (those
    not starting with an underscore), coerces the known numeric tags to int,
    and attaches the fps figures extracted from the run's driver-0 log.

    Returns:
        pd.DataFrame: one row per completed run, columns = tag names plus
        "fps" and "dfps".
    """
    from azureml.core import Workspace, Experiment

    workspace = Workspace.get(sharedconfig.workspace_name)
    experiment = Experiment(workspace, sharedconfig.experiment_name)
    completed = [r for r in experiment.get_runs() if r.status == "Completed"]

    rows = []
    for run in tqdm(completed):
        row = {key: val for key, val in run.get_tags().items()
               if not key.startswith("_")}
        # These tags arrive as strings; downstream sorting/plotting needs ints.
        for int_tag in ("num_nodes", "iter", "ims_per_gpu"):
            row[int_tag] = int(row[int_tag])
        row["fps"], row["dfps"], _ = get_driver0_fps(run)
        rows.append(row)
    return pd.DataFrame(rows)
def get_driver0_fps(run):
    """Parse the rank-0 driver log of an Azure ML run and extract throughput.

    Downloads ``azureml-logs/70_driver_log_0.txt`` for *run* and scans it for:
      * "Training Iteration:" lines  -> timestamp per iteration number,
      * the first "IMS_PER_BATCH:"   -> images per batch,
      * "PARAMETER train_starti : True" -> timestamp of iteration 0,
      * "train_perf_fps"             -> fps self-reported by the trainer.

    Args:
        run: azureml.core.Run whose driver-0 log should be parsed.

    Returns:
        tuple: ``(fps, dfps, perf_reps)`` where ``fps`` is the self-reported
        throughput (None if never logged), ``dfps`` is the throughput derived
        from iteration timestamps (None if it cannot be computed), and
        ``perf_reps`` maps iteration number -> datetime of its report.
    """
    log = requests.get(run.get_details()["logFiles"]["azureml-logs/70_driver_log_0.txt"])
    # fix: fps was previously unbound (UnboundLocalError at return) whenever
    # the "train_perf_fps" line never appeared in the log.
    fps = None
    ipb = None
    perf_reps = {}
    iter_0 = None
    for line in log.iter_lines():
        dline = line.decode()
        if "Training Iteration:" in dline:
            # assumes layout "<date> <time> ... Training Iteration: <n> ..."
            # with the iteration number in field 6 — TODO confirm log format
            line_data = dline.split()
            perf_reps[int(line_data[6])] = datetime.fromisoformat(
                "Z".join([line_data[1], line_data[2]])
            )
            continue
        if "IMS_PER_BATCH:" in dline:
            if ipb is None:  # keep only the first occurrence
                ipb = int(dline.split()[1])
            continue
        # NOTE(review): "train_starti" looks like a truncated "train_start",
        # but it is kept verbatim since it must match the log text exactly.
        if "PARAMETER train_starti : True" in dline:
            line_data = dline.split()
            iter_0 = datetime.fromisoformat("Z".join([line_data[1], line_data[2]]))
        if "train_perf_fps" in dline:
            fps = float(dline.split()[-1])
            break
    # fix: guard the derived-fps computation — previously min()/max() raised
    # on an empty perf_reps, the multiplication raised if ipb stayed None,
    # and a single report (imax == imin) caused ZeroDivisionError.
    dfps = None
    if perf_reps and ipb is not None:
        imin = min(perf_reps)
        imax = max(perf_reps)
        delta = (perf_reps[imax] - perf_reps[imin]).total_seconds()
        if delta > 0:
            # iterations spanned * images per iteration / elapsed seconds
            dfps = (imax - imin) * ipb / delta
    if iter_0 is not None:
        perf_reps[0] = iter_0
    return fps, dfps, perf_reps
def main():
    """CLI entry point.

    Fetches benchmark results from Azure ML (or loads them from a cached CSV
    with ``--use-cached``), prints the filtered table, plots derived
    throughput against GPU count, and saves the figure.
    """
    parser = argparse.ArgumentParser("Download benchmarking results")
    # fix: savefile was a *required* positional, which made the
    # "--use-cached requires a savefile" check below dead code (and the
    # `if args.savefile:` guard pointless); nargs="?" makes it genuinely
    # optional while remaining backward compatible with existing invocations.
    parser.add_argument("savefile", type=str, nargs="?", default=None,
                        help="File to save csv results")
    parser.add_argument("--use-cached", action="store_true", help="Use cache if present")
    parser.add_argument(
        "--fig-file",
        type=str,
        # fix: closed the unbalanced parenthesis in the help text
        help="Path to save figure (default: savefile.csv -> savefile.png)",
    )
    args = parser.parse_args()
    if args.use_cached:
        if args.savefile is None:
            cprint(
                "Error '--use-cached' requires a savefile to be specified!",
                "red",
                attrs=["bold"],
            )
            sys.exit(-1)
        try:
            rdf = pd.read_csv(args.savefile)
        except FileNotFoundError:
            cprint(
                'Cachefile "{}" does not exist!! Fetching again'.format(args.savefile),
                "yellow",
            )
            args.use_cached = False
    if not args.use_cached:
        rdf = build_results_dataframe_from_azml()
        rdf.sort_values(["class", "num_nodes"], inplace=True)
        if args.savefile:
            rdf.to_csv(args.savefile, index=False)
    # Drop short runs: presumably iterations >= 1000 give steady-state
    # throughput — TODO confirm the warm-up cutoff with the benchmark owner.
    rdf = rdf[rdf["iter"] >= 1000]
    print(rdf)
    fig = plt.figure()
    ax = fig.add_subplot()
    for runclass in set(rdf["class"]):
        data = rdf[rdf["class"] == runclass]
        # num_nodes * 8: assumes 8 GPUs per node — TODO confirm cluster SKU.
        ax.plot(data["num_nodes"] * 8, data["dfps"], marker='x', ls='none',
                label=runclass)
    ax.xaxis.set_major_locator(LogLocator(base=2))
    ax.set_xlabel("# GPUs")
    ax.set_ylabel("Performance (img/s)")
    plt.legend()
    plt.show(block=True)
    if args.fig_file is None:
        if args.savefile is None:
            # fix: os.path.splitext(None) would raise TypeError; with no
            # savefile and no --fig-file there is nowhere to save the figure.
            cprint("No savefile or --fig-file given; figure not saved.", "yellow")
            return
        args.fig_file = ".".join([os.path.splitext(args.savefile)[0], "png"])
    fig.savefig(args.fig_file)


if __name__ == "__main__":
    main()