-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathget_benchmark_results.py
More file actions
executable file
·143 lines (104 loc) · 3.81 KB
/
get_benchmark_results.py
File metadata and controls
executable file
·143 lines (104 loc) · 3.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
import argparse
from datetime import datetime
import requests
import os
import sys
from termcolor import cprint
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import LogLocator
import sharedconfig
def build_results_dataframe_from_azml():
    """Fetch completed benchmark runs from Azure ML and build a results table.

    Looks up the workspace/experiment named in ``sharedconfig``, keeps only
    runs whose status is "Completed", collects each run's public tags (those
    not starting with an underscore), coerces the known numeric tags to int,
    and attaches the fps figures extracted from the run's driver-0 log.

    Returns:
        pd.DataFrame: one row per completed run, columns = tag names plus
        "fps" and "dfps".
    """
    from azureml.core import Workspace, Experiment

    workspace = Workspace.get(sharedconfig.workspace_name)
    experiment = Experiment(workspace, sharedconfig.experiment_name)
    completed = [r for r in experiment.get_runs() if r.status == "Completed"]

    rows = []
    for run in tqdm(completed):
        row = {key: val for key, val in run.get_tags().items()
               if not key.startswith("_")}
        # These tags arrive as strings; downstream sorting/plotting needs ints.
        for int_tag in ("num_nodes", "iter", "ims_per_gpu"):
            row[int_tag] = int(row[int_tag])
        row["fps"], row["dfps"], _ = get_driver0_fps(run)
        rows.append(row)
    return pd.DataFrame(rows)
def get_driver0_fps(run):
    """Parse the rank-0 driver log of an Azure ML run and extract throughput.

    Downloads ``azureml-logs/70_driver_log_0.txt`` for *run* and scans it for:
      * "Training Iteration:" lines  -> timestamp per iteration number,
      * the first "IMS_PER_BATCH:"   -> images per batch,
      * "PARAMETER train_starti : True" -> timestamp of iteration 0,
      * "train_perf_fps"             -> fps self-reported by the trainer.

    Args:
        run: azureml.core.Run whose driver-0 log should be parsed.

    Returns:
        tuple: ``(fps, dfps, perf_reps)`` where ``fps`` is the self-reported
        throughput (None if never logged), ``dfps`` is the throughput derived
        from iteration timestamps (None if it cannot be computed), and
        ``perf_reps`` maps iteration number -> datetime of its report.
    """
    log = requests.get(run.get_details()["logFiles"]["azureml-logs/70_driver_log_0.txt"])
    # fix: fps was previously unbound (UnboundLocalError at return) whenever
    # the "train_perf_fps" line never appeared in the log.
    fps = None
    ipb = None
    perf_reps = {}
    iter_0 = None
    for line in log.iter_lines():
        dline = line.decode()
        if "Training Iteration:" in dline:
            # assumes layout "<date> <time> ... Training Iteration: <n> ..."
            # with the iteration number in field 6 — TODO confirm log format
            line_data = dline.split()
            perf_reps[int(line_data[6])] = datetime.fromisoformat(
                "Z".join([line_data[1], line_data[2]])
            )
            continue
        if "IMS_PER_BATCH:" in dline:
            if ipb is None:  # keep only the first occurrence
                ipb = int(dline.split()[1])
            continue
        # NOTE(review): "train_starti" looks like a truncated "train_start",
        # but it is kept verbatim since it must match the log text exactly.
        if "PARAMETER train_starti : True" in dline:
            line_data = dline.split()
            iter_0 = datetime.fromisoformat("Z".join([line_data[1], line_data[2]]))
        if "train_perf_fps" in dline:
            fps = float(dline.split()[-1])
            break
    # fix: guard the derived-fps computation — previously min()/max() raised
    # on an empty perf_reps, the multiplication raised if ipb stayed None,
    # and a single report (imax == imin) caused ZeroDivisionError.
    dfps = None
    if perf_reps and ipb is not None:
        imin = min(perf_reps)
        imax = max(perf_reps)
        delta = (perf_reps[imax] - perf_reps[imin]).total_seconds()
        if delta > 0:
            # iterations spanned * images per iteration / elapsed seconds
            dfps = (imax - imin) * ipb / delta
    if iter_0 is not None:
        perf_reps[0] = iter_0
    return fps, dfps, perf_reps
def main():
    """CLI entry point.

    Fetches benchmark results from Azure ML (or loads them from a cached CSV
    with ``--use-cached``), prints the filtered table, plots derived
    throughput against GPU count, and saves the figure.
    """
    parser = argparse.ArgumentParser("Download benchmarking results")
    # fix: savefile was a *required* positional, which made the
    # "--use-cached requires a savefile" check below dead code (and the
    # `if args.savefile:` guard pointless); nargs="?" makes it genuinely
    # optional while remaining backward compatible with existing invocations.
    parser.add_argument("savefile", type=str, nargs="?", default=None,
                        help="File to save csv results")
    parser.add_argument("--use-cached", action="store_true", help="Use cache if present")
    parser.add_argument(
        "--fig-file",
        type=str,
        # fix: closed the unbalanced parenthesis in the help text
        help="Path to save figure (default: savefile.csv -> savefile.png)",
    )
    args = parser.parse_args()
    if args.use_cached:
        if args.savefile is None:
            cprint(
                "Error '--use-cached' requires a savefile to be specified!",
                "red",
                attrs=["bold"],
            )
            sys.exit(-1)
        try:
            rdf = pd.read_csv(args.savefile)
        except FileNotFoundError:
            cprint(
                'Cachefile "{}" does not exist!! Fetching again'.format(args.savefile),
                "yellow",
            )
            args.use_cached = False
    if not args.use_cached:
        rdf = build_results_dataframe_from_azml()
        rdf.sort_values(["class", "num_nodes"], inplace=True)
        if args.savefile:
            rdf.to_csv(args.savefile, index=False)
    # Drop short runs: presumably iterations >= 1000 give steady-state
    # throughput — TODO confirm the warm-up cutoff with the benchmark owner.
    rdf = rdf[rdf["iter"] >= 1000]
    print(rdf)
    fig = plt.figure()
    ax = fig.add_subplot()
    for runclass in set(rdf["class"]):
        data = rdf[rdf["class"] == runclass]
        # num_nodes * 8: assumes 8 GPUs per node — TODO confirm cluster SKU.
        ax.plot(data["num_nodes"] * 8, data["dfps"], marker='x', ls='none',
                label=runclass)
    ax.xaxis.set_major_locator(LogLocator(base=2))
    ax.set_xlabel("# GPUs")
    ax.set_ylabel("Performance (img/s)")
    plt.legend()
    plt.show(block=True)
    if args.fig_file is None:
        if args.savefile is None:
            # fix: os.path.splitext(None) would raise TypeError; with no
            # savefile and no --fig-file there is nowhere to save the figure.
            cprint("No savefile or --fig-file given; figure not saved.", "yellow")
            return
        args.fig_file = ".".join([os.path.splitext(args.savefile)[0], "png"])
    fig.savefig(args.fig_file)


if __name__ == "__main__":
    main()