-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path blobmount_submit_maskrcnn.py
More file actions
executable file
·145 lines (114 loc) · 4.31 KB
/
blobmount_submit_maskrcnn.py
File metadata and controls
executable file
·145 lines (114 loc) · 4.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python3
import argparse
from azureml.core import Experiment, ScriptRunConfig
from azureml.core.runconfig import MpiConfiguration
from common import (
get_or_create_workspace,
create_or_update_environment,
create_or_update_cluster,
)
import sharedconfig
k_runclass = "Mount"
def generate_training_opts(
    num_gpus,
    ims_per_gpu,
    max_iter,
    per_epoch_eval=False,
    config_file="./benchmark_mask_rcnn_R_50_FPN.yaml",
):
    """Build the common Mask R-CNN command-line option list.

    Args:
        num_gpus (int): Total number of GPUs across all nodes.
        ims_per_gpu (int): Images processed per GPU per iteration.
        max_iter (int): Number of training iterations to run.
        per_epoch_eval (bool): Whether to run evaluation every epoch.
        config_file (str): Path to the maskrcnn-benchmark YAML config
            (defaults to the benchmark config used by this repo).

    Returns:
        list[str]: Flat token list suitable for passing as script arguments,
        e.g. ["--config-file", <path>, "SOLVER.IMS_PER_BATCH", "16", ...].
    """
    # Global batch size is the same for training and test in this benchmark.
    batch_size = str(num_gpus * ims_per_gpu)
    return [
        "--config-file", config_file,
        "SOLVER.IMS_PER_BATCH", batch_size,
        "SOLVER.MAX_ITER", str(max_iter),
        "TEST.IMS_PER_BATCH", batch_size,
        "PER_EPOCH_EVAL", str(bool(per_epoch_eval)),
    ]
def parse_command_line_args():
    """Parse and return this submitter's command-line arguments."""
    argparser = argparse.ArgumentParser(
        description="Submit benchmark runs using mounted blob"
    )
    argparser.add_argument("num_nodes", type=int, help="Number of nodes")
    # Storage tier is optional and at most one may be selected; the
    # default (neither flag) means the hot tier.
    storage_tier = argparser.add_mutually_exclusive_group()
    storage_tier.add_argument(
        "--premium", action="store_true", help="Use premium storage"
    )
    storage_tier.add_argument(
        "--cool", action="store_true", help="Use cool storage"
    )
    argparser.add_argument(
        "--follow", action="store_true", help="Follow run output"
    )
    argparser.add_argument(
        "--iter",
        type=int,
        default=sharedconfig.max_iter,
        help="Number of training iterations",
    )
    return argparser.parse_args()
def main():
    """Submit a blob-mount Mask R-CNN benchmark run to AzureML."""
    args = parse_command_line_args()

    # Start from the default (hot-tier) run class and dataset, then
    # override if a different storage tier was requested.
    run_class = k_runclass
    dataset_name = sharedconfig.dataset_hot
    if args.premium:
        run_class += "_premium"
        dataset_name = sharedconfig.dataset_premium
    if args.cool:
        run_class += "_cool"
        dataset_name = sharedconfig.dataset_cool

    # Resolve the AzureML workspace handle.
    ws = get_or_create_workspace(
        sharedconfig.subscription,
        sharedconfig.resource_group,
        sharedconfig.workspace_name,
    )

    # NOTE: this returns a ClusterConnector wrapper, *not* an
    # azureml.core.compute.AmlCompute — see clusterconnector.py.
    connector = create_or_update_cluster(
        ws,
        sharedconfig.cluster_name,
        args.num_nodes,
        sharedconfig.ssh_key,
        sharedconfig.vm_type,
        terminate_on_failure=True,
        use_beeond=False,
    )

    env = create_or_update_environment(
        ws, sharedconfig.environment_name, sharedconfig.docker_image
    )
    experiment = Experiment(workspace=ws, name=sharedconfig.experiment_name)

    # Distributed settings: one MPI process per GPU on every node.
    mpi_config = MpiConfiguration(
        node_count=args.num_nodes,
        process_count_per_node=sharedconfig.gpus_per_node,
    )

    # Assemble the argument list handed to the training script.
    total_gpus = args.num_nodes * sharedconfig.gpus_per_node
    script_args = ["--dataset", dataset_name]
    script_args += generate_training_opts(
        total_gpus, sharedconfig.ims_per_gpu, args.iter
    )
    script_args += ["PATHS_CATALOG", "./dataset_catalog.py"]

    run_config = ScriptRunConfig(
        source_directory="train",
        script="train_net_mount.py",
        compute_target=connector.cluster,
        environment=env,
        arguments=script_args,
        distributed_job_config=mpi_config,
    )

    # Tag the run with its parameters so they can be inspected later
    # without grepping the logs.
    run = experiment.submit(
        config=run_config,
        tags={
            "class": run_class,
            "vmtype": sharedconfig.vm_type,
            "num_nodes": args.num_nodes,
            "ims_per_gpu": sharedconfig.ims_per_gpu,
            "iter": args.iter,
        },
    )

    # Optionally stream the run output to the terminal until completion.
    if args.follow:
        run.wait_for_completion(show_output=True)
# Script entry point: only submit when executed directly, not on import.
if __name__ == "__main__":
    main()