-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path blobmount_submit_maskrcnn.py
More file actions
executable file
·145 lines (114 loc) · 4.31 KB
/
blobmount_submit_maskrcnn.py
File metadata and controls
executable file
·145 lines (114 loc) · 4.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python3
import argparse
from azureml.core import Experiment, ScriptRunConfig
from azureml.core.runconfig import MpiConfiguration
from common import (
get_or_create_workspace,
create_or_update_environment,
create_or_update_cluster,
)
import sharedconfig
k_runclass = "Mount"
def generate_training_opts(
    num_gpus,
    ims_per_gpu,
    max_iter,
    per_epoch_eval=False,
    config_file="./benchmark_mask_rcnn_R_50_FPN.yaml",
):
    """Build the common Mask R-CNN command-line option list.

    Args:
        num_gpus (int): Total number of GPUs across all nodes.
        ims_per_gpu (int): Images processed per GPU per iteration.
        max_iter (int): Number of training iterations to run.
        per_epoch_eval (bool): Whether to run evaluation every epoch.
        config_file (str): Path to the maskrcnn-benchmark YAML config
            (defaults to the benchmark config used by this repo).

    Returns:
        list[str]: Flat token list suitable for passing as script arguments,
        e.g. ["--config-file", <path>, "SOLVER.IMS_PER_BATCH", "16", ...].
    """
    # Global batch size is the same for training and test in this benchmark.
    batch_size = str(num_gpus * ims_per_gpu)
    return [
        "--config-file", config_file,
        "SOLVER.IMS_PER_BATCH", batch_size,
        "SOLVER.MAX_ITER", str(max_iter),
        "TEST.IMS_PER_BATCH", batch_size,
        "PER_EPOCH_EVAL", str(bool(per_epoch_eval)),
    ]
def parse_command_line_args():
    """Parse and return this submitter's command-line arguments."""
    argparser = argparse.ArgumentParser(
        description="Submit benchmark runs using mounted blob"
    )
    argparser.add_argument("num_nodes", type=int, help="Number of nodes")
    # Storage tier is optional and at most one may be selected; the
    # default (neither flag) means the hot tier.
    storage_tier = argparser.add_mutually_exclusive_group()
    storage_tier.add_argument(
        "--premium", action="store_true", help="Use premium storage"
    )
    storage_tier.add_argument(
        "--cool", action="store_true", help="Use cool storage"
    )
    argparser.add_argument(
        "--follow", action="store_true", help="Follow run output"
    )
    argparser.add_argument(
        "--iter",
        type=int,
        default=sharedconfig.max_iter,
        help="Number of training iterations",
    )
    return argparser.parse_args()
def main():
    """Submit a blob-mount Mask R-CNN benchmark run to AzureML."""
    args = parse_command_line_args()

    # Start from the default (hot-tier) run class and dataset, then
    # override if a different storage tier was requested.
    run_class = k_runclass
    dataset_name = sharedconfig.dataset_hot
    if args.premium:
        run_class += "_premium"
        dataset_name = sharedconfig.dataset_premium
    if args.cool:
        run_class += "_cool"
        dataset_name = sharedconfig.dataset_cool

    # Resolve the AzureML workspace handle.
    ws = get_or_create_workspace(
        sharedconfig.subscription,
        sharedconfig.resource_group,
        sharedconfig.workspace_name,
    )

    # NOTE: this returns a ClusterConnector wrapper, *not* an
    # azureml.core.compute.AmlCompute — see clusterconnector.py.
    connector = create_or_update_cluster(
        ws,
        sharedconfig.cluster_name,
        args.num_nodes,
        sharedconfig.ssh_key,
        sharedconfig.vm_type,
        terminate_on_failure=True,
        use_beeond=False,
    )

    env = create_or_update_environment(
        ws, sharedconfig.environment_name, sharedconfig.docker_image
    )
    experiment = Experiment(workspace=ws, name=sharedconfig.experiment_name)

    # Distributed settings: one MPI process per GPU on every node.
    mpi_config = MpiConfiguration(
        node_count=args.num_nodes,
        process_count_per_node=sharedconfig.gpus_per_node,
    )

    # Assemble the argument list handed to the training script.
    total_gpus = args.num_nodes * sharedconfig.gpus_per_node
    script_args = ["--dataset", dataset_name]
    script_args += generate_training_opts(
        total_gpus, sharedconfig.ims_per_gpu, args.iter
    )
    script_args += ["PATHS_CATALOG", "./dataset_catalog.py"]

    run_config = ScriptRunConfig(
        source_directory="train",
        script="train_net_mount.py",
        compute_target=connector.cluster,
        environment=env,
        arguments=script_args,
        distributed_job_config=mpi_config,
    )

    # Tag the run with its parameters so they can be inspected later
    # without grepping the logs.
    run = experiment.submit(
        config=run_config,
        tags={
            "class": run_class,
            "vmtype": sharedconfig.vm_type,
            "num_nodes": args.num_nodes,
            "ims_per_gpu": sharedconfig.ims_per_gpu,
            "iter": args.iter,
        },
    )

    # Optionally stream the run output to the terminal until completion.
    if args.follow:
        run.wait_for_completion(show_output=True)
# Script entry point: only submit when executed directly, not on import.
if __name__ == "__main__":
    main()