Added arguments for setting the batch norm momentum and eps. Made the optimizer choice easier in main.py.

gabikadlecova · gabikadlecova · commit 4020a3febf08 · 2022-11-10T14:25:33.000+01:00
diff --git a/main.py b/main.py
@@ -56,9 +56,12 @@ def reload_checkpoint(path, device=None):
     parser.add_argument('--num_workers', default=0, type=int, help="Number of parallel workers for the train dataset.")
     parser.add_argument('--learning_rate', default=0.025, type=float, help='base learning rate')
     parser.add_argument('--lr_decay_method', default='COSINE_BY_STEP', type=str, help='learning decay method')
+    parser.add_argument('--optimizer', default='sgd', type=str, help='Optimizer (sgd or rmsprop)')
     parser.add_argument('--momentum', default=0.9, type=float, help='momentum')
     parser.add_argument('--weight_decay', default=1e-4, type=float, help='L2 regularization weight')   
     parser.add_argument('--grad_clip', default=5, type=float, help='gradient clipping')
+    parser.add_argument('--batch_norm_momentum', default=0.1, type=float, help='Batch normalization momentum')
+    parser.add_argument('--batch_norm_eps', default=1e-5, type=float, help='Batch normalization epsilon')
     parser.add_argument('--load_checkpoint', default='', type=str, help='Reload model from checkpoint')
     parser.add_argument('--num_labels', default=10, type=int, help='#classes')
     parser.add_argument('--device', default='cuda', type=str, help='Device for network training.')
@@ -77,14 +80,23 @@ def reload_checkpoint(path, device=None):
     # model
     spec = ModelSpec(matrix, operations)
     net = Network(spec, num_labels=args.num_labels, in_channels=args.in_channels, stem_out_channels=args.stem_out_channels,
-                  num_stacks=args.num_stacks, num_modules_per_stack=args.num_modules_per_stack)
+                  num_stacks=args.num_stacks, num_modules_per_stack=args.num_modules_per_stack,
+                  momentum=args.batch_norm_momentum, eps=args.batch_norm_eps)
 
     if args.load_checkpoint != '':
         net.load_state_dict(reload_checkpoint(args.load_checkpoint))
     net.to(args.device)
 
     criterion = nn.CrossEntropyLoss()
-    optimizer = optim.SGD(net.parameters(), lr=args.learning_rate, momentum=args.momentum,
+
+    if args.optimizer.lower() == 'sgd':
+        optimizer = optim.SGD
+    elif args.optimizer.lower() == 'rmsprop':
+        optimizer = optim.RMSprop
+    else:
+        raise ValueError(f"Invalid optimizer {args.optimizer}, possible: SGD, RMSProp")
+
+    optimizer = optimizer(net.parameters(), lr=args.learning_rate, momentum=args.momentum,
                           weight_decay=args.weight_decay)
     scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs)
 
diff --git a/nasbench_pytorch/model/base_ops.py b/nasbench_pytorch/model/base_ops.py
@@ -9,12 +9,12 @@
 import torch.nn.functional as F
 
 class ConvBnRelu(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, momentum=0.1, eps=1e-5):
         super(ConvBnRelu, self).__init__()
 
         self.conv_bn_relu = nn.Sequential(
             nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False),
-            nn.BatchNorm2d(out_channels),
+            nn.BatchNorm2d(out_channels, eps=eps, momentum=momentum),
             nn.ReLU()
         )
 
@@ -23,21 +23,21 @@ def forward(self, x):
 
 class Conv3x3BnRelu(nn.Module):
     """3x3 convolution with batch norm and ReLU activation."""
-    def __init__(self, in_channels, out_channels):
+    def __init__(self, in_channels, out_channels, **kwargs):
         super(Conv3x3BnRelu, self).__init__()
 
-        self.conv3x3 = ConvBnRelu(in_channels, out_channels, 3, 1, 1)
+        self.conv3x3 = ConvBnRelu(in_channels, out_channels, 3, 1, 1, **kwargs)
 
     def forward(self, x):
         x = self.conv3x3(x)
         return x
 
 class Conv1x1BnRelu(nn.Module):
     """1x1 convolution with batch norm and ReLU activation."""
-    def __init__(self, in_channels, out_channels):
+    def __init__(self, in_channels, out_channels, **kwargs):
         super(Conv1x1BnRelu, self).__init__()
 
-        self.conv1x1 = ConvBnRelu(in_channels, out_channels, 1, 1, 0)
+        self.conv1x1 = ConvBnRelu(in_channels, out_channels, 1, 1, 0, **kwargs)
 
     def forward(self, x):
         x = self.conv1x1(x)
diff --git a/nasbench_pytorch/model/model.py b/nasbench_pytorch/model/model.py
@@ -25,7 +25,7 @@
 
 class Network(nn.Module):
     def __init__(self, spec, num_labels=10,
-                 in_channels=3, stem_out_channels=128, num_stacks=3, num_modules_per_stack=3):
+                 in_channels=3, stem_out_channels=128, num_stacks=3, num_modules_per_stack=3, momentum=0.1, eps=1e-5):
         """
 
         Args:
@@ -49,7 +49,7 @@ def __init__(self, spec, num_labels=10,
 
         # initial stem convolution
         out_channels = stem_out_channels
-        stem_conv = ConvBnRelu(in_channels, out_channels, 3, 1, 1)
+        stem_conv = ConvBnRelu(in_channels, out_channels, 3, 1, 1, momentum=momentum, eps=eps)
         self.layers.append(stem_conv)
 
         # stacked cells
@@ -63,7 +63,7 @@ def __init__(self, spec, num_labels=10,
                 out_channels *= 2
 
             for module_num in range(num_modules_per_stack):
-                cell = Cell(spec, in_channels, out_channels)
+                cell = Cell(spec, in_channels, out_channels, momentum=momentum, eps=eps)
                 self.layers.append(cell)
                 in_channels = out_channels
 
@@ -102,7 +102,7 @@ class Cell(nn.Module):
     determined via equally splitting the channel count whenever there is a
     concatenation of Tensors.
     """
-    def __init__(self, spec, in_channels, out_channels):
+    def __init__(self, spec, in_channels, out_channels, momentum=0.1, eps=1e-5):
         super(Cell, self).__init__()
 
         self.dev_param = nn.Parameter(torch.empty(0))
@@ -124,7 +124,7 @@ def __init__(self, spec, in_channels, out_channels):
         self.input_op = nn.ModuleList([Placeholder()])
         for t in range(1, self.num_vertices):
             if self.matrix[0, t]:
-                self.input_op.append(Projection(in_channels, self.vertex_channels[t]))
+                self.input_op.append(Projection(in_channels, self.vertex_channels[t], momentum=momentum, eps=eps))
             else:
                 self.input_op.append(Placeholder())
 
@@ -179,9 +179,11 @@ def forward(self, x):
 
         return outputs
 
-def Projection(in_channels, out_channels):
+
+def Projection(in_channels, out_channels, momentum=0.1, eps=1e-5):
     """1x1 projection (as in ResNet) followed by batch normalization and ReLU."""
-    return ConvBnRelu(in_channels, out_channels, 1)
+    return ConvBnRelu(in_channels, out_channels, 1, momentum=momentum, eps=eps)
+
 
 def Truncate(inputs, channels):
     """Slice the inputs to channels if necessary."""
@@ -197,6 +199,7 @@ def Truncate(inputs, channels):
         assert input_channels - channels == 1
         return inputs[:, :channels, :, :]
 
+
 def ComputeVertexChannels(in_channels, out_channels, matrix):
     """Computes the number of channels at every vertex.