# Source: Disentangled-Variational-Autoencoder/src/models/model_utils.py (main branch, AndrewSpano/Disentangled-Variational-Autoencoder on GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import sys
sys.path.append("../utils")

import torch
import torch.nn as nn

from errors import InvalidArchitectureError


def compute_output_shape(current_shape, kernel_size, stride, padding):
    """
    :param tuple current_shape:    The current shape of the data before a convolution is applied.
    :param tuple|int kernel_size:  The kernel size of the current convolution operation.
    :param tuple|int stride:       The stride of the current convolution operation.
    :param tuple|int padding:      The padding of the current convolution operation.

    :return:  The shape after a convolution operation with the above parameters is applied.
    :rtype:   tuple

            The formula used to compute the final shape is

        component[i] = floor((N[i] - K[i] + 2 * P[i]) / S[i]) + 1

        where, N = current shape of the data
               K = kernel size
               P = padding
               S = stride

    The kernel size, stride and padding may each be given either as a sequence with one entry per
    dimension of the data, or as a single int that is used for every dimension (the same
    convention that torch.nn.Conv2d follows). Dilation is not part of the formula, i.e. a dilation
    of 1 is assumed.
    """
    dimensions = len(current_shape)

    def _per_dimension(value):
        # a lone int means "use this value for every dimension of the data"
        return (value,) * dimensions if isinstance(value, int) else value

    kernel_size = _per_dimension(kernel_size)
    stride = _per_dimension(stride)
    padding = _per_dimension(padding)

    # compute each component of the new shape using the above formula
    return tuple((size - kernel_size[i] + 2 * padding[i]) // stride[i] + 1
                 for i, size in enumerate(current_shape))


def compute_transpose_output_shape(current_shape, kernel_size, stride, padding):
    """
    :param tuple current_shape:    The current shape of the data before a transpose convolution is
                                     applied.
    :param tuple|int kernel_size:  The kernel size of the current transpose convolution operation.
    :param tuple|int stride:       The stride of the current transpose convolution operation.
    :param tuple|int padding:      The padding of the current transpose convolution operation.

    :return:  The shape after a transpose convolution operation with the above parameters is
                applied.
    :rtype:   tuple

            The formula used to compute the final shape is

        component[i] = (N[i] - 1) * S[i] - 2 * P[i] + (K[i] - 1) + 1

        where, N = current shape of the data
               K = kernel size
               P = padding
               S = stride

    The kernel size, stride and padding may each be given either as a sequence with one entry per
    dimension of the data, or as a single int that is used for every dimension (the same
    convention that torch.nn.ConvTranspose2d follows). Neither dilation nor output padding is part
    of the formula, i.e. a dilation of 1 and an output padding of 0 are assumed.
    """
    dimensions = len(current_shape)

    def _per_dimension(value):
        # a lone int means "use this value for every dimension of the data"
        return (value,) * dimensions if isinstance(value, int) else value

    kernel_size = _per_dimension(kernel_size)
    stride = _per_dimension(stride)
    padding = _per_dimension(padding)

    # compute each component of the new shape using the above formula
    return tuple((size - 1) * stride[i] - 2 * padding[i] + (kernel_size[i] - 1) + 1
                 for i, size in enumerate(current_shape))


def compute_output_padding(current_shape, target_shape):
    """
    :param tuple current_shape:  The shape that the data has once a transpose convolution
                                   operation has been applied to it.
    :param tuple target_shape:   The shape that the data should have once the transpose
                                   convolution operation has been applied to it.

    :return:  The per-dimension output padding, i.e. how much is missing in every dimension of the
                current shape in order to reach the target shape.
    :rtype:   tuple
    """
    output_padding = []
    for axis, produced in enumerate(current_shape):
        # whatever the target has more than the produced shape must be padded
        output_padding.append(target_shape[axis] - produced)
    return tuple(output_padding)


def invalid_shape(current_shape):
    """
    :param tuple current_shape:  The current shape of the data after a convolution is applied.

    :return:  True if at least one component of the shape is zero or negative, which makes the
                shape invalid. False if every component is strictly positive.
    :rtype:   bool
    """
    # a single non-positive component is enough to invalidate the whole shape
    return any(component <= 0 for component in current_shape)


def create_encoder(architecture, input_shape):
    """
    :param dict architecture:  A dictionary containing the hyperparameters that define the
                                 architecture of the model. The keys read here are "conv_layers",
                                 "conv_channels", "conv_kernel_sizes", "conv_strides" and
                                 "conv_paddings".
    :param tuple input_shape:  A tuple that corresponds to the shape of the input. Its first entry
                                 is used as the number of input channels, the next two as the
                                 spatial shape of the data.

    :return:  A PyTorch Sequential model that represents the encoder part of a VAE, along with a
                list of spatial shapes: the input shape followed by the shape produced by every
                convolutional set.
    :rtype:   (torch.nn.Sequential, list)

    :raises InvalidArchitectureError:  If the shape produced by some layer has a component that is
                                         zero or negative.

    Builds the encoder as a stack of Convolution - Leaky ReLU - Batch Normalization sets, one set
    per layer described in the architecture.
    """
    channels = input_shape[0]
    spatial_shape = (input_shape[1], input_shape[2])

    # keep track of the shape after every set, starting from the shape of the input itself
    shapes = [spatial_shape]
    blocks = []

    for idx in range(architecture["conv_layers"]):

        # hyperparameters of the current layer
        next_channels = architecture["conv_channels"][idx]
        conv_params = {
            "kernel_size": architecture["conv_kernel_sizes"][idx],
            "stride": architecture["conv_strides"][idx],
            "padding": architecture["conv_paddings"][idx],
        }

        # Convolution - Leaky ReLU - Batch Normalization
        blocks.append(nn.Sequential(
            nn.Conv2d(in_channels=channels, out_channels=next_channels, **conv_params),
            nn.LeakyReLU(negative_slope=0.15),
            nn.BatchNorm2d(next_channels),
        ))

        # follow the shape of the data through the convolution that was just added
        spatial_shape = compute_output_shape(current_shape=spatial_shape, **conv_params)
        shapes.append(spatial_shape)

        # a non-positive component means that the architecture can not be applied to this input
        if invalid_shape(spatial_shape):
            raise InvalidArchitectureError(shape=spatial_shape, layer=idx + 1)

        # what this layer outputs is what the next layer receives
        channels = next_channels

    return nn.Sequential(*blocks), shapes


def create_decoder(architecture, encoder_shapes):
    """
    :param dict architecture:    A dictionary containing the hyperparameters that define the
                                   architecture of the model. The keys read here are "conv_layers",
                                   "conv_channels", "conv_kernel_sizes", "conv_strides" and
                                   "conv_paddings".
    :param list encoder_shapes:  A list that contains the shape of the data after it is applied to
                                   every set of convolutional layers; entry i is the shape that
                                   goes into layer i and entry i + 1 the shape that comes out.

    :return:  A PyTorch Sequential model that represents the decoder part of a VAE.
    :rtype:   torch.nn.Sequential

    Builds the decoder as a stack of Transpose Convolution - Leaky ReLU - Batch Normalization sets.
    The layers of the architecture are walked from last to first, and every set gets the output
    padding needed to map the shape at index i + 1 of encoder_shapes back to the shape at index i.
    """
    blocks = []

    # the first set receives as many channels as the last entry of the channel list
    channels = architecture["conv_channels"][-1]

    for idx in reversed(range(architecture["conv_layers"])):

        # hyperparameters of the current layer
        next_channels = architecture["conv_channels"][idx]
        conv_params = {
            "kernel_size": architecture["conv_kernel_sizes"][idx],
            "stride": architecture["conv_strides"][idx],
            "padding": architecture["conv_paddings"][idx],
        }

        # the plain transpose convolution may fall short of the shape that the encoder had at
        # this depth, so the difference is handed to the layer as output padding
        shape_in = encoder_shapes[idx + 1]
        shape_wanted = encoder_shapes[idx]
        shape_out = compute_transpose_output_shape(current_shape=shape_in, **conv_params)
        output_padding = compute_output_padding(shape_out, shape_wanted)

        # Transpose Convolution - Leaky ReLU - Batch Normalization
        blocks.append(nn.Sequential(
            nn.ConvTranspose2d(in_channels=channels,
                               out_channels=next_channels,
                               output_padding=output_padding,
                               **conv_params),
            nn.LeakyReLU(negative_slope=0.15),
            nn.BatchNorm2d(next_channels),
        ))

        # what this layer outputs is what the next layer receives
        channels = next_channels

    return nn.Sequential(*blocks)


def create_output_layer(architecture, input_shape):
    """
    :param dict architecture:  A dictionary containing the hyperparameters that define the
                                 architecture of the model. Only the first entry of
                                 "conv_channels", "conv_kernel_sizes", "conv_strides" and
                                 "conv_paddings" is read here.
    :param tuple input_shape:  A tuple that corresponds to the shape of the input. Its first entry
                                 is used as the number of channels of the reconstruction.

    :return:  A PyTorch Sequential model that represents the output layer of a VAE.
    :rtype:   torch.nn.Sequential

    Builds the layer that receives the output of the decoder and produces the reconstruction. It is
    a Transpose Convolution - SELU - Batch Normalization - Convolution - Sigmoid stack, in which
    both convolutions use the hyperparameters of the first layer of the architecture.
    """
    # the transpose convolution keeps the number of channels of the first layer unchanged
    channels = architecture["conv_channels"][0]
    conv_params = {
        "kernel_size": architecture["conv_kernel_sizes"][0],
        "stride": architecture["conv_strides"][0],
        "padding": architecture["conv_paddings"][0],
    }

    layers = [
        nn.ConvTranspose2d(in_channels=channels, out_channels=channels, **conv_params),
        nn.SELU(),
        nn.BatchNorm2d(channels),
        # the convolution goes from the channels of the first layer to those of the input
        nn.Conv2d(in_channels=channels, out_channels=input_shape[0], **conv_params),
        nn.Sigmoid(),
    ]
    return nn.Sequential(*layers)