"""Functions to perform classification with Artificial Neural Networks.
The entry point is the function run_mlp_training().
You can either choose to use Mean Square Error (MSE) or Cross Entropy as the error function in train_mlp().
"""
import numpy as np
def softmax(x):
"""Compute the softmax values for each sets (rows) of scores in x.
Parameters
----------
x : np.ndarray(np.float32)
        A 2-D array containing, in each row, one image's correspondence scores for all labels.
Returns
-------
np.ndarray(np.float32)
The softmax activated scores.
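
    Examples
    --------
    A quick sanity check; each row of the result sums to 1:

    >>> softmax(np.array([[1.0, 2.0, 3.0]]))
    array([[0.09003057, 0.24472847, 0.66524096]])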
"""
    exps = np.exp(x - np.max(x, axis=1, keepdims=True))  # subtract the row-wise max for numerical stability
    return exps / np.sum(exps, axis=1, keepdims=True)
def one_hot(labels):
"""Return the one-hot matrix of label.
Parameters
----------
labels : np.ndarray(np.int64)
An (n)-D array of labels.
Returns
-------
np.ndarray(np.float64)
The corresponding (n+1)-D one-hot matrix.
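
    Examples
    --------
    >>> one_hot(np.array([0, 2, 1]))
    array([[1., 0., 0.],
           [0., 0., 1.],
           [0., 1., 0.]])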
"""
dimensions = np.max(labels) + 1
hot_labels = np.eye(dimensions)[labels]
return hot_labels
def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
"""Perform one gradient descent step using the Mean Square Error (MSE).
Parameters
----------
w1 : np.ndarray(np.float32)
The weight matrix (d_in x d_h) of the first (hidden) layer.
b1 : np.ndarray(np.float32)
        The bias matrix (1 x d_h) of the first (hidden) layer.
w2 : np.ndarray(np.float32)
The weight matrix (d_h x d_out) of the output layer.
b2 : np.ndarray(np.float32)
The bias matrix (1 x d_out) of the output layer.
data : np.ndarray(np.float32)
        The input matrix (batch_size x d_in) containing training images in rows.
    targets : np.ndarray(np.int64)
        The vector (batch_size) of corresponding labels for each image.
learning_rate : float
The learning rate.
Returns
-------
w1 : np.ndarray(np.float32)
The updated weight matrix (d_in x d_h) of the first (hidden) layer.
b1 : np.ndarray(np.float32)
The updated bias matrix (1, d_h) of the first (hidden) layer.
w2 : np.ndarray(np.float32)
The updated weight matrix (d_h x d_out) of the output layer.
b2 : np.ndarray(np.float32)
The updated bias matrix (1 x d_out) of the output layer.
    loss : float
        The value of the loss for one training epoch (computed before the update).
    accuracy : float
        The classification accuracy for one training epoch (computed before the update).
"""
    N = data.shape[0]
    # One-hot encode the integer labels so the targets match the network
    # output shape (batch_size x d_out): comparing sigmoid outputs directly
    # to raw integer labels would not be a meaningful MSE target.
    r_targets = one_hot(targets)
# Forward pass
a0 = data # the data are the input of the first layer
z1 = np.matmul(a0, w1) + b1 # input of the hidden layer
# output of the hidden layer (sigmoid activation function)
a1 = 1 / (1 + np.exp(-z1))
z2 = np.matmul(a1, w2) + b2 # input of the output layer
# output of the output layer (sigmoid activation function)
a2 = 1 / (1 + np.exp(-z2))
predictions = a2 # the predicted values are the outputs of the output layer
# Compute loss (MSE)
loss = np.mean(np.square(predictions - r_targets))
    # Compute accuracy
    classification = np.argmax(predictions, axis=1)
    accuracy = np.count_nonzero(classification == targets) / len(targets)
    print("MSE loss before the gradient step:", loss)
    print("Accuracy:", accuracy)
    # Gradient computation (backward pass)
    dC_da2 = 2 * (a2 - r_targets) / N  # the 1/N batch-averaging factor enters here
    dC_dz2 = dC_da2 * (a2 - np.square(a2))  # sigmoid derivative: a2 * (1 - a2)
    dC_dw2 = np.matmul(a1.T, dC_dz2)
    dC_db2 = np.sum(dC_dz2, axis=0, keepdims=True)  # dC_dz2 is already averaged, no extra 1/N
    dC_da1 = np.matmul(dC_dz2, w2.T)
    dC_dz1 = dC_da1 * (a1 - np.square(a1))  # sigmoid derivative: a1 * (1 - a1)
    dC_dw1 = np.matmul(a0.T, dC_dz1)
    dC_db1 = np.sum(dC_dz1, axis=0, keepdims=True)
w1 -= learning_rate * dC_dw1
b1 -= learning_rate * dC_db1
w2 -= learning_rate * dC_dw2
b2 -= learning_rate * dC_db2
return (w1, b1, w2, b2, loss, accuracy)
def learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate):
"""Perform one gradient descent step using Cross Entropy (softmax activation function + one hot encoding of labels).
Parameters
----------
w1 : np.ndarray(np.float32)
The weight matrix (d_in x d_h) of the first (hidden) layer.
b1 : np.ndarray(np.float32)
        The bias matrix (1 x d_h) of the first (hidden) layer.
w2 : np.ndarray(np.float32)
The weight matrix (d_h x d_out) of the output layer.
b2 : np.ndarray(np.float32)
The bias matrix (1 x d_out) of the output layer.
data : np.ndarray(np.float32)
        The input matrix (batch_size x d_in) containing training images in rows.
    labels_train : np.ndarray(np.int64)
        The vector (batch_size) of corresponding labels for each image.
learning_rate : float
The learning rate.
Returns
-------
w1 : np.ndarray(np.float32)
The updated weight matrix (d_in x d_h) of the first (hidden) layer.
b1 : np.ndarray(np.float32)
The updated bias matrix (1, d_h) of the first (hidden) layer.
w2 : np.ndarray(np.float32)
The updated weight matrix (d_h x d_out) of the output layer.
b2 : np.ndarray(np.float32)
The updated bias matrix (1 x d_out) of the output layer.
    loss : float
        The value of the loss for one training epoch (computed before the update).
    accuracy : float
        The classification accuracy for one training epoch (computed before the update).
"""
N = data.shape[0]
# Forward pass
a0 = data # the data are the input of the first layer
z1 = np.matmul(a0, w1) + b1 # input of the hidden layer
# output of the hidden layer (sigmoid activation function)
a1 = 1 / (1 + np.exp(-z1))
z2 = np.matmul(a1, w2) + b2 # input of the output layer
# output of the output layer (softmax activation function)
a2 = softmax(z2)
predictions = a2 # the predicted values are the outputs of the output layer
targets_one_hot = one_hot(labels_train)
    # Compute loss (cross entropy): sum over classes, mean over the batch,
    # so the loss is consistent with the dC_dz2 = a2 - y gradient below
    loss = -np.sum(targets_one_hot * np.log(predictions)) / N
    # Compute accuracy
    classification = np.argmax(predictions, axis=1)
    accuracy = np.count_nonzero(classification == labels_train) / len(labels_train)
    print("Cross-entropy loss before the gradient step:", loss)
    print("Accuracy:", accuracy)
    # Gradient computation (backward pass)
    # With softmax outputs and cross-entropy loss,
    # $\frac{\partial C}{\partial Z^{(2)}} = A^{(2)} - Y$.
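    # Sketch of why this holds: the per-sample loss is
    # C_i = -sum_k y_ik * log(a2_ik) with a2 = softmax(z2); differentiating
    # the log-softmax and using sum_k y_ik = 1 collapses the terms to
    # dC_i/dz2 = a2 - y (the 1/N batch average is applied to dC_dw and dC_db below).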
dC_dz2 = a2 - targets_one_hot
dC_dw2 = np.matmul(a1.T, dC_dz2) / N
dC_db2 = np.sum(dC_dz2, axis=0, keepdims=True) / N
dC_da1 = np.matmul(dC_dz2, w2.T)
dC_dz1 = dC_da1 * (a1 - np.square(a1))
dC_dw1 = np.matmul(a0.T, dC_dz1) / N
dC_db1 = np.sum(dC_dz1, axis=0, keepdims=True) / N
w1 -= learning_rate * dC_dw1
b1 -= learning_rate * dC_db1
w2 -= learning_rate * dC_dw2
b2 -= learning_rate * dC_db2
return (w1, b1, w2, b2, loss, accuracy)
def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epochs):
"""Perform num_epoch training steps with given learning_rate and loss function.
Parameters
----------
w1 : np.ndarray(np.float32)
The weight matrix (d_in x d_h) of the first (hidden) layer.
b1 : np.ndarray(np.float32)
        The bias matrix (1 x d_h) of the first (hidden) layer.
w2 : np.ndarray(np.float32)
The weight matrix (d_h x d_out) of the output layer.
b2 : np.ndarray(np.float32)
The bias matrix (1 x d_out) of the output layer.
data_train : np.ndarray(np.float32)
        The input matrix (batch_size x d_in) containing training images in rows.
labels_train : np.ndarray(np.int64)
The vector (batch_size) of corresponding labels for each training image.
learning_rate : float
The learning rate.
num_epochs : int
        The number of training epochs to perform.
Returns
-------
w1 : np.ndarray(np.float32)
The updated weight matrix (d_in x d_h) of the first (hidden) layer after complete training.
b1 : np.ndarray(np.float32)
        The updated bias matrix (1 x d_h) of the first (hidden) layer after complete training.
w2 : np.ndarray(np.float32)
The updated weight matrix (d_h x d_out) of the output layer after complete training.
b2 : np.ndarray(np.float32)
The updated bias matrix (1 x d_out) of the output layer after complete training.
    train_accuracies : np.ndarray(np.float32)
        A vector containing, for each epoch, the training accuracy measured before that epoch's update.
    train_losses : np.ndarray(np.float32)
        A vector containing, for each epoch, the training loss measured before that epoch's update.
"""
train_accuracies = np.zeros((num_epochs, 1))
train_losses = np.zeros((num_epochs, 1))
for k in range(num_epochs):
print("Training MLP for epoch number :", k)
(w1, b1, w2, b2, loss, accuracy) = learn_once_cross_entropy(
w1, b1, w2, b2, data_train, labels_train, learning_rate
)
train_losses[k] = loss
train_accuracies[k] = accuracy
return (w1, b1, w2, b2, train_accuracies, train_losses)
def test_mlp(w1, b1, w2, b2, data_test, labels_test):
"""Test the trained network on the test set.
Parameters
----------
w1 : np.ndarray(np.float32)
The weight matrix (d_in x d_h) of the first (hidden) layer.
b1 : np.ndarray(np.float32)
        The bias matrix (1 x d_h) of the first (hidden) layer.
w2 : np.ndarray(np.float32)
The weight matrix (d_h x d_out) of the output layer.
b2 : np.ndarray(np.float32)
The bias matrix (1 x d_out) of the output layer.
data_test : np.ndarray(np.float32)
        The input matrix (batch_size x d_in) containing test images in rows.
    labels_test : np.ndarray(np.int64)
        The vector (batch_size) of corresponding labels for each image.
Returns
-------
float
The testing accuracy.
"""
    # Forward pass
    a0 = data_test  # the data are the input of the first layer
    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
    # output of the hidden layer (sigmoid activation function)
    a1 = 1 / (1 + np.exp(-z1))
    z2 = np.matmul(a1, w2) + b2  # input of the output layer
    # output of the output layer; softmax mirrors the training forward pass,
    # and the argmax below is unaffected by this monotone rescaling
    a2 = softmax(z2)
    predictions = a2  # the predicted values are the outputs of the output layer
    classification = np.argmax(predictions, axis=1)
    nb_labels = len(labels_test)
    nb_well_classified = np.count_nonzero(classification == labels_test)
    accuracy = nb_well_classified / nb_labels
return accuracy
def run_mlp_training(
data_train, labels_train, data_test, labels_test, d_h, learning_rate, num_epochs
):
"""Train an MLP classifier and test the trained network (weights and biases) on the test set.
Parameters
----------
    data_train : np.ndarray(np.float32)
        The input matrix (batch_size x d_in) containing training images in rows.
    labels_train : np.ndarray(np.int64)
        The vector (batch_size) of corresponding labels for each training image.
    data_test : np.ndarray(np.float32)
        The input matrix (batch_size x d_in) containing test images in rows.
    labels_test : np.ndarray(np.int64)
        The vector (batch_size) of corresponding labels for each test image.
d_h : int
The number of neurons in the hidden layer.
learning_rate : float
The learning rate.
num_epochs : int
        The number of training epochs to perform.
Returns
-------
    train_accuracies : np.ndarray(np.float32)
        A vector containing, for each epoch, the training accuracy measured before that epoch's update.
    train_losses : np.ndarray(np.float32)
        A vector containing, for each epoch, the training loss measured before that epoch's update.
    final_accuracy : float
        The testing accuracy.
"""
    d_in = np.shape(data_train)[1]
    d_out = 10  # number of classes
    # Random initialization of the network weights and biases
    w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights, uniform in [-1, 1)
    b1 = np.zeros((1, d_h))  # first layer biases
    w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights, uniform in [-1, 1)
    b2 = np.zeros((1, d_out))  # second layer biases
(w1, b1, w2, b2, train_accuracies, train_losses) = train_mlp(
w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epochs
)
final_accuracy = test_mlp(w1, b1, w2, b2, data_test, labels_test)
return (train_accuracies, train_losses, final_accuracy)
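

if __name__ == "__main__":
    # Minimal smoke test on synthetic random data: a sketch, where the shapes
    # and hyperparameters are illustrative and the 10 classes simply match
    # the d_out hard-coded in run_mlp_training(); no real dataset is assumed.
    rng = np.random.default_rng(0)
    n_train, n_test, d_in, n_classes = 64, 32, 48, 10
    data_train = rng.random((n_train, d_in)).astype(np.float32)
    labels_train = np.arange(n_train) % n_classes  # every class represented
    data_test = rng.random((n_test, d_in)).astype(np.float32)
    labels_test = np.arange(n_test) % n_classes

    # Exercise one MSE gradient step directly.
    d_h = 16
    w1 = 2 * np.random.rand(d_in, d_h) - 1
    b1 = np.zeros((1, d_h))
    w2 = 2 * np.random.rand(d_h, n_classes) - 1
    b2 = np.zeros((1, n_classes))
    learn_once_mse(w1, b1, w2, b2, data_train, labels_train, learning_rate=0.1)

    # Full training run with the cross-entropy loss, then evaluation.
    train_accuracies, train_losses, final_accuracy = run_mlp_training(
        data_train, labels_train, data_test, labels_test,
        d_h=d_h, learning_rate=0.1, num_epochs=5,
    )
    print("Final test accuracy on random data:", final_accuracy)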