-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathknn.py
More file actions
240 lines (184 loc) · 7.46 KB
/
knn.py
File metadata and controls
240 lines (184 loc) · 7.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
"""Functions to perform KNN classification with a split dataset.
The entry points are evaluate_knn() or evaluate_knn_optimized().
"""
import numpy as np
def distance_matrix(train, test):
"""Return the L2 Euclidean distance matrix between two matrixes of shape (n,m) and (m,p).
Parameters
----------
train : np.ndarray(np.float32)
The training images data matrix of shape (n,m).
test : np.ndarray(np.float32)
The test images data matrix of shape (m,p).
Returns
-------
np.ndarray(np.float32)
The L2 Euclidean distance matrix between train and test, of shape (n,p).
"""
print("Computing distance matrix between train and test sets")
train2 = train * train
test2 = test * test
train2_sum = np.sum(train2, axis=1, keepdims=True)
test2_sum = np.sum(test2, axis=1, keepdims=True)
product = -2 * np.matmul(train, test.T)
dists = np.sqrt(product + train2_sum + test2_sum.T)
return dists
def knn_predict(dists, labels_train, k=1):
"""Compute the k-nearest neighbors labels for each image of data_test using distances between training data and test data.
The distance minima are sorted along the y-axis, each column of dists corresponding to the distances between one test image and all training images.
Parameters
----------
dists : np.ndarray(np.float32)
The distance matrix between the train set and the test set.
labels_train : np.ndarray(np.int64)
The corresponding labels of training data.
k : int, optional
The chosen number of neighbors, by default 1.
Returns
-------
np.ndarray(np.int64)
The predicted labels for the images of data_test.
"""
print("Extracting labels of k nearest neighbors")
if k <= 0 or k > np.shape(dists)[0]:
return np.array([])
indexes_of_knn = np.argsort(dists, axis=0)[0:k, :]
labels_of_knn = labels_train[indexes_of_knn]
return labels_of_knn
def classify_with_mode(labels_of_knn):
"""Compute classification of test images according to the most common label of their k-nearest neighbors in training data.
Parameters
----------
labels_of_knn : np.ndarray(np.int64)
array of the computed possible labels for the images of data_test.
Returns
-------
np.ndarray(np.int64)
List of the most common labels in the passed array along to the y-axis.
"""
predicted_labels = []
for k in range(np.shape(labels_of_knn)[1]):
predicted_labels.append(np.array(np.bincount(labels_of_knn[:, k]).argmax()))
return np.array(predicted_labels)
def compute_accuracy(labels_test, computed_labels) -> float:
"""Compute the classification rate (accuracy) of a method by comparing computed labels to true labels of test data.
Parameters
----------
labels_test : np.ndarray(np.int64)
The true labels of test images.
computed_labels : np.ndarray(np.int64)
The computed labels of test images.
Returns
-------
float
The computed accuracy.
Raises
------
ValueError
In case the computed test labels and the true test labels arrays shapes do not match.
"""
if len(labels_test) != len(computed_labels):
raise ValueError("Input arrays shape do not match")
nb_labels = len(labels_test)
nb_well_classified = np.count_nonzero(labels_test == computed_labels)
accuracy = nb_well_classified / nb_labels
return accuracy
def evaluate_knn(data_train, labels_train, data_test, labels_test, k: int):
"""Evaluate the classification rate of one knn method given k.
Parameters
----------
data_train : np.ndarray(np.float32)
The training data, and
labels_train : np.ndarray(np.int64)
the corresponding labels.
data_test : np.ndarray(np.float32)
The test data, and
labels_test : np.ndarray(np.int64)
the corresponding labels.
k : int
The chosen number of neighbors.
Returns
-------
float
The classification rate (accuracy) of this k-nn method.
"""
L_data_train = np.shape(data_train)[0]
if k <= 0 or k > L_data_train:
return
dists = distance_matrix(data_train, data_test)
labels_of_knn = knn_predict(dists, labels_train, k)
computed_labels_for_test_images = classify_with_mode(labels_of_knn)
return compute_accuracy(labels_test, computed_labels_for_test_images)
def compute_accuracy_for_range_k(labels_test, computed_labels_for_k_max, k_max):
"""Compute the classification rate (accuracy) of a knn methods for k in range(1, k_max) by comparing computed labels to true labels of test data.
Parameters
----------
labels_test : np.ndarray(np.int64)
The true labels of test images.
computed_labels_for_k_max : np.ndarray(np.int64)
The computed labels of test images for k = k_max.
k_max : int
The maximum number of neighbors chosen.
Returns
-------
list(float)
The list of accuracies for knn methods for k in range (1, k_max).
"""
print("Computing accuracies")
nb_labels = len(labels_test)
# list of accuracies for all k in range (1, k_max)
accuracies = []
for k in range(k_max):
# extract column k representing the computed labels for data_test for k
computed_labels_for_k = computed_labels_for_k_max[k]
# get the number of well computed labels for k-neighbors
nb_well_classified_for_k = np.count_nonzero(
labels_test == computed_labels_for_k
)
# compute the accuracy for k-neighbors
accuracy_for_k = nb_well_classified_for_k / nb_labels
# insert accuracy in list
accuracies.append(accuracy_for_k)
return accuracies
def evaluate_knn_optimized(
data_train, labels_train, data_test, labels_test, k_max: int
):
"""Optimized version of evaluate_knn() that computes the classification rate of knn methods for k in range(1, k_max).
Parameters
----------
data_train : np.ndarray(np.float32)
The training data, and
labels_train : np.ndarray(np.int64)
the corresponding labels.
data_test : np.ndarray(np.float32)
The test data, and
labels_test : np.ndarray(np.int64)
the corresponding labels.
k_max : int
The maximum number of neighbors chosen.
Returns
-------
list(float)
The accuracy list of knn methods for k in range(1, k_max).
"""
print("Starting knn evaluation")
# check if k_max value is correct
L_data_train = np.shape(data_train)[0]
if k_max <= 0 or k_max > L_data_train:
return
dists = distance_matrix(data_train, data_test)
# get the computed labels for data_test, k = k_max
computed_labels_for_k_max_neighbors = knn_predict(dists, labels_train, k_max)
# retrieve the classification (most common computed label) for each image, and for each k in range(1, k_max)
# element k of computed_labels_for_test_images is an array containing the computed labels of test images for k
computed_labels_for_test_images = []
for i in range(1, k_max + 1):
computed_labels_for_test_images_for_k = classify_with_mode(
computed_labels_for_k_max_neighbors[:i, :]
)
computed_labels_for_test_images.append(computed_labels_for_test_images_for_k)
# compute the accuracy of the knn methods for all k in range (1, k_max)
accuracy_for_all_k = compute_accuracy_for_range_k(
labels_test, computed_labels_for_test_images, k_max
)
return accuracy_for_all_k