-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluation.py
More file actions
344 lines (319 loc) · 12.1 KB
/
evaluation.py
File metadata and controls
344 lines (319 loc) · 12.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
import gc
import multiprocessing
import time
import os
import numpy as np
from utils import *
from scipy.spatial.distance import cdist
from sklearn import preprocessing
from sklearn.metrics.pairwise import euclidean_distances
import faiss
def test(embeds1, embeds2, top_k, threads_num, metric='inner', normalize=False, csls_k=0, accurate=True):
    """
    Evaluate alignment between two embedding matrices via ``greedy_alignment``.

    All arguments are forwarded positionally to ``greedy_alignment``.

    Returns
    -------
    tuple
        ``(alignment_rest, hits, mrr)`` as produced by ``greedy_alignment``;
        the mean-rank value it also returns is dropped here.
    """
    outcome = greedy_alignment(embeds1, embeds2, top_k, threads_num,
                               metric, normalize, csls_k, accurate)
    aligned_pairs, hits, _mean_rank, mrr = outcome
    return aligned_pairs, hits, mrr
def early_stop(flag1, flag2, flag):
    """
    Decide whether training should stop based on three successive scores.

    Stopping is signalled when ``flag <= flag2 <= flag1``, i.e. the newest
    value ``flag`` is not above ``flag2``, which in turn is not above ``flag1``.

    Returns
    -------
    tuple
        ``(flag2, flag, stop)`` where ``stop`` is True when the condition holds.
    """
    should_stop = bool(flag <= flag2 <= flag1)
    if should_stop:
        print("\n == should early stop == \n")
    return flag2, flag, should_stop
def eval_entity_alignment_faiss(Lvec, Rvec, k, eval_metric='inner', eval_normalize=True):
    """
    Rank-based alignment evaluation using an exact (flat) faiss index.

    Row ``i`` of ``Lvec`` is treated as the query whose correct answer is row
    ``i`` of ``Rvec``; the rank of that answer among the retrieved neighbours
    feeds Hits@1/5/10, mean rank and MRR.

    Parameters
    ----------
    Lvec : ndarray
        Query embeddings, one row per test entity.
    Rvec : ndarray
        Candidate embeddings that are added to the index.
    k : int
        Currently unused; kept so existing callers keep working.
    eval_metric : str
        ``"l2"`` (``faiss.IndexFlatL2``) or ``"inner"`` (``faiss.IndexFlatIP``).
    eval_normalize : bool
        If true, both matrices are passed through ``preprocessing.normalize``.

    Returns
    -------
    tuple
        ``([hits@1, hits@5, hits@10], mean_rank, mrr)``, each averaged over
        the number of rows in ``Lvec``.

    Raises
    ------
    ValueError
        If ``eval_metric`` is neither ``"l2"`` nor ``"inner"``.
    """
    # Validate up front: the previous `assert ValueError` was always truthy,
    # so a bad metric only surfaced later as an unbound `index` variable.
    if eval_metric not in ("l2", "inner"):
        raise ValueError(
            "eval_metric must be 'l2' or 'inner', got {!r}".format(eval_metric))

    if eval_normalize:
        Lvec = preprocessing.normalize(Lvec)
        Rvec = preprocessing.normalize(Rvec)

    test_num = Lvec.shape[0]
    dim = Rvec.shape[1]
    if eval_metric == "l2":
        index = faiss.IndexFlatL2(dim)  # create index base with fixed dimension
    else:
        index = faiss.IndexFlatIP(dim)

    index.add(np.ascontiguousarray(Rvec))  # add key to index base
    del Rvec  # drop the local reference; the index holds the vectors now

    # Retrieve test_num neighbours per query so the gold row can be located.
    _, neighbours = index.search(np.ascontiguousarray(Lvec), test_num)

    mrr = 0
    mean = 0
    hit_1_score = 0
    hit_5_score = 0
    hit_10_score = 0
    for idx in range(test_num):
        # 1-based position of the gold candidate in this query's result list.
        rank_index = np.where(neighbours[idx, :] == idx)[0][0] + 1
        mean += rank_index
        mrr += 1.0 / rank_index
        if rank_index <= 1:
            hit_1_score += 1
        if rank_index <= 5:
            hit_5_score += 1
        if rank_index <= 10:
            hit_10_score += 1

    mrr = mrr / test_num
    hit_1_score = hit_1_score / test_num
    hit_5_score = hit_5_score / test_num
    hit_10_score = hit_10_score / test_num
    mean = mean / test_num
    print('faiss: ', [hit_1_score, hit_5_score, hit_10_score], mean, mrr)
    return [hit_1_score, hit_5_score, hit_10_score], mean, mrr
def greedy_alignment(embed1, embed2, top_k, nums_threads, metric, normalize, csls_k, accurate):
    """
    Greedily align rows of ``embed1`` to rows of ``embed2`` and score the result.

    The gold counterpart of row ``i`` in ``embed1`` is row ``i`` in ``embed2``.

    Parameters
    ----------
    embed1 : matrix_like
        An embedding matrix of size n1*d, where n1 is the number of embeddings and d is the dimension.
    embed2 : matrix_like
        An embedding matrix of size n2*d, where n2 is the number of embeddings and d is the dimension.
    top_k : list of integers
        Hits@k cut-offs to report; must contain 1 (checked in ``calculate_rank``).
    nums_threads : int
        Number of worker processes; values <= 1 run in the current process.
    metric : string
        Forwarded to ``sim``; e.g. 'cosine', 'euclidean' or 'inner'.
    normalize : bool
        Whether to normalize the input embeddings.
    csls_k : int
        K value for csls. If k > 0, enhance the similarity by csls.
    accurate : bool
        Forwarded to ``calculate_rank``: full sort if true, partial partition otherwise.

    Returns
    -------
    alignment_rest : set of (gold index, top-1 predicted index) pairs
    hits : ndarray, Hits@k in percent (rounded to 3 decimals), one entry per ``top_k`` value
    mr : float, mean rank
    mrr : float, mean reciprocal rank
    """
    started = time.time()
    sim_mat = sim(embed1, embed2, metric=metric, normalize=normalize, csls_k=csls_k)
    num = sim_mat.shape[0]

    if nums_threads <= 1:
        mr, mrr, hits, alignment_rest = calculate_rank(list(range(num)), sim_mat, top_k, accurate, num)
    else:
        hits = [0] * len(top_k)
        mr = mrr = 0
        alignment_rest = set()
        search_tasks = task_divide(np.array(range(num)), nums_threads)
        pool = multiprocessing.Pool(processes=len(search_tasks))
        pending = [
            pool.apply_async(calculate_rank, (task, sim_mat[task, :], top_k, accurate, num))
            for task in search_tasks
        ]
        pool.close()
        pool.join()
        for job in pending:
            sub_mr, sub_mrr, sub_hits, sub_hits1_rest = job.get()
            # Each worker already divided by `num`, so partial results just add up.
            mr += sub_mr
            mrr += sub_mrr
            # list += ndarray dispatches to ndarray addition: element-wise sum.
            hits += np.array(sub_hits)
            alignment_rest |= sub_hits1_rest

    assert len(alignment_rest) == num
    hits = np.array(hits) / num * 100
    for pos in range(len(hits)):
        hits[pos] = round(hits[pos], 3)
    cost = time.time() - started

    if accurate and csls_k > 0:
        print("accurate results with csls: csls={}, hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s ".format(
            csls_k, top_k, hits, mr, mrr, cost))
    elif accurate:
        print("accurate results: hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s ".format(
            top_k, hits, mr, mrr, cost))
    elif csls_k > 0:
        print("quick results with csls: csls={}, hits@{} = {}%, time = {:.3f} s ".format(
            csls_k, top_k, hits, cost))
    else:
        print("quick results: hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s ".format(
            top_k, hits, mr, mrr, cost))

    # Release the (potentially large) similarity matrix before returning.
    del sim_mat
    gc.collect()
    return alignment_rest, hits, mr, mrr
def calculate_rank(idx, sim_mat, top_k, accurate, total_num):
    """
    Rank the gold candidate for a batch of query rows.

    Parameters
    ----------
    idx : sequence of int
        Gold column index for each row of ``sim_mat`` (row ``i`` belongs to ``idx[i]``).
    sim_mat : ndarray
        Similarity rows for the queries in ``idx``; larger means more similar.
    top_k : list of int
        Hits@k cut-offs; must contain 1.
    accurate : bool
        If true, fully sort each row; otherwise only partition around the
        ``top_k`` positions, so ranks beyond those positions are not exact.
    total_num : int
        Divisor applied to the rank sums (the overall number of queries).

    Returns
    -------
    mr : sum of 1-based ranks divided by ``total_num``
    mrr : sum of reciprocal ranks divided by ``total_num``
    hits : list of int, raw hit counts per ``top_k`` entry
    hits1_rest : set of (gold index, top-1 predicted index) pairs
    """
    assert 1 in top_k
    rank_total = 0
    reciprocal_total = 0
    hits = [0] * len(top_k)
    hits1_rest = set()
    for row, gold in enumerate(idx):
        neg_scores = -sim_mat[row, :]
        if accurate:
            rank = neg_scores.argsort()
        else:
            rank = np.argpartition(neg_scores, np.array(top_k) - 1)
        hits1_rest.add((gold, rank[0]))
        assert gold in rank
        position = np.where(rank == gold)[0][0]  # 0-based rank of the gold column
        rank_total += (position + 1)
        reciprocal_total += 1 / (position + 1)
        for slot, cutoff in enumerate(top_k):
            if position < cutoff:
                hits[slot] += 1
    return rank_total / total_num, reciprocal_total / total_num, hits, hits1_rest
def sim(embed1, embed2, metric='inner', normalize=False, csls_k=0):
    """
    Build the pairwise similarity matrix between two sets of embeddings.

    Parameters
    ----------
    embed1 : matrix_like
        An embedding matrix of size n1*d, where n1 is the number of embeddings and d is the dimension.
    embed2 : matrix_like
        An embedding matrix of size n2*d, where n2 is the number of embeddings and d is the dimension.
    metric : str, optional, inner default.
        'inner' uses a plain matrix product; 'cosine' uses the same product when
        ``normalize`` is true; 'euclidean' uses ``1 - euclidean_distances``;
        'manhattan' maps to scipy's 'cityblock'; any other value is passed
        straight to ``scipy.spatial.distance.cdist``.
    normalize : bool, optional, default false.
        Whether to normalize the input embeddings.
    csls_k : int, optional, 0 by default.
        K value for csls. If k > 0, enhance the similarity by csls.

    Returns
    -------
    sim_mat : An similarity matrix of size n1*n2. Distance-based branches are
        cast to float32; the matrix-product branch keeps the product's dtype.
    """
    if normalize:
        embed1 = preprocessing.normalize(embed1)
        embed2 = preprocessing.normalize(embed2)

    if metric == 'inner' or (metric == 'cosine' and normalize):
        # On normalized rows the inner product already is the cosine similarity.
        sim_mat = np.matmul(embed1, embed2.T)
    elif metric == 'euclidean':
        sim_mat = 1 - euclidean_distances(embed1, embed2)
        print(type(sim_mat), sim_mat.dtype)
        sim_mat = sim_mat.astype(np.float32)
    else:
        cdist_metric = 'cityblock' if metric == 'manhattan' else metric
        distances = cdist(embed1, embed2, metric=cdist_metric)
        sim_mat = (1 - distances).astype(np.float32)

    if csls_k > 0:
        sim_mat = csls_sim(sim_mat, csls_k)
    return sim_mat
def csls_sim(sim_mat, k):
    """
    Turn a similarity matrix into its CSLS-adjusted counterpart.

    Each entry becomes ``2 * sim[i, j] - r1[i] - r2[j]`` where ``r1`` holds the
    per-row and ``r2`` the per-column neighbourhood means returned by
    ``calculate_nearest_k``.

    Parameters
    ----------
    sim_mat : matrix-like
        A pairwise similarity matrix of size n1*n2.
    k : int
        The number of nearest neighbors.

    Returns
    -------
    csls_sim_mat : A csls similarity matrix of n1*n2.
    """
    row_means = calculate_nearest_k(sim_mat, k)
    col_means = calculate_nearest_k(sim_mat.T, k)
    # Work on the transpose first so row_means (length n1) broadcasts over the
    # last axis, then flip back so col_means (length n2) does the same.
    row_adjusted = 2 * sim_mat.T - row_means
    return row_adjusted.T - col_means
def calculate_nearest_k(sim_mat, k):
    """
    Return, for every row, the mean of its ``k`` largest similarity values.

    Parameters
    ----------
    sim_mat : ndarray
        A 2-D similarity matrix; larger values mean more similar.
    k : int
        Number of nearest neighbours to average; expected to be positive.
        Values larger than the number of columns are clamped to it.

    Returns
    -------
    ndarray
        1-D array with one mean per row of ``sim_mat``.
    """
    top = min(k, sim_mat.shape[1])
    # Partitioning the negated scores at position top-1 guarantees that the
    # first `top` columns are exactly the `top` largest similarities (in
    # arbitrary order). The previous pivot of k+1 only guaranteed that the
    # first k+2 columns held the k+2 largest, so slicing k of them could miss
    # true top-k values, and it raised when the matrix had <= k+1 columns.
    partitioned = -np.partition(-sim_mat, top - 1, axis=1)
    nearest_k = partitioned[:, 0:top]
    return np.mean(nearest_k, axis=1)
def csls_sim_multi_threads(sim_mat, k, nums_threads):
    """
    Compute per-row nearest-k means of ``sim_mat`` using worker processes.

    Row indices are split with ``task_divide`` (imported from ``utils``;
    presumably into roughly equal chunks, verify there) and each chunk of rows
    is handed to ``calculate_nearest_k`` in a ``multiprocessing.Pool``.

    Parameters
    ----------
    sim_mat : ndarray
        A pairwise similarity matrix.
    k : int
        The number of nearest neighbors.
    nums_threads : int
        Passed to ``task_divide`` to decide how the rows are split.

    Returns
    -------
    sim_values : 1-D array with one value per row of ``sim_mat``, in task order.
    """
    row_chunks = task_divide(np.array(range(sim_mat.shape[0])), nums_threads)
    pool = multiprocessing.Pool(processes=len(row_chunks))
    pending = [
        pool.apply_async(calculate_nearest_k, (sim_mat[chunk, :], k))
        for chunk in row_chunks
    ]
    pool.close()
    pool.join()

    sim_values = None
    for job in pending:
        part = job.get()
        sim_values = part if sim_values is None else np.append(sim_values, part)
    # Every row must have produced exactly one value.
    assert sim_values.shape[0] == sim_mat.shape[0]
    return sim_values
def save_results(folder, rest_12):
    """
    Write alignment pairs to ``<folder>/alignment_results_12``.

    Parameters
    ----------
    folder : str
        Output directory; created (including parents) if it does not exist.
    rest_12 : iterable of pairs or None
        Aligned pairs handed to ``pair2file``; ``None`` writes nothing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)
    # Join instead of concatenating so the file lands inside `folder` even when
    # the caller omits the trailing path separator.
    pair2file(os.path.join(folder, 'alignment_results_12'), rest_12)
    print("Results saved!")
def pair2file(file, pairs):
    """
    Write pairs to ``file`` as tab-separated lines, one pair per line.

    Parameters
    ----------
    file : str
        Destination path; opened for writing with utf8 encoding (overwrites).
    pairs : iterable of 2-item sequences or None
        Items are converted with ``str``. Nothing is written when ``None``.
    """
    if pairs is None:
        return
    with open(file, 'w', encoding='utf8') as f:
        f.writelines('\t'.join((str(left), str(right))) + '\n' for left, right in pairs)
def sim_handler(embed1, embed2, k, nums_threads):
    """
    Return the inner-product similarity matrix ``embed1 @ embed2.T``.

    Parameters
    ----------
    embed1 : ndarray
        Embedding matrix of size n1*d.
    embed2 : ndarray
        Embedding matrix of size n2*d.
    k : int
        Ignored. Kept for interface compatibility.
    nums_threads : int
        Ignored. Kept for interface compatibility.

    Returns
    -------
    sim_mat : ndarray of size n1*n2.

    Notes
    -----
    This function previously carried a CSLS re-weighting branch placed after an
    unconditional ``return``; it never executed and referred to a ``CSLS_sim``
    helper that is not defined in this module, so it has been removed. The
    returned values are unchanged.
    """
    return np.matmul(embed1, embed2.T)
def eval_alignment_by_sim_mat(embed1, embed2, top_k, nums_threads, csls=0, accurate=False, output=True):
    """
    Evaluate alignment from the similarity matrix built by ``sim_handler``.

    The gold counterpart of row ``i`` in ``embed1`` is row ``i`` in ``embed2``.
    Ranking is always distributed over a ``multiprocessing.Pool``.

    Parameters
    ----------
    embed1, embed2 : ndarray
        Embedding matrices handed to ``sim_handler``.
    top_k : list of int
        Hits@k cut-offs to report.
    nums_threads : int
        Passed to ``task_divide`` to split the query rows across workers.
    csls : int
        Forwarded to ``sim_handler`` as its ``k`` argument.
    accurate : bool
        Forwarded to ``cal_rank_by_sim_mat`` (full sort vs. partial partition).
    output : bool
        Whether to print a summary line.

    Returns
    -------
    t_prec_set : set of (gold index, top-1 predicted index) pairs
    hits1 : first entry of the Hits@k percentages (rounded to 2 decimals)
    """
    started = time.time()
    sim_mat = sim_handler(embed1, embed2, csls, nums_threads)
    ref_num = sim_mat.shape[0]

    t_num = [0] * len(top_k)
    t_mean = 0
    t_mrr = 0
    t_prec_set = set()

    tasks = task_divide(np.array(range(ref_num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(tasks))
    pending = [
        pool.apply_async(cal_rank_by_sim_mat, (task, sim_mat[task, :], top_k, accurate))
        for task in tasks
    ]
    pool.close()
    pool.join()

    for job in pending:
        mean, mrr, num, prec_set = job.get()
        t_mean += mean
        t_mrr += mrr
        # list += ndarray dispatches to ndarray addition: element-wise sum.
        t_num += np.array(num)
        t_prec_set |= prec_set
    assert len(t_prec_set) == ref_num

    acc = np.array(t_num) / ref_num * 100
    for pos in range(len(acc)):
        acc[pos] = round(acc[pos], 2)
    t_mean /= ref_num
    t_mrr /= ref_num

    if output and accurate:
        print("accurate results: hits@{} = {}, mr = {:.3f}, mrr = {:.3f}, time = {:.3f} s ".format(
            top_k, acc, t_mean, t_mrr, time.time() - started))
    elif output:
        print("hits@{} = {}, time = {:.3f} s ".format(top_k, acc, time.time() - started))

    hits1 = acc[0]
    # Release the (potentially large) similarity matrix before returning.
    del sim_mat
    gc.collect()
    return t_prec_set, hits1
def cal_rank_by_sim_mat(task, sim, top_k, accurate):
    """
    Rank the gold candidate for each query row of a task.

    Parameters
    ----------
    task : sequence of int
        Gold column index for each row of ``sim`` (row ``i`` belongs to ``task[i]``).
    sim : ndarray
        Similarity rows for the queries in ``task``; larger means more similar.
    top_k : list of int
        Hits@k cut-offs.
    accurate : bool
        If true, fully sort each row; otherwise only partition around the
        ``top_k`` positions, so ranks beyond those positions are not exact.

    Returns
    -------
    mean : sum of 1-based ranks (not averaged)
    mrr : sum of reciprocal ranks (not averaged)
    num : list of int, raw hit counts per ``top_k`` entry
    prec_set : set of (gold index, top-1 predicted index) pairs
    """
    rank_total = 0
    reciprocal_total = 0
    num = [0] * len(top_k)
    prec_set = set()
    for row, ref in enumerate(task):
        neg_scores = -sim[row, :]
        if accurate:
            rank = neg_scores.argsort()
        else:
            rank = np.argpartition(neg_scores, np.array(top_k) - 1)
        prec_set.add((ref, rank[0]))
        assert ref in rank
        position = np.where(rank == ref)[0][0]  # 0-based rank of the gold column
        rank_total += (position + 1)
        reciprocal_total += 1 / (position + 1)
        for slot, cutoff in enumerate(top_k):
            if position < cutoff:
                num[slot] += 1
    return rank_total, reciprocal_total, num, prec_set