Skip to content

Commit a2be29d

Browse files
Merge pull request #43 from arrangeesh/master
Support normalization parameter
2 parents bea64df + b83c4fd commit a2be29d

6 files changed

Lines changed: 63 additions & 66 deletions

File tree

.travis.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ deploy:
77
skip_cleanup: true
88
api-key:
99
secure: LmVvlW+FdYNIDlinjJ4sieONrcx1jaw18J7/mpHBD9ppIWZ+TB6H/iNqkqkh4WvULZttJrTHRYE6rQHXww7KK2UMrjVNE/TVUPaLFDeRRFvLDinAbqJkn+QJia0TuRa/26Bg9cDcvNYTghy7s37xpK2bJTEMF/eCM9b9RHYXilESYy8Z4l8IkFn5vnaDDfT5iV8xjuuOE4lsf4KC3L0xXIkYnKC/LbDVDj3B9h52TpsteL6cZtn/ExAThor5SrVymW7oMR1qrPQv8btNAdxymqJvEbjaP5RUuX7ehihev0Yge47A2X9gvxDRv+a6wM0HOvT4aGsMwCWo++fb0taWH7HUXFxSvkzKhsl74kDMmnE0WarcI/8L/3Q/zRhW1a2vAtj3O0FDHtzS/OK/k3TDk6Fh/LOvk2mTuGD3L34YxJrXxDxnt4tK2ubde8cGeA7pI5jRLNTNQXUip6Dxhr/5ZnMmG2nHI6ujjmDnucE+CHBtUmS1wjBn6ootE4pdoyti0aaA9OrVoGrf39pK7FAG38KJghqn8I3YCLoeapWjI4/DI0WIfq2Vl+v6yQar3Dn9lBLpWFLrjUmZnAx2F1e0P2y0VUg9hl0bINzIIrm2mHw4Zsl2GlMVSR033cwvcbdyeNxKMAfSV3EZBDpNuI6nlkkUZG1O72N/WV+kFRtSdQA=
10-
name: wordvecspace-0.5.4
11-
tag_name: 0.5.4
10+
name: wordvecspace-0.5.5
11+
tag_name: 0.5.5
1212
true:
1313
repo: deep-compute/wordvecspace
1414
- provider: pypi

README.md

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -179,14 +179,6 @@ False
179179
>>> print(wv.get_index("inidia"))
180180
None
181181

182-
>>> print(wv.get_index("inidia", raise_exc=True))
183-
Traceback (most recent call last):
184-
File "/usr/lib/python3.6/code.py", line 91, in runcode
185-
exec(code, self.locals)
186-
File "<console>", line 1, in <module>
187-
File "/usr/local/lib/python3.6/dist-packages/wordvecspace/mem.py", line 196, in get_word_index
188-
raise UnknownWord(word)
189-
wordvecspace.exception.UnknownWord: "inidia"
190182
```
191183

192184
##### Get the indices of words
@@ -195,16 +187,6 @@ wordvecspace.exception.UnknownWord: "inidia"
195187
[1, None, 509]
196188

197189

198-
>>> print(wv.get_indices(['the', 'deepcompute', 'india'], raise_exc=True))
199-
Traceback (most recent call last):
200-
File "/usr/lib/python3.6/code.py", line 91, in runcode
201-
exec(code, self.locals)
202-
File "<console>", line 1, in <module>
203-
File "/usr/local/lib/python3.6/dist-packages/wordvecspace/mem.py", line 209, in get_word_indices
204-
index = self.get_word_index(word, raise_exc=raise_exc)
205-
File "/usr/local/lib/python3.6/dist-packages/wordvecspace/mem.py", line 196, in get_word_index
206-
raise UnknownWord(word)
207-
wordvecspace.exception.UnknownWord: "deepcompute"
208190
```
209191

210192
##### Get Word at Index
@@ -262,19 +244,6 @@ None
262244
>>> print(wv.get_vector("india", normalized=True))
263245
[-0.7871 -0.2993 0.3233 -0.2864 0.323 ]
264246

265-
# Get the word vector for a word inidia.
266-
>>> print(wv.get_vector('inidia', raise_exc=True))
267-
Traceback (most recent call last):
268-
File "/usr/lib/python3.6/code.py", line 91, in runcode
269-
exec(code, self.locals)
270-
File "<console>", line 1, in <module>
271-
File "/usr/local/lib/python3.6/dist-packages/wordvecspace/mem.py", line 287, in get_word_vector
272-
index = self.get_word_index(word, raise_exc)
273-
File "/usr/local/lib/python3.6/dist-packages/wordvecspace/mem.py", line 196, in get_word_index
274-
raise UnknownWord(word)
275-
wordvecspace.exception.UnknownWord: "inidia"
276-
277-
# If you don't want to get exception when word is not there, then you can simply discard raise_exc=True
278247
>>> print(wv.get_vector('inidia'))
279248
[ 0. 0. 0. 0. 0.]
280249
```

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from setuptools import setup, find_packages
22

3-
version = "0.5.4"
3+
version = "0.5.5"
44
setup(
55
name="wordvecspace",
66
python_requires=">3.5.1",
@@ -23,6 +23,7 @@
2323
"diskdict==0.2.2",
2424
"deeputil==0.2.5",
2525
"bottleneck==1.2.1",
26+
"pandas==0.23.4",
2627
],
2728
extras_require={
2829
"cuda": ["pycuda==2018.1.1", "scikit-cuda==0.5.1"],

wordvecspace/cuda.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -116,10 +116,8 @@ def _perform_sgemm(self, mat_a, mat_b, mat_out):
116116

117117
return mat_out
118118

119-
def get_distances(self, row_words, col_words=None, raise_exc=False):
120-
dvec = super(CudaWordVecSpaceMem, self).get_distances(
121-
row_words, col_words, raise_exc
122-
)
119+
def get_distances(self, row_words, col_words=None):
120+
dvec = super(CudaWordVecSpaceMem, self).get_distances(row_words, col_words)
123121
return dvec.get()
124122

125123

wordvecspace/server.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -145,15 +145,14 @@ def get_vectors(
145145
get_vectors(["hi", "inidia"]) => [[[ 0.6342 0.2268 -0.3904 0.0368 0.6266], [ 0. 0. 0. 0. 0. ]]
146146
"""
147147

148-
return self.wv.get_vectors(
149-
words_or_indices, normalized=normalized, raise_exc=raise_exc
150-
).tolist()
148+
return self.wv.get_vectors(words_or_indices, normalized=normalized).tolist()
151149

152150
def get_distance(
153151
self,
154152
word_or_index1: Union[str, int],
155153
word_or_index2: Union[str, int],
156154
metric: str = "angular",
155+
normalized: bool = True,
157156
) -> float:
158157
"""
159158
Get cosine distance between two words
@@ -163,7 +162,9 @@ def get_distance(
163162
"""
164163

165164
if self._type == "mem" or "disk":
166-
return self.wv.get_distance(word_or_index1, word_or_index2, metric=metric)
165+
return self.wv.get_distance(
166+
word_or_index1, word_or_index2, metric=metric, normalized=normalized
167+
)
167168

168169
return self.wv.get_distance(word_or_index1, word_or_index2)
169170

@@ -172,6 +173,7 @@ def get_distances(
172173
row_words_or_indices: Union[str, int, tuple, list],
173174
col_words_or_indices: Union[list, None] = None,
174175
metric: str = "angular",
176+
normalized: bool = True,
175177
) -> list:
176178
"""
177179
Get distances between given words and all words in the vector space
@@ -191,7 +193,10 @@ def get_distances(
191193
c = col_words_or_indices
192194
if self._type == "mem" or "disk":
193195
return self.wv.get_distances(
194-
row_words_or_indices, col_words_or_indices=c, metric=metric
196+
row_words_or_indices,
197+
col_words_or_indices=c,
198+
metric=metric,
199+
normalized=normalized,
195200
).tolist()
196201

197202
return self.wv.get_distances(
@@ -203,19 +208,28 @@ def get_nearest(
203208
v_w_i: Union[str, int, list, tuple],
204209
k: int = 512,
205210
metric: str = "angular",
211+
distances: bool = False,
206212
combination: bool = False,
213+
normalized: bool = True,
207214
) -> list:
208215
"""
209216
get_nearest("india", 20) => [509, 3389, 486, 523, 7125, 16619, 4491, 12191, 6866, 8776, 15232, 14208, 5998, 21916, 5226, 6322, 4343, 6212, 10172, 6186]
210217
get_nearest(["ram", "india"], 5, metric='euclidean') => [[3844, 16727, 15811, 42731, 41516], [509, 3389, 486, 523, 7125]]
211218
get_nearest(['india', 'bosnia'], 10, combination=True) => [523, 509, 486]
212219
"""
213220
if self._type == "mem" or self._type == "disk":
214-
neg = self.wv.get_nearest(v_w_i, k, metric=metric, combination=combination)
221+
neg = self.wv.get_nearest(
222+
v_w_i,
223+
k,
224+
metric=metric,
225+
combination=combination,
226+
distances=distances,
227+
normalized=normalized,
228+
)
215229
neg = neg.tolist()
216230

217231
else:
218-
neg = self.wv.get_nearest(v_w_i, k)
232+
neg = self.wv.get_nearest(v_w_i, k, distances=distances)
219233

220234
return neg
221235

wordvecspace/wvspace.py

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import os
22
import json
33
from typing import Union
4-
54
from scipy.spatial import distance
65
import numpy as np
6+
import pandas as pd
77
import bottleneck
88

99
from .fileformat import WordVecSpaceFile
@@ -60,19 +60,19 @@ def _check_indices_or_words(self, items):
6060

6161
return w
6262

63-
def _check_vec(self, v, normalised=False):
63+
def _check_vec(self, v, normalized=False):
6464
if isinstance(v, np.ndarray) and len(v.shape) == 2 and v.dtype == np.float32:
65-
if normalised:
65+
if normalized:
6666
m = np.linalg.norm(v)
6767
return v / m
6868

6969
return v
7070

7171
else:
7272
if isinstance(v, (list, tuple)):
73-
return self.get_vectors(v, normalized=normalised)
73+
return self.get_vectors(v, normalized=normalized)
7474

75-
return self.get_vector(v, normalized=normalised)
75+
return self.get_vector(v, normalized=normalized)
7676

7777
def get_manifest(self) -> dict:
7878
manifest_info = open(os.path.join(self.input_dir, "manifest.json"), "r")
@@ -148,6 +148,7 @@ def get_distance(
148148
word_or_index1: Union[int, str],
149149
word_or_index2: Union[int, str],
150150
metric: str = "cosine",
151+
normalized: bool = True,
151152
) -> float:
152153

153154
w1 = word_or_index1
@@ -156,9 +157,9 @@ def get_distance(
156157
if not metric:
157158
metric = self.metric
158159

159-
if metric == "cosine" or "angular":
160-
vec1 = self._check_vec(w1, True)
161-
vec2 = self._check_vec(w2, True)
160+
if metric in ("cosine", "angular"):
161+
vec1 = self._check_vec(w1, normalized)
162+
vec2 = self._check_vec(w2, normalized)
162163

163164
return 1 - np.dot(vec1, vec2.T)
164165

@@ -186,19 +187,20 @@ def get_distances(
186187
row_words_or_indices: Union[list, np.ndarray],
187188
col_words_or_indices: Union[list, None, np.ndarray] = None,
188189
metric=None,
190+
normalized: bool = True,
189191
) -> np.ndarray:
190192

191193
r = row_words_or_indices
192194
c = col_words_or_indices
193195

194196
metric, r, c = self._check_r_and_c(r, c, metric)
195197

196-
if metric == "cosine" or "angular":
197-
row_vectors = self._check_vec(r, True)
198+
if metric in ("cosine", "angular"):
199+
row_vectors = self._check_vec(r, normalized)
198200

199201
col_vectors = self.vecs
200202
if c is not None and len(c):
201-
col_vectors = self._check_vec(c, True)
203+
col_vectors = self._check_vec(c, normalized)
202204

203205
if len(r) == 1:
204206
nvecs, dim = col_vectors.shape
@@ -214,6 +216,10 @@ def get_distances(
214216
)
215217
res = self._perform_sgemm(row_vectors, col_vectors, mat_out)
216218

219+
if not normalized:
220+
res = np.multiply(res, self.mags)
221+
return res
222+
217223
return 1 - res
218224

219225
elif metric == "euclidean":
@@ -226,19 +232,25 @@ def get_distances(
226232

227233
return distance.cdist(row_vectors, col_vectors, "euclidean")
228234

229-
def _nearest_sorting(self, d, k):
235+
def _nearest_sorting(self, d, k, normalized=True):
230236

231237
ner = self._make_array(shape=(len(d), k), dtype=np.uint32)
232238
dist = self._make_array(shape=(len(d), k), dtype=np.float32)
233239

234240
for index, p in enumerate(d):
235-
# FIXME: better variable name for b_sort
236-
b_sort = bottleneck.argpartition(p, k)[:k]
237-
pr_dist = np.take(p, b_sort)
241+
if normalized:
242+
# FIXME: better variable name for b_sort
243+
b_sort = bottleneck.argpartition(p, k)[:k]
244+
pr_dist = np.take(p, b_sort)
238245

239-
# FIXME: better variable name for a_sorted
240-
a_sorted = np.argsort(pr_dist)
241-
indices = np.take(b_sort, a_sorted)
246+
# FIXME: better variable name for a_sorted
247+
a_sorted = np.argsort(pr_dist)
248+
indices = np.take(b_sort, a_sorted)
249+
250+
else:
251+
d = pd.Series(p)
252+
d = d.nlargest(k)
253+
indices = d.keys()
242254

243255
ner[index] = indices
244256
dist[index] = np.take(p, indices)
@@ -253,25 +265,28 @@ def get_nearest(
253265
combination: bool = False,
254266
weights: list = None,
255267
metric: str = "cosine",
268+
normalized: bool = True,
256269
) -> np.ndarray:
257270

258-
d = self.get_distances(v_w_i, metric=metric)
271+
d = self.get_distances(v_w_i, metric=metric, normalized=normalized)
259272

260273
if not weights:
261274
weights = np.ones(len(v_w_i))
262275

263276
if combination and len(weights) == len(v_w_i):
264277
weights = np.array(weights)
265278
w_d = np.dot(weights, d)
266-
nearest_indices, dist = self._nearest_sorting(w_d.reshape(1, len(w_d)), k)
279+
nearest_indices, dist = self._nearest_sorting(
280+
w_d.reshape(1, len(w_d)), k, normalized
281+
)
267282

268283
if distances:
269284
return nearest_indices, dist
270285

271286
else:
272287
return nearest_indices
273288

274-
nearest_indices, dist = self._nearest_sorting(d, k)
289+
nearest_indices, dist = self._nearest_sorting(d, k, normalized)
275290

276291
if (
277292
isinstance(v_w_i, (list, tuple))

0 commit comments

Comments
 (0)