Merge pull request #43 from arrangeesh/master

sriharsha-kms · web-flow · commit a2be29dd2676 · 2019-09-23T00:46:59.000+05:30
Support normalization parameter
diff --git a/.travis.yml b/.travis.yml
@@ -7,8 +7,8 @@ deploy:
   skip_cleanup: true
   api-key:
     secure: LmVvlW+FdYNIDlinjJ4sieONrcx1jaw18J7/mpHBD9ppIWZ+TB6H/iNqkqkh4WvULZttJrTHRYE6rQHXww7KK2UMrjVNE/TVUPaLFDeRRFvLDinAbqJkn+QJia0TuRa/26Bg9cDcvNYTghy7s37xpK2bJTEMF/eCM9b9RHYXilESYy8Z4l8IkFn5vnaDDfT5iV8xjuuOE4lsf4KC3L0xXIkYnKC/LbDVDj3B9h52TpsteL6cZtn/ExAThor5SrVymW7oMR1qrPQv8btNAdxymqJvEbjaP5RUuX7ehihev0Yge47A2X9gvxDRv+a6wM0HOvT4aGsMwCWo++fb0taWH7HUXFxSvkzKhsl74kDMmnE0WarcI/8L/3Q/zRhW1a2vAtj3O0FDHtzS/OK/k3TDk6Fh/LOvk2mTuGD3L34YxJrXxDxnt4tK2ubde8cGeA7pI5jRLNTNQXUip6Dxhr/5ZnMmG2nHI6ujjmDnucE+CHBtUmS1wjBn6ootE4pdoyti0aaA9OrVoGrf39pK7FAG38KJghqn8I3YCLoeapWjI4/DI0WIfq2Vl+v6yQar3Dn9lBLpWFLrjUmZnAx2F1e0P2y0VUg9hl0bINzIIrm2mHw4Zsl2GlMVSR033cwvcbdyeNxKMAfSV3EZBDpNuI6nlkkUZG1O72N/WV+kFRtSdQA=
-  name: wordvecspace-0.5.4
-  tag_name: 0.5.4
+  name: wordvecspace-0.5.5
+  tag_name: 0.5.5
   true:
     repo: deep-compute/wordvecspace
 - provider: pypi
diff --git a/README.md b/README.md
@@ -179,14 +179,6 @@ False
 >>> print(wv.get_index("inidia"))
 None
 
->>> print(wv.get_index("inidia", raise_exc=True))
-Traceback (most recent call last):
-  File "/usr/lib/python3.6/code.py", line 91, in runcode
-    exec(code, self.locals)
-  File "<console>", line 1, in <module>
-  File "/usr/local/lib/python3.6/dist-packages/wordvecspace/mem.py", line 196, in get_word_index
-    raise UnknownWord(word)
-wordvecspace.exception.UnknownWord: "inidia"
 ```
 
 ##### Get the indices of words
@@ -195,16 +187,6 @@ wordvecspace.exception.UnknownWord: "inidia"
 [1, None, 509]
 
 
->>> print(wv.get_indices(['the', 'deepcompute', 'india'], raise_exc=True))
-Traceback (most recent call last):
-  File "/usr/lib/python3.6/code.py", line 91, in runcode
-    exec(code, self.locals)
-  File "<console>", line 1, in <module>
-  File "/usr/local/lib/python3.6/dist-packages/wordvecspace/mem.py", line 209, in get_word_indices
-    index = self.get_word_index(word, raise_exc=raise_exc)
-  File "/usr/local/lib/python3.6/dist-packages/wordvecspace/mem.py", line 196, in get_word_index
-    raise UnknownWord(word)
-wordvecspace.exception.UnknownWord: "deepcompute"
 ```
 
 ##### Get Word at Index
@@ -262,19 +244,6 @@ None
 >>> print(wv.get_vector("india", normalized=True))
 [-0.7871 -0.2993  0.3233 -0.2864  0.323 ]
 
-# Get the word vector for a word inidia.
->>> print(wv.get_vector('inidia', raise_exc=True))
-Traceback (most recent call last):
-  File "/usr/lib/python3.6/code.py", line 91, in runcode
-    exec(code, self.locals)
-  File "<console>", line 1, in <module>
-  File "/usr/local/lib/python3.6/dist-packages/wordvecspace/mem.py", line 287, in get_word_vector
-    index = self.get_word_index(word, raise_exc)
-  File "/usr/local/lib/python3.6/dist-packages/wordvecspace/mem.py", line 196, in get_word_index
-    raise UnknownWord(word)
-wordvecspace.exception.UnknownWord: "inidia"
-
-# If you don't want to get exception when word is not there, then you can simply discard raise_exc=True
 >>> print(wv.get_vector('inidia'))
 [ 0.  0.  0.  0.  0.]
 ```
diff --git a/setup.py b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages
 
-version = "0.5.4"
+version = "0.5.5"
 setup(
     name="wordvecspace",
     python_requires=">3.5.1",
@@ -23,6 +23,7 @@
         "diskdict==0.2.2",
         "deeputil==0.2.5",
         "bottleneck==1.2.1",
+        "pandas==0.23.4",
     ],
     extras_require={
         "cuda": ["pycuda==2018.1.1", "scikit-cuda==0.5.1"],
diff --git a/wordvecspace/cuda.py b/wordvecspace/cuda.py
@@ -116,10 +116,8 @@ def _perform_sgemm(self, mat_a, mat_b, mat_out):
 
         return mat_out
 
-    def get_distances(self, row_words, col_words=None, raise_exc=False):
-        dvec = super(CudaWordVecSpaceMem, self).get_distances(
-            row_words, col_words, raise_exc
-        )
+    def get_distances(self, row_words, col_words=None):
+        dvec = super(CudaWordVecSpaceMem, self).get_distances(row_words, col_words)
         return dvec.get()
 
 
diff --git a/wordvecspace/server.py b/wordvecspace/server.py
@@ -145,15 +145,14 @@ def get_vectors(
         get_vectors(["hi", "inidia"]) => [[[ 0.6342  0.2268 -0.3904  0.0368  0.6266], [ 0.      0.      0.      0.      0.    ]]
         """
 
-        return self.wv.get_vectors(
-            words_or_indices, normalized=normalized, raise_exc=raise_exc
-        ).tolist()
+        return self.wv.get_vectors(words_or_indices, normalized=normalized).tolist()
 
     def get_distance(
         self,
         word_or_index1: Union[str, int],
         word_or_index2: Union[str, int],
         metric: str = "angular",
+        normalized: bool = True,
     ) -> float:
         """
         Get cosine distance between two words
@@ -163,7 +162,9 @@ def get_distance(
         """
 
         if self._type == "mem" or "disk":
-            return self.wv.get_distance(word_or_index1, word_or_index2, metric=metric)
+            return self.wv.get_distance(
+                word_or_index1, word_or_index2, metric=metric, normalized=normalized
+            )
 
         return self.wv.get_distance(word_or_index1, word_or_index2)
 
@@ -172,6 +173,7 @@ def get_distances(
         row_words_or_indices: Union[str, int, tuple, list],
         col_words_or_indices: Union[list, None] = None,
         metric: str = "angular",
+        normalized: bool = True,
     ) -> list:
         """
         Get distances between given words and all words in the vector space
@@ -191,7 +193,10 @@ def get_distances(
         c = col_words_or_indices
         if self._type == "mem" or "disk":
             return self.wv.get_distances(
-                row_words_or_indices, col_words_or_indices=c, metric=metric
+                row_words_or_indices,
+                col_words_or_indices=c,
+                metric=metric,
+                normalized=normalized,
             ).tolist()
 
         return self.wv.get_distances(
@@ -203,19 +208,28 @@ def get_nearest(
         v_w_i: Union[str, int, list, tuple],
         k: int = 512,
         metric: str = "angular",
+        distances: bool = False,
         combination: bool = False,
+        normalized: bool = True,
     ) -> list:
         """
         get_nearest("india", 20) => [509, 3389, 486, 523, 7125, 16619, 4491, 12191, 6866, 8776, 15232, 14208, 5998, 21916, 5226, 6322, 4343, 6212, 10172, 6186]
         get_nearest(["ram", "india"], 5, metric='euclidean') => [[3844, 16727, 15811, 42731, 41516], [509, 3389, 486, 523, 7125]]
         get_nearest(['india', 'bosnia'], 10, combination=True) => [523, 509, 486]
         """
         if self._type == "mem" or self._type == "disk":
-            neg = self.wv.get_nearest(v_w_i, k, metric=metric, combination=combination)
+            neg = self.wv.get_nearest(
+                v_w_i,
+                k,
+                metric=metric,
+                combination=combination,
+                distances=distances,
+                normalized=normalized,
+            )
             neg = neg.tolist()
 
         else:
-            neg = self.wv.get_nearest(v_w_i, k)
+            neg = self.wv.get_nearest(v_w_i, k, distances=distances)
 
         return neg
 
diff --git a/wordvecspace/wvspace.py b/wordvecspace/wvspace.py
@@ -1,9 +1,9 @@
 import os
 import json
 from typing import Union
-
 from scipy.spatial import distance
 import numpy as np
+import pandas as pd
 import bottleneck
 
 from .fileformat import WordVecSpaceFile
@@ -60,19 +60,19 @@ def _check_indices_or_words(self, items):
 
         return w
 
-    def _check_vec(self, v, normalised=False):
+    def _check_vec(self, v, normalized=False):
         if isinstance(v, np.ndarray) and len(v.shape) == 2 and v.dtype == np.float32:
-            if normalised:
+            if normalized:
                 m = np.linalg.norm(v)
                 return v / m
 
             return v
 
         else:
             if isinstance(v, (list, tuple)):
-                return self.get_vectors(v, normalized=normalised)
+                return self.get_vectors(v, normalized=normalized)
 
-            return self.get_vector(v, normalized=normalised)
+            return self.get_vector(v, normalized=normalized)
 
     def get_manifest(self) -> dict:
         manifest_info = open(os.path.join(self.input_dir, "manifest.json"), "r")
@@ -148,6 +148,7 @@ def get_distance(
         word_or_index1: Union[int, str],
         word_or_index2: Union[int, str],
         metric: str = "cosine",
+        normalized: bool = True,
     ) -> float:
 
         w1 = word_or_index1
@@ -156,9 +157,9 @@ def get_distance(
         if not metric:
             metric = self.metric
 
-        if metric == "cosine" or "angular":
-            vec1 = self._check_vec(w1, True)
-            vec2 = self._check_vec(w2, True)
+        if metric in ("cosine", "angular"):
+            vec1 = self._check_vec(w1, normalized)
+            vec2 = self._check_vec(w2, normalized)
 
             return 1 - np.dot(vec1, vec2.T)
 
@@ -186,19 +187,20 @@ def get_distances(
         row_words_or_indices: Union[list, np.ndarray],
         col_words_or_indices: Union[list, None, np.ndarray] = None,
         metric=None,
+        normalized: bool = True,
     ) -> np.ndarray:
 
         r = row_words_or_indices
         c = col_words_or_indices
 
         metric, r, c = self._check_r_and_c(r, c, metric)
 
-        if metric == "cosine" or "angular":
-            row_vectors = self._check_vec(r, True)
+        if metric in ("cosine", "angular"):
+            row_vectors = self._check_vec(r, normalized)
 
             col_vectors = self.vecs
             if c is not None and len(c):
-                col_vectors = self._check_vec(c, True)
+                col_vectors = self._check_vec(c, normalized)
 
             if len(r) == 1:
                 nvecs, dim = col_vectors.shape
@@ -214,6 +216,10 @@ def get_distances(
                 )
                 res = self._perform_sgemm(row_vectors, col_vectors, mat_out)
 
+            if not normalized:
+                res = np.multiply(res, self.mags)
+                return res
+
             return 1 - res
 
         elif metric == "euclidean":
@@ -226,19 +232,25 @@ def get_distances(
 
             return distance.cdist(row_vectors, col_vectors, "euclidean")
 
-    def _nearest_sorting(self, d, k):
+    def _nearest_sorting(self, d, k, normalized=True):
 
         ner = self._make_array(shape=(len(d), k), dtype=np.uint32)
         dist = self._make_array(shape=(len(d), k), dtype=np.float32)
 
         for index, p in enumerate(d):
-            # FIXME: better variable name for b_sort
-            b_sort = bottleneck.argpartition(p, k)[:k]
-            pr_dist = np.take(p, b_sort)
+            if normalized:
+                # FIXME: better variable name for b_sort
+                b_sort = bottleneck.argpartition(p, k)[:k]
+                pr_dist = np.take(p, b_sort)
 
-            # FIXME: better variable name for a_sorted
-            a_sorted = np.argsort(pr_dist)
-            indices = np.take(b_sort, a_sorted)
+                # FIXME: better variable name for a_sorted
+                a_sorted = np.argsort(pr_dist)
+                indices = np.take(b_sort, a_sorted)
+
+            else:
+                d = pd.Series(p)
+                d = d.nlargest(k)
+                indices = d.keys()
 
             ner[index] = indices
             dist[index] = np.take(p, indices)
@@ -253,25 +265,28 @@ def get_nearest(
         combination: bool = False,
         weights: list = None,
         metric: str = "cosine",
+        normalized: bool = True,
     ) -> np.ndarray:
 
-        d = self.get_distances(v_w_i, metric=metric)
+        d = self.get_distances(v_w_i, metric=metric, normalized=normalized)
 
         if not weights:
             weights = np.ones(len(v_w_i))
 
         if combination and len(weights) == len(v_w_i):
             weights = np.array(weights)
             w_d = np.dot(weights, d)
-            nearest_indices, dist = self._nearest_sorting(w_d.reshape(1, len(w_d)), k)
+            nearest_indices, dist = self._nearest_sorting(
+                w_d.reshape(1, len(w_d)), k, normalized
+            )
 
             if distances:
                 return nearest_indices, dist
 
             else:
                 return nearest_indices
 
-        nearest_indices, dist = self._nearest_sorting(d, k)
+        nearest_indices, dist = self._nearest_sorting(d, k, normalized)
 
         if (
             isinstance(v_w_i, (list, tuple))