bug fixes in get_nearest; modified logic for combined get_nearest; pep8 fixes

chatrapathik · chatrapathik · commit 8d0c94829ff5 · 2018-10-17T11:36:38.000+02:00
diff --git a/README.md b/README.md
@@ -1,12 +1,13 @@
 # WordVecSpace
 A high performance pure python module that helps in loading and performing operations on word vector spaces created using Google's Word2vec tool.
 
-This module has ability to the load data into memory using `WordVecSpaceMem` and it can also support performing operations on the data which is on the disk using `WordVecSpaceAnnoy` and `WordVecSpaceDisk`.
+This module has the ability to load data into memory using `WordVecSpaceMem`, and it also supports performing operations on data that is on disk using `WordVecSpaceAnnoy` and `WordVecSpaceDisk`.
 
 ## Installation
-> Prerequisites: >=Python3.5.2
+> Prerequisites: Python >= 3.5.2
 
 ```bash
+
 $ sudo apt install libopenblas-base # Optional
 $ sudo pip3 install wordvecspace
 ```
@@ -21,8 +22,8 @@ word vector space data. Here are two ways to get that.
 #### Download pre-computed sample data
 
 ```bash
-$ wget https://s3.amazonaws.com/deepcompute-public-data/wordvecspace/small_test_data.tgz
-$ tar zxvf small_test_data.tgz
+$ wget https://s3.amazonaws.com/deepcompute-public-data/wordvecspace/test_data-0_5_4.tgz
+$ tar zxvf test_data-0_5_4.tgz
 ```
 
 > NOTE: We got this data by downloading the `text8` corpus
@@ -41,7 +42,7 @@ $ git clone https://github.com/tmikolov/word2vec.git
 
 # 1. Navigate to the folder word2vec
 # 2. open demo-word.sh for editing
-# 3. Edit the command "time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15" ----to----> "time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 5 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -save-vocab vocab.txt -iter 15" to get vocab.txt file also as output.
+# 3. Edit the command "time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15" ----to----> "time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 5 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -save-vocab vocab.txt -iter 15" to get vocab.txt file also as output.
 # 4. Run demo-word.sh
 
 $ chmod +x demo-word.sh
@@ -96,7 +97,7 @@ $ wordvecspace convert /home/user/bindata /home/user/output_dir
 
 `WordVecSpaceMem` and `WordVecSpaceDisk` is a bruteforce algorithm which compares given word with all the words in the vector space
 
-`WordVecSpaceAnnoy` takes wordvecspace output_dir as input and creates annoy indexes in another file (index file). Using this file `annoy` gives approximate results quickly. For better understanding of `Annoy` please go through this [link](https://github.com/spotify/annoy)
+`WordVecSpaceAnnoy` takes wordvecspace output_dir as input and creates annoy indexes in another file (index file). Using this file `annoy` gives approximate results quickly. For better understanding of `Annoy` please go through this [link](https://github.com/spotify/annoy)
 
 As we have seen how to import `WordVecSpaceDisk` above, let us look at `WordVecSpaceAnnoy` and `WordVecSpaceMem`
 
@@ -193,6 +194,7 @@ wordvecspace.exception.UnknownWord: "inidia"
 >>> print(wv.get_indices(['the', 'deepcompute', 'india']))
 [1, None, 509]
 
+
 >>> print(wv.get_indices(['the', 'deepcompute', 'india'], raise_exc=True))
 Traceback (most recent call last):
   File "/usr/lib/python3.6/code.py", line 91, in runcode
@@ -342,8 +344,33 @@ wordvecspace.exception.UnknownWord: "inidia"
 [[3844, 16727, 15811, 42731, 41516], [509, 3389, 486, 523, 7125]]
 
 # Get common nearest neighbors among given words
->>> print(wv.get_nearest(['india', 'bosnia'], 10, combination=True))
-[523, 509, 486]
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10)[0])
+['india', 'indian', 'delhi', 'subcontinent', 'hyderabad', 'pradesh', 'pakistan', 'gujarat', 'bombay', 'chhattisgarh']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10)[1])
+['pakistan', 'pakistani', 'india', 'bangladesh', 'peshawar', 'afghanistan', 'baluchistan', 'balochistan', 'kashmir', 'islamabad']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10, combination=True)[0])
+['pakistan', 'india', 'indian', 'bangladesh', 'pakistani', 'subcontinent', 'shimla', 'delhi', 'punjab', 'ladakh']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10, combination=True, weights=[1, 0])[0])
+['india', 'indian', 'delhi', 'subcontinent', 'hyderabad', 'pradesh', 'pakistan', 'gujarat', 'bombay', 'chhattisgarh']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10, combination=True, weights=[0, 1])[0])
+['pakistan', 'pakistani', 'india', 'bangladesh', 'peshawar', 'afghanistan', 'baluchistan', 'balochistan', 'kashmir', 'islamabad']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10, combination=True, weights=[0.7, 0.3])[0])
+['india', 'pakistan', 'indian', 'subcontinent', 'delhi', 'bangladesh', 'hyderabad', 'shimla', 'punjab', 'bengal']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10, combination=True, weights=[0.3, 0.7])[0])
+['pakistan', 'india', 'pakistani', 'bangladesh', 'subcontinent', 'indian', 'shimla', 'punjab', 'kashmir', 'ladakh']
+
+# Get nearest with vector(s)
+>>> wv.get_words(wv.get_nearest(wv.get_vector('india').reshape(1, wv.dim), k=5))
+['india', 'indian', 'subcontinent', 'bombay', 'bengal']
+>>> wv.get_words(wv.get_nearest(wv.get_vectors(['india', 'pakistan']), k=5)[0])
+['india', 'indian', 'subcontinent', 'bombay', 'bengal']
+>>> wv.get_words(wv.get_nearest(wv.get_vectors(['india', 'pakistan']), k=5)[1])
+['pakistan', 'pakistani', 'kargil', 'afghanistan', 'bangladesh']
+>>> wv.get_words(wv.get_nearest(wv.get_vectors(['india', 'pakistan']), k=5, combination=True)[0])
+['india', 'pakistan', 'indian', 'pakistani', 'subcontinent']
+>>> wv.get_words(wv.get_nearest(wv.get_vectors(['india', 'pakistan']), k=5, combination=True, weights=[0.4, 0.6])[0])
+['pakistan', 'india', 'pakistani', 'kargil', 'indian']
+
 ```
 
 ## Service
diff --git a/setup.py b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages
 
-version = '0.5.3'
+version = '0.5.4'
 setup(
     name="wordvecspace",
     python_requires='>3.5.1',
diff --git a/test.py b/test.py
@@ -7,6 +7,7 @@
 from wordvecspace import mem
 from wordvecspace import annoy
 from wordvecspace import disk
+from wordvecspace import wvspace
 
 def suite_test():
     suite = unittest.TestSuite()
@@ -15,6 +16,7 @@ def suite_test():
     suite.addTests(doctest.DocTestSuite(mem))
     suite.addTests(doctest.DocTestSuite(annoy))
     suite.addTests(doctest.DocTestSuite(disk))
+    suite.addTests(doctest.DocTestSuite(wvspace))
 
     return suite
 
@@ -23,3 +25,4 @@ def suite_test():
     doctest.testmod(mem)
     doctest.testmod(annoy)
     doctest.testmod(disk)
+    doctest.testmod(wvspace)
diff --git a/wordvecspace/wvspace.py b/wordvecspace/wvspace.py
@@ -16,8 +16,10 @@
 # $export WORDVECSPACE_DATADIR=/path/to/data/
 DATAFILE_ENV_VAR = os.environ.get('WORDVECSPACE_DATADIR', '')
 
+
 class WordVecSpace(WordVecSpaceBase):
     METRIC = 'cosine'
+    DEFAULT_K = 512
 
     def __init__(self, input_dir: str, metric: str=METRIC) -> None:
         self._f = WordVecSpaceFile(input_dir, mode='r')
@@ -40,6 +42,7 @@ def _make_array(self, shape, dtype):
     def _check_index_or_word(self, item):
         if isinstance(item, str):
             return self.get_index(item)
+
         return item
 
     def _check_indices_or_words(self, items):
@@ -54,14 +57,17 @@ def _check_indices_or_words(self, items):
         if isinstance(w, (list, tuple)):
             if isinstance(w[0], str):
                 return self.get_indices(w)
+
         return w
 
     def _check_vec(self, v, normalised=False):
-        if isinstance(v, np.ndarray) and len(v.shape) == 2 and v.dtype==np.float32:
+        if isinstance(v, np.ndarray) and len(v.shape) == 2 and v.dtype == np.float32:
             if normalised:
                 m = np.linalg.norm(v)
                 return v / m
+
             return v
+
         else:
             if isinstance(v, (list, tuple)):
                 return self.get_vectors(v, normalized=normalised)
@@ -133,8 +139,8 @@ def get_vectors(self, words_or_indices: list, normalized: bool=False) -> np.ndar
 
         return np.multiply(vecs.T, mags).T
 
-    def get_distance(self, word_or_index1: Union[int, str],\
-                    word_or_index2: Union[int, str], metric: str='cosine') -> float:
+    def get_distance(self, word_or_index1: Union[int, str],
+                     word_or_index2: Union[int, str], metric: str='cosine') -> float:
 
         w1 = word_or_index1
         w2 = word_or_index2
@@ -167,8 +173,10 @@ def _check_r_and_c(self, r, c, m):
 
         return m, r, c
 
-    def get_distances(self, row_words_or_indices: Union[list, np.ndarray],\
-                    col_words_or_indices: Union[list, None, np.ndarray]=None, metric=None) -> np.ndarray:
+    def get_distances(self,
+                    row_words_or_indices: Union[list, np.ndarray],
+                    col_words_or_indices: Union[list, None, np.ndarray]=None,
+                    metric=None) -> np.ndarray:
 
         r = row_words_or_indices
         c = col_words_or_indices
@@ -186,7 +194,6 @@ def get_distances(self, row_words_or_indices: Union[list, np.ndarray],\
                 nvecs, dim = col_vectors.shape
 
                 vec_out = self._make_array((len(col_vectors), len(row_vectors)), dtype=np.float32)
-
                 res = self._perform_sgemv(row_vectors, col_vectors, vec_out, nvecs, dim)
 
             else:
@@ -205,33 +212,52 @@ def get_distances(self, row_words_or_indices: Union[list, np.ndarray],\
 
             return distance.cdist(row_vectors, col_vectors, 'euclidean')
 
-    DEFAULT_K = 512
-
-    def get_nearest(self, v_w_i: list, k: int=DEFAULT_K,\
-                    distances: bool=False, combination: bool=False,\
-                    metric: str='cosine') -> np.ndarray:
-
-        d = self.get_distances(v_w_i, metric=metric)
+    def _nearest_sorting(self, d, k):
 
         ner = self._make_array(shape=(len(d), k), dtype=np.uint32)
         dist = self._make_array(shape=(len(d), k), dtype=np.float32)
 
         for index, p in enumerate(d):
+            # FIXME: better variable name for b_sort
             b_sort = bottleneck.argpartition(p, k)[:k]
-            pr_dist = np.take(d, b_sort)
+            pr_dist = np.take(p, b_sort)
 
+            # FIXME: better variable name for a_sorted
             a_sorted = np.argsort(pr_dist)
             indices = np.take(b_sort, a_sorted)
 
             ner[index] = indices
             dist[index] = np.take(p, indices)
 
-        if combination:
-            ner = set(ner[0]).intersection(*ner)
-            return (ner, dist) if distances else ner
+        return ner, dist
+
+    def get_nearest(self, v_w_i: list,
+                    k: int=DEFAULT_K,
+                    distances: bool=False,
+                    combination: bool=False,
+                    weights: list=None,
+                    metric: str='cosine') -> np.ndarray:
+
+        d = self.get_distances(v_w_i, metric=metric)
+
+        if not weights:
+            weights = np.ones(len(v_w_i))
+
+        if combination and len(weights) == len(v_w_i):
+            weights = np.array(weights)
+            w_d = np.dot(weights, d)
+            nearest_indices, dist = self._nearest_sorting(w_d.reshape(1, len(w_d)), k)
+
+            if distances:
+                return nearest_indices, dist
+
+            else:
+                return nearest_indices
+
+        nearest_indices, dist = self._nearest_sorting(d, k)
+
+        if isinstance(v_w_i, (list, tuple)) or isinstance(v_w_i, np.ndarray) and len(v_w_i) > 1:
+            return (nearest_indices, dist) if distances else nearest_indices
 
-        if isinstance(v_w_i, (list, tuple)) or \
-            isinstance(v_w_i, np.ndarray) and len(v_w_i) > 1:
-            return (ner, dist) if distances else ner
         else:
-            return (ner[0], dist[0]) if distances else ner[0]
+            return (nearest_indices[0], dist[0]) if distances else nearest_indices[0]