Merge pull request #38 from deep-compute/combined_get_nearest

Ram Idavalapati · web-flow · commit bc0a489c63ec · 2018-10-17T17:16:03.000+05:30
bug fixes in get_nearest; modified logic for combined get_nearest
diff --git a/.travis.yml b/.travis.yml
@@ -2,21 +2,20 @@ language: python
 python:
 - '3.5'
 before_install:
-- wget 'https://s3.amazonaws.com/deepcompute-public-data/wordvecspace/small_test_data.tgz'
-  && tar xvzf small_test_data.tgz
-- export WORDVECSPACE_DATADIR='small_test_data'
+- wget 'https://s3.amazonaws.com/deepcompute-public-data/wordvecspace/test_data-0_5_4.tgz' && tar xvzf test_data-0_5_4.tgz
+- export WORDVECSPACE_DATADIR='test_data-0_5_4.tgz'
 install:
+- sudo apt update
 - pip install .[service]
-- sudo apt install libopenblas-base
 script:
 - echo "No tests for this release"
 deploy:
 - provider: releases
   skip_cleanup: true
   api-key:
     secure: LmVvlW+FdYNIDlinjJ4sieONrcx1jaw18J7/mpHBD9ppIWZ+TB6H/iNqkqkh4WvULZttJrTHRYE6rQHXww7KK2UMrjVNE/TVUPaLFDeRRFvLDinAbqJkn+QJia0TuRa/26Bg9cDcvNYTghy7s37xpK2bJTEMF/eCM9b9RHYXilESYy8Z4l8IkFn5vnaDDfT5iV8xjuuOE4lsf4KC3L0xXIkYnKC/LbDVDj3B9h52TpsteL6cZtn/ExAThor5SrVymW7oMR1qrPQv8btNAdxymqJvEbjaP5RUuX7ehihev0Yge47A2X9gvxDRv+a6wM0HOvT4aGsMwCWo++fb0taWH7HUXFxSvkzKhsl74kDMmnE0WarcI/8L/3Q/zRhW1a2vAtj3O0FDHtzS/OK/k3TDk6Fh/LOvk2mTuGD3L34YxJrXxDxnt4tK2ubde8cGeA7pI5jRLNTNQXUip6Dxhr/5ZnMmG2nHI6ujjmDnucE+CHBtUmS1wjBn6ootE4pdoyti0aaA9OrVoGrf39pK7FAG38KJghqn8I3YCLoeapWjI4/DI0WIfq2Vl+v6yQar3Dn9lBLpWFLrjUmZnAx2F1e0P2y0VUg9hl0bINzIIrm2mHw4Zsl2GlMVSR033cwvcbdyeNxKMAfSV3EZBDpNuI6nlkkUZG1O72N/WV+kFRtSdQA=
-  name: wordvecspace-0.5.3
-  tag_name: 0.5.3
+  name: wordvecspace-0.5.4
+  tag_name: 0.5.4
   on:
     repo: deep-compute/wordvecspace
   # pypitest
diff --git a/README.md b/README.md
@@ -1,12 +1,13 @@
 # WordVecSpace
 A high performance pure python module that helps in loading and performing operations on word vector spaces created using Google's Word2vec tool.
 
-This module has ability to the load data into memory using `WordVecSpaceMem` and it can also support performing operations on the data which is on the disk using `WordVecSpaceAnnoy` and `WordVecSpaceDisk`.
+This module has the ability to load the data into memory using `WordVecSpaceMem`, and it also supports performing operations on data that is on disk using `WordVecSpaceAnnoy` and `WordVecSpaceDisk`.
 
 ## Installation
 > Prerequisites: >=Python3.5.2
 
 ```bash
+
 $ sudo apt install libopenblas-base # Optional
 $ sudo pip3 install wordvecspace
 ```
@@ -21,8 +22,8 @@ word vector space data. Here are two ways to get that.
 #### Download pre-computed sample data
 
 ```bash
-$ wget https://s3.amazonaws.com/deepcompute-public-data/wordvecspace/small_test_data.tgz
-$ tar zxvf small_test_data.tgz
+$ wget https://s3.amazonaws.com/deepcompute-public-data/wordvecspace/test_data-0_5_4.tgz
+$ tar zxvf test_data-0_5_4.tgz
 ```
 
 > NOTE: We got this data by downloading the `text8` corpus
@@ -193,6 +194,7 @@ wordvecspace.exception.UnknownWord: "inidia"
 >>> print(wv.get_indices(['the', 'deepcompute', 'india']))
 [1, None, 509]
 
+
 >>> print(wv.get_indices(['the', 'deepcompute', 'india'], raise_exc=True))
 Traceback (most recent call last):
   File "/usr/lib/python3.6/code.py", line 91, in runcode
@@ -342,8 +344,33 @@ wordvecspace.exception.UnknownWord: "inidia"
 [[3844, 16727, 15811, 42731, 41516], [509, 3389, 486, 523, 7125]]
 
 # Get common nearest neighbors among given words
->>> print(wv.get_nearest(['india', 'bosnia'], 10, combination=True))
-[523, 509, 486]
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10)[0])
+['india', 'indian', 'delhi', 'subcontinent', 'hyderabad', 'pradesh', 'pakistan', 'gujarat', 'bombay', 'chhattisgarh']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10)[1])
+['pakistan', 'pakistani', 'india', 'bangladesh', 'peshawar', 'afghanistan', 'baluchistan', 'balochistan', 'kashmir', 'islamabad']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10, combination=True)[0])
+['pakistan', 'india', 'indian', 'bangladesh', 'pakistani', 'subcontinent', 'shimla', 'delhi', 'punjab', 'ladakh']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10, combination=True, weights=[1, 0])[0])
+['india', 'indian', 'delhi', 'subcontinent', 'hyderabad', 'pradesh', 'pakistan', 'gujarat', 'bombay', 'chhattisgarh']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10, combination=True, weights=[0, 1])[0])
+['pakistan', 'pakistani', 'india', 'bangladesh', 'peshawar', 'afghanistan', 'baluchistan', 'balochistan', 'kashmir', 'islamabad']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10, combination=True, weights=[0.7, 0.3])[0])
+['india', 'pakistan', 'indian', 'subcontinent', 'delhi', 'bangladesh', 'hyderabad', 'shimla', 'punjab', 'bengal']
+>>> wv.get_words(wv.get_nearest(['india', 'pakistan'], 10, combination=True, weights=[0.3, 0.7])[0])
+['pakistan', 'india', 'pakistani', 'bangladesh', 'subcontinent', 'indian', 'shimla', 'punjab', 'kashmir', 'ladakh']
+
+# Get nearest with vector(s)
+>>> wv.get_words(wv.get_nearest(wv.get_vector('india').reshape(1, wv.dim), k=5))
+['india', 'indian', 'subcontinent', 'bombay', 'bengal']
+>>> wv.get_words(wv.get_nearest(wv.get_vectors(['india', 'pakistan']), k=5)[0])
+['india', 'indian', 'subcontinent', 'bombay', 'bengal']
+>>> wv.get_words(wv.get_nearest(wv.get_vectors(['india', 'pakistan']), k=5)[1])
+['pakistan', 'pakistani', 'kargil', 'afghanistan', 'bangladesh']
+>>> wv.get_words(wv.get_nearest(wv.get_vectors(['india', 'pakistan']), k=5, combination=True)[0])
+['india', 'pakistan', 'indian', 'pakistani', 'subcontinent']
+>>> wv.get_words(wv.get_nearest(wv.get_vectors(['india', 'pakistan']), k=5, combination=True, weights=[0.4, 0.6])[0])
+['pakistan', 'india', 'pakistani', 'kargil', 'indian']
+
 ```
 
 ## Service
diff --git a/setup.py b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages
 
-version = '0.5.3'
+version = '0.5.4'
 setup(
     name="wordvecspace",
     python_requires='>3.5.1',
diff --git a/test.py b/test.py
@@ -7,6 +7,7 @@
 from wordvecspace import mem
 from wordvecspace import annoy
 from wordvecspace import disk
+from wordvecspace import wvspace
 
 def suite_test():
     suite = unittest.TestSuite()
@@ -15,6 +16,7 @@ def suite_test():
     suite.addTests(doctest.DocTestSuite(mem))
     suite.addTests(doctest.DocTestSuite(annoy))
     suite.addTests(doctest.DocTestSuite(disk))
+    suite.addTests(doctest.DocTestSuite(wvspace))
 
     return suite
 
@@ -23,3 +25,4 @@ def suite_test():
     doctest.testmod(mem)
     doctest.testmod(annoy)
     doctest.testmod(disk)
+    doctest.testmod(wvspace)
diff --git a/wordvecspace/wvspace.py b/wordvecspace/wvspace.py
@@ -16,8 +16,10 @@
 # $export WORDVECSPACE_DATADIR=/path/to/data/
 DATAFILE_ENV_VAR = os.environ.get('WORDVECSPACE_DATADIR', '')
 
+
 class WordVecSpace(WordVecSpaceBase):
     METRIC = 'cosine'
+    DEFAULT_K = 512
 
     def __init__(self, input_dir: str, metric: str=METRIC) -> None:
         self._f = WordVecSpaceFile(input_dir, mode='r')
@@ -40,6 +42,7 @@ def _make_array(self, shape, dtype):
     def _check_index_or_word(self, item):
         if isinstance(item, str):
             return self.get_index(item)
+
         return item
 
     def _check_indices_or_words(self, items):
@@ -54,14 +57,17 @@ def _check_indices_or_words(self, items):
         if isinstance(w, (list, tuple)):
             if isinstance(w[0], str):
                 return self.get_indices(w)
+
         return w
 
     def _check_vec(self, v, normalised=False):
-        if isinstance(v, np.ndarray) and len(v.shape) == 2 and v.dtype==np.float32:
+        if isinstance(v, np.ndarray) and len(v.shape) == 2 and v.dtype == np.float32:
             if normalised:
                 m = np.linalg.norm(v)
                 return v / m
+
             return v
+
         else:
             if isinstance(v, (list, tuple)):
                 return self.get_vectors(v, normalized=normalised)
@@ -133,8 +139,8 @@ def get_vectors(self, words_or_indices: list, normalized: bool=False) -> np.ndar
 
         return np.multiply(vecs.T, mags).T
 
-    def get_distance(self, word_or_index1: Union[int, str],\
-                    word_or_index2: Union[int, str], metric: str='cosine') -> float:
+    def get_distance(self, word_or_index1: Union[int, str],
+                     word_or_index2: Union[int, str], metric: str='cosine') -> float:
 
         w1 = word_or_index1
         w2 = word_or_index2
@@ -167,8 +173,10 @@ def _check_r_and_c(self, r, c, m):
 
         return m, r, c
 
-    def get_distances(self, row_words_or_indices: Union[list, np.ndarray],\
-                    col_words_or_indices: Union[list, None, np.ndarray]=None, metric=None) -> np.ndarray:
+    def get_distances(self,
+                    row_words_or_indices: Union[list, np.ndarray],
+                    col_words_or_indices: Union[list, None, np.ndarray]=None,
+                    metric=None) -> np.ndarray:
 
         r = row_words_or_indices
         c = col_words_or_indices
@@ -186,7 +194,6 @@ def get_distances(self, row_words_or_indices: Union[list, np.ndarray],\
                 nvecs, dim = col_vectors.shape
 
                 vec_out = self._make_array((len(col_vectors), len(row_vectors)), dtype=np.float32)
-
                 res = self._perform_sgemv(row_vectors, col_vectors, vec_out, nvecs, dim)
 
             else:
@@ -205,33 +212,52 @@ def get_distances(self, row_words_or_indices: Union[list, np.ndarray],\
 
             return distance.cdist(row_vectors, col_vectors, 'euclidean')
 
-    DEFAULT_K = 512
-
-    def get_nearest(self, v_w_i: list, k: int=DEFAULT_K,\
-                    distances: bool=False, combination: bool=False,\
-                    metric: str='cosine') -> np.ndarray:
-
-        d = self.get_distances(v_w_i, metric=metric)
+    def _nearest_sorting(self, d, k):
 
         ner = self._make_array(shape=(len(d), k), dtype=np.uint32)
         dist = self._make_array(shape=(len(d), k), dtype=np.float32)
 
         for index, p in enumerate(d):
+            # FIXME: better variable name for b_sort
             b_sort = bottleneck.argpartition(p, k)[:k]
-            pr_dist = np.take(d, b_sort)
+            pr_dist = np.take(p, b_sort)
 
+            # FIXME: better variable name for a_sorted
             a_sorted = np.argsort(pr_dist)
             indices = np.take(b_sort, a_sorted)
 
             ner[index] = indices
             dist[index] = np.take(p, indices)
 
-        if combination:
-            ner = set(ner[0]).intersection(*ner)
-            return (ner, dist) if distances else ner
+        return ner, dist
+
+    def get_nearest(self, v_w_i: list,
+                    k: int=DEFAULT_K,
+                    distances: bool=False,
+                    combination: bool=False,
+                    weights: list=None,
+                    metric: str='cosine') -> np.ndarray:
+
+        d = self.get_distances(v_w_i, metric=metric)
+
+        if not weights:
+            weights = np.ones(len(v_w_i))
+
+        if combination and len(weights) == len(v_w_i):
+            weights = np.array(weights)
+            w_d = np.dot(weights, d)
+            nearest_indices, dist = self._nearest_sorting(w_d.reshape(1, len(w_d)), k)
+
+            if distances:
+                return nearest_indices, dist
+
+            else:
+                return nearest_indices
+
+        nearest_indices, dist = self._nearest_sorting(d, k)
+
+        if isinstance(v_w_i, (list, tuple)) or isinstance(v_w_i, np.ndarray) and len(v_w_i) > 1:
+            return (nearest_indices, dist) if distances else nearest_indices
 
-        if isinstance(v_w_i, (list, tuple)) or \
-            isinstance(v_w_i, np.ndarray) and len(v_w_i) > 1:
-            return (ner, dist) if distances else ner
         else:
-            return (ner[0], dist[0]) if distances else ner[0]
+            return (nearest_indices[0], dist[0]) if distances else nearest_indices[0]