deep-compute
diff --git a/‎.travis.yml‎
Lines changed: 4 additions & 4 deletions b/‎.travis.yml‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎README.md‎
Lines changed: 70 additions & 55 deletions b/‎README.md‎
Lines changed: 70 additions & 55 deletions
diff --git a/‎setup.py‎
Lines changed: 7 additions & 6 deletions b/‎setup.py‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎wordvecspace/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎wordvecspace/__init__.py‎
Lines changed: 1 addition & 0 deletions
@@ -4,8 +4,8 @@ python:
 before_install:
 - wget 'https://s3.amazonaws.com/deepcompute-public-data/wordvecspace/small_test_data.tgz'
   && tar xvzf small_test_data.tgz
-- export WORDVECSPACE_DATAFILE='dc.wvspace'
-install: 
+- export WORDVECSPACE_DATADIR='small_test_data'
+install:
 - pip install .[service]
 - sudo apt install libopenblas-base
 - sudo apt-get install libffi-dev
@@ -16,8 +16,8 @@ deploy:
   skip_cleanup: true
   api-key:
     secure: LmVvlW+FdYNIDlinjJ4sieONrcx1jaw18J7/mpHBD9ppIWZ+TB6H/iNqkqkh4WvULZttJrTHRYE6rQHXww7KK2UMrjVNE/TVUPaLFDeRRFvLDinAbqJkn+QJia0TuRa/26Bg9cDcvNYTghy7s37xpK2bJTEMF/eCM9b9RHYXilESYy8Z4l8IkFn5vnaDDfT5iV8xjuuOE4lsf4KC3L0xXIkYnKC/LbDVDj3B9h52TpsteL6cZtn/ExAThor5SrVymW7oMR1qrPQv8btNAdxymqJvEbjaP5RUuX7ehihev0Yge47A2X9gvxDRv+a6wM0HOvT4aGsMwCWo++fb0taWH7HUXFxSvkzKhsl74kDMmnE0WarcI/8L/3Q/zRhW1a2vAtj3O0FDHtzS/OK/k3TDk6Fh/LOvk2mTuGD3L34YxJrXxDxnt4tK2ubde8cGeA7pI5jRLNTNQXUip6Dxhr/5ZnMmG2nHI6ujjmDnucE+CHBtUmS1wjBn6ootE4pdoyti0aaA9OrVoGrf39pK7FAG38KJghqn8I3YCLoeapWjI4/DI0WIfq2Vl+v6yQar3Dn9lBLpWFLrjUmZnAx2F1e0P2y0VUg9hl0bINzIIrm2mHw4Zsl2GlMVSR033cwvcbdyeNxKMAfSV3EZBDpNuI6nlkkUZG1O72N/WV+kFRtSdQA=
-  name: wordvecspace-0.5.1
-  tag_name: 0.5.1
+  name: wordvecspace-0.5.2
+  tag_name: 0.5.2
   on:
     repo: deep-compute/wordvecspace
   # pypitest
 
@@ -1,7 +1,7 @@
 # WordVecSpace
 A high performance pure python module that helps in loading and performing operations on word vector spaces created using Google's Word2vec tool.
 
-This module has ability to load data into memory using `WordVecSpaceMem` or it can also supports performing operations on the data which is on the disk using `WordVecSpaceAnnoy`.
+This module can load data into memory using `WordVecSpaceMem`, and it can also perform operations directly on data stored on disk using `WordVecSpaceAnnoy` and `WordVecSpaceDisk`.
 
 ## Installation
 > Prerequisites: Python3.5
@@ -63,19 +63,19 @@ by the `wordvecspace` module. You'll first have to convert them
 to the `WordVecSpace` format.
 
 ```bash
-$ wordvecspace convert <input_dir> <output_file>
+$ wordvecspace convert <input_dir> <output_dir>
 
 # <input_dir> is the directory which has vocab.txt and vectors.bin
-# <output_file> is the file where you want to put your output file
+# <output_dir> is the directory where you want to store your output files.
 ```
 
 Example:
 
 ```bash
-$ wordvecspace convert /home/user/bindata /home/user/dc.wvspace
+$ wordvecspace convert /home/user/bindata /home/user/output_dir
 
 # /home/user/bindata is the directory containing vocab.txt and vectors.bin
-# dc.wvspace is the output file
+# /home/user/output_dir is the output directory which contains wordvecspace data files.
 ```
 
 ### Importing
@@ -89,59 +89,67 @@ $ wordvecspace convert /home/user/bindata /home/user/dc.wvspace
 
 ##### Load data
 ```python
->>> wv = WordVecSpaceMem('/home/user/dc.wvspace')
+>>> wv = WordVecSpaceMem('/home/user/output_dir')
 ```
 
 ##### Make get_nearest call
 ```python
 >>> wv.get_nearest('india', k=20)
-[509, 486, 523, 4343, 14208, 13942, 42424, 25578, 6212, 2475, 3560, 13508, 20919, 3389, 4484, 19995, 8776, 7012, 12191, 16619]
-
+[509, 3389, 486, 523, 7125, 16619, 4491, 12191, 6866, 8776, 15232, 14208, 5998, 21916, 5226, 6322, 4343, 6212, 10172, 6186]
 # k is for getting top k nearest values
 ```
 
 #### Types
-`wordvecspace` module can perform operations by loading data into RAM using `WordVecSpaceMem` or directly on the data which is on the disk using `WordVecSpaceAnnoy`
+`wordvecspace` module can perform operations by loading data into RAM using `WordVecSpaceMem` or directly on the data which is on the disk using `WordVecSpaceDisk`
 
-`WordVecSpaceMem` is a bruteforce algorithm which compares given word with all the words in the vector space
+`WordVecSpaceMem` and `WordVecSpaceDisk` use a brute-force algorithm which compares the given word with all the words in the vector space
 
-`WordVecSpaceAnnoy` takes wvspace file as input and creates annoy indexes in another file (index file). Using this file `annoy` gives approximate results quickly. For better understanding of `Annoy` please go through this [link](https://github.com/spotify/annoy)
+`WordVecSpaceAnnoy` takes wordvecspace output_dir as input and creates annoy indexes in another file (index file). Using this file `annoy` gives approximate results quickly. For better understanding of `Annoy` please go through this [link](https://github.com/spotify/annoy)
 
-As we have seen how to import `WordVecSpaceMem` above, let us look at `WordVecSpaceAnnoy`
+As we have seen how to import `WordVecSpaceMem` above, let us look at `WordVecSpaceAnnoy` and `WordVecSpaceDisk`
 
 ##### Import
 ```python
 >>> from wordvecspace import WordVecSpaceAnnoy
+>>> from wordvecspace import WordVecSpaceDisk
 ```
 
 ##### Load data
 ```python
-wv = WordVecSpaceAnnoy('/home/user/dc.wvspace', n_trees, index_fpath)
+>>> wv = WordVecSpaceAnnoy('/home/user/output_dir', n_trees, index_fpath)
 
 # n_trees = number of trees(More trees gives a higher precision when querying for get_nearest)
 # index_fpath = path for annoy index file
 
-# n_trees and index_fpath are optional. If those are not given then WordVecSpaceAnnoy uses `1` for n_trees and `/home/user/` (dc.wvspace file directory) directory for index_fpath.
+# n_trees and index_fpath are optional. If they are not given, WordVecSpaceAnnoy uses `1` for `n_trees` and `/home/user/output_dir` (the wordvecspace data directory) for `index_fpath`.
+
+>>> wv = WordVecSpaceDisk('/home/user/output_dir')
 ```
 
 ##### Make get_nearest call
 ```python
->>> wv.get_nearest('india', k=20)
-[509, 486, 523, 4343, 14208, 13942, 42424, 25578, 6212, 2475, 3560, 13508, 20919, 3389, 4484, 19995, 8776, 7012, 12191, 16619]
+>>> wv.get_nearest('india', k=20)  # ANNOY
+[509, 3389, 16619, 4491, 6866, 8776, 14208, 5998, 21916, 20919, 2325, 4622, 3546, 24149, 5064, 35704, 25578, 15842, 4137, 6499]
+
+>>> wv.get_nearest('india', k=20)  # DISK
+[509, 3389, 486, 523, 7125, 16619, 4491, 12191, 6866, 8776, 15232, 14208, 5998, 21916, 5226, 6322, 4343, 6212, 10172, 6186]
 ```
 
 #### Distance calculations
 `WordVecSpaceAnnoy` supports different types of distance calculations such as `"angular"`, `"euclidean"`, `"manhattan"` and `"hamming"`.
 
 `WordVecSpaceMem` supports `"angular"` and `"euclidean"` for distance calculations.
 
-Both uses `"angular"` by default. If you want to change it then you can change at the time of creating object.
+`WordVecSpaceDisk` supports `"angular"` and `"euclidean"` for distance calculations.
+
+All of the above use `"angular"` by default. If you want a different metric, you can specify it when creating the object.
 
 Example:
 
 ```bash
-wv = WordVecSpaceAnnoy('/path/to/wvspacefile', n_trees, metric="euclidean")
-wv = WordVecSpaceMem('/path/to/wvspacefile', metric="euclidean")
+wv = WordVecSpaceAnnoy('/path/to/output_dir', n_trees, metric="euclidean")
+wv = WordVecSpaceMem('/path/to/output_dir', metric="euclidean")
+wv = WordVecSpaceDisk('/path/to/output_dir', metric="euclidean")
 
 # metric = type of distance calculation
 ```
@@ -150,14 +158,14 @@ WordVecSpaceMem can also supports specifying metric at the time of calculating d
 
 Example:
 ```bash
-wv = WordVecSpaceMem('/path/to/wvspacefile', metric="euclidean")
+wv = WordVecSpaceMem('/path/to/output_dir', metric="euclidean")
 
 wv.get_distance('ap', 'india', metric='angular')
 ```
 
 #### Examples of using wordvecspace methods
 
-> WordVecSpaceMem and WordVecSpaceAnnoy have the same common methods.
+> `WordVecSpaceMem`, `WordVecSpaceAnnoy` and `WordVecSpaceDisk` support the same methods.
 
 ##### Check if a word exists or not in the word vector space
 ```python
@@ -238,25 +246,25 @@ None
 ```python
 # Get magnitude for the word "hi"
 >>> print(wv.get_vector_magnitude("hi"))
-8.7948
+1.0
 ```
 
 ##### Get vector magnitude of the words
 ```python
 # Get magnitude for the words "hi" and "india"
 >>> print(wv.get_vector_magnitudes(["hi", "india"]))
-[  8.7948  10.303 ]
+[1.0, 1.0]
 ```
 
 ##### Get vector for given word
 ```python
 # Get the word vector for a word india
 >>> print(wv.get_word_vector("india"))
-[-6.4482 -2.1636  5.7277 -3.7746  3.583 ]
+[-0.7871 -0.2993  0.3233 -0.2864  0.323 ]
 
 # Get the unit word vector for a word india
 >>> print(wv.get_word_vector("india", normalized=True))
-[-0.6259 -0.21    0.5559 -0.3664  0.3478]
+[-0.7871 -0.2993  0.3233 -0.2864  0.323 ]
 
 # Get the word vector for a word inidia.
 >>> print(wv.get_word_vector('inidia', raise_exc=True))
@@ -278,80 +286,80 @@ wordvecspace.exception.UnknownWord: "inidia"
 ##### Get vector for given words
 ```python
 >>> print(wv.get_word_vectors(["hi", "india"]))
-[[ 0.4008  0.3623 -0.013   0.8395  0.0562]
- [-0.4975 -0.134   0.7874 -0.3274  0.0857]]
+[[ 0.6342  0.2268 -0.3904  0.0368  0.6266]
+ [-0.7871 -0.2993  0.3233 -0.2864  0.323 ]]
 >>> print(wv.get_word_vectors(["hi", "inidia"]))
-[[ 0.4008  0.3623 -0.013   0.8395  0.0562]
+[[ 0.6342  0.2268 -0.3904  0.0368  0.6266]
  [ 0.      0.      0.      0.      0.    ]]
 ```
 
 ##### Get distance between two words
 ```python
 # Get distance between "india", "usa"
 >>> print(wv.get_distance("india", "usa"))
-0.48379534483
+0.37698328495
 
 # Get the distance between 250, "india"
 >>> print(wv.get_distance(250, "india"))
-1.16397565603
+1.1418992728
 
 # Get the euclidean distance between 250, "india" for WordvecSpaceMem
 >>> print(wv.get_distance(250, "india", metric='euclidean'))
-12.04961109161377
+1.5112241506576538
 ```
 
 ##### Get distance between list of words
 
 ```python
 >>> print(wv.get_distances("for", ["to", "for", "india"]))
-[[ 0.381   0.      0.9561]]
+[[  2.7428e-01   5.9605e-08   1.1567e+00]]
 
 >>> print(wv.get_distances("for", ["to", "for", "inidia"]))
-[[ 0.381  0.     1.   ]]
+[[  2.7428e-01   5.9605e-08   1.0000e+00]]
 
 >>> print(wv.get_distances(["india", "for"], ["to", "for", "usa"]))
-[[ 1.0685  0.9561  0.3251]
- [ 0.381   0.      1.4781]]
+[[  1.1445e+00   1.1567e+00   3.7698e-01]
+ [  2.7428e-01   5.9605e-08   1.6128e+00]]
 
 >>> print(wv.get_distances(["india", "usa"]))
-[[ 1.3853  0.4129  0.3149 ...,  1.1231  1.4595  0.7912]
- [ 1.3742  0.9549  1.0354 ...,  0.5556  1.0847  1.0832]]
+[[ 1.5464  0.4876  0.3017 ...,  1.2492  1.2451  0.8925]
+ [ 1.0436  0.9995  1.0913 ...,  0.6996  0.8014  1.1608]]
 
 >>> print(wv.get_distances(["andhra"]))
-[[ 1.2817  0.6138  0.2995 ...,  0.9945  1.224   0.6137]]
+[[ 1.5418  0.7153  0.277  ...,  1.1657  1.0774  0.7036]]
 
 # For WordVecSpaceMem
 >>> print(wv.get_distances(["andhra"], metric='euclidean'))
-[[ 9.0035  8.3985  7.1658 ...,  9.2236  9.6078  8.6349]]
+[[ 1.756   1.1961  0.7443 ...,  1.5269  1.4679  1.1862]]
 ```
 
 ##### Get nearest
 ```python
 # Get nearest for given word or index
 >>> print(wv.get_nearest("india", 20))
-[509, 486, 523, 4343, 14208, 13942, 42424, 25578, 6212, 2475, 3560, 13508, 20919, 3389, 4484, 19995, 8776, 7012, 12191, 16619]
+[509, 3389, 486, 523, 7125, 16619, 4491, 12191, 6866, 8776, 15232, 14208, 5998, 21916, 5226, 6322, 4343, 6212, 10172, 6186]
 
 # Get nearest for given words or indices
 >>> print(wv.get_nearest(["ram", "india"], 5))
-[[3844, 38851, 25381, 10830, 17049], [509, 486, 523, 4343, 14208]]
+[[3844, 16727, 15811, 42731, 41516], [509, 3389, 486, 523, 7125]]
 
 # Get nearest using euclidean distance for WordVecSpaceMem
 >>> print(wv.get_nearest(["ram", "india"], 5, metric='euclidean'))
-[[3844, 25381, 27802, 17049, 38851], [509, 486, 14208, 523, 13942]]
+[[3844, 16727, 15811, 42731, 41516], [509, 3389, 486, 523, 7125]]
 
 # Get common nearest neighbors among given words
 >>> print(wv.get_nearest(['india', 'bosnia'], 10, combination=True))
-[14208, 486, 523, 4343, 42424, 509]
+[523, 509, 486]
 ```
 
-### Service
+## Service
 
 ```bash
 # Run wordvecspace as a service (which continuously listens on some port for API requests)
-$ wordvecspace runserver <type> <input_file> --metric <metric> --port <port> --eargs <eargs>
+$ wordvecspace runserver <type> <input_dir> --metric <metric> --port <port> --eargs <eargs>
 
-# <type> is for specifying wordvecspace functionality (eg: mem or annoy).
-# <input_file> is for wordvecspace file
+# <type> is for specifying wordvecspace functionality (eg: mem, annoy or disk).
+# <input_dir> is for wordvecspace data dir
 # <metric> is to specify type for distance calculation
 # <port> is to run wordvecspace in that port
 # <eargs> is for specifying extra arguments for annoy
@@ -361,10 +369,13 @@ Example:
 
 ```bash
 # For mem
-$ wordvecspace runserver mem /home/user/dc.wvspace --metric angular --port 8000
+$ wordvecspace runserver mem /home/user/output_dir --metric angular --port 8000
+
+# For disk
+$ wordvecspace runserver disk /home/user/output_dir --metric angular --port 8000
 
 # For annoy
-$ wordvecspace runserver annoy /home/user/dc.wvspace --metric euclidean --port 8000 --eargs n_trees=1:index_fpath=/tmp
+$ wordvecspace runserver annoy /home/user/output_dir --metric euclidean --port 8000 --eargs n_trees=1:index_fpath=/tmp
 
 # Extra arguments for annoy are n_trees and index_fpath
 #   - n_trees is the number of trees for annoy
@@ -415,20 +426,24 @@ $ http://localhost:8000/api/v1/get_nearest?words_or_indices=india&k=100&metric=e
 ```bash
 # wordvecspace provides command to directly interact with it
 
-$ wordvecspace interact <type> <input_file> --metric <metric> --eargs <eargs>
+$ wordvecspace interact <type> <input_dir> --metric <metric> --eargs <eargs>
 
-# <type> is for specifying wordvecspace functionality (eg: mem or annoy).
-# <input_file> is for wordvecspace file
+# <type> is for specifying wordvecspace functionality (eg: mem, disk or annoy).
+# <input_dir> is for wordvecspace data dir
 # <metric> is to specify type for distance calculation
 # <eargs> is for specifying extra arguments for annoy
 ```
 
 Example:
 ```bash
 # For mem
-$ wordvecspace interact mem /home/user/dc.wvspace --metric euclidean
+$ wordvecspace interact mem /home/user/output_dir --metric euclidean
+
+# For Disk
+$ wordvecspace interact disk /home/user/output_dir --metric euclidean
 
-$ wordvecspace interact annoy /home/user/dc.wvspace --metric angular --eargs n_trees=1:index_fpath=/tmp
+# For Annoy
+$ wordvecspace interact annoy /home/user/output_dir --metric angular --eargs n_trees=1:index_fpath=/tmp
 WordVecSpaceAnnoy console (vectors=71291 dims=5)
 >>> wv.get_nearest('india', 20)
 [509, 486, 523, 4343, 13942, 42424, 25578, 3389, 12191, 16619, 12088, 6049, 5226, 4137, 41883, 18617, 10172, 35704, 25552, 29059]
@@ -447,7 +462,7 @@ $ wget 'https://s3.amazonaws.com/deepcompute-public-data/wordvecspace/small_test
 $ tar xvzf small_test_data.tgz
 
 # Export the path of data file to the environment variables
-$ export WORDVECSPACE_DATAFILE="/home/user/dc.wvspace"
+$ export WORDVECSPACE_DATADIR="/home/user/output_dir"
 
 # Run tests
 $ python3 setup.py test
 
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages
 
-version = '0.5.1'
+version = '0.5.2'
 setup(
     name="wordvecspace",
     version=version,
@@ -19,11 +19,12 @@
         'numpy==1.13.1',
         'pandas==0.20.3',
         'numba==0.36.2',
-        'basescript==0.2.0',
-        'tables==3.4.2',
+        'basescript==0.2.1',
         'annoy==1.11.4',
-        'cmph-cffi==0.3.0',
-        'scipy==1.0.0'
+        'scipy==1.0.0',
+        'diskarray==0.1.4',
+        'diskdict==0.1',
+        'deeputil==0.2.5'
     ],
     extras_require={
         'cuda': ['pycuda==2017.1.1', 'scikit-cuda==0.5.1'],
@@ -38,7 +39,7 @@
         "Intended Audience :: Developers",
         "License :: OSI Approved :: MIT License",
     ],
-    test_suite = 'test.suite_test',
+    test_suite='test.suite_test',
     entry_points={
         "console_scripts": [
             "wordvecspace = wordvecspace:main",
 
@@ -1,5 +1,6 @@
 from .command import main
 from .base import WordVecSpace
 from .mem import WordVecSpaceMem
+from .disk import WordVecSpaceDisk
 from .annoy import WordVecSpaceAnnoy
 from .fileformat import WordVecSpaceFile