wordvecspace with conversion tool

RamanjaneyuluIdavalapati · RamanjaneyuluIdavalapati · commit 372da1bd326c · 2017-09-11T17:45:34.000+05:30
diff --git a/.travis.yml b/.travis.yml
@@ -21,10 +21,11 @@ deploy:
     - wordvecspace/__init__.py
     - wordvecspace/wordvecspace.py
     - test.py
-    - .travis.py
+    - wordvecspace/command.py
+    - wordvecspace/convert.py
     - wordvecspace/cuda.py
-  name: wordvecspace-0.3
-  tag_name: 0.3
+  name: wordvecspace-0.4
+  tag_name: wordvecspace-0.4
   on:
     repo: deep-compute/wordvecspace
   # pypitest
diff --git a/README.md b/README.md
@@ -7,10 +7,63 @@ A high performance pure python module that helps in loading and performing opera
 
 ```bash
 sudo apt install libopenblas-base
+sudo pip install wordvecspace
 ```
 
 ## Usage
 
+### Preparing data
+
+Before we can start using the library, we need access to some
+word vector space data. Here are two ways to get that.
+
+#### Download pre-computed sample data
+
+```bash
+$ wget https://s3.amazonaws.com/deepcompute-public/data/wordvecspace/small_test_data.tgz
+$ tar zxvf small_test_data.tgz
+```
+
+> NOTE: We got this data by downloading the `text8` corpus
+> from this location (http://mattmahoney.net/dc/text8.zip) and converting that to `WordVecSpace`
+> format. You can reproduce this conversion by following
+> the instructions in the next section.
+
+#### Computing your own data
+
+You can compute a word vector space on an arbitrary text corpus
+by using Google's word2vec tool. Here is an example of how to do
+that for the sample `text8` corpus.
+
+```bash
+$ git clone https://github.com/tmikolov/word2vec.git 
+
+# 1. Navigate to the folder word2vec
+# 2. Open demo-word.sh for editing
+# 3. Change the line "time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15" to "time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 5 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -save-vocab vocab.txt -iter 15" so that a vocab.txt file is also produced as output (note that this also reduces -size from 200 to 5).
+# 4. Run demo-word.sh
+
+$ chmod +x demo-word.sh
+$ ./demo-word.sh
+
+# This will produce the output files (vectors.bin and vocab.txt)
+```
+
+These files (vectors.bin and vocab.txt) cannot be directly loaded
+by the `wordvecspace` module. You'll first have to convert them
+to the `WordVecSpace` format.
+
+
+```bash
+$ wordvecspace convert <input_dir> <output_dir>
+
+# <input_dir> is the directory which has vocab.txt and vectors.bin
+# <output_dir> is the directory where you want to put your output files
+
+# You can also generate shards by specifying the number of vectors per shard
+$ wordvecspace convert <input_dir> <output_dir> -n 5000
+```
+
 ### Importing
 ```python
 >>> from wordvecspace import WordVecSpace
@@ -53,18 +106,18 @@ wordvecspace.UnknownWord: "inidia"
 # Get the word vector for a word india
 
 >>> print wv.get_word_vector("india")
-[-6.44819975 -2.16358566  5.72767735 -3.77464485  3.58295298]
+[-6.4482 -2.1636  5.7277 -3.7746  3.583 ]
 
 # Get the unit word vector for a word india
 >>> print wv.get_word_vector("india", normalized=True)
-[-0.62585545 -0.20999533  0.55592233 -0.36636305  0.34775764]
+[-0.6259 -0.21    0.5559 -0.3664  0.3478]
 
 >>> print wv.get_word_vector("india")
-[-6.44819975 -2.16358566  5.72767735 -3.77464485  3.58295298]
+[-6.4482 -2.1636  5.7277 -3.7746  3.583 ]
 
 # Get the unit word vector for a word india
 >>> print wv.get_word_vector("india", normalized=True)
-[-0.62585545 -0.20999533  0.55592233 -0.36636305  0.34775764]
+[-0.6259 -0.21    0.5559 -0.3664  0.3478]
 
 # Get the unit vector for a word inidia.
 >>> print wv.get_word_vector('inidia', normalized=True, raise_exc=True)
@@ -116,65 +169,62 @@ wordvecspace.UnknownWord: "inidia"
 ```python
 # Get Vector magnitude of the word "india"
 >>> print wv.get_vector_magnitudes("india")
-[ 10.30301762]
+[ 10.303]
 
 >>> print wv.get_vector_magnitudes(["india", "usa"])
-[ 10.30301762   7.36207819]
+[ 10.303   7.3621]
 
 >>> print wv.get_vector_magnitudes(["inidia", "usa"])
-[ 0.          7.36207819]
+[ 0.          7.3621]
 
 >>> print wv.get_vector_magnitudes(["india", "usa"])
-[ 10.30301762   7.36207819]
+[ 10.303    7.3621]
 
 >>> print wv.get_vector_magnitudes(["inidia", "usa"])
-[ 0.          7.36207819]
+[ 0.          7.3621]
 ```	
 
 ### Get vectors for list of words
 ```python
 # Get vectors for list of words ["usa", "india"]
 >>> print wv.get_word_vectors(["usa", "india"])
-[[-0.72164571 -0.05566886  0.41082662  0.54941767  0.07409521]
- [-0.62585545 -0.20999533  0.55592233 -0.36636305  0.34775764]]
+[[-0.7216 -0.0557  0.4108  0.5494  0.0741]
+ [-0.6259 -0.21    0.5559 -0.3664  0.3478]]
 ```
 
 ### Get distance between two words 
 ```python
 # Get distance between "india", "usa"
 >>> print wv.get_distance("india", "usa")
-0.516205
+0.48379534483
 
 # Get the distance between 250, "india"
 >>> print wv.get_distance(250, "india")
--0.163976
+1.16397565603
 ```
 
 ### Get distance between list of words
 ```python
 >>> print wv.get_distances("for", ["to", "for", "india"])
-[[ 0.85009682]
- [ 1.00000012]
- [-0.38545406]]
+[[  1.4990e-01]
+ [ -1.1921e-07]
+ [  1.3855e+00]]
 
 >>> print wv.get_distances("for", ["to", "for", "inidia"])
-[[ 0.85009682]
- [ 1.00000012]
- [ 0.        ]]
+[[  1.4990e-01]
+ [ -1.1921e-07]
+ [  1.0000e+00]]
 
 >>> print wv.get_distances(["india", "for"], ["to", "for", "usa"])
-[[-0.18296985 -0.38545409  0.51620466]
- [ 0.85009682  1.00000012 -0.49754807]]
+[[  1.1830e+00   1.3855e+00   4.8380e-01]
+ [  1.4990e-01  -1.1921e-07   1.4975e+00]]
 
 >>> print wv.get_distances(["india", "usa"])
-[[-0.49026281  0.57980162  0.73099834 ..., -0.20406421 -0.35388517
-   0.38457203]
- [-0.80836529  0.04589185 -0.16784868 ...,  0.4037039  -0.04579565
-  -0.16079855]]
+[[ 1.4903  0.4202  0.269  ...,  1.2041  1.3539  0.6154]
+ [ 1.8084  0.9541  1.1678 ...,  0.5963  1.0458  1.1608]]
 
 >>> print wv.get_distances(["andhra"])
-[[-0.3432439   0.42185491  0.76944059 ..., -0.09365848 -0.13691582
-   0.57156253]]
+[[ 1.3432  0.5781  0.2306 ...,  1.0937  1.1369  0.4284]]
 ```
 
 ### Get nearest neighbors 
@@ -216,3 +266,4 @@ $ python setup.py test
 The `WordVecSpace` from the `cuda` module is a drop-in replacement for the CPU based `WordVecSpace` class showcased above.
 
 > NOTE: The vector space size must fit on available GPU ram for this to work
+> Also, you will need to install CUDA support by running `sudo pip install wordvecspace[cuda]`
diff --git a/setup.py b/setup.py
@@ -22,13 +22,15 @@ def get_long_description():
 
 long_description = get_long_description()
 
-version = '0.3'
+version = '0.4'
 setup(
     name="wordvecspace",
     version=version,
     description="A high performance pure python module that helps in"
                 " loading and performing operations on word vector spaces"
-                " created using Google's Word2vec tool.",
+                " created using Google's Word2vec tool. It also supports"
+                " converting word vector space data (vectors and vocabulary)"
+                " from Google Word2Vec format to WordVecSpace format.",
     long_description=long_description,
     keywords='wordvecspace',
     author='Deep Compute, LLC',
@@ -37,12 +39,12 @@ def get_long_description():
     download_url="https://github.com/deep-compute/wordvecspace/tarball/%s" % version,
     license='MIT License',
     install_requires=[
-        'numpy',
-        'pandas',
-        'numba',
+        'numpy==1.13.1',
+        'pandas==0.20.3',
+        'numba==0.34.0',
     ],
     extras_require={
-        'cuda': ['pycuda', 'scikit-cuda'],
+        'cuda': ['pycuda==2017.1.1', 'scikit-cuda==0.5.1'],
     },
     package_dir={'wordvecspace': 'wordvecspace'},
     packages=find_packages('.'),
@@ -53,5 +55,11 @@ def get_long_description():
         "Intended Audience :: Developers",
         "License :: OSI Approved :: MIT License",
     ],
-    test_suite='test.suite'
+    test_suite='test.suite',
+    entry_points={
+        "console_scripts": [
+            "wordvecspace = wordvecspace:main",
+        ]
+    }
+
 )
diff --git a/wordvecspace/__init__.py b/wordvecspace/__init__.py
@@ -1 +1,2 @@
-from wordvecspace import WordVecSpace
+from wordvecspace import WordVecSpace
+from command import main
diff --git a/wordvecspace/command.py b/wordvecspace/command.py
@@ -0,0 +1,35 @@
+from basescript import BaseScript
+from convert import GWVec2WordVecSpace
+
+class WordVecSpaceCommand(BaseScript):
+    DESC = 'Word Vector Space command-line tool'
+
+    def convert(self):
+        convertor = GWVec2WordVecSpace(
+                        self.args.input_dir,
+                        self.args.output_dir,
+                        self.args.num_vecs_per_shard)
+        convertor.start()
+
+    DEFAULT_NUM_VECS_PER_SHARD = 0
+
+    def define_subcommands(self, subcommands):
+        super(WordVecSpaceCommand, self).define_subcommands(subcommands)
+
+        convert_cmd = subcommands.add_parser('convert',
+            help='Convert data in Google\'s Word2Vec format to WordVecSpace format')
+        convert_cmd.set_defaults(func=self.convert)
+        convert_cmd.add_argument('input_dir',
+            help='Input directory containing Google Word2Vec format files'
+                 ' (vocab.txt, w2v_vectors.bin)')
+        convert_cmd.add_argument('output_dir',
+            help='Output directory where WordVecSpace format files are produced')
+        convert_cmd.add_argument('-n', '--num-vecs-per-shard',
+            default=self.DEFAULT_NUM_VECS_PER_SHARD, type=int,
+            help='Number of vectors per shard. 0 value ensures all vecs in one shard.')
+
+def main():
+    WordVecSpaceCommand().start()
+
+if __name__ == '__main__':
+    main()
diff --git a/wordvecspace/convert.py b/wordvecspace/convert.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-from wordvecspace import WordVecSpace`
	`1`	`+from wordvecspace import WordVecSpace`
	`2`	`+from command import main`