Skip to content

Commit 372da1b

Browse files
author
RamanjaneyuluIdavalapati
committed
wordvecspace with conversion tool
1 parent 5d02326 commit 372da1b

6 files changed

Lines changed: 272 additions & 38 deletions

File tree

.travis.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,11 @@ deploy:
2121
- wordvecspace/__init__.py
2222
- wordvecspace/wordvecspace.py
2323
- test.py
24-
- .travis.py
24+
- wordvecspace/command.py
25+
- wordvecspace/convert.py
2526
- wordvecspace/cuda.py
26-
name: wordvecspace-0.3
27-
tag_name: 0.3
27+
name: wordvecspace-0.4
28+
tag_name: wordvecspace-0.4
2829
on:
2930
repo: deep-compute/wordvecspace
3031
# pypitest

README.md

Lines changed: 78 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,63 @@ A high performance pure python module that helps in loading and performing opera
77
88
```bash
99
sudo apt install libopenblas-base
10+
sudo pip install wordvecspace
1011
```
1112

1213
## Usage
1314

15+
### Preparing data
16+
17+
Before we can start using the library, we need access to some
18+
word vector space data. Here are two ways to get that.
19+
20+
#### Download pre-computed sample data
21+
22+
```bash
23+
$ wget https://s3.amazonaws.com/deepcompute-public/data/wordvecspace/small_test_data.tgz
24+
$ tar zxvf small_test_data.tgz
25+
```
26+
27+
> NOTE: We got this data by downloading the `text8` corpus
28+
> from this location (http://mattmahoney.net/dc/text8.zip) and converting that to `WordVecSpace`
29+
> format. You can do the same conversion process by reading
30+
> the instructions in the following section.
31+
32+
#### Computing your own data
33+
34+
You can compute a word vector space on an arbitrary text corpus
35+
by using Google's word2vec tool. Here is an example of how to do
36+
that for the sample `text8` corpus.
37+
38+
```bash
39+
$ git clone https://github.com/tmikolov/word2vec.git
40+
41+
# 1. Navigate to the folder word2vec
42+
# 2. open demo-word.sh for editing
43+
# 3. Change the line "time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15" to "time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 5 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -save-vocab vocab.txt -iter 15". This sets the vector size to 5 and adds -save-vocab so that vocab.txt is also written as output.
44+
# 4. Run demo-word.sh
45+
46+
$ chmod +x demo-word.sh
47+
$ ./demo-word.sh
48+
49+
# This will produce the output files (vectors.bin and vocab.txt)
50+
```
51+
52+
These files (vectors.bin and vocab.txt) cannot be directly loaded
53+
by the `wordvecspace` module. You'll first have to convert them
54+
to the `WordVecSpace` format.
55+
56+
57+
```bash
58+
$ wordvecspace convert <input_dir> <output_dir>
59+
60+
# <input_dir> is the directory which has vocab.txt and vectors.bin
61+
# <output_dir> is the directory where you want to put your output files
62+
63+
# You can also generate shards by specifying the number of vectors per shard
64+
$ wordvecspace convert <input_dir> <output_dir> -n 5000
65+
```
66+
1467
### Importing
1568
```python
1669
>>> from wordvecspace import WordVecSpace
@@ -53,18 +106,18 @@ wordvecspace.UnknownWord: "inidia"
53106
# Get the word vector for the word "india"
54107

55108
>>> print wv.get_word_vector("india")
56-
[-6.44819975 -2.16358566 5.72767735 -3.77464485 3.58295298]
109+
[-6.4482 -2.1636 5.7277 -3.7746 3.583 ]
57110

58111
# Get the unit (normalized) word vector for the word "india"
59112
>>> print wv.get_word_vector("india", normalized=True)
60-
[-0.62585545 -0.20999533 0.55592233 -0.36636305 0.34775764]
113+
[-0.6259 -0.21 0.5559 -0.3664 0.3478]
61114

62115
>>> print wv.get_word_vector("india")
63-
[-6.44819975 -2.16358566 5.72767735 -3.77464485 3.58295298]
116+
[-6.4482 -2.1636 5.7277 -3.7746 3.583 ]
64117

65118
# Get the unit (normalized) word vector for the word "india"
66119
>>> print wv.get_word_vector("india", normalized=True)
67-
[-0.62585545 -0.20999533 0.55592233 -0.36636305 0.34775764]
120+
[-0.6259 -0.21 0.5559 -0.3664 0.3478]
68121

69122
# Get the unit vector for the unknown word "inidia" (raises UnknownWord because raise_exc=True)
70123
>>> print wv.get_word_vector('inidia', normalized=True, raise_exc=True)
@@ -116,65 +169,62 @@ wordvecspace.UnknownWord: "inidia"
116169
```python
117170
# Get Vector magnitude of the word "india"
118171
>>> print wv.get_vector_magnitudes("india")
119-
[ 10.30301762]
172+
[ 10.303]
120173

121174
>>> print wv.get_vector_magnitudes(["india", "usa"])
122-
[ 10.30301762 7.36207819]
175+
[ 10.303 7.3621]
123176

124177
>>> print wv.get_vector_magnitudes(["inidia", "usa"])
125-
[ 0. 7.36207819]
178+
[ 0. 7.3621]
126179

127180
>>> print wv.get_vector_magnitudes(["india", "usa"])
128-
[ 10.30301762 7.36207819]
181+
[ 10.303 7.3621]
129182

130183
>>> print wv.get_vector_magnitudes(["inidia", "usa"])
131-
[ 0. 7.36207819]
184+
[ 0. 7.3621]
132185
```
133186

134187
### Get vectors for list of words
135188
```python
136189
# Get vectors for list of words ["usa", "india"]
137190
>>> print wv.get_word_vectors(["usa", "india"])
138-
[[-0.72164571 -0.05566886 0.41082662 0.54941767 0.07409521]
139-
[-0.62585545 -0.20999533 0.55592233 -0.36636305 0.34775764]]
191+
[[-0.7216 -0.0557 0.4108 0.5494 0.0741]
192+
[-0.6259 -0.21 0.5559 -0.3664 0.3478]]
140193
```
141194

142195
### Get distance between two words
143196
```python
144197
# Get distance between "india", "usa"
145198
>>> print wv.get_distance("india", "usa")
146-
0.516205
199+
0.48379534483
147200

148201
# Get the distance between 250, "india"
149202
>>> print wv.get_distance(250, "india")
150-
-0.163976
203+
1.16397565603
151204
```
152205

153206
### Get distance between list of words
154207
```python
155208
>>> print wv.get_distances("for", ["to", "for", "india"])
156-
[[ 0.85009682]
157-
[ 1.00000012]
158-
[-0.38545406]]
209+
[[ 1.4990e-01]
210+
[ -1.1921e-07]
211+
[ 1.3855e+00]]
159212

160213
>>> print wv.get_distances("for", ["to", "for", "inidia"])
161-
[[ 0.85009682]
162-
[ 1.00000012]
163-
[ 0. ]]
214+
[[ 1.4990e-01]
215+
[ -1.1921e-07]
216+
[ 1.0000e+00]]
164217

165218
>>> print wv.get_distances(["india", "for"], ["to", "for", "usa"])
166-
[[-0.18296985 -0.38545409 0.51620466]
167-
[ 0.85009682 1.00000012 -0.49754807]]
219+
[[ 1.1830e+00 1.3855e+00 4.8380e-01]
220+
[ 1.4990e-01 -1.1921e-07 1.4975e+00]]
168221

169222
>>> print wv.get_distances(["india", "usa"])
170-
[[-0.49026281 0.57980162 0.73099834 ..., -0.20406421 -0.35388517
171-
0.38457203]
172-
[-0.80836529 0.04589185 -0.16784868 ..., 0.4037039 -0.04579565
173-
-0.16079855]]
223+
[[ 1.4903 0.4202 0.269 ..., 1.2041 1.3539 0.6154]
224+
[ 1.8084 0.9541 1.1678 ..., 0.5963 1.0458 1.1608]]
174225

175226
>>> print wv.get_distances(["andhra"])
176-
[[-0.3432439 0.42185491 0.76944059 ..., -0.09365848 -0.13691582
177-
0.57156253]]
227+
[[ 1.3432 0.5781 0.2306 ..., 1.0937 1.1369 0.4284]]
178228
```
179229

180230
### Get nearest neighbors
@@ -216,3 +266,4 @@ $ python setup.py test
216266
The `WordVecSpace` from the `cuda` module is a drop-in replacement for the CPU based `WordVecSpace` class showcased above.
217267

218268
> NOTE: The vector space must fit in the available GPU RAM for this to work.
269+
> Also, you will need to install CUDA support by running `sudo pip install wordvecspace[cuda]`.

setup.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,15 @@ def get_long_description():
2222

2323
long_description = get_long_description()
2424

25-
version = '0.3'
25+
version = '0.4'
2626
setup(
2727
name="wordvecspace",
2828
version=version,
2929
description="A high performance pure python module that helps in"
3030
" loading and performing operations on word vector spaces"
31-
" created using Google's Word2vec tool.",
31+
" created using Google's Word2vec tool. It also supports"
32+
" converting word vector space data (vectors and vocabulary)"
33+
" from Google Word2Vec format to WordVecSpace format.",
3234
long_description=long_description,
3335
keywords='wordvecspace',
3436
author='Deep Compute, LLC',
@@ -37,12 +39,12 @@ def get_long_description():
3739
download_url="https://github.com/deep-compute/wordvecspace/tarball/%s" % version,
3840
license='MIT License',
3941
install_requires=[
40-
'numpy',
41-
'pandas',
42-
'numba',
42+
'numpy==1.13.1',
43+
'pandas==0.20.3',
44+
'numba==0.34.0',
4345
],
4446
extras_require={
45-
'cuda': ['pycuda', 'scikit-cuda'],
47+
'cuda': ['pycuda==2017.1.1', 'scikit-cuda==0.5.1'],
4648
},
4749
package_dir={'wordvecspace': 'wordvecspace'},
4850
packages=find_packages('.'),
@@ -53,5 +55,11 @@ def get_long_description():
5355
"Intended Audience :: Developers",
5456
"License :: OSI Approved :: MIT License",
5557
],
56-
test_suite='test.suite'
58+
test_suite='test.suite',
59+
entry_points={
60+
"console_scripts": [
61+
"wordvecspace = wordvecspace:main",
62+
]
63+
}
64+
5765
)

wordvecspace/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
from wordvecspace import WordVecSpace
1+
from wordvecspace import WordVecSpace
2+
from command import main

wordvecspace/command.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from basescript import BaseScript
2+
from convert import GWVec2WordVecSpace
3+
4+
class WordVecSpaceCommand(BaseScript):
    """Command-line front end for the wordvecspace package.

    Registers a ``convert`` subcommand whose handler passes the parsed
    command-line arguments to ``GWVec2WordVecSpace`` and starts it.
    """

    DESC = 'Word Vector Space command-line tool'

    # Default for -n/--num-vecs-per-shard. Per that option's help text,
    # a value of 0 keeps all vectors in a single shard.
    DEFAULT_NUM_VECS_PER_SHARD = 0

    def convert(self):
        """Handler for the ``convert`` subcommand.

        Builds a ``GWVec2WordVecSpace`` from the parsed ``input_dir``,
        ``output_dir`` and ``num_vecs_per_shard`` arguments and starts it.
        """
        opts = self.args
        GWVec2WordVecSpace(
            opts.input_dir,
            opts.output_dir,
            opts.num_vecs_per_shard,
        ).start()

    def define_subcommands(self, subcommands):
        """Register the ``convert`` subcommand after the base class's own.

        ``subcommands`` is the object handed in by the base class; it must
        provide ``add_parser`` (presumably an argparse sub-parsers action --
        confirm against BaseScript).
        """
        super(WordVecSpaceCommand, self).define_subcommands(subcommands)

        parser = subcommands.add_parser(
            'convert',
            help="Convert data in Google's Word2Vec format to WordVecSpace format",
        )

        # Positional arguments: their order on the command line follows
        # the order in which they are added here.
        parser.add_argument(
            'input_dir',
            help='Input directory containing Google Word2Vec format files'
                 ' (vocab.txt, w2v_vectors.bin)',
        )
        parser.add_argument(
            'output_dir',
            help='Output directory where WordVecSpace format files are produced',
        )

        # Optional shard size.
        parser.add_argument(
            '-n', '--num-vecs-per-shard',
            type=int,
            default=self.DEFAULT_NUM_VECS_PER_SHARD,
            help='Number of vectors per shard. 0 value ensures all vecs in one shard.',
        )

        # Dispatch target used when the ``convert`` subcommand is selected.
        parser.set_defaults(func=self.convert)
30+
31+
def main():
    """Entry point: build the command-line tool object and start it."""
    command = WordVecSpaceCommand()
    command.start()
33+
34+
if __name__ == '__main__':
35+
main()

0 commit comments

Comments
 (0)