forked from ahmadika/Apache_Sparkler_Post_Processing
-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathword2vec_script.py
More file actions
62 lines (49 loc) · 1.67 KB
/
Copy pathword2vec_script.py
File metadata and controls
62 lines (49 loc) · 1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
'''Build a word2vec model from a folder of text files extracted from a given set of URLs.
Two kinds of model can be built: one with the standalone `word2vec` Python package (the default) and one with Gensim. To build the Gensim model, uncomment the Gensim code below.'''
import os
#from gensim.models import Word2Vec
import word2vec
import codecs
import numpy as np
from nltk.corpus import stopwords
def mergeAllContents(source_dir="otherstotext/", output_path="all200Files.txt"):
    """Concatenate every file in *source_dir* into a single corpus file.

    Args:
        source_dir: Folder holding the extracted text files. Defaults to
            the previously hard-coded ``"otherstotext/"``.
        output_path: File that receives the merged text; it is truncated
            before writing. Defaults to ``"all200Files.txt"``.

    Returns:
        None. The merged text is written to *output_path*.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the merged
    # corpus reproducible from run to run.
    names = sorted(os.listdir(source_dir))

    # Context managers guarantee both the output and each input file are
    # flushed and closed, even if a read fails part-way through.
    with open(output_path, "w") as merged:
        for name in names:
            path = os.path.join(source_dir, str(name))
            if not os.path.isfile(path):
                # Sub-directories cannot be opened as text; skip them.
                continue
            with open(path, "r") as part:
                text = part.read()
            merged.write(text)
            # Keep the last word of this file from fusing with the first
            # word of the next one when the file lacks a trailing newline.
            if text and not text.endswith("\n"):
                merged.write("\n")
def read_lines(file_lines, stop_words=None):
    """Read a text file and return its lines as stop-word-free token lists.

    Each line is split on whitespace and every token found in *stop_words*
    is dropped. Lines that end up with no tokens (blank lines, or lines made
    only of stop words) are omitted from the result.

    Args:
        file_lines: Path of the text file to read.
        stop_words: Optional iterable of tokens to remove. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        A 1-D ``numpy.ndarray`` with ``dtype=object`` whose elements are the
        per-line lists of remaining tokens.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    else:
        # A set gives O(1) membership tests regardless of what was passed in.
        stop_words = set(stop_words)

    sentences = []
    with open(file_lines) as f:
        for line in f:
            kept = [token for token in line.split() if token not in stop_words]
            if kept:
                sentences.append(kept)

    # Sentences have different lengths, so np.asarray(sentences) would raise
    # on current NumPy (ragged input). Fill an object array element by
    # element so each entry stays a plain Python list of tokens.
    result = np.empty(len(sentences), dtype=object)
    for index, sentence in enumerate(sentences):
        result[index] = sentence
    return result
# Step 1: merge every extracted text file into the single training corpus.
mergeAllContents()

# ---------------------------------------------------------------------------
# Optional: build the model with Gensim instead.
# Lines starting with "# #" are remarks; uncomment the single-"#" lines to run.
# ---------------------------------------------------------------------------
# # define training data
# sentences = read_lines("all200Files.txt")
# # train model
# model = Word2Vec(sentences, min_count=100)
# # summarize the loaded model
# print(model)
# # summarize vocabulary
# words = list(model.wv.vocab)
# print(words)
# # access vector for one word
# print(model['protection'])
# # save model
# model.save('ocean_gensim.bin')
# # Loading Gensim Model
# new_model = Word2Vec.load('ocean_gensim.bin')

# Step 2: build the model with the standalone `word2vec` package.
corpus_file = 'all200Files.txt'
phrases_file = 'ocean-full-phrases'
model_file = 'ocean.bin'

# Run the word2phrase pass over the merged corpus, then train 500-dimensional
# vectors on its output, ignoring words seen fewer than 5 times.
word2vec.word2phrase(corpus_file, phrases_file, verbose=True)
word2vec.word2vec(phrases_file, model_file, size=500, verbose=True, min_count=5)

# Step 3: reload the trained binary model and inspect it.
model = word2vec.load(model_file, kind='bin', encoding="ISO-8859-1")
word = 'ocean'
print(model[word])
print(model.vectors.shape)