22
33import itertools
44import os
5+ import pickle
56import random
6- import json
77from math import floor
8- import pickle
98
10- import numpy as np
11- from dill import dump , load
12- from scipy import spatial
13- from sklearn .feature_extraction .text import CountVectorizer , TfidfVectorizer
14- from sklearn .metrics import pairwise_distances
15- from sklearn .metrics .pairwise import linear_kernel , cosine_similarity
16- from scipy .spatial .distance import cdist
179from annoy import AnnoyIndex
18- from tqdm import tqdm
1910from multiprocess import cpu_count
11+ from sklearn .feature_extraction .text import CountVectorizer , TfidfVectorizer
12+ from tqdm import tqdm
2013
2114
2215class savedTexts :
@@ -110,16 +103,15 @@ def sample_corpus(self):
110103
def build_annoy_index(self):
    """Build an in-memory Annoy index over the document vectors.

    Every row of ``self.sklearn_vector_space`` is added to a new
    ``AnnoyIndex`` (angular metric) under its row position, so Annoy item
    ids match document row numbers. The index is then built with 1000
    trees and kept on ``self.annoy_index``.
    """
    print("Building Annoy index of document vectors...", flush=True)
    vector_space = self.sklearn_vector_space
    self.annoy_index = AnnoyIndex(vector_space.shape[1], "angular")
    for i, doc_vector in tqdm(
        enumerate(vector_space),
        total=vector_space.shape[0],
        desc="Adding document vectors to Annoy index",
        leave=False,
    ):
        # Rows are densified with .toarray() before insertion (the vector
        # space is presumably a scipy sparse matrix -- confirm with caller).
        self.annoy_index.add_item(i, doc_vector[0].toarray()[0])
    # cpu_count() - 1 is 0 on a single-core machine; always request at
    # least one build thread while still leaving a core free otherwise.
    self.annoy_index.build(1000, n_jobs=max(1, cpu_count() - 1))
123115
124116 def docs_for_word (self , word_id ):
125117 ids = []
@@ -144,13 +136,9 @@ def id_for_word(self, word_id):
144136 return - 1
145137
def similar_docs_by_vector(self, doc_id, num_docs):
    """Return up to ``num_docs`` nearest neighbours of ``doc_id``.

    Queries ``self.annoy_index`` for ``num_docs + 1`` items (one extra to
    make room for the query document itself), drops ``doc_id`` from the
    hits, and returns ``(doc, score)`` pairs in the order the index
    produced them, where ``score`` is the distance reported by the index.
    """
    docs, scores = self.annoy_index.get_nns_by_item(
        doc_id, num_docs + 1, include_distances=True
    )
    neighbours = [(doc, score) for doc, score in zip(docs, scores) if doc != doc_id]
    # If the query document was not among the hits, nothing was filtered
    # out and there is one result too many; never return more than asked.
    return neighbours[:max(num_docs, 0)]
152141
def similar_docs_by_topic_distribution(self, doc_id, num_docs, topic_model):
    """Return up to ``num_docs`` neighbours of ``doc_id`` from a topic model's index.

    Queries ``topic_model.annoy_index`` for ``num_docs + 1`` items (one
    extra to make room for the query document itself), drops ``doc_id``
    from the hits, and returns ``(doc, score)`` pairs in the order the
    index produced them, where ``score`` is the distance it reported.
    """
    docs, scores = topic_model.annoy_index.get_nns_by_item(
        doc_id, num_docs + 1, include_distances=True
    )
    neighbours = [(doc, score) for doc, score in zip(docs, scores) if doc != doc_id]
    # If the query document was not among the hits, nothing was filtered
    # out and there is one result too many; never return more than asked.
    return neighbours[:max(num_docs, 0)]
156-
0 commit comments