@@ -62,6 +62,15 @@ def infer_and_replace(self, corpus):
6262 self .document_topic_matrix = coo_matrix ((data , (row , col )), shape = (self .corpus .size , self .nb_topics )).tocsr ()
6363 topic_frequencies = np .sum (self .document_topic_matrix .transpose (), axis = 1 )
6464 self .topic_frequencies = topic_frequencies / np .sum (topic_frequencies )
65+ self .annoy_index = AnnoyIndex (self .document_topic_matrix .shape [1 ], "angular" )
66+ for i , doc_vector in tqdm (
67+ enumerate (self .document_topic_matrix ),
68+ total = self .document_topic_matrix .shape [0 ],
69+ desc = "Building Annoy index of document-topic vectors" ,
70+ leave = False ,
71+ ):
72+ self .annoy_index .add_item (i , doc_vector [0 ].toarray ()[0 ])
73+ self .annoy_index .build (1000 , n_jobs = cpu_count () - 1 )
6574
def most_similar_topic_by_doc_distribution(self):
    """Return pairwise distances between topics, compared by their document distributions.

    ``self.document_topic_matrix`` is transposed so that each row corresponds
    to one topic and each column to one document; those rows are then handed
    to ``pairwise_distances``.

    NOTE(review): the distance metric is whatever ``pairwise_distances``
    uses by default -- confirm against the imported implementation.
    """
    # One row per topic after the transpose (the matrix is stored documents x topics).
    topic_document_matrix = self.document_topic_matrix.transpose()
    return pairwise_distances(topic_document_matrix)
@@ -187,13 +196,5 @@ def infer_topics(self, num_topics=10, **kwargs):
187196 topic_count += 1
188197 doc_count += 1
189198 document_topic_matrix = coo_matrix ((data , (row , col )), shape = (self .corpus .size , self .nb_topics )).tocsr ()
190- self .annoy_index = AnnoyIndex (document_topic_matrix .shape [1 ], "angular" )
191- for i , doc_vector in tqdm (
192- enumerate (document_topic_matrix ),
193- total = document_topic_matrix .shape [0 ],
194- desc = "Building Annoy index of document-topic vectors" ,
195- leave = False ,
196- ):
197- self .annoy_index .add_item (i , doc_vector [0 ].toarray ()[0 ])
198- self .annoy_index .build (1000 , n_jobs = cpu_count () - 1 )
199+ self .annoy_index = None
199200
0 commit comments