Skip to content

Commit c12c34e

Browse files
committed
keep annoy index in RAM
1 parent 91e2e21 commit c12c34e

2 files changed

Lines changed: 8 additions & 22 deletions

File tree

lib/topologic/corpus.py

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,14 @@
22

33
import itertools
44
import os
5+
import pickle
56
import random
6-
import json
77
from math import floor
8-
import pickle
98

10-
import numpy as np
11-
from dill import dump, load
12-
from scipy import spatial
13-
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
14-
from sklearn.metrics import pairwise_distances
15-
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
16-
from scipy.spatial.distance import cdist
179
from annoy import AnnoyIndex
18-
from tqdm import tqdm
1910
from multiprocess import cpu_count
11+
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
12+
from tqdm import tqdm
2013

2114

2215
class savedTexts:
@@ -110,16 +103,15 @@ def sample_corpus(self):
110103

111104
def build_annoy_index(self):
112105
print("Building Annoy index of document vectors...", flush=True)
113-
annoy_index = AnnoyIndex(self.sklearn_vector_space.shape[1], "angular")
106+
self.annoy_index = AnnoyIndex(self.sklearn_vector_space.shape[1], "angular")
114107
for i, doc_vector in tqdm(
115108
enumerate(self.sklearn_vector_space),
116109
total=self.sklearn_vector_space.shape[0],
117110
desc="Adding document vectors to Annoy index",
118111
leave=False,
119112
):
120-
annoy_index.add_item(i, doc_vector[0].toarray()[0])
121-
annoy_index.build(1000, n_jobs=cpu_count() - 1)
122-
annoy_index.save(os.path.join(self._source_files, "index.annoy"))
113+
self.annoy_index.add_item(i, doc_vector[0].toarray()[0])
114+
self.annoy_index.build(1000, n_jobs=cpu_count() - 1)
123115

124116
def docs_for_word(self, word_id):
125117
ids = []
@@ -144,13 +136,9 @@ def id_for_word(self, word_id):
144136
return -1
145137

146138
def similar_docs_by_vector(self, doc_id, num_docs):
147-
if self.annoy_index is None:
148-
self.annoy_index = AnnoyIndex(self.sklearn_vector_space.shape[1], "angular")
149-
self.annoy_index.load(os.path.join(self._source_files, "index.annoy"))
150139
docs, scores = self.annoy_index.get_nns_by_item(doc_id, num_docs + 1, include_distances=True)
151140
return [(doc, score) for doc, score in zip(docs, scores) if doc != doc_id]
152141

153142
def similar_docs_by_topic_distribution(self, doc_id, num_docs, topic_model):
154143
docs, scores = topic_model.annoy_index.get_nns_by_item(doc_id, num_docs + 1, include_distances=True)
155144
return [(doc, score) for doc, score in zip(docs, scores) if doc != doc_id]
156-

lib/topologic/topic_model.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,16 @@
11
#!/usr/bin/env python3
22

33
import itertools
4-
import os
54
from abc import ABCMeta, abstractmethod
65

76
import numpy as np
7+
from annoy import AnnoyIndex
8+
from multiprocess import cpu_count
89
from scipy.sparse import coo_matrix
910
from sklearn.decomposition import NMF
1011
from sklearn.decomposition import LatentDirichletAllocation as LDA
1112
from sklearn.metrics import pairwise_distances
12-
from annoy import AnnoyIndex
1313
from tqdm import tqdm
14-
from multiprocess import cpu_count
1514

1615

1716
class TopicModel(object):
@@ -197,4 +196,3 @@ def infer_topics(self, num_topics=10, **kwargs):
197196
doc_count += 1
198197
document_topic_matrix = coo_matrix((data, (row, col)), shape=(self.corpus.size, self.nb_topics)).tocsr()
199198
self.annoy_index = None
200-

0 commit comments

Comments (0)