Skip to content

Commit 91e2e21

Browse files
committed
fix bug in inference corpus
1 parent 1e0f187 commit 91e2e21

5 files changed

Lines changed: 22 additions & 13 deletions

File tree

api/topologic_explorer.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,10 @@ def read_json_config(path):
6969
@app.get("/{table_name}")
7070
@app.get("/{table_name}/topic/{topic_num}")
7171
@app.get("/{table_name}/document/{philo_db}/{doc}")
72+
@app.get("/{table_name}/document/{philo_db}/{doc}/{div1}")
73+
@app.get("/{table_name}/document/{philo_db}/{doc}/{div1}/{div2}")
74+
@app.get("/{table_name}/document/{philo_db}/{doc}/{div1}/{div2}/{div3}")
75+
@app.get("/{table_name}/document/{philo_db}/{doc}/{div1}/{div2}/{div3}/{para}")
7276
@app.get("/{table_name}/word/{word}")
7377
@app.get("/{table_name}/time")
7478
@app.get("/{table_name}/view/{field_name}")

api_server/web_server.sh

100644 → 100755
File mode changed.

lib/scripts/topologic

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -348,7 +348,7 @@ def build_model(
348348
ngram=training_corpus.ngram,
349349
)
350350

351-
print("inference corpus size:", training_corpus.size)
351+
print("inference corpus size:", full_corpus.size)
352352

353353
# Instantiate a topic model
354354
if algorithm == "nmf":

lib/topologic/corpus.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -110,15 +110,16 @@ def sample_corpus(self):
110110

111111
def build_annoy_index(self):
112112
print("Building Annoy index of document vectors...", flush=True)
113-
self.annoy_index = AnnoyIndex(self.sklearn_vector_space.shape[1], "angular")
113+
annoy_index = AnnoyIndex(self.sklearn_vector_space.shape[1], "angular")
114114
for i, doc_vector in tqdm(
115115
enumerate(self.sklearn_vector_space),
116116
total=self.sklearn_vector_space.shape[0],
117117
desc="Adding document vectors to Annoy index",
118118
leave=False,
119119
):
120-
self.annoy_index.add_item(i, doc_vector[0].toarray()[0])
121-
self.annoy_index.build(1000, n_jobs=cpu_count() - 1)
120+
annoy_index.add_item(i, doc_vector[0].toarray()[0])
121+
annoy_index.build(1000, n_jobs=cpu_count() - 1)
122+
annoy_index.save(os.path.join(self._source_files, "index.annoy"))
122123

123124
def docs_for_word(self, word_id):
124125
ids = []
@@ -143,6 +144,9 @@ def id_for_word(self, word_id):
143144
return -1
144145

145146
def similar_docs_by_vector(self, doc_id, num_docs):
147+
if self.annoy_index is None:
148+
self.annoy_index = AnnoyIndex(self.sklearn_vector_space.shape[1], "angular")
149+
self.annoy_index.load(os.path.join(self._source_files, "index.annoy"))
146150
docs, scores = self.annoy_index.get_nns_by_item(doc_id, num_docs + 1, include_distances=True)
147151
return [(doc, score) for doc, score in zip(docs, scores) if doc != doc_id]
148152

lib/topologic/topic_model.py

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,15 @@ def infer_and_replace(self, corpus):
6262
self.document_topic_matrix = coo_matrix((data, (row, col)), shape=(self.corpus.size, self.nb_topics)).tocsr()
6363
topic_frequencies = np.sum(self.document_topic_matrix.transpose(), axis=1)
6464
self.topic_frequencies = topic_frequencies / np.sum(topic_frequencies)
65+
self.annoy_index = AnnoyIndex(self.document_topic_matrix.shape[1], "angular")
66+
for i, doc_vector in tqdm(
67+
enumerate(self.document_topic_matrix),
68+
total=self.document_topic_matrix.shape[0],
69+
desc="Building Annoy index of document-topic vectors",
70+
leave=False,
71+
):
72+
self.annoy_index.add_item(i, doc_vector[0].toarray()[0])
73+
self.annoy_index.build(1000, n_jobs=cpu_count() - 1)
6574

6675
def most_similar_topic_by_doc_distribution(self):
6776
return pairwise_distances(self.document_topic_matrix.transpose())
@@ -187,13 +196,5 @@ def infer_topics(self, num_topics=10, **kwargs):
187196
topic_count += 1
188197
doc_count += 1
189198
document_topic_matrix = coo_matrix((data, (row, col)), shape=(self.corpus.size, self.nb_topics)).tocsr()
190-
self.annoy_index = AnnoyIndex(document_topic_matrix.shape[1], "angular")
191-
for i, doc_vector in tqdm(
192-
enumerate(document_topic_matrix),
193-
total=document_topic_matrix.shape[0],
194-
desc="Building Annoy index of document-topic vectors",
195-
leave=False,
196-
):
197-
self.annoy_index.add_item(i, doc_vector[0].toarray()[0])
198-
self.annoy_index.build(1000, n_jobs=cpu_count() - 1)
199+
self.annoy_index = None
199200

0 commit comments

Comments
 (0)