Fix inference corpus size report (use full_corpus); save corpus Annoy index to disk and load it on demand

clovis · clovis · commit 91e2e2109b4a · 2020-11-03T16:06:45.000-06:00
diff --git a/api/topologic_explorer.py b/api/topologic_explorer.py
@@ -69,6 +69,10 @@ def read_json_config(path):
 @app.get("/{table_name}")
 @app.get("/{table_name}/topic/{topic_num}")
 @app.get("/{table_name}/document/{philo_db}/{doc}")
+@app.get("/{table_name}/document/{philo_db}/{doc}/{div1}")
+@app.get("/{table_name}/document/{philo_db}/{doc}/{div1}/{div2}")
+@app.get("/{table_name}/document/{philo_db}/{doc}/{div1}/{div2}/{div3}")
+@app.get("/{table_name}/document/{philo_db}/{doc}/{div1}/{div2}/{div3}/{para}")
 @app.get("/{table_name}/word/{word}")
 @app.get("/{table_name}/time")
 @app.get("/{table_name}/view/{field_name}")
diff --git a/api_server/web_server.sh b/api_server/web_server.sh
diff --git a/lib/scripts/topologic b/lib/scripts/topologic
@@ -348,7 +348,7 @@ def build_model(
             ngram=training_corpus.ngram,
         )
 
-    print("inference corpus size:", training_corpus.size)
+    print("inference corpus size:", full_corpus.size)
 
     # Instantiate a topic model
     if algorithm == "nmf":
diff --git a/lib/topologic/corpus.py b/lib/topologic/corpus.py
@@ -110,15 +110,16 @@ def sample_corpus(self):
 
     def build_annoy_index(self):
         print("Building Annoy index of document vectors...", flush=True)
-        self.annoy_index = AnnoyIndex(self.sklearn_vector_space.shape[1], "angular")
+        annoy_index = AnnoyIndex(self.sklearn_vector_space.shape[1], "angular")
         for i, doc_vector in tqdm(
             enumerate(self.sklearn_vector_space),
             total=self.sklearn_vector_space.shape[0],
             desc="Adding document vectors to Annoy index",
             leave=False,
         ):
-            self.annoy_index.add_item(i, doc_vector[0].toarray()[0])
-        self.annoy_index.build(1000, n_jobs=cpu_count() - 1)
+            annoy_index.add_item(i, doc_vector[0].toarray()[0])
+        annoy_index.build(1000, n_jobs=cpu_count() - 1)
+        annoy_index.save(os.path.join(self._source_files, "index.annoy"))
 
     def docs_for_word(self, word_id):
         ids = []
@@ -143,6 +144,9 @@ def id_for_word(self, word_id):
             return -1
 
     def similar_docs_by_vector(self, doc_id, num_docs):
+        if self.annoy_index is None:
+            self.annoy_index = AnnoyIndex(self.sklearn_vector_space.shape[1], "angular")
+            self.annoy_index.load(os.path.join(self._source_files, "index.annoy"))
         docs, scores = self.annoy_index.get_nns_by_item(doc_id, num_docs + 1, include_distances=True)
         return [(doc, score) for doc, score in zip(docs, scores) if doc != doc_id]
 
diff --git a/lib/topologic/topic_model.py b/lib/topologic/topic_model.py
@@ -62,6 +62,15 @@ def infer_and_replace(self, corpus):
         self.document_topic_matrix = coo_matrix((data, (row, col)), shape=(self.corpus.size, self.nb_topics)).tocsr()
         topic_frequencies = np.sum(self.document_topic_matrix.transpose(), axis=1)
         self.topic_frequencies = topic_frequencies / np.sum(topic_frequencies)
+        self.annoy_index = AnnoyIndex(self.document_topic_matrix.shape[1], "angular")
+        for i, doc_vector in tqdm(
+            enumerate(self.document_topic_matrix),
+            total=self.document_topic_matrix.shape[0],
+            desc="Building Annoy index of document-topic vectors",
+            leave=False,
+        ):
+            self.annoy_index.add_item(i, doc_vector[0].toarray()[0])
+        self.annoy_index.build(1000, n_jobs=cpu_count() - 1)
 
     def most_similar_topic_by_doc_distribution(self):
         return pairwise_distances(self.document_topic_matrix.transpose())
@@ -187,13 +196,5 @@ def infer_topics(self, num_topics=10, **kwargs):
                 topic_count += 1
             doc_count += 1
         document_topic_matrix = coo_matrix((data, (row, col)), shape=(self.corpus.size, self.nb_topics)).tocsr()
-        self.annoy_index = AnnoyIndex(document_topic_matrix.shape[1], "angular")
-        for i, doc_vector in tqdm(
-            enumerate(document_topic_matrix),
-            total=document_topic_matrix.shape[0],
-            desc="Building Annoy index of document-topic vectors",
-            leave=False,
-        ):
-            self.annoy_index.add_item(i, doc_vector[0].toarray()[0])
-        self.annoy_index.build(1000, n_jobs=cpu_count() - 1)
+        self.annoy_index = None
 

Original file line number	Diff line number	Diff line change
`@@ -348,7 +348,7 @@ def build_model(`
`348`	`348`	`ngram=training_corpus.ngram,`
`349`	`349`	`)`
`350`	`350`
`351`		`- print("inference corpus size:", training_corpus.size)`
	`351`	`+ print("inference corpus size:", full_corpus.size)`
`352`	`352`
`353`	`353`	`# Instantiate a topic model`
`354`	`354`	`if algorithm == "nmf":`