fix empty metadata

clovis · clovis · commit 058d8f421419 · 2020-08-20T14:32:01.000-05:00
diff --git a/config/topologic_config.ini b/config/topologic_config.ini
@@ -102,8 +102,8 @@ algorithm = nmf
 # Number of topics for model
 number_of_topics = 100
 
-# Maximum iteration for LDA model
-max_iter = 20
+# Maximum number of iterations for the model
+max_iter = 200
 
 
 [TOPICS_OVER_TIME]
diff --git a/lib/scripts/topologic b/lib/scripts/topologic
@@ -92,6 +92,7 @@ def main(args):
         inference_config,
         algorithm=model_config["algorithm"],
         number_of_topics=model_config["number_of_topics"],
+        max_iter=model_config["max_iter"],
         vectorization=vector_config["vectorization"],
         max_freq=vector_config["max_freq"],
         min_freq=vector_config["min_freq"],
@@ -179,8 +180,8 @@ def prepare_data(
             ) as output:  ## Set buffer to 64K to speed up writes and avoid build-up in RAM
                 output.write(" ".join(text))
             if (
-                db_name in inference_config
-                and db_config["text_object_level"] == inference_config[db_name]["text_object_level"]
+                db_name in inference_config["databases"]
+                and db_config["text_object_level"] == inference_config["databases"][db_name]["text_object_level"]
             ):  # if training collection and inference collection are the same, we won't process it again
                 text.metadata["philo_db"] = db_name
                 metadata[pos] = text.metadata
@@ -189,6 +190,7 @@ def prepare_data(
             pickle.dump(metadata, output_metadata)
 
     pos = 0
+    count = 0
     print("Processing inference data...", flush=True)
     for db_name, db_config in inference_config["databases"].items():
         count += 1
@@ -244,6 +246,7 @@ def build_model(
     inference_config,
     algorithm="lda",
     number_of_topics=100,
+    max_iter=None,
     vectorization="tf",
     max_freq=0.9,
     min_freq=0.1,
diff --git a/lib/topologic/topic_model.py b/lib/topologic/topic_model.py
@@ -19,12 +19,13 @@
 class TopicModel(object):
     __metaclass__ = ABCMeta
 
-    def __init__(self, corpus):
+    def __init__(self, corpus, max_iter=None):
         self.corpus = corpus  # a Corpus object
         self.document_topic_matrix = None  # document x topic matrix
         self.topic_word_matrix = None  # topic x word matrix
         self.nb_topics = None  # a scalar value > 1
         self.model = None
+        self.max_iter = max_iter
 
     @abstractmethod
     def infer_topics(self, num_topics=10, **kwargs):
@@ -111,7 +112,7 @@ def most_likely_topics_for_document(self, doc_id):
 
 
 class LatentDirichletAllocation(TopicModel):
-    def infer_topics(self, num_topics=10, algorithm="variational", **kwargs):
+    def infer_topics(self, num_topics=10, algorithm="variational", max_iter=None, **kwargs):
         self.nb_topics = num_topics
         lda_model = None
         topic_document = None
@@ -120,7 +121,7 @@ def infer_topics(self, num_topics=10, algorithm="variational", **kwargs):
             learning_method="batch",
             n_jobs=-1,
             random_state=0,
-            max_iter=20,
+            max_iter=max_iter or 20,
             doc_topic_prior=1.0 / num_topics,
             topic_word_prior=0.01 / num_topics,
         )
@@ -155,9 +156,9 @@ def infer_topics(self, num_topics=10, algorithm="variational", **kwargs):
 
 
 class NonNegativeMatrixFactorization(TopicModel):
-    def infer_topics(self, num_topics=10, **kwargs):
+    def infer_topics(self, num_topics=10, max_iter=None, **kwargs):
         self.nb_topics = num_topics
-        self.model = NMF(n_components=num_topics, init="nndsvd", solver="cd", random_state=0)
+        self.model = NMF(n_components=num_topics, init="nndsvd", solver="cd", max_iter=max_iter or 200, random_state=0)
         topic_document = self.model.fit_transform(self.corpus.sklearn_vector_space)
         self.topic_word_matrix = []
         self.document_topic_matrix = []