Skip to content

Commit 058d8f4

Browse files
committed
fix empty metadata
1 parent 1ed0f11 commit 058d8f4

3 files changed

Lines changed: 13 additions & 9 deletions

File tree

config/topologic_config.ini

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,8 @@ algorithm = nmf
102102
# Number of topics for model
103103
number_of_topics = 100
104104

105-
# Maximum iteration for LDA model
106-
max_iter = 20
105+
# Maximum number of iterations for the model
106+
max_iter = 200
107107

108108

109109
[TOPICS_OVER_TIME]

lib/scripts/topologic

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ def main(args):
9292
inference_config,
9393
algorithm=model_config["algorithm"],
9494
number_of_topics=model_config["number_of_topics"],
95+
max_iter=model_config["max_iter"],
9596
vectorization=vector_config["vectorization"],
9697
max_freq=vector_config["max_freq"],
9798
min_freq=vector_config["min_freq"],
@@ -179,8 +180,8 @@ def prepare_data(
179180
) as output: ## Set buffer to 64K to speed up writes and avoid build-up in RAM
180181
output.write(" ".join(text))
181182
if (
182-
db_name in inference_config
183-
and db_config["text_object_level"] == inference_config[db_name]["text_object_level"]
183+
db_name in inference_config["databases"]
184+
and db_config["text_object_level"] == inference_config["databases"][db_name]["text_object_level"]
184185
): # if training collection and inference collection are the same, we won't process it again
185186
text.metadata["philo_db"] = db_name
186187
metadata[pos] = text.metadata
@@ -189,6 +190,7 @@ def prepare_data(
189190
pickle.dump(metadata, output_metadata)
190191

191192
pos = 0
193+
count = 0
192194
print("Processing inference data...", flush=True)
193195
for db_name, db_config in inference_config["databases"].items():
194196
count += 1
@@ -244,6 +246,7 @@ def build_model(
244246
inference_config,
245247
algorithm="lda",
246248
number_of_topics=100,
249+
max_iter=None,
247250
vectorization="tf",
248251
max_freq=0.9,
249252
min_freq=0.1,

lib/topologic/topic_model.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,13 @@
1919
class TopicModel(object):
2020
__metaclass__ = ABCMeta
2121

22-
def __init__(self, corpus):
22+
def __init__(self, corpus, max_iter=None):
2323
self.corpus = corpus # a Corpus object
2424
self.document_topic_matrix = None # document x topic matrix
2525
self.topic_word_matrix = None # topic x word matrix
2626
self.nb_topics = None # a scalar value > 1
2727
self.model = None
28+
self.max_iter = max_iter
2829

2930
@abstractmethod
3031
def infer_topics(self, num_topics=10, **kwargs):
@@ -111,7 +112,7 @@ def most_likely_topics_for_document(self, doc_id):
111112

112113

113114
class LatentDirichletAllocation(TopicModel):
114-
def infer_topics(self, num_topics=10, algorithm="variational", **kwargs):
115+
def infer_topics(self, num_topics=10, algorithm="variational", max_iter=None, **kwargs):
115116
self.nb_topics = num_topics
116117
lda_model = None
117118
topic_document = None
@@ -120,7 +121,7 @@ def infer_topics(self, num_topics=10, algorithm="variational", **kwargs):
120121
learning_method="batch",
121122
n_jobs=-1,
122123
random_state=0,
123-
max_iter=20,
124+
max_iter=max_iter or 20,
124125
doc_topic_prior=1.0 / num_topics,
125126
topic_word_prior=0.01 / num_topics,
126127
)
@@ -155,9 +156,9 @@ def infer_topics(self, num_topics=10, algorithm="variational", **kwargs):
155156

156157

157158
class NonNegativeMatrixFactorization(TopicModel):
158-
def infer_topics(self, num_topics=10, **kwargs):
159+
def infer_topics(self, num_topics=10, max_iter=None, **kwargs):
159160
self.nb_topics = num_topics
160-
self.model = NMF(n_components=num_topics, init="nndsvd", solver="cd", random_state=0)
161+
self.model = NMF(n_components=num_topics, init="nndsvd", solver="cd", max_iter=max_iter or 200, random_state=0)
161162
topic_document = self.model.fit_transform(self.corpus.sklearn_vector_space)
162163
self.topic_word_matrix = []
163164
self.document_topic_matrix = []

0 commit comments

Comments
 (0)