-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpyLDAvis_generator.py
More file actions
90 lines (73 loc) · 3.03 KB
/
pyLDAvis_generator.py
File metadata and controls
90 lines (73 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# pyLDAvis_generator.py
import os
import pickle
import numpy as np
import pyLDAvis
import pyLDAvis._prepare as prep
from gensim.models.ldamodel import LdaModel
# ------------------ Model Names and File Mapping ------------------
# Maps each tuned-LDA display name (used in the model filename and the
# output HTML filename) to the prefix used in the pickled-corpus filename.
model_map = {
    "20_newsgroups": "20",
    "mini_newsgroups": "mini",
    "merged": "merged"
}
# ------------------ Paths ------------------
# Resolve all paths relative to this script so it works from any CWD.
script_dir = os.path.dirname(os.path.abspath(__file__))
models_dir = os.path.join(script_dir, "models")    # inputs: saved models + pickled corpora
results_dir = os.path.join(script_dir, "results")  # outputs: generated HTML visualizations
os.makedirs(results_dir, exist_ok=True)  # ensure the output directory exists up front
# ------------------ Function ------------------
def generate_vis(display_name, file_prefix):
    """Load one tuned LDA model and its corpus, then save a pyLDAvis HTML report.

    Parameters
    ----------
    display_name : str
        Name embedded in the model filename (``tuned_lda_model_<name>.model``)
        and in the output HTML filename.
    file_prefix : str
        Prefix embedded in the pickled corpus filename (``corpus_<prefix>.pkl``).

    Any exception is caught and reported so one broken model does not stop
    the loop over the remaining models.
    """
    try:
        print(f"\n Processing model: {display_name}")
        # File paths
        lda_model_path = os.path.join(models_dir, f"tuned_lda_model_{display_name}.model")
        corpus_path = os.path.join(models_dir, f"corpus_{file_prefix}.pkl")
        # Load LDA model
        print(f"[INFO] Loading LDA model: {lda_model_path}")
        lda_model = LdaModel.load(lda_model_path)
        # Load corpus.  NOTE: pickle is only safe because these files are
        # produced by our own pipeline — never unpickle untrusted data.
        print(f"[INFO] Loading corpus: {corpus_path}")
        with open(corpus_path, "rb") as f:
            corpus = pickle.load(f)
        dictionary = lda_model.id2word
        vocab = [dictionary[i] for i in range(len(dictionary))]
        # Topic-Term Distributions: shape (num_topics, vocab_size)
        print("[INFO] Preparing topic-term distributions...")
        topic_term_dists = lda_model.get_topics()
        # Validate with an explicit raise, not `assert` — asserts are
        # stripped when Python runs with -O.
        if topic_term_dists.shape[1] != len(vocab):
            raise ValueError(
                f"Mismatch: topic_term_dists has {topic_term_dists.shape[1]} "
                f"terms, but vocab has {len(vocab)}"
            )
        # Document-Topic Distributions.
        # BUG FIX: gensim clamps `minimum_probability` to a small positive
        # epsilon internally, so get_document_topics() may omit near-zero
        # topics, producing ragged per-document lists (np.array then builds
        # an object array and pyLDAvis.prepare fails).  Fill a dense
        # (n_docs, n_topics) matrix indexed by topic id instead, which also
        # avoids relying on the ordering of the returned pairs.
        num_topics = lda_model.num_topics
        doc_topic_dists = np.zeros((len(corpus), num_topics))
        for row, doc in enumerate(corpus):
            for topic_id, prob in lda_model.get_document_topics(doc, minimum_probability=0.0):
                doc_topic_dists[row, topic_id] = prob
        # pyLDAvis requires each row to sum to 1.  Give empty documents a
        # uniform distribution, then re-normalize every row.
        row_sums = doc_topic_dists.sum(axis=1, keepdims=True)
        empty_rows = row_sums[:, 0] == 0.0
        if empty_rows.any():
            doc_topic_dists[empty_rows] = 1.0 / num_topics
            row_sums[empty_rows] = 1.0
        doc_topic_dists /= row_sums
        # Document Lengths: total token count of each bag-of-words document
        doc_lengths = [sum(cnt for _, cnt in doc) for doc in corpus]
        # Term Frequency: corpus-wide count for each vocabulary term
        term_frequency = np.zeros(len(vocab))
        for doc in corpus:
            for idx, count in doc:
                term_frequency[idx] += count
        term_frequency = term_frequency.astype(int)
        # Generate Visualization
        print(f"[INFO] Generating pyLDAvis HTML for {display_name}...")
        vis_data = pyLDAvis.prepare(
            topic_term_dists=topic_term_dists,
            doc_topic_dists=doc_topic_dists,
            doc_lengths=doc_lengths,
            vocab=vocab,
            term_frequency=term_frequency
        )
        output_path = os.path.join(results_dir, f"lda_topics_visual_{display_name}.html")
        pyLDAvis.save_html(vis_data, output_path)
        print(f" Saved: {output_path}")
    except Exception as e:
        # Best-effort: report the failure and let the caller move on to
        # the next model.
        print(f" Error for model '{display_name}': {e}")
# ------------------ Loop ------------------
if __name__ == "__main__":
    # Walk every configured (display_name, file_prefix) pair and render
    # its visualization; generate_vis handles its own failures.
    print(" Starting pyLDAvis generation for all tuned LDA models...")
    for entry in model_map.items():
        generate_vis(*entry)
    print("\n All visualizations generated (or attempted).")