diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 9986da64..8a353f5f 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8, 3.9, "3.10"] + python-version: [3.9, "3.10"] steps: - uses: actions/checkout@v2 diff --git a/README.rst b/README.rst index 3d49a267..375f6c68 100644 --- a/README.rst +++ b/README.rst @@ -244,6 +244,7 @@ If the dataset is partitioned, you can: Available Models ================= + +-------------------------------------------+-----------------------------------------------------------+ | Name | Implementation | +===========================================+===========================================================+ @@ -263,7 +264,10 @@ Available Models +-------------------------------------------+-----------------------------------------------------------+ | ProdLda `(Srivastava and Sutton 2017)`_ | https://github.com/estebandito22/PyTorchAVITM | +-------------------------------------------+-----------------------------------------------------------+ - +| RSM `(Hinton and Salakhutdinov 2009)`_ | https://github.com/Fede-Rausa/ReplicatedSoftmax | ++-------------------------------------------+-----------------------------------------------------------+ +| over-RSM `(Hinton et al. 2013)`_ | https://github.com/Fede-Rausa/ReplicatedSoftmax | ++-------------------------------------------+-----------------------------------------------------------+ .. _(Bianchi et al. 2021): https://www.aclweb.org/anthology/2021.eacl-main.143/ .. _(Dieng et al. 2020): https://www.aclweb.org/anthology/2020.tacl-1.29 @@ -272,6 +276,8 @@ Available Models .. _(Landauer et al. 1998): http://lsa.colorado.edu/papers/dp1.LSAintro.pdf .. _(Lee and Seung 2000): https://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization .. _(Srivastava and Sutton 2017): https://arxiv.org/abs/1703.01488 +.. 
_(Hinton and Salakhutdinov 2009): https://proceedings.neurips.cc/paper_files/paper/2009/file/31839b036f63806cba3f47b93af8ccb5-Paper.pdf +.. _(Hinton et al. 2013): https://arxiv.org/pdf/1309.6865 If you use one of these implementations, make sure to cite the right paper. diff --git a/octis/configuration/citations.py b/octis/configuration/citations.py index ee93027a..df97d524 100644 --- a/octis/configuration/citations.py +++ b/octis/configuration/citations.py @@ -166,6 +166,30 @@ } """ + +models_RSM = r""" +@article{hinton2009replicated, + title={Replicated softmax: an undirected topic model}, + author={Hinton, Geoffrey E and Salakhutdinov, Russ R}, + journal={Advances in neural information processing systems}, + volume={22}, + year={2009} +} + """ + + +models_oRSM = r""" +@article{srivastava2013modeling, + title={Modeling documents with deep boltzmann machines}, + author={Srivastava, Nitish and Salakhutdinov, Ruslan R and Hinton, Geoffrey E}, + journal={arXiv preprint arXiv:1309.6865}, + year={2013} +} + """ + + + + sources_dblp_M10 = r"""@inproceedings{DBLP:conf/ijcai/PanWZZW16, author = {Shirui Pan and Jia Wu and diff --git a/octis/configuration/defaults.py b/octis/configuration/defaults.py index a0e27005..1b18ce83 100644 --- a/octis/configuration/defaults.py +++ b/octis/configuration/defaults.py @@ -25,7 +25,18 @@ 'NMF': {'name': 'Non-negative Matrix Factorization', 'citation': 'Daniel D. Lee & H. Sebastian Seung (2001). Algorithms for Non-negative Matrix ' 'Factorization. Advances in Neural Information Processing Systems 13: ' - 'Proceedings of the 2000 Conference. MIT Press. pp. 556–562.'}} + 'Proceedings of the 2000 Conference. MIT Press. pp. 556–562.'}, + 'RSM' : {'name': 'Replicated Softmax Model', + 'citation': 'Ruslan R. Salakhutdinov, Geoffrey E. Hinton.' + 'Replicated Softmax: An Undirected Topic Model.' 
+ 'Advances in neural information processing systems, 22 (2009).'}, + + 'oRSM' : {'name': 'Over Replicated Softmax Model', + 'citation': 'Srivastava, N., Salakhutdinov, R. R., & Hinton, G. E. (2013).' + 'Modeling documents with deep boltzmann machines.' + 'arXiv preprint arXiv:1309.6865.' + } + } model_hyperparameters = { 'LDA': { @@ -451,3 +462,111 @@ l1_ratio (double, optional) – The regularization mixing parameter, with 0 <= l1_ratio <= 1. For l1_ratio = 0 the penalty is an elementwise L2 penalty (aka Frobenius Norm). For l1_ratio = 1 it is an elementwise L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. """ + + + +RSM_hyperparameters_info = """ +num_topics (int, default=50) – Number of latent topics (hidden units) in the Replicated Softmax Model. + +epochs (int, default=5) – Number of training epochs (full passes over the dataset). + +btsz (int, default=100) – Mini-batch size used during training. + +lr (float, default=0.01) – Learning rate for parameter updates. + +momentum (float, default=0.1) – Momentum coefficient (used when train_optimizer='momentum'). + +K (int, default=1) – Number of Gibbs sampling steps for k-step Contrastive Divergence (K-CD). + +softstart (float, default=0.001) – Scale for random initialization of weights (weights ~ N(0,1)*softstart). + +decay (float, default=0) – Regularization coefficient. If >0, interaction penalty is applied (L1 or L2). + +penalty_L1 (bool, default=False) – If True use L1 regularization; otherwise L2 is used. + +penalty_local (bool, default=False) – If True apply penalty locally per-weight; otherwise apply a global penalty. + +epochs_per_monitor (int, default=1) – Frequency (in epochs) to record monitoring metrics when monitor=True. + +monitor (bool, default=False) – If True compute and store log-likelihood / perplexity during training. + +persistent_cd (bool, default=False) – If True use persistent contrastive divergence (PCD) chains. 
+ +mean_field_cd (bool, default=True) – If True use mean-field contrastive divergence (mfcd) updates. + +increase_cd (bool, default=False) – If True use gradual k-step CD (k increases across epochs). + +increase_speed (float, default=0) – Controls speed of gradual increase of k when increase_cd is True. + +cd_type (str, default='mfcd') – Type of contrastive-divergence algorithm. Common values: 'mfcd' (mean-field CD), 'kcd' (k-step CD), 'pcd' or 'persistent' (persistent CD), 'gradkcd' (gradual kcd). + +train_optimizer (str, default='sgd') – Optimizer used for parameter updates. Options include: 'sgd', 'momentum', 'adagrad', 'rmsprop', 'adam', 'full' (full-batch), 'minibatch'. + +logdtm (bool, default=False) – If True apply log(1+count) transform to the document-term matrix before training. + +val_dtm (array or None, default=None) – Validation document-term matrix (used when training with partitions). + +random_state (int or None, default=None) – Seed for numpy RNG for reproducible runs. + +rms_decay (float, default=0.9) – RMSProp moving-average decay (used if train_optimizer='rmsprop'). + +adam_decay1 (float, default=0.9) – Adam first-moment decay (beta1). + +adam_decay2 (float, default=0.999) – Adam second-moment decay (beta2). +""" + + + +oRSM_hyperparameters_info = """ +num_topics (int, default=50) – Number of latent topics (hidden units) in the Over Replicated Softmax Model. + +epochs (int, default=5) – Number of training epochs (full passes over the dataset). + +pretrain_epochs (int, default=1) – Number of initial epochs that run the pretraining (mean-field) phase. + +btsz (int, default=100) – Mini-batch size used during training. + +M (int, default=30) – Number of hidden multinomial units in the additional replicated softmax layer (over-replication factor). + +lr (float, default=0.01) – Learning rate for parameter updates. + +momentum (float, default=0.1) – Momentum coefficient (used when train_optimizer='momentum'). 
+ +softstart (float, default=0.001) – Scale for random initialization of weights (weights ~ N(0,1)*softstart). + +decay (float, default=0) – Regularization coefficient. If >0, interaction penalty is applied (L1 or L2). + +penalty_L1 (bool, default=False) – If True use L1 regularization; otherwise L2 is used. + +penalty_local (bool, default=False) – If True apply penalty locally per-weight; otherwise apply a global penalty. + +cd_type (str, default='mfcd') – Type of contrastive-divergence algorithm (common values: 'mfcd' mean-field CD, 'kcd' k-step CD, 'pcd' persistent CD). + +train_optimizer (str, default='sgd') – Optimizer used for parameter updates. Options include: 'sgd', 'momentum', 'adagrad', 'rmsprop', 'adam'. + +rms_decay (float, default=0.9) – RMSProp moving-average decay (used if train_optimizer='rmsprop'). + +adam_decay1 (float, default=0.9) – Adam first-moment decay (beta1). + +adam_decay2 (float, default=0.999) – Adam second-moment decay (beta2). + +logdtm (bool, default=False) – If True apply log(1+count) transform to the document-term matrix before training. + +val_dtm (array or None, default=None) – Validation document-term matrix (used when training with partitions). + +epochs_per_monitor (int, default=1) – Frequency (in epochs) to record monitoring metrics when monitor=True. + +monitor (bool, default=False) – If True compute and store monitoring metrics (e.g., perplexity) during training. + +random_state (int or None, default=None) – Seed for numpy RNG for reproducible runs. + +use_partitions (bool, default=True) – Whether the dataset partitions (train/test) are used (class attribute). + +softstart (float, default=0.001) – Initial scale for weight initialization. + +epsilon (float, default=0.01) – Convergence threshold used by mean-field updates (internal training parameter). + +Notes: +- The model accepts a document-term matrix (dtm) as training input; many hyperparameters (e.g., M, btsz, lr, optimizer) influence training dynamics and convergence. 
def __init__(
    self,
    num_topics=50,
    epochs=5,
    btsz=100,
    lr=0.01,
    momentum=0.9,
    K=1,
    softstart=0.001,
    decay=0,
    penalty_L1=False,
    penalty_local=False,
    monitor_ppl=False,
    monitor_time=False,
    monitor_loglik=False,
    increase_speed=1,
    rms_decay=0.9,
    adam_decay1=0.9,
    adam_decay2=0.999,
    cd_type="mfcd",
    train_optimizer="sgd",
    verbose=False,
    logdtm=False,
    random_state=None,
):
    """Configure a Replicated Softmax Model wrapper for OCTIS.

    Parameters
    ----------
    num_topics : number of topics (hidden units)
    epochs : number of training epochs
    btsz : mini-batch size
    lr : learning rate
    momentum : momentum coefficient (only used when train_optimizer='momentum')
    rms_decay : moving-average decay for RMSProp (train_optimizer='rmsprop')
    adam_decay1, adam_decay2 : Adam beta1/beta2 (train_optimizer='adam')
    K : number of Gibbs steps for k-step contrastive divergence
    decay : penalization coefficient; 0 disables penalization
    penalty_L1 : L1 penalty when True, otherwise L2
    penalty_local : per-weight penalty when True, otherwise global
    softstart : weight-init scale (weights drawn from N(0, 1) * softstart)
    logdtm : apply log(1 + count) to the document-term matrix when True
    cd_type : 'kcd', 'pcd', 'mfcd' (default) or 'gradcd' — the contrastive
        divergence variant used during training
    train_optimizer : 'full', 'sgd', 'minibatch', 'momentum', 'rmsprop',
        'adam' or 'adagrad'
    monitor_ppl / monitor_time / monitor_loglik : record the corresponding
        diagnostic once per epoch when True
    increase_speed : growth rate of k for 'gradcd'
    verbose : print progress messages when True
    random_state : numpy RNG seed for reproducibility
    """
    super().__init__()
    # Everything is stored in one dict so train_model() can forward the
    # settings verbatim via self.trained_model.train(**self.hyperparameters).
    self.hyperparameters = {
        "num_topics": num_topics,
        "btsz": btsz,
        "lr": lr,
        "momentum": momentum,
        "K": K,
        "softstart": softstart,
        "epochs": epochs,
        "increase_speed": increase_speed,
        "monitor_time": monitor_time,
        "monitor_ppl": monitor_ppl,
        "monitor_loglik": monitor_loglik,
        "penalty_L1": penalty_L1,
        "penalty_local": penalty_local,
        "decay": decay,
        "random_state": random_state,
        "cd_type": cd_type,
        "logdtm": logdtm,
        "val_dtm": None,
        "train_optimizer": train_optimizer,
        "rms_decay": rms_decay,
        "adam_decay1": adam_decay1,
        "adam_decay2": adam_decay2,
        "verbose": verbose,
    }
def info(self):
    """Return the model name and its citation entry."""
    return {
        "citation": citations.models_RSM,
        "name": "RSM, Replicated Softmax Model",
    }

def hyperparameters_info(self):
    """Return the human-readable description of all hyperparameters."""
    return defaults.RSM_hyperparameters_info

def train_model(self, dataset, hyperparams=None, top_words=10):
    """Train the model and return its output.

    Parameters
    ----------
    dataset : dataset used to build the model
    hyperparams : dict of hyperparameter overrides (optional)
    top_words : if greater than 0, include the `top_words` most
        significant words per topic in the output

    Returns
    -------
    dict with 'topics', 'topic-word-matrix', 'topic-document-matrix'
    and 'test-topic-document-matrix'.
    """
    self.initialize_model_structure(hyperparams=hyperparams, dataset=dataset)
    self.trained_model.train(**self.hyperparameters)
    return self.get_model_output(top_words)

def get_model_output(self, top_words=10):
    """Assemble the OCTIS output dict from the trained model."""
    result = {}
    result["topic-word-matrix"] = self.trained_model._get_topic_word_matrix()
    if top_words > 0:
        result["topics"] = self.trained_model._get_topics(top_words)
    result["topic-document-matrix"] = self.trained_model._get_topic_doc(
        self.train_dtm
    )
    if self.use_partitions:
        result["test-topic-document-matrix"] = self.trained_model._get_topic_doc(
            self.test_dtm
        )
    else:
        # no held-out split: reuse the train matrix so the key is always present
        result["test-topic-document-matrix"] = result["topic-document-matrix"]
    return result

def initialize_model_structure(self, hyperparams, dataset):
    """Build vocabulary and document-term matrices, then instantiate the
    underlying RSM trainer. Mutates self.hyperparameters with `hyperparams`
    plus the freshly built 'dtm'/'val_dtm' entries."""
    if hyperparams is None:
        hyperparams = {}

    if self.use_partitions:
        train_corpus, test_corpus = dataset.get_partitioned_corpus(
            use_validation=False
        )
    else:
        train_corpus = dataset.get_corpus()

    # vocabulary is always built on the FULL corpus so train/test share ids
    if self.id2word is None:
        self.id2word = self.get_vocab(dataset.get_corpus())

    if self.use_partitions:
        if self.hyperparameters["verbose"]:
            print("Building train DTM...")
        self.train_dtm = self.build_dtm(train_corpus, self.id2word)
        if self.hyperparameters["verbose"]:
            print("Building test DTM...")
        self.test_dtm = self.build_dtm(test_corpus, self.id2word)
        hyperparams["dtm"] = self.train_dtm
        hyperparams["val_dtm"] = self.test_dtm
    else:
        if self.hyperparameters["verbose"]:
            print("Building DTM...")
        self.train_dtm = self.build_dtm(train_corpus, self.id2word)
        hyperparams["dtm"] = self.train_dtm
        hyperparams["val_dtm"] = None

    if "num_topics" not in hyperparams:
        hyperparams["num_topics"] = self.hyperparameters["num_topics"]

    self.hyperparameters.update(hyperparams)

    self.trained_model = self.RSM_model()
    self.trained_model.id2word = self.id2word

############### preprocessing functions

def get_vocab(self, tokenized_corpus):
    """Build a gensim Dictionary from a tokenized corpus."""
    id2word = corpora.Dictionary(tokenized_corpus)
    return id2word

def build_dtm(self, tokenized_corpus, id2word=None):
    """Convert a tokenized corpus to a document-term matrix.

    Parameters
    ----------
    tokenized_corpus : list of token lists
    id2word : gensim Dictionary; built from the corpus when None

    Returns
    -------
    numpy array of shape (n_docs, vocab_size); log(1 + count) transformed
    when the 'logdtm' hyperparameter is set.
    """
    if id2word is None:
        id2word = corpora.Dictionary(tokenized_corpus)
    id_corpus = [id2word.doc2bow(document) for document in tokenized_corpus]
    vocab = id2word.token2id
    N = len(id_corpus)
    DTM = np.zeros((N, len(vocab)))
    for i in tqdm(range(N)):
        doc = id_corpus[i]
        for term_id, count in doc:
            DTM[i, term_id] = count

    if self.hyperparameters["logdtm"]:
        DTM = np.log(1 + DTM)
    return DTM
+ """ + if id2word is None: + id2word = corpora.Dictionary(tokenized_corpus) + else: + id2word = id2word + id_corpus = [id2word.doc2bow(document) for document in tokenized_corpus] + vocab = id2word.token2id + N = len(id_corpus) + DTM = np.zeros((N, len(vocab))) + for i in tqdm(range(N)): + doc = id_corpus[i] + for id, count in doc: + DTM[i, id] = count + + if self.hyperparameters["logdtm"]: + DTM = np.log(1 + DTM) + return DTM + + ############################################################## RSM original class + + class RSM_model(Replicated_Softmax): + def __init__(self): + super().__init__() + + ############################## energy and probability + + def neg_energy(self, v, h): + w_vh, w_v, w_h = self.W + D = v.sum(axis=1) + t1 = v @ w_v + t2 = D * (h @ w_h) + t3 = (v @ w_vh @ h.T).sum(axis=1) + en = t1 + t2 + t3 + return en + + def visible2hidden_vec(self, v): + w_vh, w_v, w_h = self.W + D = v.sum() + energy = D * w_h + np.dot(v, w_vh) + return self.sigmoid(energy) + + def visible2hidden(self, v): + w_vh, w_v, w_h = self.W + D = np.tile(v.sum(axis=1), (w_h.shape[0], 1)).T + energy = D * w_h + np.dot(v, w_vh) + return self.sigmoid(energy) + + def hidden2visible_vec(self, h): + w_vh, w_v, w_h = self.W + energy = w_v + np.dot(w_vh, h) + return self.softmax_vec(energy) + + def hidden2visible(self, h): + w_vh, w_v, w_h = self.W + energy = np.tile(w_v, (h.shape[0], 1)).T + np.dot(w_vh, h.T) + return self.softmax(energy.T) + + ##################################### leapfrog trainsition operators + + def gibbs_transition(self, v): + D = v.sum(axis=1) + hidden_probs = self.visible2hidden(v) + hidden_sample = self.unif_reject_sample(hidden_probs) + visible_probs = self.hidden2visible(hidden_sample) + visible_sample = np.empty(v.shape) + for i in range(v.shape[0]): + visible_sample[i] = self.multinomial_sample(visible_probs[i], D[i]) + return visible_sample + + def MH_transition(self, state, logpdf): + new = self.gibbs_transition(state) + old_logpdf = logpdf(state) + 
#### Gibbs/MH steps for single document vectors (used by AIS perplexity estimates)

def gibbs_transition_vec(self, v):
    """One Gibbs step v -> h -> v' for a single document vector."""
    D = v.sum()
    hidden_probs = self.visible2hidden_vec(v)
    hidden_sample = self.unif_reject_sample(hidden_probs)
    visible_probs = self.hidden2visible_vec(hidden_sample)
    visible_sample = self.multinomial_sample(visible_probs, D)
    return visible_sample

def MH_transition_vec(self, state, logpdf):
    """Metropolis-Hastings step for a single document vector."""
    new = self.gibbs_transition_vec(state)
    old_logpdf = logpdf(state)
    new_logpdf = logpdf(new)

    accept_ratio = min(1, np.exp(new_logpdf - old_logpdf))

    # Accept or reject
    if np.random.random() < accept_ratio:
        return new
    else:
        return state

################################## gradient descent optimization

def gradient_simple(self, v1, v2, h1, h2):
    """Plain SGD update from the CD statistics (data vs model phase).

    v1/h1 are the data-phase visible/hidden values, v2/h2 the model phase.
    Mutates self.W in place; sets self.stop when the weights go NaN.
    """
    w_vh, w_v, w_h = self.W
    lr = self.lr

    vel_vh = np.dot(v1.T, h1) - np.dot(v2.T, h2)
    vel_vh = self.interaction_penalty(vel_vh, w_vh)
    vel_v = v1.sum(axis=0) - v2.sum(axis=0)
    vel_h = h1.sum(axis=0) - h2.sum(axis=0)

    w_vh += vel_vh * lr
    w_v += vel_v * lr
    w_h += vel_h * lr

    if any(np.any(np.isnan(w)) for w in (w_vh, w_v, w_h)):
        self.stop = True
        warnings.warn("NaN values found in weights: stopping training")
    else:
        self.W = w_vh, w_v, w_h

def gradient_momentum(self, v1, v2, h1, h2):
    """CD update with classical momentum (EMA of the gradient).

    The velocity cache (self.train_cache) is always refreshed, even when
    training is stopped because of NaNs.
    """
    w_vh, w_v, w_h = self.W
    vel_vh, vel_v, vel_h = self.train_cache
    m = self.momentum
    lr = self.lr

    vel_vh = vel_vh * m + (np.dot(v1.T, h1) - np.dot(v2.T, h2)) * (1 - m)
    vel_vh = self.interaction_penalty(vel_vh, w_vh)
    vel_v = vel_v * m + (v1.sum(axis=0) - v2.sum(axis=0)) * (1 - m)
    vel_h = vel_h * m + (h1.sum(axis=0) - h2.sum(axis=0)) * (1 - m)

    w_vh += vel_vh * lr
    w_v += vel_v * lr
    w_h += vel_h * lr

    if any(np.any(np.isnan(w)) for w in (w_vh, w_v, w_h)):
        self.stop = True
        warnings.warn("NaN values found in weights: stopping training")
    else:
        self.W = w_vh, w_v, w_h

    self.train_cache = vel_vh, vel_v, vel_h
def gradient_adagrad(self, v1, v2, h1, h2):
    """CD update with the gradient normalized by its own L2 norm.

    NOTE(review): despite the name this is NOT AdaGrad — the divisor is the
    norm of the *current* gradient, not an accumulation of past squared
    gradients, and self.train_cache is overwritten rather than accumulated.
    Kept as-is to preserve training behavior; confirm intent with the author.
    """
    w_vh, w_v, w_h = self.W
    vel_vh, vel_v, vel_h = self.train_cache
    lr = self.lr

    vel_vh = np.dot(v1.T, h1) - np.dot(v2.T, h2)
    vel_vh = self.interaction_penalty(vel_vh, w_vh)
    vel_v = v1.sum(axis=0) - v2.sum(axis=0)
    vel_h = h1.sum(axis=0) - h2.sum(axis=0)

    w_vh += vel_vh * lr / (np.sqrt(np.sum(vel_vh**2)) + 1e-8)
    w_v += vel_v * lr / (np.sqrt(np.sum(vel_v**2)) + 1e-8)
    w_h += vel_h * lr / (np.sqrt(np.sum(vel_h**2)) + 1e-8)

    if any(np.any(np.isnan(w)) for w in (w_vh, w_v, w_h)):
        self.stop = True
        warnings.warn("NaN values found in weights: stopping training")
    else:
        self.W = w_vh, w_v, w_h

    self.train_cache = vel_vh, vel_v, vel_h

def gradient_rmsprop(self, v1, v2, h1, h2):
    """CD update with RMSProp: per-weight moving average of squared grads."""
    w_vh, w_v, w_h = self.W
    vel_vh, vel_v, vel_h, rms_m2_vh, rms_m2_v, rms_m2_h = self.train_cache
    rms_decay = self.rms_decay
    lr = self.lr

    vel_vh = np.dot(v1.T, h1) - np.dot(v2.T, h2)
    vel_vh = self.interaction_penalty(vel_vh, w_vh)
    vel_v = v1.sum(axis=0) - v2.sum(axis=0)
    vel_h = h1.sum(axis=0) - h2.sum(axis=0)

    rms_m2_vh = rms_decay * rms_m2_vh + (1 - rms_decay) * (vel_vh**2)
    w_vh += lr * vel_vh / np.sqrt(rms_m2_vh + 1e-8)
    rms_m2_v = rms_decay * rms_m2_v + (1 - rms_decay) * (vel_v**2)
    w_v += lr * vel_v / np.sqrt(rms_m2_v + 1e-8)
    rms_m2_h = rms_decay * rms_m2_h + (1 - rms_decay) * (vel_h**2)
    w_h += lr * vel_h / np.sqrt(rms_m2_h + 1e-8)

    if any(np.any(np.isnan(w)) for w in (w_vh, w_v, w_h)):
        self.stop = True
        warnings.warn("NaN values found in weights: stopping training")
    else:
        self.W = w_vh, w_v, w_h

    self.train_cache = vel_vh, vel_v, vel_h, rms_m2_vh, rms_m2_v, rms_m2_h

def gradient_adam(self, v1, v2, h1, h2):
    """CD update with Adam (bias-corrected first/second moment estimates)."""
    w_vh, w_v, w_h = self.W
    (
        vel_vh,
        vel_v,
        vel_h,
        adam_m1_vh,
        adam_m1_v,
        adam_m1_h,
        adam_m2_vh,
        adam_m2_v,
        adam_m2_h,
        t,
    ) = self.train_cache
    decay1 = self.adam_decay1
    decay2 = self.adam_decay2
    lr = self.lr

    vel_vh = np.dot(v1.T, h1) - np.dot(v2.T, h2)
    vel_vh = self.interaction_penalty(vel_vh, w_vh)
    vel_v = v1.sum(axis=0) - v2.sum(axis=0)
    vel_h = h1.sum(axis=0) - h2.sum(axis=0)

    # Increment t first so bias correction never divides by zero
    t += 1

    bias_correction1 = 1 - decay1**t
    bias_correction2 = 1 - decay2**t

    # Update for w_vh
    adam_m1_vh = decay1 * adam_m1_vh + (1 - decay1) * vel_vh
    adam_m2_vh = decay2 * adam_m2_vh + (1 - decay2) * (vel_vh**2)
    adam_m1_vh_hat = adam_m1_vh / bias_correction1
    adam_m2_vh_hat = adam_m2_vh / bias_correction2
    w_vh += lr * adam_m1_vh_hat / (np.sqrt(adam_m2_vh_hat) + 1e-8)

    # Update for w_v
    adam_m1_v = decay1 * adam_m1_v + (1 - decay1) * vel_v
    adam_m2_v = decay2 * adam_m2_v + (1 - decay2) * (vel_v**2)
    adam_m1_v_hat = adam_m1_v / bias_correction1
    adam_m2_v_hat = adam_m2_v / bias_correction2
    w_v += lr * adam_m1_v_hat / (np.sqrt(adam_m2_v_hat) + 1e-8)

    # Update for w_h
    adam_m1_h = decay1 * adam_m1_h + (1 - decay1) * vel_h
    adam_m2_h = decay2 * adam_m2_h + (1 - decay2) * (vel_h**2)
    adam_m1_h_hat = adam_m1_h / bias_correction1
    adam_m2_h_hat = adam_m2_h / bias_correction2
    w_h += lr * adam_m1_h_hat / (np.sqrt(adam_m2_h_hat) + 1e-8)

    if any(np.any(np.isnan(w)) for w in (w_vh, w_v, w_h)):
        self.stop = True
        warnings.warn("NaN values found in weights: stopping training")
    else:
        self.W = w_vh, w_v, w_h

    self.train_cache = (
        vel_vh,
        vel_v,
        vel_h,
        adam_m1_vh,
        adam_m1_v,
        adam_m1_h,
        adam_m2_vh,
        adam_m2_v,
        adam_m2_h,
        t,
    )

########################################## contrastive divergence steps

def kcd_step(self, ids):
    """k-step contrastive divergence update on the mini-batch `ids`."""
    v0 = self.dtm[ids, :]
    h0 = self.visible2hidden(v0)
    v1 = v0
    for _ in range(self.tK):
        v1 = self.gibbs_transition(v1)
    h1 = self.visible2hidden(v1)

    if not self.mean_h:  # converting probabilities to binaries
        h0 = self.unif_reject_sample(h0)
        h1 = self.unif_reject_sample(h1)

    self.gradient_step(v0, v1, h0, h1)
def mfcd_step(self, ids):
    """Mean-field CD update: one deterministic reconstruction pass."""
    batch = self.dtm[ids, :]
    lengths = batch.sum(axis=1)
    hid = self.visible2hidden(batch)
    # reconstruct visibles as expected counts, rescaled to document lengths
    recon = self.hidden2visible(hid) * lengths.reshape(-1, 1)
    hid_recon = self.visible2hidden(recon)
    self.gradient_step(batch, recon, hid, hid_recon)

def gradkcd_step(self, ids):
    """Gradual k-step CD: the number of Gibbs steps grows with the epoch."""
    self.tK = self.Kvec[self.t]
    if self.tK == 0:
        self.mfcd_step(ids)
    else:
        self.kcd_step(ids)

def pcd_step(self, ids):
    """Persistent CD update: model phase comes from persistent chains."""
    batch = self.dtm[ids, :]
    chain = self.persistent_v[ids, :]
    hid = self.visible2hidden(batch)
    chain_next = self.gibbs_transition(chain)
    hid_chain = self.visible2hidden(chain_next)
    self.persistent_v[ids, :] = chain_next
    self.gradient_step(batch, chain_next, hid, hid_chain)

def gradual_k(self, T, K, g=0):
    """Schedule of Gibbs-step counts over T epochs, ramping from 0 up to K.

    g controls the ramp shape: larger g delays the increase.
    """
    epochs_idx = np.arange(1, T + 1)
    schedule = np.floor((K + 1) * ((epochs_idx / (T + 1)) ** (1 + g)))
    return schedule.astype(int)

########################################## main training function

def train(
    self,
    dtm,
    num_topics=5,
    epochs=3,
    btsz=100,
    lr=0.01,
    momentum=0.5,
    K=1,
    decay=0,
    penalty_L1=False,
    penalty_local=False,
    monitor_time=False,
    monitor_ppl=False,
    monitor_loglik=False,
    train_optimizer="sgd",
    cd_type="mfcd",
    logdtm=False,
    rms_decay=0.9,
    adam_decay1=0.9,
    adam_decay2=0.999,
    increase_speed=0,
    softstart=0.001,
    winit=None,
    val_dtm=None,
    random_state=None,
    verbose=False,
):
    """Fit the Replicated Softmax Model on a document-term matrix.

    Initializes the model structure and training hyperparameters, then runs
    `epochs` passes of the selected CD variant, optionally recording time,
    perplexity and log-likelihood diagnostics each epoch (and on `val_dtm`
    when one is supplied).
    """
    if random_state is not None:
        np.random.seed(random_state)

    doval = val_dtm is not None

    # allocate weights, monitors and bookkeeping for this dtm
    self.set_structure_from_dtm(
        winit=winit,
        softstart=softstart,
        epochs=epochs,
        num_topics=num_topics,
        dtm=dtm,
        val_dtm=val_dtm,
        monitor_ppl=monitor_ppl,
        monitor_loglik=monitor_loglik,
        monitor_time=monitor_time,
        logdtm=logdtm,
    )

    # configure optimizer, CD variant and penalties
    self.set_train_hyper(
        epochs=epochs,
        btsz=btsz,
        lr=lr,
        momentum=momentum,
        K=K,
        decay=decay,
        penalty_L1=penalty_L1,
        penalty_local=penalty_local,
        train_optimizer=train_optimizer,
        cd_type=cd_type,
        rms_decay=rms_decay,
        adam_decay1=adam_decay1,
        adam_decay2=adam_decay2,
        increase_speed=increase_speed,
    )

    print("Training RS model...")
    for epoch in tqdm(range(epochs)):
        if monitor_time:
            tic = time.time()

        if self.stop:
            print("training stopped early")
            break
        else:
            self.train_epoch()

        if monitor_time:
            self.train_time[epoch] = time.time() - tic

        if monitor_ppl:
            self.train_ppl[epoch] = self.log_ppl_approx(dtm)
            if doval:
                self.val_ppl[epoch] = self.log_ppl_approx(val_dtm)

        if monitor_loglik:
            self.train_loglik[epoch] = np.mean(self.neg_free_energy(dtm))
            if doval:
                self.val_loglik[epoch] = np.mean(self.neg_free_energy(val_dtm))
def train_epoch(self):
    """One epoch of training: shuffle documents, then run the configured
    CD step on each mini-batch. Increments the epoch counter self.t."""
    # shuffle rows so mini-batches differ between epochs (SGD)
    np.random.shuffle(self.obs_ids)
    self.dtm = self.dtm[self.obs_ids, :]
    if self.persist:
        # keep each persistent chain aligned with its document
        self.persistent_v = self.persistent_v[self.obs_ids, :]

    start_id = 0
    for _ in range(self.batches):
        ids = np.arange(start_id, start_id + self.btsz)
        self.cd_learning_step(ids)
        start_id += self.btsz

    self.t += 1

def set_train_hyper(
    self,
    epochs=3,
    btsz=100,
    lr=0.01,
    momentum=0.9,
    K=1,
    decay=0,
    penalty_L1=False,
    penalty_local=False,
    train_optimizer="sgd",
    cd_type="mfcd",
    rms_decay=0.9,
    adam_decay1=0.9,
    adam_decay2=0.999,
    increase_speed=0,
):
    """Configure optimizer state, CD variant and penalties before training.

    Fixes over the original implementation:
    - rms_decay / adam_decay1 / adam_decay2 arguments are honored instead of
      being silently overwritten by hard-coded constants;
    - the Adam step counter starts at 0 so the first update uses t=1 in the
      bias-correction terms (gradient_adam increments t before use).
    """
    N, dictsize = self.dtm.shape
    num_topics = self.hidden

    self.stop = False
    self.momentum = momentum
    self.lr = lr
    self.decay = decay
    self.penalty = decay > 0
    self.penL1 = penalty_L1
    self.local_penalty = penalty_local

    self.train_optimizer = train_optimizer
    self.adam_decay1 = adam_decay1
    self.adam_decay2 = adam_decay2
    self.rms_decay = rms_decay

    self.persist = cd_type == "pcd"  # persistent_cd
    self.mean_field = cd_type == "mfcd"  # mean_field_cd
    self.gradual = cd_type == "gradcd"  # increase_cd

    self.t = 0  # current epoch
    self.K = K
    self.tK = K  # current k
    self.mean_h = True  # whether to use mean hidden activations or sample them

    self.btsz = btsz
    self.batches = int(np.floor(N / btsz))

    ## initialize k schedule
    if self.gradual:
        Kvec = self.gradual_k(T=epochs, K=self.K, g=increase_speed)
    else:
        Kvec = np.ones(epochs) * self.K
    self.Kvec = Kvec.astype(int)

    # Initialize persistent chains - one per document, each with the same
    # length as the corresponding data document.
    if self.persist:
        self.persistent_v = np.zeros((N, dictsize))
        persistent_D = self.dtm.sum(axis=1)
        for i in range(N):
            if persistent_D[i] > 0:  # skip empty documents
                # int() because np.random.multinomial requires an integer count
                self.persistent_v[i] = np.random.multinomial(
                    int(persistent_D[i]), np.ones(dictsize) / dictsize
                )

    # Initialize gradient velocities
    vel_vh = np.zeros((dictsize, num_topics))
    vel_v = np.zeros(dictsize)
    vel_h = np.zeros(num_topics)

    if self.train_optimizer == "momentum":
        self.gradient_step = self.gradient_momentum
        self.train_cache = vel_vh, vel_v, vel_h
    elif self.train_optimizer == "adagrad":
        self.gradient_step = self.gradient_adagrad
        self.train_cache = vel_vh, vel_v, vel_h
    elif self.train_optimizer == "rmsprop":
        self.gradient_step = self.gradient_rmsprop
        rms_m2_vh = np.zeros((dictsize, num_topics))
        rms_m2_v = np.zeros(dictsize)
        rms_m2_h = np.zeros(num_topics)
        self.train_cache = (vel_vh, vel_v, vel_h, rms_m2_vh, rms_m2_v, rms_m2_h)
    elif self.train_optimizer == "adam":
        self.gradient_step = self.gradient_adam
        adam_m1_vh = np.zeros((dictsize, num_topics))
        adam_m1_v = np.zeros(dictsize)
        adam_m1_h = np.zeros(num_topics)
        adam_m2_vh = np.zeros((dictsize, num_topics))
        adam_m2_v = np.zeros(dictsize)
        adam_m2_h = np.zeros(num_topics)
        t = 0  # gradient_adam increments before use, so first step is t=1
        self.train_cache = (
            vel_vh,
            vel_v,
            vel_h,
            adam_m1_vh,
            adam_m1_v,
            adam_m1_h,
            adam_m2_vh,
            adam_m2_v,
            adam_m2_h,
            t,
        )
    else:
        # 'sgd', 'full', 'minibatch' and any unknown value fall back to
        # the plain gradient (matches the original behavior)
        self.gradient_step = self.gradient_simple

    # Pick the CD learning step; precedence mirrors the original ladder
    if self.mean_field:
        self.cd_learning_step = self.mfcd_step  # input is v0
    elif self.persist:
        self.cd_learning_step = self.pcd_step  # updates persistent chains
    elif cd_type == "kcd":
        self.cd_learning_step = self.kcd_step  # K fixed
    elif self.gradual:
        self.cd_learning_step = self.gradkcd_step  # K changes each epoch
    else:
        self.cd_learning_step = self.kcd_step  # K fixed
############ perplexity and probability

def log_ppl_approx(self, dtm):
    """Upper bound on the per-word log-perplexity of a document-term matrix.

    Uses a single mean-field pass (visible -> hidden -> visible) and scores
    the data under the reconstructed word distributions.
    """
    hidden_mf = self.visible2hidden(dtm)
    word_probs = self.hidden2visible(hidden_mf)
    # clip away zeros before taking logs
    word_probs = np.clip(word_probs, 1e-12, None)
    total_count = np.sum(dtm)
    assert total_count > 0, "the sum of the dtm s entries has to be positive"
    return -np.nansum(np.log(word_probs) * dtm) / total_count

def ppl_approx(self, testmatrix):
    """Upper bound on the per-word perplexity of a document-term matrix."""
    w_vh, w_v, w_h = self.W
    doc_lens = testmatrix.sum(axis=1)

    # mean-field hidden activations
    hidden_act = self.sigmoid(np.dot(testmatrix, w_vh) + np.outer(doc_lens, w_h))

    # reconstructed word distributions
    word_scores = np.dot(hidden_act, w_vh.T) + w_v
    word_probs = self.softmax(word_scores)

    # per-word perplexity: exp of the negative mean log-likelihood
    total_loglik = np.nansum(testmatrix * np.log(word_probs))
    return np.exp(-total_loglik / np.sum(doc_lens))
+ w_v + pdf = self.softmax(v) + + # compute the per word perplexity + z = np.nansum(testmatrix * np.log(pdf)) + s = np.sum(D) + ppl = np.exp(-z / s) + return ppl + + def approx_prob(self, dtm): + w_vh, w_v, w_h = self.W + D = dtm.sum(axis=1) + # compute hidden activations + h = self.sigmoid(np.dot(dtm, w_vh) + np.outer(D, w_h)) + + # compute visible activations + v = np.dot(h, w_vh.T) + w_v + pdf = self.softmax(v) + + return pdf + + def ppl_exact_ais( + self, testmatrix, S=10000, niter=100, MH_steps=0, D=[10, 20, 40, 60] + ): + """ + return the exact perplepxity + given a document term matrix + using Annealed Importance Sampling + + S: number of intermediate distributions + niter: number of AIS runs + MH_steps: number of MH steps per intermediate distribution (0 means just Gibbs sampling) + D: list of document lengths to use for the AIS runs + """ + log_Zb_list = [] + print("Estimating partition function using AIS...") + for d in D: + log_Zb, Za, log_avg_ratio, var_log_ratio = self.ais( + S=S, niter=niter, D=d, MH_steps=MH_steps + ) + log_Zb_list.append(log_Zb) + + # estimate partition function for each document length + slope, intercept = self.simple_linreg( + X=np.array(D), Y=np.array(log_Zb_list) + ) + + N = testmatrix.shape[0] + total_loglik = 0 + print("Computing exact perplexity...") + for i in tqdm(range(N)): + doc = testmatrix[i].reshape(1, -1) + D = int(doc.sum()) + log_Zb = intercept + slope * D + loglik = self.neg_free_energy(doc) - log_Zb + total_loglik += loglik + avg_loglik = total_loglik / N + ppl = np.exp(-avg_loglik) + return ppl + + def ais(self, S=1000, niter=100, D=20, MH_steps=0): + """ + Annealed Importance Sampling to estimate the partition function of the RSM + S: number of intermediate distributions + niter: number of AIS runs + D: document length for the AIS runs + MH_steps: number of MH steps per intermediate distribution (0 means just Gibbs sampling) + """ + + T = self.hidden + Za = 2**T + K = self.visible # voacb length + # inverse 
temperature values + beta = np.arange(start=0, stop=1 + 1 / S, step=1 / S) + + # intermediate pdf + def temp_pdf(docvec, b): + return np.exp(b * np.log(self.marginal_pdf(docvec))) + + def log_temp_pdf(docvec, b): + return b * self.neg_free_energy_single_doc(docvec) + # return b*np.log(self.marginal_pdf(docvec)) + + log_w_ais_list = np.empty(niter) + for it in tqdm(range(niter)): + v_sampled = np.random.multinomial(D, np.ones(K) / K, size=1)[0] + + # loop + log_w_ais = 0 # w_ais = 1 + for s in range(S - 1): + if MH_steps > 0: + + def lpd(doc): + return log_temp_pdf(doc, beta[s]) + + for m in range(MH_steps): + v_sampled = self.MH_transition_vec(v_sampled, logpdf=lpd) + else: + v_sampled = self.gibbs_transition_vec(v_sampled) + + logratio = log_temp_pdf(v_sampled, beta[s + 1]) - log_temp_pdf( + v_sampled, beta[s] + ) + if not np.isnan(logratio): + log_w_ais = log_w_ais + logratio + # ratio = temp_pdf(v_sampled, beta[s+1])/temp_pdf(v_sampled, beta[s]) + # w_ais = w_ais*ratio + + log_w_ais_list[it] = log_w_ais + + vec = log_w_ais_list - np.log(log_w_ais_list.shape[0]) + log_avg_ratio = np.max(vec) + np.log(np.sum(np.exp(vec - np.max(vec)))) + + var_log_ratio = np.nanvar(log_w_ais_list) + + log_Zb = log_avg_ratio + np.log(Za) + # Zb = np.exp(log_Zb) + + return log_Zb, Za, log_avg_ratio, var_log_ratio + + def simple_linreg(self, X, Y): + """ + Simple linear regression to predict Y from X + Returns coefficients and intercept + """ + # Calculate means + mean_x = np.mean(X) + mean_y = np.mean(Y) + + # Calculate standard deviations + sd_x = np.std(X, ddof=1) + sd_y = np.std(Y, ddof=1) + + # Calculate correlation + correlation = np.corrcoef(X, Y)[0, 1] + + # Calculate slope (b1) using the formula: b1 = (correlation * sd_y) / sd_X + slope = (correlation * sd_y) / sd_x + + # Calculate intercept (b0) using the formula: b0 = mean_y - slope * mean_X + intercept = mean_y - slope * mean_x + + return slope, intercept diff --git a/octis/models/RS_class.py b/octis/models/RS_class.py 
import numpy as np


class Replicated_Softmax:
    """Shared machinery for Replicated-Softmax-style topic models.

    Provides activations and sampling helpers, gradient penalties, free
    energy / marginal pdf utilities, OCTIS output helpers and weight
    initialization.  Concrete models must implement ``train``,
    ``train_epoch``, ``set_train_hyper`` and ``visible2hidden``.
    """

    def __init__(self):
        # weights: tuple (w_vh, w_v, w_h) once initialized, None before
        self.W = None

    ###### to implement in the specific class

    def train(self):
        raise NotImplementedError

    def train_epoch(self):
        raise NotImplementedError

    def set_train_hyper(self):
        raise NotImplementedError

    def visible2hidden(self):
        raise NotImplementedError

    ####### activations and sampling

    def softmax(self, x):
        """
        Softmax activation by row of the matrix x.
        The denominator log(sum(exp(x[i]))) leads to many inf, so the
        LogSumExp approximation is used instead.
        """
        maxs = np.max(x, axis=1, keepdims=True)
        lse = maxs + np.log(np.sum(np.exp(x - maxs), axis=1, keepdims=True))
        return np.exp(x - lse)

    def softmax_vec(self, array):
        """Softmax activation for an array vector (single document).

        Fix: shift by the maximum before exponentiating — the previous
        plain ``exp(array)`` overflowed to inf/NaN for moderately large
        logits, unlike the matrix ``softmax`` above.
        """
        exparr = np.exp(array - np.max(array))
        return exparr / exparr.sum()

    def sigmoid(self, x):
        """basic sigmoid activation"""
        return 1 / (1 + np.exp(-x))

    def multinomial_sample(self, probs, N):
        """
        wrapper of np.random.multinomial
        probs: vector of probabilities for words count
        N: number of words to sample
        """
        return np.random.multinomial(N, probs, size=1)[0]

    def unif_reject_sample(self, probs):
        """
        Sample Bernoulli-distributed topics given a vector of
        probabilities, thresholding against U(0, 1) draws.
        """
        h_unif = np.random.rand(*probs.shape)
        h_sample = np.array(h_unif < probs, dtype=int)
        return h_sample

    def deterministic_sample(self, probs):
        """
        Deterministic Bernoulli 'sample': assigns 1 to each topic whose
        probability exceeds 0.5.
        """
        return (probs > 0.5).astype(int)

    ################### gradient utils

    def interaction_penalty(self, vel_vh, w_vh):
        """
        Adjust the gradient of the topic-word interaction weights during
        a training iteration by a penalty factor.

        The model must have the attributes:
        - penalty : bool : if the penalization should be applied
        - penL1 : bool : if the penalty is of type L1 or L2
        - local_penalty : bool : if the penalty should be local or global
        - decay : float : the penalty factor to use

        Arguments:
        - vel_vh : gradient matrix of the interaction weights
        - w_vh : interaction weights matrix (topics x words)

        NOTE(review): the global L2 branch uses ``sum(w_vh)`` rather than
        ``sum(w_vh ** 2)``; kept as-is to preserve behavior — confirm intent.
        """
        if not self.penalty:
            return vel_vh

        if self.penL1:  # L1 penalty
            if self.local_penalty:
                penal = self.decay * np.sign(w_vh)
            else:
                penal = self.decay * np.sum(np.abs(w_vh)) * np.sign(w_vh)
        else:  # L2 penalty
            if self.local_penalty:
                penal = self.decay * w_vh
            else:
                penal = self.decay * np.sum(w_vh)

        return vel_vh - penal

    ############### likelihood utils

    def neg_free_energy(self, v):
        """
        Given an array v similar to the dtm, computes the unnormalized
        log pdf (negative free energy) of each row under the replicated
        softmax.

        Fix: the per-topic ``log(1 + exp(x))`` loop is replaced by a
        vectorized ``np.logaddexp(0, x)`` (softplus), which is both
        equivalent and overflow-safe for large activations.
        """
        w_vh, w_v, w_h = self.W
        D = v.sum(axis=1)
        softplus = np.logaddexp(0.0, np.outer(D, w_h) + np.dot(v, w_vh))
        return np.dot(v, w_v) + softplus.sum(axis=1)

    def neg_free_energy_single_doc(self, v):
        """
        Given a one dimensional BoW vector v representing a single
        document, computes the unnormalized log pdf under the replicated
        softmax (same softplus form as ``neg_free_energy``).
        """
        w_vh, w_v, w_h = self.W
        D = v.sum()
        softplus = np.logaddexp(0.0, D * w_h + np.dot(v, w_vh))
        return np.dot(v, w_v) + softplus.sum()

    def marginal_pdf(self, v):
        return np.exp(self.neg_free_energy(v))

    def marginal_pdf_single_doc(self, v):
        return np.exp(self.neg_free_energy_single_doc(v))

    ############ octis output functions

    def topic_words(self, topk, id2word=None):
        """
        Given a gensim dictionary id2word, returns the topk most
        important words for each topic inside a list of T arrays,
        where T is the number of topics.

        Fix: the vocabulary array is built in *id* order
        (``id2word[i]`` for i = 0..V-1).  The previous iteration over
        ``token2id`` keys is only correct while insertion order matches
        the ids, which gensim does not guarantee after operations such
        as ``filter_extremes``/``compactify``.
        """
        w_vh, w_v, w_h = self.W
        T = self.hidden
        if id2word is None:
            id2word = self.id2word
        words = np.array([id2word[i] for i in range(w_vh.shape[0])])

        toplist = []
        for t in range(T):
            ranking = np.argsort(w_vh[:, t])[::-1]
            toplist.append(words[ranking][0:topk])

        return toplist

    def _get_topics(self, topk):
        """
        Returns the topk most important words for each topic inside a
        list of T lists (wrapper of ``topic_words``, used by the OCTIS
        class).
        """
        return self.topic_words(topk, self.id2word)

    def _get_topic_word_matrix(self):
        """
        Returns the topic representation of the words.

        Uses min-shift normalization by topic of the interaction weights
        matrix w_vh; the word ranking is unchanged with respect to the
        raw weights.

        Fix: a constant weight row (degenerate topic) previously divided
        by zero — it now maps to the uniform distribution instead.
        """
        w_vh, w_v, w_h = self.W
        topic_word = w_vh.T
        shifted = topic_word - topic_word.min(axis=1, keepdims=True)
        sums = shifted.sum(axis=1, keepdims=True)
        safe_sums = np.where(sums > 0, sums, 1.0)
        normalized = shifted / safe_sums
        # degenerate (all-equal) topics fall back to uniform mass
        vocab_size = shifted.shape[1]
        normalized = np.where(sums > 0, normalized, 1.0 / vocab_size)
        return normalized

    def _get_topic_doc(self, dtm):
        """
        Given a bidimensional dtm-like array, returns the probabilities
        of each topic for each document (topics x documents).
        """
        return self.visible2hidden(dtm).T

    ####################### train utils

    def set_structure_from_dtm(
        self,
        winit=None,
        dtm=None,
        val_dtm=None,
        softstart=0.001,
        num_topics=5,
        epochs=5,
        monitor_ppl=False,
        monitor_time=False,
        monitor_loglik=False,
        logdtm=False,
    ):
        """
        Initialize the weight matrices and monitoring buffers given the
        dtm and the number of topics.

        winit : optional (w_vh, w_v, w_h) tuple to start from (copied)
        dtm / val_dtm : train / validation document-term matrices
        softstart : scale of the N(0,1) random weight initialization
        logdtm : if True each dtm cell is transformed as log(1 + cell)
        monitor_* : pre-allocate the corresponding per-epoch buffers
        """
        doval = val_dtm is not None

        # optionally dampen raw counts with log(1 + x)
        if logdtm:
            self.dtm = np.log(1 + dtm)
            if doval:
                self.val_dtm = np.log(1 + val_dtm)
        else:
            self.dtm = dtm
            if doval:
                self.val_dtm = val_dtm

        D = self.dtm.sum(axis=1)
        assert not np.any(D == 0), "all the documents should have positive length"

        self.hidden = num_topics
        self.F = num_topics
        N, dictsize = dtm.shape
        self.visible = dictsize

        self.obs_ids = np.arange(N)

        if winit is not None:
            # defensive copy: never share mutable numpy arrays across runs
            try:
                self.W = tuple(np.array(arr, copy=True) for arr in winit)
            except Exception:
                # fallback: keep original if not iterable
                self.W = winit

        if self.W is None:
            w_vh = softstart * np.random.randn(dictsize, num_topics)
            w_v = softstart * np.random.randn(dictsize)
            w_h = softstart * np.random.randn(num_topics)
            self.W = w_vh, w_v, w_h
        else:
            print("train already available weights")
            w_vh, w_v, w_h = self.W

        if monitor_time:
            self.train_time = np.empty(epochs)

        if monitor_ppl:
            self.train_ppl = np.empty(epochs)
            if doval:
                self.val_ppl = np.empty(epochs)

        if monitor_loglik:
            self.train_loglik = np.empty(epochs)
            if doval:
                self.val_loglik = np.empty(epochs)


# NOTE(review): the original patch continues here with a new file,
# octis/models/oRSM.py, whose section is cut mid-import in this chunk:
#   from octis.models.model import AbstractModel
#   from octis.models.RS_class import Replicated_Softmax
#   import numpy as np
#   from tqdm import tqdm
#   import gensim.corpora as corpora   (continues in the next chunk)
gensim.corpora as corpora +import octis.configuration.citations as citations +import octis.configuration.defaults as defaults +import time +import warnings + +################## oRSM octis class + + +class oRSM(AbstractModel): + id2word = None + id_corpus = None + use_partitions = True + update_with_test = False + + def __init__( + self, + num_topics=50, + epochs=5, + btsz=100, + M=50, + lr=0.01, + momentum=0.9, + softstart=0.001, + epsilon=0.01, + decay=0, + penalty_L1=False, + penalty_local=False, + increase_speed=1, + rms_decay=0.9, + adam_decay1=0.9, + adam_decay2=0.999, + monitor_time=False, + monitor_ppl=False, + monitor_loglik=False, + cd_type="mfcd", + K=1, + train_optimizer="sgd", + logdtm=False, + verbose=False, + random_state=None, + pretrain_epochs=500, + ): + """ + Parameters + ---------- + num_topics : number of topics + epochs : number of training epochs + btsz : batch size + lr : learning rate + M : size of the multinomial of the third layer, so the fixed number of words in the prior + (represents the strength of the prior over the formation of topics) + epsilon : convergence threshold for mean field approximation of the two hidden layers + pretrain_epochs : number of epochs used for pretraining. + The rest (epochs-pretrain_epochs) goes in mean field training. + When higher than epochs there is no mean field training. 
+ momentum : momentum of momentum optimizer + (applied only if train_optimizer='momentum') + rms_decay : decay rate for RMSProp optimizer + (applied only if train_optimizer='rmsprop') + adam_decay1 : first decay rate for Adam optimizer + (applied only if train_optimizer='adam') + adam_decay2 : second decay rate for Adam optimizer + (applied only if train_optimizer='adam') + K : number of Gibbs sampling steps when using KCD + decay : penalization coefficient, default 0 (no penalization) + penalty_L1 : if True uses L1 penalization, else L2 penalization + penalty_local : if True uses local penalization, + else global penalization + softstart : initialization scale for weights + (randomly drawn from N(0,1)*softstart) + logdtm : if True each cell of the dtm is transformed as log(1+cell), + otherwise the raw counts are used + monitor : if True prints training information during training + + cd_type : type of contrastive divergence to use, + 'kcd', 'pcd', 'mfcd' (default) or 'gradcd' : + 'kcd' stands for k-step contrastive divergence + 'pcd' stands for persistent contrastive divergence + 'mfcd' stands for mean-field contrastive divergence + 'gradcd' stands for gradual k-step contrastive divergence, + where k increases over epochs latter when increase_speed is higher + train_optimizer : training optimizer to use : + 'full' for full batch training, + 'sgd' for simple stochastic gradient descent, + 'minibatch' for mini-batch training, + 'momentum' for mini-batch with momentum, + 'rmsprop' for RMSProp optimizer, + 'adam' for Adam optimizer, + 'adagrad' for Adagrad optimizer + + + Example usage + -------------------- + + from octis.dataset.dataset import Dataset + from octis.models.oRSM import oRSM + + dataset_20ng = Dataset() + dataset_20ng.fetch_dataset("20NewsGroup") + + ors = oRSM(num_topics=20, epochs=500, btsz=20, lr=0.0001, cd_type='mfcd', train_optimizer='rmsprop', + M=100, pretrain_epochs=450, epsilon=2) + output_ors = ors.train(dataset_20ng) + """ + 
super().__init__() + self.hyperparameters = dict() + self.hyperparameters["num_topics"] = num_topics + self.hyperparameters["btsz"] = btsz + self.hyperparameters["lr"] = lr + self.hyperparameters["momentum"] = momentum + self.hyperparameters["K"] = K + self.hyperparameters["softstart"] = softstart + self.hyperparameters["epochs"] = epochs + self.hyperparameters["increase_speed"] = increase_speed + self.hyperparameters["monitor_time"] = monitor_time + self.hyperparameters["monitor_ppl"] = monitor_ppl + self.hyperparameters["monitor_loglik"] = monitor_loglik + self.hyperparameters["penalty_L1"] = penalty_L1 + self.hyperparameters["penalty_local"] = penalty_local + self.hyperparameters["decay"] = decay + self.hyperparameters["random_state"] = random_state + self.hyperparameters["cd_type"] = cd_type + self.hyperparameters["logdtm"] = logdtm + self.hyperparameters["val_dtm"] = None + self.hyperparameters["train_optimizer"] = train_optimizer + self.hyperparameters["rms_decay"] = rms_decay + self.hyperparameters["adam_decay1"] = adam_decay1 + self.hyperparameters["adam_decay2"] = adam_decay2 + self.hyperparameters["verbose"] = verbose + + # new params in oRSM that are not in RSM + self.hyperparameters["M"] = M + self.hyperparameters["pretrain_epochs"] = pretrain_epochs + self.hyperparameters["epsilon"] = epsilon + + def info(self): + """ + Returns model informations + """ + return { + "citation": citations.models_oRSM, + "name": "oRSM, Over Replicated Softmax Model", + } + + def hyperparameters_info(self): + """ + Returns hyperparameters informations + """ + return defaults.oRSM_hyperparameters_info + + def train_model(self, dataset, hyperparams=None, top_words=10): + """ + Train the model and return output + + Parameters + ---------- + dataset : dataset to use to build the model + hyperparams : hyperparameters to build the model + top_words : if greater than 0 returns the most significant words for + each topic in the output (Default True) + Returns + ------- + result : 
dictionary with up to 3 entries, + 'topics', 'topic-word-matrix' and + 'topic-document-matrix' + """ + + self.initialize_model_structure(hyperparams=hyperparams, dataset=dataset) + self.trained_model.train(**self.hyperparameters) + return self.get_model_output(top_words) + + def get_model_output(self, top_words=10): + result = {} + + result["topic-word-matrix"] = self.trained_model._get_topic_word_matrix() + + if top_words > 0: + result["topics"] = self.trained_model._get_topics(top_words) + + result["topic-document-matrix"] = self.trained_model._get_topic_doc( + self.train_dtm + ) + + if self.use_partitions: + result["test-topic-document-matrix"] = self.trained_model._get_topic_doc( + self.test_dtm + ) + else: + result["test-topic-document-matrix"] = result["topic-document-matrix"] + + return result + + def initialize_model_structure(self, hyperparams, dataset): + if hyperparams is None: + hyperparams = {} + + if self.use_partitions: + train_corpus, test_corpus = dataset.get_partitioned_corpus( + use_validation=False + ) + else: + train_corpus = dataset.get_corpus() + + if self.id2word is None: + self.id2word = self.get_vocab(dataset.get_corpus()) + + if self.use_partitions: + if self.hyperparameters["verbose"]: + print("Building train DTM...") + self.train_dtm = self.build_dtm(train_corpus, self.id2word) + if self.hyperparameters["verbose"]: + print("Building test DTM...") + self.test_dtm = self.build_dtm(test_corpus, self.id2word) + hyperparams["dtm"] = self.train_dtm + hyperparams["val_dtm"] = self.test_dtm + else: + if self.hyperparameters["verbose"]: + print("Building DTM...") + self.train_dtm = self.build_dtm(train_corpus, self.id2word) + hyperparams["dtm"] = self.train_dtm + hyperparams["val_dtm"] = None + + if "num_topics" not in hyperparams: + hyperparams["num_topics"] = self.hyperparameters["num_topics"] + + self.hyperparameters.update(hyperparams) + + self.trained_model = self.oRSM_model() + self.trained_model.id2word = self.id2word + + ############### 
preprocessing functions + + def get_vocab(self, tokenized_corpus): + id2word = corpora.Dictionary(tokenized_corpus) + return id2word + + def build_dtm(self, tokenized_corpus, id2word=None): + """ + converts a tokenized corpus to a DOcument Term Matrix. id2word is a gensim dictionary. + """ + if id2word is None: + id2word = corpora.Dictionary(tokenized_corpus) + else: + id2word = id2word + id_corpus = [id2word.doc2bow(document) for document in tokenized_corpus] + vocab = id2word.token2id + N = len(id_corpus) + DTM = np.zeros((N, len(vocab))) + for i in tqdm(range(N)): + doc = id_corpus[i] + for id, count in doc: + DTM[i, id] = count + return DTM + + class oRSM_model(Replicated_Softmax): + def __init__(self): + super().__init__() + + def h1_to_softmax(self, h1): + """ + D: number of words in the document + h1: N x F in [0,1] + """ + w_vh, w_v, w_h = self.W + energy = np.reshape(w_v, (-1, 1)) + w_vh @ h1.T + probs = self.softmax(energy.T) + return probs + + def sample_softmax(self, visible_probs, D): + """ + D: number of words in the document, for N documents + visible_probs: N x K + """ + visible_sample = np.empty(visible_probs.shape) + for i in range(visible_probs.shape[0]): + visible_sample[i] = self.multinomial_sample(visible_probs[i], D[i]) + return visible_sample + + def sample_visible(self, h1, D): + visible_probs = self.h1_to_softmax(h1) + visible_sample = self.sample_softmax(visible_probs, D) + return visible_sample + + def v_and_h2_to_h1(self, v, h2): + w_vh, w_v, w_h = self.W + D = v.sum(axis=1) + energy = (np.outer(w_h, (D + self.M)) + w_vh.T @ (v + h2).T).T # N x F + h1 = self.sigmoid(energy) + return h1 + + def v_to_mf_h1(self, v): + w_vh, w_v, w_h = self.W + D = v.sum(axis=1) + energy = np.outer((D + self.M), w_h) + (v @ w_vh) * np.reshape( + (1 + self.M / D), (-1, 1) + ) # N x F + h1 = self.sigmoid(energy) + return h1 + + def visible2hidden(self, v): + return self.v_to_mf_h1(v) + + def visible_to_hiddens_gibbs(self, v): + """ + main function to compute 
the hidden states given visible states + in the training of the over replicated softmax model. + Uses mean field approximation to get the expected values of the two hidden layers. + The third hidden layer is initialized as uniform random. + + v: visible states N x K + """ + + converge = False + mu2 = np.random.random(self.visible) * self.M # initialize mu2 randomly + + while not converge: + old_mu2 = mu2 + h2 = mu2 * self.M # self.sample_h2(mu2, np.ones(v.shape[0])*self.M) + mu1 = self.v_and_h2_to_h1(v, h2) + mu2 = self.h1_to_softmax(mu1) + + if (np.abs(old_mu2 - mu2)).sum() < self.epsilon: + converge = True + + return mu1, mu2 + + def sample_hidden(self, v): + h1_probs = self.v_to_mf_h1(v) + h1_sample = self.unif_reject_sample(h1_probs) + return h1_sample + + ##################################### leapfrog trainsition operators + + def gibbs_transition(self, v): + """ + makes a gibbs transition on a batch of visible states v + using the full gibbs sampling for the hidden layers + """ + D = v.sum(axis=1) + hidden_probs1, hidden_probs2 = self.visible_to_hiddens_gibbs(v) + hidden_sample = self.unif_reject_sample(hidden_probs1) + visible_probs = self.h1_to_softmax(hidden_sample) + visible_sample = np.empty(v.shape) + for i in range(v.shape[0]): + visible_sample[i] = self.multinomial_sample(visible_probs[i], D[i]) + return visible_sample + + def gibbs_transition_lowcost(self, v): + """ + makes a gibbs transition on a batch of visible states v + using the mean field approximation for the hidden layers + """ + D = v.sum(axis=1) + hidden_probs = self.v_to_mf_h1(v) + hidden_sample = self.unif_reject_sample(hidden_probs) + visible_probs = self.h1_to_softmax(hidden_sample) + visible_sample = np.empty(v.shape) + for i in range(v.shape[0]): + visible_sample[i] = self.multinomial_sample(visible_probs[i], D[i]) + return visible_sample + + ######################## gradient descent optimization + + def gradient_simple(self, v1, v2, h11, h12, h21, h22): + w_vh, w_v, w_h = self.W + 
lr = self.lr + + vel_vh = np.dot((v1 + h21).T, h11) - np.dot((v2 + h22).T, h12) + vel_vh = self.interaction_penalty(vel_vh, w_vh) + + vel_v = (v1 + h21).sum(axis=0) - (v2 + h22).sum(axis=0) + vel_h = h11.sum(axis=0) - h12.sum(axis=0) + + w_vh += vel_vh * lr + w_v += vel_v * lr + w_h += vel_h * lr + + if any( + (np.any(np.isnan(w_vh)), np.any(np.isnan(w_v)), np.any(np.isnan(w_h))) + ): + self.stop = True + warnings.warn("NaN values founded in weights: stopping training") + else: + self.W = w_vh, w_v, w_h + + def gradient_momentum(self, v1, v2, h11, h12, h21, h22): + w_vh, w_v, w_h = self.W + vel_vh, vel_v, vel_h = self.train_cache + m = self.momentum + lr = self.lr + + vel_vh = vel_vh * m + ( + np.dot((v1 + h21).T, h11) - np.dot((v2 + h22).T, h12) + ) * (1 - m) + vel_vh = self.interaction_penalty(vel_vh, w_vh) + vel_v = vel_v * m + ((v1 + h21).sum(axis=0) - (v2 + h22).sum(axis=0)) * ( + 1 - m + ) + vel_h = vel_h * m + (h11.sum(axis=0) - h12.sum(axis=0)) * (1 - m) + + w_vh += vel_vh * lr + w_v += vel_v * lr + w_h += vel_h * lr + + if any( + (np.any(np.isnan(w_vh)), np.any(np.isnan(w_v)), np.any(np.isnan(w_h))) + ): + self.stop = True + warnings.warn("NaN values founded in weights: stopping training") + else: + self.W = w_vh, w_v, w_h + + self.train_cache = vel_vh, vel_v, vel_h + + def gradient_adagrad(self, v1, v2, h11, h12, h21, h22): + w_vh, w_v, w_h = self.W + vel_vh, vel_v, vel_h = self.train_cache + lr = self.lr + + vel_vh = np.dot((v1 + h21).T, h11) - np.dot((v2 + h22).T, h12) + vel_vh = self.interaction_penalty(vel_vh, w_vh) + vel_v = (v1 + h21).sum(axis=0) - (v2 + h22).sum(axis=0) + vel_h = h11.sum(axis=0) - h12.sum(axis=0) + + w_vh += vel_vh * lr / (np.sqrt(np.sum(vel_vh**2)) + 1e-8) + w_v += vel_v * lr / (np.sqrt(np.sum(vel_v**2)) + 1e-8) + w_h += vel_h * lr / (np.sqrt(np.sum(vel_h**2)) + 1e-8) + + if any( + (np.any(np.isnan(w_vh)), np.any(np.isnan(w_v)), np.any(np.isnan(w_h))) + ): + self.stop = True + warnings.warn("NaN values founded in weights: stopping 
training") + else: + self.W = w_vh, w_v, w_h + + self.train_cache = vel_vh, vel_v, vel_h + + def gradient_rmsprop(self, v1, v2, h11, h12, h21, h22): + ( + w_vh, + w_v, + w_h, + ) = self.W + vel_vh, vel_v, vel_h, rms_m2_vh, rms_m2_v, rms_m2_h = self.train_cache + rms_decay = self.rms_decay + lr = self.lr + + vel_vh = np.dot((v1 + h21).T, h11) - np.dot((v2 + h22).T, h12) + vel_vh = self.interaction_penalty(vel_vh, w_vh) + vel_v = (v1 + h21).sum(axis=0) - (v2 + h22).sum(axis=0) + vel_h = h11.sum(axis=0) - h12.sum(axis=0) + + rms_m2_vh = rms_decay * rms_m2_vh + (1 - rms_decay) * (vel_vh**2) + w_vh += lr * vel_vh / np.sqrt(rms_m2_vh + 1e-8) + rms_m2_v = rms_decay * rms_m2_v + (1 - rms_decay) * (vel_v**2) + w_v += lr * vel_v / np.sqrt(rms_m2_v + 1e-8) + rms_m2_h = rms_decay * rms_m2_h + (1 - rms_decay) * (vel_h**2) + w_h += lr * vel_h / np.sqrt(rms_m2_h + 1e-8) + + if any( + (np.any(np.isnan(w_vh)), np.any(np.isnan(w_v)), np.any(np.isnan(w_h))) + ): + self.stop = True + warnings.warn("NaN values founded in weights: stopping training") + else: + self.W = w_vh, w_v, w_h + + self.train_cache = vel_vh, vel_v, vel_h, rms_m2_vh, rms_m2_v, rms_m2_h + + def gradient_adam(self, v1, v2, h11, h12, h21, h22): + w_vh, w_v, w_h = self.W + ( + vel_vh, + vel_v, + vel_h, + adam_m1_vh, + adam_m1_v, + adam_m1_h, + adam_m2_vh, + adam_m2_v, + adam_m2_h, + t, + ) = self.train_cache + decay1 = self.adam_decay1 + decay2 = self.adam_decay2 + lr = self.lr + + vel_vh = np.dot((v1 + h21).T, h11) - np.dot((v2 + h22).T, h12) + vel_vh = self.interaction_penalty(vel_vh, w_vh) + vel_v = (v1 + h21).sum(axis=0) - (v2 + h22).sum(axis=0) + vel_h = h11.sum(axis=0) - h12.sum(axis=0) + + # Increment t first (should start from 1, not 0) + t += 1 + + # Compute bias correction terms + bias_correction1 = 1 - decay1**t + bias_correction2 = 1 - decay2**t + + # Update for w_vh + adam_m1_vh = decay1 * adam_m1_vh + (1 - decay1) * vel_vh + adam_m2_vh = decay2 * adam_m2_vh + (1 - decay2) * (vel_vh**2) + adam_m1_vh_hat = 
adam_m1_vh / bias_correction1 + adam_m2_vh_hat = adam_m2_vh / bias_correction2 + w_vh += lr * adam_m1_vh_hat / (np.sqrt(adam_m2_vh_hat) + 1e-8) + + # Update for w_v + adam_m1_v = decay1 * adam_m1_v + (1 - decay1) * vel_v + adam_m2_v = decay2 * adam_m2_v + (1 - decay2) * (vel_v**2) + adam_m1_v_hat = adam_m1_v / bias_correction1 + adam_m2_v_hat = adam_m2_v / bias_correction2 + w_v += lr * adam_m1_v_hat / (np.sqrt(adam_m2_v_hat) + 1e-8) + + # Update for w_h + adam_m1_h = decay1 * adam_m1_h + (1 - decay1) * vel_h + adam_m2_h = decay2 * adam_m2_h + (1 - decay2) * (vel_h**2) + adam_m1_h_hat = adam_m1_h / bias_correction1 + adam_m2_h_hat = adam_m2_h / bias_correction2 + w_h += lr * adam_m1_h_hat / (np.sqrt(adam_m2_h_hat) + 1e-8) + + if any( + (np.any(np.isnan(w_vh)), np.any(np.isnan(w_v)), np.any(np.isnan(w_h))) + ): + self.stop = True + warnings.warn("NaN values founded in weights: stopping training") + else: + self.W = w_vh, w_v, w_h + + self.train_cache = ( + vel_vh, + vel_v, + vel_h, + adam_m1_vh, + adam_m1_v, + adam_m1_h, + adam_m2_vh, + adam_m2_v, + adam_m2_h, + t, + ) + + ####################### contrastive divergence steps + + ##### cd steps for training + + def kcd_step(self, ids): + v = self.dtm[ids, :] + h1, mu2 = self.visible_to_hiddens_gibbs(v) + h2 = mu2 * self.M # self.sample_h2(mu2, np.ones(v.shape[0])*self.M) + + D = v.sum(axis=1) + for k in range(self.tK): + v_model = self.sample_visible(h1, D) + h1_model, mu2_model = self.visible_to_hiddens_gibbs(v_model) + + h2_model = mu2_model * self.M + self.gradient_step(v, v_model, h1, h1_model, h2, h2_model) + + def pcd_step(self, ids): + v0 = self.dtm[ids, :] + pv0 = self.persistent_v[ids, :] + h1, h2 = self.visible_to_hiddens_gibbs(v0) + pv1 = self.gibbs_transition(pv0) + ph1, ph2 = self.visible_to_hiddens_gibbs(pv1) + h2 = h2 * self.M + ph2 = ph2 * self.M + self.persistent_v[ids, :] = pv1 + + self.gradient_step(v0, pv1, h1, ph1, h2, ph2) + + def mfcd_step(self, ids): + v0 = self.dtm[ids, :] + D = 
v0.sum(axis=1) + h0, mu0 = self.visible_to_hiddens_gibbs(v0) + v1 = self.h1_to_softmax(h0) * D.reshape(-1, 1) + h1, mu1 = self.visible_to_hiddens_gibbs(v1) + mu0 = mu0 * self.M + mu1 = mu1 * self.M + self.gradient_step(v0, v1, h0, h1, mu0, mu1) + + def gradkcd_step(self, ids): + self.tK = self.Kvec[self.t] + if self.tK == 0: + self.mfcd_step(ids) + else: + self.kcd_step(ids) + + def gradual_k(self, T, K, g=0): + t = np.arange(1, T + 1) + k = np.floor((K + 1) * ((t / (T + 1)) ** (1 + g))).astype(int) + return k + + ##### cd steps for pre-training + + def pretrain_kcd_step(self, ids): + v = self.dtm[ids, :] + + h1 = self.v_to_mf_h1(v) + D = v.sum(axis=1) + h2 = v * self.M / D.reshape(-1, 1) + + for k in range(self.tK): + v_model = self.sample_visible(h1, D) + h1_model = self.v_to_mf_h1(v_model) + + mu2_model = self.h1_to_softmax(h1_model) + h2_model = mu2_model * self.M + self.gradient_step(v, v_model, h1, h1_model, h2, h2_model) + + def pretrain_mfcd_step(self, ids): + v0 = self.dtm[ids, :] + D = v0.sum(axis=1) + h0 = self.v_to_mf_h1(v0) + v1 = self.h1_to_softmax(h0) * D.reshape(-1, 1) + h1 = self.v_to_mf_h1(v1) + self.gradient_step( + v0, + v1, + h0, + h1, + v0 * self.M / D.reshape(-1, 1), + v1 * self.M / D.reshape(-1, 1), + ) + + def pretrain_pcd_step(self, ids): + v0 = self.dtm[ids, :] + pv0 = self.persistent_v[ids, :] + D = v0.sum(axis=1) + h0 = self.v_to_mf_h1(v0) + pv1 = self.gibbs_transition_lowcost(pv0) + ph1 = self.v_to_mf_h1(pv1) + self.persistent_v[ids, :] = pv1 + + self.gradient_step( + v0, + pv1, + h0, + ph1, + v0 * self.M / D.reshape(-1, 1), + pv1 * self.M / D.reshape(-1, 1), + ) + + def pretrain_gradkcd_step(self, ids): + self.tK = self.Kvec[self.t] + if self.tK == 0: + self.pretrain_mfcd_step(ids) + else: + self.pretrain_kcd_step(ids) + + ############################### main train function + + def train( + self, + dtm, + num_topics=5, + epochs=3, + M=50, + btsz=100, + pretrain_epochs=1, + epsilon=10, + lr=0.01, + momentum=0.1, + K=1, + decay=0, + 
penalty_L1=False, + penalty_local=False, + monitor_time=True, + monitor_ppl=False, + monitor_loglik=False, + train_optimizer="sgd", + cd_type="mfcd", + logdtm=False, + rms_decay=0.9, + adam_decay1=0.9, + adam_decay2=0.999, + increase_speed=0, + softstart=0.001, + winit=None, + val_dtm=None, + random_state=None, + verbose=False, + ): + ## init global variables + if random_state is not None: + np.random.seed(random_state) + + doval = val_dtm is not None + + # init structure of the model + self.set_structure_from_dtm( + winit=winit, + softstart=softstart, + epochs=epochs, + num_topics=num_topics, + dtm=dtm, + val_dtm=val_dtm, + monitor_ppl=monitor_ppl, + monitor_loglik=monitor_loglik, + monitor_time=monitor_time, + logdtm=logdtm, + ) + + ##init training hyperparams + self.set_train_hyper( + epochs=epochs, + btsz=btsz, + lr=lr, + momentum=momentum, + K=K, + decay=decay, + penalty_L1=penalty_L1, + penalty_local=penalty_local, + train_optimizer=train_optimizer, + cd_type=cd_type, + rms_decay=rms_decay, + adam_decay1=adam_decay1, + adam_decay2=adam_decay2, + increase_speed=increase_speed, + pretrain_epochs=pretrain_epochs, + M=M, + epsilon=epsilon, + ) + + ## MAIN TRAIN LOOP + print("Training OverRS model...") + + for t in tqdm(range(epochs)): + if monitor_time: + current_time = time.time() + + if self.stop: + print("training stopped early") + break + else: + self.train_epoch() + + if monitor_time: + elapsed_time = time.time() - current_time + self.train_time[t] = elapsed_time + + if monitor_ppl: + self.train_ppl[t] = self.log_ppl_upbo(dtm) + + if doval: + self.val_ppl[t] = self.log_ppl_upbo(val_dtm) + + if monitor_loglik: + self.train_loglik[t] = np.mean(self.neg_free_energy(dtm)) + + if doval: + self.val_loglik[t] = np.mean(self.neg_free_energy(val_dtm)) + + def train_epoch(self): + """one epoch of training, with sgd and mini-batches""" + + start_id = 0 + np.random.shuffle(self.obs_ids) # apply sgd + self.dtm = self.dtm[self.obs_ids, :] + + if self.persist: + 
self.persistent_v = self.persistent_v[self.obs_ids, :] + + if self.t < self.pretrain_epochs: + for b in range(self.batches): + ids = np.arange(start_id, start_id + self.btsz) + self.cd_pretrain_learning_step(ids) + start_id += self.btsz + + else: + for b in range(self.batches): + ids = np.arange(start_id, start_id + self.btsz) + self.cd_learning_step(ids) + start_id += self.btsz + + self.t += 1 + + def set_train_hyper( + self, + epochs=3, + btsz=100, + lr=0.01, + momentum=0.5, + K=1, + decay=0, + penalty_L1=False, + penalty_local=False, + train_optimizer="sgd", + cd_type="mfcd", + rms_decay=0.9, + adam_decay1=0.9, + adam_decay2=0.999, + increase_speed=0, + pretrain_epochs=500, + M=50, + epsilon=0.01, + ): + N, dictsize = self.dtm.shape + num_topics = self.hidden + + self.stop = False + self.momentum = momentum + self.lr = lr + self.decay = decay + self.penalty = decay > 0 + self.penL1 = penalty_L1 + self.local_penalty = penalty_local + + self.train_optimizer = train_optimizer + self.adam_decay1 = adam_decay1 + self.adam_decay2 = adam_decay2 + self.rms_decay = rms_decay + + self.persist = cd_type == "pcd" # persistent_cd + self.mean_field = cd_type == "mfcd" # mean_field_cd + self.gradual = cd_type == "gradcd" # increase_cd + + self.t = 0 # current epoch + self.pretrain_epochs = pretrain_epochs + self.epsilon = epsilon + self.M = M + self.K = K + self.tK = K # current k + self.mean_h = True # whether to use mean hidden activations or sample them + + self.btsz = btsz + self.batches = int(np.floor(N / btsz)) + # self.bt_correct = (btsz**2)/N #a bayesian would correct decay for batch size. 
I'm not a bayesian + + ## initialize k + if self.gradual: + Kvec = self.gradual_k(T=epochs, K=self.K, g=increase_speed) + else: + Kvec = np.ones(epochs) * self.K + self.Kvec = Kvec.astype(int) + + # Initialize persistent chain - one chain for each document in the dataset + # Each persistent visible should have the same document length as corresponding data + if self.persist: + self.persistent_v = np.zeros((N, dictsize)) # Full dataset size + persistent_D = self.dtm.sum( + axis=1 + ) # Document lengths from original data + + # Initialize each document with uniform multinomial of its actual length + for i in range(N): + if persistent_D[i] > 0: # Avoid empty documents + self.persistent_v[i] = np.random.multinomial( + persistent_D[i], np.ones(dictsize) / dictsize + ) + + # Initialize weights gradients + vel_vh = np.zeros((dictsize, num_topics)) + vel_v = np.zeros((dictsize)) + vel_h = np.zeros((num_topics)) + + if self.train_optimizer == "sgd": + self.gradient_step = self.gradient_simple + else: + if self.train_optimizer == "momentum": + self.gradient_step = self.gradient_momentum + self.train_cache = vel_vh, vel_v, vel_h + else: + if self.train_optimizer == "adagrad": + self.gradient_step = self.gradient_adagrad + self.train_cache = vel_vh, vel_v, vel_h + else: + if self.train_optimizer == "rmsprop": + self.gradient_step = self.gradient_rmsprop + rms_m2_vh = np.zeros((dictsize, num_topics)) + rms_m2_v = np.zeros((dictsize)) + rms_m2_h = np.zeros((num_topics)) + self.rms_decay = 0.9 + self.train_cache = ( + vel_vh, + vel_v, + vel_h, + rms_m2_vh, + rms_m2_v, + rms_m2_h, + ) + else: + if self.train_optimizer == "adam": + self.gradient_step = self.gradient_adam + adam_m1_vh = np.zeros((dictsize, num_topics)) + adam_m1_v = np.zeros((dictsize)) + adam_m1_h = np.zeros((num_topics)) + adam_m2_vh = np.zeros((dictsize, num_topics)) + adam_m2_v = np.zeros((dictsize)) + adam_m2_h = np.zeros((num_topics)) + t = 1 + self.adam_decay1 = 0.9 + self.adam_decay2 = 0.999 + 
self.train_cache = ( + vel_vh, + vel_v, + vel_h, + adam_m1_vh, + adam_m1_v, + adam_m1_h, + adam_m2_vh, + adam_m2_v, + adam_m2_h, + t, + ) + else: + self.gradient_step = self.gradient_simple + + if self.mean_field: + self.cd_learning_step = self.mfcd_step # input is v0 + self.cd_pretrain_learning_step = self.pretrain_mfcd_step + else: + if self.persist: + self.cd_learning_step = ( + self.pcd_step + ) # input is v0, persistent_v, output is new persistent_v + self.cd_pretrain_learning_step = self.pretrain_pcd_step + else: + if cd_type == "kcd": + self.cd_learning_step = self.kcd_step # input is v0, K fixed + self.cd_pretrain_learning_step = self.pretrain_kcd_step + else: # gradual kcd + if self.gradual: + self.cd_learning_step = ( + self.gradkcd_step + ) # input is v0, change K each epoch + self.cd_pretrain_learning_step = self.pretrain_gradkcd_step + else: + self.cd_learning_step = ( + self.kcd_step + ) # input is v0, K fixed + self.cd_pretrain_learning_step = self.pretrain_kcd_step + + def log_ppl_approx(self, dtm): + """ + return the log perplepxity + given a document term matrix + """ + mfh = self.v_to_mf_h1(dtm) + vprob = self.h1_to_softmax(mfh) + lppl = -np.nansum(np.log(vprob) * dtm) / np.sum(dtm) + return lppl + + def ppl_approx(self, testmatrix): + """ + return the perplepxity + given a document term matrix + """ + ppl = np.exp(self.log_ppl_approx(testmatrix)) + return ppl diff --git a/setup.py b/setup.py index 516a446f..7fc9ce87 100644 --- a/setup.py +++ b/setup.py @@ -27,8 +27,6 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', ], description="OCTIS: a library for Optimizing and Comparing Topic Models.", diff --git a/tests/test_octis.py b/tests/test_octis.py index 5dd107c4..61b8635e 100644 --- a/tests/test_octis.py +++ b/tests/test_octis.py @@ -5,6 +5,8 @@ import 
def _assert_model_output(output, dataset, num_topics):
    """Shared shape/type checks for an OCTIS model's train_model() output."""
    assert 'topics' in output.keys()
    assert 'topic-word-matrix' in output.keys()
    assert 'test-topic-document-matrix' in output.keys()

    # check topics format
    assert type(output['topics']) == list
    assert len(output['topics']) == num_topics

    # check topic-word-matrix format
    assert type(output['topic-word-matrix']) == np.ndarray
    assert output['topic-word-matrix'].shape == (num_topics, len(
        dataset.get_vocabulary()))

    # check topic-document-matrix format (training partition)
    assert type(output['topic-document-matrix']) == np.ndarray
    assert output['topic-document-matrix'].shape == (num_topics, len(
        dataset.get_partitioned_corpus()[0]))

    # check test-topic-document-matrix format (test partition)
    assert type(output['test-topic-document-matrix']) == np.ndarray
    assert output['test-topic-document-matrix'].shape == (num_topics, len(
        dataset.get_partitioned_corpus()[2]))


def test_model_output_rsm(data_dir):
    """RSM (Replicated Softmax) produces a well-formed output dict."""
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = RSM(num_topics=num_topics, epochs=2)
    output = model.train_model(dataset)
    _assert_model_output(output, dataset, num_topics)


def test_model_output_orsm(data_dir):
    """oRSM (over-Replicated Softmax) produces a well-formed output dict."""
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = oRSM(num_topics=num_topics, epochs=2)
    output = model.train_model(dataset)
    _assert_model_output(output, dataset, num_topics)