leleamol · SuperLinguini · Feb 28, 2018 · Mar 2, 2018 · Mar 2, 2018 · Mar 6, 2018
diff --git a/pytorch_benchmark/pytorch_word_language_model/README.md b/pytorch_benchmark/pytorch_word_language_model/README.md
@@ -0,0 +1,56 @@
+# Word-level language modeling RNN
+
+This example trains a multi-layer RNN (Elman, GRU, or LSTM) on a language modeling task.
+By default, the training script uses the Wikitext-2 dataset, provided.
+The trained model can then be used by the generate script to generate new text.
+
+```bash
+python main.py --cuda --epochs 6        # Train an LSTM on Wikitext-2 with CUDA, reaching perplexity of 117.61
+python main.py --cuda --epochs 6 --tied # Train a tied LSTM on Wikitext-2 with CUDA, reaching perplexity of 110.44
+python main.py --cuda --tied            # Train a tied LSTM on Wikitext-2 with CUDA for 40 epochs, reaching perplexity of 87.17
+python generate.py                      # Generate samples from the trained LSTM model.
+```
+
+The model uses the `nn.RNN` module (and its sister modules `nn.GRU` and `nn.LSTM`)
+which will automatically use the cuDNN backend if run on CUDA with cuDNN installed.
+
+During training, if a keyboard interrupt (Ctrl-C) is received,
+training is stopped and the current model is evaluated against the test dataset.
+
+The `main.py` script accepts the following arguments:
+
+```bash
+optional arguments:
+  -h, --help         show this help message and exit
+  --data DATA        location of the data corpus
+  --model MODEL      type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)
+  --emsize EMSIZE    size of word embeddings
+  --nhid NHID        number of hidden units per layer
+  --nlayers NLAYERS  number of layers
+  --lr LR            initial learning rate
+  --clip CLIP        gradient clipping
+  --epochs EPOCHS    upper epoch limit
+  --batch-size N     batch size
+  --bptt BPTT        sequence length
+  --dropout DROPOUT  dropout applied to layers (0 = no dropout)
+  --decay DECAY      learning rate decay per epoch
+  --tied             tie the word embedding and softmax weights
+  --seed SEED        random seed
+  --cuda             use CUDA
+  --log-interval N   report interval
+  --save SAVE        path to save the final model
+```
+
+With these arguments, a variety of models can be tested.
+As an example, the following arguments produce slower but better models:
+
+```bash
+python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40           # Test perplexity of 80.97
+python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 --tied    # Test perplexity of 75.96
+python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40        # Test perplexity of 77.42
+python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied # Test perplexity of 72.30
+```
+
+Perplexities on PTB are equal to or better than
+[Recurrent Neural Network Regularization (Zaremba et al. 2014)](https://arxiv.org/pdf/1409.2329.pdf)
+and are similar to [Using the Output Embedding to Improve Language Models (Press & Wolf 2016)](https://arxiv.org/abs/1608.05859) and [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling (Inan et al. 2016)](https://arxiv.org/pdf/1611.01462.pdf), though both of these papers have improved perplexities by using a form of recurrent dropout [(variational dropout)](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks).
diff --git a/pytorch_benchmark/pytorch_word_language_model/data.py b/pytorch_benchmark/pytorch_word_language_model/data.py
@@ -0,0 +1,54 @@
+import os
+import torch
+import subprocess
+
+class Dictionary(object):
+    def __init__(self):
+        self.word2idx = {}
+        self.idx2word = []
+
+    def add_word(self, word):
+        if word not in self.word2idx:
+            self.idx2word.append(word)
+            self.word2idx[word] = len(self.idx2word) - 1
+        return self.word2idx[word]
+
+    def __len__(self):
+        return len(self.idx2word)
+
+
+class Corpus(object):
+    def __init__(self, path):
+        self.dictionary = Dictionary()
+        self.train = self.tokenize(os.path.join(path, 'train.txt'))
+        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
+        self.test = self.tokenize(os.path.join(path, 'test.txt'))
+
+    def tokenize(self, path):
+        """Tokenizes a text file."""
+        try:
+            assert os.path.exists(path)
+        except AssertionError:
+            subprocess.call(['{}/get_ptb_data.sh'.format(os.path.dirname(__file__))])
+            assert os.path.exists(path)
+
+        # Add words to the dictionary
+        with open(path, 'r') as f:
+            tokens = 0
+            for line in f:
+                words = line.split() + ['<eos>']
+                tokens += len(words)
+                for word in words:
+                    self.dictionary.add_word(word)
+
+        # Tokenize file content
+        with open(path, 'r') as f:
+            ids = torch.LongTensor(tokens)
+            token = 0
+            for line in f:
+                words = line.split() + ['<eos>']
+                for word in words:
+                    ids[token] = self.dictionary.word2idx[word]
+                    token += 1
+
+        return ids
diff --git a/pytorch_benchmark/pytorch_word_language_model/generate.py b/pytorch_benchmark/pytorch_word_language_model/generate.py
@@ -0,0 +1,77 @@
+###############################################################################
+# Language Modeling on Penn Tree Bank
+#
+# This file generates new sentences sampled from the language model
+#
+###############################################################################
+
+import argparse
+
+import torch
+from torch.autograd import Variable
+
+import data
+
+parser = argparse.ArgumentParser(description='PyTorch Language Model')
+
+# Model parameters.
+parser.add_argument('--data', type=str, default='./data/ptb/',
+                    help='location of the data corpus')
+parser.add_argument('--checkpoint', type=str, default='./model.pt',
+                    help='model checkpoint to use')
+parser.add_argument('--outf', type=str, default='generated.txt',
+                    help='output file for generated text')
+parser.add_argument('--words', type=int, default='1000',
+                    help='number of words to generate')
+parser.add_argument('--seed', type=int, default=1111,
+                    help='random seed')
+parser.add_argument('--cuda', action='store_true',
+                    help='use CUDA')
+parser.add_argument('--temperature', type=float, default=1.0,
+                    help='temperature - higher will increase diversity')
+parser.add_argument('--log-interval', type=int, default=100,
+                    help='reporting interval')
+args = parser.parse_args()
+
+# Set the random seed manually for reproducibility.
+torch.manual_seed(args.seed)
+if torch.cuda.is_available():
+    if not args.cuda:
+        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
+    else:
+        # Seed the CUDA generator as well when generation runs on the GPU.
+        torch.cuda.manual_seed(args.seed)
+
+# Reject temperatures below 1e-3: the sampling loop divides the model output
+# by this value before exponentiating.
+if args.temperature < 1e-3:
+    parser.error("--temperature has to be greater or equal 1e-3")
+
+# NOTE(review): torch.load unpickles the checkpoint, so only load files from
+# a trusted source.
+with open(args.checkpoint, 'rb') as f:
+    model = torch.load(f)
+# Put the model in evaluation mode before generating.
+model.eval()
+
+# Place the model on the device selected by --cuda.
+if args.cuda:
+    model.cuda()
+else:
+    model.cpu()
+
+# The corpus is loaded only for its dictionary (vocabulary size and the
+# id -> word table used when writing the generated text).
+corpus = data.Corpus(args.data)
+ntokens = len(corpus.dictionary)
+# A DataParallel wrapper exposes the wrapped model as `.module`, so
+# init_hidden has to be called on that instead of on the wrapper.
+# NOTE(review): the argument 1 is presumably the batch size - confirm against
+# the model definition.
+if isinstance(model, torch.nn.parallel.DataParallel):
+    hidden = model.module.init_hidden(1)
+else:
+    hidden = model.init_hidden(1)
+# Start from a random token id: uniform [0, 1) scaled by the vocabulary size
+# and truncated to an integer, stored in a 1x1 tensor.
+input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True)
+if args.cuda:
+    input.data = input.data.cuda()
+
+# Generate args.words words one at a time, feeding each sampled word back in
+# as the next input, and write them to the output file.
+with open(args.outf, 'w') as outf:
+    for i in range(args.words):
+        # The returned hidden state is carried over to the next step.
+        output, hidden = model(input, hidden)
+        # Divide the model output by the temperature and exponentiate to get
+        # unnormalized sampling weights, moved to the CPU for sampling.
+        # NOTE(review): presumably `output` holds one score per vocabulary
+        # entry - confirm against the model definition.
+        word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
+        # Draw one index with probability proportional to its weight.
+        word_idx = torch.multinomial(word_weights, 1)[0]
+        # Overwrite the input tensor in place with the sampled id.
+        input.data.fill_(word_idx)
+        word = corpus.dictionary.idx2word[word_idx]
+
+        # Break the output into lines of 20 words.
+        outf.write(word + ('\n' if i % 20 == 19 else ' '))
+
+        if i % args.log_interval == 0:
+            print('| Generated {}/{} words'.format(i, args.words))
diff --git a/pytorch_benchmark/pytorch_word_language_model/get_ptb_data.sh b/pytorch_benchmark/pytorch_word_language_model/get_ptb_data.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+RNN_DIR=$(cd `dirname $0`; pwd)
+DATA_DIR="${RNN_DIR}/data/ptb/"
+
+if [[ ! -d "${DATA_DIR}" ]]; then
+  echo "${DATA_DIR} doesn't exist, will create one";
+  mkdir -p ${DATA_DIR}
+fi
+
+wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.train.txt -O ${DATA_DIR}/train.txt;
+wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.valid.txt -O ${DATA_DIR}/valid.txt;
+wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.test.txt -O ${DATA_DIR}/test.txt;
+wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt;