Seq2SeqModel

Summary

Creates a seq2seq model with buckets and attention. It sets up the following:

self.encoder_inputs
self.decoder_inputs
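self.target_weights (one float placeholder per decoder step, passed to model_with_buckets together with the targets; presumably used to weight each step's loss, e.g. to mask padding — to be confirmed)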
self.outputs and self.losses by model_with_buckets

Code with comments

  def __init__(self,
               source_vocab_size,
               target_vocab_size,
               buckets,
               size,
               num_layers,
               max_gradient_norm,
               batch_size,
               learning_rate,
               learning_rate_decay_factor,
               use_lstm=False,
               num_samples=512,
               forward_only=False,
               dtype=tf.float32,
               beam_search=True,
               beam_size=5,
               attention=True):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
      dtype: the data type to use to store internal variables.
      beam_search: forwarded to the seq2seq function; when forward_only is
        also set, the graph is built with decode_model_with_buckets, which
        yields self.beam_path and self.beam_symbol instead of losses.
      beam_size: forwarded to the seq2seq function as its beam_size argument.
      attention: if true, build the graph with embedding_attention_seq2seq,
        otherwise with embedding_rnn_seq2seq.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if 0 < num_samples < self.target_vocab_size:
      w_t = tf.get_variable(
          "proj_w", [self.target_vocab_size, size], dtype=dtype)
      w = tf.transpose(w_t)
      # Named `bias` (not `b`) so that later loop indices cannot rebind the
      # variable captured by the `sampled_loss` closure below.
      bias = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
      output_projection = (w, bias)

      # Sampled softmax keeps the loss tractable for a very large number of
      # output classes.
      def sampled_loss(labels, logits):
        labels = tf.reshape(labels, [-1, 1])
        # We need to compute the sampled_softmax_loss using 32bit floats to
        # avoid numerical instabilities.
        local_w_t = tf.cast(w_t, tf.float32)
        local_b = tf.cast(bias, tf.float32)
        local_inputs = tf.cast(logits, tf.float32)
        # weights = [num_classes, dim]
        # bias = [num_classes]
        # labels = [batch_size, num_true(=1 choose 1)]
        #  batch 0: [5] this char should be index=5
        # inputs = [batch_size, dim]
        #  batch 0: [vector representation of input 0]
        # return: A batch_size 1-D tensor of per-example sampled softmax losses.
        return tf.cast(
            tf.nn.sampled_softmax_loss(
                weights=local_w_t,
                biases=local_b,
                labels=labels,
                inputs=local_inputs,
                num_sampled=num_samples,
                num_classes=self.target_vocab_size),
            dtype)
      softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    def single_cell():
      if use_lstm:
        return tf.contrib.rnn.BasicLSTMCell(size)
      return tf.contrib.rnn.GRUCell(size)

    cell = single_cell()
    if num_layers > 1:
      cell = tf.contrib.rnn.MultiRNNCell(
          [single_cell() for _ in range(num_layers)], state_is_tuple=False)

    # The seq2seq function: we use embedding for the input and, optionally,
    # attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
      if attention:
        print("Attention Model")
        # TODO(higepon): replace
        return embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=size,
            output_projection=output_projection,
            feed_previous=do_decode,
            beam_search=beam_search,
            beam_size=beam_size)
      print("Simple Model")
      # TODO(higepon): replace
      return embedding_rnn_seq2seq(
          encoder_inputs, decoder_inputs, cell,
          num_encoder_symbols=source_vocab_size,
          num_decoder_symbols=target_vocab_size,
          embedding_size=size,
          output_projection=output_projection,
          feed_previous=do_decode,
          beam_search=beam_search,
          beam_size=beam_size)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    # One placeholder per encoder position; the last bucket is the biggest.
    for i in range(buckets[-1][0]):
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="encoder{0}".format(i)))
    # One extra decoder position so that targets (shifted by one) cover the
    # full output length of the largest bucket.
    for i in range(buckets[-1][1] + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(dtype, shape=[None],
                                                name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Training outputs and losses.
    if forward_only:
      if beam_search:
        (self.outputs,
         self.beam_path,
         self.beam_symbol) = decode_model_with_buckets(
             self.encoder_inputs, self.decoder_inputs, targets,
             self.target_weights, buckets,
             lambda x, y: seq2seq_f(x, y, True),
             softmax_loss_function=softmax_loss_function)
        # Beam-search decoding yields no per-bucket losses. Define the
        # attribute anyway so the summary construction below (and callers
        # reading self.losses) do not fail with AttributeError.
        self.losses = []
      else:
        self.outputs, self.losses = (
            tf.contrib.legacy_seq2seq.model_with_buckets(
                self.encoder_inputs, self.decoder_inputs, targets,
                self.target_weights, buckets,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function))
      # If we use output projection, we need to project outputs for decoding.
      if output_projection is not None:
        proj_w, proj_b = output_projection
        for bucket_id in range(len(buckets)):
          self.outputs[bucket_id] = [
              tf.matmul(output, proj_w) + proj_b
              for output in self.outputs[bucket_id]
          ]
    else:
      # Training graph: the decoder is fed the ground-truth inputs.
      self.outputs, self.losses = model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, buckets,
          lambda x, y: seq2seq_f(x, y, False),
          softmax_loss_function=softmax_loss_function)

    # Per-bucket scalar summaries; train and valid variants wrap the same
    # loss tensors under different summary names.
    self.train_loss_summaries = [
        tf.summary.scalar("train_loss_bucket{}".format(i), loss)
        for i, loss in enumerate(self.losses)
    ]
    self.valid_loss_summaries = [
        tf.summary.scalar("valid_loss_bucket{}".format(i), loss)
        for i, loss in enumerate(self.losses)
    ]

    # Gradients and update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
      self.gradient_norms = []
      self.updates = []
      # NOTE(review): Adam runs with a fixed 1e-4 step size, so
      # self.learning_rate / self.learning_rate_decay_op do not influence
      # training here. Confirm this is intended.
      opt = tf.train.AdamOptimizer(1e-4)

      for bucket_id in range(len(buckets)):
        gradients = tf.gradients(self.losses[bucket_id], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())

Seq2SeqModel

Summary

Code with comments

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Clone this wiki locally