Skip to content

Seq2SeqModel

Higepon Taro Minowa edited this page Jul 9, 2017 · 5 revisions

Summary

Creates a seq2seq model with buckets and (optionally) attention. It sets up the following:

  • self.encoder_inputs
  • self.decoder_inputs
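  • self.target_weights (one float placeholder per decoder step, passed to model_with_buckets as the per-step loss weights; presumably set to 0 for padding positions so they do not contribute to the loss)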
  • self.outputs and self.losses by model_with_buckets

Code with comments

  def __init__(self,
               source_vocab_size,
               target_vocab_size,
               buckets,
               size,
               num_layers,
               max_gradient_norm,
               batch_size,
               learning_rate,
               learning_rate_decay_factor,
               use_lstm=False,
               num_samples=512,
               forward_only=False,
               dtype=tf.float32,
               beam_search=True,
               beam_size=5,
               attention=True):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
      dtype: the data type to use to store internal variables.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if 0 < num_samples < self.target_vocab_size:
      w_t = tf.get_variable("proj_w", [self.target_vocab_size, size], dtype=dtype)
      w = tf.transpose(w_t)
      b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
      output_projection = (w, b)

      # this is necessary for huge # of classes softmax classifier
      def sampled_loss(labels, logits):
        labels = tf.reshape(labels, [-1, 1])
        # We need to compute the sampled_softmax_loss using 32bit floats to
        # avoid numerical instabilities.
        local_w_t = tf.cast(w_t, tf.float32)
        local_b = tf.cast(b, tf.float32)
        local_inputs = tf.cast(logits, tf.float32)
        # weights = [num_classes, dim]
        # bias = [num_classes]
        # labels = [batch_size, num_true(=1 choose 1)]
        #  batch 0: [5] this char should be index=5
        # inputs = [batch_size, dim]
        #  batch 0: [vector representation of input 0]
        # return: A batch_size 1-D tensor of per-example sampled softmax losses.
        return tf.cast(
            tf.nn.sampled_softmax_loss(
                weights=local_w_t,
                biases=local_b,
                labels=labels,
                inputs=local_inputs,
                num_sampled=num_samples,
                num_classes=self.target_vocab_size),
            dtype)
      softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    def single_cell():
      return tf.contrib.rnn.GRUCell(size)
    if use_lstm:
      def single_cell():
        return tf.contrib.rnn.BasicLSTMCell(size)
    cell = single_cell()
    if num_layers > 1:
      cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(num_layers)], state_is_tuple=False)

    # The seq2seq function: we use embedding for the input and attention.
#    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
#      return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
#          encoder_inputs,
#          decoder_inputs,
#          cell,
#          num_encoder_symbols=source_vocab_size,
#          num_decoder_symbols=target_vocab_size,
#          embedding_size=size,
#          output_projection=output_projection,
#          feed_previous=do_decode,
#          dtype=dtype)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        if attention:
            print("Attention Model")
            ## todo higepon replace
            return embedding_attention_seq2seq(
               encoder_inputs, decoder_inputs, cell,
               num_encoder_symbols=source_vocab_size,
               num_decoder_symbols=target_vocab_size,
               embedding_size=size,
               output_projection=output_projection,
               feed_previous=do_decode,
               beam_search=beam_search,
               beam_size=beam_size )
        else:
            print("Simple Model")
            ## todo higepon replace
            return embedding_rnn_seq2seq(
              encoder_inputs, decoder_inputs, cell,
              num_encoder_symbols=source_vocab_size,
              num_decoder_symbols=target_vocab_size,
              embedding_size=size,
              output_projection=output_projection,
              feed_previous=do_decode,
              beam_search=beam_search,
              beam_size=beam_size )

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    ## for each encoder input
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(dtype, shape=[None],
                                                name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Training outputs and losses.
    if forward_only:
        if beam_search:
              self.outputs, self.beam_path, self.beam_symbol = decode_model_with_buckets(
                  self.encoder_inputs, self.decoder_inputs, targets,
                  self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True),
                  softmax_loss_function=softmax_loss_function)
              ## Added by higepon 7/7/2016
              if output_projection is not None:
                    for b in range(len(buckets)):
                      self.outputs[b] = [
                          tf.matmul(output, output_projection[0]) + output_projection[1]
                          for output in self.outputs[b]
                          ]
        else:
              # print self.decoder_inputs
              self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
                  self.encoder_inputs, self.decoder_inputs, targets,
                  self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True),
                  softmax_loss_function=softmax_loss_function)
              # If we use output projection, we need to project outputs for decoding.
              if output_projection is not None:
                    for b in xrange(len(buckets)):
                      self.outputs[b] = [
                          tf.matmul(output, output_projection[0]) + output_projection[1]
                          for output in self.outputs[b]
                      ]
#    if forward_only:
#      self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
#          self.encoder_inputs, self.decoder_inputs, targets,
#          self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True),
#          softmax_loss_function=softmax_loss_function)
      # If we use output projection, we need to project outputs for decoding.
#      if output_projection is not None:
#        for b in xrange(len(buckets)):
#          self.outputs[b] = [
#              tf.matmul(output, output_projection[0]) + output_projection[1]
#              for output in self.outputs[b]
#          ]
    else:
       # training
       self.outputs, self.losses = model_with_buckets(
           self.encoder_inputs, self.decoder_inputs, targets,
           self.target_weights, buckets,
           lambda x, y: seq2seq_f(x, y, False),
           softmax_loss_function=softmax_loss_function)

    self.train_loss_summaries = []
    for i in range(len(self.losses)):
        self.train_loss_summaries.append(tf.summary.scalar("train_loss_bucket{}".format(i), self.losses[i]))

    self.valid_loss_summaries = []
    for i in range(len(self.losses)):
        self.valid_loss_summaries.append(tf.summary.scalar("valid_loss_bucket{}".format(i), self.losses[i]))

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
      self.gradient_norms = []
      self.updates = []
      opt = tf.train.AdamOptimizer(1e-4)
      #opt = tf.train.GradientDescentOptimizer(self.learning_rate)

      for b in xrange(len(buckets)):
        gradients = tf.gradients(self.losses[b], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())

Clone this wiki locally