diff --git a/example_configs/lm/lstm-test-small-cudnn.py b/example_configs/lm/lstm-test-small-cudnn.py
new file mode 100644
index 000000000..da0ab14f5
--- /dev/null
+++ b/example_configs/lm/lstm-test-small-cudnn.py
@@ -0,0 +1,134 @@
+import tensorflow as tf
+
+from open_seq2seq.models import LSTMLM
+from open_seq2seq.encoders import LMEncoder
+from open_seq2seq.decoders import FakeDecoder
+from open_seq2seq.data import WKTDataLayer
+from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell
+from open_seq2seq.losses import BasicSequenceLoss
+from open_seq2seq.optimizers.lr_policies import fixed_lr
+
+data_root = "[REPLACE THIS WITH THE PATH TO YOUR WikiText-2-raw DATA]"
+processed_data_folder = 'wkt2-processed-data'
+
+base_model = LSTMLM
+bptt = 12
+steps = 10
+
+base_params = {
+    "restore_best_checkpoint": True,
+    "use_horovod": False,
+    "num_gpus": 2,
+
+    "batch_size_per_gpu": 160,
+    "num_epochs": 1500,
+    "save_summaries_steps": steps,
+    "print_loss_steps": steps,
+    "print_samples_steps": steps,
+    "save_checkpoint_steps": steps,
+    "processed_data_folder": processed_data_folder,
+    "logdir": "LSTM-FP32-2GPU-SMALL",
+    "eval_steps": steps * 2,
+
+    "optimizer": "Adam",
+    "optimizer_params": {},
+
+    "lr_policy": fixed_lr,
+    "lr_policy_params": {
+        "learning_rate": 9e-4
+    },
+
+    "summaries": ['learning_rate', 'variables', 'gradients',
+                  'variable_norm', 'gradient_norm', 'global_gradient_norm'],
+
+    "dtype": tf.float32,
+    # "dtype": "mixed",
+    # "loss_scaling": "Backoff",
+    "encoder": LMEncoder,
+    "encoder_params": {
+        "initializer": tf.random_uniform_initializer,
+        "initializer_params": {
+            "minval": -0.1,
+            "maxval": 0.1,
+        },
+        "use_cudnn_rnn": True,
+        "cudnn_rnn_type": tf.contrib.cudnn_rnn.CudnnLSTM,
+        "core_cell": None,
+        "core_cell_params": {
+            "num_units": 128,
+            "forget_bias": 1.0,
+        },
+        "encoder_layers": 2,
+        "encoder_dp_input_keep_prob": 1.0,
+        "encoder_dp_output_keep_prob": 0.6,
+        "encoder_last_input_keep_prob": 1.0,
+        "encoder_last_output_keep_prob": 0.6,
+        "recurrent_keep_prob": 0.7,
+        'encoder_emb_keep_prob': 0.37,
+        "encoder_use_skip_connections": False,
+        "emb_size": 64,
+        "sampling_prob": 0.0,  # 0 means always use the ground truth
+        "fc_use_bias": True,
+        "weight_tied": True,
+        "awd_initializer": False,
+    },
+
+    "decoder": FakeDecoder,
+
+    "regularizer": tf.contrib.layers.l2_regularizer,
+    "regularizer_params": {
+        'scale': 2e-6,
+    },
+
+    "loss": BasicSequenceLoss,
+    "loss_params": {
+        "offset_target_by_one": False,
+        "average_across_timestep": True,
+        "do_mask": False,
+    }
+}
+
+train_params = {
+    "data_layer": WKTDataLayer,
+    "data_layer_params": {
+        "data_root": data_root,
+        "processed_data_folder": processed_data_folder,
+        "pad_vocab_to_eight": False,
+        "rand_start": True,
+        "shuffle": False,
+        "shuffle_buffer_size": 25000,
+        "repeat": True,
+        "map_parallel_calls": 16,
+        "prefetch_buffer_size": 8,
+        "bptt": bptt,
+        "small": True,
+    },
+}
+eval_params = {
+    "data_layer": WKTDataLayer,
+    "data_layer_params": {
+        "processed_data_folder": processed_data_folder,
+        "pad_vocab_to_eight": False,
+        "shuffle": False,
+        "repeat": False,
+        "map_parallel_calls": 16,
+        "prefetch_buffer_size": 1,
+        "bptt": bptt,
+        "small": True,
+    },
+}
+
+infer_params = {
+    "data_layer": WKTDataLayer,
+    "data_layer_params": {
+        "processed_data_folder": processed_data_folder,
+        "pad_vocab_to_eight": False,
+        "shuffle": False,
+        "repeat": False,
+        "rand_start": False,
+        "map_parallel_calls": 16,
+        "prefetch_buffer_size": 8,
+        "bptt": bptt,
+        "seed_tokens": "something The only game",
+    },
+}
diff --git a/example_configs/lm/lstm-test-small-mixed.py b/example_configs/lm/lstm-test-small-mixed.py
index 52b54d138..5ae792d68 100644
--- a/example_configs/lm/lstm-test-small-mixed.py
+++ b/example_configs/lm/lstm-test-small-mixed.py
@@ -50,6 +50,8 @@
             "minval": -0.1,
             "maxval": 0.1,
         },
+        "use_cudnn_rnn": False,
+        "cudnn_rnn_type": None,
         "core_cell": WeightDropLayerNormBasicLSTMCell,
         "core_cell_params": {
             "num_units": 128,
diff --git a/example_configs/lm/lstm-test-small.py b/example_configs/lm/lstm-test-small.py
index 860f228aa..a4f20aa7b 100644
--- a/example_configs/lm/lstm-test-small.py
+++ b/example_configs/lm/lstm-test-small.py
@@ -51,6 +51,8 @@
             "minval": -0.1,
             "maxval": 0.1,
         },
+        "use_cudnn_rnn": False,
+        "cudnn_rnn_type": None,
         "core_cell": WeightDropLayerNormBasicLSTMCell,
         "core_cell_params": {
             "num_units": 128,
diff --git a/example_configs/lm/lstm-wkt103-mixed.py b/example_configs/lm/lstm-wkt103-mixed.py
index f077dd2f7..29e3a9be2 100644
--- a/example_configs/lm/lstm-wkt103-mixed.py
+++ b/example_configs/lm/lstm-wkt103-mixed.py
@@ -52,6 +52,8 @@
             "minval": -0.1,
             "maxval": 0.1,
         },
+        "use_cudnn_rnn": False,
+        "cudnn_rnn_type": None,
         "core_cell": WeightDropLayerNormBasicLSTMCell,
         "core_cell_params": {
             "num_units": 1024,
@@ -125,4 +127,4 @@
         "bptt": bptt,
         "seed_tokens": "something The only game",
     },
-}
\ No newline at end of file
+}
diff --git a/example_configs/lm/lstm-wkt2-fp32.py b/example_configs/lm/lstm-wkt2-fp32.py
index e80d6d53f..004ab2a7e 100644
--- a/example_configs/lm/lstm-wkt2-fp32.py
+++ b/example_configs/lm/lstm-wkt2-fp32.py
@@ -50,6 +50,8 @@
             "minval": -0.1,
             "maxval": 0.1,
         },
+        "use_cudnn_rnn": False,
+        "cudnn_rnn_type": None,
         "core_cell": WeightDropLayerNormBasicLSTMCell,
         "core_cell_params": {
             "num_units": 896,
@@ -124,4 +126,4 @@
         "bptt": bptt,
         "seed_tokens": "something The only game",
     },
-}
\ No newline at end of file
+}
diff --git a/example_configs/transfer/imdb-from-scratch.py b/example_configs/transfer/imdb-from-scratch.py
index e59752552..1cc98b56e 100644
--- a/example_configs/transfer/imdb-from-scratch.py
+++ b/example_configs/transfer/imdb-from-scratch.py
@@ -55,6 +55,8 @@
             "minval": -0.1,
             "maxval": 0.1,
         },
+        "use_cudnn_rnn": False,
+        "cudnn_rnn_type": None,
         "core_cell": WeightDropLayerNormBasicLSTMCell,
         "core_cell_params": {
             "num_units": 896,
@@ -127,4 +129,4 @@
         "binary": binary,
         "max_length": max_length,
     },
-}
\ No newline at end of file
+}
diff --git a/example_configs/transfer/imdb-wkt103.py b/example_configs/transfer/imdb-wkt103.py
index 8deb1d351..e46ece2df 100644
--- a/example_configs/transfer/imdb-wkt103.py
+++ b/example_configs/transfer/imdb-wkt103.py
@@ -57,6 +57,8 @@
             "minval": -0.1,
             "maxval": 0.1,
         },
+        "use_cudnn_rnn": False,
+        "cudnn_rnn_type": None,
         "core_cell": WeightDropLayerNormBasicLSTMCell,
         "core_cell_params": {
             "num_units": 1024,
diff --git a/example_configs/transfer/imdb-wkt2-cudnn.py b/example_configs/transfer/imdb-wkt2-cudnn.py
new file mode 100644
index 000000000..427878674
--- /dev/null
+++ b/example_configs/transfer/imdb-wkt2-cudnn.py
@@ -0,0 +1,137 @@
+import tensorflow as tf
+
+from open_seq2seq.models import LSTMLM
+from open_seq2seq.encoders import LMEncoder
+from open_seq2seq.decoders import FakeDecoder
+from open_seq2seq.data import IMDBDataLayer
+from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell
+from open_seq2seq.losses import CrossEntropyLoss
+from open_seq2seq.optimizers.lr_policies import fixed_lr
+
+data_root = "[REPLACE THIS WITH THE PATH TO YOUR IMDB DATA]"
+processed_data_folder = 'imdb-processed-data-wkt2'
+
+base_model = LSTMLM
+max_length = 256
+binary = True
+steps = 10
+
+base_params = {
+    "restore_best_checkpoint": True,
+    "use_horovod": False,
+    "num_gpus": 1,
+
+    "batch_size_per_gpu": 16,
+    "eval_batch_size_per_gpu": 64,
+    "num_epochs": 100,
+    "save_summaries_steps": steps,
+    "print_loss_steps": steps,
+    "print_samples_steps": steps,
+    "save_checkpoint_steps": steps,
+    "load_model": "LSTM-FP32-2GPU-SMALL",
+    "logdir": "IMDB-WKT103-CUDNN-MIXED",
+    "lm_vocab_file": 'wkt2-processed-data/vocab.txt',
+    # "lm_vocab_file": '[LINK TO THE VOCAB FILE IN THE PROCESSED DATA USED TO TRAIN THE BASE LM]'
+    "processed_data_folder": processed_data_folder,
+    "eval_steps": steps,
+
+    "optimizer": "Adam",
+    "optimizer_params": {},
+    # luong10 decay scheme
+
+    "lr_policy": fixed_lr,
+    "lr_policy_params": {
+        "learning_rate": 1e-4
+    },
+
+    "summaries": ['learning_rate', 'variables', 'gradients',
+                  'variable_norm', 'gradient_norm', 'global_gradient_norm'],
+    # "max_grad_norm": 0.25,
+    # "dtype": tf.float32,
+    "dtype": "mixed",
+    "loss_scaling": "Backoff",
+    "encoder": LMEncoder,
+    "encoder_params": {
+        "initializer": tf.random_uniform_initializer,
+        "initializer_params": {
+            "minval": -0.1,
+            "maxval": 0.1,
+        },
+        "use_cudnn_rnn": True,
+        "cudnn_rnn_type": tf.contrib.cudnn_rnn.CudnnLSTM,
+        "core_cell": None,
+        "core_cell_params": {
+            "num_units": 1024,
+            "forget_bias": 1.0,
+        },
+        "encoder_layers": 3,
+        "encoder_dp_input_keep_prob": 1.0,
+        "encoder_dp_output_keep_prob": 0.8,
+        "encoder_last_input_keep_prob": 1.0,
+        "encoder_last_output_keep_prob": 0.8,
+        "recurrent_keep_prob": 1.0,
+        'encoder_emb_keep_prob': 0.6,
+        "encoder_use_skip_connections": False,
+        "emb_size": 256,
+        "num_tokens_gen": 10,
+        "sampling_prob": 0.0,  # 0 means always use the ground truth
+        "fc_use_bias": True,
+        "weight_tied": True,
+        "awd_initializer": False,
+    },
+
+    "decoder": FakeDecoder,
+
+    "regularizer": tf.contrib.layers.l2_regularizer,
+    "regularizer_params": {
+        'scale': 2e-6,
+    },
+
+    "loss": CrossEntropyLoss,
+}
+
+train_params = {
+    "data_layer": IMDBDataLayer,
+    "data_layer_params": {
+        "data_root": data_root,
+        "pad_vocab_to_eight": False,
+        "shuffle": True,
+        "shuffle_buffer_size": 25000,
+        "repeat": True,
+        "map_parallel_calls": 16,
+        "prefetch_buffer_size": 8,
+        "binary": binary,
+        "max_length": max_length,
+        "get_stats": True,
+        # "small": True,
+    },
+}
+eval_params = {
+    "data_layer": IMDBDataLayer,
+    "data_layer_params": {
+        # "data_root": data_root,
+        "pad_vocab_to_eight": False,
+        "shuffle": False,
+        "repeat": False,
+        "map_parallel_calls": 16,
+        "prefetch_buffer_size": 1,
+        "binary": binary,
+        "max_length": max_length,
+        # "small": True,
+    },
+}
+
+infer_params = {
+    "data_layer": IMDBDataLayer,
+    "data_layer_params": {
+        # "data_root": data_root,
+        "pad_vocab_to_eight": False,
+        "shuffle": False,
+        "repeat": False,
+        "rand_start": False,
+        "map_parallel_calls": 16,
+        "prefetch_buffer_size": 8,
+        "binary": binary,
+        "max_length": max_length,
+    },
+}
diff --git a/example_configs/transfer/imdb-wkt2.py b/example_configs/transfer/imdb-wkt2.py
index 39ddcca6e..1eade20cb 100644
--- a/example_configs/transfer/imdb-wkt2.py
+++ b/example_configs/transfer/imdb-wkt2.py
@@ -56,6 +56,8 @@
             "minval": -0.1,
             "maxval": 0.1,
         },
+        "use_cudnn_rnn": False,
+        "cudnn_rnn_type": None,
         "core_cell": WeightDropLayerNormBasicLSTMCell,
         "core_cell_params": {
             "num_units": 896,
diff --git a/example_configs/transfer/sst-wkt2-small.py b/example_configs/transfer/sst-wkt2-small.py
index c6edf7fe0..d7f588252 100644
--- a/example_configs/transfer/sst-wkt2-small.py
+++ b/example_configs/transfer/sst-wkt2-small.py
@@ -57,6 +57,8 @@
             "minval": -0.1,
             "maxval": 0.1,
         },
+        "use_cudnn_rnn": False,
+        "cudnn_rnn_type": None,
         "core_cell": WeightDropLayerNormBasicLSTMCell,
         "core_cell_params": {
             "num_units": 128,
diff --git a/example_configs/transfer/sst-wkt2.py b/example_configs/transfer/sst-wkt2.py
index 473d8f716..2a7e2d344 100644
--- a/example_configs/transfer/sst-wkt2.py
+++ b/example_configs/transfer/sst-wkt2.py
@@ -57,6 +57,8 @@
             "minval": -0.1,
             "maxval": 0.1,
         },
+        "use_cudnn_rnn": False,
+        "cudnn_rnn_type": None,
         "core_cell": WeightDropLayerNormBasicLSTMCell,
         "core_cell_params": {
             "num_units": 896,
diff --git a/open_seq2seq/encoders/lm_encoders.py b/open_seq2seq/encoders/lm_encoders.py
index ebb50edc6..51ad1178f 100644
--- a/open_seq2seq/encoders/lm_encoders.py
+++ b/open_seq2seq/encoders/lm_encoders.py
@@ -5,11 +5,9 @@
 from __future__ import absolute_import, division, print_function
 from __future__ import unicode_literals
 
-import copy
-
+import copy, inspect
 import tensorflow as tf
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
-
 from open_seq2seq.optimizers.mp_wrapper import mp_regularizer_wrapper
 from open_seq2seq.parts.rnns.utils import single_cell
 from .encoder import Encoder
@@ -31,6 +29,8 @@ def get_required_params():
         'core_cell_params': dict,
         'end_token': int,
         "batch_size": int,
+        "use_cudnn_rnn": bool,
+        "cudnn_rnn_type": None
     })
 
   @staticmethod
@@ -199,6 +199,9 @@ def _encode(self, input_dict):
     regularizer = self.params.get('regularizer', None)
     fc_use_bias = self.params.get('fc_use_bias', True)
 
+    use_cudnn_rnn = self.params.get("use_cudnn_rnn", False)
+    cudnn_rnn_type = self.params.get("cudnn_rnn_type", None)
+
     if 'initializer' in self.params:
       init_dict = self.params.get('initializer_params', {})
       initializer = self.params['initializer'](**init_dict)
@@ -260,38 +263,65 @@ def _encode(self, input_dict):
 
     self._enc_emb_w = tf.nn.dropout(enc_emb_w, keep_prob=emb_keep_prob)
 
-    fwd_cells = [
-      single_cell(cell_class=self.params['core_cell'],
-                  cell_params=self.params['core_cell_params'],
-                  dp_input_keep_prob=dp_input_keep_prob,
-                  dp_output_keep_prob=dp_output_keep_prob,
-                  recurrent_keep_prob=recurrent_keep_prob,
-                  input_weight_keep_prob=input_weight_keep_prob,
-                  recurrent_weight_keep_prob=recurrent_weight_keep_prob,
-                  weight_variational=self.params['weight_variational'],
-                  dropout_seed=self.params['dropout_seed'],
-                  residual_connections=self.params['encoder_use_skip_connections'],
-                  awd_initializer=self.params['awd_initializer'],
-                  dtype=self._params['dtype']
-                  ) for _ in range(self.params['encoder_layers'] - 1)]
-
-    fwd_cells.append(
-      single_cell(cell_class=self.params['core_cell'],
-                  cell_params=last_cell_params,
-                  dp_input_keep_prob=last_input_keep_prob,
-                  dp_output_keep_prob=last_output_keep_prob,
-                  recurrent_keep_prob=recurrent_keep_prob,
-                  input_weight_keep_prob=input_weight_keep_prob,
-                  recurrent_weight_keep_prob=recurrent_weight_keep_prob,
-                  weight_variational=self.params['weight_variational'],
-                  dropout_seed=self.params['dropout_seed'],
-                  residual_connections=self.params['encoder_use_skip_connections'],
-                  awd_initializer=self.params['awd_initializer'],
-                  dtype=self._params['dtype']
-                  )
-    )
-
-    self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)
+    if use_cudnn_rnn:
+      if self._mode == 'train' or self._mode == 'eval':
+        all_cudnn_classes = [
+          i[1]
+          for i in inspect.getmembers(tf.contrib.cudnn_rnn, inspect.isclass)
+        ]
+
+        if cudnn_rnn_type not in all_cudnn_classes:
+          raise TypeError("rnn_type must be a Cudnn RNN class")
+
+        rnn_block = cudnn_rnn_type(
+          num_layers=self.params['encoder_layers'],
+          num_units=self._emb_size,
+          dtype=self._params['dtype'],
+          name="cudnn_rnn"
+        )
+      else:
+        # Transferring weights from model trained with CudnnLSTM/CudnnGRU
+        # to CudnnCompatibleLSTMCell/CudnnCompatibleGRUCell for inference
+        if 'CudnnLSTM' in str(cudnn_rnn_type):
+          cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(num_units=self._emb_size)
+        elif 'CudnnGRU' in str(cudnn_rnn_type):
+          cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell(num_units=self._emb_size)
+
+        fwd_cells = [cell() for _ in range(self.params['encoder_layers'])]
+        self._encoder_cell_fw = tf.nn.rnn_cell.MultiRNNCell(fwd_cells)
+    else:
+      fwd_cells = [
+        single_cell(cell_class=self.params['core_cell'],
+                    cell_params=self.params['core_cell_params'],
+                    dp_input_keep_prob=dp_input_keep_prob,
+                    dp_output_keep_prob=dp_output_keep_prob,
+                    recurrent_keep_prob=recurrent_keep_prob,
+                    input_weight_keep_prob=input_weight_keep_prob,
+                    recurrent_weight_keep_prob=recurrent_weight_keep_prob,
+                    weight_variational=self.params['weight_variational'],
+                    dropout_seed=self.params['dropout_seed'],
+                    residual_connections=self.params['encoder_use_skip_connections'],
+                    awd_initializer=self.params['awd_initializer'],
+                    dtype=self._params['dtype']
+                    ) for _ in range(self.params['encoder_layers'] - 1)]
+
+      fwd_cells.append(
+        single_cell(cell_class=self.params['core_cell'],
+                    cell_params=last_cell_params,
+                    dp_input_keep_prob=last_input_keep_prob,
+                    dp_output_keep_prob=last_output_keep_prob,
+                    recurrent_keep_prob=recurrent_keep_prob,
+                    input_weight_keep_prob=input_weight_keep_prob,
+                    recurrent_weight_keep_prob=recurrent_weight_keep_prob,
+                    weight_variational=self.params['weight_variational'],
+                    dropout_seed=self.params['dropout_seed'],
+                    residual_connections=self.params['encoder_use_skip_connections'],
+                    awd_initializer=self.params['awd_initializer'],
+                    dtype=self._params['dtype']
+                    )
+      )
+
+      self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)
 
     time_major = self.params.get("time_major", False)
     use_swap_memory = self.params.get("use_swap_memory", False)
@@ -306,20 +336,41 @@ def _encode(self, input_dict):
       source_sequence,
     ), self.params['dtype'])
 
-    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
-      cell=self._encoder_cell_fw,
-      inputs=embedded_inputs,
-      sequence_length=source_length,
-      time_major=time_major,
-      swap_memory=use_swap_memory,
-      dtype=self._params['dtype'],
-      scope='decoder',
-    )
+    if use_cudnn_rnn:
+      # The CudnnLSTM will return encoder_state as a tuple of hidden
+      # and cell values. The hidden and cell tensors are stored for
+      # each LSTM layer.
+
+      # transpose [B, T, C] --> [T, B, C]
+      if not time_major:
+        embedded_inputs = tf.transpose(embedded_inputs, [1, 0, 2])
+
+      rnn_block.build(embedded_inputs.get_shape())
+      encoder_outputs, encoder_state = rnn_block(embedded_inputs)
+      encoder_outputs = tf.transpose(encoder_outputs, [1, 0, 2])
+    else:
+      encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
+        cell=self._encoder_cell_fw,
+        inputs=embedded_inputs,
+        sequence_length=source_length,
+        time_major=time_major,
+        swap_memory=use_swap_memory,
+        dtype=self._params['dtype'],
+        scope='decoder',
+      )
+
     if not self._lm_phase:
-      if self._use_cell_state:
-        encoder_outputs = tf.concat([encoder_state[-1].h, encoder_state[-1].c], axis=1)
+      # CudnnLSTM stores cell and hidden state differently
+      if use_cudnn_rnn:
+        if self._use_cell_state:
+          encoder_outputs = tf.concat([encoder_state[0][-1], encoder_state[1][-1]], axis=1)
+        else:
+          encoder_outputs = encoder_state[0][-1]
       else:
-        encoder_outputs = encoder_state[-1].h
+        if self._use_cell_state:
+          encoder_outputs = tf.concat([encoder_state[-1].h, encoder_state[-1].c], axis=1)
+        else:
+          encoder_outputs = encoder_state[-1].h
 
       if self._mode == 'train' and self._num_sampled < self._fc_dim: # sampled softmax
         output_dict = {'weights': enc_emb_w,
@@ -332,6 +383,24 @@
         logits = self._output_layer.apply(encoder_outputs)
         output_dict = {'logits': logits, 'outputs': [logits]}
     else: # infer in LM phase
+      # This portion of the graph is required to restore weights from CudnnLSTM to
+      # CudnnCompatibleLSTMCell/CudnnCompatibleGRUCell
+      if use_cudnn_rnn:
+        embedded_inputs = tf.cast(tf.nn.embedding_lookup(
+          self.enc_emb_w,
+          source_sequence,
+        ), self.params['dtype'])
+
+        # Scope must remain unset to restore weights
+        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
+          cell=self._encoder_cell_fw,
+          inputs=embedded_inputs,
+          sequence_length=source_length,
+          time_major=time_major,
+          swap_memory=use_swap_memory,
+          dtype=self._params['dtype']
+        )
+
       embedding_fn = lambda ids: tf.cast(tf.nn.embedding_lookup(
         self.enc_emb_w,
         ids,
diff --git a/open_seq2seq/utils/hooks.py b/open_seq2seq/utils/hooks.py
index c8a65bb12..830e863d6 100644
--- a/open_seq2seq/utils/hooks.py
+++ b/open_seq2seq/utils/hooks.py
@@ -216,6 +216,11 @@ def after_run(self, run_context, run_values):
     dict_to_log = self._model.finalize_evaluation(results_per_batch, step)
     dict_to_log['eval_loss'] = total_loss
 
+    if self._print_ppl:
+      # Add bpc and ppl metrics to tensorboard
+      dict_to_log['ppl'] = math.exp(total_loss)
+      dict_to_log['bpc'] = math.exp(total_loss/math.log(2))
+
     # saving the best validation model
     if self._model.params['save_checkpoint_steps'] and \
        total_loss < self._best_eval_loss:
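Background for the lm_encoders.py change above: the cuDNN RNN layers write their parameters to checkpoints in a canonical form, so a graph built from CudnnCompatibleLSTMCell (or CudnnCompatibleGRUCell) cells can restore the same checkpoint for inference off the GPU. The sketch below is a minimal, standalone illustration of that documented TF 1.x pattern and is not part of this patch; the layer sizes, tensor shapes, and checkpoint path are made up for the example, and the training half needs a CUDA-capable GPU.

import tensorflow as tf

num_layers, num_units, batch, time_steps, feat = 2, 128, 4, 12, 64
ckpt = "/tmp/cudnn_lm/model"  # illustrative path only

# Training graph: fused cuDNN kernel, time-major [T, B, C] inputs, GPU only.
train_graph = tf.Graph()
with train_graph.as_default():
  inputs = tf.random_uniform([time_steps, batch, feat])
  lstm = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=num_layers,
                                        num_units=num_units)
  outputs, _ = lstm(inputs)
  saver = tf.train.Saver()
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.save(sess, ckpt)

# Inference graph: canonical cells that can load the cuDNN checkpoint.
infer_graph = tf.Graph()
with infer_graph.as_default():
  inputs = tf.random_uniform([batch, time_steps, feat])
  cells = [tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(num_units)
           for _ in range(num_layers)]
  # Even a single layer must be wrapped in MultiRNNCell, and the dynamic_rnn
  # scope is left unset so the variable names line up with the canonical
  # weights the cuDNN layer wrote to the checkpoint.
  multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells)
  outputs, state = tf.nn.dynamic_rnn(multi_cell, inputs, dtype=tf.float32)
  saver = tf.train.Saver()
  with tf.Session() as sess:
    saver.restore(sess, ckpt)

The encoder change follows the same split: CudnnLSTM/CudnnGRU builds the graph in train and eval mode, while infer mode falls back to the compatible cells so a cuDNN-trained language model can be restored for generation or transfer.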