From dd10de128fba4ba4ff4e4265142c964eb22af529 Mon Sep 17 00:00:00 2001 From: Edward Lu Date: Tue, 30 Oct 2018 18:34:04 -0700 Subject: [PATCH 1/2] signoff commit Signed-off-by: Edward Lu --- example_configs/text2speech/wavenet_float.py | 104 +++++ .../text2speech/wavenet_float_8gpu.py | 104 +++++ example_configs/text2speech/wavenet_mixed.py | 105 +++++ .../text2speech/wavenet_mixed_8gpu.py | 105 +++++ open_seq2seq/data/__init__.py | 1 + open_seq2seq/data/text2speech/speech_utils.py | 8 +- open_seq2seq/data/text2speech/text2speech.py | 5 - .../data/text2speech/text2speech_wavenet.py | 273 +++++++++++ open_seq2seq/encoders/__init__.py | 1 + open_seq2seq/encoders/wavenet_encoder.py | 425 ++++++++++++++++++ open_seq2seq/losses/__init__.py | 1 + open_seq2seq/losses/wavenet_loss.py | 43 ++ open_seq2seq/models/__init__.py | 1 + open_seq2seq/models/text2speech_wavenet.py | 52 +++ scripts/tacotron_save_spec.py | 81 ++++ scripts/wavenet_naive_infer.py | 97 ++++ 16 files changed, 1400 insertions(+), 6 deletions(-) create mode 100644 example_configs/text2speech/wavenet_float.py create mode 100644 example_configs/text2speech/wavenet_float_8gpu.py create mode 100644 example_configs/text2speech/wavenet_mixed.py create mode 100644 example_configs/text2speech/wavenet_mixed_8gpu.py create mode 100644 open_seq2seq/data/text2speech/text2speech_wavenet.py create mode 100644 open_seq2seq/encoders/wavenet_encoder.py create mode 100644 open_seq2seq/losses/wavenet_loss.py create mode 100644 open_seq2seq/models/text2speech_wavenet.py create mode 100644 scripts/tacotron_save_spec.py create mode 100644 scripts/wavenet_naive_infer.py diff --git a/example_configs/text2speech/wavenet_float.py b/example_configs/text2speech/wavenet_float.py new file mode 100644 index 000000000..8ea7fd507 --- /dev/null +++ b/example_configs/text2speech/wavenet_float.py @@ -0,0 +1,104 @@ +# pylint: skip-file +import tensorflow as tf +from open_seq2seq.models import Text2SpeechWavenet +from open_seq2seq.encoders import WavenetEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.losses import WavenetLoss +from open_seq2seq.data import WavenetDataLayer +from open_seq2seq.optimizers.lr_policies import exp_decay +from open_seq2seq.parts.convs2s.utils import gated_linear_units + +base_model = Text2SpeechWavenet + +base_params = { + "random_seed": 0, + "use_horovod": False, + "max_steps": 1000000, + + "num_gpus": 1, + "batch_size_per_gpu": 2, + + "save_summaries_steps": 50, + "print_loss_steps": 50, + "print_samples_steps": 500, + "eval_steps": 500, + "save_checkpoint_steps": 2500, + "logdir": "result/wavenet-LJ-float", + + "optimizer": "Adam", + "optimizer_params": {}, + "lr_policy": exp_decay, + "lr_policy_params": { + "learning_rate": 1e-3, + "decay_steps": 20000, + "decay_rate": 0.1, + "use_staircase_decay": False, + "begin_decay_at": 45000, + "min_lr": 1e-5, + }, + "dtype": tf.float32, + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + "scale": 1e-6 + }, + "initializer": tf.contrib.layers.xavier_initializer, + + "summaries": [], + + "encoder": WavenetEncoder, + "encoder_params": { + "layer_type": "conv1d", + "kernel_size": 3, + "strides": 1, + "padding": "VALID", + "blocks": 3, + "layers_per_block": 10, + "filters": 64, + "quantization_channels": 256 + }, + + "decoder": FakeDecoder, + + "loss": WavenetLoss, + + "data_layer": WavenetDataLayer, + "data_layer_params": { + "dataset": "LJ", + "num_audio_features": 80, + "dataset_location": "data/speech/LJSpeech/wavs/" + } +} + +train_params = { + 
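+  # NOTE: as in the other example configs, these per-mode dicts are presumably
+  # merged on top of base_params by the config loader, so only the dataset CSV
+  # lists and shuffle flags change between train, eval and infer.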
"data_layer_params": { + "dataset_files": [ + "data/speech/LJSpeech/train.csv", + ], + "shuffle": True, + }, +} + +eval_params = { + "data_layer_params": { + "dataset_files": [ + "data/speech/LJSpeech/val.csv", + ], + "shuffle": False, + }, +} + +infer_params = { + "data_layer_params": { + "dataset_files": [ + "data/speech/LJSpeech/test.csv", + ], + "shuffle": False, + }, +} + +interactive_infer_params = { + "data_layer_params": { + "dataset_files": [], + "shuffle": False, + }, +} diff --git a/example_configs/text2speech/wavenet_float_8gpu.py b/example_configs/text2speech/wavenet_float_8gpu.py new file mode 100644 index 000000000..1cf5c571f --- /dev/null +++ b/example_configs/text2speech/wavenet_float_8gpu.py @@ -0,0 +1,104 @@ +# pylint: skip-file +import tensorflow as tf +from open_seq2seq.models import Text2SpeechWavenet +from open_seq2seq.encoders import WavenetEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.losses import WavenetLoss +from open_seq2seq.data import WavenetDataLayer +from open_seq2seq.optimizers.lr_policies import exp_decay +from open_seq2seq.parts.convs2s.utils import gated_linear_units + +base_model = Text2SpeechWavenet + +base_params = { + "random_seed": 0, + "use_horovod": True, + "max_steps": 1000000, + + "num_gpus": 8, + "batch_size_per_gpu": 1, + + "save_summaries_steps": 50, + "print_loss_steps": 50, + "print_samples_steps": 500, + "eval_steps": 500, + "save_checkpoint_steps": 2500, + "logdir": "result/wavenet-LJ-float", + + "optimizer": "Adam", + "optimizer_params": {}, + "lr_policy": exp_decay, + "lr_policy_params": { + "learning_rate": 1e-3, + "decay_steps": 20000, + "decay_rate": 0.1, + "use_staircase_decay": False, + "begin_decay_at": 45000, + "min_lr": 1e-5, + }, + "dtype": tf.float32, + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + "scale": 1e-6 + }, + "initializer": tf.contrib.layers.xavier_initializer, + + "summaries": [], + + "encoder": WavenetEncoder, + "encoder_params": { + "layer_type": "conv1d", + "kernel_size": 3, + "strides": 1, + "padding": "VALID", + "blocks": 3, + "layers_per_block": 10, + "filters": 64, + "quantization_channels": 256 + }, + + "decoder": FakeDecoder, + + "loss": WavenetLoss, + + "data_layer": WavenetDataLayer, + "data_layer_params": { + "dataset": "LJ", + "num_audio_features": 80, + "dataset_location": "/data/LJSpeech-1.1-partitioned/wavs/" + } +} + +train_params = { + "data_layer_params": { + "dataset_files": [ + "/data/LJSpeech-1.1-partitioned/train.csv", + ], + "shuffle": True, + }, +} + +eval_params = { + "data_layer_params": { + "dataset_files": [ + "/data/LJSpeech-1.1-partitioned/val.csv", + ], + "shuffle": False, + }, +} + +infer_params = { + "data_layer_params": { + "dataset_files": [ + "/data/LJSpeech-1.1-partitioned/test.csv", + ], + "shuffle": False, + }, +} + +interactive_infer_params = { + "data_layer_params": { + "dataset_files": [], + "shuffle": False, + }, +} diff --git a/example_configs/text2speech/wavenet_mixed.py b/example_configs/text2speech/wavenet_mixed.py new file mode 100644 index 000000000..efff43f36 --- /dev/null +++ b/example_configs/text2speech/wavenet_mixed.py @@ -0,0 +1,105 @@ +# pylint: skip-file +import tensorflow as tf +from open_seq2seq.models import Text2SpeechWavenet +from open_seq2seq.encoders import WavenetEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.losses import WavenetLoss +from open_seq2seq.data import WavenetDataLayer +from open_seq2seq.optimizers.lr_policies import exp_decay +from 
open_seq2seq.parts.convs2s.utils import gated_linear_units + +base_model = Text2SpeechWavenet + +base_params = { + "random_seed": 0, + "use_horovod": False, + "max_steps": 1000000, + + "num_gpus": 1, + "batch_size_per_gpu": 4, + + "save_summaries_steps": 50, + "print_loss_steps": 50, + "print_samples_steps": 500, + "eval_steps": 500, + "save_checkpoint_steps": 2500, + "logdir": "result/wavenet-LJ-mixed", + + "optimizer": "Adam", + "optimizer_params": {}, + "lr_policy": exp_decay, + "lr_policy_params": { + "learning_rate": 1e-3, + "decay_steps": 20000, + "decay_rate": 0.1, + "use_staircase_decay": False, + "begin_decay_at": 45000, + "min_lr": 1e-5, + }, + "dtype": "mixed", + "loss_scaling": "Backoff", + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + "scale": 1e-6 + }, + "initializer": tf.contrib.layers.xavier_initializer, + + "summaries": [], + + "encoder": WavenetEncoder, + "encoder_params": { + "layer_type": "conv1d", + "kernel_size": 3, + "strides": 1, + "padding": "VALID", + "blocks": 3, + "layers_per_block": 10, + "filters": 64, + "quantization_channels": 256 + }, + + "decoder": FakeDecoder, + + "loss": WavenetLoss, + + "data_layer": WavenetDataLayer, + "data_layer_params": { + "dataset": "LJ", + "num_audio_features": 80, + "dataset_location": "data/speech/LJSpeech/wavs/" + } +} + +train_params = { + "data_layer_params": { + "dataset_files": [ + "data/speech/LJSpeech/train.csv", + ], + "shuffle": True, + }, +} + +eval_params = { + "data_layer_params": { + "dataset_files": [ + "data/speech/LJSpeech/val.csv", + ], + "shuffle": False, + }, +} + +infer_params = { + "data_layer_params": { + "dataset_files": [ + "data/speech/LJSpeech/test.csv", + ], + "shuffle": False, + }, +} + +interactive_infer_params = { + "data_layer_params": { + "dataset_files": [], + "shuffle": False, + }, +} diff --git a/example_configs/text2speech/wavenet_mixed_8gpu.py b/example_configs/text2speech/wavenet_mixed_8gpu.py new file mode 100644 index 000000000..40559087e --- /dev/null +++ b/example_configs/text2speech/wavenet_mixed_8gpu.py @@ -0,0 +1,105 @@ +# pylint: skip-file +import tensorflow as tf +from open_seq2seq.models import Text2SpeechWavenet +from open_seq2seq.encoders import WavenetEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.losses import WavenetLoss +from open_seq2seq.data import WavenetDataLayer +from open_seq2seq.optimizers.lr_policies import exp_decay +from open_seq2seq.parts.convs2s.utils import gated_linear_units + +base_model = Text2SpeechWavenet + +base_params = { + "random_seed": 0, + "use_horovod": True, + "max_steps": 1000000, + + "num_gpus": 8, + "batch_size_per_gpu": 2, + + "save_summaries_steps": 50, + "print_loss_steps": 50, + "print_samples_steps": 500, + "eval_steps": 500, + "save_checkpoint_steps": 2500, + "logdir": "result/wavenet-LJ-mixed", + + "optimizer": "Adam", + "optimizer_params": {}, + "lr_policy": exp_decay, + "lr_policy_params": { + "learning_rate": 1e-3, + "decay_steps": 20000, + "decay_rate": 0.1, + "use_staircase_decay": False, + "begin_decay_at": 45000, + "min_lr": 1e-5, + }, + "dtype": "mixed", + "loss_scaling": "Backoff", + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + "scale": 1e-6 + }, + "initializer": tf.contrib.layers.xavier_initializer, + + "summaries": [], + + "encoder": WavenetEncoder, + "encoder_params": { + "layer_type": "conv1d", + "kernel_size": 3, + "strides": 1, + "padding": "VALID", + "blocks": 3, + "layers_per_block": 10, + "filters": 64, + "quantization_channels": 256 
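+    # With kernel_size=3, blocks=3 and layers_per_block=10 the receptive field
+    # works out to (3 - 1) * 3 * (2**10 - 1) + 1 = 6139 samples, roughly 0.28 s
+    # at 22050 Hz; see _get_receptive_field in wavenet_encoder.py and the
+    # hard-coded 6139 in scripts/wavenet_naive_infer.py.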
+ }, + + "decoder": FakeDecoder, + + "loss": WavenetLoss, + + "data_layer": WavenetDataLayer, + "data_layer_params": { + "dataset": "LJ", + "num_audio_features": 80, + "dataset_location": "/data/LJSpeech-1.1-partitioned/wavs/" + } +} + +train_params = { + "data_layer_params": { + "dataset_files": [ + "/data/LJSpeech-1.1-partitioned/train.csv", + ], + "shuffle": True, + }, +} + +eval_params = { + "data_layer_params": { + "dataset_files": [ + "/data/LJSpeech-1.1-partitioned/val.csv", + ], + "shuffle": False, + }, +} + +infer_params = { + "data_layer_params": { + "dataset_files": [ + "/data/LJSpeech-1.1-partitioned/test.csv", + ], + "shuffle": False, + }, +} + +interactive_infer_params = { + "data_layer_params": { + "dataset_files": [], + "shuffle": False, + }, +} diff --git a/open_seq2seq/data/__init__.py b/open_seq2seq/data/__init__.py index 3f9450aa6..3f8be5c17 100644 --- a/open_seq2seq/data/__init__.py +++ b/open_seq2seq/data/__init__.py @@ -4,3 +4,4 @@ from .image2label.image2label import ImagenetDataLayer from .lm.lmdata import WKTDataLayer, IMDBDataLayer, SSTDataLayer from .text2speech.text2speech import Text2SpeechDataLayer +from .text2speech.text2speech_wavenet import WavenetDataLayer diff --git a/open_seq2seq/data/text2speech/speech_utils.py b/open_seq2seq/data/text2speech/speech_utils.py index cbef3b7e4..ea3c25a17 100644 --- a/open_seq2seq/data/text2speech/speech_utils.py +++ b/open_seq2seq/data/text2speech/speech_utils.py @@ -18,6 +18,7 @@ def get_speech_features_from_file( std=1., trim=False, data_min=1e-5, + return_raw_audio=False, mel_basis=None ): """ Helper function to retrieve spectrograms from wav files @@ -52,11 +53,16 @@ def get_speech_features_from_file( frame_length=int(n_fft/2), hop_length=int(hop_length/2) ) - return get_speech_features( + speech_features = get_speech_features( signal, fs, num_features, features_type, n_fft, hop_length, mag_power, feature_normalize, mean, std, data_min, mel_basis ) + if return_raw_audio: + return signal, speech_features + else: + return speech_features + def get_speech_features( signal, diff --git a/open_seq2seq/data/text2speech/text2speech.py b/open_seq2seq/data/text2speech/text2speech.py index 1200af0ed..007662019 100644 --- a/open_seq2seq/data/text2speech/text2speech.py +++ b/open_seq2seq/data/text2speech/text2speech.py @@ -239,11 +239,6 @@ def __init__(self, params, model, num_workers=None, worker_id=None): def split_data(self, data): if self.params['mode'] != 'train' and self._num_workers is not None: - #Decrease num_eval for dev, since most data is thrown out anyways - if self.params['mode'] == 'eval': - start = self._worker_id * self.params['batch_size'] - end = start+self.params['batch_size'] - return data[start:end] size = len(data) start = size // self._num_workers * self._worker_id if self._worker_id == self._num_workers - 1: diff --git a/open_seq2seq/data/text2speech/text2speech_wavenet.py b/open_seq2seq/data/text2speech/text2speech_wavenet.py new file mode 100644 index 000000000..343b20667 --- /dev/null +++ b/open_seq2seq/data/text2speech/text2speech_wavenet.py @@ -0,0 +1,273 @@ +# Copyright (c) 2018 NVIDIA Corporation +import os +import six +import numpy as np +import tensorflow as tf +import pandas as pd + +from open_seq2seq.data.data_layer import DataLayer +from open_seq2seq.data.text2speech.speech_utils import \ + get_speech_features_from_file + +class WavenetDataLayer(DataLayer): + """ Text to speech data layer class for Wavenet """ + + @staticmethod + def get_required_params(): + return dict( + 
DataLayer.get_required_params(), **{ + "dataset": str, + "num_audio_features": int, + "dataset_files": list + } + ) + + @staticmethod + def get_optional_params(): + return dict( + DataLayer.get_optional_params(), **{ + "dataset_location": str, + "receptive_field": int + } + ) + + def __init__(self, params, model, num_workers=None, worker_id=None): + """ + Wavenet data layer constructor. + + See parent class for arguments description. + + Config parameters: + + * **dataset** (str) --- The dataset to use, currently only supports "LJ" + for LJSpeech 1.1 + + """ + + super(WavenetDataLayer, self).__init__( + params, + model, + num_workers, + worker_id + ) + + if self.params.get("dataset_location", None) is None: + raise ValueError( + "dataset_location must be specified when using LJSpeech" + ) + + names = ["wav_filename", "raw_transcript", "transcript"] + sep = "\x7c" + header = None + + self.sampling_rate = 22050 + self.n_fft = 1024 + + self._files = None + for csvs in params["dataset_files"]: + files = pd.read_csv( + csvs, + encoding="utf-8", + sep=sep, + header=header, + names=names, + quoting=3 + ) + + if self._files is None: + self._files = files + else: + self._files = self._files.append(files) + + cols = "wav_filename" + if self._files is not None: + all_files = self._files.loc[:, cols].values + self._files = self.split_data(all_files) + + self._size = self.get_size_in_samples() + self._dataset = None + self._iterator = None + self._input_tensors = None + + @property + def input_tensors(self): + return self._input_tensors + + def get_size_in_samples(self): + if self._files is not None: + return len(self._files) + else: + return 0 + + def split_data(self, data): + if self.params['mode'] != 'train' and self._num_workers is not None: + size = len(data) + start = size // self._num_workers * self._worker_id + + if self._worker_id == self._num_workers - 1: + end = size + else: + end = size // self._num_workers * (self._worker_id + 1) + + return data[start:end] + + return data + + @property + def iterator(self): + return self._iterator + + def _parse_audio_element(self, element): + """Parses tf.data element from TextLineDataset into audio.""" + audio_filename = element + + if six.PY2: + audio_filename = unicode(audio_filename, "utf-8") + else: + audio_filename = str(audio_filename, "utf-8") + + file_path = os.path.join( + self.params["dataset_location"], + audio_filename + ".wav" + ) + + audio, spectrogram = get_speech_features_from_file( + file_path, + self.params["num_audio_features"], + features_type="mel", + data_min=1e-5, + return_raw_audio=True + ) + + spectrogram = np.pad( + spectrogram, + ((0, 1), (0, 0)), + "constant", + constant_values=1e-5 + ) + assert len(audio) < len(spectrogram)*256, \ + "audio len: {}, spec*256 len: {}".format(len(audio), \ + len(spectrogram)*256) + num_pad = len(spectrogram)*256 - len(audio) + audio = np.pad( + audio, + (0, num_pad), + "constant", + constant_values=0 + ) + + # upsample the spectrogram to match source length by repeating each value + spectrogram = np.repeat(spectrogram, 256, axis=0) + + return audio.astype(self.params["dtype"].as_numpy_dtype()), \ + np.int32([len(audio)]), \ + spectrogram.astype(self.params["dtype"].as_numpy_dtype()), \ + np.int32([len(spectrogram)]) + + def _parse_spectrogram_element(self, element): + audio, au_length, spectrogram, spec_length = \ + self._parse_audio_element(element) + return spectrogram, spec_length + + def create_interactive_placeholders(self): + self._source = tf.placeholder( + dtype=self.params["dtype"], + 
shape=[self.params["batch_size"], None] + ) + self._src_length = tf.placeholder( + dtype=tf.int32, + shape=[self.params["batch_size"]] + ) + + self._spec = tf.placeholder( + dtype=self.params["dtype"], + shape=[self.params["batch_size"], None, + self.params["num_audio_features"]] + ) + self._spec_length = tf.placeholder( + dtype=tf.int32, + shape=[self.params["batch_size"]] + ) + self._spec_offset = tf.placeholder( + dtype=tf.int32, + shape=() + ) + + self._input_tensors = {} + self._input_tensors["source_tensors"] = [ + self._source, self._src_length, self._spec, self._spec_length, + self._spec_offset + ] + + def create_feed_dict(self, model_in): + """ + Creates the feed dict for interactive infer using a spectrogram + + Args: + model_in: tuple( + source: source audio + src_length: length of the source + spec: conditioning spectrogram + spec_length: length of the spectrogram + spec_offset: iterative index for position of receptive field window + ) + """ + + source, src_length, spec, spec_length, spec_offset = model_in + + return { + self._source: source, + self._src_length: src_length, + self._spec: spec, + self._spec_length: spec_length, + self._spec_offset: spec_offset + } + + def build_graph(self): + """ builds data reading graph """ + self._dataset = tf.data.Dataset.from_tensor_slices(self._files) + + if self.params["shuffle"]: + self._dataset = self._dataset.shuffle(self._size) + self._dataset = self._dataset.repeat() + + num_audio_features = self.params["num_audio_features"] + + if self.params["mode"] != "infer": + self._dataset = self._dataset.map( + lambda line: tf.py_func( + self._parse_audio_element, + [line], + [self.params["dtype"], tf.int32, self.params["dtype"], tf.int32], + stateful=False + ), + num_parallel_calls=8 + ) + + self._dataset = self._dataset.padded_batch( + self.params["batch_size"], + padded_shapes=([None], 1, [None, num_audio_features], 1) + ) + + else: + raise ValueError("Non-interactive infer is not supported") + + self._iterator = self._dataset.prefetch(tf.contrib.data.AUTOTUNE) \ + .make_initializable_iterator() + + if self.params["mode"] != "infer": + source, src_length, spec, spec_length = self._iterator.get_next() + spec.set_shape([self.params["batch_size"], None, num_audio_features]) + spec_length = tf.reshape(spec_length, [self.params["batch_size"]]) + + source.set_shape([self.params["batch_size"], None]) + src_length = tf.reshape(src_length, [self.params["batch_size"]]) + + self._input_tensors = {} + self._input_tensors["source_tensors"] = [ + source, src_length, spec, spec_length + ] + self._input_tensors["target_tensors"] = [source, src_length] + + else: + raise ValueError("Non-interactive infer is not supported") diff --git a/open_seq2seq/encoders/__init__.py b/open_seq2seq/encoders/__init__.py index 280ebf63d..d7445bb51 100644 --- a/open_seq2seq/encoders/__init__.py +++ b/open_seq2seq/encoders/__init__.py @@ -16,3 +16,4 @@ from .las_encoder import ListenAttendSpellEncoder from .convs2s_encoder import ConvS2SEncoder from .lm_encoders import LMEncoder +from .wavenet_encoder import WavenetEncoder diff --git a/open_seq2seq/encoders/wavenet_encoder.py b/open_seq2seq/encoders/wavenet_encoder.py new file mode 100644 index 000000000..0b16de2fb --- /dev/null +++ b/open_seq2seq/encoders/wavenet_encoder.py @@ -0,0 +1,425 @@ +# Copyright (c) 2018 NVIDIA Corporation + +import tensorflow as tf +from math import ceil +from open_seq2seq.parts.cnns.conv_blocks import conv_actv, conv_bn_actv + +from .encoder import Encoder + + +def 
_get_receptive_field(kernel_size, blocks, layers_per_block): + dilations = [2 ** i for i in range(layers_per_block)] + return (kernel_size - 1) * blocks * sum(dilations) + 1 + +def _mu_law_encode(signal, channels, dtype): + mu = tf.saturate_cast(channels - 1, dtype) + safe_audio_abs = tf.minimum(tf.abs(signal), 1.0) + magnitude = tf.log1p(mu * safe_audio_abs) / tf.log1p(mu) + signal = tf.sign(signal) * magnitude + return tf.to_int32((signal + 1) / 2 * mu + 0.5) + +def _mu_law_decode(output, channels): + mu = channels - 1 + signal = 2 * (tf.to_float(output) / mu) - 1 + magnitude = (1 / mu) * ((1 + mu)**abs(signal) - 1) + return tf.sign(signal) * magnitude + +def conv_1x1( + layer_type, name, inputs, filters, strides, regularizer, training, + data_format): + """ + Defines a single 1x1 convolution for convenience + """ + + return conv_actv( + layer_type=layer_type, + name=name, + inputs=inputs, + filters=filters, + kernel_size=1, + activation_fn=None, + strides=strides, + padding="SAME", + regularizer=regularizer, + training=training, + data_format=data_format, + ) + +def causal_conv_bn_actv( + layer_type, name, inputs, filters, kernel_size, activation_fn, strides, + padding, regularizer, training, data_format, bn_momentum, bn_epsilon, + dilation=1): + """ + Defines a single dilated causal convolutional layer with batch norm + """ + + block = conv_bn_actv( + layer_type=layer_type, + name=name, + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + activation_fn=activation_fn, + strides=strides, + padding=padding, + regularizer=regularizer, + training=training, + data_format=data_format, + bn_momentum=bn_momentum, + bn_epsilon=bn_epsilon, + dilation=dilation + ) + + # pad the left side of the time-series with an amount of zeros based on the + # dilation rate + block = tf.pad(block, [[0, 0], [dilation * (kernel_size - 1), 0], [0, 0]]) + return block + +def wavenet_conv_block( + layer_type, name, inputs, condition_filter, condition_gate, filters, + kernel_size, strides, padding, regularizer, training, data_format, + bn_momentum, bn_epsilon, layers_per_block): + """ + Defines a single WaveNet block using the architecture specified in the + original paper, including skip and residual connections + """ + + skips = None + for layer in range(layers_per_block): + # split source along channels + source_shape = inputs.get_shape().as_list() + source_filter = inputs[:, :, 0:int(source_shape[2] / 2)] + source_gate = inputs[:, :, int(source_shape[2] / 2):] + + dilation = 2 ** layer + + source_filter = causal_conv_bn_actv( + layer_type=layer_type, + name="filter_{}_{}".format(name, layer), + inputs=source_filter, + filters=filters, + kernel_size=kernel_size, + activation_fn=None, + strides=strides, + padding=padding, + regularizer=regularizer, + training=training, + data_format=data_format, + bn_momentum=bn_momentum, + bn_epsilon=bn_epsilon, + dilation=dilation + ) + + source_gate = causal_conv_bn_actv( + layer_type=layer_type, + name="gate_{}_{}".format(name, layer), + inputs=source_gate, + filters=filters, + kernel_size=kernel_size, + activation_fn=None, + strides=strides, + padding=padding, + regularizer=regularizer, + training=training, + data_format=data_format, + bn_momentum=bn_momentum, + bn_epsilon=bn_epsilon, + dilation=dilation + ) + + if condition_filter is not None and condition_gate is not None: + source_filter = tf.tanh(tf.add(source_filter, condition_filter)) + source_gate = tf.sigmoid(tf.add(source_gate, condition_gate)) + else: + source_filter = tf.tanh(source_filter) + source_gate 
= tf.sigmoid(source_gate) + + conv_feats = tf.multiply(source_filter, source_gate) + + residual = conv_1x1( + layer_type=layer_type, + name="residual_1x1_{}_{}".format(name, layer), + inputs=conv_feats, + filters=filters, + strides=strides, + regularizer=regularizer, + training=training, + data_format=data_format + ) + + inputs = tf.add(inputs, residual) + + skip = conv_1x1( + layer_type=layer_type, + name="skip_1x1_{}_{}".format(name, layer), + inputs=conv_feats, + filters=filters, + strides=strides, + regularizer=regularizer, + training=training, + data_format=data_format + ) + + if skips is None: + skips = skip + else: + skips = tf.add(skips, skip) + + return inputs, skips + +class WavenetEncoder(Encoder): + + """ + WaveNet like encoder. + + Consists of several blocks of dilated causal convolutions. + """ + + @staticmethod + def get_required_params(): + return dict( + Encoder.get_required_params(), + **{ + "layer_type": str, + "kernel_size": int, + "strides": int, + "padding": str, + "blocks": int, + "layers_per_block": int, + "filters": int, + "quantization_channels": int + } + ) + + @staticmethod + def get_optional_params(): + return dict( + Encoder.get_optional_params(), + **{ + "data_format": str, + "bn_momentum": float, + "bn_epsilon": float + } + ) + + def __init__(self, params, model, name="wavenet_encoder", mode="train"): + """ + WaveNet like encoder constructor. + + Config parameters: + * **layer_type** (str) --- type of convolutional layer, currently only + supports "conv1d" + * **kernel_size** (int) --- size of kernel + * **strides** (int) --- size of stride + * **padding** (str) --- padding, can be "SAME" or "VALID" + + * **blocks** (int) --- number of dilation cycles + * **layers_per_block** (int) --- number of dilated convolutional layers in + each block + * **filters** (int) --- number of output channels + * **quantization_channels** (int) --- depth of mu-law quantized input + + * **data_format** (string) --- could be either "channels_first" or + "channels_last". Defaults to "channels_last". + * **bn_momentum** (float) --- momentum for batch norm. Defaults to 0.1. + * **bn_epsilon** (float) --- epsilon for batch norm. Defaults to 1e-5. + """ + + super(WavenetEncoder, self).__init__(params, model, name, mode) + + def _encode(self, input_dict): + """ + Creates TensorFlow graph for WaveNet like encoder. + ... 
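+
+    Args:
+      input_dict (dict): dictionary whose "source_tensors" entry holds
+        [source, src_length, condition, spec_length] in train and eval modes,
+        plus a trailing spec_offset index during interactive inference.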
+ """ + + training = (self._mode == "train" or self._mode == "eval") + + if training: + source, src_length, condition, spec_length = input_dict["source_tensors"] + spec_offset = 0 + else: + source, src_length, condition, spec_length, spec_offset = \ + input_dict["source_tensors"] + + regularizer = self.params.get("regularizer", None) + data_format = self.params.get("data_format", "channels_last") + + if data_format != "channels_last": + source = tf.transpose(source, [0, 2, 1]) + condition = tf.transpose(condition, [0, 2, 1]) + + dtype = self.params["dtype"] + layer_type = self.params["layer_type"] + kernel_size = self.params["kernel_size"] + strides = self.params["strides"] + padding = self.params["padding"] + blocks = self.params["blocks"] + layers_per_block = self.params["layers_per_block"] + filters = self.params["filters"] + quantization_channels = self.params["quantization_channels"] + + bn_momentum = self.params.get("bn_momentum", 0.1) + bn_epsilon = self.params.get("bn_epsilon", 1e-5) + local_conditioning = self.params.get("local_conditioning", True) + + receptive_field = _get_receptive_field( + kernel_size, blocks, layers_per_block + ) + + # ----- Preprocessing ----------------------------------------------- + + encoded_inputs = _mu_law_encode(source, quantization_channels, dtype) + + if training: + # remove last sample to maintain causality + inputs = tf.slice( + encoded_inputs, [0, 0], [-1, tf.shape(encoded_inputs)[1] - 1] + ) + else: + inputs = encoded_inputs + + inputs = tf.one_hot(inputs, depth=quantization_channels, axis=-1) + inputs = tf.saturate_cast(inputs, dtype) + + if local_conditioning: + # split condition along channels + condition_shape = condition.get_shape().as_list() + condition_filter = condition[:, :, 0:int(condition_shape[2] / 2)] + condition_gate = condition[:, :, int(condition_shape[2] / 2):] + + condition_filter = conv_1x1( + layer_type=layer_type, + name="filter_condition", + inputs=condition_filter, + filters=filters, + strides=strides, + regularizer=regularizer, + training=training, + data_format=data_format + ) + + condition_gate = conv_1x1( + layer_type=layer_type, + name="gate_condition", + inputs=condition_gate, + filters=filters, + strides=strides, + regularizer=regularizer, + training=training, + data_format=data_format + ) + + if training: + # remove last sample to maintain causality + condition_filter = condition_filter[:, :-1, :] + condition_gate = condition_gate[:, :-1, :] + else: + # pad with zeros to align the condition to the source for + # autoregressive inference + zeros = tf.saturate_cast( + tf.zeros([condition_shape[0], receptive_field, filters]), + dtype + ) + condition_filter = tf.concat([zeros, condition_filter], axis=1) + condition_gate = tf.concat([zeros, condition_gate], axis=1) + + condition_filter = condition_filter[ + :, spec_offset:spec_offset + receptive_field, : + ] + condition_gate = condition_gate[ + :, spec_offset:spec_offset + receptive_field, : + ] + + else: + condition_filter = None + condition_gate = None + + # ----- Convolutional layers ----------------------------------------------- + + # first causal convolutional layer + inputs = causal_conv_bn_actv( + layer_type=layer_type, + name="preprocess", + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + activation_fn=None, + strides=strides, + padding=padding, + regularizer=regularizer, + training=training, + data_format=data_format, + bn_momentum=bn_momentum, + bn_epsilon=bn_epsilon, + dilation=1 + ) + + # dilation stack + skips = None + for block in 
range(blocks): + inputs, skip = wavenet_conv_block( + layer_type=layer_type, + name=block, + inputs=inputs, + condition_filter=condition_filter, + condition_gate=condition_gate, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + regularizer=regularizer, + training=training, + data_format=data_format, + bn_momentum=bn_momentum, + bn_epsilon=bn_epsilon, + layers_per_block=layers_per_block + ) + + if skips is None: + skips = skip + else: + skips = tf.add(skips, skip) + + outputs = tf.add(skips, inputs) + + # postprocessing + outputs = tf.nn.relu(outputs) + outputs = conv_1x1( + layer_type=layer_type, + name="postprocess_1", + inputs=outputs, + filters=filters, + strides=strides, + regularizer=regularizer, + training=training, + data_format=data_format + ) + + outputs = tf.nn.relu(outputs) + outputs = conv_1x1( + layer_type=layer_type, + name="postprocess_2", + inputs=outputs, + filters=quantization_channels, + strides=strides, + regularizer=regularizer, + training=training, + data_format=data_format + ) + + if training: + # remove samples that would be predicted without the full receptive field + prediction = tf.slice(outputs, [0, receptive_field - 1, 0], [-1, -1, -1]) + target_output = tf.slice(encoded_inputs, [0, receptive_field], [-1, -1]) + else: + prediction = outputs + target_output = encoded_inputs + + # decode the predicted signal as audio + audio = tf.argmax(tf.nn.softmax(outputs), axis=-1, output_type=tf.int32) + audio = tf.expand_dims(audio, -1) + audio = _mu_law_decode(audio, self.params["quantization_channels"]) + audio = tf.cast(audio, tf.float32) + + return { "logits": prediction, "outputs": [target_output, audio] } diff --git a/open_seq2seq/losses/__init__.py b/open_seq2seq/losses/__init__.py index 2c6a26279..2bda8deb3 100644 --- a/open_seq2seq/losses/__init__.py +++ b/open_seq2seq/losses/__init__.py @@ -7,4 +7,5 @@ from .ctc_loss import CTCLoss from .cross_entropy_loss import CrossEntropyLoss from .tacotron_loss import TacotronLoss +from .wavenet_loss import WavenetLoss from .jca_loss import MultiTaskCTCEntropyLoss diff --git a/open_seq2seq/losses/wavenet_loss.py b/open_seq2seq/losses/wavenet_loss.py new file mode 100644 index 000000000..d29d0ebf6 --- /dev/null +++ b/open_seq2seq/losses/wavenet_loss.py @@ -0,0 +1,43 @@ +# Copyright (c) 2018 NVIDIA Corporation + +import tensorflow as tf + +from .loss import Loss + +class WavenetLoss(Loss): + + def __init__(self, params, model, name="wavenet_loss"): + super(WavenetLoss, self).__init__(params, model, name) + self._n_feats = self._model.get_data_layer().params["num_audio_features"] + + def get_required_params(self): + return {} + + def get_optional_params(self): + return {} + + def _compute_loss(self, input_dict): + """ + Computes the cross-entropy loss for WaveNet. 
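+    The logits over the mu-law quantization channels are compared against the
+    mu-law encoded target signal using sparse softmax cross-entropy, averaged
+    over both time and batch.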
+ + Args: + input_dict (dict): + * "decoder_output": array containing: [ + * logits: predicted output signal as logits + * outputs: array containing: [ + * ground truth signal as encoded labels + * mu-law decoded audio + ] + ] + """ + + prediction = tf.cast(input_dict["decoder_output"]["logits"], tf.float32) + target_output = input_dict["decoder_output"]["outputs"][0] + + loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=prediction, + labels=target_output + ) + loss = tf.reduce_mean(loss) + + return loss diff --git a/open_seq2seq/models/__init__.py b/open_seq2seq/models/__init__.py index 4687cb2ca..90bfcff7e 100644 --- a/open_seq2seq/models/__init__.py +++ b/open_seq2seq/models/__init__.py @@ -6,3 +6,4 @@ from .image2label import Image2Label from .lstm_lm import LSTMLM from .text2speech import Text2Speech +from .text2speech_wavenet import Text2SpeechWavenet diff --git a/open_seq2seq/models/text2speech_wavenet.py b/open_seq2seq/models/text2speech_wavenet.py new file mode 100644 index 000000000..2831cb665 --- /dev/null +++ b/open_seq2seq/models/text2speech_wavenet.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 NVIDIA Corporation +import numpy as np +from scipy.io.wavfile import write + +from .encoder_decoder import EncoderDecoderModel + +def save_audio(signal, logdir, step, sampling_rate, mode): + signal = np.float32(signal) + file_name = '{}/sample_step{}_{}.wav'.format(logdir, step, mode) + if logdir[0] != '/': + file_name = "./" + file_name + write(file_name, sampling_rate, signal) + +class Text2SpeechWavenet(EncoderDecoderModel): + + @staticmethod + def get_required_params(): + return dict( + EncoderDecoderModel.get_required_params(), **{} + ) + + def __init__(self, params, mode="train", hvd=None): + super(Text2SpeechWavenet, self).__init__(params, mode=mode, hvd=hvd) + + def maybe_print_logs(self, input_values, output_values, training_step): + save_audio( + output_values[1][-1], + self.params["logdir"], + training_step, + sampling_rate=22050, + mode="train" + ) + return {} + + def evaluate(self, input_values, output_values): + return output_values[1][-1] + + def finalize_evaluation(self, results_per_batch, training_step=None): + save_audio( + results_per_batch[0], + self.params["logdir"], + training_step, + sampling_rate=22050, + mode="eval" + ) + return {} + + def infer(self, input_values, output_values): + return output_values[1][-1] + + def finalize_inference(self, results_per_batch, output_file): + return {} diff --git a/scripts/tacotron_save_spec.py b/scripts/tacotron_save_spec.py new file mode 100644 index 000000000..594460b89 --- /dev/null +++ b/scripts/tacotron_save_spec.py @@ -0,0 +1,81 @@ +%matplotlib inline +# Replace the first box of Interactive_Infer_example.ipynb with this + +import IPython +import librosa + +import numpy as np +import scipy.io.wavfile as wave +import tensorflow as tf +import matplotlib.pyplot as plt + +from open_seq2seq.utils.utils import deco_print, get_base_config, check_logdir,\ + create_logdir, create_model, get_interactive_infer_results +from open_seq2seq.models.text2speech import save_audio + +args_T2S = ["--config_file=Infer_T2S/config.py", + "--mode=interactive_infer", + "--logdir=Infer_T2S/", + "--batch_size_per_gpu=1", +] + +# A simpler version of what run.py does. 
It returns the created model and its saved checkpoint +def get_model(args, scope): + with tf.variable_scope(scope): + args, base_config, base_model, config_module = get_base_config(args) + checkpoint = check_logdir(args, base_config) + model = create_model(args, base_config, config_module, base_model, None) + return model, checkpoint + +model_T2S, checkpoint_T2S = get_model(args_T2S, "T2S") + +# Create the session and load the checkpoints +sess_config = tf.ConfigProto(allow_soft_placement=True) +sess_config.gpu_options.allow_growth = True +sess = tf.InteractiveSession(config=sess_config) +vars_T2S = {} +for v in tf.get_collection(tf.GraphKeys.VARIABLES): + if "T2S" in v.name: + vars_T2S["/".join(v.op.name.split("/")[1:])] = v +saver_T2S = tf.train.Saver(vars_T2S) +saver_T2S.restore(sess, checkpoint_T2S) + +# line = "I was trained using Nvidia's Open Sequence to Sequence framework." + +# Define the inference function +n_fft = model_T2S.get_data_layer().n_fft +sampling_rate = model_T2S.get_data_layer().sampling_rate +def infer(line): + print("Input English") + print(line) + + # Generate speech + results = get_interactive_infer_results(model_T2S, sess, model_in=[line]) + audio_length = results[1][4][0] + + if model_T2S.get_data_layer()._both: + prediction = results[1][5][0] + + else: + prediction = results[1][1][0] + + prediction = prediction[:audio_length-1,:] + mag_prediction = model_T2S.get_data_layer().get_magnitude_spec(prediction) + + mag_prediction_squared = np.clip(mag_prediction, a_min=0, a_max=255) + mag_prediction_squared = mag_prediction_squared**1.5 + mag_prediction_squared = np.square(mag_prediction_squared) + + mel_basis = librosa.filters.mel(sr=22050, n_fft=1024, n_mels=80, htk=True, norm=None) + mel = np.dot(mel_basis, mag_prediction_squared.T) + mel = np.log(np.clip(mel, a_min=1e-5, a_max=None)) + np.save("spec2", mel) + + plt.imshow(mel) + plt.gca().invert_yaxis() + plt.show() + + wav = save_audio(mag_prediction, "unused", "unused", sampling_rate=sampling_rate, save_format="np.array", n_fft=n_fft) + audio = IPython.display.Audio(wav, rate=sampling_rate) + print("Generated Audio") + IPython.display.display(audio) diff --git a/scripts/wavenet_naive_infer.py b/scripts/wavenet_naive_infer.py new file mode 100644 index 000000000..ba97ce132 --- /dev/null +++ b/scripts/wavenet_naive_infer.py @@ -0,0 +1,97 @@ +# Replace the first box of Interactive_Infer_example.ipynb with this + +import IPython +import librosa + +import numpy as np +import scipy.io.wavfile as wave +import tensorflow as tf + +from open_seq2seq.utils.utils import deco_print, get_base_config, check_logdir,\ + create_logdir, create_model, get_interactive_infer_results +from open_seq2seq.models.text2speech_wavenet import save_audio + +args_T2S = ["--config_file=Infer_T2S_Wave/config.py", + "--mode=interactive_infer", + "--logdir=Infer_T2S_Wave/", + "--batch_size_per_gpu=1", +] + +# A simpler version of what run.py does. 
It returns the created model and its +# saved checkpoint +def get_model(args, scope): + with tf.variable_scope(scope): + args, base_config, base_model, config_module = get_base_config(args) + checkpoint = check_logdir(args, base_config) + model = create_model(args, base_config, config_module, base_model, None) + return model, checkpoint + +model_T2S, checkpoint_T2S = get_model(args_T2S, "T2S") + +# Create the session and load the checkpoints +sess_config = tf.ConfigProto(allow_soft_placement=True) +sess_config.gpu_options.allow_growth = True +sess = tf.InteractiveSession(config=sess_config) + +vars_T2S = {} +for v in tf.get_collection(tf.GraphKeys.VARIABLES): + if "T2S" in v.name: + vars_T2S["/".join(v.op.name.split("/")[1:])] = v + +saver_T2S = tf.train.Saver(vars_T2S) +saver_T2S.restore(sess, checkpoint_T2S) + +# Define the inference function +n_fft = model_T2S.get_data_layer().n_fft +sampling_rate = model_T2S.get_data_layer().sampling_rate +def infer(line): + """ + Infers one value at a time using a sliding window with width equal to the + receptive field. + """ + + print("Input File") + print(line) + + GET_SPEC_FROM_WAV = False + max_steps = 200000 + receptive_field = 6139 # 3x10 + + source = np.zeros([1, receptive_field]) + src_length = np.full([1], receptive_field) + audio = [] + spec_offset = 0 + + if GET_SPEC_FROM_WAV: # get spectrogram from .wav file + file_name = str.encode(line) + spec, spec_length = model_T2S.get_data_layer(). \ + _parse_spectrogram_element(file_name) + + else: # get spectrogram from .npy file + spec = np.load(line + ".npy").T + spec = np.repeat(spec, 256, axis=0) + spec_length = spec.shape[0] + + spec = np.expand_dims(spec, axis=0) + spec_length = np.reshape(spec_length, [1]) + + while(spec_offset < max_steps): + output = get_interactive_infer_results( + model_T2S, sess, + model_in=(source, src_length, spec, spec_length, spec_offset) + ) + + predicted = output[-1][0] + audio.append(predicted) + + source[0][0] = predicted + source[0] = np.roll(source[0], -1) + + if spec_offset % 1000 == 0: + print("Saving audio for step {}".format(spec_offset)) + wav = save_audio( + np.array(audio), "result", 0, + sampling_rate=sampling_rate, mode="infer" + ) + + spec_offset += 1 From 8c79fa8daf24353af7fa4c08ec2f09b68a20b1eb Mon Sep 17 00:00:00 2001 From: Edward Lu Date: Wed, 31 Oct 2018 11:05:49 -0700 Subject: [PATCH 2/2] remove unused dataset param Signed-off-by: Edward Lu --- example_configs/text2speech/wavenet_float.py | 1 - example_configs/text2speech/wavenet_float_8gpu.py | 1 - example_configs/text2speech/wavenet_mixed.py | 1 - example_configs/text2speech/wavenet_mixed_8gpu.py | 1 - open_seq2seq/data/text2speech/text2speech_wavenet.py | 10 +++++----- 5 files changed, 5 insertions(+), 9 deletions(-) diff --git a/example_configs/text2speech/wavenet_float.py b/example_configs/text2speech/wavenet_float.py index 8ea7fd507..0076ed823 100644 --- a/example_configs/text2speech/wavenet_float.py +++ b/example_configs/text2speech/wavenet_float.py @@ -63,7 +63,6 @@ "data_layer": WavenetDataLayer, "data_layer_params": { - "dataset": "LJ", "num_audio_features": 80, "dataset_location": "data/speech/LJSpeech/wavs/" } diff --git a/example_configs/text2speech/wavenet_float_8gpu.py b/example_configs/text2speech/wavenet_float_8gpu.py index 1cf5c571f..b4c48cc8f 100644 --- a/example_configs/text2speech/wavenet_float_8gpu.py +++ b/example_configs/text2speech/wavenet_float_8gpu.py @@ -63,7 +63,6 @@ "data_layer": WavenetDataLayer, "data_layer_params": { - "dataset": "LJ", "num_audio_features": 80, 
"dataset_location": "/data/LJSpeech-1.1-partitioned/wavs/" } diff --git a/example_configs/text2speech/wavenet_mixed.py b/example_configs/text2speech/wavenet_mixed.py index efff43f36..3f572224f 100644 --- a/example_configs/text2speech/wavenet_mixed.py +++ b/example_configs/text2speech/wavenet_mixed.py @@ -64,7 +64,6 @@ "data_layer": WavenetDataLayer, "data_layer_params": { - "dataset": "LJ", "num_audio_features": 80, "dataset_location": "data/speech/LJSpeech/wavs/" } diff --git a/example_configs/text2speech/wavenet_mixed_8gpu.py b/example_configs/text2speech/wavenet_mixed_8gpu.py index 40559087e..303c6f3e1 100644 --- a/example_configs/text2speech/wavenet_mixed_8gpu.py +++ b/example_configs/text2speech/wavenet_mixed_8gpu.py @@ -64,7 +64,6 @@ "data_layer": WavenetDataLayer, "data_layer_params": { - "dataset": "LJ", "num_audio_features": 80, "dataset_location": "/data/LJSpeech-1.1-partitioned/wavs/" } diff --git a/open_seq2seq/data/text2speech/text2speech_wavenet.py b/open_seq2seq/data/text2speech/text2speech_wavenet.py index 343b20667..1a4d48a1b 100644 --- a/open_seq2seq/data/text2speech/text2speech_wavenet.py +++ b/open_seq2seq/data/text2speech/text2speech_wavenet.py @@ -16,7 +16,6 @@ class WavenetDataLayer(DataLayer): def get_required_params(): return dict( DataLayer.get_required_params(), **{ - "dataset": str, "num_audio_features": int, "dataset_files": list } @@ -26,8 +25,7 @@ def get_required_params(): def get_optional_params(): return dict( DataLayer.get_optional_params(), **{ - "dataset_location": str, - "receptive_field": int + "dataset_location": str } ) @@ -39,9 +37,11 @@ def __init__(self, params, model, num_workers=None, worker_id=None): Config parameters: - * **dataset** (str) --- The dataset to use, currently only supports "LJ" - for LJSpeech 1.1 + * **num_audio_features** (int) --- number of spectrogram audio features + * **dataset_files** (list) --- list with paths to all dataset .csv files + * **dataset_location** (str) --- string with path to directory where wavs + are stored """ super(WavenetDataLayer, self).__init__(