Merge pull request #269 from edwardhdlu/wavenet-pull-request
WaveNet Implementation
vsl9 authored Oct 31, 2018
2 parents aad1ca1 + 8c79fa8 commit 4b95346
Showing 16 changed files with 1,396 additions and 6 deletions.
103 changes: 103 additions & 0 deletions example_configs/text2speech/wavenet_float.py
@@ -0,0 +1,103 @@
# pylint: skip-file
import tensorflow as tf
from open_seq2seq.models import Text2SpeechWavenet
from open_seq2seq.encoders import WavenetEncoder
from open_seq2seq.decoders import FakeDecoder
from open_seq2seq.losses import WavenetLoss
from open_seq2seq.data import WavenetDataLayer
from open_seq2seq.optimizers.lr_policies import exp_decay
from open_seq2seq.parts.convs2s.utils import gated_linear_units

base_model = Text2SpeechWavenet

base_params = {
  "random_seed": 0,
  "use_horovod": False,
  "max_steps": 1000000,

  "num_gpus": 1,
  "batch_size_per_gpu": 2,

  "save_summaries_steps": 50,
  "print_loss_steps": 50,
  "print_samples_steps": 500,
  "eval_steps": 500,
  "save_checkpoint_steps": 2500,
  "logdir": "result/wavenet-LJ-float",

  "optimizer": "Adam",
  "optimizer_params": {},
  "lr_policy": exp_decay,
  "lr_policy_params": {
    "learning_rate": 1e-3,
    "decay_steps": 20000,
    "decay_rate": 0.1,
    "use_staircase_decay": False,
    "begin_decay_at": 45000,
    "min_lr": 1e-5,
  },
  "dtype": tf.float32,
  "regularizer": tf.contrib.layers.l2_regularizer,
  "regularizer_params": {
    "scale": 1e-6
  },
  "initializer": tf.contrib.layers.xavier_initializer,

  "summaries": [],

  "encoder": WavenetEncoder,
  "encoder_params": {
    "layer_type": "conv1d",
    "kernel_size": 3,
    "strides": 1,
    "padding": "VALID",
    "blocks": 3,
    "layers_per_block": 10,
    "filters": 64,
    "quantization_channels": 256
  },

  "decoder": FakeDecoder,

  "loss": WavenetLoss,

  "data_layer": WavenetDataLayer,
  "data_layer_params": {
    "num_audio_features": 80,
    "dataset_location": "data/speech/LJSpeech/wavs/"
  }
}

train_params = {
  "data_layer_params": {
    "dataset_files": [
      "data/speech/LJSpeech/train.csv",
    ],
    "shuffle": True,
  },
}

eval_params = {
  "data_layer_params": {
    "dataset_files": [
      "data/speech/LJSpeech/val.csv",
    ],
    "shuffle": False,
  },
}

infer_params = {
  "data_layer_params": {
    "dataset_files": [
      "data/speech/LJSpeech/test.csv",
    ],
    "shuffle": False,
  },
}

interactive_infer_params = {
  "data_layer_params": {
    "dataset_files": [],
    "shuffle": False,
  },
}
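
As with other OpenSeq2Seq example configs, training from this file would typically be launched with run.py (e.g. python run.py --config_file=example_configs/text2speech/wavenet_float.py --mode=train_eval). The lr_policy above points at exp_decay from open_seq2seq/optimizers/lr_policies.py; the short sketch below shows how these lr_policy_params are commonly read: the learning rate holds at 1e-3 until begin_decay_at, then decays exponentially toward the min_lr floor. Treat the exact formula as an assumption; the authoritative definition is the exp_decay function itself.

# Sketch of the exp_decay schedule implied by lr_policy_params above.
# Assumption: decay starts at begin_decay_at, is clipped at min_lr, and
# (with use_staircase_decay=False) decreases continuously.
def exp_decay_sketch(step,
                     learning_rate=1e-3,
                     decay_steps=20000,
                     decay_rate=0.1,
                     begin_decay_at=45000,
                     min_lr=1e-5,
                     use_staircase_decay=False):
  if step < begin_decay_at:
    return learning_rate
  exponent = (step - begin_decay_at) / decay_steps
  if use_staircase_decay:
    exponent = float(int(exponent))  # decay in discrete jumps instead
  return max(min_lr, learning_rate * decay_rate ** exponent)

# Example: 1e-3 until step 45000, roughly 1e-4 around step 65000,
# and pinned at the 1e-5 floor late in training.
print(exp_decay_sketch(0), exp_decay_sketch(65000), exp_decay_sketch(200000))
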
103 changes: 103 additions & 0 deletions example_configs/text2speech/wavenet_float_8gpu.py
@@ -0,0 +1,103 @@
# pylint: skip-file
import tensorflow as tf
from open_seq2seq.models import Text2SpeechWavenet
from open_seq2seq.encoders import WavenetEncoder
from open_seq2seq.decoders import FakeDecoder
from open_seq2seq.losses import WavenetLoss
from open_seq2seq.data import WavenetDataLayer
from open_seq2seq.optimizers.lr_policies import exp_decay
from open_seq2seq.parts.convs2s.utils import gated_linear_units

base_model = Text2SpeechWavenet

base_params = {
  "random_seed": 0,
  "use_horovod": True,
  "max_steps": 1000000,

  "num_gpus": 8,
  "batch_size_per_gpu": 1,

  "save_summaries_steps": 50,
  "print_loss_steps": 50,
  "print_samples_steps": 500,
  "eval_steps": 500,
  "save_checkpoint_steps": 2500,
  "logdir": "result/wavenet-LJ-float",

  "optimizer": "Adam",
  "optimizer_params": {},
  "lr_policy": exp_decay,
  "lr_policy_params": {
    "learning_rate": 1e-3,
    "decay_steps": 20000,
    "decay_rate": 0.1,
    "use_staircase_decay": False,
    "begin_decay_at": 45000,
    "min_lr": 1e-5,
  },
  "dtype": tf.float32,
  "regularizer": tf.contrib.layers.l2_regularizer,
  "regularizer_params": {
    "scale": 1e-6
  },
  "initializer": tf.contrib.layers.xavier_initializer,

  "summaries": [],

  "encoder": WavenetEncoder,
  "encoder_params": {
    "layer_type": "conv1d",
    "kernel_size": 3,
    "strides": 1,
    "padding": "VALID",
    "blocks": 3,
    "layers_per_block": 10,
    "filters": 64,
    "quantization_channels": 256
  },

  "decoder": FakeDecoder,

  "loss": WavenetLoss,

  "data_layer": WavenetDataLayer,
  "data_layer_params": {
    "num_audio_features": 80,
    "dataset_location": "/data/LJSpeech-1.1-partitioned/wavs/"
  }
}

train_params = {
  "data_layer_params": {
    "dataset_files": [
      "/data/LJSpeech-1.1-partitioned/train.csv",
    ],
    "shuffle": True,
  },
}

eval_params = {
  "data_layer_params": {
    "dataset_files": [
      "/data/LJSpeech-1.1-partitioned/val.csv",
    ],
    "shuffle": False,
  },
}

infer_params = {
  "data_layer_params": {
    "dataset_files": [
      "/data/LJSpeech-1.1-partitioned/test.csv",
    ],
    "shuffle": False,
  },
}

interactive_infer_params = {
  "data_layer_params": {
    "dataset_files": [],
    "shuffle": False,
  },
}
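
This variant matches wavenet_float.py except that it trains data-parallel over 8 GPUs with Horovod (use_horovod=True, batch_size_per_gpu=1, so a global batch of 8) and reads a pre-partitioned copy of LJSpeech under /data/. A Horovod run would typically be launched with something like mpiexec -np 8 python run.py --config_file=example_configs/text2speech/wavenet_float_8gpu.py --mode=train_eval, though the exact launcher flags depend on the environment. One reason per-GPU batches stay this small for WaveNet-style models is the long receptive field of the dilated convolution stack. The sketch below estimates that field from encoder_params, assuming the canonical WaveNet dilation schedule of 1, 2, 4, ..., 2**(layers_per_block - 1) repeated in every block; the config does not spell out the schedule, so the exact number is an assumption.

# Rough receptive-field estimate for the dilated conv stack described by
# encoder_params. Assumption: dilations double per layer within a block,
# as in the original WaveNet.
def receptive_field(kernel_size=3, blocks=3, layers_per_block=10):
  dilations = [2 ** i for i in range(layers_per_block)] * blocks
  return 1 + sum((kernel_size - 1) * d for d in dilations)

# With kernel_size=3, blocks=3, layers_per_block=10 this gives 6139 samples,
# i.e. roughly a quarter second of 22.05 kHz LJSpeech audio per prediction.
print(receptive_field())
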
104 changes: 104 additions & 0 deletions example_configs/text2speech/wavenet_mixed.py
@@ -0,0 +1,104 @@
# pylint: skip-file
import tensorflow as tf
from open_seq2seq.models import Text2SpeechWavenet
from open_seq2seq.encoders import WavenetEncoder
from open_seq2seq.decoders import FakeDecoder
from open_seq2seq.losses import WavenetLoss
from open_seq2seq.data import WavenetDataLayer
from open_seq2seq.optimizers.lr_policies import exp_decay
from open_seq2seq.parts.convs2s.utils import gated_linear_units

base_model = Text2SpeechWavenet

base_params = {
  "random_seed": 0,
  "use_horovod": False,
  "max_steps": 1000000,

  "num_gpus": 1,
  "batch_size_per_gpu": 4,

  "save_summaries_steps": 50,
  "print_loss_steps": 50,
  "print_samples_steps": 500,
  "eval_steps": 500,
  "save_checkpoint_steps": 2500,
  "logdir": "result/wavenet-LJ-mixed",

  "optimizer": "Adam",
  "optimizer_params": {},
  "lr_policy": exp_decay,
  "lr_policy_params": {
    "learning_rate": 1e-3,
    "decay_steps": 20000,
    "decay_rate": 0.1,
    "use_staircase_decay": False,
    "begin_decay_at": 45000,
    "min_lr": 1e-5,
  },
  "dtype": "mixed",
  "loss_scaling": "Backoff",
  "regularizer": tf.contrib.layers.l2_regularizer,
  "regularizer_params": {
    "scale": 1e-6
  },
  "initializer": tf.contrib.layers.xavier_initializer,

  "summaries": [],

  "encoder": WavenetEncoder,
  "encoder_params": {
    "layer_type": "conv1d",
    "kernel_size": 3,
    "strides": 1,
    "padding": "VALID",
    "blocks": 3,
    "layers_per_block": 10,
    "filters": 64,
    "quantization_channels": 256
  },

  "decoder": FakeDecoder,

  "loss": WavenetLoss,

  "data_layer": WavenetDataLayer,
  "data_layer_params": {
    "num_audio_features": 80,
    "dataset_location": "data/speech/LJSpeech/wavs/"
  }
}

train_params = {
  "data_layer_params": {
    "dataset_files": [
      "data/speech/LJSpeech/train.csv",
    ],
    "shuffle": True,
  },
}

eval_params = {
  "data_layer_params": {
    "dataset_files": [
      "data/speech/LJSpeech/val.csv",
    ],
    "shuffle": False,
  },
}

infer_params = {
  "data_layer_params": {
    "dataset_files": [
      "data/speech/LJSpeech/test.csv",
    ],
    "shuffle": False,
  },
}

interactive_infer_params = {
  "data_layer_params": {
    "dataset_files": [],
    "shuffle": False,
  },
}
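
wavenet_mixed.py differs from the float config mainly in "dtype": "mixed" with "loss_scaling": "Backoff", plus a batch_size_per_gpu of 4, presumably affordable because fp16 activations roughly halve activation memory. Backoff loss scaling keeps fp16 gradients from underflowing: the loss is multiplied by a scale factor before backprop, the factor is reduced whenever an overflow is detected, and it is cautiously grown again after a run of clean steps. The snippet below is only a schematic of that general strategy with illustrative constants; the actual behaviour lives in OpenSeq2Seq's mixed-precision optimizer wrapper.

# Schematic of "Backoff" dynamic loss scaling (illustrative constants,
# not the OpenSeq2Seq implementation).
class BackoffLossScaler:
  def __init__(self, init_scale=2.0 ** 15, backoff=0.5, growth=2.0,
               good_steps_before_growth=2000):
    self.scale = init_scale
    self.backoff = backoff
    self.growth = growth
    self.good_steps_before_growth = good_steps_before_growth
    self.good_steps = 0

  def update(self, grads_finite):
    """Call once per step with whether all scaled gradients were finite."""
    if not grads_finite:
      # Overflow: back off the scale and tell the caller to skip this update.
      self.scale *= self.backoff
      self.good_steps = 0
      return False
    self.good_steps += 1
    if self.good_steps >= self.good_steps_before_growth:
      # A long run of clean steps: try a larger scale again.
      self.scale *= self.growth
      self.good_steps = 0
    return True

# Usage sketch: scale the loss by scaler.scale before backprop, unscale the
# gradients afterwards, and feed update() an is-finite check on them.
scaler = BackoffLossScaler()
print(scaler.scale, scaler.update(grads_finite=False), scaler.scale)
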