123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
- import tensorflow as tf
- from text.symbols import symbols
def create_hparams(hparams_string=None, verbose=False):
    """Build the default hyperparameter set, optionally overridden from a string.

    The defaults below are collected into per-topic dictionaries, merged in
    declaration order, and passed as keyword arguments to
    ``tf.contrib.training.HParams``.

    Args:
        hparams_string: Optional override string. When truthy it is logged
            and handed to ``HParams.parse`` (format is defined by that
            method; see the TensorFlow documentation).
        verbose: When truthy, the final hyperparameter values are logged.

    Returns:
        The populated ``tf.contrib.training.HParams`` instance.
    """
    # Run / experiment settings.
    run_settings = {
        'epochs': 50000,
        'iters_per_checkpoint': 500,
        'seed': 1234,
        'dynamic_loss_scaling': True,
        'fp16_run': False,
        'distributed_run': False,
        'dist_backend': "nccl",
        'dist_url': "tcp://localhost:54321",
        'cudnn_enabled': True,
        'cudnn_benchmark': False,
        'ignore_layers': ['speaker_embedding.weight'],
    }

    # Dataset locations and text processing.
    # NOTE(review): the two filelist paths are absolute paths under a specific
    # user's home directory; other machines presumably need to override them
    # through `hparams_string` -- confirm with the caller.
    data_settings = {
        'training_files': '/Users/tzld/mellotron/filelists/ljs_audiopaths_text_sid_train_filelist_new.txt',
        'validation_files': '/Users/tzld/mellotron/filelists/ljs_audiopaths_text_sid_val_filelist_new.txt',
        'text_cleaners': ['english_cleaners'],
        'p_arpabet': 1.0,
        'cmudict_path': "data/cmu_dictionary",
    }

    # Audio / feature-extraction settings.
    audio_settings = {
        'max_wav_value': 32768.0,
        'sampling_rate': 22050,
        'filter_length': 1024,
        'hop_length': 256,
        'win_length': 1024,
        'n_mel_channels': 80,
        'mel_fmin': 0.0,
        'mel_fmax': 8000.0,
        'f0_min': 80,
        'f0_max': 880,
        'harm_thresh': 0.25,
    }

    # Model architecture settings.
    model_settings = {
        # The symbol count is taken from the imported `symbols` collection.
        'n_symbols': len(symbols),
        'symbols_embedding_dim': 512,

        # Encoder
        'encoder_kernel_size': 5,
        'encoder_n_convolutions': 3,
        'encoder_embedding_dim': 512,

        # Decoder
        'n_frames_per_step': 1,
        'decoder_rnn_dim': 1024,
        'prenet_dim': 256,
        'prenet_f0_n_layers': 1,
        'prenet_f0_dim': 1,
        'prenet_f0_kernel_size': 1,
        'prenet_rms_dim': 0,
        'prenet_rms_kernel_size': 1,
        'max_decoder_steps': 1000,
        'gate_threshold': 0.5,
        'p_attention_dropout': 0.1,
        'p_decoder_dropout': 0.1,
        'p_teacher_forcing': 1.0,

        # Attention
        'attention_rnn_dim': 1024,
        'attention_dim': 128,

        # Location-layer settings
        'attention_location_n_filters': 32,
        'attention_location_kernel_size': 31,

        # Postnet
        'postnet_embedding_dim': 512,
        'postnet_kernel_size': 5,
        'postnet_n_convolutions': 5,

        # Speaker embedding
        'n_speakers': 123,
        'speaker_embedding_dim': 128,

        # Reference encoder
        'with_gst': True,
        'ref_enc_filters': [32, 32, 64, 64, 128, 128],
        'ref_enc_size': [3, 3],
        'ref_enc_strides': [2, 2],
        'ref_enc_pad': [1, 1],
        'ref_enc_gru_size': 128,

        # Token layer
        'token_embedding_size': 256,
        'token_num': 10,
        'num_heads': 8,
    }

    # Optimisation settings.
    optimizer_settings = {
        'use_saved_learning_rate': False,
        'learning_rate': 1e-3,
        'learning_rate_min': 1e-5,
        'learning_rate_anneal': 50000,
        'weight_decay': 1e-6,
        'grad_clip_thresh': 1.0,
        'batch_size': 32,
        'mask_padding': True,
    }

    # Merge the groups in declaration order so the keyword order handed to
    # HParams is the same as a single flat keyword list would give.
    defaults = {}
    for group in (run_settings, data_settings, audio_settings,
                  model_settings, optimizer_settings):
        defaults.update(group)

    settings = tf.contrib.training.HParams(**defaults)

    if hparams_string:
        tf.compat.v1.logging.info('Parsing command line hparams: %s', hparams_string)
        settings.parse(hparams_string)

    if verbose:
        tf.compat.v1.logging.info('Final parsed hparams: %s', settings.values())

    return settings
|