|
|
@@ -1,145 +0,0 @@
|
|
|
-defaults:
|
|
|
- - base
|
|
|
- - _self_
|
|
|
-
|
|
|
-project: vq_naive
|
|
|
-
|
|
|
-# Lightning Trainer
|
|
|
-trainer:
|
|
|
- accelerator: gpu
|
|
|
- devices: 1
|
|
|
- strategy: ddp_find_unused_parameters_true
|
|
|
- gradient_clip_val: 1.0
|
|
|
- gradient_clip_algorithm: 'norm'
|
|
|
- precision: bf16-mixed
|
|
|
- max_steps: 100_000
|
|
|
- val_check_interval: 5000
|
|
|
-
|
|
|
-sample_rate: 22050
|
|
|
-hop_length: 256
|
|
|
-num_mels: 80
|
|
|
-n_fft: 1024
|
|
|
-win_length: 1024
|
|
|
-segment_size: 512
|
|
|
-
|
|
|
-# Dataset Configuration
|
|
|
-train_dataset:
|
|
|
- _target_: fish_speech.datasets.vqgan.VQGANDataset
|
|
|
- filelist: data/filelist.split.train
|
|
|
- sample_rate: ${sample_rate}
|
|
|
- hop_length: ${hop_length}
|
|
|
- slice_frames: ${segment_size}
|
|
|
-
|
|
|
-val_dataset:
|
|
|
- _target_: fish_speech.datasets.vqgan.VQGANDataset
|
|
|
- filelist: data/filelist.split.valid
|
|
|
- sample_rate: ${sample_rate}
|
|
|
- hop_length: ${hop_length}
|
|
|
-
|
|
|
-data:
|
|
|
- _target_: fish_speech.datasets.vqgan.VQGANDataModule
|
|
|
- train_dataset: ${train_dataset}
|
|
|
- val_dataset: ${val_dataset}
|
|
|
- num_workers: 8
|
|
|
- batch_size: 128
|
|
|
- val_batch_size: 16
|
|
|
-
|
|
|
-# Model Configuration
|
|
|
-model:
|
|
|
- _target_: fish_speech.models.vqgan.VQNaive
|
|
|
- sample_rate: ${sample_rate}
|
|
|
- hop_length: ${hop_length}
|
|
|
-
|
|
|
- downsample:
|
|
|
- _target_: fish_speech.models.vq_diffusion.lit_module.ConvDownSample
|
|
|
- dims: ["${num_mels}", 512, 256]
|
|
|
- kernel_sizes: [3, 3]
|
|
|
- strides: [2, 2]
|
|
|
-
|
|
|
- mel_encoder:
|
|
|
- _target_: fish_speech.models.vqgan.modules.encoders.TextEncoder
|
|
|
- in_channels: 256
|
|
|
- out_channels: 256
|
|
|
- hidden_channels: 192
|
|
|
- hidden_channels_ffn: 768
|
|
|
- n_heads: 2
|
|
|
- n_layers: 6
|
|
|
- kernel_size: 1
|
|
|
- dropout: 0.1
|
|
|
- use_vae: false
|
|
|
-
|
|
|
- vq_encoder:
|
|
|
- _target_: fish_speech.models.vqgan.modules.encoders.VQEncoder
|
|
|
- in_channels: 256
|
|
|
- vq_channels: 256
|
|
|
- codebook_size: 4096
|
|
|
- downsample: 1
|
|
|
-
|
|
|
- speaker_encoder:
|
|
|
- _target_: fish_speech.models.vqgan.modules.encoders.SpeakerEncoder
|
|
|
- in_channels: ${num_mels}
|
|
|
- hidden_channels: 192
|
|
|
- out_channels: 256
|
|
|
- num_heads: 2
|
|
|
- num_layers: 4
|
|
|
- p_dropout: 0.1
|
|
|
-
|
|
|
- decoder:
|
|
|
- _target_: fish_speech.models.vqgan.modules.encoders.TextEncoder
|
|
|
- in_channels: 256
|
|
|
- out_channels: ${num_mels}
|
|
|
- hidden_channels: 192
|
|
|
- hidden_channels_ffn: 768
|
|
|
- n_heads: 2
|
|
|
- n_layers: 8
|
|
|
- kernel_size: 1
|
|
|
- use_vae: false
|
|
|
- dropout: 0.1
|
|
|
- gin_channels: 256
|
|
|
- speaker_cond_layer: 4
|
|
|
-
|
|
|
- mel_transform:
|
|
|
- _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
|
|
|
- sample_rate: ${sample_rate}
|
|
|
- n_fft: ${n_fft}
|
|
|
- hop_length: ${hop_length}
|
|
|
- win_length: ${win_length}
|
|
|
- n_mels: ${num_mels}
|
|
|
- f_min: 0
|
|
|
- f_max: 8000
|
|
|
-
|
|
|
- vocoder:
|
|
|
- _target_: fish_speech.models.vqgan.modules.decoder.Generator
|
|
|
- initial_channel: ${num_mels}
|
|
|
- resblock: "1"
|
|
|
- resblock_kernel_sizes: [3, 7, 11]
|
|
|
- resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
|
|
|
- upsample_rates: [8, 8, 2, 2]
|
|
|
- upsample_initial_channel: 512
|
|
|
- upsample_kernel_sizes: [16, 16, 4, 4]
|
|
|
- ckpt_path: "checkpoints/hifigan-v1-universal-22050/g_02500000"
|
|
|
-
|
|
|
- optimizer:
|
|
|
- _target_: torch.optim.AdamW
|
|
|
- _partial_: true
|
|
|
- lr: 1e-4
|
|
|
- betas: [0.9, 0.999]
|
|
|
- eps: 1e-5
|
|
|
-
|
|
|
- lr_scheduler:
|
|
|
- _target_: torch.optim.lr_scheduler.LambdaLR
|
|
|
- _partial_: true
|
|
|
- lr_lambda:
|
|
|
- _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
|
|
|
- _partial_: true
|
|
|
- num_warmup_steps: 1000
|
|
|
- num_training_steps: ${trainer.max_steps}
|
|
|
- final_lr_ratio: 0.05
|
|
|
-
|
|
|
-callbacks:
|
|
|
- grad_norm_monitor:
|
|
|
- sub_module:
|
|
|
- - mel_encoder
|
|
|
- - vq_encoder
|
|
|
- - speaker_encoder
|
|
|
- - decoder
|