# Hydra training config: HuBERT VQ-Diffusion (fish-speech).
# Composes on top of `base`; `_self_` last so values here override base.
defaults:
  - base
  - _self_

project: hubert_vq_diffusion

# Lightning Trainer
trainer:
  accelerator: gpu
  devices: 4
  # NOTE(review): find_unused_parameters has a perf cost; presumably required
  # because some sub-modules are frozen/unused per step — confirm.
  strategy: ddp_find_unused_parameters_true
  gradient_clip_val: 1.0
  gradient_clip_algorithm: 'norm'
  precision: 16-mixed
  max_steps: 1_000_000
  val_check_interval: 5000

# Shared audio parameters, interpolated below via ${...}
sample_rate: 44100
hop_length: 512
num_mels: 128
n_fft: 2048
win_length: 2048

# Dataset Configuration
train_dataset:
  _target_: fish_speech.datasets.vqgan.VQGANDataset
  filelist: data/filelist.split.train
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}
  slice_frames: 512

val_dataset:
  _target_: fish_speech.datasets.vqgan.VQGANDataset
  filelist: data/filelist.split.valid
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}

data:
  _target_: fish_speech.datasets.vqgan.VQGANDataModule
  train_dataset: ${train_dataset}
  val_dataset: ${val_dataset}
  num_workers: 8
  batch_size: 32
  val_batch_size: 4

# Model Configuration
model:
  _target_: fish_speech.models.vq_diffusion.lit_module.VQDiffusion
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}

  text_encoder:
    _target_: fish_speech.models.vqgan.modules.encoders.TextEncoder
    in_channels: 128
    out_channels: 128
    hidden_channels: 192
    hidden_channels_ffn: 768
    n_heads: 2
    n_layers: 6
    kernel_size: 1
    dropout: 0.1
    use_vae: false
    gin_channels: 512
    speaker_cond_layer: 0

  vq_encoder:
    _target_: fish_speech.models.vqgan.modules.encoders.VQEncoder
    in_channels: 128
    vq_channels: 128
    codebook_size: 16384
    downsample: 1

  speaker_encoder:
    _target_: fish_speech.models.vqgan.modules.encoders.SpeakerEncoder
    in_channels: 128
    hidden_channels: 192
    out_channels: 128
    num_heads: 2
    num_layers: 4
    p_dropout: 0.1

  denoiser:
    _target_: fish_speech.models.vq_diffusion.convnext_1d.ConvNext1DModel
    in_channels: 256
    out_channels: 128
    intermediate_dim: 512
    # condition_dim: 128
    mlp_dim: 2048
    num_layers: 20
    dilation_cycle_length: 2
    time_embedding_type: "positional"

  vocoder:
    _target_: fish_speech.models.vq_diffusion.adamos.ADaMoSHiFiGANV1

  # Mel spectrogram matching the training audio (44.1 kHz pipeline).
  mel_transform:
    _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
    sample_rate: ${sample_rate}
    n_fft: ${n_fft}
    hop_length: ${hop_length}
    win_length: ${win_length}
    n_mels: ${num_mels}
    f_min: 40
    f_max: 16000

  # Separate mel config at 32 kHz — presumably feeds the feature/HuBERT
  # branch rather than the vocoder; confirm against VQDiffusion.
  feature_mel_transform:
    _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
    sample_rate: 32000
    n_fft: 2048
    hop_length: 640
    win_length: 2048
    n_mels: 128

  optimizer:
    _target_: torch.optim.AdamW
    _partial_: true
    lr: 1e-4
    betas: [0.9, 0.999]
    eps: 1e-5

  lr_scheduler:
    _target_: torch.optim.lr_scheduler.LambdaLR
    _partial_: true
    lr_lambda:
      _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
      _partial_: true
      num_warmup_steps: 0
      num_training_steps: ${trainer.max_steps}
      final_lr_ratio: 0.05

callbacks:
  grad_norm_monitor:
    sub_module:
      - vq_encoder
      - text_encoder
      - speaker_encoder
      - denoiser