# vits_decoder_pretrain.yaml — Hydra config for pretraining the VITS decoder.
# Composes on top of `base`; `_self_` last so values here override the base.
defaults:
  - base
  - _self_

project: vits_decoder
# Warm-start weights (weights only — optimizer/trainer state is not resumed).
ckpt_path: checkpoints/Bert-VITS2/ensemble.pth
resume_weights_only: true

# Lightning Trainer
trainer:
  accelerator: gpu
  devices: auto
  strategy: ddp_find_unused_parameters_true
  precision: 32
  max_steps: 1_000_000
  val_check_interval: 1000

# Audio front-end parameters, interpolated below via ${...}.
sample_rate: 44100
hop_length: 512
num_mels: 128
n_fft: 2048
win_length: 2048

# Tokenizer (instantiated by Hydra via _target_).
tokenizer:
  _target_: transformers.AutoTokenizer.from_pretrained
  pretrained_model_name_or_path: fishaudio/fish-speech-1

# Dataset Configuration
train_dataset:
  _target_: fish_speech.datasets.vits.VITSDataset
  filelist: data/source/Genshin/filelist.train.txt
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}
  suffix: ".lab"
  tokenizer: ${tokenizer}
  sentence_mask_ratio: 0.2

val_dataset:
  _target_: fish_speech.datasets.vits.VITSDataset
  filelist: data/source/Genshin/filelist.test.txt
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}
  suffix: ".lab"
  tokenizer: ${tokenizer}

data:
  _target_: fish_speech.datasets.vits.VITSDataModule
  train_dataset: ${train_dataset}
  val_dataset: ${val_dataset}
  num_workers: 4
  batch_size: 8
  val_batch_size: 4
  tokenizer: ${tokenizer}

# Model Configuration
model:
  _target_: fish_speech.models.vits_decoder.VITSDecoder
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}
  freeze_discriminator: false
  # Loss weights: mel reconstruction vs. KL divergence.
  weight_mel: 45.0
  weight_kl: 1.0

  generator:
    _target_: fish_speech.models.vits_decoder.modules.models.SynthesizerTrn
    spec_channels: 1025
    segment_size: 32
    inter_channels: 192
    hidden_channels: 192
    filter_channels: 768
    n_heads: 2
    n_layers: 6
    kernel_size: 3
    p_dropout: 0.1
    # Quoted on purpose: the resblock variant is selected by string id, not int.
    resblock: "1"
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    # Product of upsample_rates (8*8*2*2*2 = 512) must equal ${hop_length}.
    upsample_rates: [8, 8, 2, 2, 2]
    upsample_initial_channel: 512
    upsample_kernel_sizes: [16, 16, 8, 2, 2]
    gin_channels: 512
    vq_mask_ratio: 0.2
    ref_mask_ratio: 0.2

  discriminator:
    _target_: fish_speech.models.vits_decoder.modules.models.EnsembledDiscriminator
    periods: [2, 3, 5, 7, 11]

  mel_transform:
    _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
    sample_rate: ${sample_rate}
    n_fft: ${n_fft}
    hop_length: ${hop_length}
    win_length: ${win_length}
    n_mels: ${num_mels}

  spec_transform:
    _target_: fish_speech.utils.spectrogram.LinearSpectrogram
    n_fft: ${n_fft}
    hop_length: ${hop_length}
    win_length: ${win_length}
    mode: pow2_sqrt

  # _partial_: true — Hydra builds a functools.partial; the model supplies params.
  optimizer:
    _target_: torch.optim.AdamW
    _partial_: true
    lr: 1e-4
    betas: [0.8, 0.99]
    eps: 1e-6

  lr_scheduler:
    _target_: torch.optim.lr_scheduler.ExponentialLR
    _partial_: true
    gamma: 0.999999

callbacks:
  grad_norm_monitor:
    sub_module:
      - generator
      - discriminator
  model_checkpoint:
    every_n_train_steps: 1000
    save_top_k: 10