# Hydra config: VITS decoder pretraining (fish-speech).
# Composed on top of `base`; `_self_` last so values here override the base.
defaults:
  - base
  - _self_

project: vits_decoder
# Initial weights; only weights are restored (no optimizer/scheduler state).
ckpt_path: checkpoints/Bert-VITS2/ensemble.pth
resume_weights_only: true

# Lightning Trainer
trainer:
  accelerator: gpu
  devices: auto
  strategy: ddp_find_unused_parameters_true
  precision: 32
  max_steps: 1_000_000
  val_check_interval: 1000
  benchmark: false

# Audio / spectrogram parameters shared via ${...} interpolation below.
sample_rate: 44100
hop_length: 512
num_mels: 128
n_fft: 2048
win_length: 2048

# Tokenizer Configuration
tokenizer:
  _target_: transformers.AutoTokenizer.from_pretrained
  pretrained_model_name_or_path: fishaudio/fish-speech-1

# Dataset Configuration
train_dataset:
  _target_: fish_speech.datasets.vits.VITSDataset
  filelist: data/source/Genshin/filelist.train.txt
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}
  suffix: ".lab"
  tokenizer: ${tokenizer}
  # Randomly mask sentences during training (augmentation); not applied at val.
  sentence_mask_ratio: 0.2

val_dataset:
  _target_: fish_speech.datasets.vits.VITSDataset
  filelist: data/source/Genshin/filelist.test.txt
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}
  suffix: ".lab"
  tokenizer: ${tokenizer}

data:
  _target_: fish_speech.datasets.vits.VITSDataModule
  train_dataset: ${train_dataset}
  val_dataset: ${val_dataset}
  num_workers: 4
  batch_size: 8
  val_batch_size: 4
  tokenizer: ${tokenizer}

# Model Configuration
model:
  _target_: fish_speech.models.vits_decoder.VITSDecoder
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}
  freeze_discriminator: false
  # Loss weights: mel reconstruction vs. KL divergence.
  weight_mel: 45.0
  weight_kl: 1.0

  generator:
    _target_: fish_speech.models.vits_decoder.modules.models.SynthesizerTrn
    spec_channels: 1025
    segment_size: 32
    inter_channels: 192
    hidden_channels: 192
    filter_channels: 768
    n_heads: 2
    n_layers: 6
    kernel_size: 3
    p_dropout: 0.1
    # "1" selects the ResBlock variant by name — must stay a string.
    resblock: "1"
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    upsample_rates: [8, 8, 2, 2, 2]
    upsample_initial_channel: 512
    upsample_kernel_sizes: [16, 16, 8, 2, 2]
    gin_channels: 512
    vq_mask_ratio: 0.2
    ref_mask_ratio: 0.2

  discriminator:
    _target_: fish_speech.models.vits_decoder.modules.models.EnsembledDiscriminator
    periods: [2, 3, 5, 7, 11]

  mel_transform:
    _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
    sample_rate: ${sample_rate}
    n_fft: ${n_fft}
    hop_length: ${hop_length}
    win_length: ${win_length}
    n_mels: ${num_mels}

  spec_transform:
    _target_: fish_speech.utils.spectrogram.LinearSpectrogram
    n_fft: ${n_fft}
    hop_length: ${hop_length}
    win_length: ${win_length}
    mode: pow2_sqrt

  # _partial_: true — Hydra returns a functools.partial; the model binds
  # its parameters when building the optimizer/scheduler.
  optimizer:
    _target_: torch.optim.AdamW
    _partial_: true
    lr: 1e-4
    betas: [0.8, 0.99]
    eps: 1e-6

  lr_scheduler:
    _target_: torch.optim.lr_scheduler.ExponentialLR
    _partial_: true
    gamma: 0.999999

callbacks:
  grad_norm_monitor:
    sub_module:
      - generator
      - discriminator

  model_checkpoint:
    every_n_train_steps: 1000
    save_top_k: 10