vits_decoder.yaml

defaults:
  - base
  - _self_

project: vits_decoder
ckpt_path: checkpoints/Bert-VITS2/ensemble.pth
resume_weights_only: true

# Lightning Trainer
trainer:
  accelerator: gpu
  devices: auto
  strategy: ddp_find_unused_parameters_true
  precision: 32
  max_steps: 1_000_000
  val_check_interval: 2000

sample_rate: 44100
hop_length: 512
num_mels: 128
n_fft: 2048
win_length: 2048
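
# Derived relationships (not enforced here, just for reference): at 44.1 kHz a
# hop of 512 samples is ~11.6 ms per frame, and a linear spectrogram with
# n_fft = 2048 has n_fft / 2 + 1 = 1025 bins, matching the generator's
# spec_channels below.
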
# Tokenizer
tokenizer:
  _target_: transformers.AutoTokenizer.from_pretrained
  pretrained_model_name_or_path: fishaudio/fish-speech-1

# Dataset Configuration
train_dataset:
  _target_: fish_speech.datasets.vits.VITSDataset
  filelist: data/source/Genshin/filelist.train.txt
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}
  suffix: ".lab"
  tokenizer: ${tokenizer}

val_dataset:
  _target_: fish_speech.datasets.vits.VITSDataset
  filelist: data/source/Genshin/filelist.test.txt
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}
  suffix: ".lab"
  tokenizer: ${tokenizer}
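
# The ${...} references are OmegaConf interpolations, so the datasets reuse the
# top-level sample_rate, hop_length, and tokenizer definitions. The ".lab"
# suffix presumably points each audio file to a sidecar transcript/label file;
# check VITSDataset for the exact expectation.
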
data:
  _target_: fish_speech.datasets.vits.VITSDataModule
  train_dataset: ${train_dataset}
  val_dataset: ${val_dataset}
  num_workers: 4
  batch_size: 8
  val_batch_size: 4
  tokenizer: ${tokenizer}
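
# Note: with DDP, batch_size is per device; the effective global batch size is
# batch_size multiplied by the number of GPUs Lightning launches.
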
# Model Configuration
model:
  _target_: fish_speech.models.vits_decoder.VITSDecoder
  sample_rate: ${sample_rate}
  hop_length: ${hop_length}
  freeze_discriminator: false
  weight_mel: 45.0
  weight_kl: 1.0

  generator:
    _target_: fish_speech.models.vits_decoder.modules.models.SynthesizerTrn
    spec_channels: 1025
    segment_size: 32
    inter_channels: 192
    hidden_channels: 192
    filter_channels: 768
    n_heads: 2
    n_layers: 6
    kernel_size: 3
    p_dropout: 0.1
    resblock: "1"
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    upsample_rates: [8, 8, 2, 2, 2]
    upsample_initial_channel: 512
    upsample_kernel_sizes: [16, 16, 8, 2, 2]
    gin_channels: 512
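
  # Sanity check: the product of upsample_rates (8 * 8 * 2 * 2 * 2 = 512) equals
  # hop_length, so the HiFi-GAN-style decoder turns one spectrogram frame into
  # exactly hop_length waveform samples.
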
  discriminator:
    _target_: fish_speech.models.vits_decoder.modules.models.EnsembledDiscriminator
    periods: [2, 3, 5, 7, 11]
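
  # Assumption: EnsembledDiscriminator wraps a multi-period discriminator over
  # the prime periods above (as in HiFi-GAN/VITS), possibly alongside other
  # sub-discriminators; see modules.models for the actual composition.
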
  mel_transform:
    _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
    sample_rate: ${sample_rate}
    n_fft: ${n_fft}
    hop_length: ${hop_length}
    win_length: ${win_length}
    n_mels: ${num_mels}

  spec_transform:
    _target_: fish_speech.utils.spectrogram.LinearSpectrogram
    n_fft: ${n_fft}
    hop_length: ${hop_length}
    win_length: ${win_length}
    mode: pow2_sqrt

  optimizer:
    _target_: torch.optim.AdamW
    _partial_: true
    lr: 1e-4
    betas: [0.8, 0.99]
    eps: 1e-6

  lr_scheduler:
    _target_: torch.optim.lr_scheduler.ExponentialLR
    _partial_: true
    gamma: 0.999875
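
  # _partial_: true makes Hydra-style instantiation return a functools.partial,
  # so the model can construct the optimizer with its own parameters and the
  # scheduler with that optimizer, presumably in configure_optimizers.
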
callbacks:
  grad_norm_monitor:
    sub_module:
      - generator
      - discriminator
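  # Presumably a gradient-norm monitoring callback defined in the shared `base`
  # defaults; listing both sub_modules logs the generator and discriminator
  # norms separately.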