# vits_decoder_finetune.yaml
  1. defaults:
  2. - base
  3. - _self_
  4. project: vits_decoder
  5. ckpt_path: checkpoints/vits_decoder_v1.1.ckpt
  6. resume_weights_only: true
  7. # Lightning Trainer
  8. trainer:
  9. accelerator: gpu
  10. devices: auto
  11. strategy:
  12. find_unused_parameters: true
  13. precision: 32
  14. max_steps: 100_000
  15. val_check_interval: 100
  16. benchmark: false
  17. sample_rate: 44100
  18. hop_length: 512
  19. num_mels: 128
  20. n_fft: 2048
  21. win_length: 2048
  22. # Dataset Configuration
  23. tokenizer:
  24. _target_: transformers.AutoTokenizer.from_pretrained
  25. pretrained_model_name_or_path: fishaudio/fish-speech-1
  26. # Dataset Configuration
  27. train_dataset:
  28. _target_: fish_speech.datasets.vits.VITSDataset
  29. filelist: data/source/Genshin/filelist.train.txt
  30. sample_rate: ${sample_rate}
  31. hop_length: ${hop_length}
  32. suffix: ".lab"
  33. tokenizer: ${tokenizer}
  34. sentence_mask_ratio: 0.2
  35. val_dataset:
  36. _target_: fish_speech.datasets.vits.VITSDataset
  37. filelist: data/source/Genshin/filelist.test.txt
  38. sample_rate: ${sample_rate}
  39. hop_length: ${hop_length}
  40. suffix: ".lab"
  41. tokenizer: ${tokenizer}
  42. data:
  43. _target_: fish_speech.datasets.vits.VITSDataModule
  44. train_dataset: ${train_dataset}
  45. val_dataset: ${val_dataset}
  46. num_workers: 4
  47. batch_size: 8
  48. val_batch_size: 4
  49. tokenizer: ${tokenizer}
  50. # Model Configuration
  51. model:
  52. _target_: fish_speech.models.vits_decoder.VITSDecoder
  53. sample_rate: ${sample_rate}
  54. hop_length: ${hop_length}
  55. freeze_discriminator: false
  56. weight_mel: 45.0
  57. weight_kl: 1.0
  58. generator:
  59. _target_: fish_speech.models.vits_decoder.modules.models.SynthesizerTrn
  60. spec_channels: 1025
  61. segment_size: 32
  62. inter_channels: 192
  63. hidden_channels: 192
  64. filter_channels: 768
  65. n_heads: 2
  66. n_layers: 6
  67. kernel_size: 3
  68. p_dropout: 0.1
  69. resblock: "1"
  70. resblock_kernel_sizes: [3, 7, 11]
  71. resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  72. upsample_rates: [8, 8, 2, 2, 2]
  73. upsample_initial_channel: 512
  74. upsample_kernel_sizes: [16, 16, 8, 2, 2]
  75. gin_channels: 512
  76. vq_mask_ratio: 0.2
  77. ref_mask_ratio: 0.2
  78. discriminator:
  79. _target_: fish_speech.models.vits_decoder.modules.models.EnsembledDiscriminator
  80. periods: [2, 3, 5, 7, 11]
  81. mel_transform:
  82. _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
  83. sample_rate: ${sample_rate}
  84. n_fft: ${n_fft}
  85. hop_length: ${hop_length}
  86. win_length: ${win_length}
  87. n_mels: ${num_mels}
  88. spec_transform:
  89. _target_: fish_speech.utils.spectrogram.LinearSpectrogram
  90. n_fft: ${n_fft}
  91. hop_length: ${hop_length}
  92. win_length: ${win_length}
  93. mode: pow2_sqrt
  94. optimizer:
  95. _target_: torch.optim.AdamW
  96. _partial_: true
  97. lr: 1e-4
  98. betas: [0.8, 0.99]
  99. eps: 1e-6
  100. lr_scheduler:
  101. _target_: torch.optim.lr_scheduler.ExponentialLR
  102. _partial_: true
  103. gamma: 0.999999
  104. callbacks:
  105. grad_norm_monitor:
  106. sub_module:
  107. - generator
  108. - discriminator
  109. model_checkpoint:
  110. every_n_train_steps: ${trainer.val_check_interval}
  111. save_top_k: 10