# firefly_gan_vq.yaml
#
# Model configuration with four component sections: spec_transform, backbone,
# head and quantizer. Each `_target_` value is a dotted Python path.
# NOTE(review): the `_target_` keys look like Hydra-style instantiation
# targets - confirm against the code that loads this file.
# NOTE(review): nesting was reconstructed from a flattened copy; each section
# is assumed to own the keys between its header and the next section header.
---
_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture

spec_transform:
  _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
  sample_rate: 44100
  # Same value as backbone.input_channels (160).
  n_mels: 160
  n_fft: 2048
  # Same value as head.hop_length (512).
  hop_length: 512
  win_length: 2048

backbone:
  _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
  # Same value as spec_transform.n_mels (160).
  # NOTE(review): presumably these must stay equal - confirm.
  input_channels: 160
  # depths and dims both list four entries.
  depths: [3, 3, 9, 3]
  dims: [128, 256, 384, 512]
  drop_path_rate: 0.2
  kernel_size: 7

head:
  _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
  hop_length: 512
  # The product of these rates is 8 * 8 * 2 * 2 * 2 = 512, the same value as
  # hop_length. NOTE(review): presumably required to match - confirm.
  upsample_rates: [8, 8, 2, 2, 2]  # aka. strides
  # One kernel size per entry in upsample_rates (five each).
  upsample_kernel_sizes: [16, 16, 4, 4, 4]
  resblock_kernel_sizes: [3, 7, 11]
  # One dilation list per entry in resblock_kernel_sizes (three each).
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  # Same value as the last entry of backbone.dims and quantizer.input_dim (512).
  num_mels: 512
  upsample_initial_channel: 512
  use_template: false
  pre_conv_kernel_size: 13
  post_conv_kernel_size: 13

quantizer:
  _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
  # Same value as the last entry of backbone.dims and head.num_mels (512).
  input_dim: 512
  n_groups: 4
  n_codebooks: 1
  levels: [8, 5, 5, 5]
  # A one-element list, not a bare integer.
  downsample_factor: [2]