---
# modded_dac_vq.yaml
# DAC codec model configuration (44.1 kHz) with a downsampled residual
# vector quantizer, built via Hydra-style `_target_` instantiation.
_target_: fish_speech.models.dac.modded_dac.DAC

# Model setup
sample_rate: 44100
encoder_dim: 64
encoder_rates: [2, 4, 8, 8]
decoder_dim: 1536
decoder_rates: [8, 8, 4, 2]
encoder_transformer_layers: [0, 0, 0, 4]
decoder_transformer_layers: [4, 0, 0, 0]

# Shared transformer hyperparameters; `_partial_: true` makes Hydra return a
# partially-applied constructor rather than a constructed object.
transformer_general_config:
  _target_: fish_speech.models.dac.modded_dac.ModelArgs
  _partial_: true
  block_size: 8192
  n_local_heads: -1
  head_dim: 64
  rope_base: 10000
  norm_eps: 1e-5
  dropout_rate: 0.1
  attn_dropout_rate: 0.1
  channels_first: true

# Quantization
quantizer:
  _target_: fish_speech.models.dac.rvq.DownsampleResidualVectorQuantize
  input_dim: 1024
  n_codebooks: 9
  codebook_size: 1024
  codebook_dim: 8
  quantizer_dropout: 0.5
  downsample_factor: [2, 2]
  # Anchored so `pre_module` below reuses the identical transformer spec.
  post_module: &transformer_module
    _target_: fish_speech.models.dac.modded_dac.WindowLimitedTransformer
    causal: true
    window_size: 128  # empirically this does not seem to matter
    input_dim: 1024
    # NOTE(review): the `&transformer_config` anchor is never aliased within
    # this file — confirm whether it is still needed.
    config: &transformer_config
      _target_: fish_speech.models.dac.modded_dac.ModelArgs
      block_size: 2048
      n_layer: 8
      n_head: 16
      dim: 1024
      intermediate_size: 3072
      n_local_heads: -1
      head_dim: 64
      rope_base: 10000
      norm_eps: 1e-5
      dropout_rate: 0.1
      attn_dropout_rate: 0.1
      channels_first: true
  pre_module: *transformer_module
  semantic_codebook_size: 4096