# DAC (Descript Audio Codec) model configuration for fish-speech.
# `_target_` keys are Hydra-style instantiation paths; `_partial_: true`
# yields a partially-applied constructor.
_target_: fish_speech.models.dac.modded_dac.DAC

# Model setup
sample_rate: 44100
encoder_dim: 64
encoder_rates: [2, 4, 8, 8]
decoder_dim: 1536
decoder_rates: [8, 8, 4, 2]
# Number of transformer layers inserted at each encoder/decoder stage.
encoder_transformer_layers: [0, 0, 0, 4]
decoder_transformer_layers: [4, 0, 0, 0]

# Shared transformer hyperparameters (partially applied; per-stage
# dims/layer counts are supplied elsewhere).
transformer_general_config:
  _target_: fish_speech.models.dac.modded_dac.ModelArgs
  _partial_: true
  block_size: 8192
  n_local_heads: -1
  head_dim: 64
  rope_base: 10000
  norm_eps: 1e-5
  dropout_rate: 0.1
  attn_dropout_rate: 0.1
  channels_first: true

# Quantization
quantizer:
  _target_: fish_speech.models.dac.rvq.DownsampleResidualVectorQuantize
  input_dim: 1024
  n_codebooks: 9
  codebook_size: 1024
  codebook_dim: 8
  quantizer_dropout: 0.5
  downsample_factor: [2, 2]
  # Anchored so pre_module below can reuse the identical configuration.
  post_module: &transformer_module
    _target_: fish_speech.models.dac.modded_dac.WindowLimitedTransformer
    causal: true
    window_size: 128 # empirically this does not seem to matter
    input_dim: 1024
    config: &transformer_config
      _target_: fish_speech.models.dac.modded_dac.ModelArgs
      block_size: 2048
      n_layer: 8
      n_head: 16
      dim: 1024
      intermediate_size: 3072
      n_local_heads: -1
      head_dim: 64
      rope_base: 10000
      norm_eps: 1e-5
      dropout_rate: 0.1
      attn_dropout_rate: 0.1
      channels_first: true
  pre_module: *transformer_module
  semantic_codebook_size: 4096