пре 2 година · 5e7914472f
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,4 @@ asr-label*
 
				 /.locale
			
 
				 /demo-audios
			
 
				 ref_data*
			
 
				+/example
			
--- a/API_FLAGS.txt
+++ b/API_FLAGS.txt
@@ -3,5 +3,5 @@
 
				 --listen 0.0.0.0:8000 \
			
 
				 --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1.1-4k.pth" \
			
 
				 --llama-config-name dual_ar_2_codebook_medium \
			
 
				---decoder-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth" \
			
 
				---decoder-config-name vqgan_finetune
			
 
				+--decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
			
 
				+--decoder-config-name firefly_gan_vq
			
--- a/docs/en/finetune.md
+++ b/docs/en/finetune.md
@@ -59,8 +59,8 @@ You can then run the following command to extract semantic tokens:
 
				 ```bash
			
 
				 python tools/vqgan/extract_vq.py data \
			
 
				     --num-workers 1 --batch-size 16 \
			
 
				-    --config-name "vqgan_pretrain" \
			
 
				-    --checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
			
 
				+    --config-name "firefly_gan_vq" \
			
 
				+    --checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
			
 
				 ```
			
 
				 
			
 
				 !!! note
			
@@ -233,16 +233,16 @@ This command will create `data/vq_train_filelist.txt` and `data/vq_val_filelist.
 
				 ### 3. Start Training
			
 
				 
			
 
				 ```bash
			
 
				-python fish_speech/train.py --config-name vqgan_finetune
			
 
				+python fish_speech/train.py --config-name firefly_gan_vq
			
 
				 ```
			
 
				 
			
 
				 !!! note
			
 
				-    You can modify training parameters by editing `fish_speech/configs/vqgan_finetune.yaml`, but in most cases, this won't be necessary.
			
 
				+    You can modify training parameters by editing `fish_speech/configs/firefly_gan_vq.yaml`, but in most cases, this won't be necessary.
			
 
				 
			
 
				 ### 4. Test the Audio
			
 
				     
			
 
				 ```bash
			
 
				-python tools/vqgan/inference.py -i test.wav --checkpoint-path results/vqgan_finetune/checkpoints/step_000010000.ckpt
			
 
				+python tools/vqgan/inference.py -i test.wav --checkpoint-path results/firefly_gan_vq/checkpoints/step_000010000.ckpt
			
 
				 ```
			
 
				 
			
 
				 You can review `fake.wav` to assess the fine-tuning results.
			
--- a/docs/en/inference.md
+++ b/docs/en/inference.md
@@ -31,7 +31,7 @@ huggingface-cli download fishaudio/fish-speech-1 firefly-gan-base-generator.ckpt
 
				 ```bash
			
 
				 python tools/vqgan/inference.py \
			
 
				     -i "paimon.wav" \
			
 
				-    --checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
			
 
				+    --checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
			
 
				 ```
			
 
				 You should get a `fake.npy` file.
			
 
				 
			
@@ -73,7 +73,7 @@ python tools/vits_decoder/inference.py \
 
				 ```bash
			
 
				 python tools/vqgan/inference.py \
			
 
				     -i "codes_0.npy" \
			
 
				-    --checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
			
 
				+    --checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
			
 
				 ```
			
 
				 
			
 
				 ## HTTP API Inference
			
@@ -85,8 +85,8 @@ python -m tools.api \
 
				     --listen 0.0.0.0:8000 \
			
 
				     --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1.1-4k.pth" \
			
 
				     --llama-config-name dual_ar_2_codebook_medium \
			
 
				-    --decoder-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth" \
			
 
				-    --decoder-config-name vqgan_pretrain
			
 
				+    --decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
			
 
				+    --decoder-config-name firefly_gan_vq
			
 
				 ```
			
 
				 
			
 
				 After that, you can view and test the API at http://127.0.0.1:8000/.  
			
@@ -107,7 +107,7 @@ You can start the WebUI using the following command:
 
				 python -m tools.webui \
			
 
				     --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1.1-4k.pth" \
			
 
				     --llama-config-name dual_ar_2_codebook_medium \
			
 
				-    --vqgan-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth" \
			
 
				+    --vqgan-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
			
 
				     --vits-checkpoint-path "checkpoints/vits_decoder_v1.1.ckpt"
			
 
				 ```
			
 
				 
			
--- a/docs/zh/finetune.md
+++ b/docs/zh/finetune.md
@@ -63,8 +63,8 @@ HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech
 
				 ```bash
			
 
				 python tools/vqgan/extract_vq.py data \
			
 
				     --num-workers 1 --batch-size 16 \
			
 
				-    --config-name "vqgan_pretrain" \
			
 
				-    --checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
			
 
				+    --config-name "firefly_gan_vq" \
			
 
				+    --checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
			
 
				 ```
			
 
				 
			
 
				 !!! note
			
@@ -239,16 +239,16 @@ python tools/vqgan/create_train_split.py data
 
				 ### 3. 启动训练
			
 
				 
			
 
				 ```bash
			
 
				-python fish_speech/train.py --config-name vqgan_finetune
			
 
				+python fish_speech/train.py --config-name firefly_gan_vq
			
 
				 ```
			
 
				 
			
 
				 !!! note
			
 
				-    你可以通过修改 `fish_speech/configs/vqgan_finetune.yaml` 来修改训练参数, 但大部分情况下, 你不需要这么做.
			
 
				+    你可以通过修改 `fish_speech/configs/firefly_gan_vq.yaml` 来修改训练参数, 但大部分情况下, 你不需要这么做.
			
 
				 
			
 
				 ### 4. 测试音频
			
 
				     
			
 
				 ```bash
			
 
				-python tools/vqgan/inference.py -i test.wav --checkpoint-path results/vqgan_finetune/checkpoints/step_000010000.ckpt
			
 
				+python tools/vqgan/inference.py -i test.wav --checkpoint-path results/firefly_gan_vq/checkpoints/step_000010000.ckpt
			
 
				 ```
			
 
				 
			
 
				 你可以查看 `fake.wav` 来判断微调效果.
			
--- a/docs/zh/inference.md
+++ b/docs/zh/inference.md
@@ -41,7 +41,7 @@ HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech
 
				 ```bash
			
 
				 python tools/vqgan/inference.py \
			
 
				     -i "paimon.wav" \
			
 
				-    --checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
			
 
				+    --checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
			
 
				 ```
			
 
				 你应该能得到一个 `fake.npy` 文件.
			
 
				 
			
@@ -83,7 +83,7 @@ python tools/vits_decoder/inference.py \
 
				 ```bash
			
 
				 python tools/vqgan/inference.py \
			
 
				     -i "codes_0.npy" \
			
 
				-    --checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
			
 
				+    --checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
			
 
				 ```
			
 
				 
			
 
				 ## HTTP API 推理
			
@@ -95,8 +95,8 @@ python -m tools.api \
 
				     --listen 0.0.0.0:8000 \
			
 
				     --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1.1-4k.pth" \
			
 
				     --llama-config-name dual_ar_2_codebook_medium \
			
 
				-    --decoder-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth" \
			
 
				-    --decoder-config-name vqgan_pretrain
			
 
				+    --decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
			
 
				+    --decoder-config-name firefly_gan_vq
			
 
				 
			
 
				 # 推荐中国大陆用户运行以下命令来启动 HTTP 服务:
			
 
				 HF_ENDPOINT=https://hf-mirror.com python -m ...
			
@@ -120,8 +120,8 @@ HF_ENDPOINT=https://hf-mirror.com python -m ...
 
				 python -m tools.webui \
			
 
				     --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1.1-4k.pth" \
			
 
				     --llama-config-name dual_ar_2_codebook_medium \
			
 
				-    --decoder-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth" \
			
 
				-    --decoder-config-name vqgan_pretrain
			
 
				+    --decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
			
 
				+    --decoder-config-name firefly_gan_vq
			
 
				 ```
			
 
				 
			
 
				 !!! info
			
--- a/fish_speech/configs/firefly_gan_vq.yaml
+++ b/fish_speech/configs/firefly_gan_vq.yaml
@@ -0,0 +1,34 @@
 
				+_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
			
 
				+spec_transform:
			
 
				+  _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
			
 
				+  sample_rate: 44100
			
 
				+  n_mels: 160
			
 
				+  n_fft: 2048
			
 
				+  hop_length: 512
			
 
				+  win_length: 2048
			
 
				+backbone:
			
 
				+  _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
			
 
				+  input_channels: 160
			
 
				+  depths: [3, 3, 9, 3]
			
 
				+  dims: [128, 256, 384, 512]
			
 
				+  drop_path_rate: 0.2
			
 
				+  kernel_size: 7
			
 
				+head:
			
 
				+  _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
			
 
				+  hop_length: 512
			
 
				+  upsample_rates: [8, 8, 2, 2, 2]  # aka. strides
			
 
				+  upsample_kernel_sizes: [16, 16, 4, 4, 4]
			
 
				+  resblock_kernel_sizes: [3, 7, 11]
			
 
				+  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
			
 
				+  num_mels: 512
			
 
				+  upsample_initial_channel: 512
			
 
				+  use_template: false
			
 
				+  pre_conv_kernel_size: 13
			
 
				+  post_conv_kernel_size: 13
			
 
				+quantizer:
			
 
				+  _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
			
 
				+  input_dim: 512
			
 
				+  n_groups: 4
			
 
				+  n_codebooks: 1
			
 
				+  levels: [8, 5, 5, 5]
			
 
				+  downsample_factor: [2]
			
--- a/fish_speech/configs/model/dual_ar_2_codebook_large.yaml
+++ b/fish_speech/configs/model/dual_ar_2_codebook_large.yaml
@@ -1,9 +0,0 @@
 
				-defaults:
			
 
				-  - dual_ar_2_codebook_small
			
 
				-  - _self_
			
 
				-
			
 
				-config:
			
 
				-  n_layer: 30
			
 
				-  n_fast_layer: 6
			
 
				-  n_head: 24
			
 
				-  dim: 1536
			
--- a/fish_speech/configs/model/dual_ar_2_codebook_medium.yaml
+++ b/fish_speech/configs/model/dual_ar_2_codebook_medium.yaml
@@ -1,9 +0,0 @@
 
				-defaults:
			
 
				-  - dual_ar_2_codebook_small
			
 
				-  - _self_
			
 
				-
			
 
				-config:
			
 
				-  n_layer: 24
			
 
				-  n_fast_layer: 6
			
 
				-  n_head: 16
			
 
				-  dim: 1024
			
--- a/fish_speech/configs/model/dual_ar_2_codebook_small.yaml
+++ b/fish_speech/configs/model/dual_ar_2_codebook_small.yaml
@@ -1,13 +0,0 @@
 
				-_target_: fish_speech.models.text2semantic.llama.DualARTransformer
			
 
				-config:
			
 
				-  _target_: fish_speech.models.text2semantic.llama.DualARModelArgs
			
 
				-  max_seq_len: ${max_length}
			
 
				-  vocab_size: 264 # pad 262 to 8x
			
 
				-  n_layer: 12
			
 
				-  n_fast_layer: 4
			
 
				-  n_head: 12
			
 
				-  dim: 768
			
 
				-  rope_base: 10000
			
 
				-  norm_eps: 1e-5
			
 
				-  num_codebooks: 2  # input/output codebook size
			
 
				-  codebook_size: 1032 # codebook size 1024 + 2 special tokens
			
--- a/fish_speech/configs/model/naive_2_codebook_small.yaml
+++ b/fish_speech/configs/model/naive_2_codebook_small.yaml
@@ -1,12 +0,0 @@
 
				-_target_: fish_speech.models.text2semantic.llama.NaiveTransformer
			
 
				-config:
			
 
				-  _target_: fish_speech.models.text2semantic.llama.NaiveModelArgs
			
 
				-  max_seq_len: ${max_length}
			
 
				-  vocab_size: 36408
			
 
				-  n_layer: 12
			
 
				-  n_head: 12
			
 
				-  dim: 768
			
 
				-  rope_base: 10000
			
 
				-  norm_eps: 1e-5
			
 
				-  num_codebooks: 2  # input/output codebook size
			
 
				-  codebook_size: 1032 # codebook size 1024 + 2 special tokens
			
--- a/fish_speech/configs/text2semantic_agent.yaml
+++ b/fish_speech/configs/text2semantic_agent.yaml
@@ -0,0 +1,66 @@
 
				+defaults:
			
 
				+  - base
			
 
				+  - model@model.model: dual_ar_2_codebook_1.3b
			
 
				+  - _self_
			
 
				+
			
 
				+project: text2semantic_agent_dual_ar_debug
			
 
				+max_length: 2048
			
 
				+ckpt_path: checkpoints/fish-speech-agent-1/step_000013000.ckpt
			
 
				+resume_weights_only: true
			
 
				+
			
 
				+# Lightning Trainer
			
 
				+trainer:
			
 
				+  accumulate_grad_batches: 1
			
 
				+  gradient_clip_val: 1.0
			
 
				+  gradient_clip_algorithm: 'norm'
			
 
				+  max_steps: 1_000_000
			
 
				+  precision: bf16-true
			
 
				+  log_every_n_steps: 10
			
 
				+  limit_val_batches: 10
			
 
				+  val_check_interval: 1000
			
 
				+
			
 
				+# Dataset Configuration
			
 
				+tokenizer:
			
 
				+  _target_: transformers.AutoTokenizer.from_pretrained
			
 
				+  pretrained_model_name_or_path: checkpoints/fish-speech-agent-1
			
 
				+
			
 
				+# Dataset Configuration
			
 
				+train_dataset: {}
			
 
				+val_dataset: {}
			
 
				+
			
 
				+data:
			
 
				+  _target_: fish_speech.datasets.text.TextDataModule
			
 
				+  train_dataset: ${train_dataset}
			
 
				+  val_dataset: ${val_dataset}
			
 
				+  num_workers: 4
			
 
				+  batch_size: 8
			
 
				+  tokenizer: ${tokenizer}
			
 
				+  max_length: ${max_length}
			
 
				+
			
 
				+# Model Configuration
			
 
				+model:
			
 
				+  _target_: fish_speech.models.text2semantic.TextToSemantic
			
 
				+  model: {}
			
 
				+
			
 
				+  optimizer:
			
 
				+    _target_: torch.optim.AdamW
			
 
				+    _partial_: true
			
 
				+    lr: 3e-4
			
 
				+    weight_decay: 0.01
			
 
				+    betas: [0.9, 0.95]
			
 
				+    eps: 1e-5
			
 
				+
			
 
				+  lr_scheduler:
			
 
				+    _target_: torch.optim.lr_scheduler.LambdaLR
			
 
				+    _partial_: true
			
 
				+    lr_lambda:
			
 
				+      _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
			
 
				+      _partial_: true
			
 
				+      num_warmup_steps: 1000
			
 
				+      num_training_steps: ${trainer.max_steps}
			
 
				+      final_lr_ratio: 0.1
			
 
				+
			
 
				+# Callbacks
			
 
				+callbacks:
			
 
				+  model_checkpoint:
			
 
				+    every_n_train_steps: ${trainer.val_check_interval}
			
--- a/fish_speech/configs/vits_decoder_finetune.yaml
+++ b/fish_speech/configs/vits_decoder_finetune.yaml
@@ -1,128 +0,0 @@
 
				-defaults:
			
 
				-  - base
			
 
				-  - _self_
			
 
				-
			
 
				-project: vits_decoder
			
 
				-ckpt_path: checkpoints/vits_decoder_v1.1.ckpt
			
 
				-resume_weights_only: true
			
 
				-
			
 
				-# Lightning Trainer
			
 
				-trainer:
			
 
				-  accelerator: gpu
			
 
				-  devices: auto
			
 
				-  strategy:
			
 
				-    find_unused_parameters: true
			
 
				-  precision: 32
			
 
				-  max_steps: 100_000
			
 
				-  val_check_interval: 100
			
 
				-  benchmark: false
			
 
				-
			
 
				-sample_rate: 44100
			
 
				-hop_length: 512
			
 
				-num_mels: 128
			
 
				-n_fft: 2048
			
 
				-win_length: 2048
			
 
				-
			
 
				-# Dataset Configuration
			
 
				-tokenizer:
			
 
				-  _target_: transformers.AutoTokenizer.from_pretrained
			
 
				-  pretrained_model_name_or_path: fishaudio/fish-speech-1
			
 
				-
			
 
				-# Dataset Configuration
			
 
				-train_dataset:
			
 
				-  _target_: fish_speech.datasets.vits.VITSDataset
			
 
				-  filelist: data/source/Genshin/filelist.train.txt
			
 
				-  sample_rate: ${sample_rate}
			
 
				-  hop_length: ${hop_length}
			
 
				-  suffix: ".lab"
			
 
				-  tokenizer: ${tokenizer}
			
 
				-  sentence_mask_ratio: 0.2
			
 
				-
			
 
				-val_dataset:
			
 
				-  _target_: fish_speech.datasets.vits.VITSDataset
			
 
				-  filelist: data/source/Genshin/filelist.test.txt
			
 
				-  sample_rate: ${sample_rate}
			
 
				-  hop_length: ${hop_length}
			
 
				-  suffix: ".lab"
			
 
				-  tokenizer: ${tokenizer}
			
 
				-
			
 
				-data:
			
 
				-  _target_: fish_speech.datasets.vits.VITSDataModule
			
 
				-  train_dataset: ${train_dataset}
			
 
				-  val_dataset: ${val_dataset}
			
 
				-  num_workers: 4
			
 
				-  batch_size: 8
			
 
				-  val_batch_size: 4
			
 
				-  tokenizer: ${tokenizer}
			
 
				-
			
 
				-# Model Configuration
			
 
				-model:
			
 
				-  _target_: fish_speech.models.vits_decoder.VITSDecoder
			
 
				-  sample_rate: ${sample_rate}
			
 
				-  hop_length: ${hop_length}
			
 
				-  freeze_discriminator: false
			
 
				-
			
 
				-  weight_mel: 45.0
			
 
				-  weight_kl: 1.0
			
 
				-
			
 
				-  generator:
			
 
				-    _target_: fish_speech.models.vits_decoder.modules.models.SynthesizerTrn
			
 
				-    spec_channels: 1025
			
 
				-    segment_size: 32
			
 
				-    inter_channels: 192
			
 
				-    hidden_channels: 192
			
 
				-    filter_channels: 768
			
 
				-    n_heads: 2
			
 
				-    n_layers: 6
			
 
				-    kernel_size: 3
			
 
				-    p_dropout: 0.1
			
 
				-    resblock: "1"
			
 
				-    resblock_kernel_sizes: [3, 7, 11]
			
 
				-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
			
 
				-    upsample_rates: [8, 8, 2, 2, 2]
			
 
				-    upsample_initial_channel: 512
			
 
				-    upsample_kernel_sizes: [16, 16, 8, 2, 2]
			
 
				-    gin_channels: 512
			
 
				-    vq_mask_ratio: 0.2
			
 
				-    ref_mask_ratio: 0.2
			
 
				-
			
 
				-  discriminator:
			
 
				-    _target_: fish_speech.models.vits_decoder.modules.models.EnsembledDiscriminator
			
 
				-    periods: [2, 3, 5, 7, 11]
			
 
				-
			
 
				-  mel_transform:
			
 
				-    _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
			
 
				-    sample_rate: ${sample_rate}
			
 
				-    n_fft: ${n_fft}
			
 
				-    hop_length: ${hop_length}
			
 
				-    win_length: ${win_length}
			
 
				-    n_mels: ${num_mels}
			
 
				-
			
 
				-  spec_transform:
			
 
				-    _target_: fish_speech.utils.spectrogram.LinearSpectrogram
			
 
				-    n_fft: ${n_fft}
			
 
				-    hop_length: ${hop_length}
			
 
				-    win_length: ${win_length}
			
 
				-    mode: pow2_sqrt
			
 
				-  
			
 
				-  optimizer:
			
 
				-    _target_: torch.optim.AdamW
			
 
				-    _partial_: true
			
 
				-    lr: 1e-4
			
 
				-    betas: [0.8, 0.99]
			
 
				-    eps: 1e-6
			
 
				-
			
 
				-  lr_scheduler:
			
 
				-    _target_: torch.optim.lr_scheduler.ExponentialLR
			
 
				-    _partial_: true
			
 
				-    gamma: 0.999999
			
 
				-
			
 
				-callbacks:
			
 
				-  grad_norm_monitor:
			
 
				-    sub_module: 
			
 
				-      - generator
			
 
				-      - discriminator
			
 
				-
			
 
				-  model_checkpoint:
			
 
				-    every_n_train_steps: ${trainer.val_check_interval}
			
 
				-    save_top_k: 10
			
--- a/fish_speech/configs/vits_decoder_pretrain.yaml
+++ b/fish_speech/configs/vits_decoder_pretrain.yaml
@@ -1,127 +0,0 @@
 
				-defaults:
			
 
				-  - base
			
 
				-  - _self_
			
 
				-
			
 
				-project: vits_decoder
			
 
				-ckpt_path: checkpoints/Bert-VITS2/ensemble.pth
			
 
				-resume_weights_only: true
			
 
				-
			
 
				-# Lightning Trainer
			
 
				-trainer:
			
 
				-  accelerator: gpu
			
 
				-  devices: auto
			
 
				-  strategy: ddp_find_unused_parameters_true
			
 
				-  precision: 32
			
 
				-  max_steps: 1_000_000
			
 
				-  val_check_interval: 1000
			
 
				-  benchmark: false
			
 
				-
			
 
				-sample_rate: 44100
			
 
				-hop_length: 512
			
 
				-num_mels: 128
			
 
				-n_fft: 2048
			
 
				-win_length: 2048
			
 
				-
			
 
				-# Dataset Configuration
			
 
				-tokenizer:
			
 
				-  _target_: transformers.AutoTokenizer.from_pretrained
			
 
				-  pretrained_model_name_or_path: fishaudio/fish-speech-1
			
 
				-
			
 
				-# Dataset Configuration
			
 
				-train_dataset:
			
 
				-  _target_: fish_speech.datasets.vits.VITSDataset
			
 
				-  filelist: data/source/Genshin/filelist.train.txt
			
 
				-  sample_rate: ${sample_rate}
			
 
				-  hop_length: ${hop_length}
			
 
				-  suffix: ".lab"
			
 
				-  tokenizer: ${tokenizer}
			
 
				-  sentence_mask_ratio: 0.2
			
 
				-
			
 
				-val_dataset:
			
 
				-  _target_: fish_speech.datasets.vits.VITSDataset
			
 
				-  filelist: data/source/Genshin/filelist.test.txt
			
 
				-  sample_rate: ${sample_rate}
			
 
				-  hop_length: ${hop_length}
			
 
				-  suffix: ".lab"
			
 
				-  tokenizer: ${tokenizer}
			
 
				-
			
 
				-data:
			
 
				-  _target_: fish_speech.datasets.vits.VITSDataModule
			
 
				-  train_dataset: ${train_dataset}
			
 
				-  val_dataset: ${val_dataset}
			
 
				-  num_workers: 4
			
 
				-  batch_size: 8
			
 
				-  val_batch_size: 4
			
 
				-  tokenizer: ${tokenizer}
			
 
				-
			
 
				-# Model Configuration
			
 
				-model:
			
 
				-  _target_: fish_speech.models.vits_decoder.VITSDecoder
			
 
				-  sample_rate: ${sample_rate}
			
 
				-  hop_length: ${hop_length}
			
 
				-  freeze_discriminator: false
			
 
				-
			
 
				-  weight_mel: 45.0
			
 
				-  weight_kl: 1.0
			
 
				-
			
 
				-  generator:
			
 
				-    _target_: fish_speech.models.vits_decoder.modules.models.SynthesizerTrn
			
 
				-    spec_channels: 1025
			
 
				-    segment_size: 32
			
 
				-    inter_channels: 192
			
 
				-    hidden_channels: 192
			
 
				-    filter_channels: 768
			
 
				-    n_heads: 2
			
 
				-    n_layers: 6
			
 
				-    kernel_size: 3
			
 
				-    p_dropout: 0.1
			
 
				-    resblock: "1"
			
 
				-    resblock_kernel_sizes: [3, 7, 11]
			
 
				-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
			
 
				-    upsample_rates: [8, 8, 2, 2, 2]
			
 
				-    upsample_initial_channel: 512
			
 
				-    upsample_kernel_sizes: [16, 16, 8, 2, 2]
			
 
				-    gin_channels: 512
			
 
				-    vq_mask_ratio: 0.2
			
 
				-    ref_mask_ratio: 0.2
			
 
				-
			
 
				-  discriminator:
			
 
				-    _target_: fish_speech.models.vits_decoder.modules.models.EnsembledDiscriminator
			
 
				-    periods: [2, 3, 5, 7, 11]
			
 
				-
			
 
				-  mel_transform:
			
 
				-    _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
			
 
				-    sample_rate: ${sample_rate}
			
 
				-    n_fft: ${n_fft}
			
 
				-    hop_length: ${hop_length}
			
 
				-    win_length: ${win_length}
			
 
				-    n_mels: ${num_mels}
			
 
				-
			
 
				-  spec_transform:
			
 
				-    _target_: fish_speech.utils.spectrogram.LinearSpectrogram
			
 
				-    n_fft: ${n_fft}
			
 
				-    hop_length: ${hop_length}
			
 
				-    win_length: ${win_length}
			
 
				-    mode: pow2_sqrt
			
 
				-  
			
 
				-  optimizer:
			
 
				-    _target_: torch.optim.AdamW
			
 
				-    _partial_: true
			
 
				-    lr: 1e-4
			
 
				-    betas: [0.8, 0.99]
			
 
				-    eps: 1e-6
			
 
				-
			
 
				-  lr_scheduler:
			
 
				-    _target_: torch.optim.lr_scheduler.ExponentialLR
			
 
				-    _partial_: true
			
 
				-    gamma: 0.999999
			
 
				-
			
 
				-callbacks:
			
 
				-  grad_norm_monitor:
			
 
				-    sub_module: 
			
 
				-      - generator
			
 
				-      - discriminator
			
 
				-
			
 
				-  model_checkpoint:
			
 
				-    every_n_train_steps: 1000
			
 
				-    save_top_k: 10
			
--- a/fish_speech/configs/vqgan_finetune.yaml
+++ b/fish_speech/configs/vqgan_finetune.yaml
@@ -1,137 +0,0 @@
 
				-defaults:
			
 
				-  - base
			
 
				-  - _self_
			
 
				-
			
 
				-project: vq-gan-finetune
			
 
				-ckpt_path: checkpoints/vq-gan-group-fsq-2x1024.pth
			
 
				-resume_weights_only: true
			
 
				-
			
 
				-# Lightning Trainer
			
 
				-trainer:
			
 
				-  accelerator: gpu
			
 
				-  devices: auto
			
 
				-  precision: bf16-mixed
			
 
				-  max_steps: 100_000
			
 
				-  val_check_interval: 5000
			
 
				-  strategy:
			
 
				-    find_unused_parameters: true
			
 
				-
			
 
				-sample_rate: 44100
			
 
				-hop_length: 512
			
 
				-num_mels: 128
			
 
				-n_fft: 2048
			
 
				-win_length: 2048
			
 
				-
			
 
				-# Dataset Configuration
			
 
				-train_dataset:
			
 
				-  _target_: fish_speech.datasets.vqgan.VQGANDataset
			
 
				-  filelist: data/vq_train_filelist.txt
			
 
				-  sample_rate: ${sample_rate}
			
 
				-  hop_length: ${hop_length}
			
 
				-  slice_frames: 512
			
 
				-
			
 
				-val_dataset:
			
 
				-  _target_: fish_speech.datasets.vqgan.VQGANDataset
			
 
				-  filelist: data/vq_val_filelist.txt
			
 
				-  sample_rate: ${sample_rate}
			
 
				-  hop_length: ${hop_length}
			
 
				-
			
 
				-data:
			
 
				-  _target_: fish_speech.datasets.vqgan.VQGANDataModule
			
 
				-  train_dataset: ${train_dataset}
			
 
				-  val_dataset: ${val_dataset}
			
 
				-  num_workers: 4
			
 
				-  batch_size: 16
			
 
				-  val_batch_size: 16
			
 
				-
			
 
				-# Model Configuration
			
 
				-model:
			
 
				-  _target_: fish_speech.models.vqgan.VQGAN
			
 
				-
			
 
				-  sampling_rate: ${sample_rate}
			
 
				-  weight_adv: 0.2
			
 
				-  weight_vq: 1.0
			
 
				-  weight_mel: 1.0
			
 
				-
			
 
				-  # Important: Set the freeze_encoder to true to only train the decoder
			
 
				-  freeze_encoder: true
			
 
				-
			
 
				-  encoder:
			
 
				-    _target_: fish_speech.models.vqgan.modules.wavenet.WaveNet
			
 
				-    input_channels: ${num_mels}
			
 
				-    residual_channels: 768
			
 
				-    residual_layers: 20
			
 
				-    dilation_cycle: 4
			
 
				-  
			
 
				-  quantizer:
			
 
				-    _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
			
 
				-    input_dim: 768
			
 
				-    n_codebooks: 1
			
 
				-    n_groups: 2
			
 
				-    levels: [8, 5, 5, 5]
			
 
				-
			
 
				-  decoder:
			
 
				-    _target_: fish_speech.models.vqgan.modules.wavenet.WaveNet
			
 
				-    output_channels: ${num_mels}
			
 
				-    residual_channels: 768
			
 
				-    residual_layers: 20
			
 
				-    dilation_cycle: 4
			
 
				-    condition_channels: 768
			
 
				-  
			
 
				-  discriminator:
			
 
				-    _target_: fish_speech.models.vqgan.modules.discriminator.Discriminator
			
 
				-
			
 
				-  vocoder:
			
 
				-    _target_: fish_speech.models.vqgan.modules.firefly.FireflyBase
			
 
				-    ckpt_path: null # You may download the pretrained vocoder and set the path here
			
 
				-
			
 
				-  encode_mel_transform:
			
 
				-    _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
			
 
				-    sample_rate: ${sample_rate}
			
 
				-    n_fft: ${n_fft}
			
 
				-    hop_length: ${hop_length}
			
 
				-    win_length: ${win_length}
			
 
				-    n_mels: ${num_mels}
			
 
				-    f_min: 0.0
			
 
				-    f_max: 8000.0
			
 
				-
			
 
				-  gt_mel_transform:
			
 
				-    _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
			
 
				-    sample_rate: ${sample_rate}
			
 
				-    n_fft: ${n_fft}
			
 
				-    hop_length: ${hop_length}
			
 
				-    win_length: ${win_length}
			
 
				-    n_mels: ${num_mels}
			
 
				-
			
 
				-  optimizer:
			
 
				-    _target_: torch.optim.AdamW
			
 
				-    _partial_: true
			
 
				-    lr: 4e-5
			
 
				-    betas: [0.8, 0.99]
			
 
				-    eps: 1e-5
			
 
				-    weight_decay: 0.01
			
 
				-
			
 
				-  lr_scheduler:
			
 
				-    _target_: torch.optim.lr_scheduler.LambdaLR
			
 
				-    _partial_: true
			
 
				-    lr_lambda:
			
 
				-      _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
			
 
				-      _partial_: true
			
 
				-      num_warmup_steps: 0
			
 
				-      num_training_steps: ${trainer.max_steps}
			
 
				-      final_lr_ratio: 0
			
 
				-
			
 
				-callbacks:
			
 
				-  model_summary:
			
 
				-    _target_: lightning.pytorch.callbacks.ModelSummary
			
 
				-    max_depth: 1
			
 
				-
			
 
				-  model_checkpoint:
			
 
				-    every_n_train_steps: ${trainer.val_check_interval}
			
 
				-
			
 
				-  grad_norm_monitor:
			
 
				-    sub_module: 
			
 
				-      - encoder
			
 
				-      - decoder
			
 
				-      - quantizer
			
 
				-      - discriminator
			
--- a/fish_speech/configs/vqgan_pretrain.yaml
+++ b/fish_speech/configs/vqgan_pretrain.yaml
@@ -1,140 +0,0 @@
 
				-defaults:
			
 
				-  - base
			
 
				-  - _self_
			
 
				-
			
 
				-project: vq-gan-pretrain
			
 
				-
			
 
				-# Lightning Trainer
			
 
				-trainer:
			
 
				-  accelerator: gpu
			
 
				-  devices: auto
			
 
				-  precision: bf16-mixed
			
 
				-  max_steps: 1_000_000
			
 
				-  val_check_interval: 5000
			
 
				-  strategy:
			
 
				-    find_unused_parameters: true
			
 
				-
			
 
				-sample_rate: 44100
			
 
				-hop_length: 512
			
 
				-num_mels: 128
			
 
				-n_fft: 2048
			
 
				-win_length: 2048
			
 
				-
			
 
				-# Dataset Configuration
			
 
				-train_dataset:
			
 
				-  _target_: torch.utils.data.ConcatDataset
			
 
				-  datasets:
			
 
				-    - _target_: fish_speech.datasets.vqgan.VQGANDataset
			
 
				-      filelist: data/gigaspeech/vq_train_filelist.txt
			
 
				-      sample_rate: ${sample_rate}
			
 
				-      hop_length: ${hop_length}
			
 
				-      slice_frames: 512
			
 
				-    - _target_: fish_speech.datasets.vqgan.VQGANDataset
			
 
				-      filelist: data/sft/vq_train_filelist.txt
			
 
				-      sample_rate: ${sample_rate}
			
 
				-      hop_length: ${hop_length}
			
 
				-      slice_frames: 512
			
 
				-
			
 
				-val_dataset:
			
 
				-  _target_: fish_speech.datasets.vqgan.VQGANDataset
			
 
				-  filelist: data/sft/vq_val_filelist.txt
			
 
				-  sample_rate: ${sample_rate}
			
 
				-  hop_length: ${hop_length}
			
 
				-
			
 
				-data:
			
 
				-  _target_: fish_speech.datasets.vqgan.VQGANDataModule
			
 
				-  train_dataset: ${train_dataset}
			
 
				-  val_dataset: ${val_dataset}
			
 
				-  num_workers: 4
			
 
				-  batch_size: 32
			
 
				-  val_batch_size: 32
			
 
				-
			
 
				-# Model Configuration
			
 
				-model:
			
 
				-  _target_: fish_speech.models.vqgan.VQGAN
			
 
				-
			
 
				-  sampling_rate: ${sample_rate}
			
 
				-  weight_adv: 0.2
			
 
				-  weight_vq: 1.0
			
 
				-  weight_mel: 1.0
			
 
				-  freeze_encoder: false
			
 
				-
			
 
				-  encoder:
			
 
				-    _target_: fish_speech.models.vqgan.modules.wavenet.WaveNet
			
 
				-    input_channels: ${num_mels}
			
 
				-    residual_channels: 768
			
 
				-    residual_layers: 20
			
 
				-    dilation_cycle: 4
			
 
				-  
			
 
				-  quantizer:
			
 
				-    _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
			
 
				-    input_dim: 768
			
 
				-    n_codebooks: 1
			
 
				-    n_groups: 2
			
 
				-    levels: [8, 5, 5, 5]
			
 
				-
			
 
				-  decoder:
			
 
				-    _target_: fish_speech.models.vqgan.modules.wavenet.WaveNet
			
 
				-    output_channels: ${num_mels}
			
 
				-    residual_channels: 768
			
 
				-    residual_layers: 20
			
 
				-    dilation_cycle: 4
			
 
				-    condition_channels: 768
			
 
				-  
			
 
				-  discriminator:
			
 
				-    _target_: fish_speech.models.vqgan.modules.discriminator.Discriminator
			
 
				-
			
 
				-  vocoder:
			
 
				-    _target_: fish_speech.models.vqgan.modules.firefly.FireflyBase
			
 
				-    ckpt_path: null # You may download the pretrained vocoder and set the path here
			
 
				-
			
 
				-  encode_mel_transform:
			
 
				-    _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
			
 
				-    sample_rate: ${sample_rate}
			
 
				-    n_fft: ${n_fft}
			
 
				-    hop_length: ${hop_length}
			
 
				-    win_length: ${win_length}
			
 
				-    n_mels: ${num_mels}
			
 
				-    f_min: 0.0
			
 
				-    f_max: 8000.0
			
 
				-
			
 
				-  gt_mel_transform:
			
 
				-    _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
			
 
				-    sample_rate: ${sample_rate}
			
 
				-    n_fft: ${n_fft}
			
 
				-    hop_length: ${hop_length}
			
 
				-    win_length: ${win_length}
			
 
				-    n_mels: ${num_mels}
			
 
				-
			
 
				-  optimizer:
			
 
				-    _target_: torch.optim.AdamW
			
 
				-    _partial_: true
			
 
				-    lr: 1e-4
			
 
				-    betas: [0.8, 0.99]
			
 
				-    eps: 1e-5
			
 
				-    weight_decay: 0.01
			
 
				-
			
 
				-  lr_scheduler:
			
 
				-    _target_: torch.optim.lr_scheduler.LambdaLR
			
 
				-    _partial_: true
			
 
				-    lr_lambda:
			
 
				-      _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
			
 
				-      _partial_: true
			
 
				-      num_warmup_steps: 100
			
 
				-      num_training_steps: ${trainer.max_steps}
			
 
				-      final_lr_ratio: 0
			
 
				-
			
 
				-callbacks:
			
 
				-  model_summary:
			
 
				-    _target_: lightning.pytorch.callbacks.ModelSummary
			
 
				-    max_depth: 1
			
 
				-
			
 
				-  model_checkpoint:
			
 
				-    every_n_train_steps: ${trainer.val_check_interval}
			
 
				-
			
 
				-  grad_norm_monitor:
			
 
				-    sub_module: 
			
 
				-      - encoder
			
 
				-      - decoder
			
 
				-      - quantizer
			
 
				-      - discriminator
			
--- a/fish_speech/conversation.py
+++ b/fish_speech/conversation.py
@@ -0,0 +1,2 @@
 
				+SEMANTIC_TOKEN = "<|semantic|>"
			
 
				+CODEBOOK_PAD_TOKEN_ID = 0
			
--- a/fish_speech/datasets/concat_repeat.py
+++ b/fish_speech/datasets/concat_repeat.py
@@ -51,38 +51,3 @@ class ConcatRepeatDataset(Dataset):
 
				         dataset = self.datasets[dataset_idx]
			
 
				 
			
 
				         return dataset[sample_idx % len(dataset)]
			
 
				-
			
 
				-
			
 
				-class ConcatWeightedIterableDataset(IterableDataset):
			
 
				-    datasets: list[IterableDataset]
			
 
				-    weights: list[float]
			
 
				-
			
 
				-    def __init__(self, datasets: Iterable[IterableDataset], weights: list[float]):
			
 
				-        super().__init__()
			
 
				-
			
 
				-        total_weight = sum(weights)
			
 
				-        self.weights = [w / total_weight for w in weights]
			
 
				-        self.datasets = list(datasets)
			
 
				-
			
 
				-        assert len(self.datasets) > 0, "datasets should not be an empty iterable"
			
 
				-        assert len(self.datasets) == len(
			
 
				-            weights
			
 
				-        ), "datasets and repeats should have the same length"
			
 
				-
			
 
				-        for d in self.datasets:
			
 
				-            assert isinstance(
			
 
				-                d, IterableDataset
			
 
				-            ), "ConcatRepeatIterableDataset only supports IterableDataset"
			
 
				-
			
 
				-    def __iter__(self):
			
 
				-        all_datasets = [iter(dataset) for dataset in self.datasets]
			
 
				-        ids = list(range(len(self.datasets)))
			
 
				-
			
 
				-        while True:
			
 
				-            chosen_dataset = random.choices(ids, self.weights)[0]
			
 
				-
			
 
				-            try:
			
 
				-                yield next(all_datasets[chosen_dataset])
			
 
				-            except StopIteration:
			
 
				-                all_datasets[chosen_dataset] = iter(self.datasets[chosen_dataset])
			
 
				-                yield next(all_datasets[chosen_dataset])
			
--- a/fish_speech/datasets/prompts.py
+++ b/fish_speech/datasets/prompts.py
@@ -0,0 +1,381 @@
 
				+# "Transcribe the following audio into text."
			
 
				+# "Transcribe what you will hear."
			
 
				+
			
 
				+asr_instructions = [
			
 
				+    "Transcribe:",
			
 
				+    "Transcribe the following audio into text.",
			
 
				+    "Convert the audio you're about to hear into written text.",
			
 
				+    "Please write down what you hear in the audio file.",
			
 
				+    "Listen to the audio and type out its contents.",
			
 
				+    "Your task is to write the audio's content in text form.",
			
 
				+    "Transcribe the content of the audio into text.",
			
 
				+    "Transform the given audio into a textual format.",
			
 
				+    "Listen to the following sound clip and transcribe it.",
			
 
				+    "The audio provided should be converted into written words.",
			
 
				+    "Document the audio in text.",
			
 
				+    "Put the audio's dialogue into written form.",
			
 
				+    "Capture the audio's message in text.",
			
 
				+    "Turn the sound file's speech into text.",
			
 
				+    "Render the audio into a text version.",
			
 
				+    "Translate the audio recording to text.",
			
 
				+    "Write out the dialogue from the audio.",
			
 
				+    "Listen and transcribe the audio into words.",
			
 
				+    "Change the audio into a written transcript.",
			
 
				+    "Your job is to transcribe the audio to text.",
			
 
				+    "Please transcribe the spoken words into text.",
			
 
				+    "The task is to convert audio speech into written text.",
			
 
				+    "Make a text transcript of the following audio.",
			
 
				+    "Decode the audio into a written document.",
			
 
				+    "Write down the transcription of the audio.",
			
 
				+    "Please provide a text version of this audio.",
			
 
				+    "The objective is to transcribe the audio into readable text.",
			
 
				+    "Listen carefully and type out the audio.",
			
 
				+    "Transform this audio clip into a text document.",
			
 
				+    "Your assignment is to transcribe this audio.",
			
 
				+    "Transcribe this sound recording into text format.",
			
 
				+    "The goal is to turn the audio into text.",
			
 
				+    "Your duty is to document the audio in written form.",
			
 
				+    "Listen to this audio piece and write down its contents.",
			
 
				+    "The task is converting the audio into text.",
			
 
				+    "Please create a textual transcription of the audio.",
			
 
				+    "Capture in writing what is said in the audio.",
			
 
				+    "Transcribe the audible content into a text format.",
			
 
				+    "The mission is to transcribe the audio into text.",
			
 
				+    "Your task: convert the audio to text.",
			
 
				+    "Write the contents of the audio as text.",
			
 
				+    "Listen to the clip and transcribe its audio to text.",
			
 
				+    "Transcribe the given audio track into written words.",
			
 
				+    "The assignment is to write out the audio in text.",
			
 
				+    "Convert the spoken words into text.",
			
 
				+    "Transcribe the voice recording into text.",
			
 
				+    "Your task is to make a written record of the audio.",
			
 
				+    "Listen to the audio and reproduce it in text.",
			
 
				+    "Transcribe the following sound into written text.",
			
 
				+    "Your challenge is to transcribe the audio into written form.",
			
 
				+    "Make a written version of the audio.",
			
 
				+    "Take the audio and transcribe it to text.",
			
 
				+    "Write down everything you hear in the audio.",
			
 
				+    "Please put the audio into text format.",
			
 
				+    "Your role is to transcribe the following audio into text.",
			
 
				+    "Convert the audio message into written text.",
			
 
				+    "Provide a written transcription of the audio.",
			
 
				+    "Listen and convert the audio to text.",
			
 
				+    "The requirement is to transcribe the audio into text form.",
			
 
				+    "Document in text what the audio says.",
			
 
				+    "Transcribe into text what you hear in the audio.",
			
 
				+    "Translate the audio file's contents into text.",
			
 
				+    "The task is to create a text transcript of the audio.",
			
 
				+    "Your assignment: Translate the audio into written words.",
			
 
				+    "Write a textual representation of the audio.",
			
 
				+    "Capture the essence of the audio in text.",
			
 
				+    "Your job: Listen to the audio and transcribe it.",
			
 
				+    "Turn the audio content into a text transcript.",
			
 
				+    "The task at hand is to transcribe the audio to text.",
			
 
				+    "Reproduce the audio in text form.",
			
 
				+    "Your mission: Convert the audio into a textual format.",
			
 
				+    "Transcribe what is spoken in the audio into text.",
			
 
				+    "Create a written version of what's in the audio.",
			
 
				+    "Transform the spoken audio into text.",
			
 
				+    "Document the spoken words in the audio as text.",
			
 
				+    "The objective is to write down the audio in text.",
			
 
				+    "Your goal: Transcribe the audio into text.",
			
 
				+    "Please convert the audio file into text.",
			
 
				+    "Transcribe the audio clip into written text.",
			
 
				+    "Listen to the audio and transcribe the speech into text.",
			
 
				+    "Transform the voice from the audio into written words.",
			
 
				+    "The task is to write the audio's speech in text form.",
			
 
				+    "Your duty: Write down what the audio says.",
			
 
				+    "Turn the given audio into a written format.",
			
 
				+    "Write in text form what is said in the audio.",
			
 
				+    "Your task: Document the audio in text.",
			
 
				+    "Provide a text transcription of the audio.",
			
 
				+    "Provide a text transcription of the audio.",
			
 
				+    "Write down the audio you listen to.",
			
 
				+    "Type out the spoken words you hear.",
			
 
				+    "Document the audio content verbatim.",
			
 
				+    "Transcribe the spoken content accurately.",
			
 
				+    "Convert the audio you hear into text.",
			
 
				+    "Record in writing what is said in the audio.",
			
 
				+    "Capture the spoken words in written form.",
			
 
				+    "Translate the audio into written text.",
			
 
				+    "Jot down the words you hear in the audio.",
			
 
				+    "Put into writing the spoken words you hear.",
			
 
				+    "Transcribe the auditory information verbatim.",
			
 
				+    "Note down the dialogue from the audio.",
			
 
				+    "Write out the spoken words from the audio.",
			
 
				+    "Transcribe the oral presentation into text.",
			
 
				+    "Render the spoken audio into written form.",
			
 
				+    "Reproduce the spoken words in text form.",
			
 
				+    "Document what is being said in the audio.",
			
 
				+    "Translate the spoken word into written form.",
			
 
				+    "Write verbatim what you hear in the audio.",
			
 
				+    "Capture in writing the contents of the audio.",
			
 
				+    "Transcribe verbatim the spoken words.",
			
 
				+    "Write down verbatim what is spoken.",
			
 
				+    "Transcribe the sounds into words on paper.",
			
 
				+    "Translate the sounds you hear into words.",
			
 
				+    "Write the spoken words in text form.",
			
 
				+    "Reproduce the audio content in writing.",
			
 
				+    "Note verbatim what is said in the audio.",
			
 
				+    "Put the audio content into written words.",
			
 
				+    "Record the spoken words into text format.",
			
 
				+    "Transcribe the audio into a written document.",
			
 
				+    "Write down exactly what you hear.",
			
 
				+    "Type out the content of the audio.",
			
 
				+    "Document the words spoken in the audio.",
			
 
				+    "Translate the verbal content into text.",
			
 
				+    "Convert what you hear into written words.",
			
 
				+    "Capture the essence of the audio in writing.",
			
 
				+    "Reproduce the spoken content in written form.",
			
 
				+    "Jot down exactly what is said in the audio.",
			
 
				+    "Document every word you hear in the audio.",
			
 
				+    "Record the audio content by writing it down.",
			
 
				+    "Capture the audio's spoken words in text.",
			
 
				+    "Turn the spoken audio into a written transcript.",
			
 
				+    "Write down the contents of the audio verbatim.",
			
 
				+    "Transcribe the voice you hear into text.",
			
 
				+    "Convert the spoken audio into text format.",
			
 
				+    "Type what is being spoken in the audio.",
			
 
				+    "Translate the audio speech into written words.",
			
 
				+    "Write the audio's dialogue in written form.",
			
 
				+    "Record the verbal content as written text.",
			
 
				+    "Transcribe the spoken parts of the audio.",
			
 
				+    "Note down everything you hear in the audio.",
			
 
				+    "Capture every word from the audio in text.",
			
 
				+    "Put the spoken audio into text form.",
			
 
				+    "Transcribe the audible content into words.",
			
 
				+    "Translate the oral content into written text.",
			
 
				+    "Type out everything heard in the audio.",
			
 
				+    "Write down the spoken parts verbatim.",
			
 
				+    "Document the spoken audio in text form.",
			
 
				+    "Capture the verbal exchanges in written text.",
			
 
				+    "Transcribe each word you hear accurately.",
			
 
				+    "Turn the audio into a textual document.",
			
 
				+    "Transcribe the sound into written words.",
			
 
				+    "Write the audio transcript in your own words.",
			
 
				+    "Document in text what you hear in the audio.",
			
 
				+    "Record in text the spoken parts of the audio.",
			
 
				+    "Transcribe the narrative you hear into text.",
			
 
				+    "Capture the spoken narrative in written form.",
			
 
				+    "Convert the verbal audio into written script.",
			
 
				+    "Note down the spoken words in the audio.",
			
 
				+    "Write in text form what is spoken in the audio.",
			
 
				+    "Record the audio's spoken words verbatim.",
			
 
				+    "Jot down the audio's dialogue accurately.",
			
 
				+    "Transcribe the verbal parts into written words.",
			
 
				+    "Translate the audio's spoken content into text.",
			
 
				+    "Document the audio dialogue in written form.",
			
 
				+    "Type out the words spoken in the audio verbatim.",
			
 
				+    "Write down word for word what is said in the audio.",
			
 
				+    "Transcribe the entire audio content into text.",
			
 
				+    "Note down precisely what is said in the audio.",
			
 
				+    "Capture in text the spoken content of the audio.",
			
 
				+    "Record the spoken audio into written language.",
			
 
				+    "Write the essence of the audio in text form.",
			
 
				+    "Transcribe the words you hear in the audio.",
			
 
				+    "Translate every spoken word into written text.",
			
 
				+    "Convert the oral speech into a written format.",
			
 
				+    "Jot down the words spoken in the audio.",
			
 
				+    "Record every word from the audio in writing.",
			
 
				+    "Document the entire audio in written form.",
			
 
				+    "Transcribe the spoken language into text.",
			
 
				+    "Write down the audio's words exactly as spoken.",
			
 
				+    "Capture the spoken word in written format.",
			
 
				+    "Type out verbatim the spoken audio content.",
			
 
				+    "Write precisely what you hear from the audio.",
			
 
				+]
			
 
				+
			
 
				+# "Read the following text with emotion."
			
 
				+# "Read the following text."
			
 
				+
			
 
				+tts_instructions = [
			
 
				+    "Speak:",
			
 
				+    "Expressively read the text that follows.",
			
 
				+    "Convey the upcoming text with emotion.",
			
 
				+    "Deliver the following passage with heartfelt expression.",
			
 
				+    "Evoke emotion while reading the text below.",
			
 
				+    "With feeling, please read the text that comes next.",
			
 
				+    "Infuse the upcoming words with emotional depth as you read.",
			
 
				+    "Let your emotions guide you as you read the following lines.",
			
 
				+    "Channel emotion into your reading of the next passage.",
			
 
				+    "Read the text below with a sense of emotion.",
			
 
				+    "Bring the following words to life with emotional expression.",
			
 
				+    "Engage emotionally with the text as you read it aloud.",
			
 
				+    "Imbue the subsequent text with feeling as you read.",
			
 
				+    "Read the following content with genuine emotion.",
			
 
				+    "Allow your feelings to resonate through the upcoming text.",
			
 
				+    "Emotionally interpret the text that follows.",
			
 
				+    "Read the ensuing passage with deep feeling.",
			
 
				+    "Convey the text below with genuine emotional depth.",
			
 
				+    "Read the text that comes next, letting your emotions flow.",
			
 
				+    "With emotion, present the following words.",
			
 
				+    "Let your emotional expression enhance the next text.",
			
 
				+    "Embrace emotion as you read the following passage.",
			
 
				+    "Read aloud the text below with emotive expression.",
			
 
				+    "Infuse the upcoming lines with emotional intensity.",
			
 
				+    "With sincerity, read the following text with emotion.",
			
 
				+    "Project emotion as you deliver the text that follows.",
			
 
				+    "Let the next words be read with a wealth of emotion.",
			
 
				+    "Give the upcoming text an emotional rendition.",
			
 
				+    "With emotion, read the text that is presented next.",
			
 
				+    "Convey the essence of the following text with heartfelt emotion.",
			
 
				+    "Inject emotional depth into your reading of the next passage.",
			
 
				+    "Bring out the emotional undertones in the following text.",
			
 
				+    "Embody the emotions as you read the text below.",
			
 
				+    "Express the following narrative with emotional depth.",
			
 
				+    "Let emotion permeate your reading of the upcoming passage.",
			
 
				+    "Interpret the following text with a rich emotional tone.",
			
 
				+    "Elicit emotion through your reading of the next content.",
			
 
				+    "Read the subsequent text with a deep emotional connection.",
			
 
				+    "Emote the essence of the text that follows in your reading.",
			
 
				+    "Render the following lines with emotional expression.",
			
 
				+    "Expressively interpret the upcoming text.",
			
 
				+    "Immerse in emotion as you read the following passage.",
			
 
				+    "Engage with the text below on an emotional level as you read.",
			
 
				+    "With emotional clarity, read the next text.",
			
 
				+    "Let an emotional depth inform your reading of the following words.",
			
 
				+    "Express the following content with deep emotional resonance.",
			
 
				+    "Deliver the upcoming text with a range of emotions.",
			
 
				+    "Narrate the following lines with emotional expressiveness.",
			
 
				+    "Convey emotional texture as you read the text below.",
			
 
				+    "Instill the next passage with emotive power as you read.",
			
 
				+    "Read the ensuing text with a palette of emotions.",
			
 
				+    "With a depth of feeling, present the next text.",
			
 
				+    "Inflect the upcoming words with emotional vibrancy.",
			
 
				+    "Emotionally engage with the text that follows in your reading.",
			
 
				+    "Lend emotional expression to the passage below.",
			
 
				+    "Evoke a spectrum of emotions as you read the next lines.",
			
 
				+    "Channel a rich emotional tone into the following text.",
			
 
				+    "With feeling, convey the essence of the upcoming passage.",
			
 
				+    "Read the text that comes next with emotional fervor.",
			
 
				+    "Render the following words with emotional authenticity.",
			
 
				+    "Give the upcoming passage an emotive interpretation.",
			
 
				+    "Allow your reading of the text below to be emotionally driven.",
			
 
				+    "Imbue the next lines with a sense of emotion.",
			
 
				+    "Emotionally animate the following text as you read.",
			
 
				+    "Bring emotional depth to the passage that follows.",
			
 
				+    "Articulate the text below with emotional nuance.",
			
 
				+    "Project a range of emotions as you read the upcoming text.",
			
 
				+    "With emotion, breathe life into the following words.",
			
 
				+    "Narrate the ensuing text with heartfelt emotion.",
			
 
				+    "Convey the text that follows with emotional richness.",
			
 
				+    "Read aloud the next passage with a depth of emotion.",
			
 
				+    "Emphasize emotional expression in your reading of the text below.",
			
 
				+    "Let your reading of the following lines be emotionally charged.",
			
 
				+    "With a heartfelt approach, read the upcoming text.",
			
 
				+    "Express the essence of emotion as you deliver the next passage.",
			
 
				+    "Read the following text, infused with emotional energy.",
			
 
				+    "Allow the text that comes next to be expressed with emotion.",
			
 
				+    "Convey the following passage with an emotional depth.",
			
 
				+    "Emotionally render the text that follows.",
			
 
				+    "With an emotional undertone, read the upcoming words.",
			
 
				+    "Read the text below, letting emotion guide your expression.",
			
 
				+    "Elicit an emotional response through your reading of the next passage.",
			
 
				+    "Give the following lines an emotive delivery.",
			
 
				+    "Read the upcoming text with emotional sincerity.",
			
 
				+    "Narrate the text that follows with an emotional touch.",
			
 
				+    "Deliver the following words with an emotive clarity.",
			
 
				+    "Express the next passage with a range of emotional tones.",
			
 
				+    "Immerse yourself emotionally in the text below as you read.",
			
 
				+    "Let the ensuing text be conveyed with profound emotion.",
			
 
				+    "Infuse the following lines with a sense of heartfelt emotion.",
			
 
				+    "Emotionally engage with the upcoming text in your reading.",
			
 
				+    "Convey deep emotion as you read the text that follows.",
			
 
				+    "Let your reading of the next passage be rich in emotion.",
			
 
				+    "With emotional depth, narrate the following text.",
			
 
				+    "Read the text below, capturing its emotional essence.",
			
 
				+    "Emote through your reading of the upcoming lines.",
			
 
				+    "Please read the text that follows aloud.",
			
 
				+    "Proceed to vocalize the upcoming text.",
			
 
				+    "Kindly articulate the subsequent text.",
			
 
				+    "Go ahead and pronounce the text below.",
			
 
				+    "Could you recite the forthcoming passage?",
			
 
				+    "Start reading the text below out loud.",
			
 
				+    "Announce the following text audibly.",
			
 
				+    "Voice the text that comes next.",
			
 
				+    "Read through the following lines aloud.",
			
 
				+    "Narrate the text presented below.",
			
 
				+    "Elevate your voice for the upcoming script.",
			
 
				+    "Broadcast the text that follows.",
			
 
				+    "Project the subsequent lines audibly.",
			
 
				+    "Give voice to the text underneath.",
			
 
				+    "Unfold the following text with your voice.",
			
 
				+    "Engage in reading the next piece of text aloud.",
			
 
				+    "Orate the following series of words.",
			
 
				+    "Enunciate the text appearing next.",
			
 
				+    "Verbally present the upcoming text.",
			
 
				+    "Articulate the passage that follows.",
			
 
				+    "Read aloud the text that's coming up.",
			
 
				+    "Proclaim the subsequent words.",
			
 
				+    "Vocalize the narrative below.",
			
 
				+    "Bring the following text to life by reading it aloud.",
			
 
				+    "Express the next text with your voice.",
			
 
				+    "Render the following text audibly.",
			
 
				+    "Voice out the lines that follow.",
			
 
				+    "Orally deliver the upcoming text.",
			
 
				+    "Loudly read out the text below.",
			
 
				+    "Share the next text by reading it out loud.",
			
 
				+    "Speak the following passage aloud.",
			
 
				+    "Let your voice carry the upcoming words.",
			
 
				+    "Annunciate the text that follows.",
			
 
				+    "Sound out the subsequent text.",
			
 
				+    "Aurally present the text below.",
			
 
				+    "Elocute the forthcoming lines.",
			
 
				+    "Recite the text below with clarity.",
			
 
				+    "Make the next text heard by reading aloud.",
			
 
				+    "Bring forth your voice for the following script.",
			
 
				+    "Read the text that ensues out loud.",
			
 
				+    "Deliver the following lines vocally.",
			
 
				+    "Voice the ensuing text.",
			
 
				+    "Publicly read the text that follows.",
			
 
				+    "Loudly narrate the subsequent text.",
			
 
				+    "Express the following text through your voice.",
			
 
				+    "Verbally articulate the next passage.",
			
 
				+    "Read the forthcoming text clearly.",
			
 
				+    "Announce the next set of words aloud.",
			
 
				+    "Broadcast the following narrative.",
			
 
				+    "Articulate the text coming up next.",
			
 
				+    "Enunciate the passage that follows clearly.",
			
 
				+    "Recite the subsequent text audibly.",
			
 
				+    "Speak out the text below.",
			
 
				+    "Project your voice with the following words.",
			
 
				+    "Read the next lines aloud.",
			
 
				+    "Vocalize the text that is to follow.",
			
 
				+    "Narrate aloud the text below.",
			
 
				+    "Orate the forthcoming script.",
			
 
				+    "Pronounce the next passage.",
			
 
				+    "Read out the subsequent text.",
			
 
				+    "Let the following words be heard by reading them aloud.",
			
 
				+    "Express the text that follows with your voice.",
			
 
				+    "Give audible life to the text below.",
			
 
				+    "Speak the ensuing text clearly.",
			
 
				+    "Make the forthcoming text audible.",
			
 
				+    "Project the next series of words audibly.",
			
 
				+    "Voice out the following narrative.",
			
 
				+    "Elevate the subsequent text with your voice.",
			
 
				+    "Bring the next passage to audible life.",
			
 
				+    "Read the lines that come next out loud.",
			
 
				+    "Announce the text below with clarity.",
			
 
				+    "Vocalize the script that follows.",
			
 
				+    "Narrate the following text with emphasis.",
			
 
				+    "Deliver the upcoming words with your voice.",
			
 
				+    "Articulate the next set of lines.",
			
 
				+    "Verbally convey the following text.",
			
 
				+    "Present the subsequent text vocally.",
			
 
				+    "Enunciate the upcoming passage loudly.",
			
 
				+    "Orally render the text that follows.",
			
 
				+    "Speak out the subsequent narrative.",
			
 
				+    "Proclaim the next text audibly.",
			
 
				+    "Elocute the following lines with clarity.",
			
 
				+    "Give voice to the upcoming script.",
			
 
				+    "Let your voice express the text below.",
			
 
				+    "Annunciate the following words clearly.",
			
 
				+    "Sound out the text that is next.",
			
 
				+    "Aurally convey the subsequent passage.",
			
 
				+    "Read the text up next aloud.",
			
 
				+]
			
 
				+
			
 
				+prompt_dict = {
			
 
				+    "asr": asr_instructions,
			
 
				+    "tts": tts_instructions,
			
 
				+}
			
--- a/fish_speech/datasets/text.py
+++ b/fish_speech/datasets/text.py
@@ -1,21 +1,29 @@
 
				+import gzip
			
 
				+import io
			
 
				+import json
			
 
				 import random
			
 
				 from dataclasses import dataclass
			
 
				-from itertools import chain
			
 
				 from pathlib import Path
			
 
				 from random import Random
			
 
				 from typing import Optional, Union
			
 
				 
			
 
				 import numpy as np
			
 
				-import pyarrow.parquet as pq
			
 
				 import torch
			
 
				 import torch.nn.functional as F
			
 
				-from datasets.download.streaming_download_manager import xopen
			
 
				-from huggingface_hub import HfApi
			
 
				+import zstandard as zstd
			
 
				 from lightning import LightningDataModule
			
 
				 from torch.distributed import get_rank, get_world_size, is_initialized
			
 
				 from torch.utils.data import DataLoader, IterableDataset, get_worker_info
			
 
				 from transformers import AutoTokenizer
			
 
				 
			
 
				+from fish_speech.conversation import (
			
 
				+    CODEBOOK_PAD_TOKEN_ID,
			
 
				+    SKIP_TEXT_STRING,
			
 
				+    Conversation,
			
 
				+    Message,
			
 
				+    encode_conversation,
			
 
				+)
			
 
				+from fish_speech.datasets.prompts import asr_instructions, tts_instructions
			
 
				 from fish_speech.datasets.protos.text_data_pb2 import SampledData
			
 
				 from fish_speech.datasets.protos.text_data_stream import read_pb_stream
			
 
				 from fish_speech.text.clean import clean_text
			
@@ -24,9 +32,7 @@ from fish_speech.utils.braceexpand import braceexpand
 
				 
			
 
				 log = RankedLogger(__name__, rank_zero_only=True)
			
 
				 
			
 
				-CODEBOOK_PAD_TOKEN_ID = 0
			
 
				-CODEBOOK_EOS_TOKEN_ID = 1
			
 
				-SKIP_TEXT_STRING = "<|skip_text|>"
			
 
				+DCTX = zstd.ZstdDecompressor(max_window_size=2**31)
			
 
				 
			
 
				 
			
 
				 def split_by_rank_worker(files):
			
@@ -56,43 +62,55 @@ def split_by_rank_worker(files):
 
				     return files
			
 
				 
			
 
				 
			
 
				-class StreamTextDataset(IterableDataset):
			
 
				+def expand_split_proto_files(proto_files, seed: int = 42):
			
 
				+    # Expand the proto files
			
 
				+    expanded_proto_files = []
			
 
				+    for filename in proto_files:
			
 
				+        for i in braceexpand(filename):
			
 
				+            i = Path(i)
			
 
				+            if i.is_file():
			
 
				+                expanded_proto_files.append(i)
			
 
				+            elif i.is_dir():
			
 
				+                expanded_proto_files.extend(i.rglob("*.proto"))
			
 
				+                expanded_proto_files.extend(i.rglob("*.protos"))
			
 
				+            else:
			
 
				+                raise ValueError(f"{i} is not a file or directory")
			
 
				+
			
 
				+    expanded_proto_files = sorted(expanded_proto_files)
			
 
				+    Random(seed).shuffle(expanded_proto_files)
			
 
				+    return split_by_rank_worker(expanded_proto_files)
			
 
				+
			
 
				+
			
 
				+class TextPretrainDataset(IterableDataset):
			
 
				     def __init__(
			
 
				         self,
			
 
				-        files: Optional[Union[list[str], str]] = None,
			
 
				-        prefix: Optional[str] = None,
			
 
				+        source: str,
			
 
				         seed: int = 42,
			
 
				-        parquet_batch_size: int = 10000,
			
 
				-        repo: str = "uonlp/CulturaX",
			
 
				         max_length: int = 1024,
			
 
				         tokenizer: AutoTokenizer = None,
			
 
				+        num_codebooks: int = 2,
			
 
				     ):
			
 
				         super().__init__()
			
 
				 
			
 
				+        self.source = Path(source)
			
 
				         self.seed = seed
			
 
				-        self.parquet_batch_size = parquet_batch_size
			
 
				-        self.repo = repo
			
 
				         self.max_length = max_length
			
 
				         self.tokenizer = tokenizer
			
 
				+        self.num_codebooks = num_codebooks
			
 
				 
			
 
				-        if files is None and prefix is None:
			
 
				-            raise ValueError("Either files or prefix must be specified")
			
 
				-
			
 
				-        if prefix is not None:
			
 
				-            files = HfApi().list_repo_files(repo, repo_type="dataset")
			
 
				+        if self.source.is_file():
			
 
				+            with open(self.source, "r") as f:
			
 
				+                files = f.read().strip().split("\n")
			
 
				+            self.root = self.source.parent
			
 
				+        else:
			
 
				             files = [
			
 
				-                f for f in files if f.startswith(prefix) and f.endswith(".parquet")
			
 
				+                str(i.relative_to(self.source)) for i in self.source.rglob("*.jsonl")
			
 
				             ]
			
 
				-            log.info(f"Found {len(files)} files in {repo} with prefix {prefix}")
			
 
				-        else:
			
 
				-            if isinstance(files, str):
			
 
				-                files = [files]
			
 
				-
			
 
				-            files = list(chain.from_iterable(map(braceexpand, files)))
			
 
				-            log.info(f"Expanded {len(files)} files in {repo}")
			
 
				+            self.root = self.source
			
 
				 
			
 
				         # Get sharded files
			
 
				         self.files = sorted(files)
			
 
				+
			
 
				         Random(seed).shuffle(self.files)
			
 
				 
			
 
				     def __iter__(self):
			
@@ -105,142 +123,147 @@ class StreamTextDataset(IterableDataset):
 
				             except Exception as e:
			
 
				                 log.exception(f"Failed to parse {filename}: {e}")
			
 
				 
			
 
				-    def parse_data(self, filename: str):
			
 
				-        for data in self.parse_data_internal(filename):
			
 
				-            text = data["text"]
			
 
				+    def read_jsonl(self, filename: str):
			
 
				+        with open(self.root / filename, "rb") as f:
			
 
				+            if filename.endswith(".zst"):
			
 
				+                stream_reader = DCTX.stream_reader(f)
			
 
				+            elif filename.endswith(".gz"):
			
 
				+                stream_reader = gzip.open(f, "rb")
			
 
				+            elif filename.endswith(".jsonl"):
			
 
				+                stream_reader = f
			
 
				+            else:
			
 
				+                raise ValueError(f"Unknown file type: {filename}")
			
 
				 
			
 
				+            stream = io.TextIOWrapper(stream_reader, encoding="utf-8")
			
 
				+
			
 
				+            # Parse jsonl
			
 
				+            for line in stream:
			
 
				+                line = json.loads(line)
			
 
				+                yield line
			
 
				+
			
 
				+    def parse_data(self, filename: str):
			
 
				+        for line in self.read_jsonl(filename):
			
 
				             # encode
			
 
				             tokens = self.tokenizer.encode(
			
 
				-                text,
			
 
				+                line["text"],
			
 
				                 add_special_tokens=False,
			
 
				                 truncation=False,
			
 
				                 max_length=10**6,
			
 
				             )
			
 
				 
			
 
				-            # Random choice self.max_length
			
 
				-            if len(tokens) > self.max_length:
			
 
				-                start = random.randint(0, len(tokens) - self.max_length)
			
 
				-                tokens = tokens[start : start + self.max_length - 1]
			
 
				-
			
 
				             tokens = (
			
 
				                 [self.tokenizer.bos_token_id] + tokens + [self.tokenizer.eos_token_id]
			
 
				             )
			
 
				-            # Pad dims
			
 
				-            placeholder_multi_codebook = torch.zeros((4, len(tokens)), dtype=torch.long)
			
 
				-
			
 
				-            tokens = torch.concat(
			
 
				-                [
			
 
				-                    torch.tensor([tokens], dtype=torch.long),
			
 
				-                    placeholder_multi_codebook,
			
 
				-                ],
			
 
				-                dim=0,
			
 
				-            )
			
 
				+
			
 
				+            if len(tokens) > self.max_length:
			
 
				+                tokens = tokens[: self.max_length]
			
 
				+
			
 
				+            tokens = self.pad_codebooks(tokens)
			
 
				             labels = tokens.clone()
			
 
				             tokens = tokens[:, :-1]
			
 
				             labels = labels[:, 1:]
			
 
				-            labels[1:] = -100  # remove all placeholders
			
 
				+            labels[1:] = -100  # no loss on codebook
			
 
				 
			
 
				             yield {"tokens": tokens, "labels": labels}
			
 
				 
			
 
				-    def parse_data_internal(self, filename: str):
			
 
				-        url = f"https://huggingface.co/datasets/{self.repo}/resolve/main/{filename}"
			
 
				+    def pad_codebooks(self, tokens):
			
 
				+        placeholder_multi_codebook = (
			
 
				+            torch.zeros((self.num_codebooks, len(tokens)), dtype=torch.long)
			
 
				+            + CODEBOOK_PAD_TOKEN_ID
			
 
				+        )
			
 
				+        return torch.concat(
			
 
				+            [
			
 
				+                torch.tensor([tokens], dtype=torch.long),
			
 
				+                placeholder_multi_codebook,
			
 
				+            ],
			
 
				+            dim=0,
			
 
				+        )
			
 
				+
			
 
				 
			
 
				-        with xopen(url, mode="rb") as stream:
			
 
				-            parquet_file = pq.ParquetFile(stream)
			
 
				+class TextInstructionDataset(TextPretrainDataset):
			
 
				+    def parse_data(self, filename: str):
			
 
				+        for line in self.read_jsonl(filename):
			
 
				+            messages = []
			
 
				+            for conversation in line["conversations"]:
			
 
				+                role = {
			
 
				+                    "human": "user",
			
 
				+                    "gpt": "assistant",
			
 
				+                    "system": "system",
			
 
				+                }[conversation["from"]]
			
 
				+
			
 
				+                message = Message(
			
 
				+                    role=role,
			
 
				+                    parts=[conversation["value"]],
			
 
				+                )
			
 
				+                messages.append(message)
			
 
				+
			
 
				+            conversation = Conversation(messages=messages)
			
 
				+            tokens, labels = encode_conversation(
			
 
				+                conversation,
			
 
				+                self.tokenizer,
			
 
				+                num_codebooks=self.num_codebooks,
			
 
				+            )
			
 
				 
			
 
				-            for batch in parquet_file.iter_batches(
			
 
				-                batch_size=self.parquet_batch_size, columns=["text"]
			
 
				-            ):
			
 
				-                # In-batch shuffling
			
 
				-                texts = [{"text": text.as_py()} for text in batch["text"]]
			
 
				-                random.shuffle(texts)
			
 
				-                yield from texts
			
 
				+            yield {"tokens": tokens, "labels": labels}
			
 
				 
			
 
				 
			
 
				-class AutoAugTextDataset(IterableDataset):
			
 
				-    """
			
 
				-    Auto Augment Dataset by Speaker
			
 
				+def semantic_to_tensor(semantics):
			
 
				+    num_codebooks = len(semantics)
			
 
				+    codes = [[] for _ in range(num_codebooks)]
			
 
				 
			
 
				-    1. Random concatenate multiple sentences from the same speaker to form a longer sentence
			
 
				-    2. Automatically normalize the text
			
 
				+    for book_idx, book in zip(range(num_codebooks), semantics):
			
 
				+        for j in book.values:
			
 
				+            codes[book_idx].append(int(j))
			
 
				 
			
 
				-    For interactive mode, we use the following format (multiple sequences):
			
 
				-    <s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
			
 
				+    return torch.tensor(codes, dtype=torch.int)
			
 
				 
			
 
				-    For non-interactive mode, we use the following format (one long sequence):
			
 
				-    <s> [INST] text [/INST] ... </s>
			
 
				-    """
			
 
				 
			
 
				+class AutoTextSemanticInstructionDataset(IterableDataset):
			
 
				     def __init__(
			
 
				         self,
			
 
				         proto_files: list[str],
			
 
				         seed: int = 42,
			
 
				-        interactive_prob: float = 0.5,
			
 
				         max_length: int = 1024,
			
 
				         tokenizer: AutoTokenizer = None,
			
 
				-        use_speaker: bool | float = True,
			
 
				-        causual: bool = True,
			
 
				-        use_negative_samples: bool = False,
			
 
				+        causual: Union[bool, float] = True,
			
 
				         num_codebooks: Optional[int] = None,
			
 
				         skip_text_prob: float = 0.0,
			
 
				+        asr_prob: float = 0.0,
			
 
				     ):
			
 
				         """
			
 
				         Args:
			
 
				             proto_files: proto buf files if using local data
			
 
				             seed: random seed
			
 
				-            interactive_prob: probability to use interactive mode
			
 
				             max_length: max length of the text
			
 
				             tokenizer: tokenizer
			
 
				-            use_speaker: include speaker information in the prompt
			
 
				             causual: use causual sampling when using local data, disable will lead to random sampling
			
 
				-            use_negative_samples: generate negative samples
			
 
				             num_codebooks: number of codebooks, if None, it will be automatically detected
			
 
				             skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode
			
 
				+            asr_prob: probability to use ASR
			
 
				         """
			
 
				 
			
 
				         super().__init__()
			
 
				 
			
 
				-        assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
			
 
				+        assert 0 <= skip_text_prob <= 1, "skip_text_prob must be in [0, 1]"
			
 
				+        assert 0 <= asr_prob <= 1, "asr_prob must be in [0, 1]"
			
 
				 
			
 
				         self.seed = seed
			
 
				         self.max_length = max_length
			
 
				         self.tokenizer = tokenizer
			
 
				-        self.interactive_prob = interactive_prob
			
 
				-        self.use_speaker = use_speaker
			
 
				         self.proto_files = proto_files
			
 
				         self.causual = causual
			
 
				-        self.use_negative_samples = use_negative_samples
			
 
				         self.num_codebooks = num_codebooks
			
 
				         self.skip_text_prob = skip_text_prob
			
 
				-
			
 
				-        self.semantic_token_id = self.tokenizer.convert_tokens_to_ids("<|semantic|>")
			
 
				+        self.asr_prob = asr_prob
			
 
				         self.groups = None
			
 
				 
			
 
				     def init_mock_data_server(self):
			
 
				         if self.groups is not None:
			
 
				             return
			
 
				 
			
 
				-        # Expand the proto files
			
 
				-        expanded_proto_files = []
			
 
				-        for filename in self.proto_files:
			
 
				-            for i in braceexpand(filename):
			
 
				-                i = Path(i)
			
 
				-                if i.is_file():
			
 
				-                    expanded_proto_files.append(i)
			
 
				-                elif i.is_dir():
			
 
				-                    expanded_proto_files.extend(i.rglob("*.proto"))
			
 
				-                    expanded_proto_files.extend(i.rglob("*.protos"))
			
 
				-                else:
			
 
				-                    raise ValueError(f"{i} is not a file or directory")
			
 
				-
			
 
				-        expanded_proto_files = sorted(expanded_proto_files)
			
 
				-        Random(self.seed).shuffle(expanded_proto_files)
			
 
				-
			
 
				         self.groups = []
			
 
				-        shard_proto_files = split_by_rank_worker(expanded_proto_files)
			
 
				-        log.info(
			
 
				-            f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
			
 
				-        )
			
 
				+        shard_proto_files = expand_split_proto_files(self.proto_files, seed=self.seed)
			
 
				+        log.info(f"Reading {len(shard_proto_files)} files")
			
 
				 
			
 
				         count = 0
			
 
				         for filename in shard_proto_files:
			
@@ -279,7 +302,11 @@ class AutoAugTextDataset(IterableDataset):
 
				         # choice group based on their number of samples
			
 
				         group = random.choices(self.groups, weights=self.group_weights, k=1)[0]
			
 
				 
			
 
				-        if self.causual:
			
 
				+        causual = self.causual
			
 
				+        if isinstance(self.causual, float):
			
 
				+            causual = random.random() < self.causual
			
 
				+
			
 
				+        if causual:
			
 
				             # Sample in order
			
 
				             if num_samples >= len(group.sentences):
			
 
				                 samples = group.sentences
			
@@ -298,7 +325,6 @@ class AutoAugTextDataset(IterableDataset):
 
				         )
			
 
				 
			
 
				     def augment(self):
			
 
				-        final_text, final_semantic = [], []
			
 
				         response = self.sample_data()
			
 
				         if len(response.samples) == 0:
			
 
				             # Invalid group
			
@@ -306,29 +332,9 @@ class AutoAugTextDataset(IterableDataset):
 
				 
			
 
				         samples = list(response.samples)
			
 
				         idx = 0
			
 
				-        use_interactive = random.random() < self.interactive_prob
			
 
				-
			
 
				-        if use_interactive is False:
			
 
				-            # Random sample based on speaker using a truncated normal distribution
			
 
				-            a = torch.tensor([0], dtype=torch.float32)
			
 
				-            torch.nn.init.trunc_normal_(
			
 
				-                a,
			
 
				-                mean=self.max_length // 2,
			
 
				-                std=self.max_length // 4,
			
 
				-                a=10,
			
 
				-                b=self.max_length,
			
 
				-            )
			
 
				-            remaining_tokens = a.long().item() - 4
			
 
				-        else:
			
 
				-            remaining_tokens = self.max_length
			
 
				-
			
 
				-        # Use speaker
			
 
				-        if isinstance(self.use_speaker, float):
			
 
				-            use_speaker = random.random() < self.use_speaker
			
 
				-        else:
			
 
				-            use_speaker = self.use_speaker
			
 
				+        remaining_tokens = self.max_length
			
 
				 
			
 
				-        all_tokens, all_labels = [], []
			
 
				+        all_messages = []
			
 
				         while remaining_tokens > 0 and len(samples) > 0:
			
 
				             sentence = samples.pop(0)
			
 
				 
			
@@ -336,37 +342,52 @@ class AutoAugTextDataset(IterableDataset):
 
				             text, length = self.tokenize_sentence(text)
			
 
				             remaining_tokens -= length + len(sentence.semantics[0].values)
			
 
				 
			
 
				-            if use_interactive is False:
			
 
				-                final_text.append(text)
			
 
				-                final_semantic.append(sentence.semantics)
			
 
				+            # For interactive mode, we only apply speaker for the first sentence
			
 
				+            # [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST]
			
 
				+
			
 
				+            if random.random() < self.asr_prob:
			
 
				+                all_messages.append(
			
 
				+                    Message(
			
 
				+                        role="user",
			
 
				+                        parts=[
			
 
				+                            random.choice(asr_instructions),
			
 
				+                            semantic_to_tensor(sentence.semantics),
			
 
				+                        ],
			
 
				+                    )
			
 
				+                )
			
 
				+                all_messages.append(
			
 
				+                    Message(
			
 
				+                        role="assistant",
			
 
				+                        parts=[text],
			
 
				+                    )
			
 
				+                )
			
 
				             else:
			
 
				-                # For interactive mode, we only apply speaker for the first sentence
			
 
				-                # [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST]
			
 
				-                tokens, labels = self.pack_sentences(
			
 
				-                    sentences=[text],
			
 
				-                    semantics=[sentence.semantics],
			
 
				-                    speaker=response.name if use_speaker else None,
			
 
				-                    add_bos=idx == 0,
			
 
				-                    skip_text=random.random() < self.skip_text_prob,
			
 
				+                skip_text = random.random() < self.skip_text_prob
			
 
				+                if skip_text:
			
 
				+                    text = SKIP_TEXT_STRING
			
 
				+
			
 
				+                all_messages.append(
			
 
				+                    Message(
			
 
				+                        role="user",
			
 
				+                        parts=[random.choice(tts_instructions) + text],
			
 
				+                        mask_labels=skip_text,
			
 
				+                    )
			
 
				+                )
			
 
				+                all_messages.append(
			
 
				+                    Message(
			
 
				+                        role="assistant",
			
 
				+                        parts=[semantic_to_tensor(sentence.semantics)],
			
 
				+                        mask_labels=skip_text,
			
 
				+                    )
			
 
				                 )
			
 
				-
			
 
				-                all_tokens.append(tokens)
			
 
				-                all_labels.append(labels)
			
 
				 
			
 
				             idx += 1
			
 
				 
			
 
				-        if use_interactive is False:
			
 
				-            tokens, labels = self.pack_sentences(
			
 
				-                final_text,
			
 
				-                semantics=final_semantic,
			
 
				-                speaker=response.name if use_speaker else None,
			
 
				-                add_bos=True,
			
 
				-            )
			
 
				-            all_tokens.append(tokens)
			
 
				-            all_labels.append(labels)
			
 
				-
			
 
				-        tokens = torch.cat(all_tokens, dim=1)
			
 
				-        labels = torch.cat(all_labels, dim=1)
			
 
				+        tokens, labels = encode_conversation(
			
 
				+            Conversation(messages=all_messages),
			
 
				+            self.tokenizer,
			
 
				+            num_codebooks=self.num_codebooks,
			
 
				+        )
			
 
				 
			
 
				         # Verify that the length is correct
			
 
				         assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
			
@@ -374,156 +395,71 @@ class AutoAugTextDataset(IterableDataset):
 
				         # Verify bos token
			
 
				         assert tokens[0, 0] == self.tokenizer.bos_token_id
			
 
				 
			
 
				-        data = {"tokens": tokens, "labels": labels}
			
 
				-
			
 
				-        if self.use_negative_samples:
			
 
				-            negative_samples = self.generate_negative_samples(all_tokens, all_labels)
			
 
				-            data.update(negative_samples)
			
 
				-
			
 
				-        return data
			
 
				-
			
 
				-    def generate_negative_samples(self, all_tokens, all_labels):
			
 
				-        new_tokens, new_labels = [], []
			
 
				-
			
 
				-        for tokens, labels in zip(all_tokens, all_labels):
			
 
				-            # If all codebooks are not -100, we find where it starts
			
 
				-            start = torch.where(labels[1:].sum(0) != -100 * (labels.size(0) - 1))[0][0]
			
 
				-            assert (labels[1:, start:] != -100).all()  # This shouldn't happen
			
 
				+        return {"tokens": tokens, "labels": labels}
			
 
				 
			
 
				-            mode = random.choice(["repeat", "lost", "noise"])
			
 
				-            begin = random.randint(start, labels.size(1) - 1)
			
 
				-            end = random.randint(begin, labels.size(1) - 1)
			
 
				 
			
 
				-            if mode == "repeat":
			
 
				-                tokens = torch.cat(
			
 
				-                    [
			
 
				-                        tokens[:, :begin],
			
 
				-                        tokens[:, begin:end],
			
 
				-                        tokens[:, begin:end],
			
 
				-                        tokens[:, end:],
			
 
				-                    ],
			
 
				-                    dim=1,
			
 
				-                )
			
 
				-                labels = torch.cat(
			
 
				-                    [
			
 
				-                        labels[:, :begin],
			
 
				-                        labels[:, begin:end],
			
 
				-                        labels[:, begin:end],
			
 
				-                        labels[:, end:],
			
 
				-                    ],
			
 
				-                    dim=1,
			
 
				-                )
			
 
				-            elif mode == "lost":
			
 
				-                tokens = torch.cat([tokens[:, :begin], tokens[:, end:]], dim=1)
			
 
				-                labels = torch.cat([labels[:, :begin], labels[:, end:]], dim=1)
			
 
				-            elif mode == "noise":
			
 
				-                middle_tokens, middle_labels = (
			
 
				-                    tokens[:, begin:end],
			
 
				-                    labels[:, begin:end],
			
 
				-                )
			
 
				-                random_order0 = torch.randperm(middle_tokens.size(1))
			
 
				-                random_order1 = torch.randperm(middle_tokens.size(1))
			
 
				-                middle_tokens = middle_tokens[:, random_order0]
			
 
				-                middle_labels = middle_labels[:, random_order1]
			
 
				-                tokens = torch.cat(
			
 
				-                    [tokens[:, :begin], middle_tokens, tokens[:, end:]], dim=1
			
 
				-                )
			
 
				-                labels = torch.cat(
			
 
				-                    [labels[:, :begin], middle_labels, labels[:, end:]], dim=1
			
 
				-                )
			
 
				-
			
 
				-            new_tokens.append(tokens)
			
 
				-            new_labels.append(labels)
			
 
				-
			
 
				-        tokens = torch.cat(new_tokens, dim=1)
			
 
				-        labels = torch.cat(new_labels, dim=1)
			
 
				-
			
 
				-        # Verify that the length is correct
			
 
				-        assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
			
 
				-
			
 
				-        return {"negative_tokens": tokens, "negative_labels": labels}
			
 
				-
			
 
				-    def pack_sentences(
			
 
				+class SemanticInstructionDataset(IterableDataset):
			
 
				+    def __init__(
			
 
				         self,
			
 
				-        sentences: list[str],
			
 
				-        semantics: list,
			
 
				-        speaker: Optional[str] = None,
			
 
				-        add_bos: bool = True,
			
 
				-        skip_text: bool = False,
			
 
				+        proto_files: list[str],
			
 
				+        seed: int = 42,
			
 
				+        max_length: int = 1024,
			
 
				+        tokenizer: AutoTokenizer = None,
			
 
				+        num_codebooks: Optional[int] = None,
			
 
				     ):
			
 
				-        if speaker is None:
			
 
				-            speaker = "assistant"
			
 
				+        super().__init__()
			
 
				 
			
 
				-        cated_sentences = " ".join(sentences)
			
 
				-        if skip_text:
			
 
				-            cated_sentences = SKIP_TEXT_STRING
			
 
				+        self.seed = seed
			
 
				+        self.max_length = max_length
			
 
				+        self.tokenizer = tokenizer
			
 
				+        self.proto_files = proto_files
			
 
				+        self.num_codebooks = num_codebooks
			
 
				 
			
 
				-        final_text = "<|im_start|>user<|im_sep|>" + cated_sentences + "<|im_end|>"
			
 
				-        final_text = final_text + f"<|im_start|>{speaker}<|im_sep|>"
			
 
				+    def get_data_generator(self):
			
 
				+        shard_proto_files = expand_split_proto_files(self.proto_files, seed=self.seed)
			
 
				+        random.shuffle(shard_proto_files)
			
 
				+        log.info(f"Fetched {len(shard_proto_files)} files")
			
 
				 
			
 
				-        encoded = self.tokenizer.encode(
			
 
				-            final_text,
			
 
				-            add_special_tokens=False,
			
 
				-            truncation=False,
			
 
				-            max_length=10**6,
			
 
				-        )
			
 
				-        semantic_length = sum([len(i[0].values) for i in semantics])
			
 
				-        prompt_length = len(encoded)
			
 
				-        num_codebooks = (
			
 
				-            len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
			
 
				-        )
			
 
				+        for filename in shard_proto_files:
			
 
				+            with open(filename, "rb") as f:
			
 
				+                for group in read_pb_stream(f):
			
 
				+                    yield group
			
 
				 
			
 
				-        bos_bias = 1 if add_bos else 0
			
 
				+    def pack_one_group(self, group):
			
 
				+        sentences = group.sentences
			
 
				 
			
 
				-        # Pack the tokens and semantics (add <s> and </s> to semantic tokens)
			
 
				-        tokens = (
			
 
				-            encoded
			
 
				-            + [self.semantic_token_id] * semantic_length
			
 
				-            + self.tokenizer.convert_tokens_to_ids(
			
 
				-                ["<|im_end|>", "<|end_of_sequence|>"]
			
 
				+        messages = []
			
 
				+        for idx, sentence in enumerate(sentences):
			
 
				+            role = "user" if idx % 2 == 0 else "assistant"
			
 
				+            semantic = semantic_to_tensor(sentence.semantics)
			
 
				+            text = random.choice(sentence.texts)
			
 
				+            parts = [semantic]
			
 
				+            if role == "assistant":
			
 
				+                # Let model to predict the text first
			
 
				+                prev_text = random.choice(sentences[idx - 1].texts)
			
 
				+                # parts.insert(0, f"Q: {prev_text}\nA: {text}")
			
 
				+            messages.append(
			
 
				+                Message(
			
 
				+                    role=role,
			
 
				+                    parts=parts,
			
 
				+                )
			
 
				             )
			
 
				-        )
			
 
				-
			
 
				-        if add_bos:
			
 
				-            tokens = [self.tokenizer.bos_token_id] + tokens
			
 
				-
			
 
				-        # Codebook bos/padding: 0, eos: 1
			
 
				-        codes = [
			
 
				-            [CODEBOOK_PAD_TOKEN_ID] * (prompt_length + bos_bias)
			
 
				-            for _ in range(num_codebooks)
			
 
				-        ]
			
 
				-        for segment in semantics:
			
 
				-            for book_idx, book in zip(range(num_codebooks), segment):
			
 
				-                for j in book.values:
			
 
				-                    codes[book_idx].append(int(j) + 2)
			
 
				 
			
 
				-        for book in codes:
			
 
				-            book.extend([CODEBOOK_EOS_TOKEN_ID] * 2)
			
 
				-
			
 
				-        tokens = [tokens] + codes
			
 
				-
			
 
				-        tokens = torch.tensor(tokens, dtype=torch.long)
			
 
				-        labels = tokens.clone()
			
 
				-
			
 
				-        if skip_text:
			
 
				-            # If text is not provided, the sentence is used for condition only, all labels are -100
			
 
				-            torch.fill_(labels, -100)
			
 
				-            return tokens, labels
			
 
				-
			
 
				-        # Mask out the <s> tokens for semantic, predict semantic tokens only
			
 
				-        # Since we don't mask out the input tokens, the language modeling still works
			
 
				-        labels[1:, : (prompt_length + bos_bias)] = -100
			
 
				-
			
 
				-        tokens = tokens[:, :-1]
			
 
				-        labels = labels[:, 1:]
			
 
				+        conversation = Conversation(messages=messages)
			
 
				+        tokens, labels = encode_conversation(
			
 
				+            conversation,
			
 
				+            self.tokenizer,
			
 
				+            num_codebooks=self.num_codebooks,
			
 
				+        )
			
 
				 
			
 
				-        # Verify the padding is correct, and the last token is eos
			
 
				-        assert add_bos is False or tokens[0, 0] == self.tokenizer.bos_token_id
			
 
				-        assert (tokens[1:, : prompt_length + bos_bias] == CODEBOOK_PAD_TOKEN_ID).all()
			
 
				-        assert labels[0, -1] == self.tokenizer.eos_token_id
			
 
				-        assert (labels[1:, -2:] == CODEBOOK_EOS_TOKEN_ID).all()
			
 
				+        return {"tokens": tokens, "labels": labels}
			
 
				 
			
 
				-        return tokens, labels
			
 
				+    def __iter__(self):
			
 
				+        for group in self.get_data_generator():
			
 
				+            try:
			
 
				+                yield self.pack_one_group(group)
			
 
				+            except Exception as e:
			
 
				+                log.exception(f"Failed to parse {group}: {e}")
			
 
				 
			
 
				 
			
 
				 @dataclass
			
@@ -633,8 +569,18 @@ class InterleaveDataset(IterableDataset):
 
				 class TextDataModule(LightningDataModule):
			
 
				     def __init__(
			
 
				         self,
			
 
				-        train_dataset: Union[StreamTextDataset, AutoAugTextDataset, InterleaveDataset],
			
 
				-        val_dataset: Union[StreamTextDataset, AutoAugTextDataset, InterleaveDataset],
			
 
				+        train_dataset: Union[
			
 
				+            AutoTextSemanticInstructionDataset,
			
 
				+            TextPretrainDataset,
			
 
				+            TextInstructionDataset,
			
 
				+            InterleaveDataset,
			
 
				+        ],
			
 
				+        val_dataset: Union[
			
 
				+            AutoTextSemanticInstructionDataset,
			
 
				+            TextPretrainDataset,
			
 
				+            TextInstructionDataset,
			
 
				+            InterleaveDataset,
			
 
				+        ],
			
 
				         batch_size: int = 32,
			
 
				         tokenizer: AutoTokenizer = None,
			
 
				         max_length: int = 1024,
			
@@ -671,17 +617,36 @@ class TextDataModule(LightningDataModule):
 
				 if __name__ == "__main__":
			
 
				     from tqdm import tqdm
			
 
				 
			
 
				-    ds = AutoAugTextDataset(
			
 
				-        ["data/protos"],
			
 
				-        tokenizer=AutoTokenizer.from_pretrained("fishaudio/fish-speech-1"),
			
 
				-        use_speaker=False,
			
 
				-        interactive_prob=1.0,
			
 
				-        use_negative_samples=False,
			
 
				-        skip_text_prob=0.5,
			
 
				+    # ds = AutoTextSemanticInstructionDataset(
			
 
				+    #     ["data/protos/sft/val/11labs"],
			
 
				+    #     tokenizer=AutoTokenizer.from_pretrained("checkpoints/fish-speech-agent-1"),
			
 
				+    #     skip_text_prob=1.0,
			
 
				+    #     asr_prob=0.0,
			
 
				+    #     num_codebooks=2,
			
 
				+    # )
			
 
				+    # ds = TextInstructionDataset(
			
 
				+    #     source="data/openhermes2_5",
			
 
				+    #     tokenizer=AutoTokenizer.from_pretrained("checkpoints/fish-speech-agent-1"),
			
 
				+    # )
			
 
				+
			
 
				+    ds = SemanticInstructionDataset(
			
 
				+        proto_files=["data/protos/sft/val/ultrachat_200k_spoken_openai"],
			
 
				+        tokenizer=AutoTokenizer.from_pretrained("checkpoints/fish-speech-agent-1"),
			
 
				+        num_codebooks=2,
			
 
				     )
			
 
				 
			
 
				     for i in ds:
			
 
				-        print(ds.tokenizer.decode(i["tokens"][0], skip_special_tokens=False))
			
 
				+        # print(ds.tokenizer.decode(i["tokens"][0], skip_special_tokens=False))
			
 
				         # i["labels"][0][i["labels"][0] == -100] = 0
			
 
				         # print(ds.tokenizer.decode(i["labels"][0], skip_special_tokens=False))
			
 
				+
			
 
				+        length = i["tokens"].size(1)
			
 
				+        print(i["tokens"].size(), i["tokens"].dtype)
			
 
				+        for j in range(length):
			
 
				+            print(
			
 
				+                ds.tokenizer.decode(i["tokens"][0, j]),
			
 
				+                i["tokens"][:, j],
			
 
				+                i["labels"][:, j],
			
 
				+            )
			
 
				+            input()
			
 
				         break
			
--- a/fish_speech/models/text2semantic/__init__.py
+++ b/fish_speech/models/text2semantic/__init__.py
@@ -1,3 +0,0 @@
 
				-from .lit_module import TextToSemantic
			
 
				-
			
 
				-__all__ = ["TextToSemantic"]
			
--- a/fish_speech/models/text2semantic/lit_module.py
+++ b/fish_speech/models/text2semantic/lit_module.py
@@ -6,8 +6,8 @@ import torch.nn.functional as F
 
				 from lightning.pytorch.utilities.types import OptimizerLRScheduler
			
 
				 
			
 
				 import fish_speech.utils as utils
			
 
				+from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
			
 
				 from fish_speech.models.text2semantic.llama import NaiveTransformer
			
 
				-from fish_speech.models.text2semantic.lora_utils import LoraConfig, setup_lora
			
 
				 
			
 
				 log = utils.RankedLogger(__name__, rank_zero_only=True)
			
 
				 
			
@@ -137,15 +137,15 @@ class TextToSemantic(L.LightningModule):
 
				             labels, negative_labels = labels.chunk(2)
			
 
				 
			
 
				         # Generate labels
			
 
				-        base_loss = F.cross_entropy(
			
 
				-            token_logits.reshape(-1, token_logits.size(-1)),
			
 
				+        base_loss = fast_cross_entropy_loss(
			
 
				+            token_logits.view(-1, token_logits.size(-1)),
			
 
				             labels[:, 0].reshape(-1),
			
 
				             ignore_index=-100,
			
 
				         )
			
 
				 
			
 
				         codebook_labels = labels[:, 1 : 1 + self.model.config.num_codebooks].mT
			
 
				-        semantic_loss = F.cross_entropy(
			
 
				-            codebook_logits.reshape(-1, codebook_logits.size(-1)),
			
 
				+        semantic_loss = fast_cross_entropy_loss(
			
 
				+            codebook_logits.view(-1, codebook_logits.size(-1)),
			
 
				             codebook_labels.reshape(-1),
			
 
				             ignore_index=-100,
			
 
				         )
			
@@ -281,11 +281,15 @@ class TextToSemantic(L.LightningModule):
 
				         return loss
			
 
				 
			
 
				     def get_accuracy(self, logits, labels):
			
 
				+        mask = (labels != -100) & (labels != CODEBOOK_PAD_TOKEN_ID)
			
 
				+        if mask.sum() == 0:
			
 
				+            return torch.tensor(0.0, device=logits.device)
			
 
				+
			
 
				         _, indices = logits.topk(5, dim=-1)
			
 
				         correct = indices.eq(labels.unsqueeze(-1))
			
 
				-        correct[labels == -100] = 0
			
 
				+        correct[~mask] = 0
			
 
				         correct = correct.sum()
			
 
				-        accuracy = correct / (labels != -100).sum()
			
 
				+        accuracy = correct / mask.sum()
			
 
				 
			
 
				         return accuracy
			
 
				 
			
--- a/fish_speech/models/text2semantic/llama.py
+++ b/fish_speech/models/text2semantic/llama.py
@@ -1,5 +1,7 @@
 
				+import json
			
 
				 import math
			
 
				 from dataclasses import dataclass
			
 
				+from pathlib import Path
			
 
				 from typing import Optional
			
 
				 
			
 
				 import torch
			
@@ -7,7 +9,16 @@ import torch.nn as nn
 
				 from einops import rearrange
			
 
				 from torch import Tensor
			
 
				 from torch.nn import functional as F
			
 
				+from torch.nn.attention import SDPBackend, sdpa_kernel
			
 
				 from torch.utils.checkpoint import checkpoint
			
 
				+from transformers import AutoTokenizer
			
 
				+
			
 
				+from fish_speech.conversation import SEMANTIC_TOKEN
			
 
				+from fish_speech.utils import RankedLogger
			
 
				+
			
 
				+from .lora import LoraConfig, setup_lora
			
 
				+
			
 
				+log = RankedLogger(__name__, rank_zero_only=True)
			
 
				 
			
 
				 
			
 
				 def find_multiple(n: int, k: int) -> int:
			
@@ -18,6 +29,8 @@ def find_multiple(n: int, k: int) -> int:
 
				 
			
 
				 @dataclass
			
 
				 class BaseModelArgs:
			
 
				+    model_type: str = "base"
			
 
				+
			
 
				     vocab_size: int = 32000
			
 
				     n_layer: int = 32
			
 
				     n_head: int = 32
			
@@ -29,16 +42,19 @@ class BaseModelArgs:
 
				     norm_eps: float = 1e-5
			
 
				     max_seq_len: int = 2048
			
 
				     dropout: float = 0.0
			
 
				+    tie_word_embeddings: bool = True
			
 
				+    attention_qkv_bias: bool = False
			
 
				 
			
 
				     # Codebook configs
			
 
				     codebook_size: int = 160
			
 
				     num_codebooks: int = 4
			
 
				-    num_in_codebooks: Optional[int] = None
			
 
				-    codebook_padding_idx: int = 0
			
 
				 
			
 
				     # Gradient checkpointing
			
 
				     use_gradient_checkpointing: bool = True
			
 
				 
			
 
				+    # Initialize the model
			
 
				+    initializer_range: float = 0.02
			
 
				+
			
 
				     def __post_init__(self):
			
 
				         if self.n_local_heads == -1:
			
 
				             self.n_local_heads = self.n_head
			
@@ -46,18 +62,41 @@ class BaseModelArgs:
 
				             hidden_dim = 4 * self.dim
			
 
				             n_hidden = int(2 * hidden_dim / 3)
			
 
				             self.intermediate_size = find_multiple(n_hidden, 256)
			
 
				-        if self.num_in_codebooks is None:
			
 
				-            self.num_in_codebooks = self.num_codebooks
			
 
				         self.head_dim = self.dim // self.n_head
			
 
				 
			
 
				+    @staticmethod
			
 
				+    def from_pretrained(path: str):
			
 
				+        path = Path(path)
			
 
				+
			
 
				+        if path.is_dir():
			
 
				+            path = path / "config.json"
			
 
				+
			
 
				+        with open(path, "r") as f:
			
 
				+            data = json.load(f)
			
 
				+
			
 
				+        match data["model_type"]:
			
 
				+            case "naive":
			
 
				+                cls = NaiveModelArgs
			
 
				+            case "dual_ar":
			
 
				+                cls = DualARModelArgs
			
 
				+            case _:
			
 
				+                raise ValueError(f"Unknown model type: {data['model_type']}")
			
 
				+
			
 
				+        return cls(**data)
			
 
				+
			
 
				+    def save(self, path: str):
			
 
				+        with open(path, "w") as f:
			
 
				+            json.dump(self.__dict__, f, indent=4, sort_keys=True, ensure_ascii=False)
			
 
				+
			
 
				 
			
 
				 @dataclass
			
 
				 class NaiveModelArgs(BaseModelArgs):
			
 
				-    pass
			
 
				+    model_type: str = "naive"
			
 
				 
			
 
				 
			
 
				 @dataclass
			
 
				 class DualARModelArgs(BaseModelArgs):
			
 
				+    model_type: str = "dual_ar"
			
 
				     n_fast_layer: int = 4
			
 
				 
			
 
				 
			
@@ -95,24 +134,35 @@ class BaseTransformerForwardResult:
 
				 
			
 
				 
			
 
				 class BaseTransformer(nn.Module):
			
 
				-    def __init__(self, config: BaseModelArgs) -> None:
			
 
				+    def __init__(
			
 
				+        self, config: BaseModelArgs, tokenizer: AutoTokenizer, init_weights: bool = True
			
 
				+    ) -> None:
			
 
				         super().__init__()
			
 
				         self.config = config
			
 
				+        self.tokenizer = tokenizer
			
 
				+
			
 
				+        self.semantic_token_id = tokenizer.convert_tokens_to_ids(SEMANTIC_TOKEN)
			
 
				 
			
 
				         # Slow transformer
			
 
				         self.embeddings = nn.Embedding(
			
 
				-            config.vocab_size + config.codebook_size * config.num_in_codebooks,
			
 
				+            config.vocab_size,
			
 
				+            config.dim,
			
 
				+        )
			
 
				+        self.codebook_embeddings = nn.Embedding(
			
 
				+            config.codebook_size * config.num_codebooks,
			
 
				             config.dim,
			
 
				         )
			
 
				         self.layers = nn.ModuleList(
			
 
				             TransformerBlock(config, use_sdpa=True) for _ in range(config.n_layer)
			
 
				         )
			
 
				         self.norm = RMSNorm(config.dim, eps=config.norm_eps)
			
 
				-        self.output = nn.Linear(
			
 
				-            config.dim,
			
 
				-            config.vocab_size,
			
 
				-            bias=False,
			
 
				-        )
			
 
				+
			
 
				+        if self.config.tie_word_embeddings is False:
			
 
				+            self.output = nn.Linear(
			
 
				+                config.dim,
			
 
				+                config.vocab_size,
			
 
				+                bias=False,
			
 
				+            )
			
 
				 
			
 
				         self.register_buffer(
			
 
				             "freqs_cis",
			
@@ -139,6 +189,9 @@ class BaseTransformer(nn.Module):
 
				         self.max_batch_size = -1
			
 
				         self.max_seq_len = -1
			
 
				 
			
 
				+        if init_weights:
			
 
				+            self.apply(self._init_weights)
			
 
				+
			
 
				     def setup_caches(
			
 
				         self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
			
 
				     ):
			
@@ -161,11 +214,9 @@ class BaseTransformer(nn.Module):
 
				 
			
 
				     def embed(self, x: Tensor) -> Tensor:
			
 
				         vocab_embeds = [self.embeddings(x[:, 0])]
			
 
				-        for i in range(self.config.num_in_codebooks):
			
 
				-            emb = self.embeddings(
			
 
				-                x[:, i + 1] + i * self.config.codebook_size + self.config.vocab_size
			
 
				-            )
			
 
				-            emb[x[:, i + 1] == self.config.codebook_padding_idx] = 0
			
 
				+        for i in range(self.config.num_codebooks):
			
 
				+            emb = self.codebook_embeddings(x[:, i + 1] + i * self.config.codebook_size)
			
 
				+            emb[x[:, 0] != self.semantic_token_id] = 0
			
 
				             vocab_embeds.append(emb)
			
 
				 
			
 
				         x = torch.stack(vocab_embeds, dim=3)
			
@@ -174,21 +225,23 @@ class BaseTransformer(nn.Module):
 
				         return x
			
 
				 
			
 
				     def forward(
			
 
				-        self, inp: Tensor, key_padding_mask: Optional[Tensor] = None
			
 
				+        self,
			
 
				+        inp: Tensor,
			
 
				+        key_padding_mask: Optional[Tensor] = None,
			
 
				     ) -> BaseTransformerForwardResult:
			
 
				-        # x: (batch, num_codebooks + 1, seq_len)
			
 
				         seq_len = inp.size(2)
			
 
				 
			
 
				         # Here we want to merge the embeddings of the codebooks
			
 
				         x = self.embed(inp)
			
 
				 
			
 
				-        mask = self.causal_mask[None, None, :seq_len, :seq_len]  # (B, N, Q, K)
			
 
				         freqs_cis = self.freqs_cis[:seq_len]
			
 
				 
			
 
				         # Not that the causal mask here follows the definition of scaled_dot_product_attention
			
 
				         # That is, FALSE means masked out
			
 
				         # To maintain consistency, key_padding_mask use TRUE to mask out
			
 
				+        mask = None
			
 
				         if key_padding_mask is not None:
			
 
				+            mask = self.causal_mask[None, None, :seq_len, :seq_len]  # (B, N, Q, K)
			
 
				             mask = mask & key_padding_mask[:, None, None, :].logical_not()
			
 
				 
			
 
				         for layer in self.layers:
			
@@ -199,7 +252,11 @@ class BaseTransformer(nn.Module):
 
				 
			
 
				         # We got slow_out here
			
 
				         slow_out = self.norm(x)
			
 
				-        token_logits = self.output(slow_out)
			
 
				+
			
 
				+        if self.config.tie_word_embeddings:
			
 
				+            token_logits = F.linear(slow_out, self.embeddings.weight)
			
 
				+        else:
			
 
				+            token_logits = self.output(slow_out)
			
 
				 
			
 
				         return BaseTransformerForwardResult(
			
 
				             logits=token_logits,
			
@@ -207,7 +264,10 @@ class BaseTransformer(nn.Module):
 
				         )
			
 
				 
			
 
				     def forward_generate(
			
 
				-        self, x: Tensor, input_pos: Optional[Tensor] = None
			
 
				+        self,
			
 
				+        x: Tensor,
			
 
				+        input_pos: Optional[Tensor] = None,
			
 
				+        return_all: bool = False,
			
 
				     ) -> BaseTransformerForwardResult:
			
 
				         # This is used for generation, optimized for torch compile
			
 
				         assert (
			
@@ -225,22 +285,99 @@ class BaseTransformer(nn.Module):
 
				             x = layer(x, freqs_cis, mask, input_pos=input_pos)
			
 
				 
			
 
				         # If prefill, we only calculate the logits of last token
			
 
				-        if x.size(1) > 1:
			
 
				+        if x.size(1) > 1 and not return_all:
			
 
				             x = x[:, -1:]
			
 
				 
			
 
				         # We got slow_out here
			
 
				         slow_out = self.norm(x)
			
 
				-        token_logits = self.output(slow_out)
			
 
				+
			
 
				+        if self.config.tie_word_embeddings:
			
 
				+            token_logits = F.linear(slow_out, self.embeddings.weight)
			
 
				+        else:
			
 
				+            token_logits = self.output(slow_out)
			
 
				 
			
 
				         return BaseTransformerForwardResult(
			
 
				             logits=token_logits,
			
 
				             hidden_states=x,
			
 
				         )
			
 
				 
			
 
				+    def _init_weights(self, module):
			
 
				+        std = self.config.initializer_range
			
 
				+        if isinstance(module, nn.Linear):
			
 
				+            module.weight.data.normal_(mean=0.0, std=std)
			
 
				+            if module.bias is not None:
			
 
				+                module.bias.data.zero_()
			
 
				+        elif isinstance(module, nn.Embedding):
			
 
				+            module.weight.data.normal_(mean=0.0, std=std)
			
 
				+            if module.padding_idx is not None:
			
 
				+                module.weight.data[module.padding_idx].zero_()
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def from_pretrained(
			
 
				+        path: str,
			
 
				+        load_weights: bool = False,
			
 
				+        max_length: int | None = None,
			
 
				+        lora_config: LoraConfig | None = None,
			
 
				+        rope_base: int | None = None,
			
 
				+    ) -> "BaseTransformer":
			
 
				+        config = BaseModelArgs.from_pretrained(path)
			
 
				+        if max_length is not None:
			
 
				+            config.max_seq_len = max_length
			
 
				+            log.info(f"Override max_seq_len to {max_length}")
			
 
				+
			
 
				+        if rope_base is not None:
			
 
				+            config.rope_base = rope_base
			
 
				+            log.info(f"Override rope_base to {rope_base}")
			
 
				+
			
 
				+        match config.model_type:
			
 
				+            case "naive":
			
 
				+                model_cls = NaiveTransformer
			
 
				+            case "dual_ar":
			
 
				+                model_cls = DualARTransformer
			
 
				+            case _:
			
 
				+                raise ValueError(f"Unknown model type: {config.model_type}")
			
 
				+
			
 
				+        tokenizer = AutoTokenizer.from_pretrained(str(path))
			
 
				+        log.info(f"Loading model from {path}, config: {config}")
			
 
				+        model = model_cls(config, tokenizer=tokenizer)
			
 
				+
			
 
				+        if lora_config is not None:
			
 
				+            setup_lora(model, lora_config)
			
 
				+            log.info(f"LoRA setup: {lora_config}")
			
 
				+
			
 
				+        if load_weights is False:
			
 
				+            log.info("Randomly initialized model")
			
 
				+        else:
			
 
				+            weights = torch.load(
			
 
				+                Path(path) / "model.pth", map_location="cpu", mmap=True
			
 
				+            )
			
 
				+            err = model.load_state_dict(weights, strict=False, assign=True)
			
 
				+            log.info(f"Loaded weights with error: {err}")
			
 
				+
			
 
				+        return model
			
 
				+
			
 
				+    def save_pretrained(self, path: str, drop_lora: bool = False):
			
 
				+        path = Path(path)
			
 
				+        path.mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+        self.config.save(path / "config.json")
			
 
				+        state_dict = self.state_dict()
			
 
				+
			
 
				+        if drop_lora:
			
 
				+            for key in list(state_dict.keys()):
			
 
				+                if "lora" not in key:
			
 
				+                    continue
			
 
				+
			
 
				+                state_dict.pop(key)
			
 
				+                log.info(f"Drop LoRA parameter: {key}")
			
 
				+
			
 
				+        torch.save(state_dict, path / "model.pth")
			
 
				+        self.tokenizer.save_pretrained(path)
			
 
				+
			
 
				 
			
 
				 class NaiveTransformer(BaseTransformer):
			
 
				-    def __init__(self, config: NaiveModelArgs) -> None:
			
 
				-        super().__init__(config)
			
 
				+    def __init__(self, config: NaiveModelArgs, tokenizer: AutoTokenizer) -> None:
			
 
				+        super().__init__(config, init_weights=False, tokenizer=tokenizer)
			
 
				 
			
 
				         self.codebook_norm = RMSNorm(config.dim, eps=config.norm_eps)
			
 
				         self.codebook_output = nn.Linear(
			
@@ -249,6 +386,8 @@ class NaiveTransformer(BaseTransformer):
 
				             bias=False,
			
 
				         )
			
 
				 
			
 
				+        self.apply(self._init_weights)
			
 
				+
			
 
				     def decode(self, result: BaseTransformerForwardResult) -> TransformerForwardResult:
			
 
				         token_logits = result.logits
			
 
				         x = result.hidden_states
			
@@ -265,9 +404,14 @@ class NaiveTransformer(BaseTransformer):
 
				         )
			
 
				 
			
 
				     def forward(
			
 
				-        self, inp: Tensor, key_padding_mask: Optional[Tensor] = None
			
 
				+        self,
			
 
				+        inp: Tensor,
			
 
				+        key_padding_mask: Optional[Tensor] = None,
			
 
				     ) -> TransformerForwardResult:
			
 
				-        result = super().forward(inp, key_padding_mask)
			
 
				+        result = super().forward(
			
 
				+            inp=inp,
			
 
				+            key_padding_mask=key_padding_mask,
			
 
				+        )
			
 
				         return self.decode(result)
			
 
				 
			
 
				     def forward_generate(
			
@@ -278,13 +422,11 @@ class NaiveTransformer(BaseTransformer):
 
				 
			
 
				 
			
 
				 class DualARTransformer(BaseTransformer):
			
 
				-    def __init__(self, config: DualARModelArgs) -> None:
			
 
				-        super().__init__(config)
			
 
				+    def __init__(self, config: NaiveModelArgs, tokenizer: AutoTokenizer) -> None:
			
 
				+        super().__init__(config, init_weights=False, tokenizer=tokenizer)
			
 
				 
			
 
				         # Fast transformer
			
 
				-        self.fast_embeddings = nn.Embedding(
			
 
				-            config.codebook_size, config.dim, padding_idx=config.codebook_padding_idx
			
 
				-        )
			
 
				+        self.fast_embeddings = nn.Embedding(config.codebook_size, config.dim)
			
 
				 
			
 
				         # The equivalent bs is so large that sdpa doesn't work
			
 
				         self.fast_layers = nn.ModuleList(
			
@@ -297,6 +439,8 @@ class DualARTransformer(BaseTransformer):
 
				             bias=False,
			
 
				         )
			
 
				 
			
 
				+        self.apply(self._init_weights)
			
 
				+
			
 
				     def setup_caches(
			
 
				         self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
			
 
				     ):
			
@@ -316,7 +460,9 @@ class DualARTransformer(BaseTransformer):
 
				             )
			
 
				 
			
 
				     def forward(
			
 
				-        self, inp: Tensor, key_padding_mask: Optional[Tensor] = None
			
 
				+        self,
			
 
				+        inp: Tensor,
			
 
				+        key_padding_mask: Optional[Tensor] = None,
			
 
				     ) -> TransformerForwardResult:
			
 
				         parent_result = super().forward(inp, key_padding_mask)
			
 
				         token_logits = parent_result.logits
			
@@ -340,6 +486,11 @@ class DualARTransformer(BaseTransformer):
 
				         # Remove padded part
			
 
				         codebooks = rearrange(codebooks, "b n s -> (b s) n")
			
 
				         codebook_mask = (codebooks == self.config.codebook_padding_idx).all(dim=-1)
			
 
				+
			
 
				+        if torch.all(codebook_mask):
			
 
				+            # If all codebooks are padded, we keep first 8 to make sure the model runs
			
 
				+            codebook_mask[:8] = False
			
 
				+
			
 
				         x_bs, x_len = x.size(0), x.size(1)
			
 
				         x = x[~codebook_mask]
			
 
				 
			
@@ -422,7 +573,9 @@ class Attention(nn.Module):
 
				 
			
 
				         total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
			
 
				         # key, query, value projections for all heads, but in a batch
			
 
				-        self.wqkv = nn.Linear(config.dim, total_head_dim, bias=False)
			
 
				+        self.wqkv = nn.Linear(
			
 
				+            config.dim, total_head_dim, bias=config.attention_qkv_bias
			
 
				+        )
			
 
				         self.wo = nn.Linear(config.dim, config.dim, bias=False)
			
 
				         self.kv_cache = None
			
 
				 
			
@@ -469,13 +622,24 @@ class Attention(nn.Module):
 
				         v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
			
 
				 
			
 
				         if self.use_sdpa:
			
 
				-            y = F.scaled_dot_product_attention(
			
 
				-                q,
			
 
				-                k,
			
 
				-                v,
			
 
				-                attn_mask=mask,
			
 
				-                dropout_p=self.dropout if self.training else 0.0,
			
 
				-            )
			
 
				+            if mask is None:
			
 
				+                with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
			
 
				+                    y = F.scaled_dot_product_attention(
			
 
				+                        q,
			
 
				+                        k,
			
 
				+                        v,
			
 
				+                        dropout_p=self.dropout if self.training else 0.0,
			
 
				+                        is_causal=True,
			
 
				+                        # No thirdparty attn_mask here to use flash_attention
			
 
				+                    )
			
 
				+            else:
			
 
				+                y = F.scaled_dot_product_attention(
			
 
				+                    q,
			
 
				+                    k,
			
 
				+                    v,
			
 
				+                    attn_mask=mask,
			
 
				+                    dropout_p=self.dropout if self.training else 0.0,
			
 
				+                )
			
 
				         else:
			
 
				             y = self.eq_scaled_dot_product_attention(
			
 
				                 q,
			
@@ -567,29 +731,3 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
 
				 
			
 
				     x_out2 = x_out2.flatten(3)
			
 
				     return x_out2.type_as(x)
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    args = DualARModelArgs(
			
 
				-        max_seq_len=4096,
			
 
				-        vocab_size=32312,
			
 
				-        n_layer=12,
			
 
				-        n_fast_layer=4,
			
 
				-        n_head=12,
			
 
				-        dim=768,
			
 
				-        rope_base=10000,
			
 
				-        norm_eps=1e-5,
			
 
				-        codebook_size=128,
			
 
				-        num_codebooks=4,
			
 
				-    )
			
 
				-
			
 
				-    model = DualARTransformer(args)
			
 
				-    model = model.cuda().bfloat16()
			
 
				-    print("Total params:", sum(i.numel() for i in model.parameters()) / 1024 / 1024)
			
 
				-
			
 
				-    inputs = torch.randint(0, 100, (2, 5, 128)).cuda()
			
 
				-    key_padding_mask = torch.zeros(2, 128).bool().cuda()
			
 
				-    key_padding_mask[0, 2:] = True
			
 
				-    x1 = model(inputs, key_padding_mask=key_padding_mask)
			
 
				-    print(x1.token_logits.shape)
			
 
				-    print(x1.codebook_logits.shape)
			
--- a/fish_speech/models/text2semantic/lora_utils.py
+++ b/fish_speech/models/text2semantic/lora_utils.py
@@ -20,6 +20,14 @@ def setup_lora(model, lora_config):
 
				         lora_alpha=lora_config.lora_alpha,
			
 
				     )
			
 
				 
			
 
				+    model.codebook_embeddings = lora.Embedding(
			
 
				+        num_embeddings=model.codebook_embeddings.num_embeddings,
			
 
				+        embedding_dim=model.codebook_embeddings.embedding_dim,
			
 
				+        padding_idx=model.codebook_embeddings.padding_idx,
			
 
				+        r=lora_config.r,
			
 
				+        lora_alpha=lora_config.lora_alpha,
			
 
				+    )
			
 
				+
			
 
				     # Replace output layer with a LoRA layer
			
 
				     linears = [(model, "output")]
			
 
				 
			
--- a/fish_speech/models/vits_decoder/__init__.py
+++ b/fish_speech/models/vits_decoder/__init__.py
@@ -1,3 +0,0 @@
 
				-from .lit_module import VITSDecoder
			
 
				-
			
 
				-__all__ = ["VITSDecoder"]
			
--- a/fish_speech/models/vits_decoder/lit_module.py
+++ b/fish_speech/models/vits_decoder/lit_module.py
@@ -1,394 +0,0 @@
 
				-from typing import Any, Callable
			
 
				-
			
 
				-import lightning as L
			
 
				-import torch
			
 
				-import torch.nn.functional as F
			
 
				-import wandb
			
 
				-from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
			
 
				-from matplotlib import pyplot as plt
			
 
				-from torch import nn
			
 
				-
			
 
				-from fish_speech.models.vits_decoder.losses import (
			
 
				-    discriminator_loss,
			
 
				-    feature_loss,
			
 
				-    generator_loss,
			
 
				-    kl_loss,
			
 
				-)
			
 
				-from fish_speech.models.vqgan.utils import (
			
 
				-    avg_with_mask,
			
 
				-    plot_mel,
			
 
				-    sequence_mask,
			
 
				-    slice_segments,
			
 
				-)
			
 
				-
			
 
				-
			
 
				-class VITSDecoder(L.LightningModule):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        optimizer: Callable,
			
 
				-        lr_scheduler: Callable,
			
 
				-        generator: nn.Module,
			
 
				-        discriminator: nn.Module,
			
 
				-        mel_transform: nn.Module,
			
 
				-        spec_transform: nn.Module,
			
 
				-        hop_length: int = 512,
			
 
				-        sample_rate: int = 44100,
			
 
				-        freeze_discriminator: bool = False,
			
 
				-        weight_mel: float = 45,
			
 
				-        weight_kl: float = 0.1,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-
			
 
				-        # Model parameters
			
 
				-        self.optimizer_builder = optimizer
			
 
				-        self.lr_scheduler_builder = lr_scheduler
			
 
				-
			
 
				-        # Generator and discriminator
			
 
				-        self.generator = generator
			
 
				-        self.discriminator = discriminator
			
 
				-        self.mel_transform = mel_transform
			
 
				-        self.spec_transform = spec_transform
			
 
				-        self.freeze_discriminator = freeze_discriminator
			
 
				-
			
 
				-        # Loss weights
			
 
				-        self.weight_mel = weight_mel
			
 
				-        self.weight_kl = weight_kl
			
 
				-
			
 
				-        # Other parameters
			
 
				-        self.hop_length = hop_length
			
 
				-        self.sampling_rate = sample_rate
			
 
				-
			
 
				-        # Disable automatic optimization
			
 
				-        self.automatic_optimization = False
			
 
				-
			
 
				-        if self.freeze_discriminator:
			
 
				-            for p in self.discriminator.parameters():
			
 
				-                p.requires_grad = False
			
 
				-
			
 
				-    def configure_optimizers(self):
			
 
				-        # Need two optimizers and two schedulers
			
 
				-        optimizer_generator = self.optimizer_builder(self.generator.parameters())
			
 
				-        optimizer_discriminator = self.optimizer_builder(
			
 
				-            self.discriminator.parameters()
			
 
				-        )
			
 
				-
			
 
				-        lr_scheduler_generator = self.lr_scheduler_builder(optimizer_generator)
			
 
				-        lr_scheduler_discriminator = self.lr_scheduler_builder(optimizer_discriminator)
			
 
				-
			
 
				-        return (
			
 
				-            {
			
 
				-                "optimizer": optimizer_generator,
			
 
				-                "lr_scheduler": {
			
 
				-                    "scheduler": lr_scheduler_generator,
			
 
				-                    "interval": "step",
			
 
				-                    "name": "optimizer/generator",
			
 
				-                },
			
 
				-            },
			
 
				-            {
			
 
				-                "optimizer": optimizer_discriminator,
			
 
				-                "lr_scheduler": {
			
 
				-                    "scheduler": lr_scheduler_discriminator,
			
 
				-                    "interval": "step",
			
 
				-                    "name": "optimizer/discriminator",
			
 
				-                },
			
 
				-            },
			
 
				-        )
			
 
				-
			
 
				-    def training_step(self, batch, batch_idx):
			
 
				-        optim_g, optim_d = self.optimizers()
			
 
				-
			
 
				-        audios, audio_lengths = batch["audios"], batch["audio_lengths"]
			
 
				-        texts, text_lengths = batch["texts"], batch["text_lengths"]
			
 
				-
			
 
				-        audios = audios.float()
			
 
				-        audios = audios[:, None, :]
			
 
				-
			
 
				-        with torch.no_grad():
			
 
				-            gt_mels = self.mel_transform(audios)
			
 
				-            gt_specs = self.spec_transform(audios)
			
 
				-
			
 
				-        spec_lengths = audio_lengths // self.hop_length
			
 
				-        spec_masks = torch.unsqueeze(
			
 
				-            sequence_mask(spec_lengths, gt_mels.shape[2]), 1
			
 
				-        ).to(gt_mels.dtype)
			
 
				-
			
 
				-        (
			
 
				-            fake_audios,
			
 
				-            ids_slice,
			
 
				-            y_mask,
			
 
				-            (z, z_p, m_p, logs_p, m_q, logs_q),
			
 
				-        ) = self.generator(
			
 
				-            audios,
			
 
				-            audio_lengths,
			
 
				-            gt_specs,
			
 
				-            spec_lengths,
			
 
				-            texts,
			
 
				-            text_lengths,
			
 
				-        )
			
 
				-
			
 
				-        gt_mels = slice_segments(gt_mels, ids_slice, self.generator.segment_size)
			
 
				-        spec_masks = slice_segments(spec_masks, ids_slice, self.generator.segment_size)
			
 
				-        audios = slice_segments(
			
 
				-            audios,
			
 
				-            ids_slice * self.hop_length,
			
 
				-            self.generator.segment_size * self.hop_length,
			
 
				-        )
			
 
				-        fake_mels = self.mel_transform(fake_audios.squeeze(1))
			
 
				-
			
 
				-        assert (
			
 
				-            audios.shape == fake_audios.shape
			
 
				-        ), f"{audios.shape} != {fake_audios.shape}"
			
 
				-
			
 
				-        # Discriminator
			
 
				-        if self.freeze_discriminator is False:
			
 
				-            y_d_hat_r, y_d_hat_g, _, _ = self.discriminator(
			
 
				-                audios, fake_audios.detach()
			
 
				-            )
			
 
				-
			
 
				-            with torch.autocast(device_type=audios.device.type, enabled=False):
			
 
				-                loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g)
			
 
				-
			
 
				-            self.log(
			
 
				-                f"train/discriminator/loss",
			
 
				-                loss_disc,
			
 
				-                on_step=True,
			
 
				-                on_epoch=False,
			
 
				-                prog_bar=False,
			
 
				-                logger=True,
			
 
				-                sync_dist=True,
			
 
				-            )
			
 
				-
			
 
				-            optim_d.zero_grad()
			
 
				-            self.manual_backward(loss_disc)
			
 
				-            self.clip_gradients(
			
 
				-                optim_d, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
			
 
				-            )
			
 
				-            optim_d.step()
			
 
				-
			
 
				-        # Adv Loss
			
 
				-        y_d_hat_r, y_d_hat_g, _, _ = self.discriminator(audios, fake_audios)
			
 
				-
			
 
				-        # Adversarial Loss
			
 
				-        with torch.autocast(device_type=audios.device.type, enabled=False):
			
 
				-            loss_adv, _ = generator_loss(y_d_hat_g)
			
 
				-
			
 
				-        self.log(
			
 
				-            f"train/generator/adv",
			
 
				-            loss_adv,
			
 
				-            on_step=True,
			
 
				-            on_epoch=False,
			
 
				-            prog_bar=False,
			
 
				-            logger=True,
			
 
				-            sync_dist=True,
			
 
				-        )
			
 
				-
			
 
				-        with torch.autocast(device_type=audios.device.type, enabled=False):
			
 
				-            loss_fm = feature_loss(y_d_hat_r, y_d_hat_g)
			
 
				-
			
 
				-        self.log(
			
 
				-            f"train/generator/adv_fm",
			
 
				-            loss_fm,
			
 
				-            on_step=True,
			
 
				-            on_epoch=False,
			
 
				-            prog_bar=False,
			
 
				-            logger=True,
			
 
				-            sync_dist=True,
			
 
				-        )
			
 
				-
			
 
				-        with torch.autocast(device_type=audios.device.type, enabled=False):
			
 
				-            loss_mel = avg_with_mask(
			
 
				-                F.l1_loss(gt_mels, fake_mels, reduction="none"), spec_masks
			
 
				-            )
			
 
				-
			
 
				-        self.log(
			
 
				-            "train/generator/loss_mel",
			
 
				-            loss_mel,
			
 
				-            on_step=True,
			
 
				-            on_epoch=False,
			
 
				-            prog_bar=False,
			
 
				-            logger=True,
			
 
				-            sync_dist=True,
			
 
				-        )
			
 
				-
			
 
				-        loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, y_mask)
			
 
				-
			
 
				-        self.log(
			
 
				-            "train/generator/loss_kl",
			
 
				-            loss_kl,
			
 
				-            on_step=True,
			
 
				-            on_epoch=False,
			
 
				-            prog_bar=False,
			
 
				-            logger=True,
			
 
				-            sync_dist=True,
			
 
				-        )
			
 
				-
			
 
				-        loss = (
			
 
				-            loss_mel * self.weight_mel + loss_kl * self.weight_kl + loss_adv + loss_fm
			
 
				-        )
			
 
				-        self.log(
			
 
				-            "train/generator/loss",
			
 
				-            loss,
			
 
				-            on_step=True,
			
 
				-            on_epoch=False,
			
 
				-            prog_bar=True,
			
 
				-            logger=True,
			
 
				-            sync_dist=True,
			
 
				-        )
			
 
				-
			
 
				-        # Backward
			
 
				-        optim_g.zero_grad()
			
 
				-
			
 
				-        self.manual_backward(loss)
			
 
				-        self.clip_gradients(
			
 
				-            optim_g, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
			
 
				-        )
			
 
				-        optim_g.step()
			
 
				-
			
 
				-        # Manual LR Scheduler
			
 
				-        scheduler_g, scheduler_d = self.lr_schedulers()
			
 
				-        scheduler_g.step()
			
 
				-        scheduler_d.step()
			
 
				-
			
 
				-    def validation_step(self, batch: Any, batch_idx: int):
			
 
				-        audios, audio_lengths = batch["audios"], batch["audio_lengths"]
			
 
				-        texts, text_lengths = batch["texts"], batch["text_lengths"]
			
 
				-
			
 
				-        audios = audios.float()
			
 
				-        audios = audios[:, None, :]
			
 
				-
			
 
				-        gt_mels = self.mel_transform(audios)
			
 
				-        gt_specs = self.spec_transform(audios)
			
 
				-        spec_lengths = audio_lengths // self.hop_length
			
 
				-        spec_masks = torch.unsqueeze(
			
 
				-            sequence_mask(spec_lengths, gt_mels.shape[2]), 1
			
 
				-        ).to(gt_mels.dtype)
			
 
				-
			
 
				-        prior_audios = self.generator.infer(
			
 
				-            audios, audio_lengths, gt_specs, spec_lengths, texts, text_lengths
			
 
				-        )
			
 
				-        posterior_audios = self.generator.infer_posterior(gt_specs, spec_lengths)
			
 
				-        prior_mels = self.mel_transform(prior_audios.squeeze(1))
			
 
				-        posterior_mels = self.mel_transform(posterior_audios.squeeze(1))
			
 
				-
			
 
				-        min_mel_length = min(
			
 
				-            gt_mels.shape[-1], prior_mels.shape[-1], posterior_mels.shape[-1]
			
 
				-        )
			
 
				-        gt_mels = gt_mels[:, :, :min_mel_length]
			
 
				-        prior_mels = prior_mels[:, :, :min_mel_length]
			
 
				-        posterior_mels = posterior_mels[:, :, :min_mel_length]
			
 
				-
			
 
				-        prior_mel_loss = avg_with_mask(
			
 
				-            F.l1_loss(gt_mels, prior_mels, reduction="none"), spec_masks
			
 
				-        )
			
 
				-        posterior_mel_loss = avg_with_mask(
			
 
				-            F.l1_loss(gt_mels, posterior_mels, reduction="none"), spec_masks
			
 
				-        )
			
 
				-
			
 
				-        self.log(
			
 
				-            "val/prior_mel_loss",
			
 
				-            prior_mel_loss,
			
 
				-            on_step=False,
			
 
				-            on_epoch=True,
			
 
				-            prog_bar=False,
			
 
				-            logger=True,
			
 
				-            sync_dist=True,
			
 
				-        )
			
 
				-
			
 
				-        self.log(
			
 
				-            "val/posterior_mel_loss",
			
 
				-            posterior_mel_loss,
			
 
				-            on_step=False,
			
 
				-            on_epoch=True,
			
 
				-            prog_bar=False,
			
 
				-            logger=True,
			
 
				-            sync_dist=True,
			
 
				-        )
			
 
				-
			
 
				-        # only log the first batch
			
 
				-        if batch_idx != 0:
			
 
				-            return
			
 
				-
			
 
				-        for idx, (
			
 
				-            mel,
			
 
				-            prior_mel,
			
 
				-            posterior_mel,
			
 
				-            audio,
			
 
				-            prior_audio,
			
 
				-            posterior_audio,
			
 
				-            audio_len,
			
 
				-        ) in enumerate(
			
 
				-            zip(
			
 
				-                gt_mels,
			
 
				-                prior_mels,
			
 
				-                posterior_mels,
			
 
				-                audios.detach().float(),
			
 
				-                prior_audios.detach().float(),
			
 
				-                posterior_audios.detach().float(),
			
 
				-                audio_lengths,
			
 
				-            )
			
 
				-        ):
			
 
				-            mel_len = audio_len // self.hop_length
			
 
				-
			
 
				-            image_mels = plot_mel(
			
 
				-                [
			
 
				-                    prior_mel[:, :mel_len],
			
 
				-                    posterior_mel[:, :mel_len],
			
 
				-                    mel[:, :mel_len],
			
 
				-                ],
			
 
				-                [
			
 
				-                    "Prior (VQ)",
			
 
				-                    "Posterior (Reconstruction)",
			
 
				-                    "Ground-Truth",
			
 
				-                ],
			
 
				-            )
			
 
				-
			
 
				-            if isinstance(self.logger, WandbLogger):
			
 
				-                self.logger.experiment.log(
			
 
				-                    {
			
 
				-                        "reconstruction_mel": wandb.Image(image_mels, caption="mels"),
			
 
				-                        "wavs": [
			
 
				-                            wandb.Audio(
			
 
				-                                audio[0, :audio_len],
			
 
				-                                sample_rate=self.sampling_rate,
			
 
				-                                caption="gt",
			
 
				-                            ),
			
 
				-                            wandb.Audio(
			
 
				-                                prior_audio[0, :audio_len],
			
 
				-                                sample_rate=self.sampling_rate,
			
 
				-                                caption="prior",
			
 
				-                            ),
			
 
				-                            wandb.Audio(
			
 
				-                                posterior_audio[0, :audio_len],
			
 
				-                                sample_rate=self.sampling_rate,
			
 
				-                                caption="posterior",
			
 
				-                            ),
			
 
				-                        ],
			
 
				-                    },
			
 
				-                )
			
 
				-
			
 
				-            if isinstance(self.logger, TensorBoardLogger):
			
 
				-                self.logger.experiment.add_figure(
			
 
				-                    f"sample-{idx}/mels",
			
 
				-                    image_mels,
			
 
				-                    global_step=self.global_step,
			
 
				-                )
			
 
				-                self.logger.experiment.add_audio(
			
 
				-                    f"sample-{idx}/wavs/gt",
			
 
				-                    audio[0, :audio_len],
			
 
				-                    self.global_step,
			
 
				-                    sample_rate=self.sampling_rate,
			
 
				-                )
			
 
				-                self.logger.experiment.add_audio(
			
 
				-                    f"sample-{idx}/wavs/prior",
			
 
				-                    prior_audio[0, :audio_len],
			
 
				-                    self.global_step,
			
 
				-                    sample_rate=self.sampling_rate,
			
 
				-                )
			
 
				-                self.logger.experiment.add_audio(
			
 
				-                    f"sample-{idx}/wavs/posterior",
			
 
				-                    posterior_audio[0, :audio_len],
			
 
				-                    self.global_step,
			
 
				-                    sample_rate=self.sampling_rate,
			
 
				-                )
			
 
				-
			
 
				-            plt.close(image_mels)
			
--- a/fish_speech/models/vits_decoder/losses.py
+++ b/fish_speech/models/vits_decoder/losses.py
@@ -1,67 +0,0 @@
 
				-import torch
			
 
				-import torch.nn.functional as F
			
 
				-from torch import nn
			
 
				-
			
 
				-
			
 
				-def feature_loss(fmap_r: list[torch.Tensor], fmap_g: list[torch.Tensor]):
			
 
				-    loss = 0
			
 
				-    for dr, dg in zip(fmap_r, fmap_g):
			
 
				-        dr = dr.float().detach()
			
 
				-        dg = dg.float()
			
 
				-        loss += torch.mean(torch.abs(dr - dg))
			
 
				-
			
 
				-    return loss * 2
			
 
				-
			
 
				-
			
 
				-def discriminator_loss(
			
 
				-    disc_real_outputs: list[torch.Tensor], disc_generated_outputs: list[torch.Tensor]
			
 
				-):
			
 
				-    loss = 0
			
 
				-    r_losses = []
			
 
				-    g_losses = []
			
 
				-    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
			
 
				-        dr = dr.float()
			
 
				-        dg = dg.float()
			
 
				-        r_loss = torch.mean((1 - dr) ** 2)
			
 
				-        g_loss = torch.mean(dg**2)
			
 
				-        loss += r_loss + g_loss
			
 
				-        r_losses.append(r_loss.item())
			
 
				-        g_losses.append(g_loss.item())
			
 
				-
			
 
				-    return loss, r_losses, g_losses
			
 
				-
			
 
				-
			
 
				-def generator_loss(disc_outputs: list[torch.Tensor]):
			
 
				-    loss = 0
			
 
				-    gen_losses = []
			
 
				-    for dg in disc_outputs:
			
 
				-        dg = dg.float()
			
 
				-        l = torch.mean((1 - dg) ** 2)
			
 
				-        gen_losses.append(l)
			
 
				-        loss += l
			
 
				-
			
 
				-    return loss, gen_losses
			
 
				-
			
 
				-
			
 
				-def kl_loss(
			
 
				-    z_p: torch.Tensor,
			
 
				-    logs_q: torch.Tensor,
			
 
				-    m_p: torch.Tensor,
			
 
				-    logs_p: torch.Tensor,
			
 
				-    z_mask: torch.Tensor,
			
 
				-):
			
 
				-    """
			
 
				-    z_p, logs_q: [b, h, t_t]
			
 
				-    m_p, logs_p: [b, h, t_t]
			
 
				-    """
			
 
				-    z_p = z_p.float()
			
 
				-    logs_q = logs_q.float()
			
 
				-    m_p = m_p.float()
			
 
				-    logs_p = logs_p.float()
			
 
				-    z_mask = z_mask.float()
			
 
				-
			
 
				-    kl = logs_p - logs_q - 0.5
			
 
				-    kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
			
 
				-    kl = torch.sum(kl * z_mask)
			
 
				-    l = kl / torch.sum(z_mask)
			
 
				-    return l
			
--- a/fish_speech/models/vits_decoder/modules/attentions.py
+++ b/fish_speech/models/vits_decoder/modules/attentions.py
@@ -1,350 +0,0 @@
 
				-import math
			
 
				-
			
 
				-import torch
			
 
				-from torch import nn
			
 
				-from torch.nn import functional as F
			
 
				-from torch.nn.utils import remove_weight_norm, weight_norm
			
 
				-
			
 
				-from fish_speech.models.vits_decoder.modules import commons
			
 
				-
			
 
				-from .modules import LayerNorm
			
 
				-
			
 
				-
			
 
				-class Encoder(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        hidden_channels,
			
 
				-        filter_channels,
			
 
				-        n_heads,
			
 
				-        n_layers,
			
 
				-        kernel_size=1,
			
 
				-        p_dropout=0.0,
			
 
				-        window_size=4,
			
 
				-        isflow=False,
			
 
				-        gin_channels=0,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-        self.hidden_channels = hidden_channels
			
 
				-        self.filter_channels = filter_channels
			
 
				-        self.n_heads = n_heads
			
 
				-        self.n_layers = n_layers
			
 
				-        self.kernel_size = kernel_size
			
 
				-        self.p_dropout = p_dropout
			
 
				-        self.window_size = window_size
			
 
				-
			
 
				-        self.drop = nn.Dropout(p_dropout)
			
 
				-        self.attn_layers = nn.ModuleList()
			
 
				-        self.norm_layers_1 = nn.ModuleList()
			
 
				-        self.ffn_layers = nn.ModuleList()
			
 
				-        self.norm_layers_2 = nn.ModuleList()
			
 
				-        for i in range(self.n_layers):
			
 
				-            self.attn_layers.append(
			
 
				-                MultiHeadAttention(
			
 
				-                    hidden_channels,
			
 
				-                    hidden_channels,
			
 
				-                    n_heads,
			
 
				-                    p_dropout=p_dropout,
			
 
				-                    window_size=window_size,
			
 
				-                )
			
 
				-            )
			
 
				-            self.norm_layers_1.append(LayerNorm(hidden_channels))
			
 
				-            self.ffn_layers.append(
			
 
				-                FFN(
			
 
				-                    hidden_channels,
			
 
				-                    hidden_channels,
			
 
				-                    filter_channels,
			
 
				-                    kernel_size,
			
 
				-                    p_dropout=p_dropout,
			
 
				-                )
			
 
				-            )
			
 
				-            self.norm_layers_2.append(LayerNorm(hidden_channels))
			
 
				-
			
 
				-        if isflow:
			
 
				-            cond_layer = torch.nn.Conv1d(
			
 
				-                gin_channels, 2 * hidden_channels * n_layers, 1
			
 
				-            )
			
 
				-            self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1)
			
 
				-            self.cond_layer = weight_norm(cond_layer, "weight")
			
 
				-            self.gin_channels = gin_channels
			
 
				-
			
 
				-    def forward(self, x, x_mask, g=None):
			
 
				-        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
			
 
				-        x = x * x_mask
			
 
				-        if g is not None:
			
 
				-            g = self.cond_layer(g)
			
 
				-
			
 
				-        for i in range(self.n_layers):
			
 
				-            if g is not None:
			
 
				-                x = self.cond_pre(x)
			
 
				-                cond_offset = i * 2 * self.hidden_channels
			
 
				-                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
			
 
				-                x = commons.fused_add_tanh_sigmoid_multiply(
			
 
				-                    x, g_l, torch.IntTensor([self.hidden_channels])
			
 
				-                )
			
 
				-            y = self.attn_layers[i](x, x, attn_mask)
			
 
				-            y = self.drop(y)
			
 
				-            x = self.norm_layers_1[i](x + y)
			
 
				-
			
 
				-            y = self.ffn_layers[i](x, x_mask)
			
 
				-            y = self.drop(y)
			
 
				-            x = self.norm_layers_2[i](x + y)
			
 
				-        x = x * x_mask
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-class MultiHeadAttention(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        channels,
			
 
				-        out_channels,
			
 
				-        n_heads,
			
 
				-        p_dropout=0.0,
			
 
				-        window_size=None,
			
 
				-        heads_share=True,
			
 
				-        block_length=None,
			
 
				-        proximal_bias=False,
			
 
				-        proximal_init=False,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-        assert channels % n_heads == 0
			
 
				-
			
 
				-        self.channels = channels
			
 
				-        self.out_channels = out_channels
			
 
				-        self.n_heads = n_heads
			
 
				-        self.p_dropout = p_dropout
			
 
				-        self.window_size = window_size
			
 
				-        self.heads_share = heads_share
			
 
				-        self.block_length = block_length
			
 
				-        self.proximal_bias = proximal_bias
			
 
				-        self.proximal_init = proximal_init
			
 
				-        self.attn = None
			
 
				-
			
 
				-        self.k_channels = channels // n_heads
			
 
				-        self.conv_q = nn.Conv1d(channels, channels, 1)
			
 
				-        self.conv_k = nn.Conv1d(channels, channels, 1)
			
 
				-        self.conv_v = nn.Conv1d(channels, channels, 1)
			
 
				-        self.conv_o = nn.Conv1d(channels, out_channels, 1)
			
 
				-        self.drop = nn.Dropout(p_dropout)
			
 
				-
			
 
				-        if window_size is not None:
			
 
				-            n_heads_rel = 1 if heads_share else n_heads
			
 
				-            rel_stddev = self.k_channels**-0.5
			
 
				-            self.emb_rel_k = nn.Parameter(
			
 
				-                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
			
 
				-                * rel_stddev
			
 
				-            )
			
 
				-            self.emb_rel_v = nn.Parameter(
			
 
				-                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
			
 
				-                * rel_stddev
			
 
				-            )
			
 
				-
			
 
				-        nn.init.xavier_uniform_(self.conv_q.weight)
			
 
				-        nn.init.xavier_uniform_(self.conv_k.weight)
			
 
				-        nn.init.xavier_uniform_(self.conv_v.weight)
			
 
				-        if proximal_init:
			
 
				-            with torch.no_grad():
			
 
				-                self.conv_k.weight.copy_(self.conv_q.weight)
			
 
				-                self.conv_k.bias.copy_(self.conv_q.bias)
			
 
				-
			
 
				-    def forward(self, x, c, attn_mask=None):
			
 
				-        q = self.conv_q(x)
			
 
				-        k = self.conv_k(c)
			
 
				-        v = self.conv_v(c)
			
 
				-
			
 
				-        x, self.attn = self.attention(q, k, v, mask=attn_mask)
			
 
				-
			
 
				-        x = self.conv_o(x)
			
 
				-        return x
			
 
				-
			
 
				-    def attention(self, query, key, value, mask=None):
			
 
				-        # reshape [b, d, t] -> [b, n_h, t, d_k]
			
 
				-        b, d, t_s, t_t = (*key.size(), query.size(2))
			
 
				-        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
			
 
				-        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
			
 
				-        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
			
 
				-
			
 
				-        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
			
 
				-        if self.window_size is not None:
			
 
				-            assert (
			
 
				-                t_s == t_t
			
 
				-            ), "Relative attention is only available for self-attention."
			
 
				-            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
			
 
				-            rel_logits = self._matmul_with_relative_keys(
			
 
				-                query / math.sqrt(self.k_channels), key_relative_embeddings
			
 
				-            )
			
 
				-            scores_local = self._relative_position_to_absolute_position(rel_logits)
			
 
				-            scores = scores + scores_local
			
 
				-        if self.proximal_bias:
			
 
				-            assert t_s == t_t, "Proximal bias is only available for self-attention."
			
 
				-            scores = scores + self._attention_bias_proximal(t_s).to(
			
 
				-                device=scores.device, dtype=scores.dtype
			
 
				-            )
			
 
				-        if mask is not None:
			
 
				-            scores = scores.masked_fill(mask == 0, -1e4)
			
 
				-            if self.block_length is not None:
			
 
				-                assert (
			
 
				-                    t_s == t_t
			
 
				-                ), "Local attention is only available for self-attention."
			
 
				-                block_mask = (
			
 
				-                    torch.ones_like(scores)
			
 
				-                    .triu(-self.block_length)
			
 
				-                    .tril(self.block_length)
			
 
				-                )
			
 
				-                scores = scores.masked_fill(block_mask == 0, -1e4)
			
 
				-        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
			
 
				-        p_attn = self.drop(p_attn)
			
 
				-        output = torch.matmul(p_attn, value)
			
 
				-        if self.window_size is not None:
			
 
				-            relative_weights = self._absolute_position_to_relative_position(p_attn)
			
 
				-            value_relative_embeddings = self._get_relative_embeddings(
			
 
				-                self.emb_rel_v, t_s
			
 
				-            )
			
 
				-            output = output + self._matmul_with_relative_values(
			
 
				-                relative_weights, value_relative_embeddings
			
 
				-            )
			
 
				-        output = (
			
 
				-            output.transpose(2, 3).contiguous().view(b, d, t_t)
			
 
				-        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
			
 
				-        return output, p_attn
			
 
				-
			
 
				-    def _matmul_with_relative_values(self, x, y):
			
 
				-        """
			
 
				-        x: [b, h, l, m]
			
 
				-        y: [h or 1, m, d]
			
 
				-        ret: [b, h, l, d]
			
 
				-        """
			
 
				-        ret = torch.matmul(x, y.unsqueeze(0))
			
 
				-        return ret
			
 
				-
			
 
				-    def _matmul_with_relative_keys(self, x, y):
			
 
				-        """
			
 
				-        x: [b, h, l, d]
			
 
				-        y: [h or 1, m, d]
			
 
				-        ret: [b, h, l, m]
			
 
				-        """
			
 
				-        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
			
 
				-        return ret
			
 
				-
			
 
				-    def _get_relative_embeddings(self, relative_embeddings, length):
			
 
				-        max_relative_position = 2 * self.window_size + 1
			
 
				-        # Pad first before slice to avoid using cond ops.
			
 
				-        pad_length = max(length - (self.window_size + 1), 0)
			
 
				-        slice_start_position = max((self.window_size + 1) - length, 0)
			
 
				-        slice_end_position = slice_start_position + 2 * length - 1
			
 
				-        if pad_length > 0:
			
 
				-            padded_relative_embeddings = F.pad(
			
 
				-                relative_embeddings,
			
 
				-                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
			
 
				-            )
			
 
				-        else:
			
 
				-            padded_relative_embeddings = relative_embeddings
			
 
				-        used_relative_embeddings = padded_relative_embeddings[
			
 
				-            :, slice_start_position:slice_end_position
			
 
				-        ]
			
 
				-        return used_relative_embeddings
			
 
				-
			
 
				-    def _relative_position_to_absolute_position(self, x):
			
 
				-        """
			
 
				-        x: [b, h, l, 2*l-1]
			
 
				-        ret: [b, h, l, l]
			
 
				-        """
			
 
				-        batch, heads, length, _ = x.size()
			
 
				-        # Concat columns of pad to shift from relative to absolute indexing.
			
 
				-        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
			
 
				-
			
 
				-        # Concat extra elements so to add up to shape (len+1, 2*len-1).
			
 
				-        x_flat = x.view([batch, heads, length * 2 * length])
			
 
				-        x_flat = F.pad(
			
 
				-            x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
			
 
				-        )
			
 
				-
			
 
				-        # Reshape and slice out the padded elements.
			
 
				-        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
			
 
				-            :, :, :length, length - 1 :
			
 
				-        ]
			
 
				-        return x_final
			
 
				-
			
 
				-    def _absolute_position_to_relative_position(self, x):
			
 
				-        """
			
 
				-        x: [b, h, l, l]
			
 
				-        ret: [b, h, l, 2*l-1]
			
 
				-        """
			
 
				-        batch, heads, length, _ = x.size()
			
 
				-        # pad along column
			
 
				-        x = F.pad(
			
 
				-            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
			
 
				-        )
			
 
				-        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
			
 
				-        # add 0's in the beginning that will skew the elements after reshape
			
 
				-        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
			
 
				-        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
			
 
				-        return x_final
			
 
				-
			
 
				-    def _attention_bias_proximal(self, length):
			
 
				-        """Bias for self-attention to encourage attention to close positions.
			
 
				-        Args:
			
 
				-          length: an integer scalar.
			
 
				-        Returns:
			
 
				-          a Tensor with shape [1, 1, length, length]
			
 
				-        """
			
 
				-        r = torch.arange(length, dtype=torch.float32)
			
 
				-        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
			
 
				-        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
			
 
				-
			
 
				-
			
 
				-class FFN(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        in_channels,
			
 
				-        out_channels,
			
 
				-        filter_channels,
			
 
				-        kernel_size,
			
 
				-        p_dropout=0.0,
			
 
				-        activation=None,
			
 
				-        causal=False,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-        self.in_channels = in_channels
			
 
				-        self.out_channels = out_channels
			
 
				-        self.filter_channels = filter_channels
			
 
				-        self.kernel_size = kernel_size
			
 
				-        self.p_dropout = p_dropout
			
 
				-        self.activation = activation
			
 
				-        self.causal = causal
			
 
				-
			
 
				-        if causal:
			
 
				-            self.padding = self._causal_padding
			
 
				-        else:
			
 
				-            self.padding = self._same_padding
			
 
				-
			
 
				-        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
			
 
				-        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
			
 
				-        self.drop = nn.Dropout(p_dropout)
			
 
				-
			
 
				-    def forward(self, x, x_mask):
			
 
				-        x = self.conv_1(self.padding(x * x_mask))
			
 
				-        if self.activation == "gelu":
			
 
				-            x = x * torch.sigmoid(1.702 * x)
			
 
				-        else:
			
 
				-            x = torch.relu(x)
			
 
				-        x = self.drop(x)
			
 
				-        x = self.conv_2(self.padding(x * x_mask))
			
 
				-        return x * x_mask
			
 
				-
			
 
				-    def _causal_padding(self, x):
			
 
				-        if self.kernel_size == 1:
			
 
				-            return x
			
 
				-        pad_l = self.kernel_size - 1
			
 
				-        pad_r = 0
			
 
				-        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
			
 
				-        x = F.pad(x, commons.convert_pad_shape(padding))
			
 
				-        return x
			
 
				-
			
 
				-    def _same_padding(self, x):
			
 
				-        if self.kernel_size == 1:
			
 
				-            return x
			
 
				-        pad_l = (self.kernel_size - 1) // 2
			
 
				-        pad_r = self.kernel_size // 2
			
 
				-        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
			
 
				-        x = F.pad(x, commons.convert_pad_shape(padding))
			
 
				-        return x
			
--- a/fish_speech/models/vits_decoder/modules/commons.py
+++ b/fish_speech/models/vits_decoder/modules/commons.py
@@ -1,190 +0,0 @@
 
				-import math
			
 
				-
			
 
				-import torch
			
 
				-from torch.nn import functional as F
			
 
				-
			
 
				-
			
 
				-def init_weights(m, mean=0.0, std=0.01):
			
 
				-    classname = m.__class__.__name__
			
 
				-    if classname.find("Conv") != -1:
			
 
				-        m.weight.data.normal_(mean, std)
			
 
				-
			
 
				-
			
 
				-def get_padding(kernel_size, dilation=1):
			
 
				-    return int((kernel_size * dilation - dilation) / 2)
			
 
				-
			
 
				-
			
 
				-def convert_pad_shape(pad_shape):
			
 
				-    l = pad_shape[::-1]
			
 
				-    pad_shape = [item for sublist in l for item in sublist]
			
 
				-    return pad_shape
			
 
				-
			
 
				-
			
 
				-def intersperse(lst, item):
			
 
				-    result = [item] * (len(lst) * 2 + 1)
			
 
				-    result[1::2] = lst
			
 
				-    return result
			
 
				-
			
 
				-
			
 
				-def kl_divergence(m_p, logs_p, m_q, logs_q):
			
 
				-    """KL(P||Q)"""
			
 
				-    kl = (logs_q - logs_p) - 0.5
			
 
				-    kl += (
			
 
				-        0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
			
 
				-    )
			
 
				-    return kl
			
 
				-
			
 
				-
			
 
				-def rand_gumbel(shape):
			
 
				-    """Sample from the Gumbel distribution, protect from overflows."""
			
 
				-    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
			
 
				-    return -torch.log(-torch.log(uniform_samples))
			
 
				-
			
 
				-
			
 
				-def rand_gumbel_like(x):
			
 
				-    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
			
 
				-    return g
			
 
				-
			
 
				-
			
 
				-def slice_segments(x, ids_str, segment_size=4):
			
 
				-    ret = torch.zeros_like(x[:, :, :segment_size])
			
 
				-    for i in range(x.size(0)):
			
 
				-        idx_str = ids_str[i]
			
 
				-        idx_end = idx_str + segment_size
			
 
				-        ret[i] = x[i, :, idx_str:idx_end]
			
 
				-    return ret
			
 
				-
			
 
				-
			
 
				-def rand_slice_segments(x, x_lengths=None, segment_size=4):
			
 
				-    b, d, t = x.size()
			
 
				-    if x_lengths is None:
			
 
				-        x_lengths = t
			
 
				-    ids_str_max = x_lengths - segment_size + 1
			
 
				-    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
			
 
				-    ret = slice_segments(x, ids_str, segment_size)
			
 
				-    return ret, ids_str
			
 
				-
			
 
				-
			
 
				-def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
			
 
				-    position = torch.arange(length, dtype=torch.float)
			
 
				-    num_timescales = channels // 2
			
 
				-    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
			
 
				-        num_timescales - 1
			
 
				-    )
			
 
				-    inv_timescales = min_timescale * torch.exp(
			
 
				-        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
			
 
				-    )
			
 
				-    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
			
 
				-    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
			
 
				-    signal = F.pad(signal, [0, 0, 0, channels % 2])
			
 
				-    signal = signal.view(1, channels, length)
			
 
				-    return signal
			
 
				-
			
 
				-
			
 
				-def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
			
 
				-    b, channels, length = x.size()
			
 
				-    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
			
 
				-    return x + signal.to(dtype=x.dtype, device=x.device)
			
 
				-
			
 
				-
			
 
				-def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
			
 
				-    b, channels, length = x.size()
			
 
				-    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
			
 
				-    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
			
 
				-
			
 
				-
			
 
				-def subsequent_mask(length):
			
 
				-    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
			
 
				-    return mask
			
 
				-
			
 
				-
			
 
				-@torch.jit.script
			
 
				-def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
			
 
				-    n_channels_int = n_channels[0]
			
 
				-    in_act = input_a + input_b
			
 
				-    t_act = torch.tanh(in_act[:, :n_channels_int, :])
			
 
				-    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
			
 
				-    acts = t_act * s_act
			
 
				-    return acts
			
 
				-
			
 
				-
			
 
				-def convert_pad_shape(pad_shape):
			
 
				-    l = pad_shape[::-1]
			
 
				-    pad_shape = [item for sublist in l for item in sublist]
			
 
				-    return pad_shape
			
 
				-
			
 
				-
			
 
				-def shift_1d(x):
			
 
				-    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
			
 
				-    return x
			
 
				-
			
 
				-
			
 
				-def sequence_mask(length, max_length=None):
			
 
				-    if max_length is None:
			
 
				-        max_length = length.max()
			
 
				-    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
			
 
				-    return x.unsqueeze(0) < length.unsqueeze(1)
			
 
				-
			
 
				-
			
 
				-def generate_path(duration, mask):
			
 
				-    """
			
 
				-    duration: [b, 1, t_x]
			
 
				-    mask: [b, 1, t_y, t_x]
			
 
				-    """
			
 
				-    device = duration.device
			
 
				-
			
 
				-    b, _, t_y, t_x = mask.shape
			
 
				-    cum_duration = torch.cumsum(duration, -1)
			
 
				-
			
 
				-    cum_duration_flat = cum_duration.view(b * t_x)
			
 
				-    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
			
 
				-    path = path.view(b, t_x, t_y)
			
 
				-    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
			
 
				-    path = path.unsqueeze(1).transpose(2, 3) * mask
			
 
				-    return path
			
 
				-
			
 
				-
			
 
				-def clip_grad_value_(parameters, clip_value, norm_type=2):
			
 
				-    if isinstance(parameters, torch.Tensor):
			
 
				-        parameters = [parameters]
			
 
				-    parameters = list(filter(lambda p: p.grad is not None, parameters))
			
 
				-    norm_type = float(norm_type)
			
 
				-    if clip_value is not None:
			
 
				-        clip_value = float(clip_value)
			
 
				-
			
 
				-    total_norm = 0
			
 
				-    for p in parameters:
			
 
				-        param_norm = p.grad.data.norm(norm_type)
			
 
				-        total_norm += param_norm.item() ** norm_type
			
 
				-        if clip_value is not None:
			
 
				-            p.grad.data.clamp_(min=-clip_value, max=clip_value)
			
 
				-    total_norm = total_norm ** (1.0 / norm_type)
			
 
				-    return total_norm
			
 
				-
			
 
				-
			
 
				-def squeeze(x, x_mask=None, n_sqz=2):
			
 
				-    b, c, t = x.size()
			
 
				-
			
 
				-    t = (t // n_sqz) * n_sqz
			
 
				-    x = x[:, :, :t]
			
 
				-    x_sqz = x.view(b, c, t // n_sqz, n_sqz)
			
 
				-    x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz)
			
 
				-
			
 
				-    if x_mask is not None:
			
 
				-        x_mask = x_mask[:, :, n_sqz - 1 :: n_sqz]
			
 
				-    else:
			
 
				-        x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype)
			
 
				-    return x_sqz * x_mask, x_mask
			
 
				-
			
 
				-
			
 
				-def unsqueeze(x, x_mask=None, n_sqz=2):
			
 
				-    b, c, t = x.size()
			
 
				-
			
 
				-    x_unsqz = x.view(b, n_sqz, c // n_sqz, t)
			
 
				-    x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz)
			
 
				-
			
 
				-    if x_mask is not None:
			
 
				-        x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz)
			
 
				-    else:
			
 
				-        x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype)
			
 
				-    return x_unsqz * x_mask, x_mask
			
--- a/fish_speech/models/vits_decoder/modules/models.py
+++ b/fish_speech/models/vits_decoder/modules/models.py
@@ -1,686 +0,0 @@
 
				-import torch
			
 
				-from torch import nn
			
 
				-from torch.nn import Conv1d, Conv2d, ConvTranspose1d
			
 
				-from torch.nn import functional as F
			
 
				-from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
			
 
				-
			
 
				-from fish_speech.models.vits_decoder.modules import attentions, commons, modules
			
 
				-
			
 
				-from .commons import get_padding, init_weights
			
 
				-from .mrte import MRTE
			
 
				-from .vq_encoder import VQEncoder
			
 
				-
			
 
				-
			
 
				-class TextEncoder(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        out_channels,
			
 
				-        hidden_channels,
			
 
				-        filter_channels,
			
 
				-        n_heads,
			
 
				-        n_layers,
			
 
				-        kernel_size,
			
 
				-        p_dropout,
			
 
				-        latent_channels=192,
			
 
				-        codebook_size=264,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-        self.out_channels = out_channels
			
 
				-        self.hidden_channels = hidden_channels
			
 
				-        self.filter_channels = filter_channels
			
 
				-        self.n_heads = n_heads
			
 
				-        self.n_layers = n_layers
			
 
				-        self.kernel_size = kernel_size
			
 
				-        self.p_dropout = p_dropout
			
 
				-        self.latent_channels = latent_channels
			
 
				-
			
 
				-        self.ssl_proj = nn.Conv1d(768, hidden_channels, 1)
			
 
				-
			
 
				-        self.encoder_ssl = attentions.Encoder(
			
 
				-            hidden_channels,
			
 
				-            filter_channels,
			
 
				-            n_heads,
			
 
				-            n_layers // 2,
			
 
				-            kernel_size,
			
 
				-            p_dropout,
			
 
				-        )
			
 
				-
			
 
				-        self.encoder_text = attentions.Encoder(
			
 
				-            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
			
 
				-        )
			
 
				-        self.text_embedding = nn.Embedding(codebook_size, hidden_channels)
			
 
				-
			
 
				-        self.mrte = MRTE()
			
 
				-
			
 
				-        self.encoder2 = attentions.Encoder(
			
 
				-            hidden_channels,
			
 
				-            filter_channels,
			
 
				-            n_heads,
			
 
				-            n_layers // 2,
			
 
				-            kernel_size,
			
 
				-            p_dropout,
			
 
				-        )
			
 
				-
			
 
				-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
			
 
				-
			
 
				-    def forward(self, y, y_lengths, text, text_lengths, ge):
			
 
				-        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(
			
 
				-            y.dtype
			
 
				-        )
			
 
				-
			
 
				-        y = self.ssl_proj(y * y_mask) * y_mask
			
 
				-
			
 
				-        y = self.encoder_ssl(y * y_mask, y_mask)
			
 
				-
			
 
				-        text_mask = torch.unsqueeze(
			
 
				-            commons.sequence_mask(text_lengths, text.size(1)), 1
			
 
				-        ).to(y.dtype)
			
 
				-        text = self.text_embedding(text).transpose(1, 2)
			
 
				-        text = self.encoder_text(text * text_mask, text_mask)
			
 
				-
			
 
				-        y = self.mrte(y, y_mask, text, text_mask, ge)
			
 
				-
			
 
				-        y = self.encoder2(y * y_mask, y_mask)
			
 
				-
			
 
				-        stats = self.proj(y) * y_mask
			
 
				-        m, logs = torch.split(stats, self.out_channels, dim=1)
			
 
				-        return y, m, logs, y_mask
			
 
				-
			
 
				-
			
 
				-class ResidualCouplingBlock(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        channels,
			
 
				-        hidden_channels,
			
 
				-        kernel_size,
			
 
				-        dilation_rate,
			
 
				-        n_layers,
			
 
				-        n_flows=4,
			
 
				-        gin_channels=0,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-        self.channels = channels
			
 
				-        self.hidden_channels = hidden_channels
			
 
				-        self.kernel_size = kernel_size
			
 
				-        self.dilation_rate = dilation_rate
			
 
				-        self.n_layers = n_layers
			
 
				-        self.n_flows = n_flows
			
 
				-        self.gin_channels = gin_channels
			
 
				-
			
 
				-        self.flows = nn.ModuleList()
			
 
				-        for i in range(n_flows):
			
 
				-            self.flows.append(
			
 
				-                modules.ResidualCouplingLayer(
			
 
				-                    channels,
			
 
				-                    hidden_channels,
			
 
				-                    kernel_size,
			
 
				-                    dilation_rate,
			
 
				-                    n_layers,
			
 
				-                    gin_channels=gin_channels,
			
 
				-                    mean_only=True,
			
 
				-                )
			
 
				-            )
			
 
				-            self.flows.append(modules.Flip())
			
 
				-
			
 
				-    def forward(self, x, x_mask, g=None, reverse=False):
			
 
				-        if not reverse:
			
 
				-            for flow in self.flows:
			
 
				-                x, _ = flow(x, x_mask, g=g, reverse=reverse)
			
 
				-        else:
			
 
				-            for flow in reversed(self.flows):
			
 
				-                x = flow(x, x_mask, g=g, reverse=reverse)
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-class PosteriorEncoder(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        in_channels,
			
 
				-        out_channels,
			
 
				-        hidden_channels,
			
 
				-        kernel_size,
			
 
				-        dilation_rate,
			
 
				-        n_layers,
			
 
				-        gin_channels=0,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-        self.in_channels = in_channels
			
 
				-        self.out_channels = out_channels
			
 
				-        self.hidden_channels = hidden_channels
			
 
				-        self.kernel_size = kernel_size
			
 
				-        self.dilation_rate = dilation_rate
			
 
				-        self.n_layers = n_layers
			
 
				-        self.gin_channels = gin_channels
			
 
				-
			
 
				-        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
			
 
				-        self.enc = modules.WN(
			
 
				-            hidden_channels,
			
 
				-            kernel_size,
			
 
				-            dilation_rate,
			
 
				-            n_layers,
			
 
				-            gin_channels=gin_channels,
			
 
				-        )
			
 
				-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
			
 
				-
			
 
				-    def forward(self, x, x_lengths, g=None):
			
 
				-        if g != None:
			
 
				-            g = g.detach()
			
 
				-        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
			
 
				-            x.dtype
			
 
				-        )
			
 
				-        x = self.pre(x) * x_mask
			
 
				-        x = self.enc(x, x_mask, g=g)
			
 
				-        stats = self.proj(x) * x_mask
			
 
				-        m, logs = torch.split(stats, self.out_channels, dim=1)
			
 
				-        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
			
 
				-        return z, m, logs, x_mask
			
 
				-
			
 
				-
			
 
				-class Generator(torch.nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        initial_channel,
			
 
				-        resblock,
			
 
				-        resblock_kernel_sizes,
			
 
				-        resblock_dilation_sizes,
			
 
				-        upsample_rates,
			
 
				-        upsample_initial_channel,
			
 
				-        upsample_kernel_sizes,
			
 
				-        gin_channels=0,
			
 
				-    ):
			
 
				-        super(Generator, self).__init__()
			
 
				-        self.num_kernels = len(resblock_kernel_sizes)
			
 
				-        self.num_upsamples = len(upsample_rates)
			
 
				-        self.conv_pre = Conv1d(
			
 
				-            initial_channel, upsample_initial_channel, 7, 1, padding=3
			
 
				-        )
			
 
				-        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
			
 
				-
			
 
				-        self.ups = nn.ModuleList()
			
 
				-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
			
 
				-            self.ups.append(
			
 
				-                weight_norm(
			
 
				-                    ConvTranspose1d(
			
 
				-                        upsample_initial_channel // (2**i),
			
 
				-                        upsample_initial_channel // (2 ** (i + 1)),
			
 
				-                        k,
			
 
				-                        u,
			
 
				-                        padding=(k - u) // 2,
			
 
				-                    )
			
 
				-                )
			
 
				-            )
			
 
				-
			
 
				-        self.resblocks = nn.ModuleList()
			
 
				-        for i in range(len(self.ups)):
			
 
				-            ch = upsample_initial_channel // (2 ** (i + 1))
			
 
				-            for j, (k, d) in enumerate(
			
 
				-                zip(resblock_kernel_sizes, resblock_dilation_sizes)
			
 
				-            ):
			
 
				-                self.resblocks.append(resblock(ch, k, d))
			
 
				-
			
 
				-        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
			
 
				-        self.ups.apply(init_weights)
			
 
				-
			
 
				-        if gin_channels != 0:
			
 
				-            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
			
 
				-
			
 
				-    def forward(self, x, g=None):
			
 
				-        x = self.conv_pre(x)
			
 
				-        if g is not None:
			
 
				-            x = x + self.cond(g)
			
 
				-
			
 
				-        for i in range(self.num_upsamples):
			
 
				-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
			
 
				-            x = self.ups[i](x)
			
 
				-            xs = None
			
 
				-            for j in range(self.num_kernels):
			
 
				-                if xs is None:
			
 
				-                    xs = self.resblocks[i * self.num_kernels + j](x)
			
 
				-                else:
			
 
				-                    xs += self.resblocks[i * self.num_kernels + j](x)
			
 
				-            x = xs / self.num_kernels
			
 
				-        x = F.leaky_relu(x)
			
 
				-        x = self.conv_post(x)
			
 
				-        x = torch.tanh(x)
			
 
				-
			
 
				-        return x
			
 
				-
			
 
				-    def remove_weight_norm(self):
			
 
				-        print("Removing weight norm...")
			
 
				-        for l in self.ups:
			
 
				-            remove_weight_norm(l)
			
 
				-        for l in self.resblocks:
			
 
				-            l.remove_weight_norm()
			
 
				-
			
 
				-
			
 
				-class DiscriminatorP(torch.nn.Module):
			
 
				-    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
			
 
				-        super(DiscriminatorP, self).__init__()
			
 
				-        self.period = period
			
 
				-        self.use_spectral_norm = use_spectral_norm
			
 
				-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
			
 
				-        self.convs = nn.ModuleList(
			
 
				-            [
			
 
				-                norm_f(
			
 
				-                    Conv2d(
			
 
				-                        1,
			
 
				-                        32,
			
 
				-                        (kernel_size, 1),
			
 
				-                        (stride, 1),
			
 
				-                        padding=(get_padding(kernel_size, 1), 0),
			
 
				-                    )
			
 
				-                ),
			
 
				-                norm_f(
			
 
				-                    Conv2d(
			
 
				-                        32,
			
 
				-                        128,
			
 
				-                        (kernel_size, 1),
			
 
				-                        (stride, 1),
			
 
				-                        padding=(get_padding(kernel_size, 1), 0),
			
 
				-                    )
			
 
				-                ),
			
 
				-                norm_f(
			
 
				-                    Conv2d(
			
 
				-                        128,
			
 
				-                        512,
			
 
				-                        (kernel_size, 1),
			
 
				-                        (stride, 1),
			
 
				-                        padding=(get_padding(kernel_size, 1), 0),
			
 
				-                    )
			
 
				-                ),
			
 
				-                norm_f(
			
 
				-                    Conv2d(
			
 
				-                        512,
			
 
				-                        1024,
			
 
				-                        (kernel_size, 1),
			
 
				-                        (stride, 1),
			
 
				-                        padding=(get_padding(kernel_size, 1), 0),
			
 
				-                    )
			
 
				-                ),
			
 
				-                norm_f(
			
 
				-                    Conv2d(
			
 
				-                        1024,
			
 
				-                        1024,
			
 
				-                        (kernel_size, 1),
			
 
				-                        1,
			
 
				-                        padding=(get_padding(kernel_size, 1), 0),
			
 
				-                    )
			
 
				-                ),
			
 
				-            ]
			
 
				-        )
			
 
				-        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        fmap = []
			
 
				-
			
 
				-        # 1d to 2d
			
 
				-        b, c, t = x.shape
			
 
				-        if t % self.period != 0:  # pad first
			
 
				-            n_pad = self.period - (t % self.period)
			
 
				-            x = F.pad(x, (0, n_pad), "reflect")
			
 
				-            t = t + n_pad
			
 
				-        x = x.view(b, c, t // self.period, self.period)
			
 
				-
			
 
				-        for l in self.convs:
			
 
				-            x = l(x)
			
 
				-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
			
 
				-            fmap.append(x)
			
 
				-        x = self.conv_post(x)
			
 
				-        fmap.append(x)
			
 
				-        x = torch.flatten(x, 1, -1)
			
 
				-
			
 
				-        return x, fmap
			
 
				-
			
 
				-
			
 
				-class DiscriminatorS(torch.nn.Module):
			
 
				-    def __init__(self, use_spectral_norm=False):
			
 
				-        super(DiscriminatorS, self).__init__()
			
 
				-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
			
 
				-        self.convs = nn.ModuleList(
			
 
				-            [
			
 
				-                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
			
 
				-                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
			
 
				-                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
			
 
				-                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
			
 
				-                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
			
 
				-                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
			
 
				-            ]
			
 
				-        )
			
 
				-        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        fmap = []
			
 
				-
			
 
				-        for l in self.convs:
			
 
				-            x = l(x)
			
 
				-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
			
 
				-            fmap.append(x)
			
 
				-        x = self.conv_post(x)
			
 
				-        fmap.append(x)
			
 
				-        x = torch.flatten(x, 1, -1)
			
 
				-
			
 
				-        return x, fmap
			
 
				-
			
 
				-
			
 
				-class EnsembledDiscriminator(torch.nn.Module):
			
 
				-    def __init__(self, periods=(2, 3, 5, 7, 11), use_spectral_norm=False):
			
 
				-        super().__init__()
			
 
				-        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
			
 
				-        discs = discs + [
			
 
				-            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
			
 
				-        ]
			
 
				-        self.discriminators = nn.ModuleList(discs)
			
 
				-
			
 
				-    def forward(self, y, y_hat):
			
 
				-        y_d_rs = []
			
 
				-        y_d_gs = []
			
 
				-        fmap_rs = []
			
 
				-        fmap_gs = []
			
 
				-        for i, d in enumerate(self.discriminators):
			
 
				-            y_d_r, fmap_r = d(y)
			
 
				-            y_d_g, fmap_g = d(y_hat)
			
 
				-            y_d_rs.append(y_d_r)
			
 
				-            y_d_gs.append(y_d_g)
			
 
				-            fmap_rs.append(fmap_r)
			
 
				-            fmap_gs.append(fmap_g)
			
 
				-
			
 
				-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
			
 
				-
			
 
				-
			
 
				-class SynthesizerTrn(nn.Module):
			
 
				-    """
			
 
				-    Synthesizer for Training
			
 
				-    """
			
 
				-
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        *,
			
 
				-        spec_channels,
			
 
				-        segment_size,
			
 
				-        inter_channels,
			
 
				-        hidden_channels,
			
 
				-        filter_channels,
			
 
				-        n_heads,
			
 
				-        n_layers,
			
 
				-        kernel_size,
			
 
				-        p_dropout,
			
 
				-        resblock,
			
 
				-        resblock_kernel_sizes,
			
 
				-        resblock_dilation_sizes,
			
 
				-        upsample_rates,
			
 
				-        upsample_initial_channel,
			
 
				-        upsample_kernel_sizes,
			
 
				-        gin_channels=0,
			
 
				-        codebook_size=264,
			
 
				-        vq_mask_ratio=0.0,
			
 
				-        ref_mask_ratio=0.0,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-
			
 
				-        self.spec_channels = spec_channels
			
 
				-        self.inter_channels = inter_channels
			
 
				-        self.hidden_channels = hidden_channels
			
 
				-        self.filter_channels = filter_channels
			
 
				-        self.n_heads = n_heads
			
 
				-        self.n_layers = n_layers
			
 
				-        self.kernel_size = kernel_size
			
 
				-        self.p_dropout = p_dropout
			
 
				-        self.resblock = resblock
			
 
				-        self.resblock_kernel_sizes = resblock_kernel_sizes
			
 
				-        self.resblock_dilation_sizes = resblock_dilation_sizes
			
 
				-        self.upsample_rates = upsample_rates
			
 
				-        self.upsample_initial_channel = upsample_initial_channel
			
 
				-        self.upsample_kernel_sizes = upsample_kernel_sizes
			
 
				-        self.segment_size = segment_size
			
 
				-        self.gin_channels = gin_channels
			
 
				-        self.vq_mask_ratio = vq_mask_ratio
			
 
				-        self.ref_mask_ratio = ref_mask_ratio
			
 
				-
			
 
				-        self.enc_p = TextEncoder(
			
 
				-            inter_channels,
			
 
				-            hidden_channels,
			
 
				-            filter_channels,
			
 
				-            n_heads,
			
 
				-            n_layers,
			
 
				-            kernel_size,
			
 
				-            p_dropout,
			
 
				-            codebook_size=codebook_size,
			
 
				-        )
			
 
				-        self.dec = Generator(
			
 
				-            inter_channels,
			
 
				-            resblock,
			
 
				-            resblock_kernel_sizes,
			
 
				-            resblock_dilation_sizes,
			
 
				-            upsample_rates,
			
 
				-            upsample_initial_channel,
			
 
				-            upsample_kernel_sizes,
			
 
				-            gin_channels=gin_channels,
			
 
				-        )
			
 
				-        self.enc_q = PosteriorEncoder(
			
 
				-            spec_channels,
			
 
				-            inter_channels,
			
 
				-            hidden_channels,
			
 
				-            5,
			
 
				-            1,
			
 
				-            16,
			
 
				-            gin_channels=gin_channels,
			
 
				-        )
			
 
				-        self.flow = ResidualCouplingBlock(
			
 
				-            inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
			
 
				-        )
			
 
				-
			
 
				-        self.ref_enc = modules.MelStyleEncoder(
			
 
				-            spec_channels, style_vector_dim=gin_channels
			
 
				-        )
			
 
				-
			
 
				-        self.vq = VQEncoder()
			
 
				-        for param in self.vq.parameters():
			
 
				-            param.requires_grad = False
			
 
				-
			
 
				-    def forward(
			
 
				-        self, audio, audio_lengths, gt_specs, gt_spec_lengths, text, text_lengths
			
 
				-    ):
			
 
				-        y_mask = torch.unsqueeze(
			
 
				-            commons.sequence_mask(gt_spec_lengths, gt_specs.size(2)), 1
			
 
				-        ).to(gt_specs.dtype)
			
 
				-        ge = self.ref_enc(gt_specs * y_mask, y_mask)
			
 
				-
			
 
				-        if self.training and self.ref_mask_ratio > 0:
			
 
				-            bs = audio.size(0)
			
 
				-            mask_speaker_len = int(bs * self.ref_mask_ratio)
			
 
				-            mask_indices = torch.randperm(bs)[:mask_speaker_len]
			
 
				-            audio[mask_indices] = 0
			
 
				-
			
 
				-        quantized = self.vq(audio, audio_lengths)
			
 
				-
			
 
				-        # Block masking, block_size = 4
			
 
				-        block_size = 4
			
 
				-        if self.training and self.vq_mask_ratio > 0:
			
 
				-            reduced_length = quantized.size(-1) // block_size
			
 
				-            mask_length = int(reduced_length * self.vq_mask_ratio)
			
 
				-            mask_indices = torch.randperm(reduced_length)[:mask_length]
			
 
				-            short_mask = torch.zeros(
			
 
				-                quantized.size(0),
			
 
				-                quantized.size(1),
			
 
				-                reduced_length,
			
 
				-                device=quantized.device,
			
 
				-                dtype=torch.float,
			
 
				-            )
			
 
				-            short_mask[:, :, mask_indices] = 1.0
			
 
				-            long_mask = short_mask.repeat_interleave(block_size, dim=-1)
			
 
				-            long_mask = F.interpolate(
			
 
				-                long_mask, size=quantized.size(-1), mode="nearest"
			
 
				-            )
			
 
				-            quantized = quantized.masked_fill(long_mask > 0.5, 0)
			
 
				-
			
 
				-        x, m_p, logs_p, y_mask = self.enc_p(
			
 
				-            quantized, gt_spec_lengths, text, text_lengths, ge
			
 
				-        )
			
 
				-        z, m_q, logs_q, y_mask = self.enc_q(gt_specs, gt_spec_lengths, g=ge)
			
 
				-        z_p = self.flow(z, y_mask, g=ge)
			
 
				-
			
 
				-        z_slice, ids_slice = commons.rand_slice_segments(
			
 
				-            z, gt_spec_lengths, self.segment_size
			
 
				-        )
			
 
				-        o = self.dec(z_slice, g=ge)
			
 
				-
			
 
				-        return (
			
 
				-            o,
			
 
				-            ids_slice,
			
 
				-            y_mask,
			
 
				-            (z, z_p, m_p, logs_p, m_q, logs_q),
			
 
				-        )
			
 
				-
			
 
				-    @torch.no_grad()
			
 
				-    def infer(
			
 
				-        self,
			
 
				-        audio,
			
 
				-        audio_lengths,
			
 
				-        gt_specs,
			
 
				-        gt_spec_lengths,
			
 
				-        text,
			
 
				-        text_lengths,
			
 
				-        noise_scale=0.5,
			
 
				-    ):
			
 
				-        quantized = self.vq(audio, audio_lengths)
			
 
				-        quantized_lengths = audio_lengths // 512
			
 
				-        ge = self.encode_ref(gt_specs, gt_spec_lengths)
			
 
				-
			
 
				-        return self.decode(
			
 
				-            quantized,
			
 
				-            quantized_lengths,
			
 
				-            text,
			
 
				-            text_lengths,
			
 
				-            noise_scale=noise_scale,
			
 
				-            ge=ge,
			
 
				-        )
			
 
				-
			
 
				-    @torch.no_grad()
			
 
				-    def infer_posterior(
			
 
				-        self,
			
 
				-        gt_specs,
			
 
				-        gt_spec_lengths,
			
 
				-    ):
			
 
				-        y_mask = torch.unsqueeze(
			
 
				-            commons.sequence_mask(gt_spec_lengths, gt_specs.size(2)), 1
			
 
				-        ).to(gt_specs.dtype)
			
 
				-        ge = self.ref_enc(gt_specs * y_mask, y_mask)
			
 
				-        z, m_q, logs_q, y_mask = self.enc_q(gt_specs, gt_spec_lengths, g=ge)
			
 
				-        o = self.dec(z * y_mask, g=ge)
			
 
				-
			
 
				-        return o
			
 
				-
			
 
				-    @torch.no_grad()
			
 
				-    def decode(
			
 
				-        self,
			
 
				-        quantized,
			
 
				-        quantized_lengths,
			
 
				-        text,
			
 
				-        text_lengths,
			
 
				-        noise_scale=0.5,
			
 
				-        ge=None,
			
 
				-    ):
			
 
				-        x, m_p, logs_p, y_mask = self.enc_p(
			
 
				-            quantized, quantized_lengths, text, text_lengths, ge
			
 
				-        )
			
 
				-        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
			
 
				-
			
 
				-        z = self.flow(z_p, y_mask, g=ge, reverse=True)
			
 
				-
			
 
				-        o = self.dec(z * y_mask, g=ge)
			
 
				-
			
 
				-        return o
			
 
				-
			
 
				-    @torch.no_grad()
			
 
				-    def encode_ref(self, gt_specs, gt_spec_lengths):
			
 
				-        y_mask = torch.unsqueeze(
			
 
				-            commons.sequence_mask(gt_spec_lengths, gt_specs.size(2)), 1
			
 
				-        ).to(gt_specs.dtype)
			
 
				-        ge = self.ref_enc(gt_specs * y_mask, y_mask)
			
 
				-
			
 
				-        return ge
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    import librosa
			
 
				-    from transformers import AutoTokenizer
			
 
				-
			
 
				-    from fish_speech.utils.spectrogram import LinearSpectrogram
			
 
				-
			
 
				-    model = SynthesizerTrn(
			
 
				-        spec_channels=1025,
			
 
				-        segment_size=20480 // 640,
			
 
				-        inter_channels=192,
			
 
				-        hidden_channels=192,
			
 
				-        filter_channels=768,
			
 
				-        n_heads=2,
			
 
				-        n_layers=6,
			
 
				-        kernel_size=3,
			
 
				-        p_dropout=0.1,
			
 
				-        resblock="1",
			
 
				-        resblock_kernel_sizes=[3, 7, 11],
			
 
				-        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
			
 
				-        upsample_rates=[8, 8, 2, 2, 2],
			
 
				-        upsample_initial_channel=512,
			
 
				-        upsample_kernel_sizes=[16, 16, 8, 2, 2],
			
 
				-        gin_channels=512,
			
 
				-    )
			
 
				-
			
 
				-    ckpt = "checkpoints/Bert-VITS2/G_0.pth"
			
 
				-    # Try to load the model
			
 
				-    print(f"Loading model from {ckpt}")
			
 
				-    checkpoint = torch.load(ckpt, map_location="cpu", weights_only=True)["model"]
			
 
				-    # d_checkpoint = torch.load(
			
 
				-    #     "checkpoints/Bert-VITS2/D_0.pth", map_location="cpu", weights_only=True
			
 
				-    # )["model"]
			
 
				-    # print(checkpoint.keys())
			
 
				-
			
 
				-    checkpoint.pop("dec.cond.weight")
			
 
				-    checkpoint.pop("enc_q.enc.cond_layer.weight_v")
			
 
				-
			
 
				-    # new_checkpoint = {}
			
 
				-    # for k, v in checkpoint.items():
			
 
				-    #     new_checkpoint["generator." + k] = v
			
 
				-
			
 
				-    # for k, v in d_checkpoint.items():
			
 
				-    #     new_checkpoint["discriminator." + k] = v
			
 
				-
			
 
				-    # torch.save(new_checkpoint, "checkpoints/Bert-VITS2/ensemble.pth")
			
 
				-    # exit()
			
 
				-
			
 
				-    print(model.load_state_dict(checkpoint, strict=False))
			
 
				-
			
 
				-    # Test
			
 
				-
			
 
				-    ref_audio = librosa.load(
			
 
				-        "data/source/云天河/云天河-旁白/《薄太太》第0025集-yth_24.wav", sr=32000
			
 
				-    )[0]
			
 
				-    input_audio = librosa.load(
			
 
				-        "data/source/云天河/云天河-旁白/《薄太太》第0025集-yth_24.wav", sr=32000
			
 
				-    )[0]
			
 
				-    ref_audio = input_audio
			
 
				-    text = "博兴只知道身边的小女人没睡着，他又凑到她耳边压低了声线。阮苏眉睁眼，不觉得你老公像英雄吗？阮苏还是没反应，这男人是不是有病？刚才那冰冷又强势的样子，和现在这幼稚无赖的样子，根本就判若二人。"
			
 
				-    encoded_text = AutoTokenizer.from_pretrained("fishaudio/fish-speech-1")
			
 
				-    spec = LinearSpectrogram(n_fft=2048, hop_length=640, win_length=2048)
			
 
				-
			
 
				-    ref_audio = torch.tensor(ref_audio).unsqueeze(0).unsqueeze(0)
			
 
				-    ref_spec = spec(ref_audio)
			
 
				-
			
 
				-    input_audio = torch.tensor(input_audio).unsqueeze(0).unsqueeze(0)
			
 
				-    text = encoded_text(text, return_tensors="pt")["input_ids"]
			
 
				-    print(ref_audio.size(), ref_spec.size(), input_audio.size(), text.size())
			
 
				-
			
 
				-    o, y_mask, (z, z_p, m_p, logs_p) = model.infer(
			
 
				-        input_audio,
			
 
				-        torch.LongTensor([input_audio.size(2)]),
			
 
				-        ref_spec,
			
 
				-        torch.LongTensor([ref_spec.size(2)]),
			
 
				-        text,
			
 
				-        torch.LongTensor([text.size(1)]),
			
 
				-    )
			
 
				-    print(o.size(), y_mask.size(), z.size(), z_p.size(), m_p.size(), logs_p.size())
			
 
				-
			
 
				-    # Save output
			
 
				-    # import soundfile as sf
			
 
				-
			
 
				-    # sf.write("output.wav", o.squeeze().detach().numpy(), 32000)
			
--- a/fish_speech/models/vits_decoder/modules/modules.py
+++ b/fish_speech/models/vits_decoder/modules/modules.py
@@ -1,619 +0,0 @@
 
				-import numpy as np
			
 
				-import torch
			
 
				-from torch import nn
			
 
				-from torch.nn import Conv1d
			
 
				-from torch.nn import functional as F
			
 
				-from torch.nn.utils import remove_weight_norm, weight_norm
			
 
				-
			
 
				-from .commons import fused_add_tanh_sigmoid_multiply, get_padding, init_weights
			
 
				-
			
 
				-LRELU_SLOPE = 0.1
			
 
				-
			
 
				-
			
 
				-class LayerNorm(nn.Module):
			
 
				-    def __init__(self, channels, eps=1e-5):
			
 
				-        super().__init__()
			
 
				-        self.channels = channels
			
 
				-        self.eps = eps
			
 
				-
			
 
				-        self.gamma = nn.Parameter(torch.ones(channels))
			
 
				-        self.beta = nn.Parameter(torch.zeros(channels))
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        x = x.transpose(1, -1)
			
 
				-        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
			
 
				-        return x.transpose(1, -1)
			
 
				-
			
 
				-
			
 
				-class ConvReluNorm(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        in_channels,
			
 
				-        hidden_channels,
			
 
				-        out_channels,
			
 
				-        kernel_size,
			
 
				-        n_layers,
			
 
				-        p_dropout,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-        self.in_channels = in_channels
			
 
				-        self.hidden_channels = hidden_channels
			
 
				-        self.out_channels = out_channels
			
 
				-        self.kernel_size = kernel_size
			
 
				-        self.n_layers = n_layers
			
 
				-        self.p_dropout = p_dropout
			
 
				-        assert n_layers > 1, "Number of layers should be larger than 0."
			
 
				-
			
 
				-        self.conv_layers = nn.ModuleList()
			
 
				-        self.norm_layers = nn.ModuleList()
			
 
				-        self.conv_layers.append(
			
 
				-            nn.Conv1d(
			
 
				-                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
			
 
				-            )
			
 
				-        )
			
 
				-        self.norm_layers.append(LayerNorm(hidden_channels))
			
 
				-        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
			
 
				-        for _ in range(n_layers - 1):
			
 
				-            self.conv_layers.append(
			
 
				-                nn.Conv1d(
			
 
				-                    hidden_channels,
			
 
				-                    hidden_channels,
			
 
				-                    kernel_size,
			
 
				-                    padding=kernel_size // 2,
			
 
				-                )
			
 
				-            )
			
 
				-            self.norm_layers.append(LayerNorm(hidden_channels))
			
 
				-        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
			
 
				-        self.proj.weight.data.zero_()
			
 
				-        self.proj.bias.data.zero_()
			
 
				-
			
 
				-    def forward(self, x, x_mask):
			
 
				-        x_org = x
			
 
				-        for i in range(self.n_layers):
			
 
				-            x = self.conv_layers[i](x * x_mask)
			
 
				-            x = self.norm_layers[i](x)
			
 
				-            x = self.relu_drop(x)
			
 
				-        x = x_org + self.proj(x)
			
 
				-        return x * x_mask
			
 
				-
			
 
				-
			
 
				-class WN(torch.nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        hidden_channels,
			
 
				-        kernel_size,
			
 
				-        dilation_rate,
			
 
				-        n_layers,
			
 
				-        gin_channels=0,
			
 
				-        p_dropout=0,
			
 
				-    ):
			
 
				-        super(WN, self).__init__()
			
 
				-        assert kernel_size % 2 == 1
			
 
				-        self.hidden_channels = hidden_channels
			
 
				-        self.kernel_size = (kernel_size,)
			
 
				-        self.dilation_rate = dilation_rate
			
 
				-        self.n_layers = n_layers
			
 
				-        self.gin_channels = gin_channels
			
 
				-        self.p_dropout = p_dropout
			
 
				-
			
 
				-        self.in_layers = torch.nn.ModuleList()
			
 
				-        self.res_skip_layers = torch.nn.ModuleList()
			
 
				-        self.drop = nn.Dropout(p_dropout)
			
 
				-
			
 
				-        if gin_channels != 0:
			
 
				-            cond_layer = torch.nn.Conv1d(
			
 
				-                gin_channels, 2 * hidden_channels * n_layers, 1
			
 
				-            )
			
 
				-            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
			
 
				-
			
 
				-        for i in range(n_layers):
			
 
				-            dilation = dilation_rate**i
			
 
				-            padding = int((kernel_size * dilation - dilation) / 2)
			
 
				-            in_layer = torch.nn.Conv1d(
			
 
				-                hidden_channels,
			
 
				-                2 * hidden_channels,
			
 
				-                kernel_size,
			
 
				-                dilation=dilation,
			
 
				-                padding=padding,
			
 
				-            )
			
 
				-            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
			
 
				-            self.in_layers.append(in_layer)
			
 
				-
			
 
				-            # last one is not necessary
			
 
				-            if i < n_layers - 1:
			
 
				-                res_skip_channels = 2 * hidden_channels
			
 
				-            else:
			
 
				-                res_skip_channels = hidden_channels
			
 
				-
			
 
				-            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
			
 
				-            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
			
 
				-            self.res_skip_layers.append(res_skip_layer)
			
 
				-
			
 
				-    def forward(self, x, x_mask, g=None, **kwargs):
			
 
				-        output = torch.zeros_like(x)
			
 
				-        n_channels_tensor = torch.IntTensor([self.hidden_channels])
			
 
				-
			
 
				-        if g is not None:
			
 
				-            g = self.cond_layer(g)
			
 
				-
			
 
				-        for i in range(self.n_layers):
			
 
				-            x_in = self.in_layers[i](x)
			
 
				-            if g is not None:
			
 
				-                cond_offset = i * 2 * self.hidden_channels
			
 
				-                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
			
 
				-            else:
			
 
				-                g_l = torch.zeros_like(x_in)
			
 
				-
			
 
				-            acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
			
 
				-            acts = self.drop(acts)
			
 
				-
			
 
				-            res_skip_acts = self.res_skip_layers[i](acts)
			
 
				-            if i < self.n_layers - 1:
			
 
				-                res_acts = res_skip_acts[:, : self.hidden_channels, :]
			
 
				-                x = (x + res_acts) * x_mask
			
 
				-                output = output + res_skip_acts[:, self.hidden_channels :, :]
			
 
				-            else:
			
 
				-                output = output + res_skip_acts
			
 
				-        return output * x_mask
			
 
				-
			
 
				-    def remove_weight_norm(self):
			
 
				-        if self.gin_channels != 0:
			
 
				-            torch.nn.utils.remove_weight_norm(self.cond_layer)
			
 
				-        for l in self.in_layers:
			
 
				-            torch.nn.utils.remove_weight_norm(l)
			
 
				-        for l in self.res_skip_layers:
			
 
				-            torch.nn.utils.remove_weight_norm(l)
			
 
				-
			
 
				-
			
 
				-class ResBlock1(torch.nn.Module):
			
 
				-    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
			
 
				-        super(ResBlock1, self).__init__()
			
 
				-        self.convs1 = nn.ModuleList(
			
 
				-            [
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=dilation[0],
			
 
				-                        padding=get_padding(kernel_size, dilation[0]),
			
 
				-                    )
			
 
				-                ),
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=dilation[1],
			
 
				-                        padding=get_padding(kernel_size, dilation[1]),
			
 
				-                    )
			
 
				-                ),
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=dilation[2],
			
 
				-                        padding=get_padding(kernel_size, dilation[2]),
			
 
				-                    )
			
 
				-                ),
			
 
				-            ]
			
 
				-        )
			
 
				-        self.convs1.apply(init_weights)
			
 
				-
			
 
				-        self.convs2 = nn.ModuleList(
			
 
				-            [
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=1,
			
 
				-                        padding=get_padding(kernel_size, 1),
			
 
				-                    )
			
 
				-                ),
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=1,
			
 
				-                        padding=get_padding(kernel_size, 1),
			
 
				-                    )
			
 
				-                ),
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=1,
			
 
				-                        padding=get_padding(kernel_size, 1),
			
 
				-                    )
			
 
				-                ),
			
 
				-            ]
			
 
				-        )
			
 
				-        self.convs2.apply(init_weights)
			
 
				-
			
 
				-    def forward(self, x, x_mask=None):
			
 
				-        for c1, c2 in zip(self.convs1, self.convs2):
			
 
				-            xt = F.leaky_relu(x, LRELU_SLOPE)
			
 
				-            if x_mask is not None:
			
 
				-                xt = xt * x_mask
			
 
				-            xt = c1(xt)
			
 
				-            xt = F.leaky_relu(xt, LRELU_SLOPE)
			
 
				-            if x_mask is not None:
			
 
				-                xt = xt * x_mask
			
 
				-            xt = c2(xt)
			
 
				-            x = xt + x
			
 
				-        if x_mask is not None:
			
 
				-            x = x * x_mask
			
 
				-        return x
			
 
				-
			
 
				-    def remove_weight_norm(self):
			
 
				-        for l in self.convs1:
			
 
				-            remove_weight_norm(l)
			
 
				-        for l in self.convs2:
			
 
				-            remove_weight_norm(l)
			
 
				-
			
 
				-
			
 
				-class ResBlock2(torch.nn.Module):
			
 
				-    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
			
 
				-        super(ResBlock2, self).__init__()
			
 
				-        self.convs = nn.ModuleList(
			
 
				-            [
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=dilation[0],
			
 
				-                        padding=get_padding(kernel_size, dilation[0]),
			
 
				-                    )
			
 
				-                ),
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=dilation[1],
			
 
				-                        padding=get_padding(kernel_size, dilation[1]),
			
 
				-                    )
			
 
				-                ),
			
 
				-            ]
			
 
				-        )
			
 
				-        self.convs.apply(init_weights)
			
 
				-
			
 
				-    def forward(self, x, x_mask=None):
			
 
				-        for c in self.convs:
			
 
				-            xt = F.leaky_relu(x, LRELU_SLOPE)
			
 
				-            if x_mask is not None:
			
 
				-                xt = xt * x_mask
			
 
				-            xt = c(xt)
			
 
				-            x = xt + x
			
 
				-        if x_mask is not None:
			
 
				-            x = x * x_mask
			
 
				-        return x
			
 
				-
			
 
				-    def remove_weight_norm(self):
			
 
				-        for l in self.convs:
			
 
				-            remove_weight_norm(l)
			
 
				-
			
 
				-
			
 
				-class Flip(nn.Module):
			
 
				-    def forward(self, x, *args, reverse=False, **kwargs):
			
 
				-        x = torch.flip(x, [1])
			
 
				-        if not reverse:
			
 
				-            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
			
 
				-            return x, logdet
			
 
				-        else:
			
 
				-            return x
			
 
				-
			
 
				-
			
 
				-class ResidualCouplingLayer(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        channels,
			
 
				-        hidden_channels,
			
 
				-        kernel_size,
			
 
				-        dilation_rate,
			
 
				-        n_layers,
			
 
				-        p_dropout=0,
			
 
				-        gin_channels=0,
			
 
				-        mean_only=False,
			
 
				-    ):
			
 
				-        assert channels % 2 == 0, "channels should be divisible by 2"
			
 
				-        super().__init__()
			
 
				-        self.channels = channels
			
 
				-        self.hidden_channels = hidden_channels
			
 
				-        self.kernel_size = kernel_size
			
 
				-        self.dilation_rate = dilation_rate
			
 
				-        self.n_layers = n_layers
			
 
				-        self.half_channels = channels // 2
			
 
				-        self.mean_only = mean_only
			
 
				-
			
 
				-        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
			
 
				-        self.enc = WN(
			
 
				-            hidden_channels,
			
 
				-            kernel_size,
			
 
				-            dilation_rate,
			
 
				-            n_layers,
			
 
				-            p_dropout=p_dropout,
			
 
				-            gin_channels=gin_channels,
			
 
				-        )
			
 
				-        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
			
 
				-        self.post.weight.data.zero_()
			
 
				-        self.post.bias.data.zero_()
			
 
				-
			
 
				-    def forward(self, x, x_mask, g=None, reverse=False):
			
 
				-        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
			
 
				-        h = self.pre(x0) * x_mask
			
 
				-        h = self.enc(h, x_mask, g=g)
			
 
				-        stats = self.post(h) * x_mask
			
 
				-        if not self.mean_only:
			
 
				-            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
			
 
				-        else:
			
 
				-            m = stats
			
 
				-            logs = torch.zeros_like(m)
			
 
				-
			
 
				-        if not reverse:
			
 
				-            x1 = m + x1 * torch.exp(logs) * x_mask
			
 
				-            x = torch.cat([x0, x1], 1)
			
 
				-            logdet = torch.sum(logs, [1, 2])
			
 
				-            return x, logdet
			
 
				-        else:
			
 
				-            x1 = (x1 - m) * torch.exp(-logs) * x_mask
			
 
				-            x = torch.cat([x0, x1], 1)
			
 
				-            return x
			
 
				-
			
 
				-
			
 
				-class LinearNorm(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        in_channels,
			
 
				-        out_channels,
			
 
				-        bias=True,
			
 
				-        spectral_norm=False,
			
 
				-    ):
			
 
				-        super(LinearNorm, self).__init__()
			
 
				-        self.fc = nn.Linear(in_channels, out_channels, bias)
			
 
				-
			
 
				-        if spectral_norm:
			
 
				-            self.fc = nn.utils.spectral_norm(self.fc)
			
 
				-
			
 
				-    def forward(self, input):
			
 
				-        out = self.fc(input)
			
 
				-        return out
			
 
				-
			
 
				-
			
 
				-class Mish(nn.Module):
			
 
				-    def __init__(self):
			
 
				-        super(Mish, self).__init__()
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        return x * torch.tanh(F.softplus(x))
			
 
				-
			
 
				-
			
 
				-class Conv1dGLU(nn.Module):
			
 
				-    """
			
 
				-    Conv1d + GLU(Gated Linear Unit) with residual connection.
			
 
				-    For GLU refer to https://arxiv.org/abs/1612.08083 paper.
			
 
				-    """
			
 
				-
			
 
				-    def __init__(self, in_channels, out_channels, kernel_size, dropout):
			
 
				-        super(Conv1dGLU, self).__init__()
			
 
				-        self.out_channels = out_channels
			
 
				-        self.conv1 = ConvNorm(in_channels, 2 * out_channels, kernel_size=kernel_size)
			
 
				-        self.dropout = nn.Dropout(dropout)
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        residual = x
			
 
				-        x = self.conv1(x)
			
 
				-        x1, x2 = torch.split(x, split_size_or_sections=self.out_channels, dim=1)
			
 
				-        x = x1 * torch.sigmoid(x2)
			
 
				-        x = residual + self.dropout(x)
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-class ConvNorm(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        in_channels,
			
 
				-        out_channels,
			
 
				-        kernel_size=1,
			
 
				-        stride=1,
			
 
				-        padding=None,
			
 
				-        dilation=1,
			
 
				-        bias=True,
			
 
				-        spectral_norm=False,
			
 
				-    ):
			
 
				-        super(ConvNorm, self).__init__()
			
 
				-
			
 
				-        if padding is None:
			
 
				-            assert kernel_size % 2 == 1
			
 
				-            padding = int(dilation * (kernel_size - 1) / 2)
			
 
				-
			
 
				-        self.conv = torch.nn.Conv1d(
			
 
				-            in_channels,
			
 
				-            out_channels,
			
 
				-            kernel_size=kernel_size,
			
 
				-            stride=stride,
			
 
				-            padding=padding,
			
 
				-            dilation=dilation,
			
 
				-            bias=bias,
			
 
				-        )
			
 
				-
			
 
				-        if spectral_norm:
			
 
				-            self.conv = nn.utils.spectral_norm(self.conv)
			
 
				-
			
 
				-    def forward(self, input):
			
 
				-        out = self.conv(input)
			
 
				-        return out
			
 
				-
			
 
				-
			
 
				-class MultiHeadAttention(nn.Module):
			
 
				-    """Multi-Head Attention module"""
			
 
				-
			
 
				-    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.0, spectral_norm=False):
			
 
				-        super().__init__()
			
 
				-
			
 
				-        self.n_head = n_head
			
 
				-        self.d_k = d_k
			
 
				-        self.d_v = d_v
			
 
				-
			
 
				-        self.w_qs = nn.Linear(d_model, n_head * d_k)
			
 
				-        self.w_ks = nn.Linear(d_model, n_head * d_k)
			
 
				-        self.w_vs = nn.Linear(d_model, n_head * d_v)
			
 
				-
			
 
				-        self.attention = ScaledDotProductAttention(
			
 
				-            temperature=np.power(d_model, 0.5), dropout=dropout
			
 
				-        )
			
 
				-
			
 
				-        self.fc = nn.Linear(n_head * d_v, d_model)
			
 
				-        self.dropout = nn.Dropout(dropout)
			
 
				-
			
 
				-        if spectral_norm:
			
 
				-            self.w_qs = nn.utils.spectral_norm(self.w_qs)
			
 
				-            self.w_ks = nn.utils.spectral_norm(self.w_ks)
			
 
				-            self.w_vs = nn.utils.spectral_norm(self.w_vs)
			
 
				-            self.fc = nn.utils.spectral_norm(self.fc)
			
 
				-
			
 
				-    def forward(self, x, mask=None):
			
 
				-        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
			
 
				-        sz_b, len_x, _ = x.size()
			
 
				-
			
 
				-        residual = x
			
 
				-
			
 
				-        q = self.w_qs(x).view(sz_b, len_x, n_head, d_k)
			
 
				-        k = self.w_ks(x).view(sz_b, len_x, n_head, d_k)
			
 
				-        v = self.w_vs(x).view(sz_b, len_x, n_head, d_v)
			
 
				-        q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_k)  # (n*b) x lq x dk
			
 
				-        k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_k)  # (n*b) x lk x dk
			
 
				-        v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_v)  # (n*b) x lv x dv
			
 
				-
			
 
				-        if mask is not None:
			
 
				-            slf_mask = mask.repeat(n_head, 1, 1)  # (n*b) x .. x ..
			
 
				-        else:
			
 
				-            slf_mask = None
			
 
				-        output, attn = self.attention(q, k, v, mask=slf_mask)
			
 
				-
			
 
				-        output = output.view(n_head, sz_b, len_x, d_v)
			
 
				-        output = (
			
 
				-            output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_x, -1)
			
 
				-        )  # b x lq x (n*dv)
			
 
				-
			
 
				-        output = self.fc(output)
			
 
				-
			
 
				-        output = self.dropout(output) + residual
			
 
				-        return output, attn
			
 
				-
			
 
				-
			
 
				-class ScaledDotProductAttention(nn.Module):
			
 
				-    """Scaled Dot-Product Attention"""
			
 
				-
			
 
				-    def __init__(self, temperature, dropout):
			
 
				-        super().__init__()
			
 
				-        self.temperature = temperature
			
 
				-        self.softmax = nn.Softmax(dim=2)
			
 
				-        self.dropout = nn.Dropout(dropout)
			
 
				-
			
 
				-    def forward(self, q, k, v, mask=None):
			
 
				-        attn = torch.bmm(q, k.transpose(1, 2))
			
 
				-        attn = attn / self.temperature
			
 
				-
			
 
				-        if mask is not None:
			
 
				-            attn = attn.masked_fill(mask, -np.inf)
			
 
				-
			
 
				-        attn = self.softmax(attn)
			
 
				-        p_attn = self.dropout(attn)
			
 
				-
			
 
				-        output = torch.bmm(p_attn, v)
			
 
				-        return output, attn
			
 
				-
			
 
				-
			
 
				-class MelStyleEncoder(nn.Module):
			
 
				-    """MelStyleEncoder"""
			
 
				-
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        n_mel_channels=80,
			
 
				-        style_hidden=128,
			
 
				-        style_vector_dim=256,
			
 
				-        style_kernel_size=5,
			
 
				-        style_head=2,
			
 
				-        dropout=0.1,
			
 
				-    ):
			
 
				-        super(MelStyleEncoder, self).__init__()
			
 
				-        self.in_dim = n_mel_channels
			
 
				-        self.hidden_dim = style_hidden
			
 
				-        self.out_dim = style_vector_dim
			
 
				-        self.kernel_size = style_kernel_size
			
 
				-        self.n_head = style_head
			
 
				-        self.dropout = dropout
			
 
				-
			
 
				-        self.spectral = nn.Sequential(
			
 
				-            LinearNorm(self.in_dim, self.hidden_dim),
			
 
				-            Mish(),
			
 
				-            nn.Dropout(self.dropout),
			
 
				-            LinearNorm(self.hidden_dim, self.hidden_dim),
			
 
				-            Mish(),
			
 
				-            nn.Dropout(self.dropout),
			
 
				-        )
			
 
				-
			
 
				-        self.temporal = nn.Sequential(
			
 
				-            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
			
 
				-            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
			
 
				-        )
			
 
				-
			
 
				-        self.slf_attn = MultiHeadAttention(
			
 
				-            self.n_head,
			
 
				-            self.hidden_dim,
			
 
				-            self.hidden_dim // self.n_head,
			
 
				-            self.hidden_dim // self.n_head,
			
 
				-            self.dropout,
			
 
				-        )
			
 
				-
			
 
				-        self.fc = LinearNorm(self.hidden_dim, self.out_dim)
			
 
				-
			
 
				-    def temporal_avg_pool(self, x, mask=None):
			
 
				-        if mask is None:
			
 
				-            out = torch.mean(x, dim=1)
			
 
				-        else:
			
 
				-            len_ = (~mask).sum(dim=1).unsqueeze(1)
			
 
				-            x = x.masked_fill(mask.unsqueeze(-1), 0)
			
 
				-            x = x.sum(dim=1)
			
 
				-            out = torch.div(x, len_)
			
 
				-        return out
			
 
				-
			
 
				-    def forward(self, x, mask=None):
			
 
				-        x = x.transpose(1, 2)
			
 
				-        if mask is not None:
			
 
				-            mask = (mask.int() == 0).squeeze(1)
			
 
				-        max_len = x.shape[1]
			
 
				-        slf_attn_mask = (
			
 
				-            mask.unsqueeze(1).expand(-1, max_len, -1) if mask is not None else None
			
 
				-        )
			
 
				-
			
 
				-        # spectral
			
 
				-        x = self.spectral(x)
			
 
				-        # temporal
			
 
				-        x = x.transpose(1, 2)
			
 
				-        x = self.temporal(x)
			
 
				-        x = x.transpose(1, 2)
			
 
				-        # self-attention
			
 
				-        if mask is not None:
			
 
				-            x = x.masked_fill(mask.unsqueeze(-1), 0)
			
 
				-        x, _ = self.slf_attn(x, mask=slf_attn_mask)
			
 
				-        # fc
			
 
				-        x = self.fc(x)
			
 
				-        # temoral average pooling
			
 
				-        w = self.temporal_avg_pool(x, mask=mask)
			
 
				-
			
 
				-        return w.unsqueeze(-1)
			
--- a/fish_speech/models/vits_decoder/modules/mrte.py
+++ b/fish_speech/models/vits_decoder/modules/mrte.py
@@ -1,58 +0,0 @@
 
				-import torch
			
 
				-from torch import nn
			
 
				-from torch.nn.utils import remove_weight_norm, weight_norm
			
 
				-
			
 
				-from fish_speech.models.vits_decoder.modules.attentions import MultiHeadAttention
			
 
				-
			
 
				-
			
 
				-class MRTE(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        content_enc_channels=192,
			
 
				-        hidden_size=512,
			
 
				-        out_channels=192,
			
 
				-        n_heads=4,
			
 
				-    ):
			
 
				-        super(MRTE, self).__init__()
			
 
				-        self.cross_attention = MultiHeadAttention(hidden_size, hidden_size, n_heads)
			
 
				-        self.c_pre = nn.Conv1d(content_enc_channels, hidden_size, 1)
			
 
				-        self.text_pre = nn.Conv1d(content_enc_channels, hidden_size, 1)
			
 
				-        self.c_post = nn.Conv1d(hidden_size, out_channels, 1)
			
 
				-
			
 
				-    def forward(self, ssl_enc, ssl_mask, text, text_mask, ge, test=None):
			
 
				-        if ge == None:
			
 
				-            ge = 0
			
 
				-        attn_mask = text_mask.unsqueeze(2) * ssl_mask.unsqueeze(-1)
			
 
				-
			
 
				-        ssl_enc = self.c_pre(ssl_enc * ssl_mask)
			
 
				-        text_enc = self.text_pre(text * text_mask)
			
 
				-        if test != None:
			
 
				-            if test == 0:
			
 
				-                x = (
			
 
				-                    self.cross_attention(
			
 
				-                        ssl_enc * ssl_mask, text_enc * text_mask, attn_mask
			
 
				-                    )
			
 
				-                    + ssl_enc
			
 
				-                    + ge
			
 
				-                )
			
 
				-            elif test == 1:
			
 
				-                x = ssl_enc + ge
			
 
				-            elif test == 2:
			
 
				-                x = (
			
 
				-                    self.cross_attention(
			
 
				-                        ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask
			
 
				-                    )
			
 
				-                    + ge
			
 
				-                )
			
 
				-            else:
			
 
				-                raise ValueError("test should be 0,1,2")
			
 
				-        else:
			
 
				-            x = (
			
 
				-                self.cross_attention(
			
 
				-                    ssl_enc * ssl_mask, text_enc * text_mask, attn_mask
			
 
				-                )
			
 
				-                + ssl_enc
			
 
				-                + ge
			
 
				-            )
			
 
				-        x = self.c_post(x * ssl_mask)
			
 
				-        return x
			
--- a/fish_speech/models/vits_decoder/modules/vq_encoder.py
+++ b/fish_speech/models/vits_decoder/modules/vq_encoder.py
@@ -1,101 +0,0 @@
 
				-import math
			
 
				-
			
 
				-import torch
			
 
				-from torch import nn
			
 
				-
			
 
				-from fish_speech.models.vqgan.modules.fsq import DownsampleFiniteScalarQuantize
			
 
				-from fish_speech.models.vqgan.modules.wavenet import WaveNet
			
 
				-from fish_speech.models.vqgan.utils import sequence_mask
			
 
				-from fish_speech.utils.spectrogram import LogMelSpectrogram
			
 
				-
			
 
				-
			
 
				-class VQEncoder(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-
			
 
				-        self.encoder = WaveNet(
			
 
				-            input_channels=128,
			
 
				-            residual_channels=768,
			
 
				-            residual_layers=20,
			
 
				-            dilation_cycle=4,
			
 
				-        )
			
 
				-
			
 
				-        self.quantizer = DownsampleFiniteScalarQuantize(
			
 
				-            input_dim=768, n_codebooks=1, n_groups=2, levels=[8, 5, 5, 5]
			
 
				-        )
			
 
				-
			
 
				-        self.spec = LogMelSpectrogram(
			
 
				-            sample_rate=44100,
			
 
				-            n_fft=2048,
			
 
				-            win_length=2048,
			
 
				-            hop_length=512,
			
 
				-            n_mels=128,
			
 
				-            f_min=0.0,
			
 
				-            f_max=8000.0,
			
 
				-        )
			
 
				-
			
 
				-        self.eval()
			
 
				-        e = self.load_state_dict(
			
 
				-            torch.load("checkpoints/vq-gan-group-fsq-2x1024.pth", map_location="cpu"),
			
 
				-            strict=False,
			
 
				-        )
			
 
				-
			
 
				-        assert len(e.missing_keys) == 0, e.missing_keys
			
 
				-        assert all(
			
 
				-            k.startswith("decoder.")
			
 
				-            or k.startswith("quality_projection.")
			
 
				-            or k.startswith("discriminator.")
			
 
				-            for k in e.unexpected_keys
			
 
				-        ), e.unexpected_keys
			
 
				-
			
 
				-    @torch.no_grad()
			
 
				-    def forward(self, audios, audio_lengths, sr=None):
			
 
				-        mel_spec = self.spec(audios, sample_rate=sr)
			
 
				-
			
 
				-        if sr is not None:
			
 
				-            audio_lengths = audio_lengths * 44100 // sr
			
 
				-
			
 
				-        mel_lengths = audio_lengths // self.spec.hop_length
			
 
				-        mel_masks = (
			
 
				-            torch.arange(mel_spec.shape[2], device=mel_spec.device)
			
 
				-            < mel_lengths[:, None]
			
 
				-        )
			
 
				-        mel_masks_float_conv = mel_masks[:, None, :].float()
			
 
				-        mels = mel_spec * mel_masks_float_conv
			
 
				-
			
 
				-        # Encode
			
 
				-        encoded_features = self.encoder(mels) * mel_masks_float_conv
			
 
				-        encoded_features = self.quantizer(encoded_features).z * mel_masks_float_conv
			
 
				-
			
 
				-        return encoded_features
			
 
				-
			
 
				-    @torch.no_grad()
			
 
				-    def indicies_to_vq_features(
			
 
				-        self,
			
 
				-        indices,
			
 
				-        feature_lengths,
			
 
				-    ):
			
 
				-        factor = math.prod(self.quantizer.downsample_factor)
			
 
				-        mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor)
			
 
				-        mel_masks_float_conv = mel_masks[:, None, :].float()
			
 
				-        z = self.quantizer.decode(indices) * mel_masks_float_conv
			
 
				-
			
 
				-        return z
			
 
				-
			
 
				-    @torch.no_grad()
			
 
				-    def encode(self, audios, audio_lengths, sr=None):
			
 
				-        audios = audios.float()
			
 
				-
			
 
				-        mels = self.spec(audios, sample_rate=sr)
			
 
				-        mel_lengths = audio_lengths // self.spec.hop_length
			
 
				-        mel_masks = sequence_mask(mel_lengths, mels.shape[2])
			
 
				-        mel_masks_float_conv = mel_masks[:, None, :].float()
			
 
				-        mels = mels * mel_masks_float_conv
			
 
				-
			
 
				-        # Encode
			
 
				-        encoded_features = self.encoder(mels) * mel_masks_float_conv
			
 
				-        feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor)
			
 
				-
			
 
				-        return self.quantizer.encode(encoded_features), feature_lengths
			
--- a/fish_speech/models/vqgan/modules/firefly.py
+++ b/fish_speech/models/vqgan/modules/firefly.py
@@ -1,5 +1,6 @@
 
				 # A inference only version of the FireflyGAN model
			
 
				 
			
 
				+import math
			
 
				 from functools import partial
			
 
				 from math import prod
			
 
				 from typing import Callable
			
@@ -13,6 +14,8 @@ from torch.nn.utils.parametrizations import weight_norm
 
				 from torch.nn.utils.parametrize import remove_parametrizations
			
 
				 from torch.utils.checkpoint import checkpoint
			
 
				 
			
 
				+from fish_speech.models.vqgan.utils import sequence_mask
			
 
				+
			
 
				 
			
 
				 def init_weights(m, mean=0.0, std=0.01):
			
 
				     classname = m.__class__.__name__
			
@@ -474,6 +477,89 @@ class ConvNeXtEncoder(nn.Module):
 
				         return self.norm(x)
			
 
				 
			
 
				 
			
 
				+class FireflyArchitecture(nn.Module):
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        backbone: nn.Module,
			
 
				+        head: nn.Module,
			
 
				+        quantizer: nn.Module,
			
 
				+        spec_transform: nn.Module,
			
 
				+    ):
			
 
				+        super().__init__()
			
 
				+
			
 
				+        self.backbone = backbone
			
 
				+        self.head = head
			
 
				+        self.quantizer = quantizer
			
 
				+        self.spec_transform = spec_transform
			
 
				+
			
 
				+    def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor:
			
 
				+        if self.spec_transform is not None:
			
 
				+            x = self.spec_transform(x)
			
 
				+
			
 
				+        x = self.backbone(x)
			
 
				+        if mask is not None:
			
 
				+            x = x * mask
			
 
				+
			
 
				+        if self.quantizer is not None:
			
 
				+            vq_result = self.quantizer(x)
			
 
				+            x = vq_result.z
			
 
				+
			
 
				+            if mask is not None:
			
 
				+                x = x * mask
			
 
				+
			
 
				+        x = self.head(x, template=template)
			
 
				+
			
 
				+        if x.ndim == 2:
			
 
				+            x = x[:, None, :]
			
 
				+
			
 
				+        if self.vq is not None:
			
 
				+            return x, vq_result
			
 
				+
			
 
				+        return x
			
 
				+
			
 
				+    def encode(self, audios, audio_lengths):
			
 
				+        audios = audios.float()
			
 
				+
			
 
				+        mels = self.spec_transform(audios)
			
 
				+        mel_lengths = audio_lengths // self.spec_transform.hop_length
			
 
				+        mel_masks = sequence_mask(mel_lengths, mels.shape[2])
			
 
				+        mel_masks_float_conv = mel_masks[:, None, :].float()
			
 
				+        mels = mels * mel_masks_float_conv
			
 
				+
			
 
				+        # Encode
			
 
				+        encoded_features = self.backbone(mels) * mel_masks_float_conv
			
 
				+        feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor)
			
 
				+
			
 
				+        return self.quantizer.encode(encoded_features), feature_lengths
			
 
				+
			
 
				+    def decode(self, indices, feature_lengths) -> torch.Tensor:
			
 
				+        factor = math.prod(self.quantizer.downsample_factor)
			
 
				+        mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor)
			
 
				+        mel_masks_float_conv = mel_masks[:, None, :].float()
			
 
				+
			
 
				+        audio_masks = sequence_mask(
			
 
				+            feature_lengths * factor * self.spec_transform.hop_length,
			
 
				+            indices.shape[2] * factor * self.spec_transform.hop_length,
			
 
				+        )
			
 
				+        audio_masks_float_conv = audio_masks[:, None, :].float()
			
 
				+
			
 
				+        z = self.quantizer.decode(indices) * mel_masks_float_conv
			
 
				+        x = self.head(z) * audio_masks_float_conv
			
 
				+
			
 
				+        return x
			
 
				+
			
 
				+    def remove_parametrizations(self):
			
 
				+        if hasattr(self.backbone, "remove_parametrizations"):
			
 
				+            self.backbone.remove_parametrizations()
			
 
				+
			
 
				+        if hasattr(self.head, "remove_parametrizations"):
			
 
				+            self.head.remove_parametrizations()
			
 
				+
			
 
				+    @property
			
 
				+    def device(self):
			
 
				+        return next(self.parameters()).device
			
 
				+
			
 
				+
			
 
				 class FireflyBase(nn.Module):
			
 
				     def __init__(self, ckpt_path: str = None, pretrained: bool = True):
			
 
				         super().__init__()
			
--- a/fish_speech/models/vqgan/modules/fsq.py
+++ b/fish_speech/models/vqgan/modules/fsq.py
@@ -20,7 +20,7 @@ class DownsampleFiniteScalarQuantize(nn.Module):
 
				     def __init__(
			
 
				         self,
			
 
				         input_dim: int = 512,
			
 
				-        n_codebooks: int = 9,
			
 
				+        n_codebooks: int = 1,
			
 
				         n_groups: int = 1,
			
 
				         levels: tuple[int] = (8, 5, 5, 5),  # Approximate 2**10
			
 
				         downsample_factor: tuple[int] = (2, 2),
			
--- a/fish_speech/webui/manage.py
+++ b/fish_speech/webui/manage.py
@@ -26,7 +26,7 @@ from fish_speech.i18n import i18n
 
				 from fish_speech.webui.launch_utils import Seafoam, is_module_installed, versions_html
			
 
				 
			
 
				 config_path = cur_work_dir / "fish_speech" / "configs"
			
 
				-vqgan_yml_path = config_path / "vqgan_finetune.yaml"
			
 
				+vqgan_yml_path = config_path / "firefly_gan_vq.yaml"
			
 
				 llama_yml_path = config_path / "text2semantic_finetune.yaml"
			
 
				 vits_yml_path = config_path / "vits_decoder_finetune.yaml"
			
 
				 
			
@@ -137,7 +137,7 @@ def change_decoder_config(decoder_model_path):
 
				         choices = ["vits_decoder_finetune", "vits_decoder_pretrain"]
			
 
				         return gr.Dropdown(choices=choices, value=choices[0])
			
 
				     elif "vqgan" in decoder_model_path or "vq-gan" in decoder_model_path:
			
 
				-        choices = ["vqgan_finetune", "vqgan_pretrain"]
			
 
				+        choices = ["firefly_gan_vq", "firefly_gan_vq"]
			
 
				         return gr.Dropdown(choices=choices, value=choices[0])
			
 
				     else:
			
 
				         raise ValueError("Invalid decoder name")
			
@@ -517,7 +517,7 @@ def train_process(
 
				             PYTHON,
			
 
				             "fish_speech/train.py",
			
 
				             "--config-name",
			
 
				-            "vqgan_finetune",
			
 
				+            "firefly_gan_vq",
			
 
				             f"project={project}",
			
 
				             f"trainer.strategy.process_group_backend={backend}",
			
 
				             f"model.optimizer.lr={vqgan_lr}",
			
@@ -590,9 +590,9 @@ def train_process(
 
				                 "--batch-size",
			
 
				                 "16",
			
 
				                 "--config-name",
			
 
				-                "vqgan_pretrain",
			
 
				+                "firefly_gan_vq",
			
 
				                 "--checkpoint-path",
			
 
				-                "checkpoints/vq-gan-group-fsq-2x1024.pth",
			
 
				+                "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
			
 
				             ]
			
 
				         )
			
 
				 
			
@@ -1292,8 +1292,8 @@ with gr.Blocks(
 
				                                     choices=[
			
 
				                                         "vits_decoder_finetune",
			
 
				                                         "vits_decoder_pretrain",
			
 
				-                                        "vqgan_finetune",
			
 
				-                                        "vqgan_pretrain",
			
 
				+                                        "firefly_gan_vq",
			
 
				+                                        "firefly_gan_vq",
			
 
				                                     ],
			
 
				                                     allow_custom_value=True,
			
 
				                                 )
			
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,7 +35,8 @@ dependencies = [
 
				     "vector_quantize_pytorch>=1.14.24",
			
 
				     "samplerate>=0.2.1",
			
 
				     "resampy>=0.4.3",
			
 
				-    "einx[torch]==0.2.2"
			
 
				+    "einx[torch]==0.2.2",
			
 
				+    "zstandard>=0.22.0"
			
 
				 ]
			
 
				 
			
 
				 [project.optional-dependencies]
			
--- a/run.py
+++ b/run.py
@@ -0,0 +1,112 @@
 
				+import audioop
			
 
				+import base64
			
 
				+
			
 
				+import numpy as np
			
 
				+import soundfile as sf
			
 
				+from fastapi import FastAPI, WebSocket
			
 
				+from fastapi.responses import Response
			
 
				+from loguru import logger
			
 
				+
			
 
				+from stream_service import FishAgentPipeline
			
 
				+
			
 
				+app = FastAPI()
			
 
				+
			
 
				+
			
 
				+@app.post("/incoming")
			
 
				+async def handle_incoming():
			
 
				+    xml = """<Response>
			
 
				+    <Connect>
			
 
				+    <Stream url="wss://2427-24-4-31-213.ngrok-free.app/connection" />
			
 
				+    </Connect>
			
 
				+</Response>"""
			
 
				+
			
 
				+    logger.info("Incoming call received")
			
 
				+    return Response(media_type="text/xml", content=xml)
			
 
				+
			
 
				+
			
 
				+async def send_audio(ws, audio, stream_sid=""):
			
 
				+    await ws.send_json(
			
 
				+        {
			
 
				+            "streamSid": stream_sid,
			
 
				+            "event": "media",
			
 
				+            "media": {
			
 
				+                "payload": audio,
			
 
				+            },
			
 
				+        }
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+def decode_mu_law(data):
			
 
				+    samples = audioop.ulaw2lin(data, 2)
			
 
				+    samples = np.frombuffer(samples, dtype=np.int16)
			
 
				+    samples = samples.astype(np.float32) / 32768.0
			
 
				+
			
 
				+    return samples
			
 
				+
			
 
				+
			
 
				+def encode_mu_law(data):
			
 
				+    samples = np.clip(data, -1.0, 1.0)
			
 
				+    samples = (samples * 32768).astype(np.int16)
			
 
				+    samples = audioop.lin2ulaw(samples.tobytes(), 2)
			
 
				+
			
 
				+    return samples
			
 
				+
			
 
				+
			
 
				+is_working = False
			
 
				+
			
 
				+
			
 
				+@app.websocket("/connection")
			
 
				+async def handle_connection(websocket: WebSocket):
			
 
				+    global is_working
			
 
				+
			
 
				+    await websocket.accept()
			
 
				+    logger.info("Connection established")
			
 
				+    stream_sid = None
			
 
				+    call_sid = None
			
 
				+
			
 
				+    if is_working:
			
 
				+        logger.info("Already working, closing connection")
			
 
				+        await websocket.close()
			
 
				+        return
			
 
				+
			
 
				+    is_working = True
			
 
				+    pipe.reset()
			
 
				+
			
 
				+    while True:
			
 
				+        data = await websocket.receive_json()
			
 
				+        if data["event"] == "connected":
			
 
				+            logger.info("Connected message received")
			
 
				+        elif data["event"] == "start":
			
 
				+            stream_sid = data["start"]["streamSid"]
			
 
				+            call_sid = data["start"]["callSid"]
			
 
				+            logger.info(f"Start media streaming: {stream_sid} - {call_sid}")
			
 
				+        elif data["event"] == "media":
			
 
				+            payload = data["media"]["payload"]
			
 
				+            chunk = base64.b64decode(payload)
			
 
				+            samples = decode_mu_law(chunk)
			
 
				+            for i in pipe.add_chunk(samples, sr=8000):
			
 
				+                await send_audio(
			
 
				+                    websocket, base64.b64encode(encode_mu_law(i)).decode(), stream_sid
			
 
				+                )
			
 
				+        elif data["event"] == "closed":
			
 
				+            logger.info("Connection closed")
			
 
				+            await websocket.close()
			
 
				+            break
			
 
				+        elif data["event"] == "stop":
			
 
				+            logger.info("Stop media streaming")
			
 
				+            await websocket.close()
			
 
				+            break
			
 
				+        else:
			
 
				+            logger.info(f"Unknown event: {data}")
			
 
				+
			
 
				+    is_working = False
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    import uvicorn
			
 
				+
			
 
				+    pipe = FishAgentPipeline()
			
 
				+    pipe.warmup()
			
 
				+
			
 
				+    logger.info("Starting server")
			
 
				+    uvicorn.run(app, host="localhost", port=5000)
			
--- a/stream_service.py
+++ b/stream_service.py
@@ -0,0 +1,412 @@
 
				+import time
			
 
				+
			
 
				+import librosa
			
 
				+import numpy as np
			
 
				+import torch
			
 
				+import torchaudio
			
 
				+from loguru import logger
			
 
				+from torchaudio import functional as AF
			
 
				+from transformers import (
			
 
				+    AutoModelForSpeechSeq2Seq,
			
 
				+    AutoProcessor,
			
 
				+    AutoTokenizer,
			
 
				+    pipeline,
			
 
				+)
			
 
				+
			
 
				+from fish_speech.conversation import (
			
 
				+    CODEBOOK_EOS_TOKEN_ID,
			
 
				+    Conversation,
			
 
				+    Message,
			
 
				+    TokensPart,
			
 
				+    encode_conversation,
			
 
				+)
			
 
				+from fish_speech.models.text2semantic.llama import DualARTransformer
			
 
				+from tools.api import decode_vq_tokens, encode_reference
			
 
				+from tools.llama.generate_test import convert_string
			
 
				+from tools.llama.generate_test import generate as llama_generate
			
 
				+from tools.llama.generate_test import load_model as load_llama_model
			
 
				+from tools.vqgan.inference import load_model as load_decoder_model
			
 
				+
			
 
				+
			
 
				+class FishStreamVAD:
			
 
				+    def __init__(self) -> None:
			
 
				+        # Args
			
 
				+        self.sample_rate = 16000
			
 
				+        self.threshold = 0.5
			
 
				+        self.neg_threshold = self.threshold - 0.15
			
 
				+        self.min_speech_duration_ms = 100
			
 
				+        self.min_silence_ms = 500
			
 
				+        self.speech_pad_ms = 30
			
 
				+        self.chunk_size = 512
			
 
				+
			
 
				+        # Convert to samples
			
 
				+        self.min_speech_duration_samples = (
			
 
				+            self.min_speech_duration_ms * self.sample_rate // 1000
			
 
				+        )
			
 
				+        self.min_silence_samples = self.min_silence_ms * self.sample_rate // 1000
			
 
				+        self.speech_pad_samples = self.speech_pad_ms * self.sample_rate // 1000
			
 
				+
			
 
				+        # Core buffers
			
 
				+        self.reset()
			
 
				+
			
 
				+        # Load models
			
 
				+        logger.info("Loading VAD model")
			
 
				+        vad_model, vad_utils = torch.hub.load(
			
 
				+            repo_or_dir="snakers4/silero-vad",
			
 
				+            model="silero_vad",
			
 
				+            force_reload=True,
			
 
				+            onnx=True,
			
 
				+        )
			
 
				+
			
 
				+        self.vad_model = vad_model
			
 
				+        self.get_speech_timestamps = vad_utils[0]
			
 
				+        logger.info("VAD model loaded")
			
 
				+
			
 
				+    def reset(self):
			
 
				+        self.audio_chunks = None
			
 
				+        self.vad_pointer = 0
			
 
				+        self.speech_probs = []
			
 
				+
			
 
				+        self.triggered = False
			
 
				+        self.start = self.end = self.temp_end = 0
			
 
				+        self.last_seen_end = 0
			
 
				+        self.speech_segments = []
			
 
				+
			
 
				+    def add_chunk(self, chunk, sr=None):
			
 
				+        """
			
 
				+        Add a chunk to the buffer
			
 
				+        """
			
 
				+
			
 
				+        if isinstance(chunk, np.ndarray):
			
 
				+            chunk = torch.from_numpy(chunk)
			
 
				+
			
 
				+        if sr is not None and sr != self.sample_rate:
			
 
				+            chunk = AF.resample(chunk, sr, self.sample_rate)
			
 
				+
			
 
				+        # self.audio_chunks.append(chunk)
			
 
				+        if self.audio_chunks is None:
			
 
				+            self.audio_chunks = chunk
			
 
				+        else:
			
 
				+            self.audio_chunks = torch.cat([self.audio_chunks, chunk])
			
 
				+
			
 
				+        # Trigger VAD
			
 
				+        yield from self.detect_speech()
			
 
				+
			
 
				+    def detect_speech(self):
			
 
				+        """
			
 
				+        Run the VAD model on the current buffer
			
 
				+        """
			
 
				+
			
 
				+        speech_prob_start_idx = len(self.speech_probs)
			
 
				+        while len(self.audio_chunks) - self.vad_pointer >= self.chunk_size:
			
 
				+            chunk = self.audio_chunks[
			
 
				+                self.vad_pointer : self.vad_pointer + self.chunk_size
			
 
				+            ]
			
 
				+            speech_prob = self.vad_model(chunk, self.sample_rate)
			
 
				+            self.speech_probs.append(speech_prob)
			
 
				+            self.vad_pointer += self.chunk_size
			
 
				+
			
 
				+        # Process speech probs
			
 
				+        for i in range(speech_prob_start_idx, len(self.speech_probs)):
			
 
				+            speech_prob = self.speech_probs[i]
			
 
				+
			
 
				+            if speech_prob >= self.threshold and self.temp_end:
			
 
				+                self.temp_end = 0
			
 
				+
			
 
				+            if speech_prob >= self.threshold and self.triggered is False:
			
 
				+                self.triggered = True
			
 
				+                self.start = i * self.chunk_size
			
 
				+                continue
			
 
				+
			
 
				+            if speech_prob < self.neg_threshold and self.triggered is True:
			
 
				+                if self.temp_end == 0:
			
 
				+                    self.temp_end = i * self.chunk_size
			
 
				+
			
 
				+                if i * self.chunk_size - self.temp_end < self.min_silence_samples:
			
 
				+                    continue
			
 
				+
			
 
				+                self.end = self.temp_end
			
 
				+                if self.end - self.start > self.min_speech_duration_samples:
			
 
				+                    yield self.audio_chunks[
			
 
				+                        self.start : self.end + self.speech_pad_samples
			
 
				+                    ]
			
 
				+
			
 
				+                self.triggered = False
			
 
				+                self.start = self.end = self.temp_end = 0
			
 
				+
			
 
				+
			
 
				+class FishASR:
			
 
				+    def __init__(self) -> None:
			
 
				+        self.audio_chunks = None
			
 
				+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
			
 
				+        torch_dtype = torch.bfloat16
			
 
				+        model_id = "openai/whisper-medium.en"
			
 
				+
			
 
				+        logger.info("Loading ASR model")
			
 
				+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
			
 
				+            model_id, torch_dtype=torch_dtype, use_safetensors=True
			
 
				+        ).to(self.device)
			
 
				+        processor = AutoProcessor.from_pretrained(model_id)
			
 
				+        self.pipe = pipeline(
			
 
				+            "automatic-speech-recognition",
			
 
				+            model=model,
			
 
				+            tokenizer=processor.tokenizer,
			
 
				+            feature_extractor=processor.feature_extractor,
			
 
				+            max_new_tokens=256,
			
 
				+            torch_dtype=torch_dtype,
			
 
				+            device=self.device,
			
 
				+        )
			
 
				+        logger.info("ASR model loaded")
			
 
				+
			
 
				+    @torch.inference_mode()
			
 
				+    def run(self, chunk):
			
 
				+        return self.pipe(chunk.numpy())
			
 
				+
			
 
				+
			
 
				+class FishE2EAgent:
			
 
				+    def __init__(self) -> None:
			
 
				+        self.device = device = "cuda" if torch.cuda.is_available() else "cpu"
			
 
				+        logger.info(f"Using device: {device}")
			
 
				+
			
 
				+        decoder_model = load_decoder_model(
			
 
				+            config_name="firefly_gan_vq",
			
 
				+            checkpoint_path="checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
			
 
				+            device=device,
			
 
				+        )
			
 
				+        self.decoder_model = decoder_model
			
 
				+        logger.info("Decoder model loaded")
			
 
				+
			
 
				+        llama_model, decode_one_token = load_llama_model(
			
 
				+            config_name="dual_ar_2_codebook_1.3b",
			
 
				+            checkpoint_path="checkpoints/step_000206000.ckpt",
			
 
				+            device=device,
			
 
				+            precision=torch.bfloat16,
			
 
				+            max_length=2048,
			
 
				+            compile=True,
			
 
				+        )
			
 
				+        self.llama_model: DualARTransformer = llama_model
			
 
				+        self.decode_one_token = decode_one_token
			
 
				+        logger.info("LLAMA model loaded")
			
 
				+
			
 
				+        self.tokenizer = AutoTokenizer.from_pretrained(
			
 
				+            "checkpoints/fish-speech-agent-1"
			
 
				+        )
			
 
				+        self.semantic_id = self.tokenizer.convert_tokens_to_ids("<|semantic|>")
			
 
				+        self.im_end_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
			
 
				+        self.decoder_tokenizer = AutoTokenizer.from_pretrained(
			
 
				+            "fishaudio/fish-speech-1"
			
 
				+        )
			
 
				+
			
 
				+        # Control params
			
 
				+        self.temperature = torch.tensor(0.7, device=device, dtype=torch.float)
			
 
				+        self.top_p = torch.tensor(0.7, device=device, dtype=torch.float)
			
 
				+        self.repetition_penalty = torch.tensor(1.2, device=device, dtype=torch.float)
			
 
				+
			
 
				+        # This is used to control the timbre of the generated audio
			
 
				+        self.base_messages = [
			
 
				+            # Message(
			
 
				+            #     role="user",
			
 
				+            #     parts=[np.load("example/q0.npy")],
			
 
				+            # ),
			
 
				+            # Message(
			
 
				+            #     role="assistant",
			
 
				+            #     parts=[
			
 
				+            #         "Transcribed: Hi, can you briefly describe what is machine learning?\nResponse: Sure! Machine learning is the process of automating tasks that humans are capable of doing with a computer. It involves training computers to make decisions based on data.",
			
 
				+            #         np.load("example/a0.npy"),
			
 
				+            #     ],
			
 
				+            # ),
			
 
				+        ]
			
 
				+        self.reference = encode_reference(
			
 
				+            decoder_model=self.decoder_model,
			
 
				+            reference_audio="example/a0.wav",
			
 
				+            enable_reference_audio=True,
			
 
				+        )
			
 
				+        self.messages = self.base_messages.copy()
			
 
				+
			
 
				+    def reset(self):
			
 
				+        self.messages = self.base_messages.copy()
			
 
				+
			
 
				+    @torch.inference_mode()
			
 
				+    def vq_encode(self, audios, sr=None):
			
 
				+        if isinstance(audios, np.ndarray):
			
 
				+            audios = torch.from_numpy(audios)
			
 
				+
			
 
				+        if audios.ndim == 1:
			
 
				+            audios = audios[None, None, :]
			
 
				+
			
 
				+        audios = audios.to(self.decoder_model.device)
			
 
				+        if sr is not None and sr != self.decoder_model.sampling_rate:
			
 
				+            audios = AF.resample(audios, sr, self.decoder_model.sampling_rate)
			
 
				+
			
 
				+        audio_lengths = torch.tensor(
			
 
				+            [audios.shape[2]], device=self.decoder_model.device, dtype=torch.long
			
 
				+        )
			
 
				+
			
 
				+        return self.decoder_model.encode(audios, audio_lengths)[0][0]
			
 
				+
			
 
				+    @torch.inference_mode()
			
 
				+    def generate(self, audio_chunk, sr=None, text=None):
			
 
				+        vq_output = self.vq_encode(audio_chunk, sr)
			
 
				+        logger.info(f"VQ output: {vq_output.shape}")
			
 
				+
			
 
				+        # Encode conversation
			
 
				+        self.messages.append(
			
 
				+            Message(
			
 
				+                role="user",
			
 
				+                parts=[vq_output],
			
 
				+            )
			
 
				+        )
			
 
				+
			
 
				+        parts = []
			
 
				+        if text is not None:
			
 
				+            parts.append(f"Transcribed: {text}\nResponse:")
			
 
				+
			
 
				+        self.messages.append(
			
 
				+            Message(
			
 
				+                role="assistant",
			
 
				+                parts=parts,
			
 
				+            )
			
 
				+        )
			
 
				+        conversation = Conversation(self.messages)
			
 
				+
			
 
				+        # Encode the conversation
			
 
				+        prompt, _ = encode_conversation(
			
 
				+            conversation, self.tokenizer, self.llama_model.config.num_codebooks
			
 
				+        )
			
 
				+        prompt = prompt[:, :-1].to(dtype=torch.int, device=self.device)
			
 
				+        prompt_length = prompt.shape[1]
			
 
				+
			
 
				+        # Generate
			
 
				+        y = llama_generate(
			
 
				+            model=self.llama_model,
			
 
				+            prompt=prompt,
			
 
				+            max_new_tokens=0,
			
 
				+            eos_token_id=self.tokenizer.eos_token_id,
			
 
				+            im_end_id=self.im_end_id,
			
 
				+            decode_one_token=self.decode_one_token,
			
 
				+            temperature=self.temperature,
			
 
				+            top_p=self.top_p,
			
 
				+            repetition_penalty=self.repetition_penalty,
			
 
				+        )
			
 
				+
			
 
				+        tokens = self.tokenizer.decode(
			
 
				+            y[0, prompt_length:].tolist(), skip_special_tokens=False
			
 
				+        )
			
 
				+        logger.info(f"Generated: {convert_string(tokens)}")
			
 
				+
			
 
				+        # Put the generated tokens
			
 
				+        # since there is <im_end> and <eos> tokens, we remove last 2 tokens
			
 
				+        code_mask = y[0, prompt_length:-2] == self.semantic_id
			
 
				+        codes = y[1:, prompt_length:-2][:, code_mask].clone()
			
 
				+
			
 
				+        codes = codes - 2
			
 
				+        assert (codes >= 0).all(), f"Negative code found"
			
 
				+
			
 
				+        decoded = y[:, prompt_length:-1].clone()
			
 
				+        if decoded[0, -1] != self.im_end_id:  # <im_end>
			
 
				+            val = [[self.im_end_id]] + [[CODEBOOK_EOS_TOKEN_ID]] * (decoded.size(0) - 1)
			
 
				+            decoded = torch.cat(
			
 
				+                (decoded, torch.tensor(val, device=self.device, dtype=torch.int)), dim=1
			
 
				+            )
			
 
				+
			
 
				+        decoded = decoded.cpu()
			
 
				+        self.messages[-1].parts.append(
			
 
				+            TokensPart(
			
 
				+                tokens=decoded[:1],
			
 
				+                codes=decoded[1:],
			
 
				+            )
			
 
				+        )
			
 
				+
			
 
				+        # Less than 5 * 20 = 100ms
			
 
				+        if codes.shape[1] <= 5:
			
 
				+            return
			
 
				+
			
 
				+        # Generate audio
			
 
				+        main_tokens = decoded[0]
			
 
				+        text_tokens = main_tokens[main_tokens != self.semantic_id]
			
 
				+        text = self.tokenizer.decode(text_tokens.tolist(), skip_special_tokens=True)
			
 
				+        text_tokens = self.decoder_tokenizer.encode(text, return_tensors="pt").to(
			
 
				+            self.device
			
 
				+        )
			
 
				+
			
 
				+        audio = decode_vq_tokens(
			
 
				+            decoder_model=self.decoder_model,
			
 
				+            codes=codes,
			
 
				+            text_tokens=text_tokens,
			
 
				+            reference_embedding=self.reference,
			
 
				+        )
			
 
				+
			
 
				+        if sr is not None and sr != self.decoder_model.sampling_rate:
			
 
				+            audio = AF.resample(audio, self.decoder_model.sampling_rate, sr)
			
 
				+
			
 
				+        return audio.float()
			
 
				+
			
 
				+
			
 
				+class FishAgentPipeline:
			
 
				+    def __init__(self) -> None:
			
 
				+        self.vad = FishStreamVAD()
			
 
				+        # Currently use ASR model as intermediate
			
 
				+        self.asr = FishASR()
			
 
				+        self.agent = FishE2EAgent()
			
 
				+
			
 
				+        self.vad_segments = []
			
 
				+        self.text_segments = []
			
 
				+
			
 
				+    def add_chunk(self, chunk, sr=None):
			
 
				+        use_np = isinstance(chunk, np.ndarray)
			
 
				+        if use_np:
			
 
				+            chunk = torch.from_numpy(chunk)
			
 
				+
			
 
				+        if sr is not None and sr != 16000:
			
 
				+            chunk = AF.resample(chunk, sr, 16000)
			
 
				+
			
 
				+        for vad_audio in self.vad.add_chunk(chunk, 16000):
			
 
				+            self.vad_segments.append(vad_audio)
			
 
				+            asr_text = self.asr.run(vad_audio)
			
 
				+            self.text_segments.append(asr_text)
			
 
				+            logger.info(f"ASR: {asr_text}")
			
 
				+
			
 
				+            # Actually should detect if intent is finished here
			
 
				+            result = self.agent.generate(vad_audio, 16000, text=asr_text)
			
 
				+            if result is None:
			
 
				+                continue
			
 
				+
			
 
				+            if sr is not None and sr != 16000:
			
 
				+                result = AF.resample(result, 16000, sr)
			
 
				+
			
 
				+            if use_np:
			
 
				+                result = result.cpu().numpy()
			
 
				+
			
 
				+            yield result
			
 
				+
			
 
				+    def reset(self):
			
 
				+        self.vad.reset()
			
 
				+        self.agent.reset()
			
 
				+        self.vad_segments = []
			
 
				+        self.text_segments = []
			
 
				+
			
 
				+    def warmup(self):
			
 
				+        logger.info("Warming up the pipeline")
			
 
				+        audio, sr = librosa.load("example/q0.mp3", sr=16000)
			
 
				+        for i in range(0, len(audio), 882):
			
 
				+            for audio in self.add_chunk(audio[i : i + 882], sr):
			
 
				+                pass
			
 
				+        logger.info("Pipeline warmed up")
			
 
				+        self.reset()
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    import soundfile as sf
			
 
				+
			
 
				+    service = FishAgentPipeline()
			
 
				+    service.warmup()
			
 
				+    logger.info("Stream service started")
			
 
				+
			
 
				+    audio, sr = librosa.load("example/q1.mp3", sr=16000)
			
 
				+    seg = []
			
 
				+    for i in range(0, len(audio), 882):
			
 
				+        for audio in service.add_chunk(audio[i : i + 882], sr):
			
 
				+            seg.append(audio)
			
 
				+
			
 
				+    audio = np.concatenate(seg)
			
 
				+    sf.write("output.wav", audio, 16000)
			
--- a/test_echo.py
+++ b/test_echo.py
@@ -0,0 +1,183 @@
 
				+import io
			
 
				+import wave
			
 
				+from typing import List
			
 
				+
			
 
				+import av
			
 
				+import numpy as np
			
 
				+from fastapi import FastAPI, WebSocket, WebSocketDisconnect
			
 
				+from fastapi.responses import HTMLResponse
			
 
				+
			
 
				+app = FastAPI()
			
 
				+
			
 
				+html = """
			
 
				+<!DOCTYPE html>
			
 
				+<html>
			
 
				+<head>
			
 
				+    <title>Real-time Chat Room</title>
			
 
				+</head>
			
 
				+<body>
			
 
				+    <h1>Real-time Chat Room</h1>
			
 
				+    <button id="start">Start Streaming</button>
			
 
				+    <button id="stop">Stop Streaming</button>
			
 
				+    <script type="module">
			
 
				+        import { MediaRecorder, register } from 'https://dev.jspm.io/npm:extendable-media-recorder';
			
 
				+        import { connect } from 'https://dev.jspm.io/npm:extendable-media-recorder-wav-encoder';
			
 
				+    
			
 
				+        await register(await connect());
			
 
				+
			
 
				+        let socket;
			
 
				+        let mediaRecorder;
			
 
				+        let audioContext;
			
 
				+
			
 
				+        function startStreaming() {
			
 
				+            initWebSocket();
			
 
				+
			
 
				+            audioContext = new (window.AudioContext || window.webkitAudioContext)();
			
 
				+            navigator.mediaDevices.getUserMedia({ audio: {
			
 
				+                channelCount: 1,  
			
 
				+                sampleRate: 44100,
			
 
				+                sampleSize: 16,
			
 
				+                echoCancellation: true,
			
 
				+                noiseSuppression: true
			
 
				+            } })
			
 
				+                .then(function (stream) {
			
 
				+                    mediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm;codecs=opus' });
			
 
				+                    mediaRecorder.start(100);
			
 
				+                    mediaRecorder.addEventListener("dataavailable", function (event) {
			
 
				+                        socket.send(event.data);
			
 
				+                    });
			
 
				+                })
			
 
				+                .catch(function (err) {
			
 
				+                    console.error("Error accessing microphone:", err);
			
 
				+                });
			
 
				+
			
 
				+                // Create a MediaSource
			
 
				+                const mediaSource = new MediaSource();
			
 
				+                const mediaStream = new MediaStream();
			
 
				+
			
 
				+                // Create an HTMLVideoElement and attach the MediaSource to it
			
 
				+                const audioElement = document.createElement('audio');
			
 
				+                audioElement.src = URL.createObjectURL(mediaSource);
			
 
				+                audioElement.autoplay = true;
			
 
				+                document.body.appendChild(audioElement);
			
 
				+
			
 
				+                mediaSource.addEventListener('sourceopen', function() {
			
 
				+                    const sourceBuffer = mediaSource.addSourceBuffer('audio/webm; codecs=opus');
			
 
				+
			
 
				+                    socket.onmessage = function(event) {
			
 
				+                        const arrayBuffer = event.data;
			
 
				+
			
 
				+                        sourceBuffer.appendBuffer(arrayBuffer);
			
 
				+                    };
			
 
				+                });
			
 
				+        }
			
 
				+
			
 
				+        function stopStreaming() {
			
 
				+            mediaRecorder.stop();
			
 
				+        }
			
 
				+
			
 
				+        function initWebSocket() {
			
 
				+            const is_wss = window.location.protocol === "https:";
			
 
				+            socket = new WebSocket(`${is_wss ? "wss" : "ws"}://${window.location.host}/ws`);
			
 
				+            socket.binaryType = 'arraybuffer';
			
 
				+        }
			
 
				+
			
 
				+        document.getElementById("start").onclick = startStreaming;
			
 
				+        document.getElementById("stop").onclick = stopStreaming;
			
 
				+    </script>
			
 
				+</body>
			
 
				+</html>
			
 
				+"""
			
 
				+
			
 
				+
			
 
				+def encode_wav(data):
			
 
				+    sample_rate = 44100
			
 
				+    samples = np.frombuffer(data, dtype=np.int16)
			
 
				+    buffer = io.BytesIO()
			
 
				+
			
 
				+    with wave.open(buffer, "wb") as wav_file:
			
 
				+        wav_file.setnchannels(1)
			
 
				+        wav_file.setsampwidth(2)
			
 
				+        wav_file.setframerate(sample_rate)
			
 
				+        wav_file.writeframes(samples.tobytes())
			
 
				+
			
 
				+    return buffer.getvalue()
			
 
				+
			
 
				+
			
 
				+class ConnectionManager:
			
 
				+    def __init__(self):
			
 
				+        self.active_connections: List[WebSocket] = []
			
 
				+
			
 
				+    async def connect(self, websocket: WebSocket):
			
 
				+        await websocket.accept()
			
 
				+        self.active_connections.append(websocket)
			
 
				+
			
 
				+    def disconnect(self, websocket: WebSocket):
			
 
				+        self.active_connections.remove(websocket)
			
 
				+
			
 
				+    async def broadcast(self, message: bytes, sender: WebSocket):
			
 
				+        for connection in self.active_connections:
			
 
				+            if connection == sender:
			
 
				+                #     print("Sending message to client", connection)
			
 
				+                await connection.send_bytes(message)
			
 
				+
			
 
				+
			
 
				+manager = ConnectionManager()
			
 
				+
			
 
				+
			
 
				+@app.get("/")
			
 
				+async def get():
			
 
				+    return HTMLResponse(html)
			
 
				+
			
 
				+
			
 
				+@app.websocket("/ws")
			
 
				+async def websocket_endpoint(websocket: WebSocket):
			
 
				+    await manager.connect(websocket)
			
 
				+    try:
			
 
				+        buffer = io.BytesIO()
			
 
				+        container = None
			
 
				+        cur_pos = 0
			
 
				+        total_size = 0
			
 
				+
			
 
				+        while True:
			
 
				+            data = await websocket.receive_bytes()
			
 
				+            # data = encode_wav(data)
			
 
				+            # if len(data) == 1:
			
 
				+            #     print(f"len(data): {len(data)}, data: {data}")
			
 
				+            # if len(data) > 1:
			
 
				+            #     data = b'\x1a' + data
			
 
				+            #     with open("output.webm", "wb") as f:
			
 
				+            #         f.write(data)
			
 
				+            #     exit()
			
 
				+            # print(f"len(data): {len(data)}")
			
 
				+
			
 
				+            # print("Received data:", data)
			
 
				+            # Save as webm file and exit
			
 
				+            # with open("output.wav", "wb") as f:
			
 
				+            #     f.write(encode_wav(data))
			
 
				+
			
 
				+            buffer.write(data)
			
 
				+            buffer.seek(cur_pos)
			
 
				+            total_size += len(data)
			
 
				+
			
 
				+            if not container and total_size > 1000:
			
 
				+                container = av.open(buffer, "r", format="webm")
			
 
				+                print(container)
			
 
				+            elif container:
			
 
				+                for packet in container.decode(video=0):
			
 
				+                    if packet.size == 0:
			
 
				+                        continue
			
 
				+
			
 
				+                    cur_pos += packet.size
			
 
				+                    for frame in packet.decode():
			
 
				+                        print(frame.to_ndarray().shape)
			
 
				+
			
 
				+            await manager.broadcast(data, websocket)
			
 
				+    except WebSocketDisconnect:
			
 
				+        manager.disconnect(websocket)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    import uvicorn
			
 
				+
			
 
				+    uvicorn.run(app, host="0.0.0.0", port=8000)
			
--- a/tools/api.py
+++ b/tools/api.py
@@ -400,21 +400,17 @@ def parse_args():
 
				     parser.add_argument(
			
 
				         "--llama-checkpoint-path",
			
 
				         type=str,
			
 
				-        default="checkpoints/text2semantic-sft-medium-v1-4k.pth",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--llama-config-name", type=str, default="dual_ar_2_codebook_medium"
			
 
				+        default="checkpoints/fish-speech-1.2",
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         "--decoder-checkpoint-path",
			
 
				         type=str,
			
 
				-        default="checkpoints/vq-gan-group-fsq-2x1024.pth",
			
 
				+        default="checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
			
 
				     )
			
 
				-    parser.add_argument("--decoder-config-name", type=str, default="vqgan_pretrain")
			
 
				+    parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
			
 
				     parser.add_argument("--tokenizer", type=str, default="fishaudio/fish-speech-1")
			
 
				     parser.add_argument("--device", type=str, default="cuda")
			
 
				     parser.add_argument("--half", action="store_true")
			
 
				-    parser.add_argument("--max-length", type=int, default=2048)
			
 
				     parser.add_argument("--compile", action="store_true")
			
 
				     parser.add_argument("--max-text-length", type=int, default=0)
			
 
				     parser.add_argument("--listen", type=str, default="127.0.0.1:8000")
			
@@ -450,11 +446,9 @@ if __name__ == "__main__":
 
				 
			
 
				     logger.info("Loading Llama model...")
			
 
				     llama_queue = launch_thread_safe_queue(
			
 
				-        config_name=args.llama_config_name,
			
 
				         checkpoint_path=args.llama_checkpoint_path,
			
 
				         device=args.device,
			
 
				         precision=args.precision,
			
 
				-        max_length=args.max_length,
			
 
				         compile=args.compile,
			
 
				     )
			
 
				     llama_tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
			
--- a/tools/llama/convert_hf_weights_to_llama.py
+++ b/tools/llama/convert_hf_weights_to_llama.py
@@ -0,0 +1,101 @@
 
				+import torch
			
 
				+from transformers import LlamaForCausalLM
			
 
				+
			
 
				+from fish_speech.models.text2semantic.llama import BaseModelArgs, BaseTransformer
			
 
				+
			
 
				+# Load the HF model
			
 
				+hf_model = LlamaForCausalLM.from_pretrained(
			
 
				+    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
			
 
				+)
			
 
				+
			
 
				+model = BaseTransformer(
			
 
				+    BaseModelArgs(
			
 
				+        vocab_size=hf_model.config.vocab_size + 8,
			
 
				+        n_layer=hf_model.config.num_hidden_layers,
			
 
				+        n_head=hf_model.config.num_attention_heads,
			
 
				+        n_local_heads=hf_model.config.num_key_value_heads,
			
 
				+        dim=hf_model.config.hidden_size,
			
 
				+        head_dim=hf_model.config.hidden_size // hf_model.config.num_attention_heads,
			
 
				+        num_codebooks=2,
			
 
				+        codebook_size=1032,
			
 
				+    )
			
 
				+)
			
 
				+print(model.config)
			
 
				+
			
 
				+hf_state_dict = hf_model.state_dict()
			
 
				+model_state_dict = model.state_dict()
			
 
				+
			
 
				+# print(hf_state_dict.keys())
			
 
				+# print(model_state_dict.keys())
			
 
				+
			
 
				+new_state_dict = {}
			
 
				+
			
 
				+# Handle embeddings
			
 
				+new_state_dict["embeddings.weight"] = model_state_dict.pop("embeddings.weight")
			
 
				+hf_embed_tokens = hf_state_dict.pop("model.embed_tokens.weight")
			
 
				+new_state_dict["embeddings.weight"][: hf_embed_tokens.shape[0]] = hf_embed_tokens
			
 
				+
			
 
				+# Restore layers
			
 
				+for layer_idx in range(hf_model.config.num_hidden_layers):
			
 
				+    # Handle attention
			
 
				+    q_weight = hf_state_dict.pop(f"model.layers.{layer_idx}.self_attn.q_proj.weight")
			
 
				+    k_weight = hf_state_dict.pop(f"model.layers.{layer_idx}.self_attn.k_proj.weight")
			
 
				+    v_weight = hf_state_dict.pop(f"model.layers.{layer_idx}.self_attn.v_proj.weight")
			
 
				+    qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
			
 
				+    new_state_dict[f"layers.{layer_idx}.attention.wqkv.weight"] = qkv_weight
			
 
				+    model_state_dict.pop(f"layers.{layer_idx}.attention.wqkv.weight")
			
 
				+
			
 
				+    o_weight = hf_state_dict.pop(f"model.layers.{layer_idx}.self_attn.o_proj.weight")
			
 
				+    new_state_dict[f"layers.{layer_idx}.attention.wo.weight"] = o_weight
			
 
				+    model_state_dict.pop(f"layers.{layer_idx}.attention.wo.weight")
			
 
				+
			
 
				+    # Handle feed forward
			
 
				+    up_weight = hf_state_dict.pop(f"model.layers.{layer_idx}.mlp.up_proj.weight")
			
 
				+    down_weight = hf_state_dict.pop(f"model.layers.{layer_idx}.mlp.down_proj.weight")
			
 
				+    gate_weight = hf_state_dict.pop(f"model.layers.{layer_idx}.mlp.gate_proj.weight")
			
 
				+
			
 
				+    new_state_dict[f"layers.{layer_idx}.feed_forward.w1.weight"] = gate_weight
			
 
				+    new_state_dict[f"layers.{layer_idx}.feed_forward.w2.weight"] = down_weight
			
 
				+    new_state_dict[f"layers.{layer_idx}.feed_forward.w3.weight"] = up_weight
			
 
				+
			
 
				+    model_state_dict.pop(f"layers.{layer_idx}.feed_forward.w1.weight")
			
 
				+    model_state_dict.pop(f"layers.{layer_idx}.feed_forward.w2.weight")
			
 
				+    model_state_dict.pop(f"layers.{layer_idx}.feed_forward.w3.weight")
			
 
				+
			
 
				+    # Handle layer norms
			
 
				+    input_layernorm_weight = hf_state_dict.pop(
			
 
				+        f"model.layers.{layer_idx}.input_layernorm.weight"
			
 
				+    )
			
 
				+    post_attention_layernorm_weight = hf_state_dict.pop(
			
 
				+        f"model.layers.{layer_idx}.post_attention_layernorm.weight"
			
 
				+    )
			
 
				+
			
 
				+    new_state_dict[f"layers.{layer_idx}.ffn_norm.weight"] = (
			
 
				+        post_attention_layernorm_weight
			
 
				+    )
			
 
				+    new_state_dict[f"layers.{layer_idx}.attention_norm.weight"] = input_layernorm_weight
			
 
				+
			
 
				+    model_state_dict.pop(f"layers.{layer_idx}.ffn_norm.weight")
			
 
				+    model_state_dict.pop(f"layers.{layer_idx}.attention_norm.weight")
			
 
				+
			
 
				+# Handle final layer norm
			
 
				+new_state_dict["norm.weight"] = hf_state_dict.pop("model.norm.weight")
			
 
				+model_state_dict.pop("norm.weight")
			
 
				+
			
 
				+# Handle output layer
			
 
				+w = hf_state_dict.pop("lm_head.weight")
			
 
				+new_state_dict["output.weight"] = model_state_dict.pop("output.weight")
			
 
				+new_state_dict["output.weight"][: w.shape[0]] = w
			
 
				+
			
 
				+print(hf_state_dict.keys(), len(hf_state_dict))
			
 
				+print(model_state_dict.keys(), len(model_state_dict))
			
 
				+
			
 
				+print(model.load_state_dict(new_state_dict, strict=True))
			
 
				+
			
 
				+model = model.bfloat16()
			
 
				+
			
 
				+new_state_dict = {f"model.{k}": v for k, v in model.state_dict().items()}
			
 
				+torch.save(
			
 
				+    new_state_dict,
			
 
				+    "checkpoints/fish-speech-agent-1/TinyLlama-1.1B-intermediate-step-1431k-3T.pth",
			
 
				+)
			
--- a/tools/llama/generate.py
+++ b/tools/llama/generate.py
@@ -19,7 +19,7 @@ from loguru import logger
 
				 from tqdm import tqdm
			
 
				 from transformers import AutoTokenizer
			
 
				 
			
 
				-from fish_speech.datasets.text import CODEBOOK_EOS_TOKEN_ID, CODEBOOK_PAD_TOKEN_ID
			
 
				+from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
			
 
				 from fish_speech.text import clean_text, split_text
			
 
				 
			
 
				 os.environ["TOKENIZERS_PARALLELISM"] = "false"
			
@@ -31,7 +31,11 @@ if hasattr(torch._inductor.config, "fx_graph_cache"):
 
				     torch._inductor.config.fx_graph_cache = True
			
 
				 
			
 
				 
			
 
				-from fish_speech.models.text2semantic.llama import DualARTransformer, NaiveTransformer
			
 
				+from fish_speech.models.text2semantic.llama import (
			
 
				+    BaseTransformer,
			
 
				+    DualARTransformer,
			
 
				+    NaiveTransformer,
			
 
				+)
			
 
				 
			
 
				 
			
 
				 def multinomial_sample_one_no_sync(
			
@@ -161,7 +165,6 @@ def decode_n_tokens(
 
				     cur_token: torch.Tensor,
			
 
				     input_pos: torch.Tensor,
			
 
				     num_new_tokens: int,
			
 
				-    eos_token_id: int = 2,
			
 
				     im_end_id: int = 4,
			
 
				     decode_one_token=decode_one_token_naive,
			
 
				     **sampling_kwargs,
			
@@ -197,11 +200,7 @@ def decode_n_tokens(
 
				             model.config.num_codebooks + 1, -1
			
 
				         )
			
 
				 
			
 
				-        if (
			
 
				-            cur_token[0, 0, -1] == eos_token_id
			
 
				-            or cur_token[0, 0, -1] == im_end_id
			
 
				-            or (cur_token[0, 1:, -1] == CODEBOOK_EOS_TOKEN_ID).any()
			
 
				-        ):
			
 
				+        if cur_token[0, 0, -1] == im_end_id:
			
 
				             break
			
 
				 
			
 
				     return previous_tokens[:, : i + 1]
			
@@ -214,7 +213,6 @@ def generate(
 
				     model: NaiveTransformer,
			
 
				     prompt: torch.Tensor,
			
 
				     max_new_tokens: int,
			
 
				-    eos_token_id: int = 2,
			
 
				     im_end_id: int = 4,
			
 
				     decode_one_token=decode_one_token_naive,
			
 
				     **sampling_kwargs,
			
@@ -255,6 +253,7 @@ def generate(
 
				         if isinstance(model, NaiveTransformer)
			
 
				         else decode_one_token_ar
			
 
				     )
			
 
				+
			
 
				     next_token = prefill_decode(
			
 
				         model, prompt.view(1, codebook_dim, -1), input_pos, **sampling_kwargs
			
 
				     )
			
@@ -266,7 +265,6 @@ def generate(
 
				         next_token.view(1, codebook_dim, -1),
			
 
				         input_pos,
			
 
				         max_new_tokens - 1,
			
 
				-        eos_token_id=eos_token_id,
			
 
				         im_end_id=im_end_id,
			
 
				         decode_one_token=decode_one_token,
			
 
				         **sampling_kwargs,
			
@@ -281,22 +279,12 @@ def generate(
 
				 def encode_tokens(
			
 
				     tokenizer,
			
 
				     string,
			
 
				-    bos=True,
			
 
				     device="cuda",
			
 
				     prompt_tokens=None,
			
 
				-    speaker=None,
			
 
				     num_codebooks=4,
			
 
				 ):
			
 
				     string = clean_text(string)
			
 
				-
			
 
				-    if speaker is None:
			
 
				-        speaker = "assistant"
			
 
				-
			
 
				-    string = (
			
 
				-        f"<|im_start|>user<|im_sep|>{string}<|im_end|><|im_start|>{speaker}<|im_sep|>"
			
 
				-    )
			
 
				-    if bos:
			
 
				-        string = f"<|begin_of_sequence|>{string}"
			
 
				+    string = f"<|im_start|>user\n{string}<|im_end|><|im_start|>assistant\n"
			
 
				 
			
 
				     new_tokens = tokenizer.encode(
			
 
				         string,
			
@@ -324,7 +312,7 @@ def encode_tokens(
 
				         prompt_tokens = prompt_tokens[0]
			
 
				 
			
 
				     assert prompt_tokens.ndim == 2
			
 
				-    data = prompt_tokens + 2
			
 
				+    data = prompt_tokens + 1
			
 
				 
			
 
				     if prompt_tokens.shape[0] > num_codebooks:
			
 
				         logger.warning(
			
@@ -332,13 +320,9 @@ def encode_tokens(
 
				         )
			
 
				         data = data[:num_codebooks]
			
 
				 
			
 
				-    # Add eos token for each codebook
			
 
				+    # Add pad token for each codebook
			
 
				     data = torch.cat(
			
 
				-        (
			
 
				-            data,
			
 
				-            torch.ones((data.size(0), 1), dtype=torch.int, device=device)
			
 
				-            * CODEBOOK_EOS_TOKEN_ID,
			
 
				-        ),
			
 
				+        (data, torch.zeros((data.size(0), 1), dtype=torch.int, device=device)),
			
 
				         dim=1,
			
 
				     )
			
 
				 
			
@@ -356,16 +340,10 @@ def encode_tokens(
 
				     return prompt
			
 
				 
			
 
				 
			
 
				-def load_model(
			
 
				-    config_name, checkpoint_path, device, precision, max_length, compile=False
			
 
				-):
			
 
				-    hydra.core.global_hydra.GlobalHydra.instance().clear()
			
 
				-    with initialize(version_base="1.3", config_path="../../fish_speech/configs/model"):
			
 
				-        cfg = compose(
			
 
				-            config_name=config_name, overrides=[f"config.max_seq_len={max_length}"]
			
 
				-        )
			
 
				-
			
 
				-    model: Union[NaiveTransformer, DualARTransformer] = instantiate(cfg)
			
 
				+def load_model(checkpoint_path, device, precision, compile=False):
			
 
				+    model: Union[NaiveTransformer, DualARTransformer] = BaseTransformer.from_pretrained(
			
 
				+        checkpoint_path, load_weights=True
			
 
				+    )
			
 
				 
			
 
				     if "int8" in str(checkpoint_path):
			
 
				         logger.info("Using int8 weight-only quantization!")
			
@@ -384,21 +362,8 @@ def load_model(
 
				         simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
			
 
				         model = simple_quantizer.convert_for_runtime()
			
 
				 
			
 
				-    checkpoint = torch.load(str(checkpoint_path), map_location="cpu")
			
 
				-    if "state_dict" in checkpoint:
			
 
				-        checkpoint = checkpoint["state_dict"]
			
 
				-
			
 
				-    if any(k.startswith("model.") for k in checkpoint):
			
 
				-        checkpoint = {
			
 
				-            k.replace("model.", ""): v
			
 
				-            for k, v in checkpoint.items()
			
 
				-            if k.startswith("model.")
			
 
				-        }
			
 
				-
			
 
				-    model.load_state_dict(checkpoint, assign=True)
			
 
				-
			
 
				     model = model.to(device=device, dtype=precision)
			
 
				-    logger.info("Restored model from checkpoint")
			
 
				+    logger.info(f"Restored model from checkpoint")
			
 
				 
			
 
				     if isinstance(model, DualARTransformer):
			
 
				         decode_one_token = decode_one_token_ar
			
@@ -426,7 +391,6 @@ class GenerateResponse:
 
				 def generate_long(
			
 
				     *,
			
 
				     model,
			
 
				-    tokenizer: callable,
			
 
				     device: str | torch.device,
			
 
				     decode_one_token: callable,
			
 
				     text: str,
			
@@ -439,7 +403,6 @@ def generate_long(
 
				     iterative_prompt: bool = True,
			
 
				     max_length: int = 2048,
			
 
				     chunk_length: int = 150,
			
 
				-    speaker: Optional[str] = None,
			
 
				     prompt_text: Optional[str | list[str]] = None,
			
 
				     prompt_tokens: Optional[torch.Tensor | list[torch.Tensor]] = None,
			
 
				 ):
			
@@ -457,6 +420,7 @@ def generate_long(
 
				     ), "Prompt text and tokens must have the same length"
			
 
				 
			
 
				     model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
			
 
				+    tokenizer = model.tokenizer
			
 
				     im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
			
 
				 
			
 
				     encoded = []
			
@@ -469,10 +433,8 @@ def generate_long(
 
				                 encode_tokens(
			
 
				                     tokenizer,
			
 
				                     string=t,
			
 
				-                    bos=idx == 0,
			
 
				                     device=device,
			
 
				                     prompt_tokens=c,
			
 
				-                    speaker=speaker,
			
 
				                     num_codebooks=model.config.num_codebooks,
			
 
				                 )
			
 
				             )
			
@@ -482,9 +444,7 @@ def generate_long(
 
				             encode_tokens(
			
 
				                 tokenizer,
			
 
				                 string=text,
			
 
				-                bos=idx == 0 and not use_prompt,
			
 
				                 device=device,
			
 
				-                speaker=speaker,
			
 
				                 num_codebooks=model.config.num_codebooks,
			
 
				             )
			
 
				         )
			
@@ -544,7 +504,6 @@ def generate_long(
 
				                 model=model,
			
 
				                 prompt=cat_encoded,
			
 
				                 max_new_tokens=max_new_tokens,
			
 
				-                eos_token_id=tokenizer.eos_token_id,
			
 
				                 im_end_id=im_end_id,
			
 
				                 decode_one_token=decode_one_token,
			
 
				                 temperature=temperature,
			
@@ -576,19 +535,13 @@ def generate_long(
 
				 
			
 
				             # Put the generated tokens
			
 
				             # since there is <im_end> and <eos> tokens, we remove last 2 tokens
			
 
				-            codes = y[1:, prompt_length:-2].clone()
			
 
				-
			
 
				-            codes = codes - 2
			
 
				+            codes = y[1:, prompt_length:-1].clone()
			
 
				+            codes = codes - 1
			
 
				             assert (codes >= 0).all(), f"Negative code found"
			
 
				 
			
 
				             decoded = y[:, prompt_length:-1].clone()
			
 
				-            if decoded[0, -1] != im_end_id:  # <im_end>
			
 
				-                val = [[im_end_id]] + [[CODEBOOK_EOS_TOKEN_ID]] * (decoded.size(0) - 1)
			
 
				-                decoded = torch.cat(
			
 
				-                    (decoded, torch.tensor(val, device=device, dtype=torch.int)), dim=1
			
 
				-                )
			
 
				-
			
 
				             # But for global encoding, we should keep the <im_end> token
			
 
				+
			
 
				             global_encoded.append(decoded)
			
 
				             assert (codes >= 0).all(), f"Negative code found: {codes}"
			
 
				             yield GenerateResponse(action="sample", codes=codes, text=texts[seg_idx])
			
@@ -611,11 +564,9 @@ class GenerateRequest:
 
				 
			
 
				 
			
 
				 def launch_thread_safe_queue(
			
 
				-    config_name,
			
 
				     checkpoint_path,
			
 
				     device,
			
 
				     precision,
			
 
				-    max_length: int,
			
 
				     compile: bool = False,
			
 
				 ):
			
 
				     input_queue = queue.Queue()
			
@@ -623,7 +574,7 @@ def launch_thread_safe_queue(
 
				 
			
 
				     def worker():
			
 
				         model, decode_one_token = load_model(
			
 
				-            config_name, checkpoint_path, device, precision, max_length, compile=compile
			
 
				+            checkpoint_path, device, precision, compile=compile
			
 
				         )
			
 
				         init_event.set()
			
 
				 
			
@@ -672,16 +623,12 @@ def launch_thread_safe_queue(
 
				 @click.option(
			
 
				     "--checkpoint-path",
			
 
				     type=click.Path(path_type=Path, exists=True),
			
 
				-    default="checkpoints/text2semantic-sft-medium-v1-4k.pth",
			
 
				+    default="checkpoints/fish-speech-1.2",
			
 
				 )
			
 
				-@click.option("--config-name", type=str, default="dual_ar_2_codebook_medium")
			
 
				-@click.option("--tokenizer", type=str, default="fishaudio/fish-speech-1")
			
 
				 @click.option("--compile/--no-compile", default=False)
			
 
				 @click.option("--seed", type=int, default=42)
			
 
				-@click.option("--speaker", type=str, default=None)
			
 
				 @click.option("--half/--no-half", default=False)
			
 
				 @click.option("--iterative-prompt/--no-iterative-prompt", default=True)
			
 
				-@click.option("--max-length", type=int, default=2048)
			
 
				 @click.option("--chunk-length", type=int, default=150)
			
 
				 def main(
			
 
				     text: str,
			
@@ -693,14 +640,10 @@ def main(
 
				     repetition_penalty: float,
			
 
				     temperature: float,
			
 
				     checkpoint_path: Path,
			
 
				-    config_name: str,
			
 
				-    tokenizer: str,
			
 
				     compile: bool,
			
 
				     seed: int,
			
 
				-    speaker: Optional[str],
			
 
				     half: bool,
			
 
				     iterative_prompt: bool,
			
 
				-    max_length: int,
			
 
				     chunk_length: int,
			
 
				 ) -> None:
			
 
				     device = "cuda"
			
@@ -715,7 +658,7 @@ def main(
 
				     logger.info("Loading model ...")
			
 
				     t0 = time.time()
			
 
				     model, decode_one_token = load_model(
			
 
				-        config_name, checkpoint_path, device, precision, max_length, compile=compile
			
 
				+        checkpoint_path, device, precision, compile=compile
			
 
				     )
			
 
				 
			
 
				     if torch.cuda.is_available():
			
@@ -726,7 +669,6 @@ def main(
 
				     if prompt_tokens is not None:
			
 
				         prompt_tokens = [torch.from_numpy(np.load(p)).to(device) for p in prompt_tokens]
			
 
				 
			
 
				-    tokenizer = AutoTokenizer.from_pretrained(tokenizer)
			
 
				     torch.manual_seed(seed)
			
 
				 
			
 
				     if torch.cuda.is_available():
			
@@ -742,11 +684,8 @@ def main(
 
				         top_p=top_p,
			
 
				         repetition_penalty=repetition_penalty,
			
 
				         temperature=temperature,
			
 
				-        tokenizer=tokenizer,
			
 
				         compile=compile,
			
 
				-        speaker=speaker,
			
 
				         iterative_prompt=iterative_prompt,
			
 
				-        max_length=max_length,
			
 
				         chunk_length=chunk_length,
			
 
				         prompt_text=prompt_text,
			
 
				         prompt_tokens=prompt_tokens,
			
--- a/tools/llama/merge_lora.py
+++ b/tools/llama/merge_lora.py
@@ -14,9 +14,7 @@ from fish_speech.models.text2semantic.lora_utils import (
 
				 @click.command()
			
 
				 @click.option("--llama-config", type=str, default="dual_ar_2_codebook_medium")
			
 
				 @click.option("--lora-config", type=str, default="r_8_alpha_16")
			
 
				-@click.option(
			
 
				-    "--llama-weight", type=str, default="checkpoints/text2semantic-sft-medium-v1-4k.pth"
			
 
				-)
			
 
				+@click.option("--llama-weight", type=str, default="checkpoints/fish-speech-1.2")
			
 
				 @click.option("--lora-weight", type=str, required=True)
			
 
				 @click.option("--output", type=str, required=True)
			
 
				 def merge(llama_config, lora_config, llama_weight, lora_weight, output):
			
--- a/tools/llama/quantize.py
+++ b/tools/llama/quantize.py
@@ -419,7 +419,7 @@ class WeightOnlyInt4Linear(torch.nn.Module):
 
				 @click.option(
			
 
				     "--checkpoint-path",
			
 
				     type=click.Path(path_type=Path, exists=True),
			
 
				-    default="checkpoints/text2semantic-sft-medium-v1-4k.pth",
			
 
				+    default="checkpoints/fish-speech-1.2",
			
 
				 )
			
 
				 @click.option("--config-name", type=str, default="dual_ar_2_codebook_medium")
			
 
				 @click.option(
			
--- a/tools/vits_decoder/inference.py
+++ b/tools/vits_decoder/inference.py
@@ -72,7 +72,7 @@ def load_model(config_name, checkpoint_path, device="cuda"):
 
				 @click.option(
			
 
				     "--checkpoint-path",
			
 
				     "-ckpt",
			
 
				-    default="checkpoints/vq-gan-group-fsq-2x1024.pth",
			
 
				+    default="checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
			
 
				 )
			
 
				 @click.option(
			
 
				     "--device",
			
--- a/tools/vqgan/extract_vq.py
+++ b/tools/vqgan/extract_vq.py
@@ -41,23 +41,31 @@ logger.add(sys.stderr, format=logger_format)
 
				 
			
 
				 @lru_cache(maxsize=1)
			
 
				 def get_model(
			
 
				-    config_name: str = "vqgan_pretrain",
			
 
				-    checkpoint_path: str = "checkpoints/vq-gan-group-fsq-2x1024.pth",
			
 
				+    config_name: str = "firefly_gan_vq",
			
 
				+    checkpoint_path: str = "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
			
 
				+    device: str | torch.device = "cuda",
			
 
				 ):
			
 
				     with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
			
 
				         cfg = compose(config_name=config_name)
			
 
				 
			
 
				-    model: LightningModule = instantiate(cfg.model)
			
 
				+    model = instantiate(cfg)
			
 
				     state_dict = torch.load(
			
 
				         checkpoint_path,
			
 
				-        map_location=model.device,
			
 
				+        map_location=device,
			
 
				     )
			
 
				     if "state_dict" in state_dict:
			
 
				         state_dict = state_dict["state_dict"]
			
 
				 
			
 
				+    if any("generator" in k for k in state_dict):
			
 
				+        state_dict = {
			
 
				+            k.replace("generator.", ""): v
			
 
				+            for k, v in state_dict.items()
			
 
				+            if "generator." in k
			
 
				+        }
			
 
				+
			
 
				     model.load_state_dict(state_dict, strict=False)
			
 
				     model.eval()
			
 
				-    model.cuda()
			
 
				+    model.to(device)
			
 
				 
			
 
				     logger.info(f"Loaded model")
			
 
				     return model
			
@@ -82,8 +90,10 @@ def process_batch(files: list[Path], model) -> float:
 
				         if wav.shape[0] > 1:
			
 
				             wav = wav.mean(dim=0, keepdim=True)
			
 
				 
			
 
				-        wav = torchaudio.functional.resample(wav.cuda(), sr, model.sampling_rate)[0]
			
 
				-        total_time += len(wav) / model.sampling_rate
			
 
				+        wav = torchaudio.functional.resample(
			
 
				+            wav.cuda(), sr, model.spec_transform.sample_rate
			
 
				+        )[0]
			
 
				+        total_time += len(wav) / model.spec_transform.sample_rate
			
 
				         max_length = max(max_length, len(wav))
			
 
				 
			
 
				         wavs.append(wav)
			
@@ -120,10 +130,10 @@ def process_batch(files: list[Path], model) -> float:
 
				 @click.command()
			
 
				 @click.argument("folder")
			
 
				 @click.option("--num-workers", default=1)
			
 
				-@click.option("--config-name", default="vqgan_pretrain")
			
 
				+@click.option("--config-name", default="firefly_gan_vq")
			
 
				 @click.option(
			
 
				     "--checkpoint-path",
			
 
				-    default="checkpoints/vq-gan-group-fsq-2x1024.pth",
			
 
				+    default="checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
			
 
				 )
			
 
				 @click.option("--batch-size", default=64)
			
 
				 @click.option("--filelist", default=None, type=Path)
			
--- a/tools/vqgan/inference.py
+++ b/tools/vqgan/inference.py
@@ -8,7 +8,6 @@ import torch
 
				 import torchaudio
			
 
				 from hydra import compose, initialize
			
 
				 from hydra.utils import instantiate
			
 
				-from lightning import LightningModule
			
 
				 from loguru import logger
			
 
				 from omegaconf import OmegaConf
			
 
				 
			
@@ -23,20 +22,26 @@ def load_model(config_name, checkpoint_path, device="cuda"):
 
				     with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
			
 
				         cfg = compose(config_name=config_name)
			
 
				 
			
 
				-    model: LightningModule = instantiate(cfg.model)
			
 
				+    model = instantiate(cfg)
			
 
				     state_dict = torch.load(
			
 
				         checkpoint_path,
			
 
				-        map_location=model.device,
			
 
				+        map_location=device,
			
 
				     )
			
 
				-
			
 
				     if "state_dict" in state_dict:
			
 
				         state_dict = state_dict["state_dict"]
			
 
				 
			
 
				-    model.load_state_dict(state_dict, strict=False)
			
 
				+    if any("generator" in k for k in state_dict):
			
 
				+        state_dict = {
			
 
				+            k.replace("generator.", ""): v
			
 
				+            for k, v in state_dict.items()
			
 
				+            if "generator." in k
			
 
				+        }
			
 
				+
			
 
				+    result = model.load_state_dict(state_dict, strict=False)
			
 
				     model.eval()
			
 
				     model.to(device)
			
 
				-    logger.info("Restored model from checkpoint")
			
 
				 
			
 
				+    logger.info(f"Loaded model: {result}")
			
 
				     return model
			
 
				 
			
 
				 
			
@@ -51,11 +56,10 @@ def load_model(config_name, checkpoint_path, device="cuda"):
 
				 @click.option(
			
 
				     "--output-path", "-o", default="fake.wav", type=click.Path(path_type=Path)
			
 
				 )
			
 
				-@click.option("--config-name", "-cfg", default="vqgan_pretrain")
			
 
				+@click.option("--config-name", default="firefly_gan_vq")
			
 
				 @click.option(
			
 
				     "--checkpoint-path",
			
 
				-    "-ckpt",
			
 
				-    default="checkpoints/vq-gan-group-fsq-2x1024.pth",
			
 
				+    default="checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
			
 
				 )
			
 
				 @click.option(
			
 
				     "--device",
			
@@ -72,17 +76,17 @@ def main(input_path, output_path, config_name, checkpoint_path, device):
 
				         audio, sr = torchaudio.load(str(input_path))
			
 
				         if audio.shape[0] > 1:
			
 
				             audio = audio.mean(0, keepdim=True)
			
 
				-        audio = torchaudio.functional.resample(audio, sr, model.sampling_rate)
			
 
				+        audio = torchaudio.functional.resample(
			
 
				+            audio, sr, model.spec_transform.sample_rate
			
 
				+        )
			
 
				 
			
 
				-        audios = audio[None].to(model.device)
			
 
				+        audios = audio[None].to(device)
			
 
				         logger.info(
			
 
				-            f"Loaded audio with {audios.shape[2] / model.sampling_rate:.2f} seconds"
			
 
				+            f"Loaded audio with {audios.shape[2] / model.spec_transform.sample_rate:.2f} seconds"
			
 
				         )
			
 
				 
			
 
				         # VQ Encoder
			
 
				-        audio_lengths = torch.tensor(
			
 
				-            [audios.shape[2]], device=model.device, dtype=torch.long
			
 
				-        )
			
 
				+        audio_lengths = torch.tensor([audios.shape[2]], device=device, dtype=torch.long)
			
 
				         indices = model.encode(audios, audio_lengths)[0][0]
			
 
				 
			
 
				         logger.info(f"Generated indices of shape {indices.shape}")
			
@@ -92,17 +96,15 @@ def main(input_path, output_path, config_name, checkpoint_path, device):
 
				     elif input_path.suffix == ".npy":
			
 
				         logger.info(f"Processing precomputed indices from {input_path}")
			
 
				         indices = np.load(input_path)
			
 
				-        indices = torch.from_numpy(indices).to(model.device).long()
			
 
				+        indices = torch.from_numpy(indices).to(device).long()
			
 
				         assert indices.ndim == 2, f"Expected 2D indices, got {indices.ndim}"
			
 
				     else:
			
 
				         raise ValueError(f"Unknown input type: {input_path}")
			
 
				 
			
 
				     # Restore
			
 
				-    feature_lengths = torch.tensor([indices.shape[1]], device=model.device)
			
 
				-    fake_audios = model.decode(
			
 
				-        indices=indices[None], feature_lengths=feature_lengths, return_audios=True
			
 
				-    )
			
 
				-    audio_time = fake_audios.shape[-1] / model.sampling_rate
			
 
				+    feature_lengths = torch.tensor([indices.shape[1]], device=device)
			
 
				+    fake_audios = model.decode(indices=indices[None], feature_lengths=feature_lengths)
			
 
				+    audio_time = fake_audios.shape[-1] / model.spec_transform.sample_rate
			
 
				 
			
 
				     logger.info(
			
 
				         f"Generated audio of shape {fake_audios.shape}, equivalent to {audio_time:.2f} seconds from {indices.shape[1]} features, features/second: {indices.shape[1] / audio_time:.2f}"
			
@@ -110,7 +112,7 @@ def main(input_path, output_path, config_name, checkpoint_path, device):
 
				 
			
 
				     # Save audio
			
 
				     fake_audio = fake_audios[0, 0].float().cpu().numpy()
			
 
				-    sf.write(output_path, fake_audio, model.sampling_rate)
			
 
				+    sf.write(output_path, fake_audio, model.spec_transform.sample_rate)
			
 
				     logger.info(f"Saved audio to {output_path}")
			
 
				 
			
 
				 
			
--- a/tools/webui.py
+++ b/tools/webui.py
@@ -443,21 +443,17 @@ def parse_args():
 
				     parser.add_argument(
			
 
				         "--llama-checkpoint-path",
			
 
				         type=Path,
			
 
				-        default="checkpoints/text2semantic-sft-large-v1.1-4k.pth",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--llama-config-name", type=str, default="dual_ar_2_codebook_large"
			
 
				+        default="checkpoints/fish-speech-1.2",
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         "--decoder-checkpoint-path",
			
 
				         type=Path,
			
 
				-        default="checkpoints/vq-gan-group-fsq-2x1024.pth",
			
 
				+        default="checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
			
 
				     )
			
 
				-    parser.add_argument("--decoder-config-name", type=str, default="vqgan_pretrain")
			
 
				+    parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
			
 
				     parser.add_argument("--tokenizer", type=str, default="fishaudio/fish-speech-1")
			
 
				     parser.add_argument("--device", type=str, default="cuda")
			
 
				     parser.add_argument("--half", action="store_true")
			
 
				-    parser.add_argument("--max-length", type=int, default=2048)
			
 
				     parser.add_argument("--compile", action="store_true")
			
 
				     parser.add_argument("--max-gradio-length", type=int, default=0)
			
 
				 
			
@@ -470,11 +466,9 @@ if __name__ == "__main__":
 
				 
			
 
				     logger.info("Loading Llama model...")
			
 
				     llama_queue = launch_thread_safe_queue(
			
 
				-        config_name=args.llama_config_name,
			
 
				         checkpoint_path=args.llama_checkpoint_path,
			
 
				         device=args.device,
			
 
				         precision=args.precision,
			
 
				-        max_length=args.max_length,
			
 
				         compile=args.compile,
			
 
				     )
			
 
				     llama_tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)