فهرست منبع

Merge VQGAN v2 to dev (#56)

* squash vqgan v2 changes

* Merge pretrain stage 1 and 2

* Optimize vqgan inference (remove redundant code)

* Implement data mixing

* Optimize vqgan v2 config

* Add support to freeze discriminator

* Add stft loss & larger segment size
Leng Yue 2 سال پیش
والد
کامیت
1609e9bad4

+ 1 - 0
.pre-commit-config.yaml

@@ -18,6 +18,7 @@ repos:
     hooks:
       - id: codespell
         files: ^.*\.(py|md|rst|yml)$
+        args: [-L=fro]
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.5.0

+ 59 - 25
fish_speech/configs/vqgan_pretrain_v2.yaml

@@ -3,6 +3,8 @@ defaults:
   - _self_
 
 project: vqgan_pretrain_v2
+ckpt_path: checkpoints/hifigan-base-comb-mix-lb-020/step_001200000_weights_only.ckpt
+resume_weights_only: true
 
 # Lightning Trainer
 trainer:
@@ -15,22 +17,36 @@ trainer:
 
 sample_rate: 44100
 hop_length: 512
-num_mels: 128
+num_mels: 160
 n_fft: 2048
 win_length: 2048
 segment_size: 256
 
 # Dataset Configuration
 train_dataset:
-  _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/Genshin/vq_train_filelist.txt
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-  slice_frames: ${segment_size}
+  _target_: fish_speech.datasets.vqgan.MixDatast
+  datasets:
+    high-quality-441:
+      prob: 0.5
+      dataset:
+        _target_: fish_speech.datasets.vqgan.VQGANDataset
+        filelist: data/vocoder_data_441/vq_train_filelist.txt
+        sample_rate: ${sample_rate}
+        hop_length: ${hop_length}
+        slice_frames: ${segment_size}
+    
+    common-voice:
+      prob: 0.5
+      dataset:
+        _target_: fish_speech.datasets.vqgan.VQGANDataset
+        filelist: data/cv-corpus-16.0-2023-12-06/vq_train_filelist.txt
+        sample_rate: ${sample_rate}
+        hop_length: ${hop_length}
+        slice_frames: ${segment_size}
 
 val_dataset:
   _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/Genshin/vq_val_filelist.txt
+  filelist: data/vocoder_data_441/vq_val_filelist.txt
   sample_rate: ${sample_rate}
   hop_length: ${hop_length}
 
@@ -47,8 +63,9 @@ model:
   _target_: fish_speech.models.vqgan.VQGAN
   sample_rate: ${sample_rate}
   hop_length: ${hop_length}
-  segment_size: 8192
-  mode: pretrain-stage1
+  segment_size: 32768
+  mode: pretrain
+  freeze_discriminator: true
 
   downsample:
     _target_: fish_speech.models.vqgan.modules.encoders.ConvDownSampler
@@ -67,8 +84,8 @@ model:
     _target_: fish_speech.models.vqgan.modules.encoders.VQEncoder
     in_channels: 256
     vq_channels: 256
-    codebook_size: 1024
-    codebook_layers: 4
+    codebook_size: 256
+    codebook_groups: 4
     downsample: 1
 
   decoder:
@@ -80,19 +97,38 @@ model:
     n_layers: 6
 
   generator:
-    _target_: fish_speech.models.vqgan.modules.decoder.Generator
-    initial_channel: ${num_mels}
-    resblock: "1"
+    _target_: fish_speech.models.vqgan.modules.decoder_v2.HiFiGANGenerator
+    hop_length: ${hop_length}
+    upsample_rates: [8, 8, 2, 2, 2]  # aka. strides
+    upsample_kernel_sizes: [16, 16, 4, 4, 4]
     resblock_kernel_sizes: [3, 7, 11]
     resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    upsample_rates: [8, 8, 2, 2, 2]
+    num_mels: ${num_mels}
     upsample_initial_channel: 512
-    upsample_kernel_sizes: [16, 16, 4, 4, 4]
-
-  discriminator:
-    _target_: fish_speech.models.vqgan.modules.discriminator.EnsembleDiscriminator
-    periods: [2, 3, 5, 7, 11, 17, 23, 37]
-
+    use_template: true
+    pre_conv_kernel_size: 7
+    post_conv_kernel_size: 7
+
+  discriminators:
+    _target_: torch.nn.ModuleDict
+    modules:
+      mpd:
+        _target_: fish_speech.models.vqgan.modules.discriminators.mpd.MultiPeriodDiscriminator
+        periods: [2, 3, 5, 7, 11, 17, 23, 37]
+
+      mrd:
+        _target_: fish_speech.models.vqgan.modules.discriminators.mrd.MultiResolutionDiscriminator
+        resolutions:
+          - ["${n_fft}", "${hop_length}", "${win_length}"]
+          - [1024, 120, 600]
+          - [2048, 240, 1200]
+          - [4096, 480, 2400]
+          - [512, 50, 240]
+
+  multi_resolution_stft_loss:
+    _target_: fish_speech.models.vqgan.losses.MultiResolutionSTFTLoss
+    resolutions: ${model.discriminators.modules.mrd.resolutions}
+  
   mel_transform:
     _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
     sample_rate: ${sample_rate}
@@ -100,13 +136,11 @@ model:
     hop_length: ${hop_length}
     win_length: ${win_length}
     n_mels: ${num_mels}
-    f_min: 0
-    f_max: 16000
 
   optimizer:
     _target_: torch.optim.AdamW
     _partial_: true
-    lr: 2e-4
+    lr: 1e-4
     betas: [0.8, 0.99]
     eps: 1e-5
 
@@ -119,7 +153,7 @@ callbacks:
   grad_norm_monitor:
     sub_module: 
       - generator
-      - discriminator
+      - discriminators
       - mel_encoder
       - vq_encoder
       - decoder

+ 29 - 2
fish_speech/datasets/vqgan.py

@@ -6,7 +6,7 @@ import librosa
 import numpy as np
 import torch
 from lightning import LightningDataModule
-from torch.utils.data import DataLoader, Dataset
+from torch.utils.data import DataLoader, Dataset, IterableDataset
 
 from fish_speech.utils import RankedLogger
 
@@ -72,6 +72,33 @@ class VQGANDataset(Dataset):
             return None
 
 
+class MixDatast(IterableDataset):
+    def __init__(self, datasets: dict[str, dict], seed: int = 42) -> None:
+        values = list(datasets.values())
+        probs = [v["prob"] for v in values]
+        self.datasets = [v["dataset"] for v in values]
+
+        total_probs = sum(probs)
+        self.probs = [p / total_probs for p in probs]
+        self.seed = seed
+
+    def __iter__(self):
+        rng = np.random.default_rng(self.seed)
+        dataset_iterators = [iter(dataset) for dataset in self.datasets]
+
+        while True:
+            # Random choice one
+            dataset_idx = rng.choice(len(self.datasets), p=self.probs)
+            dataset_iterator = dataset_iterators[dataset_idx]
+
+            try:
+                yield next(dataset_iterator)
+            except StopIteration:
+                # Exhausted, create a new iterator
+                dataset_iterators[dataset_idx] = iter(self.datasets[dataset_idx])
+                yield next(dataset_iterators[dataset_idx])
+
+
 @dataclass
 class VQGANCollator:
     def __call__(self, batch):
@@ -116,7 +143,7 @@ class VQGANDataModule(LightningDataModule):
             batch_size=self.batch_size,
             collate_fn=VQGANCollator(),
             num_workers=self.num_workers,
-            shuffle=True,
+            shuffle=not isinstance(self.train_dataset, IterableDataset),
         )
 
     def val_dataloader(self):

+ 298 - 151
fish_speech/models/vqgan/lit_module.py

@@ -1,5 +1,6 @@
 import itertools
-from typing import Any, Callable, Literal
+from dataclasses import dataclass
+from typing import Any, Callable, Literal, Optional
 
 import lightning as L
 import torch
@@ -8,19 +9,17 @@ import wandb
 from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
 from matplotlib import pyplot as plt
 from torch import nn
-from vector_quantize_pytorch import VectorQuantize
 
 from fish_speech.models.vqgan.losses import (
+    MultiResolutionSTFTLoss,
     discriminator_loss,
     feature_loss,
     generator_loss,
-    kl_loss,
 )
+from fish_speech.models.vqgan.modules.balancer import Balancer
 from fish_speech.models.vqgan.modules.decoder import Generator
-from fish_speech.models.vqgan.modules.discriminator import EnsembleDiscriminator
 from fish_speech.models.vqgan.modules.encoders import (
     ConvDownSampler,
-    SpeakerEncoder,
     TextEncoder,
     VQEncoder,
 )
@@ -32,6 +31,21 @@ from fish_speech.models.vqgan.utils import (
 )
 
 
+@dataclass
+class VQEncodeResult:
+    features: torch.Tensor
+    indices: torch.Tensor
+    loss: torch.Tensor
+    feature_lengths: torch.Tensor
+
+
+@dataclass
+class VQDecodeResult:
+    audios: torch.Tensor
+    mels: torch.Tensor
+    mel_lengths: torch.Tensor
+
+
 class VQGAN(L.LightningModule):
     def __init__(
         self,
@@ -42,18 +56,18 @@ class VQGAN(L.LightningModule):
         mel_encoder: TextEncoder,
         decoder: TextEncoder,
         generator: Generator,
-        discriminator: EnsembleDiscriminator,
+        discriminators: nn.ModuleDict,
         mel_transform: nn.Module,
         segment_size: int = 20480,
         hop_length: int = 640,
         sample_rate: int = 32000,
-        mode: Literal["pretrain-stage1", "pretrain-stage2", "finetune"] = "finetune",
-        speaker_encoder: SpeakerEncoder = None,
+        mode: Literal["pretrain", "finetune"] = "finetune",
+        freeze_discriminator: bool = False,
+        multi_resolution_stft_loss: Optional[MultiResolutionSTFTLoss] = None,
     ):
         super().__init__()
 
-        # pretrain-stage1: vq use gt mel as target, hifigan use gt mel as input
-        # pretrain-stage2: end-to-end training, use gt mel as hifi gan target
+        # pretrain: vq use gt mel as target, hifigan use gt mel as input
         # finetune: end-to-end training, use gt mel as hifi gan target but freeze vq
 
         # Model parameters
@@ -64,11 +78,11 @@ class VQGAN(L.LightningModule):
         self.downsample = downsample
         self.vq_encoder = vq_encoder
         self.mel_encoder = mel_encoder
-        self.speaker_encoder = speaker_encoder
         self.decoder = decoder
         self.generator = generator
-        self.discriminator = discriminator
+        self.discriminators = discriminators
         self.mel_transform = mel_transform
+        self.freeze_discriminator = freeze_discriminator
 
         # Crop length for saving memory
         self.segment_size = segment_size
@@ -90,20 +104,30 @@ class VQGAN(L.LightningModule):
             for p in self.downsample.parameters():
                 p.requires_grad = False
 
+        if self.freeze_discriminator:
+            for p in self.discriminators.parameters():
+                p.requires_grad = False
+
+        # Losses
+        self.multi_resolution_stft_loss = multi_resolution_stft_loss
+        loss_dict = {
+            "mel": 1,
+            "adv": 1,
+            "fm": 1,
+        }
+
+        if self.multi_resolution_stft_loss is not None:
+            loss_dict["stft"] = 1
+
+        self.balancer = Balancer(loss_dict)
+
     def configure_optimizers(self):
         # Need two optimizers and two schedulers
-        components = []
-        if self.mode != "finetune":
-            components.extend(
-                [
-                    self.downsample.parameters(),
-                    self.vq_encoder.parameters(),
-                    self.mel_encoder.parameters(),
-                ]
-            )
-
-        if self.speaker_encoder is not None:
-            components.append(self.speaker_encoder.parameters())
+        components = [
+            self.downsample.parameters(),
+            self.vq_encoder.parameters(),
+            self.mel_encoder.parameters(),
+        ]
 
         if self.decoder is not None:
             components.append(self.decoder.parameters())
@@ -111,7 +135,7 @@ class VQGAN(L.LightningModule):
         components.append(self.generator.parameters())
         optimizer_generator = self.optimizer_builder(itertools.chain(*components))
         optimizer_discriminator = self.optimizer_builder(
-            self.discriminator.parameters()
+            self.discriminators.parameters()
         )
 
         lr_scheduler_generator = self.lr_scheduler_builder(optimizer_generator)
@@ -145,9 +169,7 @@ class VQGAN(L.LightningModule):
         audios = audios[:, None, :]
 
         with torch.no_grad():
-            features = gt_mels = self.mel_transform(
-                audios, sample_rate=self.sampling_rate
-            )
+            gt_mels = self.mel_transform(audios, sample_rate=self.sampling_rate)
 
         if self.mode == "finetune":
             # Disable gradient computation for VQ
@@ -156,29 +178,13 @@ class VQGAN(L.LightningModule):
             self.mel_encoder.eval()
             self.downsample.eval()
 
-        if self.downsample is not None:
-            features = self.downsample(features)
-
         mel_lengths = audio_lengths // self.hop_length
-        feature_lengths = (
-            audio_lengths
-            / self.hop_length
-            / (self.downsample.total_strides if self.downsample is not None else 1)
-        ).long()
-
-        feature_masks = torch.unsqueeze(
-            sequence_mask(feature_lengths, features.shape[2]), 1
-        ).to(gt_mels.dtype)
         mel_masks = torch.unsqueeze(sequence_mask(mel_lengths, gt_mels.shape[2]), 1).to(
             gt_mels.dtype
         )
 
-        # vq_features is 50 hz, need to convert to true mel size
-        text_features = self.mel_encoder(features, feature_masks)
-        text_features, _, loss_vq = self.vq_encoder(text_features, feature_masks)
-        text_features = F.interpolate(
-            text_features, size=gt_mels.shape[2], mode="nearest"
-        )
+        vq_result = self.encode(audios, audio_lengths)
+        loss_vq = vq_result.loss
 
         if loss_vq.ndim > 1:
             loss_vq = loss_vq.mean()
@@ -187,18 +193,15 @@ class VQGAN(L.LightningModule):
             # Enable gradient computation
             torch.set_grad_enabled(True)
 
-        # Sample mels
-        if self.decoder is not None:
-            speaker_features = (
-                self.speaker_encoder(gt_mels, mel_masks)
-                if self.speaker_encoder is not None
-                else None
-            )
-            decoded_mels = self.decoder(text_features, mel_masks, g=speaker_features)
-        else:
-            decoded_mels = text_features
+        decoded = self.decode(
+            indices=vq_result.indices if self.mode == "finetune" else None,
+            features=vq_result.features if self.mode == "pretrain" else None,
+            audio_lengths=audio_lengths,
+            mel_only=True,
+        )
+        decoded_mels = decoded.mels
+        input_mels = gt_mels if self.mode == "pretrain" else decoded_mels
 
-        input_mels = gt_mels if self.mode == "pretrain-stage1" else decoded_mels
         if self.segment_size is not None:
             audios, ids_slice = rand_slice_segments(
                 audios, audio_lengths, self.segment_size
@@ -228,75 +231,145 @@ class VQGAN(L.LightningModule):
             audios.shape == fake_audios.shape
         ), f"{audios.shape} != {fake_audios.shape}"
 
+        # Multi-Resolution STFT Loss
+        if self.multi_resolution_stft_loss is not None:
+            with torch.autocast(device_type=audios.device.type, enabled=False):
+                sc_loss, mag_loss = self.multi_resolution_stft_loss(
+                    fake_audios.squeeze(1).float(), audios.squeeze(1).float()
+                )
+                loss_stft = sc_loss + mag_loss
+
         # Discriminator
-        y_d_hat_r, y_d_hat_g, _, _ = self.discriminator(audios, fake_audios.detach())
+        if self.freeze_discriminator is False:
+            loss_disc_all = []
+
+            for key, disc in self.discriminators.items():
+                scores, _ = disc(audios)
+                score_fakes, _ = disc(fake_audios.detach())
+
+                with torch.autocast(device_type=audios.device.type, enabled=False):
+                    loss_disc, _, _ = discriminator_loss(scores, score_fakes)
+
+                self.log(
+                    f"train/discriminator/{key}",
+                    loss_disc,
+                    on_step=True,
+                    on_epoch=False,
+                    prog_bar=False,
+                    logger=True,
+                    sync_dist=True,
+                )
 
-        with torch.autocast(device_type=audios.device.type, enabled=False):
-            loss_disc_all, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g)
+                loss_disc_all.append(loss_disc)
 
-        self.log(
-            "train/discriminator/loss",
-            loss_disc_all,
-            on_step=True,
-            on_epoch=False,
-            prog_bar=True,
-            logger=True,
-            sync_dist=True,
-        )
+            loss_disc_all = torch.stack(loss_disc_all).mean()
 
-        optim_d.zero_grad()
-        self.manual_backward(loss_disc_all)
-        self.clip_gradients(
-            optim_d, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
-        )
-        optim_d.step()
+            self.log(
+                "train/discriminator/loss",
+                loss_disc_all,
+                on_step=True,
+                on_epoch=False,
+                prog_bar=True,
+                logger=True,
+                sync_dist=True,
+            )
+
+            optim_d.zero_grad()
+            self.manual_backward(loss_disc_all)
+            self.clip_gradients(
+                optim_d, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
+            )
+            optim_d.step()
+
+        # Adv Loss
+        loss_adv_all = []
+        loss_fm_all = []
+
+        for key, disc in self.discriminators.items():
+            score_fakes, feat_fake = disc(fake_audios)
+
+            # Adversarial Loss
+            with torch.autocast(device_type=audios.device.type, enabled=False):
+                loss_fake, _ = generator_loss(score_fakes)
+
+            self.log(
+                f"train/generator/adv_{key}",
+                loss_fake,
+                on_step=True,
+                on_epoch=False,
+                prog_bar=False,
+                logger=True,
+                sync_dist=True,
+            )
+
+            loss_adv_all.append(loss_fake)
+
+            # Feature Matching Loss
+            _, feat_real = disc(audios)
+
+            with torch.autocast(device_type=audios.device.type, enabled=False):
+                loss_fm = feature_loss(feat_real, feat_fake)
 
-        y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = self.discriminator(audios, fake_audios)
+            self.log(
+                f"train/generator/adv_fm_{key}",
+                loss_fm,
+                on_step=True,
+                on_epoch=False,
+                prog_bar=False,
+                logger=True,
+                sync_dist=True,
+            )
+
+            loss_fm_all.append(loss_fm)
+
+        loss_adv_all = torch.stack(loss_adv_all).mean()
+        loss_fm_all = torch.stack(loss_fm_all).mean()
 
         with torch.autocast(device_type=audios.device.type, enabled=False):
             loss_decoded_mel = F.l1_loss(gt_mels * mel_masks, decoded_mels * mel_masks)
             loss_mel = F.l1_loss(
                 sliced_gt_mels * gen_mel_masks, fake_audio_mels * gen_mel_masks
             )
-            loss_adv, _ = generator_loss(y_d_hat_g)
-            loss_fm = feature_loss(fmap_r, fmap_g)
 
-            if self.mode == "pretrain-stage1":
+            loss_dict = {
+                "mel": loss_mel,
+                "adv": loss_adv_all,
+                "fm": loss_fm_all,
+            }
+
+            if self.multi_resolution_stft_loss is not None:
+                loss_dict["stft"] = loss_stft
+
+            generator_out_grad = self.balancer.compute(
+                loss_dict,
+                fake_audios,
+            )
+
+            if self.mode == "pretrain":
                 loss_vq_all = loss_decoded_mel + loss_vq
-                loss_gen_all = loss_mel * 45 + loss_fm + loss_adv
-            else:
-                loss_gen_all = loss_mel * 45 + loss_vq * 45 + loss_fm + loss_adv
 
-        self.log(
-            "train/generator/loss_gen_all",
-            loss_gen_all,
-            on_step=True,
-            on_epoch=False,
-            prog_bar=True,
-            logger=True,
-            sync_dist=True,
-        )
+        # Loss vq and loss decoded mel are only used in pretrain stage
+        if self.mode == "pretrain":
+            self.log(
+                "train/generator/loss_vq",
+                loss_vq,
+                on_step=True,
+                on_epoch=False,
+                prog_bar=False,
+                logger=True,
+                sync_dist=True,
+            )
 
-        if self.mode == "pretrain-stage1":
             self.log(
-                "train/generator/loss_vq_all",
-                loss_vq_all,
+                "train/generator/loss_decoded_mel",
+                loss_decoded_mel,
                 on_step=True,
                 on_epoch=False,
-                prog_bar=True,
+                prog_bar=False,
                 logger=True,
                 sync_dist=True,
             )
 
-        self.log(
-            "train/generator/loss_decoded_mel",
-            loss_decoded_mel,
-            on_step=True,
-            on_epoch=False,
-            prog_bar=False,
-            logger=True,
-            sync_dist=True,
-        )
         self.log(
             "train/generator/loss_mel",
             loss_mel,
@@ -306,18 +379,21 @@ class VQGAN(L.LightningModule):
             logger=True,
             sync_dist=True,
         )
+
+        if self.multi_resolution_stft_loss is not None:
+            self.log(
+                "train/generator/loss_stft",
+                loss_stft,
+                on_step=True,
+                on_epoch=False,
+                prog_bar=False,
+                logger=True,
+                sync_dist=True,
+            )
+
         self.log(
-            "train/generator/loss_fm",
-            loss_fm,
-            on_step=True,
-            on_epoch=False,
-            prog_bar=False,
-            logger=True,
-            sync_dist=True,
-        )
-        self.log(
-            "train/generator/loss_adv",
-            loss_adv,
+            "train/generator/loss_fm_all",
+            loss_fm_all,
             on_step=True,
             on_epoch=False,
             prog_bar=False,
@@ -325,8 +401,8 @@ class VQGAN(L.LightningModule):
             sync_dist=True,
         )
         self.log(
-            "train/generator/loss_vq",
-            loss_vq,
+            "train/generator/loss_adv_all",
+            loss_adv_all,
             on_step=True,
             on_epoch=False,
             prog_bar=False,
@@ -336,11 +412,11 @@ class VQGAN(L.LightningModule):
 
         optim_g.zero_grad()
 
-        # Only backpropagate loss_vq_all in pretrain-stage1
-        if self.mode == "pretrain-stage1":
-            self.manual_backward(loss_vq_all)
+        # Only backpropagate loss_vq_all in pretrain stage
+        if self.mode == "pretrain":
+            self.manual_backward(loss_vq_all, retain_graph=True)
 
-        self.manual_backward(loss_gen_all)
+        self.manual_backward(fake_audios, gradient=generator_out_grad)
         self.clip_gradients(
             optim_g, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
         )
@@ -357,44 +433,26 @@ class VQGAN(L.LightningModule):
         audios = audios.float()
         audios = audios[:, None, :]
 
-        features = gt_mels = self.mel_transform(audios, sample_rate=self.sampling_rate)
-
-        if self.downsample is not None:
-            features = self.downsample(features)
-
+        gt_mels = self.mel_transform(audios, sample_rate=self.sampling_rate)
         mel_lengths = audio_lengths // self.hop_length
-        feature_lengths = (
-            audio_lengths
-            / self.hop_length
-            / (self.downsample.total_strides if self.downsample is not None else 1)
-        ).long()
-
-        feature_masks = torch.unsqueeze(
-            sequence_mask(feature_lengths, features.shape[2]), 1
-        ).to(gt_mels.dtype)
         mel_masks = torch.unsqueeze(sequence_mask(mel_lengths, gt_mels.shape[2]), 1).to(
             gt_mels.dtype
         )
 
-        # vq_features is 50 hz, need to convert to true mel size
-        text_features = self.mel_encoder(features, feature_masks)
-        text_features, _, _ = self.vq_encoder(text_features, feature_masks)
-        text_features = F.interpolate(
-            text_features, size=gt_mels.shape[2], mode="nearest"
+        vq_result = self.encode(audios, audio_lengths)
+        decoded = self.decode(
+            indices=vq_result.indices,
+            audio_lengths=audio_lengths,
+            mel_only=self.mode == "pretrain",
         )
 
-        # Sample mels
-        if self.decoder is not None:
-            speaker_features = (
-                self.speaker_encoder(gt_mels, mel_masks)
-                if self.speaker_encoder is not None
-                else None
-            )
-            decoded_mels = self.decoder(text_features, mel_masks, g=speaker_features)
-        else:
-            decoded_mels = text_features
+        decoded_mels = decoded.mels
 
-        fake_audios = self.generator(decoded_mels)
+        # Use gt mel as input for pretrain
+        if self.mode == "pretrain":
+            fake_audios = self.generator(gt_mels)
+        else:
+            fake_audios = decoded.audios
 
         fake_mels = self.mel_transform(fake_audios.squeeze(1))
 
@@ -487,3 +545,92 @@ class VQGAN(L.LightningModule):
                 )
 
             plt.close(image_mels)
+
+    def encode(self, audios, audio_lengths=None):
+        if audio_lengths is None:
+            audio_lengths = torch.tensor(
+                [audios.shape[-1]] * audios.shape[0],
+                device=audios.device,
+                dtype=torch.long,
+            )
+
+        with torch.no_grad():
+            features = self.mel_transform(audios, sample_rate=self.sampling_rate)
+
+        if self.downsample is not None:
+            features = self.downsample(features)
+
+        feature_lengths = (
+            audio_lengths
+            / self.hop_length
+            / (self.downsample.total_strides if self.downsample is not None else 1)
+        ).long()
+
+        feature_masks = torch.unsqueeze(
+            sequence_mask(feature_lengths, features.shape[2]), 1
+        ).to(features.dtype)
+
+        text_features = self.mel_encoder(features, feature_masks)
+        vq_features, indices, loss = self.vq_encoder(text_features, feature_masks)
+
+        return VQEncodeResult(
+            features=vq_features,
+            indices=indices,
+            loss=loss,
+            feature_lengths=feature_lengths,
+        )
+
+    def calculate_audio_lengths(self, feature_lengths):
+        return (
+            feature_lengths
+            * self.hop_length
+            * (self.downsample.total_strides if self.downsample is not None else 1)
+        )
+
+    def decode(
+        self,
+        indices=None,
+        features=None,
+        audio_lengths=None,
+        mel_only=False,
+        feature_lengths=None,
+    ):
+        assert (
+            indices is not None or features is not None
+        ), "indices or features must be provided"
+        assert (
+            feature_lengths is not None or audio_lengths is not None
+        ), "feature_lengths or audio_lengths must be provided"
+
+        if audio_lengths is None:
+            audio_lengths = self.calculate_audio_lengths(feature_lengths)
+
+        mel_lengths = audio_lengths // self.hop_length
+        mel_masks = torch.unsqueeze(
+            sequence_mask(mel_lengths, torch.max(mel_lengths)), 1
+        ).float()
+
+        if indices is not None:
+            features = self.vq_encoder.decode(indices)
+
+        features = F.interpolate(features, size=mel_masks.shape[2], mode="nearest")
+
+        # Sample mels
+        if self.decoder is not None:
+            decoded_mels = self.decoder(features, mel_masks)
+        else:
+            decoded_mels = features
+
+        if mel_only:
+            return VQDecodeResult(
+                audios=None,
+                mels=decoded_mels,
+                mel_lengths=mel_lengths,
+            )
+
+        fake_audios = self.generator(decoded_mels)
+        return VQDecodeResult(
+            audios=fake_audios,
+            mels=decoded_mels,
+            mel_lengths=mel_lengths,
+        )

+ 135 - 5
fish_speech/models/vqgan/losses.py

@@ -1,9 +1,9 @@
-from typing import List
-
 import torch
+import torch.nn.functional as F
+from torch import nn
 
 
-def feature_loss(fmap_r: List[torch.Tensor], fmap_g: List[torch.Tensor]):
+def feature_loss(fmap_r: list[torch.Tensor], fmap_g: list[torch.Tensor]):
     loss = 0
     for dr, dg in zip(fmap_r, fmap_g):
         for rl, gl in zip(dr, dg):
@@ -15,7 +15,7 @@ def feature_loss(fmap_r: List[torch.Tensor], fmap_g: List[torch.Tensor]):
 
 
 def discriminator_loss(
-    disc_real_outputs: List[torch.Tensor], disc_generated_outputs: List[torch.Tensor]
+    disc_real_outputs: list[torch.Tensor], disc_generated_outputs: list[torch.Tensor]
 ):
     loss = 0
     r_losses = []
@@ -32,7 +32,7 @@ def discriminator_loss(
     return loss, r_losses, g_losses
 
 
-def generator_loss(disc_outputs: List[torch.Tensor]):
+def generator_loss(disc_outputs: list[torch.Tensor]):
     loss = 0
     gen_losses = []
     for dg in disc_outputs:
@@ -66,3 +66,133 @@ def kl_loss(
     kl = torch.sum(kl * z_mask)
     l = kl / torch.sum(z_mask)
     return l
+
+
+def stft(x, fft_size, hop_size, win_length, window):
+    """Perform STFT and convert to magnitude spectrogram.
+    Args:
+        x (Tensor): Input signal tensor (B, T).
+        fft_size (int): FFT size.
+        hop_size (int): Hop size.
+        win_length (int): Window length.
+        window (str): Window function type.
+    Returns:
+        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
+    """
+    spec = torch.stft(
+        x,
+        fft_size,
+        hop_size,
+        win_length,
+        window,
+        return_complex=True,
+        pad_mode="reflect",
+    )
+    spec = torch.view_as_real(spec)
+
+    # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
+    return torch.sqrt(torch.clamp(spec.pow(2).sum(-1), min=1e-6)).transpose(2, 1)
+
+
+class SpectralConvergengeLoss(nn.Module):
+    """Spectral convergence loss module."""
+
+    def __init__(self):
+        """Initialize spectral convergence loss module."""
+        super(SpectralConvergengeLoss, self).__init__()
+
+    def forward(self, x_mag, y_mag):
+        """Calculate forward propagation.
+        Args:
+            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+        Returns:
+            Tensor: Spectral convergence loss value.
+        """  # noqa: E501
+
+        return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro")
+
+
+class LogSTFTMagnitudeLoss(nn.Module):
+    """Log STFT magnitude loss module."""
+
+    def __init__(self):
+        """Initialize log STFT magnitude loss module."""
+        super(LogSTFTMagnitudeLoss, self).__init__()
+
+    def forward(self, x_mag, y_mag):
+        """Calculate forward propagation.
+        Args:
+            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+        Returns:
+            Tensor: Log STFT magnitude loss value.
+        """  # noqa: E501
+
+        return F.l1_loss(torch.log(y_mag), torch.log(x_mag))
+
+
+class STFTLoss(nn.Module):
+    """STFT loss module."""
+
+    def __init__(
+        self, fft_size=1024, shift_size=120, win_length=600, window=torch.hann_window
+    ):
+        """Initialize STFT loss module."""
+        super(STFTLoss, self).__init__()
+
+        self.fft_size = fft_size
+        self.shift_size = shift_size
+        self.win_length = win_length
+        self.register_buffer("window", window(win_length))
+        self.spectral_convergenge_loss = SpectralConvergengeLoss()
+        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
+
+    def forward(self, x, y):
+        """Calculate forward propagation.
+        Args:
+            x (Tensor): Predicted signal (B, T).
+            y (Tensor): Groundtruth signal (B, T).
+        Returns:
+            Tensor: Spectral convergence loss value.
+            Tensor: Log STFT magnitude loss value.
+        """
+
+        x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window)
+        y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window)
+        sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
+        mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
+
+        return sc_loss, mag_loss
+
+
+class MultiResolutionSTFTLoss(nn.Module):
+    """Multi resolution STFT loss module."""
+
+    def __init__(self, resolutions, window=torch.hann_window):
+        super(MultiResolutionSTFTLoss, self).__init__()
+
+        self.stft_losses = nn.ModuleList()
+        for fs, ss, wl in resolutions:
+            self.stft_losses += [STFTLoss(fs, ss, wl, window)]
+
+    def forward(self, x, y):
+        """Calculate forward propagation.
+        Args:
+            x (Tensor): Predicted signal (B, T).
+            y (Tensor): Groundtruth signal (B, T).
+        Returns:
+            Tensor: Multi resolution spectral convergence loss value.
+            Tensor: Multi resolution log STFT magnitude loss value.
+        """
+        sc_loss = 0.0
+        mag_loss = 0.0
+        for f in self.stft_losses:
+            sc_l, mag_l = f(x, y)
+            sc_loss += sc_l
+            mag_loss += mag_l
+
+        sc_loss /= len(self.stft_losses)
+        mag_loss /= len(self.stft_losses)
+
+        return sc_loss, mag_loss

+ 193 - 0
fish_speech/models/vqgan/modules/balancer.py

@@ -0,0 +1,193 @@
+import typing as tp
+from collections import defaultdict
+
+import torch
+from torch import autograd
+
+
+def rank():
+    if torch.distributed.is_initialized():
+        return torch.distributed.get_rank()
+    else:
+        return 0
+
+
+def world_size():
+    if torch.distributed.is_initialized():
+        return torch.distributed.get_world_size()
+    else:
+        return 1
+
+
+def is_distributed():
+    return world_size() > 1
+
+
+def average_metrics(metrics: tp.Dict[str, float], count=1.0):
+    """Average a dictionary of metrics across all workers, using the optional
+    `count` as unnormalized weight.
+    """
+    if not is_distributed():
+        return metrics
+    keys, values = zip(*metrics.items())
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    tensor = torch.tensor(list(values) + [1], device=device, dtype=torch.float32)
+    tensor *= count
+    all_reduce(tensor)
+    averaged = (tensor[:-1] / tensor[-1]).cpu().tolist()
+    return dict(zip(keys, averaged))
+
+
+def all_reduce(tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM):
+    if is_distributed():
+        return torch.distributed.all_reduce(tensor, op)
+
+
+def averager(beta: float = 1):
+    """
+    Exponential Moving Average callback.
+    Returns a single function that can be called to repeatedly update the EMA
+    with a dict of metrics. The callback will return
+    the new averaged dict of metrics.
+
+    Note that for `beta=1`, this is just plain averaging.
+    """
+    fix: tp.Dict[str, float] = defaultdict(float)
+    total: tp.Dict[str, float] = defaultdict(float)
+
+    def _update(
+        metrics: tp.Dict[str, tp.Any], weight: float = 1
+    ) -> tp.Dict[str, float]:
+        nonlocal total, fix
+        for key, value in metrics.items():
+            total[key] = total[key] * beta + weight * float(value)
+            fix[key] = fix[key] * beta + weight
+        return {key: tot / fix[key] for key, tot in total.items()}
+
+    return _update
+
+
+class Balancer:
+    """Loss balancer.
+
+    The loss balancer combines losses together to compute gradients for the backward.
+    A call to the balancer will weight the losses according the specified weight coefficients.
+    A call to the backward method of the balancer will compute the gradients, combining all the losses and
+    potentially rescaling the gradients, which can help stabilize the training and reason
+    about multiple losses with varying scales.
+
+    Expected usage:
+        weights = {'loss_a': 1, 'loss_b': 4}
+        balancer = Balancer(weights, ...)
+        losses: dict = {}
+        losses['loss_a'] = compute_loss_a(x, y)
+        losses['loss_b'] = compute_loss_b(x, y)
+        if model.training():
+            balancer.backward(losses, x)
+
+    ..Warning:: It is unclear how this will interact with DistributedDataParallel,
+        in particular if you have some losses not handled by the balancer. In that case
+        you can use `encodec.distrib.sync_grad(model.parameters())` and
+        `encodec.distrib.sync_buffers(model.buffers())` as a safe alternative.
+
+    Args:
+        weights (Dict[str, float]): Weight coefficient for each loss. The balancer expect the losses keys
+            from the backward method to match the weights keys to assign weight to each of the provided loss.
+        rescale_grads (bool): Whether to rescale gradients or not. If False, this is just
+            a regular weighted sum of losses.
+        total_norm (float): Reference norm when rescaling gradients, ignored otherwise.
+        ema_decay (float): EMA decay for averaging the norms when `rescale_grads` is True.
+        per_batch_item (bool): Whether to compute the averaged norm per batch item or not. This only holds
+            when rescaling the gradients.
+        epsilon (float): Epsilon value for numerical stability.
+        monitor (bool): Whether to store additional ratio for each loss key in metrics.
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        weights: tp.Dict[str, float],
+        rescale_grads: bool = True,
+        total_norm: float = 1.0,
+        ema_decay: float = 0.999,
+        per_batch_item: bool = True,
+        epsilon: float = 1e-12,
+        monitor: bool = False,
+    ):
+        self.weights = weights
+        self.per_batch_item = per_batch_item
+        self.total_norm = total_norm
+        self.averager = averager(ema_decay)
+        self.epsilon = epsilon
+        self.monitor = monitor
+        self.rescale_grads = rescale_grads
+        self._metrics: tp.Dict[str, tp.Any] = {}
+
+    @property
+    def metrics(self):
+        return self._metrics
+
+    def compute(self, losses: tp.Dict[str, torch.Tensor], input: torch.Tensor):
+        norms = {}
+        grads = {}
+        for name, loss in losses.items():
+            (grad,) = autograd.grad(loss, [input], retain_graph=True)
+            if self.per_batch_item:
+                dims = tuple(range(1, grad.dim()))
+                norm = grad.norm(dim=dims).mean()
+            else:
+                norm = grad.norm()
+            norms[name] = norm
+            grads[name] = grad
+
+        count = 1
+        if self.per_batch_item:
+            count = len(grad)
+        avg_norms = average_metrics(self.averager(norms), count)
+        total = sum(avg_norms.values())
+
+        self._metrics = {}
+        if self.monitor:
+            for k, v in avg_norms.items():
+                self._metrics[f"ratio_{k}"] = v / total
+
+        total_weights = sum([self.weights[k] for k in avg_norms])
+        ratios = {k: w / total_weights for k, w in self.weights.items()}
+
+        out_grad: tp.Any = 0
+        for name, avg_norm in avg_norms.items():
+            if self.rescale_grads:
+                scale = ratios[name] * self.total_norm / (self.epsilon + avg_norm)
+                grad = grads[name] * scale
+            else:
+                grad = self.weights[name] * grads[name]
+            out_grad += grad
+
+        return out_grad
+
+
+def test():
+    from torch.nn import functional as F
+
+    x = torch.zeros(1, requires_grad=True)
+    one = torch.ones_like(x)
+    loss_1 = F.l1_loss(x, one)
+    loss_2 = 100 * F.l1_loss(x, -one)
+    losses = {"1": loss_1, "2": loss_2}
+
+    balancer = Balancer(weights={"1": 1, "2": 1}, rescale_grads=False)
+    out_grad = balancer.compute(losses, x)
+    x.backward(out_grad)
+    assert torch.allclose(x.grad, torch.tensor(99.0)), x.grad
+
+    loss_1 = F.l1_loss(x, one)
+    loss_2 = 100 * F.l1_loss(x, -one)
+    losses = {"1": loss_1, "2": loss_2}
+    x.grad = None
+    balancer = Balancer(weights={"1": 1, "2": 1}, rescale_grads=True)
+    out_grad = balancer.compute({"1": loss_1, "2": loss_2}, x)
+    x.backward(out_grad)
+    assert torch.allclose(x.grad, torch.tensor(0.0)), x.grad
+
+
+if __name__ == "__main__":
+    test()

+ 270 - 0
fish_speech/models/vqgan/modules/decoder_v2.py

@@ -0,0 +1,270 @@
+from functools import partial
+from math import prod
+from typing import Callable
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Conv1d
+from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils.parametrize import remove_parametrizations
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    return (kernel_size * dilation - dilation) // 2
+
+
+class ResBlock(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super().__init__()
+
+        self.convs1 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[2],
+                        padding=get_padding(kernel_size, dilation[2]),
+                    )
+                ),
+            ]
+        )
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+            ]
+        )
+        self.convs2.apply(init_weights)
+
+    def forward(self, x):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.silu(x)
+            xt = c1(xt)
+            xt = F.silu(xt)
+            xt = c2(xt)
+            x = xt + x
+        return x
+
+    def remove_parametrizations(self):
+        for conv in self.convs1:
+            remove_parametrizations(conv)
+        for conv in self.convs2:
+            remove_parametrizations(conv)
+
+
+class ParralelBlock(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        kernel_sizes: tuple[int] = (3, 7, 11),
+        dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
+    ):
+        super().__init__()
+
+        assert len(kernel_sizes) == len(dilation_sizes)
+
+        self.blocks = nn.ModuleList()
+        for k, d in zip(kernel_sizes, dilation_sizes):
+            self.blocks.append(ResBlock(channels, k, d))
+
+    def forward(self, x):
+        xs = [block(x) for block in self.blocks]
+
+        return torch.stack(xs, dim=0).mean(dim=0)
+
+
+class HiFiGANGenerator(nn.Module):
+    def __init__(
+        self,
+        *,
+        hop_length: int = 512,
+        upsample_rates: tuple[int] = (8, 8, 2, 2, 2),
+        upsample_kernel_sizes: tuple[int] = (16, 16, 8, 2, 2),
+        resblock_kernel_sizes: tuple[int] = (3, 7, 11),
+        resblock_dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
+        num_mels: int = 128,
+        upsample_initial_channel: int = 512,
+        use_template: bool = True,
+        pre_conv_kernel_size: int = 7,
+        post_conv_kernel_size: int = 7,
+        post_activation: Callable = partial(nn.SiLU, inplace=True),
+        checkpointing: bool = False,
+    ):
+        super().__init__()
+
+        assert (
+            prod(upsample_rates) == hop_length
+        ), f"hop_length must be {prod(upsample_rates)}"
+
+        self.conv_pre = weight_norm(
+            nn.Conv1d(
+                num_mels,
+                upsample_initial_channel,
+                pre_conv_kernel_size,
+                1,
+                padding=get_padding(pre_conv_kernel_size),
+            )
+        )
+
+        self.hop_length = hop_length
+        self.num_upsamples = len(upsample_rates)
+        self.num_kernels = len(resblock_kernel_sizes)
+
+        self.noise_convs = nn.ModuleList()
+        self.use_template = use_template
+        self.ups = nn.ModuleList()
+
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            c_cur = upsample_initial_channel // (2 ** (i + 1))
+            self.ups.append(
+                weight_norm(
+                    nn.ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+
+            if not use_template:
+                continue
+
+            if i + 1 < len(upsample_rates):
+                stride_f0 = np.prod(upsample_rates[i + 1 :])
+                self.noise_convs.append(
+                    Conv1d(
+                        1,
+                        c_cur,
+                        kernel_size=stride_f0 * 2,
+                        stride=stride_f0,
+                        padding=stride_f0 // 2,
+                    )
+                )
+            else:
+                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            self.resblocks.append(
+                ParralelBlock(ch, resblock_kernel_sizes, resblock_dilation_sizes)
+            )
+
+        self.activation_post = post_activation()
+        self.conv_post = weight_norm(
+            nn.Conv1d(
+                ch,
+                1,
+                post_conv_kernel_size,
+                1,
+                padding=get_padding(post_conv_kernel_size),
+            )
+        )
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+        # Gradient checkpointing
+        self.checkpointing = checkpointing
+
+    def forward(self, x, template=None):
+        if self.use_template and template is None:
+            length = x.shape[-1] * self.hop_length
+            template = (
+                torch.randn(x.shape[0], 1, length, device=x.device, dtype=x.dtype)
+                * 0.003
+            )
+
+        x = self.conv_pre(x)
+
+        for i in range(self.num_upsamples):
+            x = F.silu(x, inplace=True)
+            x = self.ups[i](x)
+
+            if self.use_template:
+                x = x + self.noise_convs[i](template)
+
+            if self.training and self.checkpointing:
+                x = torch.utils.checkpoint.checkpoint(
+                    self.resblocks[i],
+                    x,
+                    use_reentrant=False,
+                )
+            else:
+                x = self.resblocks[i](x)
+
+        x = self.activation_post(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+
+        return x
+
+    def remove_parametrizations(self):
+        for up in self.ups:
+            remove_parametrizations(up)
+        for block in self.resblocks:
+            block.remove_parametrizations()
+        remove_parametrizations(self.conv_pre)
+        remove_parametrizations(self.conv_post)

+ 0 - 166
fish_speech/models/vqgan/modules/discriminator.py

@@ -1,166 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn.utils import spectral_norm, weight_norm
-
-from fish_speech.models.vqgan.modules.modules import LRELU_SLOPE
-from fish_speech.models.vqgan.utils import get_padding
-
-
-class DiscriminatorP(nn.Module):
-    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-        super(DiscriminatorP, self).__init__()
-        self.period = period
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(
-                    nn.Conv2d(
-                        1,
-                        32,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    nn.Conv2d(
-                        32,
-                        128,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    nn.Conv2d(
-                        128,
-                        512,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    nn.Conv2d(
-                        512,
-                        1024,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    nn.Conv2d(
-                        1024,
-                        1024,
-                        (kernel_size, 1),
-                        1,
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-            ]
-        )
-        self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-    def forward(self, x):
-        fmap = []
-
-        # 1d to 2d
-        b, c, t = x.shape
-        if t % self.period != 0:  # pad first
-            n_pad = self.period - (t % self.period)
-            x = F.pad(x, (0, n_pad), "reflect")
-            t = t + n_pad
-        x = x.view(b, c, t // self.period, self.period)
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class DiscriminatorS(nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(nn.Conv1d(1, 128, 15, 1, padding=7)),
-                norm_f(nn.Conv1d(128, 128, 41, 2, groups=4, padding=20)),
-                norm_f(nn.Conv1d(128, 256, 41, 2, groups=16, padding=20)),
-                norm_f(nn.Conv1d(256, 512, 41, 4, groups=16, padding=20)),
-                norm_f(nn.Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
-                norm_f(nn.Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
-                norm_f(nn.Conv1d(1024, 1024, 5, 1, padding=2)),
-            ]
-        )
-        self.conv_post = norm_f(nn.Conv1d(1024, 1, 3, 1, padding=1))
-
-    def forward(self, x):
-        fmap = []
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class EnsembleDiscriminator(nn.Module):
-    def __init__(self, ckpt_path=None, periods=(2, 3, 5, 7, 11)):
-        super(EnsembleDiscriminator, self).__init__()
-
-        discs = [DiscriminatorS(use_spectral_norm=True)]
-        discs = discs + [DiscriminatorP(i, use_spectral_norm=False) for i in periods]
-        self.discriminators = nn.ModuleList(discs)
-
-        if ckpt_path is not None:
-            self.restore_from_ckpt(ckpt_path)
-
-    def restore_from_ckpt(self, ckpt_path):
-        ckpt = torch.load(ckpt_path, map_location="cpu")
-        mpd, msd = ckpt["mpd"], ckpt["msd"]
-
-        all_keys = {}
-        for k, v in mpd.items():
-            keys = k.split(".")
-            keys[1] = str(int(keys[1]) + 1)
-            all_keys[".".join(keys)] = v
-
-        for k, v in msd.items():
-            if not k.startswith("discriminators.0"):
-                continue
-            all_keys[k] = v
-
-        self.load_state_dict(all_keys, strict=True)
-
-    def forward(self, y, y_hat):
-        y_d_rs = []
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            y_d_rs.append(y_d_r)
-            y_d_gs.append(y_d_g)
-            fmap_rs.append(fmap_r)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-if __name__ == "__main__":
-    m = EnsembleDiscriminator(
-        ckpt_path="checkpoints/hifigan-v1-universal-22050/do_02500000"
-    )

+ 80 - 0
fish_speech/models/vqgan/modules/discriminators/mpd.py

@@ -0,0 +1,80 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils.parametrizations import weight_norm
+
+
+class DiscriminatorP(nn.Module):
+    def __init__(
+        self,
+        *,
+        period: int,
+        kernel_size: int = 5,
+        stride: int = 3,
+        channels: tuple[int] = (1, 64, 128, 256, 512, 1024),
+    ) -> None:
+        super(DiscriminatorP, self).__init__()
+
+        self.period = period
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(
+                    nn.Conv2d(
+                        in_channels,
+                        out_channels,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(kernel_size // 2, 0),
+                    )
+                )
+                for in_channels, out_channels in zip(channels[:-1], channels[1:])
+            ]
+        )
+
+        self.conv_post = weight_norm(
+            nn.Conv2d(channels[-1], 1, (3, 1), 1, padding=(1, 0))
+        )
+
+    def forward(self, x):
+        fmap = []
+
+        # 1d to 2d
+        b, c, t = x.shape
+        if t % self.period != 0:  # pad first
+            n_pad = self.period - (t % self.period)
+            x = F.pad(x, (0, n_pad), "constant")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+
+        for conv in self.convs:
+            x = conv(x)
+            x = F.silu(x, inplace=True)
+            fmap.append(x)
+
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+
+        return x, fmap
+
+
+class MultiPeriodDiscriminator(nn.Module):
+    def __init__(self, periods: tuple[int] = (2, 3, 5, 7, 11)) -> None:
+        super().__init__()
+
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorP(period=period) for period in periods]
+        )
+
+    def forward(
+        self, x: torch.Tensor
+    ) -> tuple[list[torch.Tensor], list[list[torch.Tensor]]]:
+        scores, feature_map = [], []
+
+        for disc in self.discriminators:
+            res, fmap = disc(x)
+
+            scores.append(res)
+            feature_map.append(fmap)
+
+        return scores, feature_map

+ 100 - 0
fish_speech/models/vqgan/modules/discriminators/mrd.py

@@ -0,0 +1,100 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils.parametrizations import weight_norm
+
+
+class DiscriminatorR(torch.nn.Module):
+    def __init__(
+        self,
+        *,
+        n_fft: int = 1024,
+        hop_length: int = 120,
+        win_length: int = 600,
+    ):
+        super(DiscriminatorR, self).__init__()
+
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(nn.Conv2d(1, 32, (3, 9), padding=(1, 4))),
+                weight_norm(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(32, 32, (3, 3), padding=(1, 1))),
+            ]
+        )
+
+        self.conv_post = weight_norm(nn.Conv2d(32, 1, (3, 3), padding=(1, 1)))
+
+    def forward(self, x):
+        fmap = []
+
+        x = self.spectrogram(x)
+        x = x.unsqueeze(1)
+
+        for conv in self.convs:
+            x = conv(x)
+            x = F.silu(x, inplace=True)
+            fmap.append(x)
+
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+
+        return x, fmap
+
+    def spectrogram(self, x):
+        x = F.pad(
+            x,
+            (
+                (self.n_fft - self.hop_length) // 2,
+                (self.n_fft - self.hop_length + 1) // 2,
+            ),
+            mode="reflect",
+        )
+        x = x.squeeze(1)
+        x = torch.stft(
+            x,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            center=False,
+            return_complex=True,
+        )
+        x = torch.view_as_real(x)  # [B, F, TT, 2]
+        mag = torch.norm(x, p=2, dim=-1)  # [B, F, TT]
+
+        return mag
+
+
+class MultiResolutionDiscriminator(torch.nn.Module):
+    def __init__(self, resolutions: list[tuple[int]]):
+        super().__init__()
+
+        self.discriminators = nn.ModuleList(
+            [
+                DiscriminatorR(
+                    n_fft=n_fft,
+                    hop_length=hop_length,
+                    win_length=win_length,
+                )
+                for n_fft, hop_length, win_length in resolutions
+            ]
+        )
+
+    def forward(
+        self, x: torch.Tensor
+    ) -> tuple[list[torch.Tensor], list[list[torch.Tensor]]]:
+        scores, feature_map = [], []
+
+        for disc in self.discriminators:
+            res, fmap = disc(x)
+
+            scores.append(res)
+            feature_map.append(fmap)
+
+        return scores, feature_map

+ 188 - 0
fish_speech/models/vqgan/modules/discriminators/mssbcqtd.py

@@ -0,0 +1,188 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Monkey patching to fix a bug in nnAudio
+import numpy as np
+import torch
+import torchaudio.transforms as T
+from einops import rearrange
+from nnAudio import features
+from torch import nn
+
+from .msstftd import NormConv2d, get_2d_padding
+
+np.float = float
+
+LRELU_SLOPE = 0.1
+
+
+class DiscriminatorCQT(nn.Module):
+    def __init__(
+        self,
+        hop_length,
+        n_octaves,
+        bins_per_octave,
+        filters=32,
+        max_filters=1024,
+        filters_scale=1,
+        dilations=[1, 2, 4],
+        in_channels=1,
+        out_channels=1,
+        sample_rate=16000,
+    ):
+        super().__init__()
+
+        self.filters = filters
+        self.max_filters = max_filters
+        self.filters_scale = filters_scale
+        self.kernel_size = (3, 9)
+        self.dilations = dilations
+        self.stride = (1, 2)
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.fs = sample_rate
+        self.hop_length = hop_length
+        self.n_octaves = n_octaves
+        self.bins_per_octave = bins_per_octave
+
+        self.cqt_transform = features.cqt.CQT2010v2(
+            sr=self.fs * 2,
+            hop_length=self.hop_length,
+            n_bins=self.bins_per_octave * self.n_octaves,
+            bins_per_octave=self.bins_per_octave,
+            output_format="Complex",
+            pad_mode="constant",
+        )
+
+        self.conv_pres = nn.ModuleList()
+        for i in range(self.n_octaves):
+            self.conv_pres.append(
+                NormConv2d(
+                    self.in_channels * 2,
+                    self.in_channels * 2,
+                    kernel_size=self.kernel_size,
+                    padding=get_2d_padding(self.kernel_size),
+                )
+            )
+
+        self.convs = nn.ModuleList()
+
+        self.convs.append(
+            NormConv2d(
+                self.in_channels * 2,
+                self.filters,
+                kernel_size=self.kernel_size,
+                padding=get_2d_padding(self.kernel_size),
+            )
+        )
+
+        in_chs = min(self.filters_scale * self.filters, self.max_filters)
+        for i, dilation in enumerate(self.dilations):
+            out_chs = min(
+                (self.filters_scale ** (i + 1)) * self.filters, self.max_filters
+            )
+            self.convs.append(
+                NormConv2d(
+                    in_chs,
+                    out_chs,
+                    kernel_size=self.kernel_size,
+                    stride=self.stride,
+                    dilation=(dilation, 1),
+                    padding=get_2d_padding(self.kernel_size, (dilation, 1)),
+                    norm="weight_norm",
+                )
+            )
+            in_chs = out_chs
+        out_chs = min(
+            (self.filters_scale ** (len(self.dilations) + 1)) * self.filters,
+            self.max_filters,
+        )
+        self.convs.append(
+            NormConv2d(
+                in_chs,
+                out_chs,
+                kernel_size=(self.kernel_size[0], self.kernel_size[0]),
+                padding=get_2d_padding((self.kernel_size[0], self.kernel_size[0])),
+                norm="weight_norm",
+            )
+        )
+
+        self.conv_post = NormConv2d(
+            out_chs,
+            self.out_channels,
+            kernel_size=(self.kernel_size[0], self.kernel_size[0]),
+            padding=get_2d_padding((self.kernel_size[0], self.kernel_size[0])),
+            norm="weight_norm",
+        )
+
+        self.activation = torch.nn.LeakyReLU(negative_slope=LRELU_SLOPE)
+        self.resample = T.Resample(orig_freq=self.fs, new_freq=self.fs * 2)
+
+    def forward(self, x):
+        fmap = []
+
+        x = self.resample(x)
+
+        z = self.cqt_transform(x)
+
+        z_amplitude = z[:, :, :, 0].unsqueeze(1)
+        z_phase = z[:, :, :, 1].unsqueeze(1)
+
+        z = torch.cat([z_amplitude, z_phase], dim=1)
+        z = rearrange(z, "b c w t -> b c t w")
+
+        latent_z = []
+        for i in range(self.n_octaves):
+            latent_z.append(
+                self.conv_pres[i](
+                    z[
+                        :,
+                        :,
+                        :,
+                        i * self.bins_per_octave : (i + 1) * self.bins_per_octave,
+                    ]
+                )
+            )
+        latent_z = torch.cat(latent_z, dim=-1)
+
+        for i, layer in enumerate(self.convs):
+            latent_z = layer(latent_z)
+
+            latent_z = self.activation(latent_z)
+            fmap.append(latent_z)
+
+        latent_z = self.conv_post(latent_z)
+
+        return latent_z, fmap
+
+
+class MultiScaleSubbandCQTDiscriminator(nn.Module):
+    def __init__(self, hop_lengths, n_octaves, bins_per_octaves, **kwargs):
+        super().__init__()
+
+        self.discriminators = nn.ModuleList(
+            [
+                DiscriminatorCQT(
+                    hop_length=hop_length,
+                    n_octaves=n_octaves,
+                    bins_per_octave=bins_per_octave,
+                    **kwargs,
+                )
+                for hop_length, n_octaves, bins_per_octave in zip(
+                    hop_lengths, n_octaves, bins_per_octaves
+                )
+            ]
+        )
+
+    def forward(self, x: torch.Tensor):
+        logits = []
+        fmaps = []
+        for disc in self.discriminators:
+            logit, fmap = disc(x)
+            logits.append(logit)
+            fmaps.append(fmap)
+
+        return logits, fmaps

+ 303 - 0
fish_speech/models/vqgan/modules/discriminators/msstftd.py

@@ -0,0 +1,303 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""MS-STFT discriminator, provided here for reference."""
+
+import typing as tp
+
+import einops
+import torch
+import torchaudio
+from einops import rearrange
+from torch import nn
+from torch.nn.utils import spectral_norm, weight_norm
+
+FeatureMapType = tp.List[torch.Tensor]
+LogitsType = torch.Tensor
+DiscriminatorOutput = tp.Tuple[tp.List[LogitsType], tp.List[FeatureMapType]]
+
+
+class ConvLayerNorm(nn.LayerNorm):
+    """
+    Convolution-friendly LayerNorm that moves channels to last dimensions
+    before running the normalization and moves them back to original position right after.
+    """  # noqa: E501
+
+    def __init__(
+        self, normalized_shape: tp.Union[int, tp.List[int], torch.Size], **kwargs
+    ):
+        super().__init__(normalized_shape, **kwargs)
+
+    def forward(self, x):
+        x = einops.rearrange(x, "b ... t -> b t ...")
+        x = super().forward(x)
+        x = einops.rearrange(x, "b t ... -> b ... t")
+        return x
+
+
+CONV_NORMALIZATIONS = frozenset(
+    [
+        "none",
+        "weight_norm",
+        "spectral_norm",
+        "time_layer_norm",
+        "layer_norm",
+        "time_group_norm",
+    ]
+)
+
+
+def apply_parametrization_norm(module: nn.Module, norm: str = "none") -> nn.Module:
+    assert norm in CONV_NORMALIZATIONS
+    if norm == "weight_norm":
+        return weight_norm(module)
+    elif norm == "spectral_norm":
+        return spectral_norm(module)
+    else:
+        # We already checked that `norm` is in CONV_NORMALIZATIONS, so any
+        # other choice doesn't need reparametrization.
+        return module
+
+
+def get_norm_module(
+    module: nn.Module, causal: bool = False, norm: str = "none", **norm_kwargs
+) -> nn.Module:
+    """Return the proper normalization module. If causal is True, this will ensure the returned
+    module is causal, or return an error if the normalization doesn't support causal evaluation.
+    """  # noqa: E501
+    assert norm in CONV_NORMALIZATIONS
+    if norm == "layer_norm":
+        assert isinstance(module, nn.modules.conv._ConvNd)
+        return ConvLayerNorm(module.out_channels, **norm_kwargs)
+    elif norm == "time_group_norm":
+        if causal:
+            raise ValueError("GroupNorm doesn't support causal evaluation.")
+        assert isinstance(module, nn.modules.conv._ConvNd)
+        return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
+    else:
+        return nn.Identity()
+
+
+class NormConv2d(nn.Module):
+    """Wrapper around Conv2d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    """
+
+    def __init__(
+        self,
+        *args,
+        norm: str = "none",
+        norm_kwargs: tp.Dict[str, tp.Any] = {},
+        **kwargs,
+    ):
+        super().__init__()
+        self.conv = apply_parametrization_norm(nn.Conv2d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs)
+        self.norm_type = norm
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.norm(x)
+        return x
+
+
+def get_2d_padding(
+    kernel_size: tp.Tuple[int, int], dilation: tp.Tuple[int, int] = (1, 1)
+):
+    return (
+        ((kernel_size[0] - 1) * dilation[0]) // 2,
+        ((kernel_size[1] - 1) * dilation[1]) // 2,
+    )
+
+
+class DiscriminatorSTFT(nn.Module):
+    """STFT sub-discriminator.
+    Args:
+        filters (int): Number of filters in convolutions
+        in_channels (int): Number of input channels. Default: 1
+        out_channels (int): Number of output channels. Default: 1
+        n_fft (int): Size of FFT for each scale. Default: 1024
+        hop_length (int): Length of hop between STFT windows for each scale. Default: 256
+        kernel_size (tuple of int): Inner Conv2d kernel sizes. Default: ``(3, 9)``
+        stride (tuple of int): Inner Conv2d strides. Default: ``(1, 2)``
+        dilations (list of int): Inner Conv2d dilation on the time dimension. Default: ``[1, 2, 4]``
+        win_length (int): Window size for each scale. Default: 1024
+        normalized (bool): Whether to normalize by magnitude after stft. Default: True
+        norm (str): Normalization method. Default: `'weight_norm'`
+        activation (str): Activation function. Default: `'LeakyReLU'`
+        activation_params (dict): Parameters to provide to the activation function.
+        growth (int): Growth factor for the filters. Default: 1
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        filters: int,
+        in_channels: int = 1,
+        out_channels: int = 1,
+        n_fft: int = 1024,
+        hop_length: int = 256,
+        win_length: int = 1024,
+        max_filters: int = 1024,
+        filters_scale: int = 1,
+        kernel_size: tp.Tuple[int, int] = (3, 9),
+        dilations: tp.List = [1, 2, 4],
+        stride: tp.Tuple[int, int] = (1, 2),
+        normalized: bool = True,
+        norm: str = "weight_norm",
+        activation: str = "LeakyReLU",
+        activation_params: dict = {"negative_slope": 0.2},
+    ):
+        super().__init__()
+        assert len(kernel_size) == 2
+        assert len(stride) == 2
+        self.filters = filters
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.normalized = normalized
+        self.activation = getattr(torch.nn, activation)(**activation_params)
+        self.spec_transform = torchaudio.transforms.Spectrogram(
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            window_fn=torch.hann_window,
+            normalized=self.normalized,
+            center=False,
+            pad_mode=None,
+            power=None,
+        )
+        spec_channels = 2 * self.in_channels
+        self.convs = nn.ModuleList()
+        self.convs.append(
+            NormConv2d(
+                spec_channels,
+                self.filters,
+                kernel_size=kernel_size,
+                padding=get_2d_padding(kernel_size),
+            )
+        )
+        in_chs = min(filters_scale * self.filters, max_filters)
+        for i, dilation in enumerate(dilations):
+            out_chs = min((filters_scale ** (i + 1)) * self.filters, max_filters)
+            self.convs.append(
+                NormConv2d(
+                    in_chs,
+                    out_chs,
+                    kernel_size=kernel_size,
+                    stride=stride,
+                    dilation=(dilation, 1),
+                    padding=get_2d_padding(kernel_size, (dilation, 1)),
+                    norm=norm,
+                )
+            )
+            in_chs = out_chs
+        out_chs = min(
+            (filters_scale ** (len(dilations) + 1)) * self.filters, max_filters
+        )
+        self.convs.append(
+            NormConv2d(
+                in_chs,
+                out_chs,
+                kernel_size=(kernel_size[0], kernel_size[0]),
+                padding=get_2d_padding((kernel_size[0], kernel_size[0])),
+                norm=norm,
+            )
+        )
+        self.conv_post = NormConv2d(
+            out_chs,
+            self.out_channels,
+            kernel_size=(kernel_size[0], kernel_size[0]),
+            padding=get_2d_padding((kernel_size[0], kernel_size[0])),
+            norm=norm,
+        )
+
+    def forward(self, x: torch.Tensor):
+        fmap = []
+        z = self.spec_transform(x)  # complex tensor [B, C, Freq, Frames] since power=None
+        z = torch.cat([z.real, z.imag], dim=1)
+        z = rearrange(z, "b c w t -> b c t w")
+        for i, layer in enumerate(self.convs):
+            z = layer(z)
+            z = self.activation(z)
+            fmap.append(z)
+        z = self.conv_post(z)
+        return z, fmap
+
+
+class MultiScaleSTFTDiscriminator(nn.Module):
+    """Multi-Scale STFT (MS-STFT) discriminator.
+    Args:
+        filters (int): Number of filters in convolutions
+        in_channels (int): Number of input channels. Default: 1
+        out_channels (int): Number of output channels. Default: 1
+        n_ffts (Sequence[int]): Size of FFT for each scale
+        hop_lengths (Sequence[int]): Length of hop between STFT windows for each scale
+        win_lengths (Sequence[int]): Window size for each scale
+        **kwargs: additional args for STFTDiscriminator
+    """
+
+    def __init__(
+        self,
+        filters: int,
+        in_channels: int = 1,
+        out_channels: int = 1,
+        n_ffts: tp.List[int] = [1024, 2048, 512],
+        hop_lengths: tp.List[int] = [256, 512, 128],
+        win_lengths: tp.List[int] = [1024, 2048, 512],
+        **kwargs,
+    ):
+        super().__init__()
+        assert len(n_ffts) == len(hop_lengths) == len(win_lengths)
+        self.discriminators = nn.ModuleList(
+            [
+                DiscriminatorSTFT(
+                    filters,
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    n_fft=n_ffts[i],
+                    win_length=win_lengths[i],
+                    hop_length=hop_lengths[i],
+                    **kwargs,
+                )
+                for i in range(len(n_ffts))
+            ]
+        )
+        self.num_discriminators = len(self.discriminators)
+
+    def forward(self, x: torch.Tensor) -> DiscriminatorOutput:
+        logits = []
+        fmaps = []
+        for disc in self.discriminators:
+            logit, fmap = disc(x)
+            logits.append(logit)
+            fmaps.append(fmap)
+
+        return logits, fmaps
+
+
+def test():
+    disc = MultiScaleSTFTDiscriminator(filters=32)
+    y = torch.randn(1, 1, 24000)
+    y_hat = torch.randn(1, 1, 24000)
+
+    y_disc_r, fmap_r = disc(y)
+    y_disc_gen, fmap_gen = disc(y_hat)
+    assert (
+        len(y_disc_r)
+        == len(y_disc_gen)
+        == len(fmap_r)
+        == len(fmap_gen)
+        == disc.num_discriminators
+    )
+    assert all([len(fm) == 5 for fm in fmap_r + fmap_gen])
+    assert all([list(f.shape)[:2] == [1, 32] for fm in fmap_r + fmap_gen for f in fm])
+    assert all([len(logits.shape) == 4 for logits in y_disc_r + y_disc_gen])
+
+
+if __name__ == "__main__":
+    test()

+ 63 - 4
fish_speech/models/vqgan/modules/encoders.py

@@ -5,6 +5,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from einops import rearrange
 from vector_quantize_pytorch import LFQ, GroupedResidualVQ, VectorQuantize
 
 from fish_speech.models.vqgan.modules.modules import WN
@@ -298,6 +299,7 @@ class VQEncoder(nn.Module):
             )
 
         self.codebook_groups = codebook_groups
+        self.codebook_layers = codebook_layers
         self.downsample = downsample
         self.conv_in = nn.Conv1d(
             in_channels, vq_channels, kernel_size=downsample, stride=downsample
@@ -309,6 +311,17 @@ class VQEncoder(nn.Module):
             nn.Conv1d(vq_channels, in_channels, kernel_size=1, stride=1),
         )
 
+    @property
+    def mode(self):
+        if self.codebook_groups > 1 and self.codebook_layers > 1:
+            return "grouped-residual"
+        elif self.codebook_groups > 1:
+            return "grouped"
+        elif self.codebook_layers > 1:
+            return "residual"
+        else:
+            return "single"
+
     def forward(self, x, x_mask):
         # x: [B, C, T], x_mask: [B, 1, T]
         x_len = x.shape[2]
@@ -327,15 +340,61 @@ class VQEncoder(nn.Module):
         x = self.conv_out(q) * x_mask
         x = x[:, :, :x_len]
 
+        # Post process indices
+        if self.mode == "grouped-residual":
+            indices = rearrange(indices, "g b t r -> b (g r) t")
+        elif self.mode == "grouped":
+            indices = rearrange(indices, "g b t 1 -> b g t")
+        elif self.mode == "residual":
+            indices = rearrange(indices, "1 b t r -> b r t")
+        else:
+            indices = rearrange(indices, "b t -> b 1 t")
+
         return x, indices, loss
 
     def decode(self, indices):
+        # Undo rearrange
+        if self.mode == "grouped-residual":
+            indices = rearrange(indices, "b (g r) t -> g b t r", g=self.codebook_groups)
+        elif self.mode == "grouped":
+            indices = rearrange(indices, "b g t -> g b t 1")
+        elif self.mode == "residual":
+            indices = rearrange(indices, "b r t -> 1 b t r")
+        else:
+            indices = rearrange(indices, "b 1 t -> b t")
+
         q = self.vq.get_output_from_indices(indices)
 
-        if q.shape[1] != indices.shape[1] and indices.ndim != 4:
-            q = q.view(q.shape[0], indices.shape[1], -1)
-        q = q.mT
+        # Edge case for single vq
+        if self.mode == "single":
+            q = rearrange(q, "b (t c) -> b t c", t=indices.shape[-1])
 
-        x = self.conv_out(q)
+        x = self.conv_out(q.mT)
 
         return x
+
+
+if __name__ == "__main__":
+    # Test VQEncoder
+    for group, layer in [
+        (1, 1),
+        (1, 2),
+        (2, 1),
+        (2, 2),
+        (4, 1),
+        (4, 2),
+    ]:
+        encoder = VQEncoder(
+            in_channels=1024,
+            vq_channels=1024,
+            codebook_size=2048,
+            downsample=1,
+            codebook_groups=group,
+            codebook_layers=layer,
+            threshold_ema_dead_code=2,
+        )
+        x = torch.randn(2, 1024, 100)
+        x_mask = torch.ones(2, 1, 100)
+        x, indices, loss = encoder(x, x_mask)
+        x = encoder.decode(indices)
+        assert x.shape == (2, 1024, 100)

+ 1 - 0
pyproject.toml

@@ -34,6 +34,7 @@ dependencies = [
     "zibai-server>=0.9.0",
     "loguru>=0.6.0",
     "WeTextProcessing>=0.1.10",
+    "nnAudio>=0.3.2",
     "loralib>=0.1.2",
     "natsort>=8.4.0",
     "cn2an>=0.5.22"

+ 5 - 57
tools/api_server.py

@@ -138,36 +138,11 @@ class VQGANModel:
     def sematic_to_wav(self, indices):
         model = self.model
         indices = indices.to(model.device).long()
-        indices = indices.unsqueeze(1).unsqueeze(-1)
-
-        mel_lengths = indices.shape[2] * (
-            model.downsample.total_strides if model.downsample is not None else 1
-        )
-        mel_lengths = torch.tensor([mel_lengths], device=model.device, dtype=torch.long)
-        mel_masks = torch.ones(
-            (1, 1, mel_lengths), device=model.device, dtype=torch.float32
-        )
-
-        text_features = model.vq_encoder.decode(indices)
-
-        logger.info(
-            f"VQ Encoded, indices: {indices.shape} equivalent to "
-            + f"{1 / (mel_lengths[0] * model.hop_length / model.sampling_rate / indices.shape[2]):.2f} Hz"
-        )
-
-        text_features = F.interpolate(
-            text_features, size=mel_lengths[0], mode="nearest"
-        )
-
-        # Sample mels
-        decoded_mels = model.decoder(text_features, mel_masks)
-        fake_audios = model.generator(decoded_mels)
-        logger.info(
-            f"Generated audio of shape {fake_audios.shape}, equivalent to {fake_audios.shape[-1] / model.sampling_rate:.2f} seconds"
-        )
+        feature_lengths = torch.tensor([indices.shape[1]], device=model.device)
+        decoded = model.decode(indices=indices[None], feature_lengths=feature_lengths)
 
         # Save audio
-        fake_audio = fake_audios[0, 0].cpu().numpy().astype(np.float32)
+        fake_audio = decoded.audios[0, 0].cpu().numpy().astype(np.float32)
 
         return fake_audio, model.sampling_rate
 
@@ -189,37 +164,10 @@ class VQGANModel:
         audio_lengths = torch.tensor(
             [audios.shape[2]], device=model.device, dtype=torch.long
         )
-
-        features = gt_mels = model.mel_transform(
-            audios, sample_rate=model.sampling_rate
-        )
-
-        if model.downsample is not None:
-            features = model.downsample(features)
-
-        mel_lengths = audio_lengths // model.hop_length
-        feature_lengths = (
-            audio_lengths
-            / model.hop_length
-            / (model.downsample.total_strides if model.downsample is not None else 1)
-        ).long()
-
-        feature_masks = torch.unsqueeze(
-            sequence_mask(feature_lengths, features.shape[2]), 1
-        ).to(gt_mels.dtype)
-
-        # vq_features is 50 hz, need to convert to true mel size
-        text_features = model.mel_encoder(features, feature_masks)
-        _, indices, _ = model.vq_encoder(text_features, feature_masks)
-
-        if indices.ndim == 4 and indices.shape[1] == 1 and indices.shape[3] == 1:
-            indices = indices[:, 0, :, 0]
-        else:
-            logger.error(f"Unknown indices shape: {indices.shape}")
-            return
+        encoded = model.encode(audios, audio_lengths)
+        indices = encoded.indices[0]
 
         logger.info(f"Generated indices of shape {indices.shape}")
-
         return indices
 
 

+ 5 - 33
tools/vqgan/extract_vq.py

@@ -90,43 +90,15 @@ def process_batch(files: list[Path], model) -> float:
 
     # Calculate lengths
     with torch.no_grad():
-        # VQ Encoder
-        features = gt_mels = model.mel_transform(
-            audios, sample_rate=model.sampling_rate
-        )
-
-        if model.downsample is not None:
-            features = model.downsample(features)
-
-        feature_lengths = (
-            audio_lengths
-            / model.hop_length
-            / (model.downsample.total_strides if model.downsample is not None else 1)
-        ).long()
-
-        feature_masks = torch.unsqueeze(
-            sequence_mask(feature_lengths, features.shape[2]), 1
-        ).to(gt_mels.dtype)
-
-        text_features = model.mel_encoder(features, feature_masks)
-        _, indices, _ = model.vq_encoder(text_features, feature_masks)
-
-        if indices.ndim == 4:
-            # Grouped vq
-            assert indices.shape[-1] == 1, f"Residual vq is not supported"
-            indices = indices.squeeze(-1)
-        elif indices.ndim == 2:
-            # Single vq
-            indices = indices.unsqueeze(0)
-        else:
-            raise ValueError(f"Invalid indices shape {indices.shape}")
-
-        indices = rearrange(indices, "c b t -> b c t")
+        out = model.encode(audios, audio_lengths)
+        indices, feature_lengths = out.indices, out.feature_lengths
 
     # Save to disk
     outputs = indices.cpu().numpy()
 
-    for file, length, feature, audio in zip(files, feature_lengths, outputs, audios):
+    for file, length, feature, audio_length in zip(
+        files, feature_lengths, outputs, audio_lengths
+    ):
         feature = feature[:, :length]
 
         # (T,)

+ 7 - 52
tools/vqgan/inference.py

@@ -67,37 +67,8 @@ def main(input_path, output_path, config_name, checkpoint_path):
         audio_lengths = torch.tensor(
             [audios.shape[2]], device=model.device, dtype=torch.long
         )
-
-        features = gt_mels = model.mel_transform(
-            audios, sample_rate=model.sampling_rate
-        )
-
-        if model.downsample is not None:
-            features = model.downsample(features)
-
-        mel_lengths = audio_lengths // model.hop_length
-        feature_lengths = (
-            audio_lengths
-            / model.hop_length
-            / (model.downsample.total_strides if model.downsample is not None else 1)
-        ).long()
-
-        feature_masks = torch.unsqueeze(
-            sequence_mask(feature_lengths, features.shape[2]), 1
-        ).to(gt_mels.dtype)
-        mel_masks = torch.unsqueeze(sequence_mask(mel_lengths, gt_mels.shape[2]), 1).to(
-            gt_mels.dtype
-        )
-
-        # vq_features is 50 hz, need to convert to true mel size
-        text_features = model.mel_encoder(features, feature_masks)
-        _, indices, _ = model.vq_encoder(text_features, feature_masks)
-
-        if indices.ndim == 4 and indices.shape[1] == 1 and indices.shape[3] == 1:
-            indices = indices[:, 0, :, 0]
-        else:
-            logger.error(f"Unknown indices shape: {indices.shape}")
-            return
+        encoded = model.encode(audios, audio_lengths)
+        indices = encoded.indices[0]
 
         logger.info(f"Generated indices of shape {indices.shape}")
 
@@ -112,29 +83,13 @@ def main(input_path, output_path, config_name, checkpoint_path):
         raise ValueError(f"Unknown input type: {input_path}")
 
     # Restore
-    indices = indices.unsqueeze(1).unsqueeze(-1)
-    mel_lengths = indices.shape[2] * (
-        model.downsample.total_strides if model.downsample is not None else 1
-    )
-    mel_lengths = torch.tensor([mel_lengths], device=model.device, dtype=torch.long)
-    mel_masks = torch.ones(
-        (1, 1, mel_lengths), device=model.device, dtype=torch.float32
-    )
-
-    text_features = model.vq_encoder.decode(indices)
-
-    logger.info(
-        f"VQ Encoded, indices: {indices.shape} equivalent to "
-        + f"{1/(mel_lengths[0] * model.hop_length / model.sampling_rate / indices.shape[2]):.2f} Hz"
-    )
-
-    text_features = F.interpolate(text_features, size=mel_lengths[0], mode="nearest")
+    feature_lengths = torch.tensor([indices.shape[1]], device=model.device)
+    decoded = model.decode(indices=indices[None], feature_lengths=feature_lengths)
+    fake_audios = decoded.audios
+    audio_time = fake_audios.shape[-1] / model.sampling_rate
 
-    # Sample mels
-    decoded_mels = model.decoder(text_features, mel_masks)
-    fake_audios = model.generator(decoded_mels)
     logger.info(
-        f"Generated audio of shape {fake_audios.shape}, equivalent to {fake_audios.shape[-1] / model.sampling_rate:.2f} seconds"
+        f"Generated audio of shape {fake_audios.shape}, equivalent to {audio_time:.2f} seconds from {indices.shape[1]} features, features/second: {indices.shape[1] / audio_time:.2f}"
     )
 
     # Save audio