1 год назад · aad7ba4942
--- a/fish_speech/configs/firefly_gan_vq.yaml
+++ b/fish_speech/configs/firefly_gan_vq.yaml
@@ -22,13 +22,12 @@ head:
 
				   resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
			
 
				   num_mels: 512
			
 
				   upsample_initial_channel: 512
			
 
				-  use_template: false
			
 
				   pre_conv_kernel_size: 13
			
 
				   post_conv_kernel_size: 13
			
 
				 quantizer:
			
 
				   _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
			
 
				   input_dim: 512
			
 
				-  n_groups: 4
			
 
				+  n_groups: 8
			
 
				   n_codebooks: 1
			
 
				   levels: [8, 5, 5, 5]
			
 
				-  downsample_factor: [2]
			
 
				+  downsample_factor: [2, 2]
			
--- a/fish_speech/models/vqgan/__init__.py
+++ b/fish_speech/models/vqgan/__init__.py
@@ -1,3 +0,0 @@
 
				-from .lit_module import VQGAN
			
 
				-
			
 
				-__all__ = ["VQGAN"]
			
--- a/fish_speech/models/vqgan/lit_module.py
+++ b/fish_speech/models/vqgan/lit_module.py
@@ -1,442 +0,0 @@
 
				-import itertools
			
 
				-import math
			
 
				-from typing import Any, Callable
			
 
				-
			
 
				-import lightning as L
			
 
				-import torch
			
 
				-import torch.nn.functional as F
			
 
				-import wandb
			
 
				-from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
			
 
				-from matplotlib import pyplot as plt
			
 
				-from torch import nn
			
 
				-
			
 
				-from fish_speech.models.vqgan.modules.discriminator import Discriminator
			
 
				-from fish_speech.models.vqgan.modules.wavenet import WaveNet
			
 
				-from fish_speech.models.vqgan.utils import avg_with_mask, plot_mel, sequence_mask
			
 
				-
			
 
				-
			
 
				-class VQGAN(L.LightningModule):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        optimizer: Callable,
			
 
				-        lr_scheduler: Callable,
			
 
				-        encoder: WaveNet,
			
 
				-        quantizer: nn.Module,
			
 
				-        decoder: WaveNet,
			
 
				-        discriminator: Discriminator,
			
 
				-        vocoder: nn.Module,
			
 
				-        encode_mel_transform: nn.Module,
			
 
				-        gt_mel_transform: nn.Module,
			
 
				-        weight_adv: float = 1.0,
			
 
				-        weight_vq: float = 1.0,
			
 
				-        weight_mel: float = 1.0,
			
 
				-        sampling_rate: int = 44100,
			
 
				-        freeze_encoder: bool = False,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-
			
 
				-        # Model parameters
			
 
				-        self.optimizer_builder = optimizer
			
 
				-        self.lr_scheduler_builder = lr_scheduler
			
 
				-
			
 
				-        # Modules
			
 
				-        self.encoder = encoder
			
 
				-        self.quantizer = quantizer
			
 
				-        self.decoder = decoder
			
 
				-        self.vocoder = vocoder
			
 
				-        self.discriminator = discriminator
			
 
				-        self.encode_mel_transform = encode_mel_transform
			
 
				-        self.gt_mel_transform = gt_mel_transform
			
 
				-
			
 
				-        # A simple linear layer to project quality to condition channels
			
 
				-        self.quality_projection = nn.Linear(1, 768)
			
 
				-
			
 
				-        # Freeze vocoder
			
 
				-        for param in self.vocoder.parameters():
			
 
				-            param.requires_grad = False
			
 
				-
			
 
				-        # Loss weights
			
 
				-        self.weight_adv = weight_adv
			
 
				-        self.weight_vq = weight_vq
			
 
				-        self.weight_mel = weight_mel
			
 
				-
			
 
				-        # Other parameters
			
 
				-        self.sampling_rate = sampling_rate
			
 
				-
			
 
				-        # Disable strict loading
			
 
				-        self.strict_loading = False
			
 
				-
			
 
				-        # If encoder is frozen
			
 
				-        if freeze_encoder:
			
 
				-            for param in self.encoder.parameters():
			
 
				-                param.requires_grad = False
			
 
				-
			
 
				-            for param in self.quantizer.parameters():
			
 
				-                param.requires_grad = False
			
 
				-
			
 
				-        self.automatic_optimization = False
			
 
				-
			
 
				-    def on_save_checkpoint(self, checkpoint):
			
 
				-        # Do not save vocoder
			
 
				-        state_dict = checkpoint["state_dict"]
			
 
				-        for name in list(state_dict.keys()):
			
 
				-            if "vocoder" in name:
			
 
				-                state_dict.pop(name)
			
 
				-
			
 
				-    def configure_optimizers(self):
			
 
				-        optimizer_generator = self.optimizer_builder(
			
 
				-            itertools.chain(
			
 
				-                self.encoder.parameters(),
			
 
				-                self.quantizer.parameters(),
			
 
				-                self.decoder.parameters(),
			
 
				-                self.quality_projection.parameters(),
			
 
				-            )
			
 
				-        )
			
 
				-        optimizer_discriminator = self.optimizer_builder(
			
 
				-            self.discriminator.parameters()
			
 
				-        )
			
 
				-
			
 
				-        lr_scheduler_generator = self.lr_scheduler_builder(optimizer_generator)
			
 
				-        lr_scheduler_discriminator = self.lr_scheduler_builder(optimizer_discriminator)
			
 
				-
			
 
				-        return (
			
 
				-            {
			
 
				-                "optimizer": optimizer_generator,
			
 
				-                "lr_scheduler": {
			
 
				-                    "scheduler": lr_scheduler_generator,
			
 
				-                    "interval": "step",
			
 
				-                    "name": "optimizer/generator",
			
 
				-                },
			
 
				-            },
			
 
				-            {
			
 
				-                "optimizer": optimizer_discriminator,
			
 
				-                "lr_scheduler": {
			
 
				-                    "scheduler": lr_scheduler_discriminator,
			
 
				-                    "interval": "step",
			
 
				-                    "name": "optimizer/discriminator",
			
 
				-                },
			
 
				-            },
			
 
				-        )
			
 
				-
			
 
				-    def training_step(self, batch, batch_idx):
			
 
				-        optim_g, optim_d = self.optimizers()
			
 
				-
			
 
				-        audios, audio_lengths = batch["audios"], batch["audio_lengths"]
			
 
				-
			
 
				-        audios = audios.float()
			
 
				-        audios = audios[:, None, :]
			
 
				-
			
 
				-        with torch.no_grad():
			
 
				-            encoded_mels = self.encode_mel_transform(audios)
			
 
				-            gt_mels = self.gt_mel_transform(audios)
			
 
				-            quality = ((gt_mels.mean(-1) > -8).sum(-1) - 90) / 10
			
 
				-            quality = quality.unsqueeze(-1)
			
 
				-
			
 
				-        mel_lengths = audio_lengths // self.gt_mel_transform.hop_length
			
 
				-        mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2])
			
 
				-        mel_masks_float_conv = mel_masks[:, None, :].float()
			
 
				-        gt_mels = gt_mels * mel_masks_float_conv
			
 
				-        encoded_mels = encoded_mels * mel_masks_float_conv
			
 
				-
			
 
				-        # Encode
			
 
				-        encoded_features = self.encoder(encoded_mels) * mel_masks_float_conv
			
 
				-
			
 
				-        # Quantize
			
 
				-        vq_result = self.quantizer(encoded_features)
			
 
				-        loss_vq = getattr("vq_result", "loss", 0.0)
			
 
				-        vq_recon_features = vq_result.z * mel_masks_float_conv
			
 
				-        vq_recon_features = (
			
 
				-            vq_recon_features + self.quality_projection(quality)[:, :, None]
			
 
				-        )
			
 
				-
			
 
				-        # VQ Decode
			
 
				-        gen_mel = (
			
 
				-            self.decoder(
			
 
				-                torch.randn_like(vq_recon_features) * mel_masks_float_conv,
			
 
				-                condition=vq_recon_features,
			
 
				-            )
			
 
				-            * mel_masks_float_conv
			
 
				-        )
			
 
				-
			
 
				-        # Discriminator
			
 
				-        real_logits = self.discriminator(gt_mels)
			
 
				-        fake_logits = self.discriminator(gen_mel.detach())
			
 
				-        d_mask = F.interpolate(
			
 
				-            mel_masks_float_conv, size=(real_logits.shape[2],), mode="nearest"
			
 
				-        )
			
 
				-
			
 
				-        loss_real = avg_with_mask((real_logits - 1) ** 2, d_mask)
			
 
				-        loss_fake = avg_with_mask(fake_logits**2, d_mask)
			
 
				-
			
 
				-        loss_d = loss_real + loss_fake
			
 
				-
			
 
				-        self.log(
			
 
				-            "train/discriminator/loss",
			
 
				-            loss_d,
			
 
				-            on_step=True,
			
 
				-            on_epoch=False,
			
 
				-            prog_bar=True,
			
 
				-            logger=True,
			
 
				-        )
			
 
				-
			
 
				-        # Discriminator backward
			
 
				-        optim_d.zero_grad()
			
 
				-        self.manual_backward(loss_d)
			
 
				-        self.clip_gradients(
			
 
				-            optim_d, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
			
 
				-        )
			
 
				-        optim_d.step()
			
 
				-
			
 
				-        # Mel Loss, applying l1, using a weighted sum
			
 
				-        mel_distance = (
			
 
				-            gen_mel - gt_mels
			
 
				-        ).abs()  # * 0.5 + self.ssim(gen_mel, gt_mels) * 0.5
			
 
				-        loss_mel_low_freq = avg_with_mask(mel_distance[:, :40, :], mel_masks_float_conv)
			
 
				-        loss_mel_mid_freq = avg_with_mask(
			
 
				-            mel_distance[:, 40:70, :], mel_masks_float_conv
			
 
				-        )
			
 
				-        loss_mel_high_freq = avg_with_mask(
			
 
				-            mel_distance[:, 70:, :], mel_masks_float_conv
			
 
				-        )
			
 
				-        loss_mel = (
			
 
				-            loss_mel_low_freq * 0.6 + loss_mel_mid_freq * 0.3 + loss_mel_high_freq * 0.1
			
 
				-        )
			
 
				-
			
 
				-        # Adversarial Loss
			
 
				-        fake_logits = self.discriminator(gen_mel)
			
 
				-        loss_adv = avg_with_mask((fake_logits - 1) ** 2, d_mask)
			
 
				-
			
 
				-        # Total loss
			
 
				-        loss = (
			
 
				-            self.weight_vq * loss_vq
			
 
				-            + self.weight_mel * loss_mel
			
 
				-            + self.weight_adv * loss_adv
			
 
				-        )
			
 
				-
			
 
				-        # Log losses
			
 
				-        self.log(
			
 
				-            "train/generator/loss",
			
 
				-            loss,
			
 
				-            on_step=True,
			
 
				-            on_epoch=False,
			
 
				-            prog_bar=True,
			
 
				-            logger=True,
			
 
				-        )
			
 
				-        self.log(
			
 
				-            "train/generator/loss_vq",
			
 
				-            loss_vq,
			
 
				-            on_step=True,
			
 
				-            on_epoch=False,
			
 
				-            prog_bar=False,
			
 
				-            logger=True,
			
 
				-        )
			
 
				-        self.log(
			
 
				-            "train/generator/loss_mel",
			
 
				-            loss_mel,
			
 
				-            on_step=True,
			
 
				-            on_epoch=False,
			
 
				-            prog_bar=False,
			
 
				-            logger=True,
			
 
				-        )
			
 
				-        self.log(
			
 
				-            "train/generator/loss_adv",
			
 
				-            loss_adv,
			
 
				-            on_step=True,
			
 
				-            on_epoch=False,
			
 
				-            prog_bar=False,
			
 
				-            logger=True,
			
 
				-        )
			
 
				-
			
 
				-        # Generator backward
			
 
				-        optim_g.zero_grad()
			
 
				-        self.manual_backward(loss)
			
 
				-        self.clip_gradients(
			
 
				-            optim_g, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
			
 
				-        )
			
 
				-        optim_g.step()
			
 
				-
			
 
				-        scheduler_g, scheduler_d = self.lr_schedulers()
			
 
				-        scheduler_g.step()
			
 
				-        scheduler_d.step()
			
 
				-
			
 
				-    def validation_step(self, batch: Any, batch_idx: int):
			
 
				-        audios, audio_lengths = batch["audios"], batch["audio_lengths"]
			
 
				-
			
 
				-        audios = audios.float()
			
 
				-        audios = audios[:, None, :]
			
 
				-
			
 
				-        encoded_mels = self.encode_mel_transform(audios)
			
 
				-        gt_mels = self.gt_mel_transform(audios)
			
 
				-
			
 
				-        mel_lengths = audio_lengths // self.gt_mel_transform.hop_length
			
 
				-        mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2])
			
 
				-        mel_masks_float_conv = mel_masks[:, None, :].float()
			
 
				-        gt_mels = gt_mels * mel_masks_float_conv
			
 
				-        encoded_mels = encoded_mels * mel_masks_float_conv
			
 
				-
			
 
				-        # Encode
			
 
				-        encoded_features = self.encoder(encoded_mels) * mel_masks_float_conv
			
 
				-
			
 
				-        # Quantize
			
 
				-        vq_recon_features = self.quantizer(encoded_features).z * mel_masks_float_conv
			
 
				-        vq_recon_features = (
			
 
				-            vq_recon_features
			
 
				-            + self.quality_projection(
			
 
				-                torch.ones(
			
 
				-                    vq_recon_features.shape[0], 1, device=vq_recon_features.device
			
 
				-                )
			
 
				-                * 2
			
 
				-            )[:, :, None]
			
 
				-        )
			
 
				-
			
 
				-        # VQ Decode
			
 
				-        gen_aux_mels = (
			
 
				-            self.decoder(
			
 
				-                torch.randn_like(vq_recon_features) * mel_masks_float_conv,
			
 
				-                condition=vq_recon_features,
			
 
				-            )
			
 
				-            * mel_masks_float_conv
			
 
				-        )
			
 
				-        loss_mel = avg_with_mask((gen_aux_mels - gt_mels).abs(), mel_masks_float_conv)
			
 
				-
			
 
				-        self.log(
			
 
				-            "val/loss_mel",
			
 
				-            loss_mel,
			
 
				-            on_step=False,
			
 
				-            on_epoch=True,
			
 
				-            prog_bar=False,
			
 
				-            logger=True,
			
 
				-            sync_dist=True,
			
 
				-        )
			
 
				-
			
 
				-        recon_audios = self.vocoder(gt_mels)
			
 
				-        gen_aux_audios = self.vocoder(gen_aux_mels)
			
 
				-
			
 
				-        # only log the first batch
			
 
				-        if batch_idx != 0:
			
 
				-            return
			
 
				-
			
 
				-        for idx, (
			
 
				-            gt_mel,
			
 
				-            gen_aux_mel,
			
 
				-            audio,
			
 
				-            gen_aux_audio,
			
 
				-            recon_audio,
			
 
				-            audio_len,
			
 
				-        ) in enumerate(
			
 
				-            zip(
			
 
				-                gt_mels,
			
 
				-                gen_aux_mels,
			
 
				-                audios.cpu().float(),
			
 
				-                gen_aux_audios.cpu().float(),
			
 
				-                recon_audios.cpu().float(),
			
 
				-                audio_lengths,
			
 
				-            )
			
 
				-        ):
			
 
				-            if idx > 4:
			
 
				-                break
			
 
				-
			
 
				-            mel_len = audio_len // self.gt_mel_transform.hop_length
			
 
				-
			
 
				-            image_mels = plot_mel(
			
 
				-                [
			
 
				-                    gt_mel[:, :mel_len],
			
 
				-                    gen_aux_mel[:, :mel_len],
			
 
				-                ],
			
 
				-                [
			
 
				-                    "Ground-Truth",
			
 
				-                    "Auxiliary",
			
 
				-                ],
			
 
				-            )
			
 
				-
			
 
				-            if isinstance(self.logger, WandbLogger):
			
 
				-                self.logger.experiment.log(
			
 
				-                    {
			
 
				-                        "reconstruction_mel": wandb.Image(image_mels, caption="mels"),
			
 
				-                        "wavs": [
			
 
				-                            wandb.Audio(
			
 
				-                                audio[0, :audio_len],
			
 
				-                                sample_rate=self.sampling_rate,
			
 
				-                                caption="gt",
			
 
				-                            ),
			
 
				-                            wandb.Audio(
			
 
				-                                gen_aux_audio[0, :audio_len],
			
 
				-                                sample_rate=self.sampling_rate,
			
 
				-                                caption="aux",
			
 
				-                            ),
			
 
				-                            wandb.Audio(
			
 
				-                                recon_audio[0, :audio_len],
			
 
				-                                sample_rate=self.sampling_rate,
			
 
				-                                caption="recon",
			
 
				-                            ),
			
 
				-                        ],
			
 
				-                    },
			
 
				-                )
			
 
				-
			
 
				-            if isinstance(self.logger, TensorBoardLogger):
			
 
				-                self.logger.experiment.add_figure(
			
 
				-                    f"sample-{idx}/mels",
			
 
				-                    image_mels,
			
 
				-                    global_step=self.global_step,
			
 
				-                )
			
 
				-                self.logger.experiment.add_audio(
			
 
				-                    f"sample-{idx}/wavs/gt",
			
 
				-                    audio[0, :audio_len],
			
 
				-                    self.global_step,
			
 
				-                    sample_rate=self.sampling_rate,
			
 
				-                )
			
 
				-                self.logger.experiment.add_audio(
			
 
				-                    f"sample-{idx}/wavs/gen",
			
 
				-                    gen_aux_audio[0, :audio_len],
			
 
				-                    self.global_step,
			
 
				-                    sample_rate=self.sampling_rate,
			
 
				-                )
			
 
				-                self.logger.experiment.add_audio(
			
 
				-                    f"sample-{idx}/wavs/recon",
			
 
				-                    recon_audio[0, :audio_len],
			
 
				-                    self.global_step,
			
 
				-                    sample_rate=self.sampling_rate,
			
 
				-                )
			
 
				-
			
 
				-            plt.close(image_mels)
			
 
				-
			
 
				-    def encode(self, audios, audio_lengths):
			
 
				-        audios = audios.float()
			
 
				-
			
 
				-        mels = self.encode_mel_transform(audios)
			
 
				-        mel_lengths = audio_lengths // self.encode_mel_transform.hop_length
			
 
				-        mel_masks = sequence_mask(mel_lengths, mels.shape[2])
			
 
				-        mel_masks_float_conv = mel_masks[:, None, :].float()
			
 
				-        mels = mels * mel_masks_float_conv
			
 
				-
			
 
				-        # Encode
			
 
				-        encoded_features = self.encoder(mels) * mel_masks_float_conv
			
 
				-        feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor)
			
 
				-
			
 
				-        return self.quantizer.encode(encoded_features), feature_lengths
			
 
				-
			
 
				-    def decode(self, indices, feature_lengths, return_audios=False):
			
 
				-        factor = math.prod(self.quantizer.downsample_factor)
			
 
				-        mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor)
			
 
				-        mel_masks_float_conv = mel_masks[:, None, :].float()
			
 
				-
			
 
				-        z = self.quantizer.decode(indices) * mel_masks_float_conv
			
 
				-        z = (
			
 
				-            z
			
 
				-            + self.quality_projection(torch.ones(z.shape[0], 1, device=z.device) * 2)[
			
 
				-                :, :, None
			
 
				-            ]
			
 
				-        )
			
 
				-
			
 
				-        gen_mel = (
			
 
				-            self.decoder(
			
 
				-                torch.randn_like(z) * mel_masks_float_conv,
			
 
				-                condition=z,
			
 
				-            )
			
 
				-            * mel_masks_float_conv
			
 
				-        )
			
 
				-
			
 
				-        if return_audios:
			
 
				-            return self.vocoder(gen_mel)
			
 
				-
			
 
				-        return gen_mel
			
--- a/fish_speech/models/vqgan/modules/discriminator.py
+++ b/fish_speech/models/vqgan/modules/discriminator.py
@@ -1,44 +0,0 @@
 
				-import torch
			
 
				-from torch import nn
			
 
				-from torch.nn.utils.parametrizations import weight_norm
			
 
				-
			
 
				-
			
 
				-class Discriminator(nn.Module):
			
 
				-    def __init__(self):
			
 
				-        super().__init__()
			
 
				-
			
 
				-        blocks = []
			
 
				-        convs = [
			
 
				-            (1, 64, (3, 9), 1, (1, 4)),
			
 
				-            (64, 128, (3, 9), (1, 2), (1, 4)),
			
 
				-            (128, 256, (3, 9), (1, 2), (1, 4)),
			
 
				-            (256, 512, (3, 9), (1, 2), (1, 4)),
			
 
				-            (512, 1024, (3, 3), 1, (1, 1)),
			
 
				-            (1024, 1, (3, 3), 1, (1, 1)),
			
 
				-        ]
			
 
				-
			
 
				-        for idx, (in_channels, out_channels, kernel_size, stride, padding) in enumerate(
			
 
				-            convs
			
 
				-        ):
			
 
				-            blocks.append(
			
 
				-                weight_norm(
			
 
				-                    nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
			
 
				-                )
			
 
				-            )
			
 
				-
			
 
				-            if idx != len(convs) - 1:
			
 
				-                blocks.append(nn.SiLU(inplace=True))
			
 
				-
			
 
				-        self.blocks = nn.Sequential(*blocks)
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        return self.blocks(x[:, None])[:, 0]
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    model = Discriminator()
			
 
				-    print(sum(p.numel() for p in model.parameters()) / 1_000_000)
			
 
				-    x = torch.randn(1, 128, 1024)
			
 
				-    y = model(x)
			
 
				-    print(y.shape)
			
 
				-    print(y)
			
--- a/fish_speech/models/vqgan/modules/firefly.py
+++ b/fish_speech/models/vqgan/modules/firefly.py
@@ -1,25 +1,26 @@
 
				-# A inference only version of the FireflyGAN model
			
 
				-
			
 
				 import math
			
 
				 from functools import partial
			
 
				 from math import prod
			
 
				 from typing import Callable
			
 
				 
			
 
				-import numpy as np
			
 
				 import torch
			
 
				 import torch.nn.functional as F
			
 
				 from torch import nn
			
 
				-from torch.nn import Conv1d
			
 
				 from torch.nn.utils.parametrizations import weight_norm
			
 
				 from torch.nn.utils.parametrize import remove_parametrizations
			
 
				 from torch.utils.checkpoint import checkpoint
			
 
				 
			
 
				-from fish_speech.models.vqgan.utils import sequence_mask
			
 
				+
			
 
				+def sequence_mask(length, max_length=None):
			
 
				+    if max_length is None:
			
 
				+        max_length = length.max()
			
 
				+    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
			
 
				+    return x.unsqueeze(0) < length.unsqueeze(1)
			
 
				 
			
 
				 
			
 
				 def init_weights(m, mean=0.0, std=0.01):
			
 
				     classname = m.__class__.__name__
			
 
				-    if classname.find("Conv") != -1:
			
 
				+    if classname.find("Conv1D") != -1:
			
 
				         m.weight.data.normal_(mean, std)
			
 
				 
			
 
				 
			
@@ -27,78 +28,141 @@ def get_padding(kernel_size, dilation=1):
 
				     return (kernel_size * dilation - dilation) // 2
			
 
				 
			
 
				 
			
 
				+def unpad1d(x: torch.Tensor, paddings: tuple[int, int]):
			
 
				+    """Remove padding from x, handling properly zero padding. Only for 1d!"""
			
 
				+    padding_left, padding_right = paddings
			
 
				+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
			
 
				+    assert (padding_left + padding_right) <= x.shape[-1]
			
 
				+    end = x.shape[-1] - padding_right
			
 
				+    return x[..., padding_left:end]
			
 
				+
			
 
				+
			
 
				+def get_extra_padding_for_conv1d(
			
 
				+    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
			
 
				+) -> int:
			
 
				+    """See `pad_for_conv1d`."""
			
 
				+    length = x.shape[-1]
			
 
				+    n_frames = (length - kernel_size + padding_total) / stride + 1
			
 
				+    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
			
 
				+    return ideal_length - length
			
 
				+
			
 
				+
			
 
				+def pad1d(
			
 
				+    x: torch.Tensor,
			
 
				+    paddings: tuple[int, int],
			
 
				+    mode: str = "zeros",
			
 
				+    value: float = 0.0,
			
 
				+):
			
 
				+    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
			
 
				+    If this is the case, we insert extra 0 padding to the right
			
 
				+    before the reflection happen.
			
 
				+    """
			
 
				+    length = x.shape[-1]
			
 
				+    padding_left, padding_right = paddings
			
 
				+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
			
 
				+    if mode == "reflect":
			
 
				+        max_pad = max(padding_left, padding_right)
			
 
				+        extra_pad = 0
			
 
				+        if length <= max_pad:
			
 
				+            extra_pad = max_pad - length + 1
			
 
				+            x = F.pad(x, (0, extra_pad))
			
 
				+        padded = F.pad(x, paddings, mode, value)
			
 
				+        end = padded.shape[-1] - extra_pad
			
 
				+        return padded[..., :end]
			
 
				+    else:
			
 
				+        return F.pad(x, paddings, mode, value)
			
 
				+
			
 
				+
			
 
				+class FishConvNet(nn.Module):
			
 
				+    def __init__(
			
 
				+        self, in_channels, out_channels, kernel_size, dilation=1, stride=1, groups=1
			
 
				+    ):
			
 
				+        super(FishConvNet, self).__init__()
			
 
				+        self.conv = nn.Conv1d(
			
 
				+            in_channels,
			
 
				+            out_channels,
			
 
				+            kernel_size,
			
 
				+            stride=stride,
			
 
				+            dilation=dilation,
			
 
				+            groups=groups,
			
 
				+        )
			
 
				+        self.stride = stride
			
 
				+        self.kernel_size = (kernel_size - 1) * dilation + 1
			
 
				+        self.dilation = dilation
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        pad = self.kernel_size - self.stride
			
 
				+        extra_padding = get_extra_padding_for_conv1d(
			
 
				+            x, self.kernel_size, self.stride, pad
			
 
				+        )
			
 
				+        x = pad1d(x, (pad, extra_padding), mode="constant", value=0)
			
 
				+        return self.conv(x).contiguous()
			
 
				+
			
 
				+    def weight_norm(self, name="weight", dim=0):
			
 
				+        self.conv = weight_norm(self.conv, name=name, dim=dim)
			
 
				+        return self
			
 
				+
			
 
				+    def remove_weight_norm(self):
			
 
				+        self.conv = remove_parametrizations(self.conv)
			
 
				+        return self
			
 
				+
			
 
				+
			
 
				+class FishTransConvNet(nn.Module):
			
 
				+    def __init__(self, in_channels, out_channels, kernel_size, dilation=1, stride=1):
			
 
				+        super(FishTransConvNet, self).__init__()
			
 
				+        self.conv = nn.ConvTranspose1d(
			
 
				+            in_channels, out_channels, kernel_size, stride=stride, dilation=dilation
			
 
				+        )
			
 
				+        self.stride = stride
			
 
				+        self.kernel_size = kernel_size
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        x = self.conv(x)
			
 
				+        pad = self.kernel_size - self.stride
			
 
				+        padding_right = math.ceil(pad)
			
 
				+        padding_left = pad - padding_right
			
 
				+        x = unpad1d(x, (padding_left, padding_right))
			
 
				+        return x.contiguous()
			
 
				+
			
 
				+    def weight_norm(self, name="weight", dim=0):
			
 
				+        self.conv = weight_norm(self.conv, name=name, dim=dim)
			
 
				+        return self
			
 
				+
			
 
				+    def remove_weight_norm(self):
			
 
				+        self.conv = remove_parametrizations(self.conv)
			
 
				+        return self
			
 
				+
			
 
				+
			
 
				 class ResBlock1(torch.nn.Module):
			
 
				     def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
			
 
				         super().__init__()
			
 
				 
			
 
				         self.convs1 = nn.ModuleList(
			
 
				             [
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=dilation[0],
			
 
				-                        padding=get_padding(kernel_size, dilation[0]),
			
 
				-                    )
			
 
				-                ),
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=dilation[1],
			
 
				-                        padding=get_padding(kernel_size, dilation[1]),
			
 
				-                    )
			
 
				-                ),
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=dilation[2],
			
 
				-                        padding=get_padding(kernel_size, dilation[2]),
			
 
				-                    )
			
 
				-                ),
			
 
				+                FishConvNet(
			
 
				+                    channels, channels, kernel_size, stride=1, dilation=dilation[0]
			
 
				+                ).weight_norm(),
			
 
				+                FishConvNet(
			
 
				+                    channels, channels, kernel_size, stride=1, dilation=dilation[1]
			
 
				+                ).weight_norm(),
			
 
				+                FishConvNet(
			
 
				+                    channels, channels, kernel_size, stride=1, dilation=dilation[2]
			
 
				+                ).weight_norm(),
			
 
				             ]
			
 
				         )
			
 
				         self.convs1.apply(init_weights)
			
 
				 
			
 
				         self.convs2 = nn.ModuleList(
			
 
				             [
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=1,
			
 
				-                        padding=get_padding(kernel_size, 1),
			
 
				-                    )
			
 
				-                ),
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=1,
			
 
				-                        padding=get_padding(kernel_size, 1),
			
 
				-                    )
			
 
				-                ),
			
 
				-                weight_norm(
			
 
				-                    Conv1d(
			
 
				-                        channels,
			
 
				-                        channels,
			
 
				-                        kernel_size,
			
 
				-                        1,
			
 
				-                        dilation=1,
			
 
				-                        padding=get_padding(kernel_size, 1),
			
 
				-                    )
			
 
				-                ),
			
 
				+                FishConvNet(
			
 
				+                    channels, channels, kernel_size, stride=1, dilation=dilation[0]
			
 
				+                ).weight_norm(),
			
 
				+                FishConvNet(
			
 
				+                    channels, channels, kernel_size, stride=1, dilation=dilation[1]
			
 
				+                ).weight_norm(),
			
 
				+                FishConvNet(
			
 
				+                    channels, channels, kernel_size, stride=1, dilation=dilation[2]
			
 
				+                ).weight_norm(),
			
 
				             ]
			
 
				         )
			
 
				         self.convs2.apply(init_weights)
			
@@ -153,7 +217,6 @@ class HiFiGANGenerator(nn.Module):
 
				         resblock_dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
			
 
				         num_mels: int = 128,
			
 
				         upsample_initial_channel: int = 512,
			
 
				-        use_template: bool = True,
			
 
				         pre_conv_kernel_size: int = 7,
			
 
				         post_conv_kernel_size: int = 7,
			
 
				         post_activation: Callable = partial(nn.SiLU, inplace=True),
			
@@ -164,54 +227,29 @@ class HiFiGANGenerator(nn.Module):
 
				             prod(upsample_rates) == hop_length
			
 
				         ), f"hop_length must be {prod(upsample_rates)}"
			
 
				 
			
 
				-        self.conv_pre = weight_norm(
			
 
				-            nn.Conv1d(
			
 
				-                num_mels,
			
 
				-                upsample_initial_channel,
			
 
				-                pre_conv_kernel_size,
			
 
				-                1,
			
 
				-                padding=get_padding(pre_conv_kernel_size),
			
 
				-            )
			
 
				-        )
			
 
				+        self.conv_pre = FishConvNet(
			
 
				+            num_mels,
			
 
				+            upsample_initial_channel,
			
 
				+            pre_conv_kernel_size,
			
 
				+            stride=1,
			
 
				+        ).weight_norm()
			
 
				 
			
 
				         self.num_upsamples = len(upsample_rates)
			
 
				         self.num_kernels = len(resblock_kernel_sizes)
			
 
				 
			
 
				         self.noise_convs = nn.ModuleList()
			
 
				-        self.use_template = use_template
			
 
				         self.ups = nn.ModuleList()
			
 
				 
			
 
				         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
			
 
				-            c_cur = upsample_initial_channel // (2 ** (i + 1))
			
 
				             self.ups.append(
			
 
				-                weight_norm(
			
 
				-                    nn.ConvTranspose1d(
			
 
				-                        upsample_initial_channel // (2**i),
			
 
				-                        upsample_initial_channel // (2 ** (i + 1)),
			
 
				-                        k,
			
 
				-                        u,
			
 
				-                        padding=(k - u) // 2,
			
 
				-                    )
			
 
				-                )
			
 
				+                FishTransConvNet(
			
 
				+                    upsample_initial_channel // (2**i),
			
 
				+                    upsample_initial_channel // (2 ** (i + 1)),
			
 
				+                    k,
			
 
				+                    stride=u,
			
 
				+                ).weight_norm()
			
 
				             )
			
 
				 
			
 
				-            if not use_template:
			
 
				-                continue
			
 
				-
			
 
				-            if i + 1 < len(upsample_rates):
			
 
				-                stride_f0 = np.prod(upsample_rates[i + 1 :])
			
 
				-                self.noise_convs.append(
			
 
				-                    Conv1d(
			
 
				-                        1,
			
 
				-                        c_cur,
			
 
				-                        kernel_size=stride_f0 * 2,
			
 
				-                        stride=stride_f0,
			
 
				-                        padding=stride_f0 // 2,
			
 
				-                    )
			
 
				-                )
			
 
				-            else:
			
 
				-                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
			
 
				-
			
 
				         self.resblocks = nn.ModuleList()
			
 
				         for i in range(len(self.ups)):
			
 
				             ch = upsample_initial_channel // (2 ** (i + 1))
			
@@ -220,29 +258,20 @@ class HiFiGANGenerator(nn.Module):
 
				             )
			
 
				 
			
 
				         self.activation_post = post_activation()
			
 
				-        self.conv_post = weight_norm(
			
 
				-            nn.Conv1d(
			
 
				-                ch,
			
 
				-                1,
			
 
				-                post_conv_kernel_size,
			
 
				-                1,
			
 
				-                padding=get_padding(post_conv_kernel_size),
			
 
				-            )
			
 
				-        )
			
 
				+        self.conv_post = FishConvNet(
			
 
				+            ch, 1, post_conv_kernel_size, stride=1
			
 
				+        ).weight_norm()
			
 
				         self.ups.apply(init_weights)
			
 
				         self.conv_post.apply(init_weights)
			
 
				 
			
 
				-    def forward(self, x, template=None):
			
 
				+    def forward(self, x):
			
 
				         x = self.conv_pre(x)
			
 
				 
			
 
				         for i in range(self.num_upsamples):
			
 
				             x = F.silu(x, inplace=True)
			
 
				             x = self.ups[i](x)
			
 
				 
			
 
				-            if self.use_template:
			
 
				-                x = x + self.noise_convs[i](template)
			
 
				-
			
 
				-            if self.training:
			
 
				+            if self.training and self.checkpointing:
			
 
				                 x = checkpoint(
			
 
				                     self.resblocks[i],
			
 
				                     x,
			
@@ -364,11 +393,11 @@ class ConvNeXtBlock(nn.Module):
 
				     ):
			
 
				         super().__init__()
			
 
				 
			
 
				-        self.dwconv = nn.Conv1d(
			
 
				+        self.dwconv = FishConvNet(
			
 
				             dim,
			
 
				             dim,
			
 
				             kernel_size=kernel_size,
			
 
				-            padding=int(dilation * (kernel_size - 1) / 2),
			
 
				+            # padding=int(dilation * (kernel_size - 1) / 2),
			
 
				             groups=dim,
			
 
				         )  # depthwise conv
			
 
				         self.norm = LayerNorm(dim, eps=1e-6)
			
@@ -421,12 +450,13 @@ class ConvNeXtEncoder(nn.Module):
 
				 
			
 
				         self.downsample_layers = nn.ModuleList()
			
 
				         stem = nn.Sequential(
			
 
				-            nn.Conv1d(
			
 
				+            FishConvNet(
			
 
				                 input_channels,
			
 
				                 dims[0],
			
 
				-                kernel_size=kernel_size,
			
 
				-                padding=kernel_size // 2,
			
 
				-                padding_mode="zeros",
			
 
				+                kernel_size=7,
			
 
				+                # padding=3,
			
 
				+                # padding_mode="replicate",
			
 
				+                # padding_mode="zeros",
			
 
				             ),
			
 
				             LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
			
 
				         )
			
@@ -491,6 +521,7 @@ class FireflyArchitecture(nn.Module):
 
				         self.head = head
			
 
				         self.quantizer = quantizer
			
 
				         self.spec_transform = spec_transform
			
 
				+        self.downsample_factor = math.prod(self.quantizer.downsample_factor)
			
 
				 
			
 
				     def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor:
			
 
				         if self.spec_transform is not None:
			
@@ -512,7 +543,7 @@ class FireflyArchitecture(nn.Module):
 
				         if x.ndim == 2:
			
 
				             x = x[:, None, :]
			
 
				 
			
 
				-        if self.quantizer is not None:
			
 
				+        if self.vq is not None:
			
 
				             return x, vq_result
			
 
				 
			
 
				         return x
			
@@ -528,25 +559,30 @@ class FireflyArchitecture(nn.Module):
 
				 
			
 
				         # Encode
			
 
				         encoded_features = self.backbone(mels) * mel_masks_float_conv
			
 
				-        feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor)
			
 
				+        feature_lengths = mel_lengths // self.downsample_factor
			
 
				 
			
 
				         return self.quantizer.encode(encoded_features), feature_lengths
			
 
				 
			
 
				     def decode(self, indices, feature_lengths) -> torch.Tensor:
			
 
				-        factor = math.prod(self.quantizer.downsample_factor)
			
 
				-        mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor)
			
 
				+        mel_masks = sequence_mask(
			
 
				+            feature_lengths * self.downsample_factor,
			
 
				+            indices.shape[2] * self.downsample_factor,
			
 
				+        )
			
 
				         mel_masks_float_conv = mel_masks[:, None, :].float()
			
 
				+        audio_lengths = (
			
 
				+            feature_lengths * self.downsample_factor * self.spec_transform.hop_length
			
 
				+        )
			
 
				 
			
 
				         audio_masks = sequence_mask(
			
 
				-            feature_lengths * factor * self.spec_transform.hop_length,
			
 
				-            indices.shape[2] * factor * self.spec_transform.hop_length,
			
 
				+            audio_lengths,
			
 
				+            indices.shape[2] * self.downsample_factor * self.spec_transform.hop_length,
			
 
				         )
			
 
				         audio_masks_float_conv = audio_masks[:, None, :].float()
			
 
				 
			
 
				         z = self.quantizer.decode(indices) * mel_masks_float_conv
			
 
				         x = self.head(z) * audio_masks_float_conv
			
 
				 
			
 
				-        return x
			
 
				+        return x, audio_lengths
			
 
				 
			
 
				     def remove_parametrizations(self):
			
 
				         if hasattr(self.backbone, "remove_parametrizations"):
			
@@ -558,68 +594,3 @@ class FireflyArchitecture(nn.Module):
 
				     @property
			
 
				     def device(self):
			
 
				         return next(self.parameters()).device
			
 
				-
			
 
				-
			
 
				-class FireflyBase(nn.Module):
			
 
				-    def __init__(self, ckpt_path: str = None, pretrained: bool = True):
			
 
				-        super().__init__()
			
 
				-
			
 
				-        self.backbone = ConvNeXtEncoder(
			
 
				-            input_channels=128,
			
 
				-            depths=[3, 3, 9, 3],
			
 
				-            dims=[128, 256, 384, 512],
			
 
				-            drop_path_rate=0.2,
			
 
				-            kernel_size=7,
			
 
				-        )
			
 
				-
			
 
				-        self.head = HiFiGANGenerator(
			
 
				-            hop_length=512,
			
 
				-            upsample_rates=[8, 8, 2, 2, 2],
			
 
				-            upsample_kernel_sizes=[16, 16, 4, 4, 4],
			
 
				-            resblock_kernel_sizes=[3, 7, 11],
			
 
				-            resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
			
 
				-            num_mels=512,
			
 
				-            upsample_initial_channel=512,
			
 
				-            use_template=False,
			
 
				-            pre_conv_kernel_size=13,
			
 
				-            post_conv_kernel_size=13,
			
 
				-        )
			
 
				-
			
 
				-        if ckpt_path is not None:
			
 
				-            state_dict = torch.load(ckpt_path, map_location="cpu")
			
 
				-        elif pretrained:
			
 
				-            state_dict = torch.hub.load_state_dict_from_url(
			
 
				-                "https://github.com/fishaudio/vocoder/releases/download/1.0.0/firefly-gan-base-generator.ckpt",
			
 
				-                map_location="cpu",
			
 
				-                model_dir="checkpoints",
			
 
				-            )
			
 
				-
			
 
				-        if "state_dict" in state_dict:
			
 
				-            state_dict = state_dict["state_dict"]
			
 
				-
			
 
				-        if any("generator." in k for k in state_dict):
			
 
				-            state_dict = {
			
 
				-                k.replace("generator.", ""): v
			
 
				-                for k, v in state_dict.items()
			
 
				-                if "generator." in k
			
 
				-            }
			
 
				-
			
 
				-        self.load_state_dict(state_dict, strict=True)
			
 
				-        self.head.remove_parametrizations()
			
 
				-
			
 
				-    @torch.no_grad()
			
 
				-    def forward(self, x: torch.Tensor) -> torch.Tensor:
			
 
				-        x = self.backbone(x)
			
 
				-        x = self.head(x)
			
 
				-        if x.ndim == 2:
			
 
				-            x = x[:, None, :]
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    model = FireflyBase()
			
 
				-    model.eval()
			
 
				-    x = torch.randn(1, 128, 128)
			
 
				-    with torch.no_grad():
			
 
				-        y = model(x)
			
 
				-    print(y.shape)
			
--- a/fish_speech/models/vqgan/modules/fsq.py
+++ b/fish_speech/models/vqgan/modules/fsq.py
@@ -6,7 +6,7 @@ import torch.nn.functional as F
 
				 from einops import rearrange
			
 
				 from vector_quantize_pytorch import GroupedResidualFSQ
			
 
				 
			
 
				-from .firefly import ConvNeXtBlock
			
 
				+from .firefly import ConvNeXtBlock, FishConvNet, FishTransConvNet
			
 
				 
			
 
				 
			
 
				 @dataclass
			
@@ -20,7 +20,7 @@ class DownsampleFiniteScalarQuantize(nn.Module):
 
				     def __init__(
			
 
				         self,
			
 
				         input_dim: int = 512,
			
 
				-        n_codebooks: int = 1,
			
 
				+        n_codebooks: int = 9,
			
 
				         n_groups: int = 1,
			
 
				         levels: tuple[int] = (8, 5, 5, 5),  # Approximate 2**10
			
 
				         downsample_factor: tuple[int] = (2, 2),
			
@@ -46,7 +46,7 @@ class DownsampleFiniteScalarQuantize(nn.Module):
 
				         self.downsample = nn.Sequential(
			
 
				             *[
			
 
				                 nn.Sequential(
			
 
				-                    nn.Conv1d(
			
 
				+                    FishConvNet(
			
 
				                         all_dims[idx],
			
 
				                         all_dims[idx + 1],
			
 
				                         kernel_size=factor,
			
@@ -61,7 +61,7 @@ class DownsampleFiniteScalarQuantize(nn.Module):
 
				         self.upsample = nn.Sequential(
			
 
				             *[
			
 
				                 nn.Sequential(
			
 
				-                    nn.ConvTranspose1d(
			
 
				+                    FishTransConvNet(
			
 
				                         all_dims[idx + 1],
			
 
				                         all_dims[idx],
			
 
				                         kernel_size=factor,
			
@@ -114,26 +114,3 @@ class DownsampleFiniteScalarQuantize(nn.Module):
 
				         z_q = self.residual_fsq.get_output_from_indices(indices)
			
 
				         z_q = self.upsample(z_q.mT)
			
 
				         return z_q
			
 
				-
			
 
				-    # def from_latents(self, latents: torch.Tensor):
			
 
				-    #     z_q, z_p, codes = super().from_latents(latents)
			
 
				-    #     z_q = self.upsample(z_q)
			
 
				-    #     return z_q, z_p, codes
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    rvq = DownsampleFiniteScalarQuantize(
			
 
				-        n_codebooks=1,
			
 
				-        downsample_factor=(2, 2),
			
 
				-    )
			
 
				-    x = torch.randn(16, 512, 80)
			
 
				-
			
 
				-    result = rvq(x)
			
 
				-    print(rvq)
			
 
				-    print(result.latents.shape, result.codes.shape, result.z.shape)
			
 
				-
			
 
				-    # y = rvq.from_codes(result.codes)
			
 
				-    # print(y[0].shape)
			
 
				-
			
 
				-    # y = rvq.from_latents(result.latents)
			
 
				-    # print(y[0].shape)
			
--- a/fish_speech/models/vqgan/modules/reference.py
+++ b/fish_speech/models/vqgan/modules/reference.py
@@ -1,115 +0,0 @@
 
				-from typing import Optional
			
 
				-
			
 
				-import torch
			
 
				-import torch.nn.functional as F
			
 
				-from torch import nn
			
 
				-
			
 
				-from fish_speech.utils import autocast_exclude_mps
			
 
				-
			
 
				-from .wavenet import WaveNet
			
 
				-
			
 
				-
			
 
				-class ReferenceEncoder(WaveNet):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        input_channels: Optional[int] = None,
			
 
				-        output_channels: Optional[int] = None,
			
 
				-        residual_channels: int = 512,
			
 
				-        residual_layers: int = 20,
			
 
				-        dilation_cycle: Optional[int] = 4,
			
 
				-        num_heads: int = 8,
			
 
				-        latent_len: int = 4,
			
 
				-    ):
			
 
				-        super().__init__(
			
 
				-            input_channels=input_channels,
			
 
				-            residual_channels=residual_channels,
			
 
				-            residual_layers=residual_layers,
			
 
				-            dilation_cycle=dilation_cycle,
			
 
				-        )
			
 
				-
			
 
				-        self.head_dim = residual_channels // num_heads
			
 
				-        self.num_heads = num_heads
			
 
				-
			
 
				-        self.latent_len = latent_len
			
 
				-        self.latent = nn.Parameter(torch.zeros(1, self.latent_len, residual_channels))
			
 
				-
			
 
				-        self.q = nn.Linear(residual_channels, residual_channels, bias=True)
			
 
				-        self.kv = nn.Linear(residual_channels, residual_channels * 2, bias=True)
			
 
				-        self.q_norm = nn.LayerNorm(self.head_dim)
			
 
				-        self.k_norm = nn.LayerNorm(self.head_dim)
			
 
				-        self.proj = nn.Linear(residual_channels, residual_channels)
			
 
				-        self.proj_drop = nn.Dropout(0.1)
			
 
				-
			
 
				-        self.norm = nn.LayerNorm(residual_channels)
			
 
				-        self.mlp = nn.Sequential(
			
 
				-            nn.Linear(residual_channels, residual_channels * 4),
			
 
				-            nn.SiLU(),
			
 
				-            nn.Linear(residual_channels * 4, residual_channels),
			
 
				-        )
			
 
				-        self.output_projection_attn = nn.Linear(residual_channels, output_channels)
			
 
				-
			
 
				-        torch.nn.init.trunc_normal_(self.latent, std=0.02)
			
 
				-        self.apply(self.init_weights)
			
 
				-
			
 
				-    def init_weights(self, m):
			
 
				-        if isinstance(m, nn.Linear):
			
 
				-            torch.nn.init.trunc_normal_(m.weight, std=0.02)
			
 
				-            if m.bias is not None:
			
 
				-                torch.nn.init.constant_(m.bias, 0)
			
 
				-
			
 
				-    def forward(self, x, attn_mask=None):
			
 
				-        x = super().forward(x).mT
			
 
				-        B, N, C = x.shape
			
 
				-
			
 
				-        # Calculate mask
			
 
				-        if attn_mask is not None:
			
 
				-            assert attn_mask.shape == (B, N) and attn_mask.dtype == torch.bool
			
 
				-
			
 
				-            attn_mask = attn_mask[:, None, None, :].expand(
			
 
				-                B, self.num_heads, self.latent_len, N
			
 
				-            )
			
 
				-
			
 
				-        q_latent = self.latent.expand(B, -1, -1)
			
 
				-        q = (
			
 
				-            self.q(q_latent)
			
 
				-            .reshape(B, self.latent_len, self.num_heads, self.head_dim)
			
 
				-            .transpose(1, 2)
			
 
				-        )
			
 
				-
			
 
				-        kv = (
			
 
				-            self.kv(x)
			
 
				-            .reshape(B, N, 2, self.num_heads, self.head_dim)
			
 
				-            .permute(2, 0, 3, 1, 4)
			
 
				-        )
			
 
				-        k, v = kv.unbind(0)
			
 
				-
			
 
				-        q, k = self.q_norm(q), self.k_norm(k)
			
 
				-        x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
			
 
				-
			
 
				-        x = x.transpose(1, 2).reshape(B, self.latent_len, C)
			
 
				-        x = self.proj(x)
			
 
				-        x = self.proj_drop(x)
			
 
				-
			
 
				-        x = x + self.mlp(self.norm(x))
			
 
				-        x = self.output_projection_attn(x)
			
 
				-        x = x.mean(1)
			
 
				-
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    with autocast_exclude_mps(device_type="cpu", dtype=torch.bfloat16):
			
 
				-        model = ReferenceEncoder(
			
 
				-            input_channels=128,
			
 
				-            output_channels=64,
			
 
				-            residual_channels=384,
			
 
				-            residual_layers=20,
			
 
				-            dilation_cycle=4,
			
 
				-            num_heads=8,
			
 
				-        )
			
 
				-        x = torch.randn(4, 128, 64)
			
 
				-        mask = torch.ones(4, 64, dtype=torch.bool)
			
 
				-        y = model(x, mask)
			
 
				-        print(y.shape)
			
 
				-        loss = F.mse_loss(y, torch.randn(4, 64))
			
 
				-        loss.backward()
			
--- a/fish_speech/models/vqgan/modules/wavenet.py
+++ b/fish_speech/models/vqgan/modules/wavenet.py
@@ -1,225 +0,0 @@
 
				-import math
			
 
				-from typing import Optional
			
 
				-
			
 
				-import torch
			
 
				-import torch.nn.functional as F
			
 
				-from torch import nn
			
 
				-
			
 
				-
			
 
				-class Mish(nn.Module):
			
 
				-    def forward(self, x):
			
 
				-        return x * torch.tanh(F.softplus(x))
			
 
				-
			
 
				-
			
 
				-class DiffusionEmbedding(nn.Module):
			
 
				-    """Diffusion Step Embedding"""
			
 
				-
			
 
				-    def __init__(self, d_denoiser):
			
 
				-        super(DiffusionEmbedding, self).__init__()
			
 
				-        self.dim = d_denoiser
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        device = x.device
			
 
				-        half_dim = self.dim // 2
			
 
				-        emb = math.log(10000) / (half_dim - 1)
			
 
				-        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
			
 
				-        emb = x[:, None] * emb[None, :]
			
 
				-        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
			
 
				-        return emb
			
 
				-
			
 
				-
			
 
				-class LinearNorm(nn.Module):
			
 
				-    """LinearNorm Projection"""
			
 
				-
			
 
				-    def __init__(self, in_features, out_features, bias=False):
			
 
				-        super(LinearNorm, self).__init__()
			
 
				-        self.linear = nn.Linear(in_features, out_features, bias)
			
 
				-
			
 
				-        nn.init.xavier_uniform_(self.linear.weight)
			
 
				-        if bias:
			
 
				-            nn.init.constant_(self.linear.bias, 0.0)
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        x = self.linear(x)
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-class ConvNorm(nn.Module):
			
 
				-    """1D Convolution"""
			
 
				-
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        in_channels,
			
 
				-        out_channels,
			
 
				-        kernel_size=1,
			
 
				-        stride=1,
			
 
				-        padding=None,
			
 
				-        dilation=1,
			
 
				-        bias=True,
			
 
				-        w_init_gain="linear",
			
 
				-    ):
			
 
				-        super(ConvNorm, self).__init__()
			
 
				-
			
 
				-        if padding is None:
			
 
				-            assert kernel_size % 2 == 1
			
 
				-            padding = int(dilation * (kernel_size - 1) / 2)
			
 
				-
			
 
				-        self.conv = nn.Conv1d(
			
 
				-            in_channels,
			
 
				-            out_channels,
			
 
				-            kernel_size=kernel_size,
			
 
				-            stride=stride,
			
 
				-            padding=padding,
			
 
				-            dilation=dilation,
			
 
				-            bias=bias,
			
 
				-        )
			
 
				-        nn.init.kaiming_normal_(self.conv.weight)
			
 
				-
			
 
				-    def forward(self, signal):
			
 
				-        conv_signal = self.conv(signal)
			
 
				-
			
 
				-        return conv_signal
			
 
				-
			
 
				-
			
 
				-class ResidualBlock(nn.Module):
			
 
				-    """Residual Block"""
			
 
				-
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        residual_channels,
			
 
				-        use_linear_bias=False,
			
 
				-        dilation=1,
			
 
				-        condition_channels=None,
			
 
				-    ):
			
 
				-        super(ResidualBlock, self).__init__()
			
 
				-        self.conv_layer = ConvNorm(
			
 
				-            residual_channels,
			
 
				-            2 * residual_channels,
			
 
				-            kernel_size=3,
			
 
				-            stride=1,
			
 
				-            padding=dilation,
			
 
				-            dilation=dilation,
			
 
				-        )
			
 
				-
			
 
				-        if condition_channels is not None:
			
 
				-            self.diffusion_projection = LinearNorm(
			
 
				-                residual_channels, residual_channels, use_linear_bias
			
 
				-            )
			
 
				-            self.condition_projection = ConvNorm(
			
 
				-                condition_channels, 2 * residual_channels, kernel_size=1
			
 
				-            )
			
 
				-
			
 
				-        self.output_projection = ConvNorm(
			
 
				-            residual_channels, 2 * residual_channels, kernel_size=1
			
 
				-        )
			
 
				-
			
 
				-    def forward(self, x, condition=None, diffusion_step=None):
			
 
				-        y = x
			
 
				-
			
 
				-        if diffusion_step is not None:
			
 
				-            diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
			
 
				-            y = y + diffusion_step
			
 
				-
			
 
				-        y = self.conv_layer(y)
			
 
				-
			
 
				-        if condition is not None:
			
 
				-            condition = self.condition_projection(condition)
			
 
				-            y = y + condition
			
 
				-
			
 
				-        gate, filter = torch.chunk(y, 2, dim=1)
			
 
				-        y = torch.sigmoid(gate) * torch.tanh(filter)
			
 
				-
			
 
				-        y = self.output_projection(y)
			
 
				-        residual, skip = torch.chunk(y, 2, dim=1)
			
 
				-
			
 
				-        return (x + residual) / math.sqrt(2.0), skip
			
 
				-
			
 
				-
			
 
				-class WaveNet(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        input_channels: Optional[int] = None,
			
 
				-        output_channels: Optional[int] = None,
			
 
				-        residual_channels: int = 512,
			
 
				-        residual_layers: int = 20,
			
 
				-        dilation_cycle: Optional[int] = 4,
			
 
				-        is_diffusion: bool = False,
			
 
				-        condition_channels: Optional[int] = None,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-
			
 
				-        # Input projection
			
 
				-        self.input_projection = None
			
 
				-        if input_channels is not None and input_channels != residual_channels:
			
 
				-            self.input_projection = ConvNorm(
			
 
				-                input_channels, residual_channels, kernel_size=1
			
 
				-            )
			
 
				-
			
 
				-        if input_channels is None:
			
 
				-            input_channels = residual_channels
			
 
				-
			
 
				-        self.input_channels = input_channels
			
 
				-
			
 
				-        # Residual layers
			
 
				-        self.residual_layers = nn.ModuleList(
			
 
				-            [
			
 
				-                ResidualBlock(
			
 
				-                    residual_channels=residual_channels,
			
 
				-                    use_linear_bias=False,
			
 
				-                    dilation=2 ** (i % dilation_cycle) if dilation_cycle else 1,
			
 
				-                    condition_channels=condition_channels,
			
 
				-                )
			
 
				-                for i in range(residual_layers)
			
 
				-            ]
			
 
				-        )
			
 
				-
			
 
				-        # Skip projection
			
 
				-        self.skip_projection = ConvNorm(
			
 
				-            residual_channels, residual_channels, kernel_size=1
			
 
				-        )
			
 
				-
			
 
				-        # Output projection
			
 
				-        self.output_projection = None
			
 
				-        if output_channels is not None and output_channels != residual_channels:
			
 
				-            self.output_projection = ConvNorm(
			
 
				-                residual_channels, output_channels, kernel_size=1
			
 
				-            )
			
 
				-
			
 
				-        if is_diffusion:
			
 
				-            self.diffusion_embedding = DiffusionEmbedding(residual_channels)
			
 
				-            self.mlp = nn.Sequential(
			
 
				-                LinearNorm(residual_channels, residual_channels * 4, False),
			
 
				-                Mish(),
			
 
				-                LinearNorm(residual_channels * 4, residual_channels, False),
			
 
				-            )
			
 
				-
			
 
				-        self.apply(self._init_weights)
			
 
				-
			
 
				-    def _init_weights(self, m):
			
 
				-        if isinstance(m, (nn.Conv1d, nn.Linear)):
			
 
				-            nn.init.trunc_normal_(m.weight, std=0.02)
			
 
				-            if getattr(m, "bias", None) is not None:
			
 
				-                nn.init.constant_(m.bias, 0)
			
 
				-
			
 
				-    def forward(self, x, t=None, condition=None):
			
 
				-        if self.input_projection is not None:
			
 
				-            x = self.input_projection(x)
			
 
				-            x = F.silu(x)
			
 
				-
			
 
				-        if t is not None:
			
 
				-            t = self.diffusion_embedding(t)
			
 
				-            t = self.mlp(t)
			
 
				-
			
 
				-        skip = []
			
 
				-        for layer in self.residual_layers:
			
 
				-            x, skip_connection = layer(x, condition, t)
			
 
				-            skip.append(skip_connection)
			
 
				-
			
 
				-        x = torch.sum(torch.stack(skip), dim=0) / math.sqrt(len(self.residual_layers))
			
 
				-        x = self.skip_projection(x)
			
 
				-
			
 
				-        if self.output_projection is not None:
			
 
				-            x = F.silu(x)
			
 
				-            x = self.output_projection(x)
			
 
				-
			
 
				-        return x
			
--- a/tools/vqgan/inference.py
+++ b/tools/vqgan/inference.py
@@ -103,7 +103,9 @@ def main(input_path, output_path, config_name, checkpoint_path, device):
 
				 
			
 
				     # Restore
			
 
				     feature_lengths = torch.tensor([indices.shape[1]], device=device)
			
 
				-    fake_audios = model.decode(indices=indices[None], feature_lengths=feature_lengths)
			
 
				+    fake_audios, _ = model.decode(
			
 
				+        indices=indices[None], feature_lengths=feature_lengths
			
 
				+    )
			
 
				     audio_time = fake_audios.shape[-1] / model.spec_transform.sample_rate
			
 
				 
			
 
				     logger.info(