@@ -9,15 +9,7 @@ import wandb
 
 from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
 from matplotlib import pyplot as plt
 from torch import nn
-from torch.utils.checkpoint import checkpoint as gradient_checkpoint
-
-from fish_speech.models.vqgan.losses import (
-    MultiResolutionSTFTLoss,
-    discriminator_loss,
-    feature_loss,
-    generator_loss,
-    kl_loss,
-)
+
 from fish_speech.models.vqgan.utils import plot_mel, sequence_mask, slice_segments
 
@@ -40,17 +32,16 @@ class VQGAN(L.LightningModule):
         self,
         optimizer: Callable,
         lr_scheduler: Callable,
-        generator: nn.Module,
-        discriminator: nn.Module,
+        encoder: nn.Module,
+        quantizer: nn.Module,
+        aux_decoder: nn.Module,
+        reflow: nn.Module,
+        vocoder: nn.Module,
         mel_transform: nn.Module,
-        spec_transform: nn.Module,
-        hop_length: int = 640,
-        sample_rate: int = 32000,
-        freeze_discriminator: bool = False,
-        weight_mel: float = 45,
-        weight_kl: float = 0.1,
+        weight_reflow: float = 1.0,
         weight_vq: float = 1.0,
-        weight_aux_mel: float = 20.0,
+        weight_aux_mel: float = 1.0,
+        sampling_rate: int = 44100,
     ):
         super().__init__()
 
@@ -58,62 +49,54 @@ class VQGAN(L.LightningModule):
         self.optimizer_builder = optimizer
         self.lr_scheduler_builder = lr_scheduler
 
-        # Generator and discriminator
-        self.generator = generator
-        self.discriminator = discriminator
+        # Modules
+        self.encoder = encoder
+        self.quantizer = quantizer
+        self.aux_decoder = aux_decoder
+        self.reflow = reflow
         self.mel_transform = mel_transform
-        self.spec_transform = spec_transform
-        self.freeze_discriminator = freeze_discriminator
+        self.vocoder = vocoder
+
+        # Freeze vocoder
+        for param in self.vocoder.parameters():
+            param.requires_grad = False
 
         # Loss weights
-        self.weight_mel = weight_mel
-        self.weight_kl = weight_kl
+        self.weight_reflow = weight_reflow
         self.weight_vq = weight_vq
         self.weight_aux_mel = weight_aux_mel
 
-        # Other parameters
-        self.hop_length = hop_length
-        self.sampling_rate = sample_rate
-
-        # Disable automatic optimization
-        self.automatic_optimization = False
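+        # Assumed dynamic range of log-mel values; norm_spec/denorm_spec use
+        # these bounds to map spectrograms into [-1, 1] and back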
+        self.spec_min = -12
+        self.spec_max = 3
+        self.sampling_rate = sampling_rate
 
-        if self.freeze_discriminator:
-            for p in self.discriminator.parameters():
-                p.requires_grad = False
+    def on_save_checkpoint(self, checkpoint):
+        # Do not save vocoder
+        state_dict = checkpoint["state_dict"]
+        for name in list(state_dict.keys()):
+            if "vocoder" in name:
+                state_dict.pop(name)
 
     def configure_optimizers(self):
-        # Need two optimizers and two schedulers
+        # A single optimizer and scheduler now cover all trainable modules
-        optimizer_generator = self.optimizer_builder(self.generator.parameters())
-        optimizer_discriminator = self.optimizer_builder(
-            self.discriminator.parameters()
-        )
-
-        lr_scheduler_generator = self.lr_scheduler_builder(optimizer_generator)
-        lr_scheduler_discriminator = self.lr_scheduler_builder(optimizer_discriminator)
-
-        return (
-            {
-                "optimizer": optimizer_generator,
-                "lr_scheduler": {
-                    "scheduler": lr_scheduler_generator,
-                    "interval": "step",
-                    "name": "optimizer/generator",
-                },
+        optimizer = self.optimizer_builder(self.parameters())
+        lr_scheduler = self.lr_scheduler_builder(optimizer)
+
+        return {
+            "optimizer": optimizer,
+            "lr_scheduler": {
+                "scheduler": lr_scheduler,
+                "interval": "step",
             },
-            {
-                "optimizer": optimizer_discriminator,
-                "lr_scheduler": {
-                    "scheduler": lr_scheduler_discriminator,
-                    "interval": "step",
-                    "name": "optimizer/discriminator",
-                },
-            },
-        )
+        }
 
-    def training_step(self, batch, batch_idx):
-        optim_g, optim_d = self.optimizers()
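+    # Map log-mels from [spec_min, spec_max] to [-1, 1] (and back) so the
+    # reflow model operates on a roughly unit-scale signal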
+    def norm_spec(self, x):
+        return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
+
+    def denorm_spec(self, x):
+        return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
+
+    def training_step(self, batch, batch_idx):
         audios, audio_lengths = batch["audios"], batch["audio_lengths"]
 
         audios = audios.float()
@@ -121,173 +104,84 @@ class VQGAN(L.LightningModule):
 
         with torch.no_grad():
             gt_mels = self.mel_transform(audios)
-            gt_specs = self.spec_transform(audios)
-
-        spec_lengths = audio_lengths // self.hop_length
-        spec_masks = torch.unsqueeze(
-            sequence_mask(spec_lengths, gt_mels.shape[2]), 1
-        ).to(gt_mels.dtype)
-        (
-            fake_audios,
-            ids_slice,
-            y_mask,
-            y_mask,
-            (z, z_p, m_p, logs_p, m_q, logs_q),
-            loss_vq,
-            decoded_aux_mels,
-        ) = self.generator(gt_specs, spec_lengths)
 
-        gt_mels = slice_segments(gt_mels, ids_slice, self.generator.segment_size)
-        decoded_aux_mels = slice_segments(
-            decoded_aux_mels, ids_slice, self.generator.segment_size
-        )
-        spec_masks = slice_segments(spec_masks, ids_slice, self.generator.segment_size)
-        audios = slice_segments(
-            audios,
-            ids_slice * self.hop_length,
-            self.generator.segment_size * self.hop_length,
-        )
-        fake_mels = self.mel_transform(fake_audios.squeeze(1))
+        mel_lengths = audio_lengths // self.mel_transform.hop_length
+        mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2])
+        mel_masks_float_conv = mel_masks[:, None, :].float()
 
-        assert (
-            audios.shape == fake_audios.shape
-        ), f"{audios.shape} != {fake_audios.shape}"
-
-        # Discriminator
-        if self.freeze_discriminator is False:
-            y_d_hat_r, y_d_hat_g, _, _ = self.discriminator(
-                audios, fake_audios.detach()
-            )
+        # Encode
+        encoded_features = self.encoder(gt_mels) * mel_masks_float_conv
 
-            with torch.autocast(device_type=audios.device.type, enabled=False):
-                loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g)
-
-            self.log(
-                f"train/discriminator/loss",
-                loss_disc,
-                on_step=True,
-                on_epoch=False,
-                prog_bar=False,
-                logger=True,
-                sync_dist=True,
-            )
+        # Quantize
+        vq_result = self.quantizer(encoded_features)
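+        # Not every quantizer reports a commitment loss, so fall back to 0.0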
+        loss_vq = getattr(vq_result, "loss", 0.0)
+        vq_recon_features = vq_result.z * mel_masks_float_conv
 
-            optim_d.zero_grad()
-            self.manual_backward(loss_disc)
-            self.clip_gradients(
-                optim_d, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
-            )
-            optim_d.step()
+        # VQ Decode
+        aux_mel = self.aux_decoder(vq_recon_features)
+        loss_aux_mel = F.l1_loss(
+            aux_mel * mel_masks_float_conv, gt_mels * mel_masks_float_conv
+        )
 
-        # Adv Loss
-        y_d_hat_r, y_d_hat_g, _, _ = self.discriminator(audios, fake_audios)
+        # Reflow
+        x_1 = self.norm_spec(gt_mels.mT)
+        t = torch.rand(gt_mels.shape[0], device=gt_mels.device)
+        x_0 = torch.randn_like(x_1)
 
-        # Adversarial Loss
-        with torch.autocast(device_type=audios.device.type, enabled=False):
-            loss_adv, _ = generator_loss(y_d_hat_g)
+        # x_t = t * x_1 + (1 - t) * x_0
+        x_t = x_0 + t[:, None, None] * (x_1 - x_0)
 
-        self.log(
-            f"train/generator/adv",
-            loss_adv,
-            on_step=True,
-            on_epoch=False,
-            prog_bar=False,
-            logger=True,
-            sync_dist=True,
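+        # Rectified flow: the network predicts the constant velocity
+        # v = x_1 - x_0 at x_t; t is scaled by 1000, presumably to match the
+        # range expected by the timestep embedding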
+        v_pred = self.reflow(
+            x_t,
+            1000 * t,
+            condition=vq_recon_features.mT,
+            self_mask=mel_masks,
         )
 
-        with torch.autocast(device_type=audios.device.type, enabled=False):
-            loss_fm = feature_loss(y_d_hat_r, y_d_hat_g)
-
-        self.log(
-            f"train/generator/adv_fm",
-            loss_fm,
-            on_step=True,
-            on_epoch=False,
-            prog_bar=False,
-            logger=True,
-            sync_dist=True,
+        # Weighted L2 loss: logit-normal timestep weighting (0.398942 ~= 1/sqrt(2*pi))
+        weights = 0.398942 / t / (1 - t) * torch.exp(-0.5 * torch.log(t / (1 - t)) ** 2)
+        loss_reflow = weights[:, None, None] * F.mse_loss(
+            x_1 - x_0, v_pred, reduction="none"
         )
+        loss_reflow = (loss_reflow * mel_masks_float_conv.mT).mean()
 
-        with torch.autocast(device_type=audios.device.type, enabled=False):
-            loss_mel = F.l1_loss(gt_mels * spec_masks, fake_mels * spec_masks)
-            loss_aux_mel = F.l1_loss(
-                gt_mels * spec_masks, decoded_aux_mels * spec_masks
-            )
-
-        self.log(
-            "train/generator/loss_mel",
-            loss_mel,
-            on_step=True,
-            on_epoch=False,
-            prog_bar=False,
-            logger=True,
-            sync_dist=True,
+        # Total loss
+        loss = (
+            self.weight_vq * loss_vq
+            + self.weight_aux_mel * loss_aux_mel
+            + self.weight_reflow * loss_reflow
         )
 
+        # Log losses
         self.log(
-            "train/generator/loss_aux_mel",
-            loss_aux_mel,
-            on_step=True,
-            on_epoch=False,
-            prog_bar=False,
-            logger=True,
-            sync_dist=True,
+            "train/loss", loss, on_step=True, on_epoch=False, prog_bar=True, logger=True
         )
-
         self.log(
-            "train/generator/loss_vq",
+            "train/loss_vq",
             loss_vq,
             on_step=True,
             on_epoch=False,
             prog_bar=False,
             logger=True,
-            sync_dist=True,
         )
-
-        loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, y_mask)
-
         self.log(
-            "train/generator/loss_kl",
-            loss_kl,
+            "train/loss_aux_mel",
+            loss_aux_mel,
            on_step=True,
             on_epoch=False,
             prog_bar=False,
             logger=True,
-            sync_dist=True,
-        )
-
-        loss = (
-            loss_mel * self.weight_mel
-            + loss_aux_mel * self.weight_aux_mel
-            + loss_vq * self.weight_vq
-            + loss_kl * self.weight_kl
-            + loss_adv
-            + loss_fm
         )
         self.log(
-            "train/generator/loss",
-            loss,
+            "train/loss_reflow",
+            loss_reflow,
             on_step=True,
             on_epoch=False,
             prog_bar=False,
             logger=True,
-            sync_dist=True,
-        )
-
-        # Backward
-        optim_g.zero_grad()
-
-        self.manual_backward(loss)
-        self.clip_gradients(
-            optim_g, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
         )
-        optim_g.step()
 
-        # Manual LR Scheduler
-        scheduler_g, scheduler_d = self.lr_schedulers()
-        scheduler_g.step()
-        scheduler_d.step()
+        return loss
 
     def validation_step(self, batch: Any, batch_idx: int):
         audios, audio_lengths = batch["audios"], batch["audio_lengths"]
@@ -296,32 +190,25 @@ class VQGAN(L.LightningModule):
         audios = audios[:, None, :]
 
         gt_mels = self.mel_transform(audios)
-        gt_specs = self.spec_transform(audios)
-        spec_lengths = audio_lengths // self.hop_length
-        spec_masks = torch.unsqueeze(
-            sequence_mask(spec_lengths, gt_mels.shape[2]), 1
-        ).to(gt_mels.dtype)
-
-        prior_audios, _, _ = self.generator.infer(gt_specs, spec_lengths)
-        posterior_audios, _, _ = self.generator.infer_posterior(gt_specs, spec_lengths)
-        prior_mels = self.mel_transform(prior_audios.squeeze(1))
-        posterior_mels = self.mel_transform(posterior_audios.squeeze(1))
-
-        min_mel_length = min(
-            gt_mels.shape[-1], prior_mels.shape[-1], posterior_mels.shape[-1]
-        )
-        gt_mels = gt_mels[:, :, :min_mel_length]
-        prior_mels = prior_mels[:, :, :min_mel_length]
-        posterior_mels = posterior_mels[:, :, :min_mel_length]
+        mel_lengths = audio_lengths // self.mel_transform.hop_length
+        mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2])
+        mel_masks_float_conv = mel_masks[:, None, :].float()
+
+        # Encode
+        encoded_features = self.encoder(gt_mels) * mel_masks_float_conv
 
-        prior_mel_loss = F.l1_loss(gt_mels * spec_masks, prior_mels * spec_masks)
-        posterior_mel_loss = F.l1_loss(
-            gt_mels * spec_masks, posterior_mels * spec_masks
+        # Quantize
+        vq_result = self.quantizer(encoded_features)
+
+        # VQ Decode
+        aux_mels = self.aux_decoder(vq_result.z)
+        loss_aux_mel = F.l1_loss(
+            aux_mels * mel_masks_float_conv, gt_mels * mel_masks_float_conv
         )
 
         self.log(
-            "val/prior_mel_loss",
-            prior_mel_loss,
+            "val/loss_aux_mel",
+            loss_aux_mel,
             on_step=False,
             on_epoch=True,
             prog_bar=False,
@@ -329,9 +216,33 @@ class VQGAN(L.LightningModule):
             sync_dist=True,
         )
 
+        # Reflow inference
+        t_start = 0.0
+        infer_step = 20
+        gen_mels = torch.randn(gt_mels.shape, device=gt_mels.device).mT
+        t = torch.zeros(gt_mels.shape[0], device=gt_mels.device)
+        dt = (1.0 - t_start) / infer_step
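+
+        # Euler integration of the learned ODE dx/dt = v(x, t) from t_start
+        # to 1, starting from Gaussian noise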
+        for _ in range(infer_step):
+            gen_mels += (
+                self.reflow(
+                    gen_mels,
+                    1000 * t,
+                    condition=vq_result.z.mT,
+                    self_mask=mel_masks,
+                )
+                * dt
+            )
+            t += dt
+
+        gen_mels = self.denorm_spec(gen_mels).mT
+        loss_recon_reflow = F.l1_loss(
+            gen_mels * mel_masks_float_conv, gt_mels * mel_masks_float_conv
+        )
+
         self.log(
-            "val/posterior_mel_loss",
-            posterior_mel_loss,
+            "val/loss_recon_reflow",
+            loss_recon_reflow,
             on_step=False,
             on_epoch=True,
             prog_bar=False,
@@ -339,41 +250,47 @@ class VQGAN(L.LightningModule):
             sync_dist=True,
         )
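 
+        # Render waveforms with the frozen vocoder: reflow-generated mels,
+        # ground-truth mels (a vocoder-only reference), and aux-decoder mels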
+        gen_audios = self.vocoder(gen_mels)
+        recon_audios = self.vocoder(gt_mels)
+        aux_audios = self.vocoder(aux_mels)
+
         # only log the first batch
         if batch_idx != 0:
             return
 
         for idx, (
-            mel,
-            prior_mel,
-            posterior_mel,
+            gt_mel,
+            reflow_mel,
+            aux_mel,
             audio,
-            prior_audio,
-            posterior_audio,
+            reflow_audio,
+            aux_audio,
+            recon_audio,
             audio_len,
         ) in enumerate(
             zip(
                 gt_mels,
-                prior_mels,
-                posterior_mels,
-                audios.detach().float(),
-                prior_audios.detach().float(),
-                posterior_audios.detach().float(),
+                gen_mels,
+                aux_mels,
+                audios.float(),
+                gen_audios.float(),
+                aux_audios.float(),
+                recon_audios.float(),
                 audio_lengths,
             )
         ):
-            mel_len = audio_len // self.hop_length
+            mel_len = audio_len // self.mel_transform.hop_length
 
             image_mels = plot_mel(
                 [
-                    prior_mel[:, :mel_len],
-                    posterior_mel[:, :mel_len],
-                    mel[:, :mel_len],
+                    gt_mel[:, :mel_len],
+                    reflow_mel[:, :mel_len],
+                    aux_mel[:, :mel_len],
                 ],
                 [
-                    "Prior (VQ)",
-                    "Posterior (Reconstruction)",
                     "Ground-Truth",
+                    "Reflow",
+                    "Aux",
                 ],
             )
@@ -388,14 +305,19 @@ class VQGAN(L.LightningModule):
                                 caption="gt",
                             ),
                             wandb.Audio(
-                                prior_audio[0, :audio_len],
+                                reflow_audio[0, :audio_len],
                                 sample_rate=self.sampling_rate,
-                                caption="prior",
+                                caption="reflow",
                             ),
                             wandb.Audio(
-                                posterior_audio[0, :audio_len],
+                                aux_audio[0, :audio_len],
                                 sample_rate=self.sampling_rate,
-                                caption="posterior",
+                                caption="aux",
+                            ),
+                            wandb.Audio(
+                                recon_audio[0, :audio_len],
+                                sample_rate=self.sampling_rate,
+                                caption="recon",
                             ),
                         ],
                     },
@@ -414,91 +336,22 @@ class VQGAN(L.LightningModule):
                 sample_rate=self.sampling_rate,
             )
             self.logger.experiment.add_audio(
-                f"sample-{idx}/wavs/prior",
-                prior_audio[0, :audio_len],
+                f"sample-{idx}/wavs/reflow",
+                reflow_audio[0, :audio_len],
                 self.global_step,
                 sample_rate=self.sampling_rate,
             )
             self.logger.experiment.add_audio(
-                f"sample-{idx}/wavs/posterior",
-                posterior_audio[0, :audio_len],
+                f"sample-{idx}/wavs/aux",
+                aux_audio[0, :audio_len],
+                self.global_step,
+                sample_rate=self.sampling_rate,
+            )
+            self.logger.experiment.add_audio(
+                f"sample-{idx}/wavs/recon",
+                recon_audio[0, :audio_len],
                 self.global_step,
                 sample_rate=self.sampling_rate,
             )
 
         plt.close(image_mels)
-
-    # def encode(self, audios, audio_lengths=None):
-    #     if audio_lengths is None:
-    #         audio_lengths = torch.tensor(
-    #             [audios.shape[-1]] * audios.shape[0],
-    #             device=audios.device,
-    #             dtype=torch.long,
-    #         )
-
-    #     with torch.no_grad():
-    #         features = self.mel_transform(audios, sample_rate=self.sampling_rate)
-
-    #         feature_lengths = (
-    #             audio_lengths
-    #             / self.hop_length
-    #             # / self.vq.downsample
-    #         ).long()
-
-    #     # print(features.shape, feature_lengths.shape, torch.max(feature_lengths))
-
-    #     feature_masks = torch.unsqueeze(
-    #         sequence_mask(feature_lengths, features.shape[2]), 1
-    #     ).to(features.dtype)
-
-    #     features = (
-    #         gradient_checkpoint(
-    #             self.encoder, features, feature_masks, use_reentrant=False
-    #         )
-    #         * feature_masks
-    #     )
-    #     vq_features, indices, loss = self.vq(features, feature_masks)
-
-    #     return VQEncodeResult(
-    #         features=vq_features,
-    #         indices=indices,
-    #         loss=loss,
-    #         feature_lengths=feature_lengths,
-    #     )
-
-    # def calculate_audio_lengths(self, feature_lengths):
-    #     return feature_lengths * self.hop_length * self.vq.downsample
-
-    # def decode(
-    #     self,
-    #     indices=None,
-    #     features=None,
-    #     audio_lengths=None,
-    #     feature_lengths=None,
-    #     return_audios=False,
-    # ):
-    #     assert (
-    #         indices is not None or features is not None
-    #     ), "indices or features must be provided"
-    #     assert (
-    #         feature_lengths is not None or audio_lengths is not None
-    #     ), "feature_lengths or audio_lengths must be provided"
-
-    #     if audio_lengths is None:
-    #         audio_lengths = self.calculate_audio_lengths(feature_lengths)
-
-    #     mel_lengths = audio_lengths // self.hop_length
-    #     mel_masks = torch.unsqueeze(
-    #         sequence_mask(mel_lengths, torch.max(mel_lengths)), 1
-    #     ).float()
-
-    #     if indices is not None:
-    #         features = self.vq.decode(indices)
-
-    #     # Sample mels
-    #     decoded = gradient_checkpoint(self.decoder, features, use_reentrant=False)
-
-    #     return VQDecodeResult(
-    #         mels=decoded,
-    #         audios=self.generator(decoded) if return_audios else None,
-    #     )