2 tahun lalu · c6126b4d7c
--- a/README.md
+++ b/README.md
@@ -1,18 +1,17 @@
 
				-# Speech LLM
			
 
				+# Fish Speech
			
 
				+
			
 
				+This repo is still under construction. Please check back later.
			
 
				 
			
 
				 ## Setup
			
 
				 ```bash
			
 
				 # Basic environment setup
			
 
				-conda create -n speech-llm python=3.10
			
 
				-conda activate speech-llm
			
 
				+conda create -n fish-speech python=3.10
			
 
				+conda activate fish-speech
			
 
				 conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
			
 
				 
			
 
				-# Install requirements
			
 
				-pip3 install -r requirements.txt
			
 
				-
			
 
				-# Install flash-attn
			
 
				-MAX_JOBS=4 pip install flash-attn --no-build-isolation
			
 
				+# Install flash-attn (for linux)
			
 
				+pip3 install ninja && MAX_JOBS=4 pip3 install flash-attn --no-build-isolation
			
 
				 
			
 
				-# Install speech-llm
			
 
				+# Install fish-speech
			
 
				 pip3 install -e .
			
 
				 ```
			
--- a/fish_speech/configs/llama_finetune.yaml
+++ b/fish_speech/configs/llama_finetune.yaml
@@ -18,19 +18,9 @@ tokenizer:
 
				 
			
 
				 # Dataset Configuration
			
 
				 train_dataset:
			
 
				-  _target_: fish_speech.datasets.text.InterleaveDataset
			
 
				-  datasets:
			
 
				-    - _target_: fish_speech.datasets.text.TextDataset
			
 
				-      prefix: 'en/'
			
 
				-    - _target_: fish_speech.datasets.text.TextDataset
			
 
				-      prefix: 'zh/'
			
 
				-    - _target_: fish_speech.datasets.text.TextDataset
			
 
				-      prefix: 'ja/'
			
 
				-    - _target_: fish_speech.datasets.text.TextDataset
			
 
				-      repo: fishaudio/cn-hubert-25hz-vq
			
 
				-      prefix: 'data/train'
			
 
				-  probabilities: [0.2, 0.2, 0.2, 0.4]
			
 
				-  seed: 42
			
 
				+  - _target_: fish_speech.datasets.text.TextDataset
			
 
				+    repo: fishaudio/cn-hubert-25hz-vq
			
 
				+    prefix: 'data/train'
			
 
				 
			
 
				 val_dataset:
			
 
				   _target_: fish_speech.datasets.text.TextDataset
			
--- a/fish_speech/models/hubert_vq/lit_module.py
+++ b/fish_speech/models/hubert_vq/lit_module.py
@@ -0,0 +1,347 @@
 
				+from typing import Any, Callable
			
 
				+
			
 
				+import torch
			
 
				+import torch.nn.functional as F
			
 
				+from fish_vocoder.models.vocoder import VocoderModel
			
 
				+from fish_vocoder.modules.losses.stft import MultiResolutionSTFTLoss
			
 
				+from fish_vocoder.utils.grad_norm import grad_norm
			
 
				+from fish_vocoder.utils.mask import sequence_mask
			
 
				+from torch import nn
			
 
				+from torch.utils.checkpoint import checkpoint as gradient_checkpointing
			
 
				+
			
 
				+
			
 
				+class GANModel(VocoderModel):
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        sampling_rate: int,
			
 
				+        n_fft: int,
			
 
				+        hop_length: int,
			
 
				+        win_length: int,
			
 
				+        num_mels: int,
			
 
				+        optimizer: Callable,
			
 
				+        lr_scheduler: Callable,
			
 
				+        mel_transforms: nn.ModuleDict,
			
 
				+        generator: nn.Module,
			
 
				+        discriminators: nn.ModuleDict,
			
 
				+        multi_resolution_stft_loss: MultiResolutionSTFTLoss,
			
 
				+        num_frames: int,
			
 
				+        crop_length: int | None = None,
			
 
				+        checkpointing: bool = False,
			
 
				+        feature_matching: bool = False,
			
 
				+    ):
			
 
				+        super().__init__(
			
 
				+            sampling_rate=sampling_rate,
			
 
				+            n_fft=n_fft,
			
 
				+            hop_length=hop_length,
			
 
				+            win_length=win_length,
			
 
				+            num_mels=num_mels,
			
 
				+        )
			
 
				+
			
 
				+        # Model parameters
			
 
				+        self.optimizer_builder = optimizer
			
 
				+        self.lr_scheduler_builder = lr_scheduler
			
 
				+
			
 
				+        # Spectrogram transforms
			
 
				+        self.mel_transforms = mel_transforms
			
 
				+
			
 
				+        # Generator and discriminators
			
 
				+        # Compile generator so that snake can save memory
			
 
				+        self.generator = generator
			
 
				+        self.discriminators = discriminators
			
 
				+
			
 
				+        # Loss
			
 
				+        self.multi_resolution_stft_loss = multi_resolution_stft_loss
			
 
				+
			
 
				+        # Crop length for saving memory
			
 
				+        self.num_frames = num_frames
			
 
				+        self.crop_length = crop_length
			
 
				+
			
 
				+        # Disable automatic optimization
			
 
				+        self.automatic_optimization = False
			
 
				+
			
 
				+        # Gradient checkpointing
			
 
				+        self.checkpointing = checkpointing
			
 
				+
			
 
				+        # Feature matching
			
 
				+        self.feature_matching = feature_matching
			
 
				+
			
 
				+    def configure_optimizers(self):
			
 
				+        # Need two optimizers and two schedulers
			
 
				+        optimizer_generator = self.optimizer_builder(self.generator.parameters())
			
 
				+        optimizer_discriminator = self.optimizer_builder(
			
 
				+            self.discriminators.parameters()
			
 
				+        )
			
 
				+
			
 
				+        lr_scheduler_generator = self.lr_scheduler_builder(optimizer_generator)
			
 
				+        lr_scheduler_discriminator = self.lr_scheduler_builder(optimizer_discriminator)
			
 
				+
			
 
				+        return (
			
 
				+            {
			
 
				+                "optimizer": optimizer_generator,
			
 
				+                "lr_scheduler": {
			
 
				+                    "scheduler": lr_scheduler_generator,
			
 
				+                    "interval": "step",
			
 
				+                    "name": "optimizer/generator",
			
 
				+                },
			
 
				+            },
			
 
				+            {
			
 
				+                "optimizer": optimizer_discriminator,
			
 
				+                "lr_scheduler": {
			
 
				+                    "scheduler": lr_scheduler_discriminator,
			
 
				+                    "interval": "step",
			
 
				+                    "name": "optimizer/discriminator",
			
 
				+                },
			
 
				+            },
			
 
				+        )
			
 
				+
			
 
				+    def training_generator(self, audio, audio_mask):
			
 
				+        if self.training and self.checkpointing:
			
 
				+            fake_audio, base_loss = gradient_checkpointing(
			
 
				+                self.forward, audio, audio_mask, use_reentrant=False
			
 
				+            )
			
 
				+        else:
			
 
				+            fake_audio, base_loss = self.forward(audio, audio_mask)
			
 
				+
			
 
				+        assert fake_audio.shape == audio.shape
			
 
				+
			
 
				+        # Apply mask
			
 
				+        audio = audio * audio_mask
			
 
				+        fake_audio = fake_audio * audio_mask
			
 
				+
			
 
				+        # Multi-Resolution STFT Loss
			
 
				+        sc_loss, mag_loss = self.multi_resolution_stft_loss(
			
 
				+            fake_audio.squeeze(1), audio.squeeze(1)
			
 
				+        )
			
 
				+        loss_stft = sc_loss + mag_loss
			
 
				+
			
 
				+        self.log(
			
 
				+            "train/generator/stft",
			
 
				+            loss_stft,
			
 
				+            on_step=True,
			
 
				+            on_epoch=False,
			
 
				+            prog_bar=True,
			
 
				+            logger=True,
			
 
				+            sync_dist=True,
			
 
				+        )
			
 
				+
			
 
				+        # L1 Mel-Spectrogram Loss
			
 
				+        # This is not used in backpropagation currently
			
 
				+        audio_mel = self.mel_transforms.loss(audio.squeeze(1))
			
 
				+        fake_audio_mel = self.mel_transforms.loss(fake_audio.squeeze(1))
			
 
				+        loss_mel = F.l1_loss(audio_mel, fake_audio_mel)
			
 
				+
			
 
				+        self.log(
			
 
				+            "train/generator/mel",
			
 
				+            loss_mel,
			
 
				+            on_step=True,
			
 
				+            on_epoch=False,
			
 
				+            prog_bar=True,
			
 
				+            logger=True,
			
 
				+            sync_dist=True,
			
 
				+        )
			
 
				+
			
 
				+        # Now, we need to reduce the length of the audio to save memory
			
 
				+        if self.crop_length is not None and audio.shape[2] > self.crop_length:
			
 
				+            slice_idx = torch.randint(0, audio.shape[-1] - self.crop_length, (1,))
			
 
				+
			
 
				+            audio = audio[..., slice_idx : slice_idx + self.crop_length]
			
 
				+            fake_audio = fake_audio[..., slice_idx : slice_idx + self.crop_length]
			
 
				+            audio_mask = audio_mask[..., slice_idx : slice_idx + self.crop_length]
			
 
				+
			
 
				+            assert audio.shape == fake_audio.shape == audio_mask.shape
			
 
				+
			
 
				+        # Adv Loss
			
 
				+        loss_adv_all = 0
			
 
				+
			
 
				+        for key, disc in self.discriminators.items():
			
 
				+            score_fakes, feat_fake = disc(fake_audio)
			
 
				+
			
 
				+            # Adversarial Loss
			
 
				+            score_fakes = torch.cat(score_fakes, dim=1)
			
 
				+            loss_fake = torch.mean((1 - score_fakes) ** 2)
			
 
				+
			
 
				+            self.log(
			
 
				+                f"train/generator/adv_{key}",
			
 
				+                loss_fake,
			
 
				+                on_step=True,
			
 
				+                on_epoch=False,
			
 
				+                prog_bar=False,
			
 
				+                logger=True,
			
 
				+                sync_dist=True,
			
 
				+            )
			
 
				+
			
 
				+            loss_adv_all += loss_fake
			
 
				+
			
 
				+            if self.feature_matching is False:
			
 
				+                continue
			
 
				+
			
 
				+            # Feature Matching Loss
			
 
				+            _, feat_real = disc(audio)
			
 
				+            loss_fm = 0
			
 
				+            for dr, dg in zip(feat_real, feat_fake):
			
 
				+                for rl, gl in zip(dr, dg):
			
 
				+                    loss_fm += F.l1_loss(rl, gl)
			
 
				+
			
 
				+            loss_fm /= len(feat_real)
			
 
				+
			
 
				+            self.log(
			
 
				+                f"train/generator/adv_fm_{key}",
			
 
				+                loss_fm,
			
 
				+                on_step=True,
			
 
				+                on_epoch=False,
			
 
				+                prog_bar=False,
			
 
				+                logger=True,
			
 
				+                sync_dist=True,
			
 
				+            )
			
 
				+
			
 
				+            loss_adv_all += loss_fm
			
 
				+
			
 
				+        loss_adv_all /= len(self.discriminators)
			
 
				+        loss_gen_all = base_loss + loss_stft * 2.5 + loss_mel * 45 + loss_adv_all
			
 
				+
			
 
				+        self.log(
			
 
				+            "train/generator/all",
			
 
				+            loss_gen_all,
			
 
				+            on_step=True,
			
 
				+            on_epoch=False,
			
 
				+            prog_bar=True,
			
 
				+            logger=True,
			
 
				+            sync_dist=True,
			
 
				+        )
			
 
				+
			
 
				+        return loss_gen_all, audio, fake_audio
			
 
				+
			
 
				+    def training_discriminator(self, audio, fake_audio):
			
 
				+        loss_disc_all = 0
			
 
				+
			
 
				+        for key, disc in self.discriminators.items():
			
 
				+            if self.training and self.checkpointing:
			
 
				+                scores, _ = gradient_checkpointing(disc, audio, use_reentrant=False)
			
 
				+                score_fakes, _ = gradient_checkpointing(
			
 
				+                    disc, fake_audio.detach(), use_reentrant=False
			
 
				+                )
			
 
				+            else:
			
 
				+                scores, _ = disc(audio)
			
 
				+                score_fakes, _ = disc(fake_audio.detach())
			
 
				+
			
 
				+            scores = torch.cat(scores, dim=1)
			
 
				+            score_fakes = torch.cat(score_fakes, dim=1)
			
 
				+            loss_disc = torch.mean((scores - 1) ** 2) + torch.mean((score_fakes) ** 2)
			
 
				+
			
 
				+            self.log(
			
 
				+                f"train/discriminator/{key}",
			
 
				+                loss_disc,
			
 
				+                on_step=True,
			
 
				+                on_epoch=False,
			
 
				+                prog_bar=False,
			
 
				+                logger=True,
			
 
				+                sync_dist=True,
			
 
				+            )
			
 
				+
			
 
				+            loss_disc_all += loss_disc
			
 
				+
			
 
				+        loss_disc_all /= len(self.discriminators)
			
 
				+
			
 
				+        self.log(
			
 
				+            "train/discriminator/all",
			
 
				+            loss_disc_all,
			
 
				+            on_step=True,
			
 
				+            on_epoch=False,
			
 
				+            prog_bar=True,
			
 
				+            logger=True,
			
 
				+            sync_dist=True,
			
 
				+        )
			
 
				+
			
 
				+        return loss_disc_all
			
 
				+
			
 
				+    def training_step(self, batch, batch_idx):
			
 
				+        optim_g, optim_d = self.optimizers()
			
 
				+
			
 
				+        audio, lengths = batch["audio"], batch["lengths"]
			
 
				+        audio_mask = sequence_mask(lengths)[:, None, :].to(audio.device, torch.float32)
			
 
				+
			
 
				+        # Generator
			
 
				+        optim_g.zero_grad()
			
 
				+        loss_gen_all, audio, fake_audio = self.training_generator(audio, audio_mask)
			
 
				+        self.manual_backward(loss_gen_all)
			
 
				+
			
 
				+        self.log(
			
 
				+            "train/generator/grad_norm",
			
 
				+            grad_norm(self.generator.parameters()),
			
 
				+            on_step=True,
			
 
				+            on_epoch=False,
			
 
				+            prog_bar=False,
			
 
				+            logger=True,
			
 
				+            sync_dist=True,
			
 
				+        )
			
 
				+
			
 
				+        self.clip_gradients(
			
 
				+            optim_g, gradient_clip_val=1000, gradient_clip_algorithm="norm"
			
 
				+        )
			
 
				+        optim_g.step()
			
 
				+
			
 
				+        # Discriminator
			
 
				+        assert fake_audio.shape == audio.shape
			
 
				+
			
 
				+        optim_d.zero_grad()
			
 
				+        loss_disc_all = self.training_discriminator(audio, fake_audio)
			
 
				+        self.manual_backward(loss_disc_all)
			
 
				+
			
 
				+        for key, disc in self.discriminators.items():
			
 
				+            self.log(
			
 
				+                f"train/discriminator/grad_norm_{key}",
			
 
				+                grad_norm(disc.parameters()),
			
 
				+                on_step=True,
			
 
				+                on_epoch=False,
			
 
				+                prog_bar=False,
			
 
				+                logger=True,
			
 
				+                sync_dist=True,
			
 
				+            )
			
 
				+
			
 
				+        self.clip_gradients(
			
 
				+            optim_d, gradient_clip_val=1000, gradient_clip_algorithm="norm"
			
 
				+        )
			
 
				+        optim_d.step()
			
 
				+
			
 
				+        # Manual LR Scheduler
			
 
				+        scheduler_g, scheduler_d = self.lr_schedulers()
			
 
				+        scheduler_g.step()
			
 
				+        scheduler_d.step()
			
 
				+
			
 
				+    def forward(self, audio, mask=None, input_spec=None):
			
 
				+        if input_spec is None:
			
 
				+            input_spec = self.mel_transforms.input(audio.squeeze(1))
			
 
				+
			
 
				+        fake_audio = self.generator(input_spec)
			
 
				+
			
 
				+        return fake_audio, 0
			
 
				+
			
 
				+    def validation_step(self, batch: Any, batch_idx: int):
			
 
				+        audio, lengths = batch["audio"], batch["lengths"]
			
 
				+        audio_mask = sequence_mask(lengths)[:, None, :].to(audio.device, torch.float32)
			
 
				+
			
 
				+        # Generator
			
 
				+        fake_audio, _ = self.forward(audio, audio_mask)
			
 
				+        assert fake_audio.shape == audio.shape
			
 
				+
			
 
				+        # Apply mask
			
 
				+        audio = audio * audio_mask
			
 
				+        fake_audio = fake_audio * audio_mask
			
 
				+
			
 
				+        # L1 Mel-Spectrogram Loss
			
 
				+        audio_mel = self.mel_transforms.loss(audio.squeeze(1))
			
 
				+        fake_audio_mel = self.mel_transforms.loss(fake_audio.squeeze(1))
			
 
				+        loss_mel = F.l1_loss(audio_mel, fake_audio_mel)
			
 
				+
			
 
				+        self.log(
			
 
				+            "val/metrics/mel",
			
 
				+            loss_mel,
			
 
				+            on_step=False,
			
 
				+            on_epoch=True,
			
 
				+            prog_bar=True,
			
 
				+            logger=True,
			
 
				+            sync_dist=True,
			
 
				+        )
			
 
				+
			
 
				+        # Report other metrics
			
 
				+        self.report_val_metrics(fake_audio, audio, lengths)
			
--- a/fish_speech/models/hubert_vq/modules.py
+++ b/fish_speech/models/hubert_vq/modules.py
@@ -0,0 +1,573 @@
 
				+import math
			
 
				+
			
 
				+import torch
			
 
				+from torch import nn
			
 
				+from torch.nn import Conv1d, Conv2d, ConvTranspose1d
			
 
				+from torch.nn import functional as F
			
 
				+from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
			
 
				+
			
 
				+from fish_speech.models.hubert_vq.utils import (
			
 
				+    convert_pad_shape,
			
 
				+    get_padding,
			
 
				+    init_weights,
			
 
				+)
			
 
				+
			
 
				+LRELU_SLOPE = 0.1
			
 
				+
			
 
				+
			
 
				+class VQEncoder(nn.Module):
			
 
				+    def __init__(self, *args, **kwargs) -> None:
			
 
				+        super().__init__(*args, **kwargs)
			
 
				+
			
 
				+        encoder_layer = nn.TransformerEncoderLayer(
			
 
				+            d_model=256, nhead=4, dim_feedforward=1024, dropout=0.1, activation="gelu"
			
 
				+        )
			
 
				+        self.encoder = nn.TransformerEncoder(
			
 
				+            encoder_layer, num_layers=6, norm=nn.LayerNorm(256)
			
 
				+        )
			
 
				+
			
 
				+
			
 
				+class RelativeAttention(nn.Module):
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        channels,
			
 
				+        n_heads,
			
 
				+        p_dropout=0.0,
			
 
				+        window_size=4,
			
 
				+        window_heads_share=True,
			
 
				+        proximal_init=True,
			
 
				+        proximal_bias=False,
			
 
				+    ):
			
 
				+        super().__init__()
			
 
				+        assert channels % n_heads == 0
			
 
				+
			
 
				+        self.channels = channels
			
 
				+        self.n_heads = n_heads
			
 
				+        self.p_dropout = p_dropout
			
 
				+        self.window_size = window_size
			
 
				+        self.heads_share = window_heads_share
			
 
				+        self.proximal_init = proximal_init
			
 
				+        self.proximal_bias = proximal_bias
			
 
				+
			
 
				+        self.k_channels = channels // n_heads
			
 
				+        self.qkv = nn.Linear(channels, channels * 3)
			
 
				+        self.drop = nn.Dropout(p_dropout)
			
 
				+
			
 
				+        if window_size is not None:
			
 
				+            n_heads_rel = 1 if window_heads_share else n_heads
			
 
				+            rel_stddev = self.k_channels**-0.5
			
 
				+            self.emb_rel_k = nn.Parameter(
			
 
				+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
			
 
				+                * rel_stddev
			
 
				+            )
			
 
				+            self.emb_rel_v = nn.Parameter(
			
 
				+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
			
 
				+                * rel_stddev
			
 
				+            )
			
 
				+
			
 
				+        nn.init.xavier_uniform_(self.qkv.weight)
			
 
				+
			
 
				+        if proximal_init:
			
 
				+            with torch.no_grad():
			
 
				+                # Sync qk weights
			
 
				+                self.qkv.weight.data[: self.channels] = self.qkv.weight.data[
			
 
				+                    self.channels : self.channels * 2
			
 
				+                ]
			
 
				+                self.qkv.bias.data[: self.channels] = self.qkv.bias.data[
			
 
				+                    self.channels : self.channels * 2
			
 
				+                ]
			
 
				+
			
 
				+    def forward(self, x, key_padding_mask=None):
			
 
				+        # x: (batch, seq_len, channels)
			
 
				+        batch_size, seq_len, _ = x.size()
			
 
				+        qkv = (
			
 
				+            self.qkv(x)
			
 
				+            .reshape(batch_size, seq_len, 3, self.n_heads, self.k_channels)
			
 
				+            .permute(2, 0, 3, 1, 4)
			
 
				+        )
			
 
				+        query, key, value = torch.unbind(qkv, dim=0)
			
 
				+
			
 
				+        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
			
 
				+
			
 
				+        if self.window_size is not None:
			
 
				+            key_relative_embeddings = self._get_relative_embeddings(
			
 
				+                self.emb_rel_k, seq_len
			
 
				+            )
			
 
				+            rel_logits = self._matmul_with_relative_keys(
			
 
				+                query / math.sqrt(self.k_channels), key_relative_embeddings
			
 
				+            )
			
 
				+            scores_local = self._relative_position_to_absolute_position(rel_logits)
			
 
				+            scores = scores + scores_local
			
 
				+
			
 
				+        if self.proximal_bias:
			
 
				+            scores = scores + self._attention_bias_proximal(seq_len).to(
			
 
				+                device=scores.device, dtype=scores.dtype
			
 
				+            )
			
 
				+
			
 
				+        # key_padding_mask: (batch, seq_len)
			
 
				+        if key_padding_mask is not None:
			
 
				+            assert key_padding_mask.size() == (
			
 
				+                batch_size,
			
 
				+                seq_len,
			
 
				+            ), f"key_padding_mask shape {key_padding_mask.size()} is not (batch_size, seq_len)"
			
 
				+            assert (
			
 
				+                key_padding_mask.dtype == torch.bool
			
 
				+            ), f"key_padding_mask dtype {key_padding_mask.dtype} is not bool"
			
 
				+
			
 
				+            key_padding_mask = key_padding_mask.view(batch_size, 1, 1, seq_len).expand(
			
 
				+                -1, self.n_heads, -1, -1
			
 
				+            )
			
 
				+            print(key_padding_mask.shape, scores.shape)
			
 
				+            scores = scores.masked_fill(key_padding_mask, float("-inf"))
			
 
				+
			
 
				+            print(scores[0, 0])
			
 
				+
			
 
				+        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
			
 
				+        p_attn = self.drop(p_attn)
			
 
				+        output = torch.matmul(p_attn, value)
			
 
				+
			
 
				+        if self.window_size is not None:
			
 
				+            relative_weights = self._absolute_position_to_relative_position(p_attn)
			
 
				+            value_relative_embeddings = self._get_relative_embeddings(
			
 
				+                self.emb_rel_v, seq_len
			
 
				+            )
			
 
				+            output = output + self._matmul_with_relative_values(
			
 
				+                relative_weights, value_relative_embeddings
			
 
				+            )
			
 
				+
			
 
				+        return output.reshape(batch_size, seq_len, self.n_heads * self.k_channels)
			
 
				+
			
 
				+    def _matmul_with_relative_values(self, x, y):
			
 
				+        """
			
 
				+        x: [b, h, l, m]
			
 
				+        y: [h or 1, m, d]
			
 
				+        ret: [b, h, l, d]
			
 
				+        """
			
 
				+        ret = torch.matmul(x, y.unsqueeze(0))
			
 
				+        return ret
			
 
				+
			
 
				+    def _matmul_with_relative_keys(self, x, y):
			
 
				+        """
			
 
				+        x: [b, h, l, d]
			
 
				+        y: [h or 1, m, d]
			
 
				+        ret: [b, h, l, m]
			
 
				+        """
			
 
				+        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
			
 
				+        return ret
			
 
				+
			
 
				+    def _get_relative_embeddings(self, relative_embeddings, length):
			
 
				+        max_relative_position = 2 * self.window_size + 1
			
 
				+        # Pad first before slice to avoid using cond ops.
			
 
				+        pad_length = max(length - (self.window_size + 1), 0)
			
 
				+        slice_start_position = max((self.window_size + 1) - length, 0)
			
 
				+        slice_end_position = slice_start_position + 2 * length - 1
			
 
				+        if pad_length > 0:
			
 
				+            padded_relative_embeddings = F.pad(
			
 
				+                relative_embeddings,
			
 
				+                convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
			
 
				+            )
			
 
				+        else:
			
 
				+            padded_relative_embeddings = relative_embeddings
			
 
				+        used_relative_embeddings = padded_relative_embeddings[
			
 
				+            :, slice_start_position:slice_end_position
			
 
				+        ]
			
 
				+        return used_relative_embeddings
			
 
				+
			
 
				+    def _relative_position_to_absolute_position(self, x):
			
 
				+        """
			
 
				+        x: [b, h, l, 2*l-1]
			
 
				+        ret: [b, h, l, l]
			
 
				+        """
			
 
				+        batch, heads, length, _ = x.size()
			
 
				+        # Concat columns of pad to shift from relative to absolute indexing.
			
 
				+        x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
			
 
				+
			
 
				+        # Concat extra elements so to add up to shape (len+1, 2*len-1).
			
 
				+        x_flat = x.view([batch, heads, length * 2 * length])
			
 
				+        x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
			
 
				+
			
 
				+        # Reshape and slice out the padded elements.
			
 
				+        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
			
 
				+            :, :, :length, length - 1 :
			
 
				+        ]
			
 
				+        return x_final
			
 
				+
			
 
				+    def _absolute_position_to_relative_position(self, x):
			
 
				+        """
			
 
				+        x: [b, h, l, l]
			
 
				+        ret: [b, h, l, 2*l-1]
			
 
				+        """
			
 
				+        batch, heads, length, _ = x.size()
			
 
				+        # pad along column
			
 
				+        x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
			
 
				+        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
			
 
				+        # add 0's in the beginning that will skew the elements after reshape
			
 
				+        x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
			
 
				+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
			
 
				+        return x_final
			
 
				+
			
 
				+    def _attention_bias_proximal(self, length):
			
 
				+        """Bias for self-attention to encourage attention to close positions.
			
 
				+        Args:
			
 
				+          length: an integer scalar.
			
 
				+        Returns:
			
 
				+          a Tensor with shape [1, 1, length, length]
			
 
				+        """
			
 
				+        r = torch.arange(length, dtype=torch.float32)
			
 
				+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
			
 
				+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
			
 
				+
			
 
				+
			
 
				+class ResBlock1(torch.nn.Module):
			
 
				+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
			
 
				+        super(ResBlock1, self).__init__()
			
 
				+        self.convs1 = nn.ModuleList(
			
 
				+            [
			
 
				+                weight_norm(
			
 
				+                    Conv1d(
			
 
				+                        channels,
			
 
				+                        channels,
			
 
				+                        kernel_size,
			
 
				+                        1,
			
 
				+                        dilation=dilation[0],
			
 
				+                        padding=get_padding(kernel_size, dilation[0]),
			
 
				+                    )
			
 
				+                ),
			
 
				+                weight_norm(
			
 
				+                    Conv1d(
			
 
				+                        channels,
			
 
				+                        channels,
			
 
				+                        kernel_size,
			
 
				+                        1,
			
 
				+                        dilation=dilation[1],
			
 
				+                        padding=get_padding(kernel_size, dilation[1]),
			
 
				+                    )
			
 
				+                ),
			
 
				+                weight_norm(
			
 
				+                    Conv1d(
			
 
				+                        channels,
			
 
				+                        channels,
			
 
				+                        kernel_size,
			
 
				+                        1,
			
 
				+                        dilation=dilation[2],
			
 
				+                        padding=get_padding(kernel_size, dilation[2]),
			
 
				+                    )
			
 
				+                ),
			
 
				+            ]
			
 
				+        )
			
 
				+        self.convs1.apply(init_weights)
			
 
				+
			
 
				+        self.convs2 = nn.ModuleList(
			
 
				+            [
			
 
				+                weight_norm(
			
 
				+                    Conv1d(
			
 
				+                        channels,
			
 
				+                        channels,
			
 
				+                        kernel_size,
			
 
				+                        1,
			
 
				+                        dilation=1,
			
 
				+                        padding=get_padding(kernel_size, 1),
			
 
				+                    )
			
 
				+                ),
			
 
				+                weight_norm(
			
 
				+                    Conv1d(
			
 
				+                        channels,
			
 
				+                        channels,
			
 
				+                        kernel_size,
			
 
				+                        1,
			
 
				+                        dilation=1,
			
 
				+                        padding=get_padding(kernel_size, 1),
			
 
				+                    )
			
 
				+                ),
			
 
				+                weight_norm(
			
 
				+                    Conv1d(
			
 
				+                        channels,
			
 
				+                        channels,
			
 
				+                        kernel_size,
			
 
				+                        1,
			
 
				+                        dilation=1,
			
 
				+                        padding=get_padding(kernel_size, 1),
			
 
				+                    )
			
 
				+                ),
			
 
				+            ]
			
 
				+        )
			
 
				+        self.convs2.apply(init_weights)
			
 
				+
			
 
				+    def forward(self, x, x_mask=None):
			
 
				+        for c1, c2 in zip(self.convs1, self.convs2):
			
 
				+            xt = F.leaky_relu(x, LRELU_SLOPE)
			
 
				+            if x_mask is not None:
			
 
				+                xt = xt * x_mask
			
 
				+            xt = c1(xt)
			
 
				+            xt = F.leaky_relu(xt, LRELU_SLOPE)
			
 
				+            if x_mask is not None:
			
 
				+                xt = xt * x_mask
			
 
				+            xt = c2(xt)
			
 
				+            x = xt + x
			
 
				+        if x_mask is not None:
			
 
				+            x = x * x_mask
			
 
				+        return x
			
 
				+
			
 
				+    def remove_weight_norm(self):
			
 
				+        for l in self.convs1:
			
 
				+            remove_weight_norm(l)
			
 
				+        for l in self.convs2:
			
 
				+            remove_weight_norm(l)
			
 
				+
			
 
				+
			
 
				+class ResBlock2(torch.nn.Module):
			
 
				+    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
			
 
				+        super(ResBlock2, self).__init__()
			
 
				+        self.convs = nn.ModuleList(
			
 
				+            [
			
 
				+                weight_norm(
			
 
				+                    Conv1d(
			
 
				+                        channels,
			
 
				+                        channels,
			
 
				+                        kernel_size,
			
 
				+                        1,
			
 
				+                        dilation=dilation[0],
			
 
				+                        padding=get_padding(kernel_size, dilation[0]),
			
 
				+                    )
			
 
				+                ),
			
 
				+                weight_norm(
			
 
				+                    Conv1d(
			
 
				+                        channels,
			
 
				+                        channels,
			
 
				+                        kernel_size,
			
 
				+                        1,
			
 
				+                        dilation=dilation[1],
			
 
				+                        padding=get_padding(kernel_size, dilation[1]),
			
 
				+                    )
			
 
				+                ),
			
 
				+            ]
			
 
				+        )
			
 
				+        self.convs.apply(init_weights)
			
 
				+
			
 
				+    def forward(self, x, x_mask=None):
			
 
				+        for c in self.convs:
			
 
				+            xt = F.leaky_relu(x, LRELU_SLOPE)
			
 
				+            if x_mask is not None:
			
 
				+                xt = xt * x_mask
			
 
				+            xt = c(xt)
			
 
				+            x = xt + x
			
 
				+        if x_mask is not None:
			
 
				+            x = x * x_mask
			
 
				+        return x
			
 
				+
			
 
				+    def remove_weight_norm(self):
			
 
				+        for l in self.convs:
			
 
				+            remove_weight_norm(l)
			
 
				+
			
 
				+
			
 
				+class Generator(nn.Module):
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        initial_channel,
			
 
				+        resblock,
			
 
				+        resblock_kernel_sizes,
			
 
				+        resblock_dilation_sizes,
			
 
				+        upsample_rates,
			
 
				+        upsample_initial_channel,
			
 
				+        upsample_kernel_sizes,
			
 
				+        gin_channels=0,
			
 
				+    ):
			
 
				+        super(Generator, self).__init__()
			
 
				+        self.num_kernels = len(resblock_kernel_sizes)
			
 
				+        self.num_upsamples = len(upsample_rates)
			
 
				+        self.conv_pre = Conv1d(
			
 
				+            initial_channel, upsample_initial_channel, 7, 1, padding=3
			
 
				+        )
			
 
				+        resblock = ResBlock1 if resblock == "1" else ResBlock2
			
 
				+
			
 
				+        self.ups = nn.ModuleList()
			
 
				+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
			
 
				+            self.ups.append(
			
 
				+                weight_norm(
			
 
				+                    ConvTranspose1d(
			
 
				+                        upsample_initial_channel // (2**i),
			
 
				+                        upsample_initial_channel // (2 ** (i + 1)),
			
 
				+                        k,
			
 
				+                        u,
			
 
				+                        padding=(k - u) // 2,
			
 
				+                    )
			
 
				+                )
			
 
				+            )
			
 
				+
			
 
				+        self.resblocks = nn.ModuleList()
			
 
				+        for i in range(len(self.ups)):
			
 
				+            ch = upsample_initial_channel // (2 ** (i + 1))
			
 
				+            for j, (k, d) in enumerate(
			
 
				+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
			
 
				+            ):
			
 
				+                self.resblocks.append(resblock(ch, k, d))
			
 
				+
			
 
				+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
			
 
				+        self.ups.apply(init_weights)
			
 
				+
			
 
				+        if gin_channels != 0:
			
 
				+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
			
 
				+
			
 
				+    def forward(self, x, g=None):
			
 
				+        x = self.conv_pre(x)
			
 
				+        if g is not None:
			
 
				+            x = x + self.cond(g)
			
 
				+
			
 
				+        for i in range(self.num_upsamples):
			
 
				+            x = F.leaky_relu(x, LRELU_SLOPE)
			
 
				+            x = self.ups[i](x)
			
 
				+            xs = None
			
 
				+            for j in range(self.num_kernels):
			
 
				+                if xs is None:
			
 
				+                    xs = self.resblocks[i * self.num_kernels + j](x)
			
 
				+                else:
			
 
				+                    xs += self.resblocks[i * self.num_kernels + j](x)
			
 
				+            x = xs / self.num_kernels
			
 
				+        x = F.leaky_relu(x)
			
 
				+        x = self.conv_post(x)
			
 
				+        x = torch.tanh(x)
			
 
				+
			
 
				+        return x
			
 
				+
			
 
				+    def remove_weight_norm(self):
			
 
				+        print("Removing weight norm...")
			
 
				+        for l in self.ups:
			
 
				+            remove_weight_norm(l)
			
 
				+        for l in self.resblocks:
			
 
				+            l.remove_weight_norm()
			
 
				+
			
 
				+
			
 
				+class DiscriminatorP(nn.Module):
			
 
				+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
			
 
				+        super(DiscriminatorP, self).__init__()
			
 
				+        self.period = period
			
 
				+        self.use_spectral_norm = use_spectral_norm
			
 
				+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
			
 
				+        self.convs = nn.ModuleList(
			
 
				+            [
			
 
				+                norm_f(
			
 
				+                    Conv2d(
			
 
				+                        1,
			
 
				+                        32,
			
 
				+                        (kernel_size, 1),
			
 
				+                        (stride, 1),
			
 
				+                        padding=(get_padding(kernel_size, 1), 0),
			
 
				+                    )
			
 
				+                ),
			
 
				+                norm_f(
			
 
				+                    Conv2d(
			
 
				+                        32,
			
 
				+                        128,
			
 
				+                        (kernel_size, 1),
			
 
				+                        (stride, 1),
			
 
				+                        padding=(get_padding(kernel_size, 1), 0),
			
 
				+                    )
			
 
				+                ),
			
 
				+                norm_f(
			
 
				+                    Conv2d(
			
 
				+                        128,
			
 
				+                        512,
			
 
				+                        (kernel_size, 1),
			
 
				+                        (stride, 1),
			
 
				+                        padding=(get_padding(kernel_size, 1), 0),
			
 
				+                    )
			
 
				+                ),
			
 
				+                norm_f(
			
 
				+                    Conv2d(
			
 
				+                        512,
			
 
				+                        1024,
			
 
				+                        (kernel_size, 1),
			
 
				+                        (stride, 1),
			
 
				+                        padding=(get_padding(kernel_size, 1), 0),
			
 
				+                    )
			
 
				+                ),
			
 
				+                norm_f(
			
 
				+                    Conv2d(
			
 
				+                        1024,
			
 
				+                        1024,
			
 
				+                        (kernel_size, 1),
			
 
				+                        1,
			
 
				+                        padding=(get_padding(kernel_size, 1), 0),
			
 
				+                    )
			
 
				+                ),
			
 
				+            ]
			
 
				+        )
			
 
				+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        fmap = []
			
 
				+
			
 
				+        # 1d to 2d
			
 
				+        b, c, t = x.shape
			
 
				+        if t % self.period != 0:  # pad first
			
 
				+            n_pad = self.period - (t % self.period)
			
 
				+            x = F.pad(x, (0, n_pad), "reflect")
			
 
				+            t = t + n_pad
			
 
				+        x = x.view(b, c, t // self.period, self.period)
			
 
				+
			
 
				+        for l in self.convs:
			
 
				+            x = l(x)
			
 
				+            x = F.leaky_relu(x, LRELU_SLOPE)
			
 
				+            fmap.append(x)
			
 
				+        x = self.conv_post(x)
			
 
				+        fmap.append(x)
			
 
				+        x = torch.flatten(x, 1, -1)
			
 
				+
			
 
				+        return x, fmap
			
 
				+
			
 
				+
			
 
				+class DiscriminatorS(nn.Module):
			
 
				+    def __init__(self, use_spectral_norm=False):
			
 
				+        super(DiscriminatorS, self).__init__()
			
 
				+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
			
 
				+        self.convs = nn.ModuleList(
			
 
				+            [
			
 
				+                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
			
 
				+                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
			
 
				+                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
			
 
				+                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
			
 
				+                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
			
 
				+                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
			
 
				+            ]
			
 
				+        )
			
 
				+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        fmap = []
			
 
				+
			
 
				+        for l in self.convs:
			
 
				+            x = l(x)
			
 
				+            x = F.leaky_relu(x, LRELU_SLOPE)
			
 
				+            fmap.append(x)
			
 
				+        x = self.conv_post(x)
			
 
				+        fmap.append(x)
			
 
				+        x = torch.flatten(x, 1, -1)
			
 
				+
			
 
				+        return x, fmap
			
 
				+
			
 
				+
			
 
				+class EnsembleDiscriminator(nn.Module):
			
 
				+    def __init__(self, use_spectral_norm=False):
			
 
				+        super(EnsembleDiscriminator, self).__init__()
			
 
				+        periods = [2, 3, 5, 7, 11]
			
 
				+
			
 
				+        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
			
 
				+        discs = discs + [
			
 
				+            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
			
 
				+        ]
			
 
				+        self.discriminators = nn.ModuleList(discs)
			
 
				+
			
 
				+    def forward(self, y, y_hat):
			
 
				+        y_d_rs = []
			
 
				+        y_d_gs = []
			
 
				+        fmap_rs = []
			
 
				+        fmap_gs = []
			
 
				+        for i, d in enumerate(self.discriminators):
			
 
				+            y_d_r, fmap_r = d(y)
			
 
				+            y_d_g, fmap_g = d(y_hat)
			
 
				+            y_d_rs.append(y_d_r)
			
 
				+            y_d_gs.append(y_d_g)
			
 
				+            fmap_rs.append(fmap_r)
			
 
				+            fmap_gs.append(fmap_g)
			
 
				+
			
 
				+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
			
--- a/fish_speech/models/hubert_vq/utils.py
+++ b/fish_speech/models/hubert_vq/utils.py
@@ -0,0 +1,163 @@
 
				+import torch
			
 
				+import torch.utils.data
			
 
				+from librosa.filters import mel as librosa_mel_fn
			
 
				+
			
 
				+
			
 
				+def convert_pad_shape(pad_shape):
			
 
				+    l = pad_shape[::-1]
			
 
				+    pad_shape = [item for sublist in l for item in sublist]
			
 
				+    return pad_shape
			
 
				+
			
 
				+
			
 
				+def sequence_mask(length, max_length=None):
			
 
				+    if max_length is None:
			
 
				+        max_length = length.max()
			
 
				+    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
			
 
				+    return x.unsqueeze(0) < length.unsqueeze(1)
			
 
				+
			
 
				+
			
 
				+def init_weights(m, mean=0.0, std=0.01):
			
 
				+    classname = m.__class__.__name__
			
 
				+    if classname.find("Conv") != -1:
			
 
				+        m.weight.data.normal_(mean, std)
			
 
				+
			
 
				+
			
 
				+def get_padding(kernel_size, dilation=1):
			
 
				+    return int((kernel_size * dilation - dilation) / 2)
			
 
				+
			
 
				+
			
 
				+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
			
 
				+    """
			
 
				+    PARAMS
			
 
				+    ------
			
 
				+    C: compression factor
			
 
				+    """
			
 
				+    return torch.log(torch.clamp(x, min=clip_val) * C)
			
 
				+
			
 
				+
			
 
				+def dynamic_range_decompression_torch(x, C=1):
			
 
				+    """
			
 
				+    PARAMS
			
 
				+    ------
			
 
				+    C: compression factor used to compress
			
 
				+    """
			
 
				+    return torch.exp(x) / C
			
 
				+
			
 
				+
			
 
				+def spectral_normalize_torch(magnitudes):
			
 
				+    output = dynamic_range_compression_torch(magnitudes)
			
 
				+    return output
			
 
				+
			
 
				+
			
 
				+def spectral_de_normalize_torch(magnitudes):
			
 
				+    output = dynamic_range_decompression_torch(magnitudes)
			
 
				+    return output
			
 
				+
			
 
				+
			
 
				+mel_basis = {}
			
 
				+hann_window = {}
			
 
				+
			
 
				+
			
 
				+def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
			
 
				+    if torch.min(y) < -1.0:
			
 
				+        print("min value is ", torch.min(y))
			
 
				+    if torch.max(y) > 1.0:
			
 
				+        print("max value is ", torch.max(y))
			
 
				+
			
 
				+    global hann_window
			
 
				+    dtype_device = str(y.dtype) + "_" + str(y.device)
			
 
				+    wnsize_dtype_device = str(win_size) + "_" + dtype_device
			
 
				+    if wnsize_dtype_device not in hann_window:
			
 
				+        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
			
 
				+            dtype=y.dtype, device=y.device
			
 
				+        )
			
 
				+
			
 
				+    y = torch.nn.functional.pad(
			
 
				+        y.unsqueeze(1),
			
 
				+        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
			
 
				+        mode="reflect",
			
 
				+    )
			
 
				+    y = y.squeeze(1)
			
 
				+    spec = torch.stft(
			
 
				+        y,
			
 
				+        n_fft,
			
 
				+        hop_length=hop_size,
			
 
				+        win_length=win_size,
			
 
				+        window=hann_window[wnsize_dtype_device],
			
 
				+        center=center,
			
 
				+        pad_mode="reflect",
			
 
				+        normalized=False,
			
 
				+        onesided=True,
			
 
				+        return_complex=False,
			
 
				+    )
			
 
				+
			
 
				+    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
			
 
				+    return spec
			
 
				+
			
 
				+
			
 
				+def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
			
 
				+    global mel_basis
			
 
				+    dtype_device = str(spec.dtype) + "_" + str(spec.device)
			
 
				+    fmax_dtype_device = str(fmax) + "_" + dtype_device
			
 
				+    if fmax_dtype_device not in mel_basis:
			
 
				+        mel = librosa_mel_fn(
			
 
				+            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
			
 
				+        )
			
 
				+        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
			
 
				+            dtype=spec.dtype, device=spec.device
			
 
				+        )
			
 
				+    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
			
 
				+    spec = spectral_normalize_torch(spec)
			
 
				+    return spec
			
 
				+
			
 
				+
			
 
				+def mel_spectrogram_torch(
			
 
				+    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
			
 
				+):
			
 
				+    if torch.min(y) < -1.0:
			
 
				+        print("min value is ", torch.min(y))
			
 
				+    if torch.max(y) > 1.0:
			
 
				+        print("max value is ", torch.max(y))
			
 
				+
			
 
				+    global mel_basis, hann_window
			
 
				+    dtype_device = str(y.dtype) + "_" + str(y.device)
			
 
				+    fmax_dtype_device = str(fmax) + "_" + dtype_device
			
 
				+    wnsize_dtype_device = str(win_size) + "_" + dtype_device
			
 
				+    if fmax_dtype_device not in mel_basis:
			
 
				+        mel = librosa_mel_fn(
			
 
				+            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
			
 
				+        )
			
 
				+        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
			
 
				+            dtype=y.dtype, device=y.device
			
 
				+        )
			
 
				+    if wnsize_dtype_device not in hann_window:
			
 
				+        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
			
 
				+            dtype=y.dtype, device=y.device
			
 
				+        )
			
 
				+
			
 
				+    y = torch.nn.functional.pad(
			
 
				+        y.unsqueeze(1),
			
 
				+        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
			
 
				+        mode="reflect",
			
 
				+    )
			
 
				+    y = y.squeeze(1)
			
 
				+
			
 
				+    spec = torch.stft(
			
 
				+        y,
			
 
				+        n_fft,
			
 
				+        hop_length=hop_size,
			
 
				+        win_length=win_size,
			
 
				+        window=hann_window[wnsize_dtype_device],
			
 
				+        center=center,
			
 
				+        pad_mode="reflect",
			
 
				+        normalized=False,
			
 
				+        onesided=True,
			
 
				+        return_complex=False,
			
 
				+    )
			
 
				+
			
 
				+    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
			
 
				+
			
 
				+    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
			
 
				+    spec = spectral_normalize_torch(spec)
			
 
				+
			
 
				+    return spec
			
--- a/fish_speech/models/whisper_vq.py
+++ b/fish_speech/models/whisper_vq.py
@@ -1,216 +0,0 @@
 
				-from dataclasses import dataclass
			
 
				-from typing import Optional
			
 
				-
			
 
				-import torch
			
 
				-from torch import nn
			
 
				-from vector_quantize_pytorch import VectorQuantize
			
 
				-
			
 
				-from fish_speech.modules.flash_whisper import (
			
 
				-    FlashWhisperEncoderLayer,
			
 
				-    FlashWhisperForConditionalGeneration,
			
 
				-)
			
 
				-
			
 
				-
			
 
				-@dataclass
			
 
				-class WhisperVQOutput:
			
 
				-    loss: torch.Tensor
			
 
				-    metrics: dict[str, torch.Tensor]
			
 
				-
			
 
				-
			
 
				-class WhisperVQ(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        model_name_or_path: str = "openai/whisper-medium",
			
 
				-        # Quantization
			
 
				-        codebook_dim: int = 32,
			
 
				-        codebook_size: int = 4096,
			
 
				-        codebook_decay: float = 0.9,
			
 
				-        threshold_ema_dead_code: int = 0,
			
 
				-        use_cosine_similarity: bool = True,
			
 
				-        downsample: bool = True,
			
 
				-        # Attention
			
 
				-        post_attention_depth: int = 2,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-
			
 
				-        self.whisper = FlashWhisperForConditionalGeneration.from_pretrained(
			
 
				-            model_name_or_path
			
 
				-        )
			
 
				-        self.whisper.gradient_checkpointing_enable()
			
 
				-
			
 
				-        # Freeze Whisper
			
 
				-        for param in self.whisper.parameters():
			
 
				-            param.requires_grad = False
			
 
				-
			
 
				-        # Store vars
			
 
				-        self.downsample = downsample
			
 
				-        self.codebook_dim = codebook_dim
			
 
				-        self.codebook_size = codebook_size
			
 
				-
			
 
				-        # Pre-quantization
			
 
				-        whisper_config = self.whisper.model.config
			
 
				-        encoder_width = whisper_config.encoder_attention_heads * 64
			
 
				-
			
 
				-        self.pre_ln = nn.LayerNorm(encoder_width)
			
 
				-        self.pre_mlp = nn.Sequential(
			
 
				-            nn.Linear(encoder_width, whisper_config.encoder_ffn_dim),
			
 
				-            nn.GELU(),
			
 
				-            nn.Linear(whisper_config.encoder_ffn_dim, encoder_width),
			
 
				-        )
			
 
				-
			
 
				-        # Quantization
			
 
				-        self.quantizer = VectorQuantize(
			
 
				-            dim=encoder_width,
			
 
				-            codebook_size=codebook_size,
			
 
				-            codebook_dim=codebook_dim,
			
 
				-            decay=codebook_decay,
			
 
				-            commitment_weight=1.0,
			
 
				-            threshold_ema_dead_code=threshold_ema_dead_code,
			
 
				-            use_cosine_sim=use_cosine_similarity,
			
 
				-        )
			
 
				-        self.pad_embedding = nn.Parameter(torch.randn(encoder_width))
			
 
				-
			
 
				-        # Post-quantization
			
 
				-        self.post_positional_embedding = nn.Embedding(
			
 
				-            whisper_config.max_source_positions, encoder_width
			
 
				-        )
			
 
				-        self.post_attention = nn.Sequential(
			
 
				-            *[
			
 
				-                FlashWhisperEncoderLayer(
			
 
				-                    config=whisper_config,
			
 
				-                )
			
 
				-                for _ in range(post_attention_depth)
			
 
				-            ]
			
 
				-        )
			
 
				-        self.post_ln = nn.LayerNorm(encoder_width)
			
 
				-
			
 
				-    def encode(
			
 
				-        self,
			
 
				-        input_features: Optional[torch.Tensor],
			
 
				-        attention_mask: Optional[torch.Tensor] = None,
			
 
				-    ) -> torch.Tensor:
			
 
				-        if attention_mask is not None:
			
 
				-            assert attention_mask.ndim == 2, "Attention mask must be 2D"
			
 
				-
			
 
				-            # Whisper will downsample by 2
			
 
				-            attention_mask = attention_mask[:, ::2]
			
 
				-
			
 
				-        with torch.no_grad():
			
 
				-            hidden_states = self.whisper.model.encoder(
			
 
				-                input_features,
			
 
				-            ).last_hidden_state
			
 
				-
			
 
				-            x = hidden_states
			
 
				-            if self.downsample:
			
 
				-                x = x.reshape(x.shape[0], x.shape[1] // 2, 2, x.shape[2]).mean(dim=2)
			
 
				-
			
 
				-                if attention_mask is not None:
			
 
				-                    attention_mask = attention_mask[:, ::2]
			
 
				-
			
 
				-        x = x + self.pre_mlp(self.pre_ln(x))
			
 
				-        quantized, indices, loss = self.quantizer(
			
 
				-            x, mask=attention_mask.bool() if attention_mask is not None else None
			
 
				-        )
			
 
				-
			
 
				-        # Fill masked positions with pad embedding
			
 
				-        if attention_mask is not None:
			
 
				-            quantized[attention_mask == 0] = self.pad_embedding
			
 
				-
			
 
				-        return quantized, indices, loss, hidden_states
			
 
				-
			
 
				-    def decode(
			
 
				-        self,
			
 
				-        hidden_states: torch.Tensor,
			
 
				-    ) -> torch.Tensor:
			
 
				-        # Upsample
			
 
				-        if self.downsample:
			
 
				-            hidden_states = hidden_states.repeat_interleave(2, dim=1)
			
 
				-
			
 
				-        # Inject position embeddings
			
 
				-        positions = torch.arange(
			
 
				-            0, hidden_states.shape[1], dtype=torch.long, device=hidden_states.device
			
 
				-        )
			
 
				-        x = hidden_states + self.post_positional_embedding(positions)
			
 
				-
			
 
				-        # Decode
			
 
				-        for layer in self.post_attention:
			
 
				-            x = layer(x, None, None)[0]
			
 
				-        hidden_states = self.post_ln(hidden_states)
			
 
				-
			
 
				-        return hidden_states
			
 
				-
			
 
				-    def forward(
			
 
				-        self,
			
 
				-        input_features: torch.Tensor,
			
 
				-        encoder_attention_mask: torch.Tensor,
			
 
				-        decoder_input_ids: torch.Tensor,
			
 
				-        decoder_attention_mask: torch.Tensor,
			
 
				-        labels: torch.Tensor,
			
 
				-        # Audio, not used here
			
 
				-        input_values: Optional[torch.Tensor] = None,
			
 
				-    ) -> WhisperVQOutput:
			
 
				-        quantize, _, vq_loss, teacher_hidden_states = self.encode(
			
 
				-            input_features=input_features,
			
 
				-            attention_mask=encoder_attention_mask,
			
 
				-        )
			
 
				-        vq_hidden_states = self.decode(quantize)
			
 
				-
			
 
				-        # student cross entropy loss
			
 
				-        outputs = self.whisper(
			
 
				-            encoder_outputs=(vq_hidden_states,),
			
 
				-            decoder_input_ids=decoder_input_ids,
			
 
				-            decoder_attention_mask=decoder_attention_mask,
			
 
				-            labels=labels,
			
 
				-        )
			
 
				-        student_ce_loss = outputs.loss
			
 
				-        student_logits = outputs.logits
			
 
				-
			
 
				-        # teacher cross entropy loss
			
 
				-        with torch.no_grad():
			
 
				-            outputs = self.whisper(
			
 
				-                encoder_outputs=(teacher_hidden_states,),
			
 
				-                decoder_input_ids=decoder_input_ids,
			
 
				-                decoder_attention_mask=decoder_attention_mask,
			
 
				-                labels=labels,
			
 
				-            )
			
 
				-            teacher_ce_loss = outputs.loss
			
 
				-            teacher_logits = outputs.logits
			
 
				-
			
 
				-        # KL divergence
			
 
				-        kl_loss = nn.functional.kl_div(
			
 
				-            nn.functional.log_softmax(student_logits, dim=-1),
			
 
				-            nn.functional.softmax(teacher_logits, dim=-1),
			
 
				-            reduction="batchmean",
			
 
				-        )
			
 
				-
			
 
				-        loss = vq_loss + student_ce_loss + kl_loss
			
 
				-
			
 
				-        return WhisperVQOutput(
			
 
				-            loss=loss,
			
 
				-            metrics={
			
 
				-                "vq_loss": vq_loss,
			
 
				-                "student_ce_loss": student_ce_loss,
			
 
				-                "teacher_ce_loss": teacher_ce_loss,
			
 
				-                "kl_loss": kl_loss,
			
 
				-            },
			
 
				-        )
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    from torch.utils.data import DataLoader
			
 
				-    from transformers import WhisperProcessor
			
 
				-
			
 
				-    from fish_speech.datasets.whisper_vq import WhisperVQCollator, WhisperVQDataset
			
 
				-
			
 
				-    processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
			
 
				-    model = WhisperVQ()
			
 
				-
			
 
				-    ds = WhisperVQDataset(
			
 
				-        "filelists/whisper-vq.train.test.filelist", "openai/whisper-medium"
			
 
				-    )
			
 
				-    loader = DataLoader(ds, batch_size=8, collate_fn=WhisperVQCollator())
			
 
				-
			
 
				-    for batch in loader:
			
 
				-        output = model(**batch)
			
 
				-        print(output)
			
 
				-        break
			
--- a/fish_speech/modules/flash_whisper.py
+++ b/fish_speech/modules/flash_whisper.py
@@ -1,313 +0,0 @@
 
				-# A whisper that supports flash-attention and dynamic input length.
			
 
				-from typing import Optional, Tuple, Union
			
 
				-
			
 
				-import numpy as np
			
 
				-import torch
			
 
				-import torch.nn.functional as F
			
 
				-from torch import nn
			
 
				-from transformers.modeling_outputs import BaseModelOutput
			
 
				-from transformers.models.whisper.modeling_whisper import (
			
 
				-    WhisperAttention,
			
 
				-    WhisperConfig,
			
 
				-    WhisperDecoder,
			
 
				-    WhisperDecoderLayer,
			
 
				-    WhisperEncoder,
			
 
				-    WhisperEncoderLayer,
			
 
				-    WhisperForConditionalGeneration,
			
 
				-    WhisperModel,
			
 
				-)
			
 
				-from transformers.utils import logging
			
 
				-
			
 
				-logger = logging.get_logger(__name__)
			
 
				-
			
 
				-
			
 
				-class FlashWhisperAttention(WhisperAttention):
			
 
				-    """Multi-headed attention from 'Attention Is All You Need' paper"""
			
 
				-
			
 
				-    # Copied from transformers.models.bart.modeling_bart.BartAttention.forward with BART->whisper
			
 
				-    def forward(
			
 
				-        self,
			
 
				-        hidden_states: torch.Tensor,
			
 
				-        key_value_states: Optional[torch.Tensor] = None,
			
 
				-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
			
 
				-        attention_mask: Optional[torch.Tensor] = None,
			
 
				-        layer_head_mask: Optional[torch.Tensor] = None,
			
 
				-        output_attentions: bool = False,
			
 
				-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
			
 
				-        """Input shape: Batch x Time x Channel"""
			
 
				-
			
 
				-        # if key_value_states are provided this layer is used as a cross-attention layer
			
 
				-        # for the decoder
			
 
				-        is_cross_attention = key_value_states is not None
			
 
				-
			
 
				-        bsz, tgt_len, _ = hidden_states.size()
			
 
				-
			
 
				-        # get query proj - don't scale here since Flash Attention performs this under the hood
			
 
				-        query_states = self._shape(self.q_proj(hidden_states), -1, bsz)
			
 
				-
			
 
				-        # get key, value proj
			
 
				-        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
			
 
				-        # is checking that the `sequence_length` of the `past_key_value` is the same as
			
 
				-        # the provided `key_value_states` to support prefix tuning
			
 
				-        if (
			
 
				-            is_cross_attention
			
 
				-            and past_key_value is not None
			
 
				-            and past_key_value[0].shape[2] == key_value_states.shape[1]
			
 
				-        ):
			
 
				-            # reuse k,v, cross_attentions
			
 
				-            key_states = past_key_value[0]
			
 
				-            value_states = past_key_value[1]
			
 
				-        elif is_cross_attention:
			
 
				-            # cross_attentions
			
 
				-            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
			
 
				-            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
			
 
				-        elif past_key_value is not None:
			
 
				-            # reuse k, v, self_attention
			
 
				-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
			
 
				-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
			
 
				-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
			
 
				-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
			
 
				-        else:
			
 
				-            # self_attention
			
 
				-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
			
 
				-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
			
 
				-
			
 
				-        if self.is_decoder:
			
 
				-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
			
 
				-            # Further calls to cross_attention layer can then reuse all cross-attention
			
 
				-            # key/value_states (first "if" case)
			
 
				-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
			
 
				-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
			
 
				-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
			
 
				-            # if encoder bi-directional self-attention `past_key_value` is always `None`
			
 
				-            past_key_value = (key_states, value_states)
			
 
				-
			
 
				-        attn_output = F.scaled_dot_product_attention(
			
 
				-            query=query_states,
			
 
				-            key=key_states,
			
 
				-            value=value_states,
			
 
				-            attn_mask=attention_mask,
			
 
				-            scale=self.scaling,
			
 
				-        )
			
 
				-
			
 
				-        attn_output = attn_output.transpose(1, 2)
			
 
				-
			
 
				-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
			
 
				-        # partitioned across GPUs when using tensor-parallelism.
			
 
				-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
			
 
				-
			
 
				-        attn_output = self.out_proj(attn_output)
			
 
				-
			
 
				-        return attn_output, None, past_key_value
			
 
				-
			
 
				-
			
 
				-# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Whisper
			
 
				-class FlashWhisperEncoderLayer(WhisperEncoderLayer):
			
 
				-    def __init__(self, config: WhisperConfig):
			
 
				-        super().__init__(config)
			
 
				-
			
 
				-        self.self_attn = FlashWhisperAttention(
			
 
				-            embed_dim=self.embed_dim,
			
 
				-            num_heads=config.encoder_attention_heads,
			
 
				-            dropout=config.attention_dropout,
			
 
				-        )
			
 
				-
			
 
				-
			
 
				-class FlashWhisperDecoderLayer(WhisperDecoderLayer):
			
 
				-    def __init__(self, config: WhisperConfig):
			
 
				-        super().__init__(config)
			
 
				-
			
 
				-        self.self_attn = FlashWhisperAttention(
			
 
				-            embed_dim=self.embed_dim,
			
 
				-            num_heads=config.decoder_attention_heads,
			
 
				-            dropout=config.attention_dropout,
			
 
				-            is_decoder=True,
			
 
				-        )
			
 
				-
			
 
				-
			
 
				-class FlashWhisperEncoder(WhisperEncoder):
			
 
				-    """
			
 
				-    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
			
 
				-    [`WhisperEncoderLayer`].
			
 
				-
			
 
				-    Args:
			
 
				-        config: WhisperConfig
			
 
				-    """
			
 
				-
			
 
				-    def __init__(self, config: WhisperConfig):
			
 
				-        super().__init__(config)
			
 
				-
			
 
				-        self.layers = nn.ModuleList(
			
 
				-            [FlashWhisperEncoderLayer(config) for _ in range(config.encoder_layers)]
			
 
				-        )
			
 
				-
			
 
				-    def forward(
			
 
				-        self,
			
 
				-        input_features,
			
 
				-        attention_mask=None,
			
 
				-        head_mask=None,
			
 
				-        output_attentions=None,
			
 
				-        output_hidden_states=None,
			
 
				-        return_dict=None,
			
 
				-    ):
			
 
				-        r"""
			
 
				-        Args:
			
 
				-            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
			
 
				-                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
			
 
				-                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
			
 
				-                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
			
 
				-                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
			
 
				-                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
			
 
				-            attention_mask (`torch.Tensor`)`, *optional*):
			
 
				-                Whisper does not support masking of the `input_features`, this argument is preserved for compatibility,
			
 
				-                but it is not used. By default the silence in the input log mel spectrogram are ignored.
			
 
				-            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
			
 
				-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
			
 
				-
			
 
				-                - 1 indicates the head is **not masked**,
			
 
				-                - 0 indicates the head is **masked**.
			
 
				-            output_attentions (`bool`, *optional*):
			
 
				-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
			
 
				-                returned tensors for more detail.
			
 
				-            output_hidden_states (`bool`, *optional*):
			
 
				-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
			
 
				-                for more detail.
			
 
				-            return_dict (`bool`, *optional*):
			
 
				-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
			
 
				-        """
			
 
				-
			
 
				-        # If we receive the output of input feature directly, just return it
			
 
				-        if input_features.shape[-2:] == (1500, 1024):
			
 
				-            if not return_dict:
			
 
				-                return (input_features,)
			
 
				-
			
 
				-            return BaseModelOutput(last_hidden_state=input_features)
			
 
				-
			
 
				-        output_attentions = (
			
 
				-            output_attentions
			
 
				-            if output_attentions is not None
			
 
				-            else self.config.output_attentions
			
 
				-        )
			
 
				-        output_hidden_states = (
			
 
				-            output_hidden_states
			
 
				-            if output_hidden_states is not None
			
 
				-            else self.config.output_hidden_states
			
 
				-        )
			
 
				-        return_dict = (
			
 
				-            return_dict if return_dict is not None else self.config.use_return_dict
			
 
				-        )
			
 
				-        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
			
 
				-        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
			
 
				-
			
 
				-        inputs_embeds = inputs_embeds.permute(0, 2, 1)
			
 
				-        embed_pos = self.embed_positions.weight
			
 
				-
			
 
				-        hidden_states = inputs_embeds + embed_pos[None, : inputs_embeds.size(1), :]
			
 
				-        hidden_states = nn.functional.dropout(
			
 
				-            hidden_states, p=self.dropout, training=self.training
			
 
				-        )
			
 
				-
			
 
				-        encoder_states = () if output_hidden_states else None
			
 
				-        all_attentions = () if output_attentions else None
			
 
				-
			
 
				-        # check if head_mask has a correct number of layers specified if desired
			
 
				-        if head_mask is not None:
			
 
				-            assert head_mask.size()[0] == (
			
 
				-                len(self.layers)
			
 
				-            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
			
 
				-
			
 
				-        for idx, encoder_layer in enumerate(self.layers):
			
 
				-            if output_hidden_states:
			
 
				-                encoder_states = encoder_states + (hidden_states,)
			
 
				-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
			
 
				-            to_drop = False
			
 
				-            if self.training:
			
 
				-                dropout_probability = torch.rand([])
			
 
				-                if dropout_probability < self.layerdrop:  # skip the layer
			
 
				-                    to_drop = True
			
 
				-
			
 
				-            if to_drop:
			
 
				-                layer_outputs = (None, None)
			
 
				-            else:
			
 
				-                if self.gradient_checkpointing and self.training:
			
 
				-
			
 
				-                    def create_custom_forward(module):
			
 
				-                        def custom_forward(*inputs):
			
 
				-                            return module(*inputs, output_attentions)
			
 
				-
			
 
				-                        return custom_forward
			
 
				-
			
 
				-                    layer_outputs = torch.utils.checkpoint.checkpoint(
			
 
				-                        create_custom_forward(encoder_layer),
			
 
				-                        hidden_states,
			
 
				-                        None,
			
 
				-                        (head_mask[idx] if head_mask is not None else None),
			
 
				-                    )
			
 
				-                else:
			
 
				-                    layer_outputs = encoder_layer(
			
 
				-                        hidden_states,
			
 
				-                        None,
			
 
				-                        layer_head_mask=(
			
 
				-                            head_mask[idx] if head_mask is not None else None
			
 
				-                        ),
			
 
				-                        output_attentions=output_attentions,
			
 
				-                    )
			
 
				-
			
 
				-                hidden_states = layer_outputs[0]
			
 
				-
			
 
				-            if output_attentions:
			
 
				-                all_attentions = all_attentions + (layer_outputs[1],)
			
 
				-
			
 
				-        hidden_states = self.layer_norm(hidden_states)
			
 
				-
			
 
				-        # Simply set states to zero for attention_mask
			
 
				-        # hidden_states[:, 40:, :] = 0
			
 
				-
			
 
				-        if output_hidden_states:
			
 
				-            encoder_states = encoder_states + (hidden_states,)
			
 
				-
			
 
				-        if not return_dict:
			
 
				-            return tuple(
			
 
				-                v
			
 
				-                for v in [hidden_states, encoder_states, all_attentions]
			
 
				-                if v is not None
			
 
				-            )
			
 
				-        return BaseModelOutput(
			
 
				-            last_hidden_state=hidden_states,
			
 
				-            hidden_states=encoder_states,
			
 
				-            attentions=all_attentions,
			
 
				-        )
			
 
				-
			
 
				-
			
 
				-class FlashWhisperDecoder(WhisperDecoder):
			
 
				-    """
			
 
				-    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
			
 
				-    [`WhisperDecoderLayer`]
			
 
				-
			
 
				-    Args:
			
 
				-        config: WhisperConfig
			
 
				-    """
			
 
				-
			
 
				-    def __init__(self, config: WhisperConfig):
			
 
				-        super().__init__(config)
			
 
				-
			
 
				-        self.layers = nn.ModuleList(
			
 
				-            [FlashWhisperDecoderLayer(config) for _ in range(config.decoder_layers)]
			
 
				-        )
			
 
				-
			
 
				-
			
 
				-class FlashWhisperModel(WhisperModel):
			
 
				-    def __init__(self, config: WhisperConfig):
			
 
				-        super().__init__(config)
			
 
				-
			
 
				-        self.encoder = FlashWhisperEncoder(config)
			
 
				-        self.decoder = FlashWhisperDecoder(config)
			
 
				-        self.post_init()
			
 
				-
			
 
				-
			
 
				-class FlashWhisperForConditionalGeneration(WhisperForConditionalGeneration):
			
 
				-    def __init__(self, config: WhisperConfig):
			
 
				-        super().__init__(config)
			
 
				-
			
 
				-        self.model = FlashWhisperModel(config)
			
 
				-        self.post_init()
			
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,40 @@
 
				+[project]
			
 
				+name = "fish-speech"
			
 
				+version = "0.1.0"
			
 
				+authors = [
			
 
				+    {name = "Lengyue", email = "lengyue@lengyue.me"},
			
 
				+]
			
 
				+description = "Fish Speech"
			
 
				+readme = "README.md"
			
 
				+requires-python = ">=3.10"
			
 
				+keywords = ["TTS", "Speech"]
			
 
				+license = {text = "BSD-3-Clause"}
			
 
				+classifiers = [
			
 
				+    "Programming Language :: Python :: 3",
			
 
				+]
			
 
				+dependencies = [
			
 
				+    "transformers>=4.34.1",
			
 
				+    "datasets>=2.14.5",
			
 
				+    "bitsandbytes>=0.41.1",
			
 
				+    "peft>=0.5.0",
			
 
				+    "lightning>=2.1.0",
			
 
				+    "hydra-core>=1.3.2",
			
 
				+    "tensorboard>=2.14.1",
			
 
				+    "natsort>=8.4.0",
			
 
				+    "einops>=0.7.0",
			
 
				+    "librosa>=0.10.1",
			
 
				+    "vector-quantize-pytorch>=1.9.18",
			
 
				+    "rich>=13.5.3",
			
 
				+    "cn2an",
			
 
				+    "pypinyin",
			
 
				+    "jieba",
			
 
				+    "g2p_en",
			
 
				+    "pyopenjtalk",
			
 
				+]
			
 
				+
			
 
				+[build-system]
			
 
				+requires = ["setuptools", "setuptools-scm"]
			
 
				+build-backend = "setuptools.build_meta"
			
 
				+
			
 
				+[tool.setuptools]
			
 
				+packages = ["fish_speech"]
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,17 +0,0 @@
 
				-transformers>=4.34.1
			
 
				-datasets>=2.14.5
			
 
				-bitsandbytes>=0.41.1
			
 
				-peft>=0.5.0
			
 
				-lightning>=2.1.0
			
 
				-hydra-core>=1.3.2
			
 
				-tensorboard>=2.14.1
			
 
				-natsort>=8.4.0
			
 
				-einops>=0.7.0
			
 
				-librosa>=0.10.1
			
 
				-vector-quantize-pytorch>=1.9.18
			
 
				-rich>=13.5.3
			
 
				-cn2an
			
 
				-pypinyin
			
 
				-jieba
			
 
				-g2p_en
			
 
				-pyopenjtalk
			
--- a/setup.py
+++ b/setup.py
@@ -1,7 +0,0 @@
 
				-from setuptools import find_packages, setup
			
 
				-
			
 
				-setup(
			
 
				-    name="fish-speech",
			
 
				-    version="0.1.0",
			
 
				-    packages=find_packages(include=["fish_speech", "fish_speech.*"]),
			
 
				-)