hai 2 anos · c9e1f95503
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ __pycache__
 
															 *.filelist
														
 
															 filelists
														
 
															 /fish_speech/text/cmudict_cache.pickle
														
 
															+/checkpoints
														
--- a/fish_speech/configs/hubert_vq.yaml
+++ b/fish_speech/configs/hubert_vq.yaml
@@ -45,7 +45,7 @@ data:
 
															 # Model Configuration
														
 
															 model:
														
 
															-  _target_: fish_speech.models.vqgan.VQGAN
														
 
															+  _target_: fish_speech.models.vq_diffusion.VQGAN
														
 
															   sample_rate: ${sample_rate}
														
 
															   hop_length: ${hop_length}
														
 
															   segment_size: 20480
														
@@ -110,5 +110,5 @@ callbacks:
 
															     sub_module: generator
														
 
															 # Resume from rcell's checkpoint
														
 
															-# ckpt_path: results/hubert-vq-pretrain/rcell/ckpt_23000_pl.pth
														
 
															-# resume_weights_only: true
														
 
															+ckpt_path: results/hubert-vq-pretrain/rcell/ckpt_23000_pl.pth
														
 
															+resume_weights_only: true
														
--- a/fish_speech/configs/hubert_vq_diffusion.yaml
+++ b/fish_speech/configs/hubert_vq_diffusion.yaml
@@ -0,0 +1,130 @@
 
															+defaults:
														
 
															+  - base
														
 
															+  - _self_
														
 
															+
														
 
															+project: hubert_vq_diffusion
														
 
															+
														
 
															+# Lightning Trainer
														
 
															+trainer:
														
 
															+  accelerator: gpu
														
 
															+  devices: 4
														
 
															+  strategy:
														
 
															+    _target_: lightning.pytorch.strategies.DDPStrategy
														
 
															+    static_graph: true
														
 
															+  gradient_clip_val: 1.0
														
 
															+  gradient_clip_algorithm: 'norm'
														
 
															+  precision: bf16-mixed
														
 
															+  max_steps: 1_000_000
														
 
															+  val_check_interval: 1000
														
 
															+
														
 
															+sample_rate: 44100
														
 
															+hop_length: 512
														
 
															+num_mels: 128
														
 
															+n_fft: 2048
														
 
															+win_length: 2048
														
 
															+
														
 
															+# Dataset Configuration
														
 
															+train_dataset:
														
 
															+  _target_: fish_speech.datasets.vqgan.VQGANDataset
														
 
															+  filelist: data/vq_train_filelist.txt
														
 
															+  sample_rate: ${sample_rate}
														
 
															+  hop_length: ${hop_length}
														
 
															+  slice_frames: 512
														
 
															+
														
 
															+val_dataset:
														
 
															+  _target_: fish_speech.datasets.vqgan.VQGANDataset
														
 
															+  filelist: data/vq_val_filelist.txt
														
 
															+  sample_rate: ${sample_rate}
														
 
															+  hop_length: ${hop_length}
														
 
															+
														
 
															+data:
														
 
															+  _target_: fish_speech.datasets.vqgan.VQGANDataModule
														
 
															+  train_dataset: ${train_dataset}
														
 
															+  val_dataset: ${val_dataset}
														
 
															+  num_workers: 4
														
 
															+  batch_size: 8
														
 
															+  val_batch_size: 4
														
 
															+
														
 
															+# Model Configuration
														
 
															+model:
														
 
															+  _target_: fish_speech.models.vq_diffusion.lit_module.VQDiffusion
														
 
															+  sample_rate: ${sample_rate}
														
 
															+  hop_length: ${hop_length}
														
 
															+
														
 
															+  text_encoder:
														
 
															+    _target_: fish_speech.models.vqgan.modules.encoders.TextEncoder
														
 
															+    in_channels: 1024
														
 
															+    out_channels: 128
														
 
															+    hidden_channels: 192
														
 
															+    hidden_channels_ffn: 768
														
 
															+    n_heads: 2
														
 
															+    n_layers: 4
														
 
															+    kernel_size: 1
														
 
															+    dropout: 0.1
														
 
															+    use_vae: false
														
 
															+    gin_channels: 512
														
 
															+    speaker_cond_layer: 0
														
 
															+
														
 
															+  vq_encoder:
														
 
															+    _target_: fish_speech.models.vqgan.modules.encoders.VQEncoder
														
 
															+    in_channels: 1024
														
 
															+    vq_channels: 1024
														
 
															+    codebook_size: 2048
														
 
															+    downsample: 2
														
 
															+    kmeans_ckpt: results/hubert-vq-pretrain/kmeans.pt
														
 
															+
														
 
															+  speaker_encoder:
														
 
															+    _target_: fish_speech.models.vqgan.modules.encoders.SpeakerEncoder
														
 
															+    in_channels: 128
														
 
															+    hidden_channels: 192
														
 
															+    out_channels: 512
														
 
															+    num_heads: 2
														
 
															+    num_layers: 4
														
 
															+    p_dropout: 0.1
														
 
															+  
														
 
															+  # denoiser:
														
 
															+  #   _target_: fish_speech.models.vq_diffusion.convnext_1d.ConvNext1DModel
														
 
															+  #   in_channels: 256
														
 
															+  #   out_channels: 128
														
 
															+  #   intermediate_dim: 512
														
 
															+  #   mlp_dim: 2048
														
 
															+  #   num_layers: 20
														
 
															+  #   dilation_cycle_length: 2
														
 
															+  #   time_embedding_type: "positional"
														
 
															+
														
 
															+  denoiser:
														
 
															+    _target_: fish_speech.models.vq_diffusion.unet1d.Unet1DDenoiser
														
 
															+    dim: 64
														
 
															+    dim_mults: [1, 2, 4]
														
 
															+    groups: 8
														
 
															+    pe_scale: 1000
														
 
															+
														
 
															+  vocoder:
														
 
															+    _target_: fish_speech.models.vq_diffusion.adamos.ADaMoSHiFiGANV1
														
 
															+
														
 
															+  mel_transform:
														
 
															+    _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
														
 
															+    sample_rate: ${sample_rate}
														
 
															+    n_fft: ${n_fft}
														
 
															+    hop_length: ${hop_length}
														
 
															+    win_length: ${win_length}
														
 
															+    n_mels: ${num_mels}
														
 
															+    f_min: 40
														
 
															+    f_max: 16000
														
 
															+
														
 
															+  optimizer:
														
 
															+    _target_: torch.optim.AdamW
														
 
															+    _partial_: true
														
 
															+    lr: 1e-4
														
 
															+    betas: [0.9, 0.999]
														
 
															+    eps: 1e-5
														
 
															+
														
 
															+  lr_scheduler:
														
 
															+    _target_: torch.optim.lr_scheduler.LambdaLR
														
 
															+    _partial_: true
														
 
															+    lr_lambda:
														
 
															+      _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
														
 
															+      _partial_: true
														
 
															+      num_warmup_steps: 0
														
 
															+      num_training_steps: ${trainer.max_steps}
														
 
															+      final_lr_ratio: 0.05
														
--- a/fish_speech/datasets/vqgan.py
+++ b/fish_speech/datasets/vqgan.py
@@ -48,23 +48,26 @@ class VQGANDataset(Dataset):
 
															         if self.slice_frames is not None and features.shape[0] > self.slice_frames:
														
 
															             start = np.random.randint(0, features.shape[0] - self.slice_frames)
														
 
															             features = features[start : start + self.slice_frames]
														
 
															+            feature_hop_length = features.shape[0] * (32000 // 50)
														
 
															             audio = audio[
														
 
															-                start * self.hop_length : (start + self.slice_frames) * self.hop_length
														
 
															+                start
														
 
															+                * feature_hop_length : (start + self.slice_frames)
														
 
															+                * feature_hop_length
														
 
															             ]
														
 
															-        if features.shape[0] % 2 != 0:
														
 
															-            features = features[:-1]
														
 
															+        # if features.shape[0] % 2 != 0:
														
 
															+        #     features = features[:-1]
														
 
															-        if len(audio) > len(features) * self.hop_length:
														
 
															-            audio = audio[: features.shape[0] * self.hop_length]
														
 
															+        # if len(audio) > len(features) * self.hop_length:
														
 
															+        #     audio = audio[: features.shape[0] * self.hop_length]
														
 
															-        if len(audio) < len(features) * self.hop_length:
														
 
															-            audio = np.pad(
														
 
															-                audio,
														
 
															-                (0, len(features) * self.hop_length - len(audio)),
														
 
															-                mode="constant",
														
 
															-                constant_values=0,
														
 
															-            )
														
 
															+        # if len(audio) < len(features) * self.hop_length:
														
 
															+        #     audio = np.pad(
														
 
															+        #         audio,
														
 
															+        #         (0, len(features) * self.hop_length - len(audio)),
														
 
															+        #         mode="constant",
														
 
															+        #         constant_values=0,
														
 
															+        #     )
														
 
															         return {
														
 
															             "audio": torch.from_numpy(audio),
														
@@ -90,6 +93,7 @@ class VQGANCollator:
 
															         audio_maxlen = audio_lengths.max()
														
 
															         feature_maxlen = feature_lengths.max()
														
 
															+        # Rounds up to nearest multiple of 2 (audio_lengths)
														
 
															         audios, features = [], []
														
 
															         for x in batch:
														
 
															             audios.append(
														
--- a/fish_speech/models/vq_diffusion/adamos/__init__.py
+++ b/fish_speech/models/vq_diffusion/adamos/__init__.py
@@ -0,0 +1,3 @@
 
															+from .adamos import ADaMoSHiFiGANV1
														
 
															+
														
 
															+__all__ = ["ADaMoSHiFiGANV1"]
														
--- a/fish_speech/models/vq_diffusion/adamos/adamos.py
+++ b/fish_speech/models/vq_diffusion/adamos/adamos.py
@@ -0,0 +1,88 @@
 
															+import librosa
														
 
															+import torch
														
 
															+from torch import nn
														
 
															+
														
 
															+from fish_speech.models.vqgan.spectrogram import LogMelSpectrogram
														
 
															+
														
 
															+from .encoder import ConvNeXtEncoder
														
 
															+from .hifigan import HiFiGANGenerator
														
 
															+
														
 
															+
														
 
															+class ADaMoSHiFiGANV1(nn.Module):
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        checkpoint_path: str = "checkpoints/adamos-generator-1640000.pth",
														
 
															+    ):
														
 
															+        super().__init__()
														
 
															+
														
 
															+        self.backbone = ConvNeXtEncoder(
														
 
															+            input_channels=128,
														
 
															+            depths=[3, 3, 9, 3],
														
 
															+            dims=[128, 256, 384, 512],
														
 
															+            drop_path_rate=0,
														
 
															+            kernel_sizes=(7,),
														
 
															+        )
														
 
															+
														
 
															+        self.head = HiFiGANGenerator(
														
 
															+            hop_length=512,
														
 
															+            upsample_rates=(4, 4, 2, 2, 2, 2, 2),
														
 
															+            upsample_kernel_sizes=(8, 8, 4, 4, 4, 4, 4),
														
 
															+            resblock_kernel_sizes=(3, 7, 11, 13),
														
 
															+            resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5), (1, 3, 5)),
														
 
															+            num_mels=512,
														
 
															+            upsample_initial_channel=1024,
														
 
															+            use_template=False,
														
 
															+            pre_conv_kernel_size=13,
														
 
															+            post_conv_kernel_size=13,
														
 
															+        )
														
 
															+        self.sampling_rate = 44100
														
 
															+
														
 
															+        ckpt_state = torch.load(checkpoint_path, map_location="cpu")
														
 
															+
														
 
															+        if "state_dict" in ckpt_state:
														
 
															+            ckpt_state = ckpt_state["state_dict"]
														
 
															+
														
 
															+        if any(k.startswith("generator.") for k in ckpt_state):
														
 
															+            ckpt_state = {
														
 
															+                k.replace("generator.", ""): v
														
 
															+                for k, v in ckpt_state.items()
														
 
															+                if k.startswith("generator.")
														
 
															+            }
														
 
															+
														
 
															+        self.load_state_dict(ckpt_state)
														
 
															+        self.eval()
														
 
															+
														
 
															+        self.mel_transform = LogMelSpectrogram(
														
 
															+            sample_rate=44100,
														
 
															+            n_fft=2048,
														
 
															+            win_length=2048,
														
 
															+            hop_length=512,
														
 
															+            f_min=40,
														
 
															+            f_max=16000,
														
 
															+            n_mels=128,
														
 
															+        )
														
 
															+
														
 
															+    @torch.no_grad()
														
 
															+    def decode(self, mel):
														
 
															+        y = self.backbone(mel)
														
 
															+        y = self.head(y)
														
 
															+
														
 
															+        return y
														
 
															+
														
 
															+    @torch.no_grad()
														
 
															+    def encode(self, x):
														
 
															+        return self.mel_transform(x)
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    import soundfile as sf
														
 
															+
														
 
															+    x = "data/StarRail/Chinese/罗刹/archive_luocha_2.wav"
														
 
															+    model = ADaMoSHiFiGANV1()
														
 
															+
														
 
															+    wav, sr = librosa.load(x, sr=44100, mono=True)
														
 
															+    wav = torch.from_numpy(wav).float()[None]
														
 
															+    mel = model.encode(wav)
														
 
															+
														
 
															+    wav = model.decode(mel)[0].mT
														
 
															+    sf.write("test.wav", wav.cpu().numpy(), 44100)
														
--- a/fish_speech/models/vq_diffusion/adamos/encoder.py
+++ b/fish_speech/models/vq_diffusion/adamos/encoder.py
@@ -0,0 +1,238 @@
 
															+from functools import partial
														
 
															+
														
 
															+import torch
														
 
															+import torch.nn.functional as F
														
 
															+from torch import nn
														
 
															+
														
 
															+
														
 
															+def drop_path(
														
 
															+    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
														
 
															+):
														
 
															+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
														
 
															+
														
 
															+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
														
 
															+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
														
 
															+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
														
 
															+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
														
 
															+    'survival rate' as the argument.
														
 
															+
														
 
															+    """  # noqa: E501
														
 
															+
														
 
															+    if drop_prob == 0.0 or not training:
														
 
															+        return x
														
 
															+    keep_prob = 1 - drop_prob
														
 
															+    shape = (x.shape[0],) + (1,) * (
														
 
															+        x.ndim - 1
														
 
															+    )  # work with diff dim tensors, not just 2D ConvNets
														
 
															+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
														
 
															+    if keep_prob > 0.0 and scale_by_keep:
														
 
															+        random_tensor.div_(keep_prob)
														
 
															+    return x * random_tensor
														
 
															+
														
 
															+
														
 
															+class DropPath(nn.Module):
														
 
															+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""  # noqa: E501
														
 
															+
														
 
															+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
														
 
															+        super(DropPath, self).__init__()
														
 
															+        self.drop_prob = drop_prob
														
 
															+        self.scale_by_keep = scale_by_keep
														
 
															+
														
 
															+    def forward(self, x):
														
 
															+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
														
 
															+
														
 
															+    def extra_repr(self):
														
 
															+        return f"drop_prob={round(self.drop_prob,3):0.3f}"
														
 
															+
														
 
															+
														
 
															+class LayerNorm(nn.Module):
														
 
															+    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
														
 
															+    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
														
 
															+    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
														
 
															+    with shape (batch_size, channels, height, width).
														
 
															+    """  # noqa: E501
														
 
															+
														
 
															+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
														
 
															+        super().__init__()
														
 
															+        self.weight = nn.Parameter(torch.ones(normalized_shape))
														
 
															+        self.bias = nn.Parameter(torch.zeros(normalized_shape))
														
 
															+        self.eps = eps
														
 
															+        self.data_format = data_format
														
 
															+        if self.data_format not in ["channels_last", "channels_first"]:
														
 
															+            raise NotImplementedError
														
 
															+        self.normalized_shape = (normalized_shape,)
														
 
															+
														
 
															+    def forward(self, x):
														
 
															+        if self.data_format == "channels_last":
														
 
															+            return F.layer_norm(
														
 
															+                x, self.normalized_shape, self.weight, self.bias, self.eps
														
 
															+            )
														
 
															+        elif self.data_format == "channels_first":
														
 
															+            u = x.mean(1, keepdim=True)
														
 
															+            s = (x - u).pow(2).mean(1, keepdim=True)
														
 
															+            x = (x - u) / torch.sqrt(s + self.eps)
														
 
															+            x = self.weight[:, None] * x + self.bias[:, None]
														
 
															+            return x
														
 
															+
														
 
															+
														
 
															+class ConvNeXtBlock(nn.Module):
														
 
															+    r"""ConvNeXt Block. There are two equivalent implementations:
														
 
															+    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
														
 
															+    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
														
 
															+    We use (2) as we find it slightly faster in PyTorch
														
 
															+
														
 
															+    Args:
														
 
															+        dim (int): Number of input channels.
														
 
															+        drop_path (float): Stochastic depth rate. Default: 0.0
														
 
															+        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
														
 
															+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
														
 
															+        kernel_size (int): Kernel size for depthwise conv. Default: 7.
														
 
															+        dilation (int): Dilation for depthwise conv. Default: 1.
														
 
															+    """  # noqa: E501
														
 
															+
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        dim: int,
														
 
															+        drop_path: float = 0.0,
														
 
															+        layer_scale_init_value: float = 1e-6,
														
 
															+        mlp_ratio: float = 4.0,
														
 
															+        kernel_size: int = 7,
														
 
															+        dilation: int = 1,
														
 
															+    ):
														
 
															+        super().__init__()
														
 
															+
														
 
															+        self.dwconv = nn.Conv1d(
														
 
															+            dim,
														
 
															+            dim,
														
 
															+            kernel_size=kernel_size,
														
 
															+            padding=int(dilation * (kernel_size - 1) / 2),
														
 
															+            groups=dim,
														
 
															+        )  # depthwise conv
														
 
															+        self.norm = LayerNorm(dim, eps=1e-6)
														
 
															+        self.pwconv1 = nn.Linear(
														
 
															+            dim, int(mlp_ratio * dim)
														
 
															+        )  # pointwise/1x1 convs, implemented with linear layers
														
 
															+        self.act = nn.GELU()
														
 
															+        self.pwconv2 = nn.Linear(int(mlp_ratio * dim), dim)
														
 
															+        self.gamma = (
														
 
															+            nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
														
 
															+            if layer_scale_init_value > 0
														
 
															+            else None
														
 
															+        )
														
 
															+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
														
 
															+
														
 
															+    def forward(self, x, apply_residual: bool = True):
														
 
															+        input = x
														
 
															+
														
 
															+        x = self.dwconv(x)
														
 
															+        x = x.permute(0, 2, 1)  # (N, C, L) -> (N, L, C)
														
 
															+        x = self.norm(x)
														
 
															+        x = self.pwconv1(x)
														
 
															+        x = self.act(x)
														
 
															+        x = self.pwconv2(x)
														
 
															+
														
 
															+        if self.gamma is not None:
														
 
															+            x = self.gamma * x
														
 
															+
														
 
															+        x = x.permute(0, 2, 1)  # (N, L, C) -> (N, C, L)
														
 
															+        x = self.drop_path(x)
														
 
															+
														
 
															+        if apply_residual:
														
 
															+            x = input + x
														
 
															+
														
 
															+        return x
														
 
															+
														
 
															+
														
 
															+class ParallelConvNeXtBlock(nn.Module):
														
 
															+    def __init__(self, kernel_sizes: list[int], *args, **kwargs):
														
 
															+        super().__init__()
														
 
															+        self.blocks = nn.ModuleList(
														
 
															+            [
														
 
															+                ConvNeXtBlock(kernel_size=kernel_size, *args, **kwargs)
														
 
															+                for kernel_size in kernel_sizes
														
 
															+            ]
														
 
															+        )
														
 
															+
														
 
															+    def forward(self, x: torch.Tensor) -> torch.Tensor:
														
 
															+        return torch.stack(
														
 
															+            [block(x, apply_residual=False) for block in self.blocks] + [x],
														
 
															+            dim=1,
														
 
															+        ).sum(dim=1)
														
 
															+
														
 
															+
														
 
															+class ConvNeXtEncoder(nn.Module):
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        input_channels=3,
														
 
															+        depths=[3, 3, 9, 3],
														
 
															+        dims=[96, 192, 384, 768],
														
 
															+        drop_path_rate=0.0,
														
 
															+        layer_scale_init_value=1e-6,
														
 
															+        kernel_sizes: tuple[int] = (7,),
														
 
															+    ):
														
 
															+        super().__init__()
														
 
															+        assert len(depths) == len(dims)
														
 
															+
														
 
															+        self.channel_layers = nn.ModuleList()
														
 
															+        stem = nn.Sequential(
														
 
															+            nn.Conv1d(
														
 
															+                input_channels,
														
 
															+                dims[0],
														
 
															+                kernel_size=7,
														
 
															+                padding=3,
														
 
															+                padding_mode="replicate",
														
 
															+            ),
														
 
															+            LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
														
 
															+        )
														
 
															+        self.channel_layers.append(stem)
														
 
															+
														
 
															+        for i in range(len(depths) - 1):
														
 
															+            mid_layer = nn.Sequential(
														
 
															+                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
														
 
															+                nn.Conv1d(dims[i], dims[i + 1], kernel_size=1),
														
 
															+            )
														
 
															+            self.channel_layers.append(mid_layer)
														
 
															+
														
 
															+        block_fn = (
														
 
															+            partial(ConvNeXtBlock, kernel_size=kernel_sizes[0])
														
 
															+            if len(kernel_sizes) == 1
														
 
															+            else partial(ParallelConvNeXtBlock, kernel_sizes=kernel_sizes)
														
 
															+        )
														
 
															+
														
 
															+        self.stages = nn.ModuleList()
														
 
															+        drop_path_rates = [
														
 
															+            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
														
 
															+        ]
														
 
															+
														
 
															+        cur = 0
														
 
															+        for i in range(len(depths)):
														
 
															+            stage = nn.Sequential(
														
 
															+                *[
														
 
															+                    block_fn(
														
 
															+                        dim=dims[i],
														
 
															+                        drop_path=drop_path_rates[cur + j],
														
 
															+                        layer_scale_init_value=layer_scale_init_value,
														
 
															+                    )
														
 
															+                    for j in range(depths[i])
														
 
															+                ]
														
 
															+            )
														
 
															+            self.stages.append(stage)
														
 
															+            cur += depths[i]
														
 
															+
														
 
															+        self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first")
														
 
															+        self.apply(self._init_weights)
														
 
															+
														
 
															+    def _init_weights(self, m):
														
 
															+        if isinstance(m, (nn.Conv1d, nn.Linear)):
														
 
															+            nn.init.trunc_normal_(m.weight, std=0.02)
														
 
															+            nn.init.constant_(m.bias, 0)
														
 
															+
														
 
															+    def forward(
														
 
															+        self,
														
 
															+        x: torch.Tensor,
														
 
															+    ) -> torch.Tensor:
														
 
															+        for channel_layer, stage in zip(self.channel_layers, self.stages):
														
 
															+            x = channel_layer(x)
														
 
															+            x = stage(x)
														
 
															+
														
 
															+        return self.norm(x)
														
--- a/fish_speech/models/vq_diffusion/adamos/hifigan.py
+++ b/fish_speech/models/vq_diffusion/adamos/hifigan.py
@@ -0,0 +1,237 @@
 
															+from functools import partial
														
 
															+from math import prod
														
 
															+from typing import Callable
														
 
															+
														
 
															+import numpy as np
														
 
															+import torch
														
 
															+import torch.nn as nn
														
 
															+import torch.nn.functional as F
														
 
															+from torch.nn import Conv1d
														
 
															+from torch.nn.utils.parametrizations import weight_norm
														
 
															+from torch.nn.utils.parametrize import remove_parametrizations as remove_weight_norm
														
 
															+
														
 
															+
														
 
															+def init_weights(m, mean=0.0, std=0.01):
														
 
															+    classname = m.__class__.__name__
														
 
															+    if classname.find("Conv") != -1:
														
 
															+        m.weight.data.normal_(mean, std)
														
 
															+
														
 
															+
														
 
															+def get_padding(kernel_size, dilation=1):
														
 
															+    return (kernel_size * dilation - dilation) // 2
														
 
															+
														
 
															+
														
 
															+class ResBlock1(torch.nn.Module):
														
 
															+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
														
 
															+        super().__init__()
														
 
															+
														
 
															+        self.convs1 = nn.ModuleList(
														
 
															+            [
														
 
															+                weight_norm(
														
 
															+                    Conv1d(
														
 
															+                        channels,
														
 
															+                        channels,
														
 
															+                        kernel_size,
														
 
															+                        1,
														
 
															+                        dilation=dilation[0],
														
 
															+                        padding=get_padding(kernel_size, dilation[0]),
														
 
															+                    )
														
 
															+                ),
														
 
															+                weight_norm(
														
 
															+                    Conv1d(
														
 
															+                        channels,
														
 
															+                        channels,
														
 
															+                        kernel_size,
														
 
															+                        1,
														
 
															+                        dilation=dilation[1],
														
 
															+                        padding=get_padding(kernel_size, dilation[1]),
														
 
															+                    )
														
 
															+                ),
														
 
															+                weight_norm(
														
 
															+                    Conv1d(
														
 
															+                        channels,
														
 
															+                        channels,
														
 
															+                        kernel_size,
														
 
															+                        1,
														
 
															+                        dilation=dilation[2],
														
 
															+                        padding=get_padding(kernel_size, dilation[2]),
														
 
															+                    )
														
 
															+                ),
														
 
															+            ]
														
 
															+        )
														
 
															+        self.convs1.apply(init_weights)
														
 
															+
														
 
															+        self.convs2 = nn.ModuleList(
														
 
															+            [
														
 
															+                weight_norm(
														
 
															+                    Conv1d(
														
 
															+                        channels,
														
 
															+                        channels,
														
 
															+                        kernel_size,
														
 
															+                        1,
														
 
															+                        dilation=1,
														
 
															+                        padding=get_padding(kernel_size, 1),
														
 
															+                    )
														
 
															+                ),
														
 
															+                weight_norm(
														
 
															+                    Conv1d(
														
 
															+                        channels,
														
 
															+                        channels,
														
 
															+                        kernel_size,
														
 
															+                        1,
														
 
															+                        dilation=1,
														
 
															+                        padding=get_padding(kernel_size, 1),
														
 
															+                    )
														
 
															+                ),
														
 
															+                weight_norm(
														
 
															+                    Conv1d(
														
 
															+                        channels,
														
 
															+                        channels,
														
 
															+                        kernel_size,
														
 
															+                        1,
														
 
															+                        dilation=1,
														
 
															+                        padding=get_padding(kernel_size, 1),
														
 
															+                    )
														
 
															+                ),
														
 
															+            ]
														
 
															+        )
														
 
															+        self.convs2.apply(init_weights)
														
 
															+
														
 
															+    def forward(self, x):
														
 
															+        for c1, c2 in zip(self.convs1, self.convs2):
														
 
															+            xt = F.silu(x)
														
 
															+            xt = c1(xt)
														
 
															+            xt = F.silu(xt)
														
 
															+            xt = c2(xt)
														
 
															+            x = xt + x
														
 
															+        return x
														
 
															+
														
 
															+    def remove_weight_norm(self):
														
 
															+        for conv in self.convs1:
														
 
															+            remove_weight_norm(conv)
														
 
															+        for conv in self.convs2:
														
 
															+            remove_weight_norm(conv)
														
 
															+
														
 
															+
														
 
															+class HiFiGANGenerator(nn.Module):
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        *,
														
 
															+        hop_length: int = 512,
														
 
															+        upsample_rates: tuple[int] = (8, 8, 2, 2, 2),
														
 
															+        upsample_kernel_sizes: tuple[int] = (16, 16, 8, 2, 2),
														
 
															+        resblock_kernel_sizes: tuple[int] = (3, 7, 11),
														
 
															+        resblock_dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
														
 
															+        num_mels: int = 128,
														
 
															+        upsample_initial_channel: int = 512,
														
 
															+        use_template: bool = True,
														
 
															+        pre_conv_kernel_size: int = 7,
														
 
															+        post_conv_kernel_size: int = 7,
														
 
															+        post_activation: Callable = partial(nn.SiLU, inplace=True),
														
 
															+    ):
														
 
															+        super().__init__()
														
 
															+
														
 
															+        assert (
														
 
															+            prod(upsample_rates) == hop_length
														
 
															+        ), f"hop_length must be {prod(upsample_rates)}"
														
 
															+
														
 
															+        self.conv_pre = weight_norm(
														
 
															+            nn.Conv1d(
														
 
															+                num_mels,
														
 
															+                upsample_initial_channel,
														
 
															+                pre_conv_kernel_size,
														
 
															+                1,
														
 
															+                padding=get_padding(pre_conv_kernel_size),
														
 
															+            )
														
 
															+        )
														
 
															+
														
 
															+        self.num_upsamples = len(upsample_rates)
														
 
															+        self.num_kernels = len(resblock_kernel_sizes)
														
 
															+
														
 
															+        self.noise_convs = nn.ModuleList()
														
 
															+        self.use_template = use_template
														
 
															+        self.ups = nn.ModuleList()
														
 
															+
														
 
															+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
														
 
															+            c_cur = upsample_initial_channel // (2 ** (i + 1))
														
 
															+            self.ups.append(
														
 
															+                weight_norm(
														
 
															+                    nn.ConvTranspose1d(
														
 
															+                        upsample_initial_channel // (2**i),
														
 
															+                        upsample_initial_channel // (2 ** (i + 1)),
														
 
															+                        k,
														
 
															+                        u,
														
 
															+                        padding=(k - u) // 2,
														
 
															+                    )
														
 
															+                )
														
 
															+            )
														
 
															+
														
 
															+            if not use_template:
														
 
															+                continue
														
 
															+
														
 
															+            if i + 1 < len(upsample_rates):
														
 
															+                stride_f0 = np.prod(upsample_rates[i + 1 :])
														
 
															+                self.noise_convs.append(
														
 
															+                    Conv1d(
														
 
															+                        1,
														
 
															+                        c_cur,
														
 
															+                        kernel_size=stride_f0 * 2,
														
 
															+                        stride=stride_f0,
														
 
															+                        padding=stride_f0 // 2,
														
 
															+                    )
														
 
															+                )
														
 
															+            else:
														
 
															+                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
														
 
															+
														
 
															+        self.resblocks = nn.ModuleList()
														
 
															+        for i in range(len(self.ups)):
														
 
															+            ch = upsample_initial_channel // (2 ** (i + 1))
														
 
															+            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
														
 
															+                self.resblocks.append(ResBlock1(ch, k, d))
														
 
															+
														
 
															+        self.activation_post = post_activation()
														
 
															+        self.conv_post = weight_norm(
														
 
															+            nn.Conv1d(
														
 
															+                ch,
														
 
															+                1,
														
 
															+                post_conv_kernel_size,
														
 
															+                1,
														
 
															+                padding=get_padding(post_conv_kernel_size),
														
 
															+            )
														
 
															+        )
														
 
															+        self.ups.apply(init_weights)
														
 
															+        self.conv_post.apply(init_weights)
														
 
															+
														
 
															+    def forward(self, x, template=None):
														
 
															+        x = self.conv_pre(x)
														
 
															+
														
 
															+        for i in range(self.num_upsamples):
														
 
															+            x = F.silu(x, inplace=True)
														
 
															+            x = self.ups[i](x)
														
 
															+
														
 
															+            if self.use_template:
														
 
															+                x = x + self.noise_convs[i](template)
														
 
															+
														
 
															+            xs = None
														
 
															+
														
 
															+            for j in range(self.num_kernels):
														
 
															+                if xs is None:
														
 
															+                    xs = self.resblocks[i * self.num_kernels + j](x)
														
 
															+                else:
														
 
															+                    xs += self.resblocks[i * self.num_kernels + j](x)
														
 
															+
														
 
															+            x = xs / self.num_kernels
														
 
															+
														
 
															+        x = self.activation_post(x)
														
 
															+        x = self.conv_post(x)
														
 
															+        x = torch.tanh(x)
														
 
															+
														
 
															+        return x
														
 
															+
														
 
															+    def remove_weight_norm(self):
														
 
															+        for up in self.ups:
														
 
															+            remove_weight_norm(up)
														
 
															+        for block in self.resblocks:
														
 
															+            block.remove_weight_norm()
														
 
															+        remove_weight_norm(self.conv_pre)
														
 
															+        remove_weight_norm(self.conv_post)
														
--- a/fish_speech/models/vq_diffusion/convnext_1d.py
+++ b/fish_speech/models/vq_diffusion/convnext_1d.py
@@ -0,0 +1,244 @@
 
															+from dataclasses import dataclass
														
 
															+from typing import Optional, Tuple, Union
														
 
															+
														
 
															+import torch
														
 
															+import torch.nn as nn
														
 
															+from diffusers.configuration_utils import ConfigMixin, register_to_config
														
 
															+from diffusers.models.embeddings import (
														
 
															+    GaussianFourierProjection,
														
 
															+    TimestepEmbedding,
														
 
															+    Timesteps,
														
 
															+)
														
 
															+from diffusers.models.modeling_utils import ModelMixin
														
 
															+from diffusers.utils import BaseOutput
														
 
															+
														
 
															+
														
 
															+class ConvNeXtBlock(nn.Module):
														
 
															+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
														
 
															+
														
 
															+    Args:
														
 
															+        dim (int): Number of input channels.
														
 
															+        mlp_dim (int): Dimensionality of the intermediate layer.
														
 
															+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
														
 
															+            Defaults to None.
														
 
															+        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
														
 
															+            None means non-conditional LayerNorm. Defaults to None.
														
 
															+    """
														
 
															+
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        dim: int,
														
 
															+        intermediate_dim: int,
														
 
															+        dilation: int = 1,
														
 
															+        layer_scale_init_value: Optional[float] = 1e-6,
														
 
															+    ):
														
 
															+        super().__init__()
														
 
															+        self.dwconv = nn.Conv1d(
														
 
															+            dim,
														
 
															+            dim,
														
 
															+            kernel_size=7,
														
 
															+            groups=dim,
														
 
															+            dilation=dilation,
														
 
															+            padding=int(dilation * (7 - 1) / 2),
														
 
															+        )  # depthwise conv
														
 
															+        self.norm = nn.LayerNorm(dim, eps=1e-6)
														
 
															+        self.pwconv1 = nn.Linear(
														
 
															+            dim, intermediate_dim
														
 
															+        )  # pointwise/1x1 convs, implemented with linear layers
														
 
															+        self.act = nn.GELU()
														
 
															+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
														
 
															+        self.gamma = (
														
 
															+            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
														
 
															+            if layer_scale_init_value is not None and layer_scale_init_value > 0
														
 
															+            else None
														
 
															+        )
														
 
															+
														
 
															+        self.condition_projection = nn.Sequential(
														
 
															+            nn.Conv1d(dim, dim, 1),
														
 
															+            nn.GELU(),
														
 
															+            nn.Conv1d(dim, dim, 1),
														
 
															+        )
														
 
															+
														
 
															+    def forward(
														
 
															+        self,
														
 
															+        x: torch.Tensor,
														
 
															+        condition: Optional[torch.Tensor] = None,
														
 
															+        x_mask: Optional[torch.Tensor] = None,
														
 
															+    ) -> torch.Tensor:
														
 
															+        residual = x
														
 
															+
														
 
															+        if condition is not None:
														
 
															+            x = x + self.condition_projection(condition)
														
 
															+
														
 
															+        if x_mask is not None:
														
 
															+            x = x * x_mask
														
 
															+
														
 
															+        x = self.dwconv(x)
														
 
															+        x = x.transpose(1, 2)  # (B, C, T) -> (B, T, C)
														
 
															+        x = self.norm(x)
														
 
															+        x = self.pwconv1(x)
														
 
															+        x = self.act(x)
														
 
															+        x = self.pwconv2(x)
														
 
															+        if self.gamma is not None:
														
 
															+            x = self.gamma * x
														
 
															+        x = x.transpose(1, 2)  # (B, T, C) -> (B, C, T)
														
 
															+
														
 
															+        x = residual + x
														
 
															+
														
 
															+        return x
														
 
															+
														
 
															+
														
 
@dataclass
class ConvNext1DOutput(BaseOutput):
    """
    Output container associated with [`ConvNext1DModel`].

    NOTE(review): `ConvNext1DModel.forward` in this file returns the `out_proj` tensor directly rather than an
    instance of this class - confirm whether this wrapper is still needed.

    Args:
        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, sample_size)`):
            The hidden states output from the last layer of the model.
    """

    sample: torch.FloatTensor
														
 
															+
														
 
															+
														
 
															+class ConvNext1DModel(ModelMixin, ConfigMixin):
														
 
															+    r"""
														
 
															+    A ConvNext model that takes a noisy sample and a timestep and returns a sample shaped output.
														
 
															+
														
 
															+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
														
 
															+    for all models (such as downloading or saving).
														
 
															+
														
 
															+    Parameters:
														
 
															+        in_channels (`int`, *optional*, defaults to 128):
														
 
															+            Number of channels in the input sample.
														
 
															+        out_channels (`int`, *optional*, defaults to 128):
														
 
															+            Number of channels in the output.
														
 
															+        intermediate_dim (`int`, *optional*, defaults to 512):
														
 
															+            Dimensionality of the intermediate blocks.
														
 
															+        mlp_dim (`int`, *optional*, defaults to 2048):
														
 
															+            Dimensionality of the MLP.
														
 
															+        num_layers (`int`, *optional*, defaults to 20):
														
 
															+            Number of layers in the model.
														
 
															+        dilation_cycle_length (`int`, *optional*, defaults to 4):
														
 
															+            Length of the dilation cycle.
														
 
															+        time_embedding_type (`str`, *optional*, defaults to `positional`):
														
 
															+            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
														
 
															+        time_embedding_dim (`int`, *optional*, defaults to `None`):
														
 
															+            An optional override for the dimension of the projected time embedding.
														
 
															+        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
														
 
															+            Optional activation function to use only once on the time embeddings before they are passed to the rest of
														
 
															+            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
														
 
															+    """
														
 
															+
														
 
															+    _supports_gradient_checkpointing = True
														
 
															+
														
 
															+    @register_to_config
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        in_channels: int = 128,
														
 
															+        out_channels: int = 128,
														
 
															+        intermediate_dim: int = 512,
														
 
															+        mlp_dim: int = 2048,
														
 
															+        num_layers: int = 20,
														
 
															+        dilation_cycle_length: int = 4,
														
 
															+        time_embedding_type: str = "positional",
														
 
															+    ):
														
 
															+        super().__init__()
														
 
															+
														
 
															+        if intermediate_dim % 2 != 0:
														
 
															+            raise ValueError("intermediate_dim must be divisible by 2.")
														
 
															+
														
 
															+        # time
														
 
															+        if time_embedding_type == "fourier":
														
 
															+            self.time_proj = GaussianFourierProjection(
														
 
															+                intermediate_dim // 2,
														
 
															+                set_W_to_weight=False,
														
 
															+                log=False,
														
 
															+                flip_sin_to_cos=False,
														
 
															+            )
														
 
															+            timestep_input_dim = intermediate_dim
														
 
															+        elif time_embedding_type == "positional":
														
 
															+            self.time_proj = Timesteps(in_channels, False, 0)
														
 
															+            timestep_input_dim = in_channels
														
 
															+        else:
														
 
															+            raise ValueError(
														
 
															+                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
														
 
															+            )
														
 
															+
														
 
															+        self.time_mlp = TimestepEmbedding(
														
 
															+            timestep_input_dim,
														
 
															+            intermediate_dim,
														
 
															+            act_fn="silu",
														
 
															+            cond_proj_dim=None,  # No conditional projection for now
														
 
															+        )
														
 
															+
														
 
															+        # Project to intermediate dim
														
 
															+        self.in_proj = nn.Conv1d(in_channels, intermediate_dim, 1)
														
 
															+        self.out_proj = nn.Conv1d(intermediate_dim, out_channels, 1)
														
 
															+
														
 
															+        # Blocks
														
 
															+        self.blocks = nn.ModuleList(
														
 
															+            [
														
 
															+                ConvNeXtBlock(
														
 
															+                    dim=intermediate_dim,
														
 
															+                    intermediate_dim=mlp_dim,
														
 
															+                    dilation=2 ** (i % dilation_cycle_length),
														
 
															+                )
														
 
															+                for i in range(num_layers)
														
 
															+            ]
														
 
															+        )
														
 
															+
														
 
															+        self.gradient_checkpointing = False
														
 
															+
														
 
															+    def _set_gradient_checkpointing(self, module, value: bool = False):
														
 
															+        self.gradient_checkpointing = value
														
 
															+
														
 
															+    def forward(
														
 
															+        self,
														
 
															+        sample: torch.FloatTensor,
														
 
															+        timestep: Union[torch.Tensor, float, int],
														
 
															+        sample_mask: Optional[torch.Tensor] = None,
														
 
															+        condition: Optional[torch.Tensor] = None,
														
 
															+    ) -> Union[ConvNext1DOutput, Tuple]:
														
 
															+        r"""
														
 
															+        The [`ConvNext1DModel`] forward method.
														
 
															+
														
 
															+        Args:
														
 
															+            sample (`torch.FloatTensor`):
														
 
															+                The noisy input tensor with the following shape `(batch_size, num_channels, sample_size)`.
														
 
															+            timestep (`torch.FloatTensor` or `float` or `int`):
														
 
															+                The number of timesteps to denoise an input.
														
 
															+            sample_mask (`torch.BoolTensor`, *optional*):
														
 
															+                A mask of the same shape as `sample` that indicates which elements are invalid.
														
 
															+                True means the element is invalid and should be masked out.
														
 
															+            return_dict (`bool`, *optional*, defaults to `True`):
														
 
															+                Whether or not to return a [`~models.unet_1d.ConvNext1DOutput`] instead of a plain tuple.
														
 
															+
														
 
															+        Returns:
														
 
															+            [`~models.unet_1d.ConvNext1DOutput`] or `tuple`:
														
 
															+                If `return_dict` is True, an [`~models.unet_1d.ConvNext1DOutput`] is returned, otherwise a `tuple` is
														
 
															+                returned where the first element is the sample tensor.
														
 
															+        """
														
 
															+
														
 
															+        # 1. time
														
 
															+        t_emb = self.time_proj(timestep)
														
 
															+        t_emb = self.time_mlp(t_emb)[..., None]
														
 
															+
														
 
															+        # 2. pre-process
														
 
															+        if condition is not None:
														
 
															+            sample = torch.cat([sample, condition], dim=1)
														
 
															+
														
 
															+        x = self.in_proj(sample)
														
 
															+
														
 
															+        if sample_mask.ndim == 2:
														
 
															+            sample_mask = sample_mask[:, None, :]
														
 
															+
														
 
															+        # 3. blocks
														
 
															+        for block in self.blocks:
														
 
															+            if self.training and self.is_gradient_checkpointing:
														
 
															+                x = torch.utils.checkpoint.checkpoint(block, x, t_emb, sample_mask)
														
 
															+            else:
														
 
															+                x = block(x, t_emb, sample_mask)
														
 
															+
														
 
															+        # 4. post-process
														
 
															+        return self.out_proj(x)
														
--- a/fish_speech/models/vq_diffusion/lit_module.py
+++ b/fish_speech/models/vq_diffusion/lit_module.py
@@ -0,0 +1,270 @@
 
															+import itertools
														
 
															+from typing import Any, Callable
														
 
															+
														
 
															+import lightning as L
														
 
															+import torch
														
 
															+import torch.nn.functional as F
														
 
															+import wandb
														
 
															+from diffusers.schedulers import DDIMScheduler, UniPCMultistepScheduler
														
 
															+from diffusers.utils.torch_utils import randn_tensor
														
 
															+from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
														
 
															+from matplotlib import pyplot as plt
														
 
															+from torch import nn
														
 
															+
														
 
															+from fish_speech.models.vq_diffusion.unet1d import Unet1DDenoiser
														
 
															+from fish_speech.models.vqgan.modules.encoders import (
														
 
															+    SpeakerEncoder,
														
 
															+    TextEncoder,
														
 
															+    VQEncoder,
														
 
															+)
														
 
															+from fish_speech.models.vqgan.utils import plot_mel, sequence_mask
														
 
															+
														
 
															+
														
 
															+class VQDiffusion(L.LightningModule):
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        optimizer: Callable,
														
 
															+        lr_scheduler: Callable,
														
 
															+        mel_transform: nn.Module,
														
 
															+        vq_encoder: VQEncoder,
														
 
															+        speaker_encoder: SpeakerEncoder,
														
 
															+        text_encoder: TextEncoder,
														
 
															+        denoiser: Unet1DDenoiser,
														
 
															+        vocoder: nn.Module,
														
 
															+        hop_length: int = 640,
														
 
															+        sample_rate: int = 32000,
														
 
															+    ):
														
 
															+        super().__init__()
														
 
															+
														
 
															+        # Model parameters
														
 
															+        self.optimizer_builder = optimizer
														
 
															+        self.lr_scheduler_builder = lr_scheduler
														
 
															+
														
 
															+        # Generator and discriminators
														
 
															+        self.mel_transform = mel_transform
														
 
															+        self.noise_scheduler_train = DDIMScheduler(num_train_timesteps=1000)
														
 
															+        self.noise_scheduler_infer = UniPCMultistepScheduler(num_train_timesteps=1000)
														
 
															+        self.noise_scheduler_infer.set_timesteps(20)
														
 
															+
														
 
															+        # Modules
														
 
															+        self.vq_encoder = vq_encoder
														
 
															+        self.speaker_encoder = speaker_encoder
														
 
															+        self.text_encoder = text_encoder
														
 
															+        self.denoiser = denoiser
														
 
															+
														
 
															+        self.vocoder = vocoder
														
 
															+        self.hop_length = hop_length
														
 
															+        self.sampling_rate = sample_rate
														
 
															+
														
 
															+        # Freeze vocoder
														
 
															+        for param in self.vocoder.parameters():
														
 
															+            param.requires_grad = False
														
 
															+
														
 
															+    def configure_optimizers(self):
														
 
															+        optimizer = self.optimizer_builder(self.parameters())
														
 
															+        lr_scheduler = self.lr_scheduler_builder(optimizer)
														
 
															+
														
 
															+        return {
														
 
															+            "optimizer": optimizer,
														
 
															+            "lr_scheduler": {
														
 
															+                "scheduler": lr_scheduler,
														
 
															+                "interval": "step",
														
 
															+            },
														
 
															+        }
														
 
															+
														
 
															+    def normalize_mels(self, x):
														
 
															+        return (x + 11.5129251) / (1 + 11.5129251) * 2 - 1
														
 
															+
														
 
															+    def denormalize_mels(self, x):
														
 
															+        return (x + 1) / 2 * (1.0 + 11.5129251) - 11.5129251
														
 
															+
														
 
															+    def training_step(self, batch, batch_idx):
														
 
															+        audios, audio_lengths = batch["audios"], batch["audio_lengths"]
														
 
															+        features, feature_lengths = batch["features"], batch["feature_lengths"]
														
 
															+
														
 
															+        audios = audios.float()
														
 
															+        features = features.float().mT
														
 
															+        audios = audios[:, None, :]
														
 
															+
														
 
															+        with torch.no_grad():
														
 
															+            gt_mels = self.mel_transform(audios)
														
 
															+
														
 
															+        mel_lengths = audio_lengths // self.hop_length
														
 
															+
														
 
															+        feature_masks = torch.unsqueeze(
														
 
															+            sequence_mask(feature_lengths, features.shape[2]), 1
														
 
															+        ).to(gt_mels.dtype)
														
 
															+        mel_masks = torch.unsqueeze(sequence_mask(mel_lengths, gt_mels.shape[2]), 1).to(
														
 
															+            gt_mels.dtype
														
 
															+        )
														
 
															+
														
 
															+        speaker_features = self.speaker_encoder(gt_mels, mel_masks)
														
 
															+        vq_features, _ = self.vq_encoder(features, feature_masks)
														
 
															+
														
 
															+        # vq_features is 50 hz, need to convert to true mel size
														
 
															+        vq_features = F.interpolate(vq_features, size=gt_mels.shape[2], mode="nearest")
														
 
															+        text_features = self.text_encoder(vq_features, mel_masks, g=speaker_features)
														
 
															+
														
 
															+        # Sample noise that we'll add to the images
														
 
															+        normalized_gt_mels = self.normalize_mels(gt_mels)
														
 
															+        noise = torch.randn_like(normalized_gt_mels)
														
 
															+
														
 
															+        # Sample a random timestep for each image
														
 
															+        timesteps = torch.randint(
														
 
															+            0,
														
 
															+            self.noise_scheduler_train.config.num_train_timesteps,
														
 
															+            (normalized_gt_mels.shape[0],),
														
 
															+            device=normalized_gt_mels.device,
														
 
															+        ).long()
														
 
															+
														
 
															+        # Add noise to the clean images according to the noise magnitude at each timestep
														
 
															+        # (this is the forward diffusion process)
														
 
															+        noisy_images = self.noise_scheduler_train.add_noise(
														
 
															+            normalized_gt_mels, noise, timesteps
														
 
															+        )
														
 
															+
														
 
															+        # Predict
														
 
															+        model_output = self.denoiser(noisy_images, timesteps, mel_masks, text_features)
														
 
															+
														
 
															+        # MSE loss without the mask
														
 
															+        loss = (
														
 
															+            (model_output * mel_masks - normalized_gt_mels * mel_masks) ** 2
														
 
															+        ).sum() / (mel_masks.sum() * gt_mels.shape[1])
														
 
															+
														
 
															+        self.log(
														
 
															+            "train/loss",
														
 
															+            loss,
														
 
															+            on_step=True,
														
 
															+            on_epoch=False,
														
 
															+            prog_bar=True,
														
 
															+            logger=True,
														
 
															+            sync_dist=True,
														
 
															+        )
														
 
															+
														
 
															+        return loss
														
 
															+
														
 
															+    def validation_step(self, batch: Any, batch_idx: int):
														
 
															+        audios, audio_lengths = batch["audios"], batch["audio_lengths"]
														
 
															+        features, feature_lengths = batch["features"], batch["feature_lengths"]
														
 
															+
														
 
															+        audios = audios.float()
														
 
															+        features = features.float().mT
														
 
															+        audios = audios[:, None, :]
														
 
															+        gt_mels = self.mel_transform(audios)
														
 
															+        mel_lengths = audio_lengths // self.hop_length
														
 
															+
														
 
															+        feature_masks = torch.unsqueeze(
														
 
															+            sequence_mask(feature_lengths, features.shape[2]), 1
														
 
															+        ).to(gt_mels.dtype)
														
 
															+        mel_masks = torch.unsqueeze(sequence_mask(mel_lengths, gt_mels.shape[2]), 1).to(
														
 
															+            gt_mels.dtype
														
 
															+        )
														
 
															+
														
 
															+        speaker_features = self.speaker_encoder(gt_mels, mel_masks)
														
 
															+        vq_features, _ = self.vq_encoder(features, feature_masks)
														
 
															+
														
 
															+        # vq_features is 50 hz, need to convert to true mel size
														
 
															+        vq_features = F.interpolate(vq_features, size=gt_mels.shape[2], mode="nearest")
														
 
															+        text_features = self.text_encoder(vq_features, mel_masks, g=speaker_features)
														
 
															+
														
 
															+        # Begin sampling
														
 
															+        sampled_mels = torch.randn_like(gt_mels)
														
 
															+        self.noise_scheduler_infer.set_timesteps(20)
														
 
															+
														
 
															+        for t in self.noise_scheduler_infer.timesteps:
														
 
															+            timesteps = torch.tensor([t], device=sampled_mels.device, dtype=torch.long)
														
 
															+
														
 
															+            # 1. predict noise model_output
														
 
															+            model_output = self.denoiser(
														
 
															+                sampled_mels, timesteps, mel_masks, text_features
														
 
															+            )
														
 
															+
														
 
															+            # 2. compute previous image: x_t -> x_t-1
														
 
															+            sampled_mels = self.noise_scheduler_infer.step(
														
 
															+                model_output, t, sampled_mels
														
 
															+            ).prev_sample
														
 
															+
														
 
															+        sampled_mels = self.denormalize_mels(sampled_mels)
														
 
															+
														
 
															+        with torch.autocast(device_type=sampled_mels.device.type, enabled=False):
														
 
															+            # Run vocoder on fp32
														
 
															+            fake_audios = self.vocoder.decode(sampled_mels.float())
														
 
															+
														
 
															+        mel_loss = F.l1_loss(gt_mels, sampled_mels)
														
 
															+        self.log(
														
 
															+            "val/mel_loss",
														
 
															+            mel_loss,
														
 
															+            on_step=False,
														
 
															+            on_epoch=True,
														
 
															+            prog_bar=True,
														
 
															+            logger=True,
														
 
															+            sync_dist=True,
														
 
															+        )
														
 
															+
														
 
															+        for idx, (
														
 
															+            mel,
														
 
															+            gen_mel,
														
 
															+            audio,
														
 
															+            gen_audio,
														
 
															+            audio_len,
														
 
															+        ) in enumerate(
														
 
															+            zip(
														
 
															+                gt_mels,
														
 
															+                sampled_mels,
														
 
															+                audios,
														
 
															+                fake_audios,
														
 
															+                audio_lengths,
														
 
															+            )
														
 
															+        ):
														
 
															+            mel_len = audio_len // self.hop_length
														
 
															+
														
 
															+            image_mels = plot_mel(
														
 
															+                [
														
 
															+                    gen_mel[:, :mel_len],
														
 
															+                    mel[:, :mel_len],
														
 
															+                ],
														
 
															+                [
														
 
															+                    "Generated Spectrogram",
														
 
															+                    "Ground-Truth Spectrogram",
														
 
															+                ],
														
 
															+            )
														
 
															+
														
 
															+            if isinstance(self.logger, WandbLogger):
														
 
															+                self.logger.experiment.log(
														
 
															+                    {
														
 
															+                        "reconstruction_mel": wandb.Image(image_mels, caption="mels"),
														
 
															+                        "wavs": [
														
 
															+                            wandb.Audio(
														
 
															+                                audio[0, :audio_len],
														
 
															+                                sample_rate=self.sampling_rate,
														
 
															+                                caption="gt",
														
 
															+                            ),
														
 
															+                            wandb.Audio(
														
 
															+                                gen_audio[0, :audio_len],
														
 
															+                                sample_rate=self.sampling_rate,
														
 
															+                                caption="prediction",
														
 
															+                            ),
														
 
															+                        ],
														
 
															+                    },
														
 
															+                )
														
 
															+
														
 
															+            if isinstance(self.logger, TensorBoardLogger):
														
 
															+                self.logger.experiment.add_figure(
														
 
															+                    f"sample-{idx}/mels",
														
 
															+                    image_mels,
														
 
															+                    global_step=self.global_step,
														
 
															+                )
														
 
															+                self.logger.experiment.add_audio(
														
 
															+                    f"sample-{idx}/wavs/gt",
														
 
															+                    audio[0, :audio_len],
														
 
															+                    self.global_step,
														
 
															+                    sample_rate=self.sampling_rate,
														
 
															+                )
														
 
															+                self.logger.experiment.add_audio(
														
 
															+                    f"sample-{idx}/wavs/prediction",
														
 
															+                    gen_audio[0, :audio_len],
														
 
															+                    self.global_step,
														
 
															+                    sample_rate=self.sampling_rate,
														
 
															+                )
														
 
															+
														
 
															+            plt.close(image_mels)
														
--- a/fish_speech/models/vq_diffusion/unet1d.py
+++ b/fish_speech/models/vq_diffusion/unet1d.py
@@ -0,0 +1,198 @@
 
															+# Refer to https://github.com/huawei-noah/Speech-Backbones/blob/main/Grad-TTS/model/diffusion.py
														
 
															+
														
 
															+import math
														
 
															+
														
 
															+import torch
														
 
															+from einops import rearrange
														
 
															+from torch import nn
														
 
															+
														
 
															+
														
 
															+class Block(nn.Module):
														
 
															+    def __init__(self, dim, dim_out, groups=8):
														
 
															+        super().__init__()
														
 
															+        self.block = nn.Sequential(
														
 
															+            nn.Conv2d(dim, dim_out, 3, padding=1),
														
 
															+            nn.GroupNorm(groups, dim_out),
														
 
															+            nn.Mish(),
														
 
															+        )
														
 
															+
														
 
															+    def forward(self, x, mask):
														
 
															+        output = self.block(x * mask)
														
 
															+        return output * mask
														
 
															+
														
 
															+
														
 
															+class ResnetBlock(nn.Module):
														
 
															+    def __init__(self, dim, dim_out, time_emb_dim, groups=8):
														
 
															+        super().__init__()
														
 
															+        self.mlp = nn.Sequential(nn.Mish(), nn.Linear(time_emb_dim, dim_out))
														
 
															+
														
 
															+        self.block1 = Block(dim, dim_out, groups=groups)
														
 
															+        self.block2 = Block(dim_out, dim_out, groups=groups)
														
 
															+        if dim != dim_out:
														
 
															+            self.res_conv = nn.Conv2d(dim, dim_out, 1)
														
 
															+        else:
														
 
															+            self.res_conv = nn.Identity()
														
 
															+
														
 
															+    def forward(self, x, mask, time_emb):
														
 
															+        h = self.block1(x, mask)
														
 
															+        h += self.mlp(time_emb).unsqueeze(-1).unsqueeze(-1)
														
 
															+        h = self.block2(h, mask)
														
 
															+        output = h + self.res_conv(x * mask)
														
 
															+        return output
														
 
															+
														
 
															+
														
 
															+class LinearAttention(nn.Module):
														
 
															+    def __init__(self, dim, heads=4, dim_head=32, init_values=1e-5):
														
 
															+        super().__init__()
														
 
															+
														
 
															+        self.heads = heads
														
 
															+        hidden_dim = dim_head * heads
														
 
															+
														
 
															+        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
														
 
															+        self.to_out = nn.Conv2d(hidden_dim, dim, 1)
														
 
															+        self.gamma = nn.Parameter(torch.ones(dim) * init_values)
														
 
															+
														
 
															+    def forward(self, x):
														
 
															+        b, c, h, w = x.shape
														
 
															+        qkv = self.to_qkv(x)
														
 
															+        q, k, v = rearrange(
														
 
															+            qkv, "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3
														
 
															+        )
														
 
															+        k = k.softmax(dim=-1)
														
 
															+        context = torch.einsum("bhdn,bhen->bhde", k, v)
														
 
															+        out = torch.einsum("bhde,bhdn->bhen", context, q)
														
 
															+        out = rearrange(
														
 
															+            out, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=h, w=w
														
 
															+        )
														
 
															+        return self.to_out(out) * self.gamma.view(1, -1, 1, 1) + x
														
 
															+
														
 
															+
														
 
															+class SinusoidalPosEmb(nn.Module):
														
 
															+    def __init__(self, dim):
														
 
															+        super().__init__()
														
 
															+        self.dim = dim
														
 
															+
														
 
															+    def forward(self, x, scale=1000):
														
 
															+        device = x.device
														
 
															+        half_dim = self.dim // 2
														
 
															+        emb = math.log(10000) / (half_dim - 1)
														
 
															+        emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
														
 
															+        emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
														
 
															+        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
														
 
															+        return emb
														
 
															+
														
 
															+
														
 
															+class Unet1DDenoiser(nn.Module):
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        dim,
														
 
															+        dim_mults=(1, 2, 4),
														
 
															+        groups=8,
														
 
															+        pe_scale=1000,
														
 
															+    ):
														
 
															+        super().__init__()
														
 
															+        self.dim = dim
														
 
															+        self.dim_mults = dim_mults
														
 
															+        self.groups = groups
														
 
															+        self.pe_scale = pe_scale
														
 
															+
														
 
															+        self.time_pos_emb = SinusoidalPosEmb(dim)
														
 
															+        self.mlp = nn.Sequential(
														
 
															+            nn.Linear(dim, dim * 4), nn.Mish(), nn.Linear(dim * 4, dim)
														
 
															+        )
														
 
															+        self.downsample_rate = 2 ** (len(dim_mults) - 1)
														
 
															+
														
 
															+        dims = [2, *map(lambda m: dim * m, dim_mults)]
														
 
															+        in_out = list(zip(dims[:-1], dims[1:]))
														
 
															+        self.downs = nn.ModuleList([])
														
 
															+        self.ups = nn.ModuleList([])
														
 
															+        num_resolutions = len(in_out)
														
 
															+
														
 
															+        for ind, (dim_in, dim_out) in enumerate(in_out):
														
 
															+            is_last = ind >= (num_resolutions - 1)
														
 
															+            self.downs.append(
														
 
															+                nn.ModuleList(
														
 
															+                    [
														
 
															+                        ResnetBlock(dim_in, dim_out, time_emb_dim=dim),
														
 
															+                        ResnetBlock(dim_out, dim_out, time_emb_dim=dim),
														
 
															+                        LinearAttention(dim_out),
														
 
															+                        nn.Conv2d(dim_out, dim_out, 3, 2, 1)
														
 
															+                        if not is_last
														
 
															+                        else nn.Identity(),
														
 
															+                    ]
														
 
															+                )
														
 
															+            )
														
 
															+
														
 
															+        mid_dim = dims[-1]
														
 
															+        self.mid_block1 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim)
														
 
															+        self.mid_attn = LinearAttention(mid_dim)
														
 
															+        self.mid_block2 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim)
														
 
															+
														
 
															+        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
														
 
															+            self.ups.append(
														
 
															+                nn.ModuleList(
														
 
															+                    [
														
 
															+                        ResnetBlock(dim_out * 2, dim_in, time_emb_dim=dim),
														
 
															+                        ResnetBlock(dim_in, dim_in, time_emb_dim=dim),
														
 
															+                        LinearAttention(dim_in),
														
 
															+                        nn.ConvTranspose2d(dim_in, dim_in, 4, 2, 1),
														
 
															+                    ]
														
 
															+                )
														
 
															+            )
														
 
															+        self.final_block = Block(dim, dim)
														
 
															+        self.final_conv = nn.Conv2d(dim, 1, 1)
														
 
															+
														
 
															+    def forward(self, x, t, mask, condition):
														
 
															+        t = self.time_pos_emb(t, scale=self.pe_scale)
														
 
															+        t = self.mlp(t)
														
 
															+
														
 
															+        x = torch.stack([condition, x], 1)
														
 
															+        mask = mask.unsqueeze(1)
														
 
															+
														
 
															+        original_len = x.shape[3]
														
 
															+        if x.shape[3] % self.downsample_rate != 0:
														
 
															+            x = nn.functional.pad(
														
 
															+                x, (0, self.downsample_rate - x.shape[3] % self.downsample_rate)
														
 
															+            )
														
 
															+            mask = nn.functional.pad(
														
 
															+                mask, (0, self.downsample_rate - mask.shape[3] % self.downsample_rate)
														
 
															+            )
														
 
															+
														
 
															+        hiddens = []
														
 
															+        masks = [mask]
														
 
															+        for resnet1, resnet2, attn, downsample in self.downs:
														
 
															+            mask_down = masks[-1]
														
 
															+            x = resnet1(x, mask_down, t)
														
 
															+            x = resnet2(x, mask_down, t)
														
 
															+            x = attn(x)
														
 
															+            hiddens.append(x)
														
 
															+            x = downsample(x * mask_down)
														
 
															+            masks.append(mask_down[:, :, :, ::2])
														
 
															+
														
 
															+        masks = masks[:-1]
														
 
															+        mask_mid = masks[-1]
														
 
															+        x = self.mid_block1(x, mask_mid, t)
														
 
															+        x = self.mid_attn(x)
														
 
															+        x = self.mid_block2(x, mask_mid, t)
														
 
															+
														
 
															+        for resnet1, resnet2, attn, upsample in self.ups:
														
 
															+            mask_up = masks.pop()
														
 
															+            x = torch.cat((x, hiddens.pop()), dim=1)
														
 
															+            x = resnet1(x, mask_up, t)
														
 
															+            x = resnet2(x, mask_up, t)
														
 
															+            x = attn(x)
														
 
															+            x = upsample(x * mask_up)
														
 
															+
														
 
															+        x = self.final_block(x, mask)
														
 
															+        output = self.final_conv(x * mask)
														
 
															+
														
 
															+        output = (output * mask).squeeze(1)
														
 
															+        return output[:, :, :original_len]
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    model = Unet1DDenoiser(128)
														
 
															+    mel = torch.randn(1, 128, 99)
														
 
															+    mask = torch.ones(1, 1, 99)
														
 
															+
														
 
															+    print(model(mel, mask, torch.tensor([10], dtype=torch.long), mel).shape)
														
--- a/fish_speech/models/vqgan/lit_module.py
+++ b/fish_speech/models/vqgan/lit_module.py
@@ -233,9 +233,7 @@ class VQGAN(L.LightningModule):
 
															         audios = audios[:, None, :]
														
 
															         gt_mels = self.mel_transform(audios)
														
 
															-        assert (
														
 
															-            gt_mels.shape[2] == features.shape[1]
														
 
															-        ), f"Shapes do not match: {gt_mels.shape}, {features.shape}"
														
 
															+        gt_mels = gt_mels[:, :, : features.shape[1]]
														
 
															         fake_audios = self.generator.infer(features, feature_lengths, gt_mels)
														
 
															         posterior_audios = self.generator.reconstruct(gt_mels, feature_lengths)
														
--- a/fish_speech/models/vqgan/modules/encoders.py
+++ b/fish_speech/models/vqgan/modules/encoders.py
@@ -2,6 +2,7 @@ from typing import Optional
 
															 import torch
														
 
															 import torch.nn as nn
														
 
															+import torch.nn.functional as F
														
 
															 from vector_quantize_pytorch import VectorQuantize
														
 
															 from fish_speech.models.vqgan.modules.modules import WN
														
@@ -13,7 +14,7 @@ from fish_speech.models.vqgan.utils import sequence_mask
 
															 class TextEncoder(nn.Module):
														
 
															     def __init__(
														
 
															         self,
														
 
															-        n_vocab: int,
														
 
															+        in_channels: int,
														
 
															         out_channels: int,
														
 
															         hidden_channels: int,
														
 
															         hidden_channels_ffn: int,
														
@@ -23,11 +24,12 @@ class TextEncoder(nn.Module):
 
															         dropout: float,
														
 
															         gin_channels=0,
														
 
															         speaker_cond_layer=0,
														
 
															+        use_vae=True,
														
 
															     ):
														
 
															         """Text Encoder for VITS model.
														
 
															         Args:
														
 
															-            n_vocab (int): Number of characters for the embedding layer.
														
 
															+            in_channels (int): Number of characters for the embedding layer.
														
 
															             out_channels (int): Number of channels for the output.
														
 
															             hidden_channels (int): Number of channels for the hidden layers.
														
 
															             hidden_channels_ffn (int): Number of channels for the convolutional layers.
														
@@ -41,9 +43,7 @@ class TextEncoder(nn.Module):
 
															         self.out_channels = out_channels
														
 
															         self.hidden_channels = hidden_channels
														
 
															-        # self.emb = nn.Linear(n_vocab, hidden_channels)
														
 
															-        self.emb = nn.Linear(n_vocab, hidden_channels, 1)
														
 
															-        # nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
														
 
															+        self.proj_in = nn.Conv1d(in_channels, hidden_channels, 1)
														
 
															         self.encoder = RelativePositionTransformer(
														
 
															             in_channels=hidden_channels,
														
@@ -58,12 +58,15 @@ class TextEncoder(nn.Module):
 
															             gin_channels=gin_channels,
														
 
															             speaker_cond_layer=speaker_cond_layer,
														
 
															         )
														
 
															-        self.proj = nn.Linear(hidden_channels, out_channels * 2)
														
 
															+        self.proj_out = nn.Conv1d(
														
 
															+            hidden_channels, out_channels * 2 if use_vae else out_channels, 1
														
 
															+        )
														
 
															+        self.use_vae = use_vae
														
 
															     def forward(
														
 
															         self,
														
 
															         x: torch.Tensor,
														
 
															-        x_lengths: torch.Tensor,
														
 
															+        x_mask: torch.Tensor,
														
 
															         g: torch.Tensor = None,
														
 
															         noise_scale: float = 1,
														
 
															     ):
														
@@ -72,14 +75,14 @@ class TextEncoder(nn.Module):
 
															             - x: :math:`[B, T]`
														
 
															             - x_length: :math:`[B]`
														
 
															         """
														
 
															-        # x = self.emb(x).mT * math.sqrt(self.hidden_channels)  # [b, h, t]
														
 
															-        x = self.emb(x).mT  # * math.sqrt(self.hidden_channels)  # [b, h, t]
														
 
															-        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
														
 
															-
														
 
															+        x = self.proj_in(x) * x_mask
														
 
															         x = self.encoder(x, x_mask, g=g)
														
 
															-        stats = self.proj(x.mT).mT * x_mask
														
 
															+        x = self.proj_out(x) * x_mask
														
 
															-        m, logs = torch.split(stats, self.out_channels, dim=1)
														
 
															+        if self.use_vae is False:
														
 
															+            return x
														
 
															+
														
 
															+        m, logs = torch.split(x, self.out_channels, dim=1)
														
 
															         z = m + torch.randn_like(m) * torch.exp(logs) * x_mask * noise_scale
														
 
															         return z, m, logs, x, x_mask
														
@@ -113,7 +116,7 @@ class PosteriorEncoder(nn.Module):
 
															         super().__init__()
														
 
															         self.out_channels = out_channels
														
 
															-        self.pre = nn.Linear(in_channels, hidden_channels)
														
 
															+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
														
 
															         self.enc = WN(
														
 
															             hidden_channels,
														
 
															             kernel_size,
														
@@ -121,7 +124,7 @@ class PosteriorEncoder(nn.Module):
 
															             n_layers,
														
 
															             gin_channels=gin_channels,
														
 
															         )
														
 
															-        self.proj = nn.Linear(hidden_channels, out_channels * 2)
														
 
															+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
														
 
															     def forward(
														
 
															         self,
														
@@ -137,9 +140,9 @@ class PosteriorEncoder(nn.Module):
 
															             - g: :math:`[B, C, 1]`
														
 
															         """
														
 
															         x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
														
 
															-        x = self.pre(x.mT).mT * x_mask
														
 
															+        x = self.pre(x) * x_mask
														
 
															         x = self.enc(x, x_mask, g=g)
														
 
															-        stats = self.proj(x.mT).mT * x_mask
														
 
															+        stats = self.proj(x) * x_mask
														
 
															         m, logs = torch.split(stats, self.out_channels, dim=1)
														
 
															         z = m + torch.randn_like(m) * torch.exp(logs) * x_mask * noise_scale
														
 
															         return z, m, logs, x_mask
														
@@ -180,22 +183,19 @@ class SpeakerEncoder(nn.Module):
 
															         )
														
 
															         self.out_proj = nn.Linear(hidden_channels, out_channels)
														
 
															-    def forward(self, mels, mel_lengths: torch.Tensor):
														
 
															+    def forward(self, mels, mel_masks: torch.Tensor):
														
 
															         """
														
 
															         Shapes:
														
 
															             - x: :math:`[B, C, T]`
														
 
															             - x_lengths: :math:`[B, 1]`
														
 
															         """
														
 
															-        x_mask = torch.unsqueeze(sequence_mask(mel_lengths, mels.size(2)), 1).to(
														
 
															-            mels.dtype
														
 
															-        )
														
 
															-        x = self.in_proj(mels) * x_mask
														
 
															-        x = self.encoder(x, x_mask)
														
 
															+        x = self.in_proj(mels) * mel_masks
														
 
															+        x = self.encoder(x, mel_masks)
														
 
															         # Avg Pooling
														
 
															-        x = x * x_mask
														
 
															-        x = torch.sum(x, dim=2) / torch.sum(x_mask, dim=2)
														
 
															+        x = x * mel_masks
														
 
															+        x = torch.sum(x, dim=2) / torch.sum(mel_masks, dim=2)
														
 
															         x = self.out_proj(x)[..., None]
														
 
															         return x
														
@@ -219,7 +219,7 @@ class VQEncoder(nn.Module):
 
															             kmeans_init=False,
														
 
															             channel_last=False,
														
 
															         )
														
 
															-
														
 
															+        self.downsample = downsample
														
 
															         self.conv_in = nn.Conv1d(
														
 
															             in_channels, vq_channels, kernel_size=downsample, stride=downsample
														
 
															         )
														
@@ -253,10 +253,17 @@ class VQEncoder(nn.Module):
 
															         self.vq.load_state_dict(state_dict, strict=True)
														
 
															-    def forward(self, x):
														
 
															-        # x: [B, T, C]
														
 
															-        x = self.conv_in(x.mT)
														
 
															+    def forward(self, x, x_mask):
														
 
															+        # x: [B, C, T], x_mask: [B, 1, T]
														
 
															+        x_len = x.shape[2]
														
 
															+
														
 
															+        if x_len % self.downsample != 0:
														
 
															+            x = F.pad(x, (0, self.downsample - x_len % self.downsample))
														
 
															+            x_mask = F.pad(x_mask, (0, self.downsample - x_len % self.downsample))
														
 
															+
														
 
															+        x = self.conv_in(x)
														
 
															         q, _, loss = self.vq(x)
														
 
															-        x = self.conv_out(q).mT
														
 
															+        x = self.conv_out(q) * x_mask
														
 
															+        x = x[:, :, :x_len]
														
 
															         return x, loss
														
--- a/fish_speech/models/vqgan/modules/models.py
+++ b/fish_speech/models/vqgan/modules/models.py
@@ -104,12 +104,12 @@ class SynthesizerTrn(nn.Module):
 
															             gin_channels=gin_channels,
														
 
															         )
														
 
															-    def forward(self, x, x_lengths, y):
														
 
															-        g = self.enc_spk(y, x_lengths)
														
 
															+    def forward(self, x, x_lengths, specs):
														
 
															+        g = self.enc_spk(specs, x_lengths)
														
 
															         x, vq_loss = self.vq(x)
														
 
															         _, m_p, logs_p, _, x_mask = self.enc_p(x, x_lengths, g=g)
														
 
															-        z_q, m_q, logs_q, y_mask = self.enc_q(y, x_lengths, g=g)
														
 
															+        z_q, m_q, logs_q, y_mask = self.enc_q(specs, x_lengths, g=g)
														
 
															         z_p = self.flow(z_q, y_mask, g=g, reverse=False)
														
 
															         z_slice, ids_slice = rand_slice_segments(z_q, x_lengths, self.segment_size)
														
@@ -126,8 +126,8 @@ class SynthesizerTrn(nn.Module):
 
															             vq_loss,
														
 
															         )
														
 
															-    def infer(self, x, x_lengths, y, max_len=None, noise_scale=0.35):
														
 
															-        g = self.enc_spk(y, x_lengths)
														
 
															+    def infer(self, x, x_lengths, specs, max_len=None, noise_scale=0.35):
														
 
															+        g = self.enc_spk(specs, x_lengths)
														
 
															         x, vq_loss = self.vq(x)
														
 
															         z_p, m_p, logs_p, h_text, x_mask = self.enc_p(
														
 
															             x, x_lengths, g=g, noise_scale=noise_scale
														
--- a/fish_speech/models/vqgan/modules/modules.py
+++ b/fish_speech/models/vqgan/modules/modules.py
@@ -32,7 +32,7 @@ class WN(nn.Module):
 
															         self.drop = nn.Dropout(p_dropout)
														
 
															         if gin_channels != 0:
														
 
															-            cond_layer = nn.Linear(gin_channels, 2 * hidden_channels * n_layers)
														
 
															+            cond_layer = nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
														
 
															             self.cond_layer = weight_norm(cond_layer, name="weight")
														
 
															         for i in range(n_layers):
														
@@ -52,7 +52,7 @@ class WN(nn.Module):
 
															             res_skip_channels = (
														
 
															                 2 * hidden_channels if i < n_layers - 1 else hidden_channels
														
 
															             )
														
 
															-            res_skip_layer = nn.Linear(hidden_channels, res_skip_channels)
														
 
															+            res_skip_layer = nn.Conv1d(hidden_channels, res_skip_channels, 1)
														
 
															             res_skip_layer = weight_norm(res_skip_layer, name="weight")
														
 
															             self.res_skip_layers.append(res_skip_layer)
														
@@ -61,7 +61,7 @@ class WN(nn.Module):
 
															         n_channels_tensor = torch.IntTensor([self.hidden_channels])
														
 
															         if g is not None:
														
 
															-            g = self.cond_layer(g.mT).mT
														
 
															+            g = self.cond_layer(g)
														
 
															         for i in range(self.n_layers):
														
 
															             x_in = self.in_layers[i](x)
														
@@ -74,7 +74,7 @@ class WN(nn.Module):
 
															             acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
														
 
															             acts = self.drop(acts)
														
 
															-            res_skip_acts = self.res_skip_layers[i](acts.mT).mT
														
 
															+            res_skip_acts = self.res_skip_layers[i](acts)
														
 
															             if i < self.n_layers - 1:
														
 
															                 res_acts = res_skip_acts[:, : self.hidden_channels, :]
														
 
															                 x = (x + res_acts) * x_mask
														
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
 
															     "vector-quantize-pytorch>=1.10.0",
														
 
															     "rich>=13.5.3",
														
 
															     "gradio>=4.0.0",
														
 
															+    "diffusers@git+https://github.com/huggingface/diffusers",
														
 
															     "cn2an",
														
 
															     "pypinyin",
														
 
															     "jieba",
														
--- a/tools/vqgan/migrate_from_vits.py
+++ b/tools/vqgan/migrate_from_vits.py
@@ -27,22 +27,23 @@ def main(cfg: DictConfig):
 
															     # Decoder
														
 
															     generator_state = {
														
 
															-        k[4:]: v for k, v in generator_weights.items() if k.startswith("dec.")
														
 
															+        k[4:]: v
														
 
															+        for k, v in generator_weights.items()
														
 
															+        if k.startswith("dec.") and not k.startswith("dec.cond.")
														
 
															     }
														
 
															     logger.info(f"Found {len(generator_state)} HiFiGAN weights, restoring...")
														
 
															-    model.generator.load_state_dict(generator_state, strict=True)
														
 
															-    logger.info("Generator weights restored.")
														
 
															+    r = model.generator.dec.load_state_dict(generator_state, strict=False)
														
 
															+    logger.info(f"Generator weights restored. {r}")
														
 
															     # Posterior Encoder
														
 
															-    encoder_state = {
														
 
															-        k[6:]: v
														
 
															-        for k, v in generator_weights.items()
														
 
															-        if k.startswith("enc_q.")
														
 
															-        if not k.startswith("enc_q.proj.")
														
 
															-    }
														
 
															-    logger.info(f"Found {len(encoder_state)} posterior encoder weights, restoring...")
														
 
															-    x = model.posterior_encoder.load_state_dict(encoder_state, strict=False)
														
 
															-    logger.info(f"Posterior encoder weights restored. {x}")
														
 
															+    # encoder_state = {
														
 
															+    #     k[6:]: v
														
 
															+    #     for k, v in generator_weights.items()
														
 
															+    #     if k.startswith("enc_q.") and not k.startswith("enc_q.proj.")
														
 
															+    # }
														
 
															+    # logger.info(f"Found {len(encoder_state)} posterior encoder weights, restoring...")
														
 
															+    # x = model.generator.enc_q.load_state_dict(encoder_state, strict=False)
														
 
															+    # logger.info(f"Posterior encoder weights restored. {x}")
														
 
															     # Flow
														
 
															     # flow_state = {
														
@@ -61,37 +62,6 @@ def main(cfg: DictConfig):
 
															     model.discriminator.load_state_dict(discriminator_weights, strict=True)
														
 
															     logger.info("Discriminator weights restored.")
														
 
															-    # Restore kmeans
														
 
															-    logger.info("Reset vq projection layer to mimic avg pooling")
														
 
															-    torch.nn.init.normal_(
														
 
															-        model.semantic_encoder.in_proj.weight,
														
 
															-        mean=1
														
 
															-        / (
														
 
															-            model.semantic_encoder.in_proj.weight.shape[0]
														
 
															-            * model.semantic_encoder.in_proj.weight.shape[-1]
														
 
															-        ),
														
 
															-        std=1e-2,
														
 
															-    )
														
 
															-    model.semantic_encoder.in_proj.bias.data.zero_()
														
 
															-
														
 
															-    kmeans_ckpt = "results/hubert-vq-pretrain/kmeans.pt"
														
 
															-    kmeans_ckpt = torch.load(kmeans_ckpt, map_location="cpu")
														
 
															-
														
 
															-    centroids = kmeans_ckpt["centroids"][0]
														
 
															-    bins = kmeans_ckpt["bins"][0]
														
 
															-    logger.info(
														
 
															-        f"Restoring kmeans centroids with shape {centroids.shape} and bins {bins.shape}"
														
 
															-    )
														
 
															-
														
 
															-    state_dict = {
														
 
															-        "_codebook.inited": torch.Tensor([True]),
														
 
															-        "_codebook.cluster_size": bins,
														
 
															-        "_codebook.embed": centroids,
														
 
															-        "_codebook.embed_avg": centroids.clone(),
														
 
															-    }
														
 
															-
														
 
															-    model.semantic_encoder.vq.load_state_dict(state_dict, strict=True)
														
 
															-
														
 
															     torch.save(model.state_dict(), cfg.ckpt_path)
														
 
															     logger.info("Done")