2 лет назад · 223d0b8f81
--- a/fish_speech/configs/vqgan_pretrain.yaml
+++ b/fish_speech/configs/vqgan_pretrain.yaml
@@ -2,7 +2,7 @@ defaults:
 
															   - base
														
 
															   - _self_
														
 
															-project: vq_reflow_shallow_group_fsq_8x1024_wavenet
														
 
															+project: vq-group-fsq-8x1024-wn-20x768-cond
														
 
															 # Lightning Trainer
														
 
															 trainer:
														
@@ -10,8 +10,8 @@ trainer:
 
															   devices: 1
														
 
															   precision: bf16-mixed
														
 
															   max_steps: 1_000_000
														
 
															-  val_check_interval: 1000
														
 
															-  strategy: ddp
														
 
															+  val_check_interval: 5000
														
 
															+  strategy: ddp_find_unused_parameters_true
														
 
															 sample_rate: 44100
														
 
															 hop_length: 512
														
@@ -38,8 +38,8 @@ data:
 
															   train_dataset: ${train_dataset}
														
 
															   val_dataset: ${val_dataset}
														
 
															   num_workers: 4
														
 
															-  batch_size: 128
														
 
															-  val_batch_size: 4
														
 
															+  batch_size: 64
														
 
															+  val_batch_size: 64
														
 
															 # Model Configuration
														
 
															 model:
														
@@ -58,33 +58,24 @@ model:
 
															   encoder:
														
 
															     _target_: fish_speech.models.vqgan.modules.wavenet.WaveNet
														
 
															     input_channels: ${num_mels}
														
 
															-    residual_channels: 512
														
 
															+    residual_channels: 768
														
 
															     residual_layers: 20
														
 
															     dilation_cycle: 4
														
 
															   quantizer:
														
 
															     _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
														
 
															-    input_dim: 512
														
 
															-    n_codebooks: 8
														
 
															-    n_groups: 1
														
 
															+    input_dim: 768
														
 
															+    n_codebooks: 1
														
 
															+    n_groups: 8
														
 
															     levels: [8, 5, 5, 5]
														
 
															-  
														
 
															-  decoder:
														
 
															-    _target_: fish_speech.models.vqgan.modules.wavenet.WaveNet
														
 
															-    output_channels: ${num_mels}
														
 
															-    residual_channels: 512
														
 
															-    residual_layers: 20
														
 
															-    dilation_cycle: 4
														
 
															-  reflow:
														
 
															+  decoder:
														
 
															     _target_: fish_speech.models.vqgan.modules.wavenet.WaveNet
														
 
															-    input_channels: ${num_mels}
														
 
															     output_channels: ${num_mels}
														
 
															-    residual_channels: 512
														
 
															-    condition_channels: 512
														
 
															+    residual_channels: 768
														
 
															     residual_layers: 20
														
 
															     dilation_cycle: 4
														
 
															-    is_diffusion: true
														
 
															+    condition_channels: 768
														
 
															   vocoder:
														
 
															     _target_: fish_speech.models.vqgan.modules.firefly.FireflyBase
														
@@ -129,4 +120,3 @@ callbacks:
 
															       - encoder
														
 
															       - decoder
														
 
															       - quantizer
														
 
															-      - reflow
														
--- a/fish_speech/datasets/vqgan.py
+++ b/fish_speech/datasets/vqgan.py
@@ -122,7 +122,7 @@ class VQGANDataModule(LightningDataModule):
 
															     def val_dataloader(self):
														
 
															         return DataLoader(
														
 
															             self.val_dataset,
														
 
															-            batch_size=self.batch_size,
														
 
															+            batch_size=self.val_batch_size,
														
 
															             collate_fn=VQGANCollator(),
														
 
															             num_workers=self.num_workers,
														
 
															         )
														
--- a/fish_speech/models/vqgan/lit_module.py
+++ b/fish_speech/models/vqgan/lit_module.py
@@ -1,6 +1,5 @@
 
															-import itertools
														
 
															-from dataclasses import dataclass
														
 
															-from typing import Any, Callable, Literal, Optional
														
 
															+import math
														
 
															+from typing import Any, Callable
														
 
															 import lightning as L
														
 
															 import torch
														
@@ -22,7 +21,7 @@ class VQGAN(L.LightningModule):
 
															         encoder: WaveNet,
														
 
															         quantizer: nn.Module,
														
 
															         decoder: WaveNet,
														
 
															-        reflow: nn.Module,
														
 
															+        # reflow: nn.Module,
														
 
															         vocoder: nn.Module,
														
 
															         mel_transform: nn.Module,
														
 
															         weight_reflow: float = 1.0,
														
@@ -44,7 +43,7 @@ class VQGAN(L.LightningModule):
 
															         self.quantizer = quantizer
														
 
															         self.decoder = decoder
														
 
															         self.vocoder = vocoder
														
 
															-        self.reflow = reflow
														
 
															+        # self.reflow = reflow
														
 
															         self.mel_transform = mel_transform
														
 
															         # Freeze vocoder
														
@@ -122,51 +121,21 @@ class VQGAN(L.LightningModule):
 
															         vq_recon_features = vq_result.z * mel_masks_float_conv
														
 
															         # VQ Decode
														
 
															-        gen_mel = self.decoder(vq_recon_features) * mel_masks_float_conv
														
 
															+        gen_mel = (
														
 
															+            self.decoder(
														
 
															+                torch.randn_like(vq_recon_features) * mel_masks_float_conv,
														
 
															+                condition=vq_recon_features,
														
 
															+            )
														
 
															+            * mel_masks_float_conv
														
 
															+        )
														
 
															         # Mel Loss
														
 
															         loss_mel = (gen_mel - gt_mels).abs().mean(
														
 
															             dim=1, keepdim=True
														
 
															         ).sum() / mel_masks_float_conv.sum()
														
 
															-        # Reflow, given x_1_aux, we want to reconstruct x_1
														
 
															-        x_1 = self.norm_spec(gt_mels)
														
 
															-        t = torch.rand(gt_mels.shape[0], device=gt_mels.device, dtype=torch.float32)
														
 
															-        t = torch.clamp(t, 1e-6, 1 - 1e-6)  # Avoid 0 and 1
														
 
															-        x_0 = torch.randn_like(x_1)
														
 
															-
														
 
															-        # X_t = t * X_1 + (1 - t) * X_0
														
 
															-        x_t = x_0 + t[:, None, None] * (x_1 - x_0)
														
 
															-
														
 
															-        v_pred = self.reflow(
														
 
															-            x_t,
														
 
															-            1000 * t,
														
 
															-            vq_recon_features.detach(),  # Stop gradients, avoid reflow to destroy the VQ
														
 
															-        )
														
 
															-
														
 
															-        # Log L2 loss with
														
 
															-        with torch.autocast(device_type=gt_mels.device.type, dtype=torch.float32):
														
 
															-            weights = (
														
 
															-                0.398942 / t / (1 - t) * torch.exp(-0.5 * torch.log(t / (1 - t)) ** 2)
														
 
															-            )
														
 
															-            assert (
														
 
															-                torch.isnan(weights).any() == False
														
 
															-                and torch.isinf(weights).any() == False
														
 
															-            ), "Found NaN or Inf in weights."
														
 
															-
														
 
															-            loss_reflow = weights[:, None, None] * F.mse_loss(
														
 
															-                x_1 - x_0, v_pred, reduction="none"
														
 
															-            )
														
 
															-            loss_reflow = (loss_reflow * mel_masks_float_conv).mean(
														
 
															-                dim=1
														
 
															-            ).sum() / mel_masks_float_conv.sum()
														
 
															-
														
 
															         # Total loss
														
 
															-        loss = (
														
 
															-            self.weight_vq * loss_vq
														
 
															-            + self.weight_mel * loss_mel
														
 
															-            + self.weight_reflow * loss_reflow
														
 
															-        )
														
 
															+        loss = self.weight_vq * loss_vq + self.weight_mel * loss_mel
														
 
															         # Log losses
														
 
															         self.log(
														
@@ -193,14 +162,6 @@ class VQGAN(L.LightningModule):
 
															             prog_bar=False,
														
 
															             logger=True,
														
 
															         )
														
 
															-        self.log(
														
 
															-            "train/generator/loss_reflow",
														
 
															-            loss_reflow,
														
 
															-            on_step=True,
														
 
															-            on_epoch=False,
														
 
															-            prog_bar=False,
														
 
															-            logger=True,
														
 
															-        )
														
 
															         return loss
														
@@ -223,7 +184,13 @@ class VQGAN(L.LightningModule):
 
															         vq_recon_features = self.quantizer(encoded_features).z * mel_masks_float_conv
														
 
															         # VQ Decode
														
 
															-        gen_aux_mels = self.decoder(vq_recon_features) * mel_masks_float_conv
														
 
															+        gen_aux_mels = (
														
 
															+            self.decoder(
														
 
															+                torch.randn_like(vq_recon_features) * mel_masks_float_conv,
														
 
															+                condition=vq_recon_features,
														
 
															+            )
														
 
															+            * mel_masks_float_conv
														
 
															+        )
														
 
															         loss_mel = (gen_aux_mels - gt_mels).abs().mean(
														
 
															             dim=1, keepdim=True
														
 
															         ).sum() / mel_masks_float_conv.sum()
														
@@ -238,45 +205,8 @@ class VQGAN(L.LightningModule):
 
															             sync_dist=True,
														
 
															         )
														
 
															-        # Reflow inference
														
 
															-        t_start = 0.0
														
 
															-
														
 
															-        x_1 = self.norm_spec(gen_aux_mels)
														
 
															-        x_0 = torch.randn_like(x_1)
														
 
															-        gen_reflow_mels = (1 - t_start) * x_0 + t_start * x_1
														
 
															-
														
 
															-        t = torch.zeros(gt_mels.shape[0], device=gt_mels.device)
														
 
															-        dt = (1.0 - t_start) / self.reflow_inference_steps
														
 
															-
														
 
															-        for _ in range(self.reflow_inference_steps):
														
 
															-            gen_reflow_mels += (
														
 
															-                self.reflow(
														
 
															-                    gen_reflow_mels,
														
 
															-                    1000 * t,
														
 
															-                    vq_recon_features,
														
 
															-                )
														
 
															-                * dt
														
 
															-            )
														
 
															-            t += dt
														
 
															-
														
 
															-        gen_reflow_mels = self.denorm_spec(gen_reflow_mels) * mel_masks_float_conv
														
 
															-        loss_reflow_mel = (gen_reflow_mels - gt_mels).abs().mean(
														
 
															-            dim=1, keepdim=True
														
 
															-        ).sum() / mel_masks_float_conv.sum()
														
 
															-
														
 
															-        self.log(
														
 
															-            "val/loss_reflow_mel",
														
 
															-            loss_reflow_mel,
														
 
															-            on_step=False,
														
 
															-            on_epoch=True,
														
 
															-            prog_bar=False,
														
 
															-            logger=True,
														
 
															-            sync_dist=True,
														
 
															-        )
														
 
															-
														
 
															         recon_audios = self.vocoder(gt_mels)
														
 
															         gen_aux_audios = self.vocoder(gen_aux_mels)
														
 
															-        gen_reflow_audios = self.vocoder(gen_reflow_mels)
														
 
															         # only log the first batch
														
 
															         if batch_idx != 0:
														
@@ -285,36 +215,33 @@ class VQGAN(L.LightningModule):
 
															         for idx, (
														
 
															             gt_mel,
														
 
															             gen_aux_mel,
														
 
															-            gen_reflow_mel,
														
 
															             audio,
														
 
															             gen_aux_audio,
														
 
															-            gen_reflow_audio,
														
 
															             recon_audio,
														
 
															             audio_len,
														
 
															         ) in enumerate(
														
 
															             zip(
														
 
															                 gt_mels,
														
 
															                 gen_aux_mels,
														
 
															-                gen_reflow_mels,
														
 
															-                audios.float(),
														
 
															-                gen_aux_audios.float(),
														
 
															-                gen_reflow_audios.float(),
														
 
															-                recon_audios.float(),
														
 
															+                audios.cpu().float(),
														
 
															+                gen_aux_audios.cpu().float(),
														
 
															+                recon_audios.cpu().float(),
														
 
															                 audio_lengths,
														
 
															             )
														
 
															         ):
														
 
															+            if idx > 4:
														
 
															+                break
														
 
															+
														
 
															             mel_len = audio_len // self.mel_transform.hop_length
														
 
															             image_mels = plot_mel(
														
 
															                 [
														
 
															                     gt_mel[:, :mel_len],
														
 
															                     gen_aux_mel[:, :mel_len],
														
 
															-                    gen_reflow_mel[:, :mel_len],
														
 
															                 ],
														
 
															                 [
														
 
															                     "Ground-Truth",
														
 
															                     "Auxiliary",
														
 
															-                    "Reflow",
														
 
															                 ],
														
 
															             )
														
@@ -333,11 +260,6 @@ class VQGAN(L.LightningModule):
 
															                                 sample_rate=self.sampling_rate,
														
 
															                                 caption="aux",
														
 
															                             ),
														
 
															-                            wandb.Audio(
														
 
															-                                gen_reflow_audio[0, :audio_len],
														
 
															-                                sample_rate=self.sampling_rate,
														
 
															-                                caption="reflow",
														
 
															-                            ),
														
 
															                             wandb.Audio(
														
 
															                                 recon_audio[0, :audio_len],
														
 
															                                 sample_rate=self.sampling_rate,
														
@@ -365,12 +287,6 @@ class VQGAN(L.LightningModule):
 
															                     self.global_step,
														
 
															                     sample_rate=self.sampling_rate,
														
 
															                 )
														
 
															-                self.logger.experiment.add_audio(
														
 
															-                    f"sample-{idx}/wavs/reflow",
														
 
															-                    gen_reflow_audio[0, :audio_len],
														
 
															-                    self.global_step,
														
 
															-                    sample_rate=self.sampling_rate,
														
 
															-                )
														
 
															                 self.logger.experiment.add_audio(
														
 
															                     f"sample-{idx}/wavs/recon",
														
 
															                     recon_audio[0, :audio_len],
														
@@ -379,3 +295,37 @@ class VQGAN(L.LightningModule):
 
															                 )
														
 
															             plt.close(image_mels)
														
 
															+
														
 
															+    def encode(self, audios, audio_lengths):
														
 
															+        audios = audios.float()
														
 
															+
														
 
															+        gt_mels = self.mel_transform(audios)
														
 
															+        mel_lengths = audio_lengths // self.mel_transform.hop_length
														
 
															+        mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2])
														
 
															+        mel_masks_float_conv = mel_masks[:, None, :].float()
														
 
															+        gt_mels = gt_mels * mel_masks_float_conv
														
 
															+
														
 
															+        # Encode
														
 
															+        encoded_features = self.encoder(gt_mels) * mel_masks_float_conv
														
 
															+        feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor)
														
 
															+
														
 
															+        return self.quantizer.encode(encoded_features), feature_lengths
														
 
															+
														
 
															+    def decode(self, indices, feature_lengths, return_audios=False):
														
 
															+        factor = math.prod(self.quantizer.downsample_factor)
														
 
															+        mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor)
														
 
															+        mel_masks_float_conv = mel_masks[:, None, :].float()
														
 
															+
														
 
															+        z = self.quantizer.decode(indices) * mel_masks_float_conv
														
 
															+        gen_mel = (
														
 
															+            self.decoder(
														
 
															+                torch.randn_like(z) * mel_masks_float_conv,
														
 
															+                condition=z,
														
 
															+            )
														
 
															+            * mel_masks_float_conv
														
 
															+        )
														
 
															+
														
 
															+        if return_audios:
														
 
															+            return self.vocoder(gen_mel)
														
 
															+
														
 
															+        return gen_mel
														
--- a/fish_speech/models/vqgan/modules/fsq.py
+++ b/fish_speech/models/vqgan/modules/fsq.py
@@ -1,12 +1,9 @@
 
															 from dataclasses import dataclass
														
 
															-from typing import Union
														
 
															-import numpy as np
														
 
															 import torch
														
 
															 import torch.nn as nn
														
 
															 import torch.nn.functional as F
														
 
															 from einops import rearrange
														
 
															-from torch.nn.utils import weight_norm
														
 
															 from vector_quantize_pytorch import GroupedResidualFSQ
														
 
															 from .firefly import ConvNeXtBlock
														
@@ -106,10 +103,17 @@ class DownsampleFiniteScalarQuantize(nn.Module):
 
															         return result
														
 
															-    # def from_codes(self, codes: torch.Tensor):
														
 
															-    #     z_q, z_p, codes = self.residual_fsq.get_output_from_indices(codes)
														
 
															-    #     z_q = self.upsample(z_q)
														
 
															-    #     return z_q, z_p, codes
														
 
															+    def encode(self, z):
														
 
															+        z = self.downsample(z)
														
 
															+        _, indices = self.residual_fsq(z.mT)
														
 
															+        indices = rearrange(indices, "g b l r -> b (g r) l")
														
 
															+        return indices
														
 
															+
														
 
															+    def decode(self, indices: torch.Tensor):
														
 
															+        indices = rearrange(indices, "b (g r) l -> g b l r", g=self.residual_fsq.groups)
														
 
															+        z_q = self.residual_fsq.get_output_from_indices(indices)
														
 
															+        z_q = self.upsample(z_q.mT)
														
 
															+        return z_q
														
 
															     # def from_latents(self, latents: torch.Tensor):
														
 
															     #     z_q, z_p, codes = super().from_latents(latents)
														
--- a/fish_speech/models/vqgan/modules/wavenet.py
+++ b/fish_speech/models/vqgan/modules/wavenet.py
@@ -89,7 +89,6 @@ class ResidualBlock(nn.Module):
 
															         residual_channels,
														
 
															         use_linear_bias=False,
														
 
															         dilation=1,
														
 
															-        has_condition=True,
														
 
															         condition_channels=None,
														
 
															     ):
														
 
															         super(ResidualBlock, self).__init__()
														
@@ -102,7 +101,7 @@ class ResidualBlock(nn.Module):
 
															             dilation=dilation,
														
 
															         )
														
 
															-        if has_condition:
														
 
															+        if condition_channels is not None:
														
 
															             self.diffusion_projection = LinearNorm(
														
 
															                 residual_channels, residual_channels, use_linear_bias
														
 
															             )
														
@@ -159,6 +158,8 @@ class WaveNet(nn.Module):
 
															         if input_channels is None:
														
 
															             input_channels = residual_channels
														
 
															+        self.input_channels = input_channels
														
 
															+
														
 
															         # Residual layers
														
 
															         self.residual_layers = nn.ModuleList(
														
 
															             [
														
@@ -166,7 +167,6 @@ class WaveNet(nn.Module):
 
															                     residual_channels=residual_channels,
														
 
															                     use_linear_bias=False,
														
 
															                     dilation=2 ** (i % dilation_cycle) if dilation_cycle else 1,
														
 
															-                    has_condition=is_diffusion,
														
 
															                     condition_channels=condition_channels,
														
 
															                 )
														
 
															                 for i in range(residual_layers)
														
--- a/tools/vqgan/extract_vq.py
+++ b/tools/vqgan/extract_vq.py
@@ -11,14 +11,12 @@ import click
 
															 import numpy as np
														
 
															 import torch
														
 
															 import torchaudio
														
 
															-from einops import rearrange
														
 
															 from hydra import compose, initialize
														
 
															 from hydra.utils import instantiate
														
 
															 from lightning import LightningModule
														
 
															 from loguru import logger
														
 
															 from omegaconf import OmegaConf
														
 
															-from fish_speech.models.vqgan.utils import sequence_mask
														
 
															 from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist
														
 
															 # register eval resolver
														
@@ -57,7 +55,7 @@ def get_model(
 
															     if "state_dict" in state_dict:
														
 
															         state_dict = state_dict["state_dict"]
														
 
															-    model.load_state_dict(state_dict, strict=True)
														
 
															+    model.load_state_dict(state_dict, strict=False)
														
 
															     model.eval()
														
 
															     model.cuda()
														
@@ -90,8 +88,7 @@ def process_batch(files: list[Path], model) -> float:
 
															     # Calculate lengths
														
 
															     with torch.no_grad():
														
 
															-        out = model.encode(audios, audio_lengths)
														
 
															-        indices, feature_lengths = out.indices, out.feature_lengths
														
 
															+        indices, feature_lengths = model.encode(audios, audio_lengths)
														
 
															     # Save to disk
														
 
															     outputs = indices.cpu().numpy()
														
--- a/tools/vqgan/inference.py
+++ b/tools/vqgan/inference.py
@@ -26,14 +26,18 @@ OmegaConf.register_new_resolver("eval", eval)
 
															 @click.option(
														
 
															     "--input-path",
														
 
															     "-i",
														
 
															-    default="data/Genshin/Chinese/派蒙/vo_WYLQ103_10_paimon_04.wav",
														
 
															+    default="data/sft/Rail_ZH/三月七/1fe0cc6fc3fe3e6d.wav",
														
 
															     type=click.Path(exists=True, path_type=Path),
														
 
															 )
														
 
															 @click.option(
														
 
															     "--output-path", "-o", default="fake.wav", type=click.Path(path_type=Path)
														
 
															 )
														
 
															 @click.option("--config-name", "-cfg", default="vqgan_pretrain")
														
 
															-@click.option("--checkpoint-path", "-ckpt", default="checkpoints/vqgan-v1.pth")
														
 
															+@click.option(
														
 
															+    "--checkpoint-path",
														
 
															+    "-ckpt",
														
 
															+    default="results/vq-group-fsq-8x1024-wn-20x512-cond-e009/checkpoints/step_000355000.ckpt",
														
 
															+)
														
 
															 def main(input_path, output_path, config_name, checkpoint_path):
														
 
															     with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
														
 
															         cfg = compose(config_name=config_name)
														
@@ -45,7 +49,7 @@ def main(input_path, output_path, config_name, checkpoint_path):
 
															     )
														
 
															     if "state_dict" in state_dict:
														
 
															         state_dict = state_dict["state_dict"]
														
 
															-    model.load_state_dict(state_dict, strict=True)
														
 
															+    model.load_state_dict(state_dict, strict=False)
														
 
															     model.eval()
														
 
															     model.cuda()
														
 
															     logger.info("Restored model from checkpoint")
														
@@ -67,8 +71,7 @@ def main(input_path, output_path, config_name, checkpoint_path):
 
															         audio_lengths = torch.tensor(
														
 
															             [audios.shape[2]], device=model.device, dtype=torch.long
														
 
															         )
														
 
															-        encoded = model.encode(audios, audio_lengths)
														
 
															-        indices = encoded.indices[0]
														
 
															+        indices = model.encode(audios, audio_lengths)[0][0]
														
 
															         logger.info(f"Generated indices of shape {indices.shape}")
														
@@ -82,12 +85,15 @@ def main(input_path, output_path, config_name, checkpoint_path):
 
															     else:
														
 
															         raise ValueError(f"Unknown input type: {input_path}")
														
 
															+    # random destroy 10% of indices
														
 
															+    # mask = torch.rand_like(indices, dtype=torch.float) > 0.9
														
 
															+    # indices[mask] = torch.randint(0, 1000, mask.shape, device=indices.device, dtype=indices.dtype)[mask]
														
 
															+
														
 
															     # Restore
														
 
															     feature_lengths = torch.tensor([indices.shape[1]], device=model.device)
														
 
															-    decoded = model.decode(
														
 
															+    fake_audios = model.decode(
														
 
															         indices=indices[None], feature_lengths=feature_lengths, return_audios=True
														
 
															     )
														
 
															-    fake_audios = decoded.audios
														
 
															     audio_time = fake_audios.shape[-1] / model.sampling_rate
														
 
															     logger.info(