@@ -10,21 +10,8 @@ from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
 from matplotlib import pyplot as plt
 from torch import nn

-from fish_speech.models.vqgan.utils import plot_mel, sequence_mask, slice_segments
-
-
-@dataclass
-class VQEncodeResult:
-    features: torch.Tensor
-    indices: torch.Tensor
-    loss: torch.Tensor
-    feature_lengths: torch.Tensor
-
-
-@dataclass
-class VQDecodeResult:
-    mels: torch.Tensor
-    audios: Optional[torch.Tensor] = None
+from fish_speech.models.vqgan.modules.wavenet import WaveNet
+from fish_speech.models.vqgan.utils import plot_mel, sequence_mask


 class VQGAN(L.LightningModule):
@@ -32,16 +19,20 @@ class VQGAN(L.LightningModule):
         self,
         optimizer: Callable,
         lr_scheduler: Callable,
-        encoder: nn.Module,
+        encoder: WaveNet,
         quantizer: nn.Module,
-        aux_decoder: nn.Module,
+        decoder: WaveNet,
         reflow: nn.Module,
         vocoder: nn.Module,
         mel_transform: nn.Module,
         weight_reflow: float = 1.0,
         weight_vq: float = 1.0,
-        weight_aux_mel: float = 1.0,
+        weight_mel: float = 1.0,
         sampling_rate: int = 44100,
+        freeze_encoder: bool = False,
+        reflow_use_shallow: bool = False,
+        reflow_inference_steps: int = 10,
+        reflow_inference_start_t: float = 0.5,
     ):
         super().__init__()

@@ -52,10 +43,10 @@ class VQGAN(L.LightningModule):
         # Modules
         self.encoder = encoder
         self.quantizer = quantizer
-        self.aux_decoder = aux_decoder
+        self.decoder = decoder
+        self.vocoder = vocoder
         self.reflow = reflow
         self.mel_transform = mel_transform
-        self.vocoder = vocoder

         # Freeze vocoder
         for param in self.vocoder.parameters():
@@ -64,13 +55,27 @@ class VQGAN(L.LightningModule):
         # Loss weights
         self.weight_reflow = weight_reflow
         self.weight_vq = weight_vq
-        self.weight_aux_mel = weight_aux_mel
+        self.weight_mel = weight_mel

+        # Other parameters
         self.spec_min = -12
         self.spec_max = 3
         self.sampling_rate = sampling_rate
+        self.reflow_use_shallow = reflow_use_shallow
+        self.reflow_inference_steps = reflow_inference_steps
+        self.reflow_inference_start_t = reflow_inference_start_t
+
+        # Disable strict loading
         self.strict_loading = False

+        # If encoder is frozen
+        if freeze_encoder:
+            for param in self.encoder.parameters():
+                param.requires_grad = False
+
+            for param in self.quantizer.parameters():
+                param.requires_grad = False
+
     def on_save_checkpoint(self, checkpoint):
         # Do not save vocoder
         state_dict = checkpoint["state_dict"]
@@ -79,7 +84,6 @@ class VQGAN(L.LightningModule):
             state_dict.pop(name)

     def configure_optimizers(self):
-        # Need two optimizers and two schedulers
         optimizer = self.optimizer_builder(self.parameters())
         lr_scheduler = self.lr_scheduler_builder(optimizer)

@@ -97,7 +101,6 @@ class VQGAN(L.LightningModule):
     def denorm_spec(self, x):
         return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min

-    # @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
     def training_step(self, batch, batch_idx):
         audios, audio_lengths = batch["audios"], batch["audio_lengths"]

@@ -110,6 +113,7 @@ class VQGAN(L.LightningModule):
         mel_lengths = audio_lengths // self.mel_transform.hop_length
         mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2])
         mel_masks_float_conv = mel_masks[:, None, :].float()
+        gt_mels = gt_mels * mel_masks_float_conv

         # Encode
         encoded_features = self.encoder(gt_mels) * mel_masks_float_conv
@@ -120,25 +124,31 @@ class VQGAN(L.LightningModule):
         vq_recon_features = vq_result.z * mel_masks_float_conv

         # VQ Decode
-        aux_mel = self.aux_decoder(vq_recon_features)
-        loss_aux_mel = F.l1_loss(
-            aux_mel * mel_masks_float_conv, gt_mels * mel_masks_float_conv
-        )
+        gen_mel = self.decoder(vq_recon_features) * mel_masks_float_conv

-        # Reflow
+        # Mel Loss
+        loss_mel = (gen_mel - gt_mels).abs().mean(
+            dim=1, keepdim=True
+        ).sum() / mel_masks_float_conv.sum()
+
+        # Reflow, given x_1_aux, we want to reconstruct x_1
         x_1 = self.norm_spec(gt_mels)
+
+        if self.reflow_use_shallow:
+            x_1_aux = self.norm_spec(gen_mel)
+        else:
+            x_1_aux = x_1
+
         t = torch.rand(gt_mels.shape[0], device=gt_mels.device)
         x_0 = torch.randn_like(x_1)

         # X_t = t * X_1 + (1 - t) * X_0
-        x_t = x_0 + t[:, None, None] * (x_1 - x_0)
+        x_t = x_0 + t[:, None, None] * (x_1_aux - x_0)

         v_pred = self.reflow(
             x_t,
             1000 * t,
-            vq_recon_features,  # .detach()
-            x_masks=mel_masks_float_conv,
-            cond_masks=mel_masks_float_conv,
+            vq_recon_features,
         )

         # Log L2 loss with
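
Note: the hunk above trains `reflow` as a rectified flow. A minimal self-contained sketch of the same objective, assuming mel tensors shaped [B, C, T] and a velocity model with the (x_t, 1000 * t, cond) call signature used in the diff (`rectified_flow_loss` is an illustrative helper, not part of this PR):

import torch
import torch.nn.functional as F

def rectified_flow_loss(model, x_1, cond):
    # Per-example time t ~ U(0, 1) and a Gaussian source sample x_0.
    t = torch.rand(x_1.shape[0], device=x_1.device)
    x_0 = torch.randn_like(x_1)
    # Straight-line interpolation: x_t = x_0 + t * (x_1 - x_0).
    x_t = x_0 + t[:, None, None] * (x_1 - x_0)
    # The regression target is the constant velocity x_1 - x_0.
    v_pred = model(x_t, 1000 * t, cond)
    return F.mse_loss(v_pred, x_1 - x_0)

With reflow_use_shallow, the hunk interpolates toward the decoder's own mel prediction (x_1_aux) while the next hunk keeps x_1 - x_0 as the regression target, so the flow learns to correct the decoder's residual error.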
@@ -146,21 +156,28 @@ class VQGAN(L.LightningModule):
         loss_reflow = weights[:, None, None] * F.mse_loss(
             x_1 - x_0, v_pred, reduction="none"
         )
-        loss_reflow = (loss_reflow * mel_masks_float_conv).mean()
+        loss_reflow = (loss_reflow * mel_masks_float_conv).mean(
+            dim=1
+        ).sum() / mel_masks_float_conv.sum()

         # Total loss
         loss = (
             self.weight_vq * loss_vq
-            + self.weight_aux_mel * loss_aux_mel
+            + self.weight_mel * loss_mel
             + self.weight_reflow * loss_reflow
         )

         # Log losses
         self.log(
-            "train/loss", loss, on_step=True, on_epoch=False, prog_bar=True, logger=True
+            "train/generator/loss",
+            loss,
+            on_step=True,
+            on_epoch=False,
+            prog_bar=True,
+            logger=True,
         )
         self.log(
-            "train/loss_vq",
+            "train/generator/loss_vq",
             loss_vq,
             on_step=True,
             on_epoch=False,
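
Note: the loss normalization above replaces a plain .mean() over the whole padded tensor with a masked mean, so zero-padded frames no longer dilute the loss. A minimal sketch of the pattern, assuming a per-element loss of shape [B, C, T] and a mask of shape [B, 1, T] (`masked_frame_mean` is an illustrative name, not part of this PR):

import torch

def masked_frame_mean(per_element_loss, mask):
    # Average over the channel axis, then normalize by the number of
    # valid frames (mask.sum()) rather than by B * T.
    per_frame = (per_element_loss * mask).mean(dim=1)  # [B, T]
    return per_frame.sum() / mask.sum()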
@@ -168,15 +185,15 @@ class VQGAN(L.LightningModule):
             logger=True,
         )
         self.log(
-            "train/loss_aux_mel",
-            loss_aux_mel,
+            "train/generator/loss_mel",
+            loss_mel,
             on_step=True,
             on_epoch=False,
             prog_bar=False,
             logger=True,
         )
         self.log(
-            "train/loss_reflow",
+            "train/generator/loss_reflow",
             loss_reflow,
             on_step=True,
             on_epoch=False,
@@ -196,22 +213,23 @@ class VQGAN(L.LightningModule):
         mel_lengths = audio_lengths // self.mel_transform.hop_length
         mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2])
         mel_masks_float_conv = mel_masks[:, None, :].float()
+        gt_mels = gt_mels * mel_masks_float_conv

         # Encode
         encoded_features = self.encoder(gt_mels) * mel_masks_float_conv

         # Quantize
-        vq_result = self.quantizer(encoded_features)
+        vq_recon_features = self.quantizer(encoded_features).z * mel_masks_float_conv

         # VQ Decode
-        aux_mels = self.aux_decoder(vq_result.z)
-        loss_aux_mel = F.l1_loss(
-            aux_mels * mel_masks_float_conv, gt_mels * mel_masks_float_conv
-        )
+        gen_aux_mels = self.decoder(vq_recon_features) * mel_masks_float_conv
+        loss_mel = (gen_aux_mels - gt_mels).abs().mean(
+            dim=1, keepdim=True
+        ).sum() / mel_masks_float_conv.sum()

         self.log(
-            "val/loss_aux_mel",
-            loss_aux_mel,
+            "val/loss_mel",
+            loss_mel,
             on_step=False,
             on_epoch=True,
             prog_bar=False,
@@ -220,37 +238,34 @@ class VQGAN(L.LightningModule):
         )

         # Reflow inference
-        t_start = 0.0
-        infer_step = 10
+        t_start = self.reflow_inference_start_t if self.reflow_use_shallow else 0.0

-        x_1 = self.norm_spec(aux_mels)
+        x_1 = self.norm_spec(gen_aux_mels)
         x_0 = torch.randn_like(x_1)
-        gen_mels = (1 - t_start) * x_0 + t_start * x_1
+        gen_reflow_mels = (1 - t_start) * x_0 + t_start * x_1

         t = torch.zeros(gt_mels.shape[0], device=gt_mels.device)
-        dt = (1.0 - t_start) / infer_step
+        dt = (1.0 - t_start) / self.reflow_inference_steps

-        for _ in range(infer_step):
-            gen_mels += (
+        for _ in range(self.reflow_inference_steps):
+            gen_reflow_mels += (
                 self.reflow(
-                    gen_mels,
+                    gen_reflow_mels,
                     1000 * t,
-                    vq_result.z,
-                    x_masks=mel_masks_float_conv,
-                    cond_masks=mel_masks_float_conv,
+                    vq_recon_features,
                 )
                 * dt
             )
             t += dt

-        gen_mels = self.denorm_spec(gen_mels)
-        loss_recon_reflow = F.l1_loss(
-            gen_mels * mel_masks_float_conv, gt_mels * mel_masks_float_conv
-        )
+        gen_reflow_mels = self.denorm_spec(gen_reflow_mels) * mel_masks_float_conv
+        loss_reflow_mel = (gen_reflow_mels - gt_mels).abs().mean(
+            dim=1, keepdim=True
+        ).sum() / mel_masks_float_conv.sum()

         self.log(
-            "val/loss_recon_reflow",
-            loss_recon_reflow,
+            "val/loss_reflow_mel",
+            loss_reflow_mel,
             on_step=False,
             on_epoch=True,
             prog_bar=False,
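
Note: validation now integrates the learned flow with fixed-step Euler updates from t_start to 1; with reflow_use_shallow, integration starts from a partially noised copy of the auxiliary mel rather than from pure noise. A minimal sketch of such a sampler under the same (x, 1000 * t, cond) model signature; unlike the hunk above, it also initializes t at t_start (`euler_sample` is an illustrative helper, not part of this PR):

import torch

@torch.no_grad()
def euler_sample(model, x_1_aux, cond, steps=10, t_start=0.0):
    # t_start == 0: start from pure noise; 0 < t_start < 1: shallow flow,
    # starting from a partially noised auxiliary prediction.
    x_0 = torch.randn_like(x_1_aux)
    x = (1 - t_start) * x_0 + t_start * x_1_aux
    t = torch.full((x.shape[0],), t_start, device=x.device)
    dt = (1.0 - t_start) / steps
    for _ in range(steps):
        x = x + model(x, 1000 * t, cond) * dt
        t = t + dt
    return x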
@@ -258,9 +273,9 @@ class VQGAN(L.LightningModule):
             sync_dist=True,
         )

-        gen_audios = self.vocoder(gen_mels)
         recon_audios = self.vocoder(gt_mels)
-        aux_audios = self.vocoder(aux_mels)
+        gen_aux_audios = self.vocoder(gen_aux_mels)
+        gen_reflow_audios = self.vocoder(gen_reflow_mels)

         # only log the first batch
         if batch_idx != 0:
@@ -268,21 +283,21 @@ class VQGAN(L.LightningModule):

         for idx, (
             gt_mel,
-            reflow_mel,
-            aux_mel,
+            gen_aux_mel,
+            gen_reflow_mel,
             audio,
-            reflow_audio,
-            aux_audio,
+            gen_aux_audio,
+            gen_reflow_audio,
             recon_audio,
             audio_len,
         ) in enumerate(
             zip(
                 gt_mels,
-                gen_mels,
-                aux_mels,
+                gen_aux_mels,
+                gen_reflow_mels,
                 audios.float(),
-                gen_audios.float(),
-                aux_audios.float(),
+                gen_aux_audios.float(),
+                gen_reflow_audios.float(),
                 recon_audios.float(),
                 audio_lengths,
             )
@@ -292,13 +307,13 @@ class VQGAN(L.LightningModule):
             image_mels = plot_mel(
                 [
                     gt_mel[:, :mel_len],
-                    reflow_mel[:, :mel_len],
-                    aux_mel[:, :mel_len],
+                    gen_aux_mel[:, :mel_len],
+                    gen_reflow_mel[:, :mel_len],
                 ],
                 [
                     "Ground-Truth",
+                    "Auxiliary",
                     "Reflow",
-                    "Aux",
                 ],
             )

@@ -313,14 +328,14 @@ class VQGAN(L.LightningModule):
                             caption="gt",
                         ),
                         wandb.Audio(
-                            reflow_audio[0, :audio_len],
+                            gen_aux_audio[0, :audio_len],
                             sample_rate=self.sampling_rate,
-                            caption="reflow",
+                            caption="aux",
                         ),
                         wandb.Audio(
-                            aux_audio[0, :audio_len],
+                            gen_reflow_audio[0, :audio_len],
                             sample_rate=self.sampling_rate,
-                            caption="aux",
+                            caption="reflow",
                         ),
                         wandb.Audio(
                             recon_audio[0, :audio_len],
@@ -344,14 +359,14 @@ class VQGAN(L.LightningModule):
                 sample_rate=self.sampling_rate,
             )
             self.logger.experiment.add_audio(
-                f"sample-{idx}/wavs/reflow",
-                reflow_audio[0, :audio_len],
+                f"sample-{idx}/wavs/gen",
+                gen_aux_audio[0, :audio_len],
                 self.global_step,
                 sample_rate=self.sampling_rate,
             )
             self.logger.experiment.add_audio(
-                f"sample-{idx}/wavs/aux",
-                aux_audio[0, :audio_len],
+                f"sample-{idx}/wavs/reflow",
+                gen_reflow_audio[0, :audio_len],
                 self.global_step,
                 sample_rate=self.sampling_rate,
             )