2 yıl önce · 158b0c82c0
--- a/fish_speech/configs/text2semantic_sft_medium.yaml
+++ b/fish_speech/configs/text2semantic_sft_medium.yaml
@@ -2,9 +2,11 @@ defaults:
 
															   - base
														
 
															   - _self_
														
 
															-project: text2semantic_sft_medium
														
 
															+project: text2semantic_sft_medium_delay
														
 
															 max_length: 4096
														
 
															-use_delay_pattern: true
														
 
															+use_delay_pattern: false
														
 
															+ckpt_path: results/text2semantic_pretrain_medium_4_in_8_codebooks/checkpoints/step_000100000.ckpt
														
 
															+resume_weights_only: true
														
 
															 # Lightning Trainer
														
 
															 trainer:
														
@@ -14,7 +16,7 @@ trainer:
 
															   max_steps: 10_000
														
 
															   precision: bf16-true
														
 
															   limit_val_batches: 10
														
 
															-  val_check_interval: 1000
														
 
															+  val_check_interval: 500
														
 
															 # Dataset Configuration
														
 
															 tokenizer:
														
@@ -54,7 +56,7 @@ data:
 
															   train_dataset: ${train_dataset}
														
 
															   val_dataset: ${val_dataset}
														
 
															   num_workers: 4
														
 
															-  batch_size: 32
														
 
															+  batch_size: 16
														
 
															   tokenizer: ${tokenizer}
														
 
															   max_length: ${max_length}
														
@@ -83,7 +85,7 @@ model:
 
															   optimizer:
														
 
															     _target_: bitsandbytes.optim.AdamW8bit
														
 
															     _partial_: true
														
 
															-    lr: 1e-4
														
 
															+    lr: 4e-5
														
 
															     weight_decay: 0
														
 
															     betas: [0.9, 0.95]
														
 
															     eps: 1e-5
														
@@ -94,6 +96,11 @@ model:
 
															     lr_lambda:
														
 
															       _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
														
 
															       _partial_: true
														
 
															-      num_warmup_steps: 200
														
 
															+      num_warmup_steps: 100
														
 
															       num_training_steps: ${trainer.max_steps}
														
 
															       final_lr_ratio: 0
														
 
															+
														
 
															+callbacks:
														
 
															+  model_checkpoint:
														
 
															+    every_n_train_steps: 1000
														
 
															+    save_top_k: 10
														
--- a/fish_speech/configs/vqgan_pretrain.yaml
+++ b/fish_speech/configs/vqgan_pretrain.yaml
@@ -2,7 +2,7 @@ defaults:
 
															   - base
														
 
															   - _self_
														
 
															-project: vqgan_pretrain
														
 
															+project: vqgan_pretrain_lfq
														
 
															 ckpt_path: checkpoints/gpt_sovits_488k.pth
														
 
															 resume_weights_only: true
														
@@ -13,7 +13,7 @@ trainer:
 
															   strategy: ddp_find_unused_parameters_true
														
 
															   precision: 32
														
 
															   max_steps: 1_000_000
														
 
															-  val_check_interval: 5000
														
 
															+  val_check_interval: 2000
														
 
															 sample_rate: 32000
														
 
															 hop_length: 640
														
@@ -48,22 +48,23 @@ model:
 
															   _target_: fish_speech.models.vqgan.VQGAN
														
 
															   sample_rate: ${sample_rate}
														
 
															   hop_length: ${hop_length}
														
 
															-  freeze_discriminator: true
														
 
															+  freeze_discriminator: false
														
 
															-  weight_mel: 45
														
 
															+  weight_mel: 45.0
														
 
															   weight_kl: 0.1
														
 
															   weight_vq: 1.0
														
 
															+  weight_aux_mel: 1.0
														
 
															   generator:
														
 
															     _target_: fish_speech.models.vqgan.modules.models.SynthesizerTrn
														
 
															     spec_channels: 1025
														
 
															     segment_size: 32
														
 
															     inter_channels: 192
														
 
															-    hidden_channels: 192
														
 
															-    filter_channels: 768
														
 
															-    n_heads: 2
														
 
															-    n_layers: 6
														
 
															-    kernel_size: 3
														
 
															+    prior_hidden_channels: 192
														
 
															+    posterior_hidden_channels: 192
														
 
															+    prior_n_layers: 16
														
 
															+    posterior_n_layers: 16
														
 
															+    kernel_size: 5
														
 
															     p_dropout: 0.1
														
 
															     resblock: "1"
														
 
															     resblock_kernel_sizes: [3, 7, 11]
														
@@ -73,8 +74,11 @@ model:
 
															     upsample_kernel_sizes: [16, 16, 8, 2, 2]
														
 
															     gin_channels: 512
														
 
															     freeze_quantizer: false
														
 
															+    freeze_decoder: false
														
 
															+    freeze_posterior_encoder: false
														
 
															     codebook_size: 1024
														
 
															     num_codebooks: 2
														
 
															+    aux_spec_channels: ${num_mels}
														
 
															   discriminator:
														
 
															     _target_: fish_speech.models.vqgan.modules.models.EnsembledDiscriminator
														
--- a/fish_speech/models/vqgan/lit_module.py
+++ b/fish_speech/models/vqgan/lit_module.py
@@ -50,6 +50,7 @@ class VQGAN(L.LightningModule):
 
															         weight_mel: float = 45,
														
 
															         weight_kl: float = 0.1,
														
 
															         weight_vq: float = 1.0,
														
 
															+        weight_aux_mel: float = 20.0,
														
 
															     ):
														
 
															         super().__init__()
														
@@ -68,6 +69,7 @@ class VQGAN(L.LightningModule):
 
															         self.weight_mel = weight_mel
														
 
															         self.weight_kl = weight_kl
														
 
															         self.weight_vq = weight_vq
														
 
															+        self.weight_aux_mel = weight_aux_mel
														
 
															         # Other parameters
														
 
															         self.hop_length = hop_length
														
@@ -131,10 +133,14 @@ class VQGAN(L.LightningModule):
 
															             y_mask,
														
 
															             y_mask,
														
 
															             (z, z_p, m_p, logs_p, m_q, logs_q),
														
 
															-            quantized,
														
 
															+            loss_vq,
														
 
															+            decoded_aux_mels,
														
 
															         ) = self.generator(gt_specs, spec_lengths)
														
 
															         gt_mels = slice_segments(gt_mels, ids_slice, self.generator.segment_size)
														
 
															+        decoded_aux_mels = slice_segments(
														
 
															+            decoded_aux_mels, ids_slice, self.generator.segment_size
														
 
															+        )
														
 
															         spec_masks = slice_segments(spec_masks, ids_slice, self.generator.segment_size)
														
 
															         audios = slice_segments(
														
 
															             audios,
														
@@ -205,6 +211,9 @@ class VQGAN(L.LightningModule):
 
															         with torch.autocast(device_type=audios.device.type, enabled=False):
														
 
															             loss_mel = F.l1_loss(gt_mels * spec_masks, fake_mels * spec_masks)
														
 
															+            loss_aux_mel = F.l1_loss(
														
 
															+                gt_mels * spec_masks, decoded_aux_mels * spec_masks
														
 
															+            )
														
 
															         self.log(
														
 
															             "train/generator/loss_mel",
														
@@ -216,7 +225,16 @@ class VQGAN(L.LightningModule):
 
															             sync_dist=True,
														
 
															         )
														
 
															-        loss_vq = quantized.commitment_loss + quantized.codebook_loss
														
 
															+        self.log(
														
 
															+            "train/generator/loss_aux_mel",
														
 
															+            loss_aux_mel,
														
 
															+            on_step=True,
														
 
															+            on_epoch=False,
														
 
															+            prog_bar=False,
														
 
															+            logger=True,
														
 
															+            sync_dist=True,
														
 
															+        )
														
 
															+
														
 
															         self.log(
														
 
															             "train/generator/loss_vq",
														
 
															             loss_vq,
														
@@ -241,6 +259,7 @@ class VQGAN(L.LightningModule):
 
															         loss = (
														
 
															             loss_mel * self.weight_mel
														
 
															+            + loss_aux_mel * self.weight_aux_mel
														
 
															             + loss_vq * self.weight_vq
														
 
															             + loss_kl * self.weight_kl
														
 
															             + loss_adv
														
--- a/fish_speech/models/vqgan/modules/models.py
+++ b/fish_speech/models/vqgan/modules/models.py
@@ -19,54 +19,58 @@ class FeatureEncoder(nn.Module):
 
															         spec_channels,
														
 
															         out_channels,
														
 
															         hidden_channels,
														
 
															-        filter_channels,
														
 
															-        n_heads,
														
 
															         n_layers,
														
 
															         kernel_size,
														
 
															         p_dropout,
														
 
															         codebook_size=1024,
														
 
															         num_codebooks=2,
														
 
															         gin_channels=0,
														
 
															+        aux_spec_channels=None,
														
 
															     ):
														
 
															         super().__init__()
														
 
															         self.out_channels = out_channels
														
 
															         self.hidden_channels = hidden_channels
														
 
															-        self.filter_channels = filter_channels
														
 
															-        self.n_heads = n_heads
														
 
															         self.n_layers = n_layers
														
 
															         self.kernel_size = kernel_size
														
 
															         self.p_dropout = p_dropout
														
 
															+        if aux_spec_channels is None:
														
 
															+            aux_spec_channels = spec_channels
														
 
															+
														
 
															         self.spec_proj = nn.Conv1d(spec_channels, hidden_channels, 1)
														
 
															-        self.encoder = attentions.Encoder(
														
 
															-            hidden_channels,
														
 
															-            filter_channels,
														
 
															-            n_heads,
														
 
															-            n_layers // 2,
														
 
															-            kernel_size,
														
 
															-            p_dropout,
														
 
															+        self.encoder = modules.WN(
														
 
															+            hidden_channels=hidden_channels,
														
 
															+            kernel_size=kernel_size,
														
 
															+            dilation_rate=1,
														
 
															+            n_layers=n_layers // 2,
														
 
															         )
														
 
															         self.vq = DownsampleResidualVectorQuantizer(
														
 
															             input_dim=hidden_channels,
														
 
															             n_codebooks=num_codebooks,
														
 
															             codebook_size=codebook_size,
														
 
															+            codebook_dim=hidden_channels,
														
 
															             min_quantizers=num_codebooks,
														
 
															             downsample_factor=(2,),
														
 
															         )
														
 
															-        self.decoder = attentions.Encoder(
														
 
															-            hidden_channels,
														
 
															-            filter_channels,
														
 
															-            n_heads,
														
 
															-            n_layers // 2,
														
 
															-            kernel_size,
														
 
															-            p_dropout,
														
 
															-            isflow=True,
														
 
															+        self.decoder = modules.WN(
														
 
															+            hidden_channels=hidden_channels,
														
 
															+            kernel_size=kernel_size,
														
 
															+            dilation_rate=1,
														
 
															+            n_layers=n_layers // 2,
														
 
															             gin_channels=gin_channels,
														
 
															         )
														
 
															+        self.aux_decoder = modules.WN(
														
 
															+            hidden_channels=hidden_channels,
														
 
															+            kernel_size=kernel_size,
														
 
															+            dilation_rate=1,
														
 
															+            n_layers=4,
														
 
															+        )
														
 
															+        self.aux_proj = nn.Conv1d(hidden_channels, aux_spec_channels, 1)
														
 
															+
														
 
															         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
														
 
															     def forward(self, y, y_lengths, ge):
														
@@ -75,13 +79,15 @@ class FeatureEncoder(nn.Module):
 
															         )
														
 
															         y = self.spec_proj(y * y_mask) * y_mask
														
 
															-        y = self.encoder(y * y_mask, y_mask)
														
 
															-        quantized = self.vq(y)
														
 
															-        y = self.decoder(quantized.z * y_mask, y_mask, g=ge)
														
 
															+        y = self.encoder(y, y_mask) * y_mask
														
 
															+        z, indices, loss_vq = self.vq(y)
														
 
															+        y = self.decoder(z, y_mask, g=ge) * y_mask
														
 
															+        decoded_aux_mel = self.aux_decoder(y, y_mask)
														
 
															+        decoded_aux_mel = self.aux_proj(decoded_aux_mel) * y_mask
														
 
															         stats = self.proj(y) * y_mask
														
 
															         m, logs = torch.split(stats, self.out_channels, dim=1)
														
 
															-        return y, m, logs, y_mask, quantized
														
 
															+        return y, m, logs, y_mask, loss_vq, decoded_aux_mel
														
 
															 class ResidualCouplingBlock(nn.Module):
														
@@ -436,10 +442,10 @@ class SynthesizerTrn(nn.Module):
 
															         spec_channels,
														
 
															         segment_size,
														
 
															         inter_channels,
														
 
															-        hidden_channels,
														
 
															-        filter_channels,
														
 
															-        n_heads,
														
 
															-        n_layers,
														
 
															+        prior_hidden_channels,
														
 
															+        prior_n_layers,
														
 
															+        posterior_hidden_channels,
														
 
															+        posterior_n_layers,
														
 
															         kernel_size,
														
 
															         p_dropout,
														
 
															         resblock,
														
@@ -452,14 +458,17 @@ class SynthesizerTrn(nn.Module):
 
															         freeze_quantizer=False,
														
 
															         codebook_size=1024,
														
 
															         num_codebooks=2,
														
 
															+        freeze_decoder=False,
														
 
															+        freeze_posterior_encoder=False,
														
 
															+        aux_spec_channels=None,
														
 
															     ):
														
 
															         super().__init__()
														
 
															         self.spec_channels = spec_channels
														
 
															         self.inter_channels = inter_channels
														
 
															-        self.hidden_channels = hidden_channels
														
 
															-        self.filter_channels = filter_channels
														
 
															-        self.n_heads = n_heads
														
 
															-        self.n_layers = n_layers
														
 
															+        self.prior_hidden_channels = prior_hidden_channels
														
 
															+        self.prior_n_layers = prior_n_layers
														
 
															+        self.posterior_hidden_channels = posterior_hidden_channels
														
 
															+        self.posterior_n_layers = posterior_n_layers
														
 
															         self.kernel_size = kernel_size
														
 
															         self.p_dropout = p_dropout
														
 
															         self.resblock = resblock
														
@@ -472,40 +481,44 @@ class SynthesizerTrn(nn.Module):
 
															         self.gin_channels = gin_channels
														
 
															         self.enc_p = FeatureEncoder(
														
 
															-            spec_channels,
														
 
															-            inter_channels,
														
 
															-            hidden_channels,
														
 
															-            filter_channels,
														
 
															-            n_heads,
														
 
															-            n_layers,
														
 
															-            kernel_size,
														
 
															-            p_dropout,
														
 
															+            spec_channels=spec_channels,
														
 
															+            out_channels=inter_channels,
														
 
															+            hidden_channels=prior_hidden_channels,
														
 
															+            n_layers=prior_n_layers,
														
 
															+            kernel_size=kernel_size,
														
 
															+            p_dropout=p_dropout,
														
 
															             codebook_size=codebook_size,
														
 
															             num_codebooks=num_codebooks,
														
 
															             gin_channels=gin_channels,
														
 
															+            aux_spec_channels=aux_spec_channels,
														
 
															         )
														
 
															         self.dec = Generator(
														
 
															-            inter_channels,
														
 
															-            resblock,
														
 
															-            resblock_kernel_sizes,
														
 
															-            resblock_dilation_sizes,
														
 
															-            upsample_rates,
														
 
															-            upsample_initial_channel,
														
 
															-            upsample_kernel_sizes,
														
 
															+            initial_channel=inter_channels,
														
 
															+            resblock=resblock,
														
 
															+            resblock_kernel_sizes=resblock_kernel_sizes,
														
 
															+            resblock_dilation_sizes=resblock_dilation_sizes,
														
 
															+            upsample_rates=upsample_rates,
														
 
															+            upsample_initial_channel=upsample_initial_channel,
														
 
															+            upsample_kernel_sizes=upsample_kernel_sizes,
														
 
															             gin_channels=gin_channels,
														
 
															         )
														
 
															         self.enc_q = PosteriorEncoder(
														
 
															-            spec_channels,
														
 
															+            in_channels=spec_channels,
														
 
															+            out_channels=inter_channels,
														
 
															+            hidden_channels=posterior_hidden_channels,
														
 
															+            kernel_size=5,
														
 
															+            dilation_rate=1,
														
 
															+            n_layers=posterior_n_layers,
														
 
															+            gin_channels=gin_channels,
														
 
															+        )
														
 
															+        self.flow = ResidualCouplingBlock(
														
 
															             inter_channels,
														
 
															-            hidden_channels,
														
 
															+            posterior_hidden_channels,
														
 
															             5,
														
 
															             1,
														
 
															-            16,
														
 
															+            4,
														
 
															             gin_channels=gin_channels,
														
 
															         )
														
 
															-        self.flow = ResidualCouplingBlock(
														
 
															-            inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
														
 
															-        )
														
 
															         self.ref_enc = modules.MelStyleEncoder(
														
 
															             spec_channels, style_vector_dim=gin_channels
														
@@ -516,13 +529,21 @@ class SynthesizerTrn(nn.Module):
 
															             self.enc_p.encoder.requires_grad_(False)
														
 
															             self.enc_p.vq.requires_grad_(False)
														
 
															+        if freeze_decoder:
														
 
															+            self.dec.requires_grad_(False)
														
 
															+
														
 
															+        if freeze_posterior_encoder:
														
 
															+            self.enc_q.requires_grad_(False)
														
 
															+
														
 
															     def forward(self, y, y_lengths):
														
 
															         y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(
														
 
															             y.dtype
														
 
															         )
														
 
															         ge = self.ref_enc(y * y_mask, y_mask)
														
 
															-        x, m_p, logs_p, y_mask, quantized = self.enc_p(y, y_lengths, ge)
														
 
															+        x, m_p, logs_p, y_mask, quantized, decoded_aux_mel = self.enc_p(
														
 
															+            y, y_lengths, ge
														
 
															+        )
														
 
															         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge)
														
 
															         z_p = self.flow(z, y_mask, g=ge)
														
@@ -538,6 +559,7 @@ class SynthesizerTrn(nn.Module):
 
															             y_mask,
														
 
															             (z, z_p, m_p, logs_p, m_q, logs_q),
														
 
															             quantized,
														
 
															+            decoded_aux_mel,
														
 
															         )
														
 
															     def infer(self, y, y_lengths, noise_scale=0.5):
														
@@ -545,7 +567,7 @@ class SynthesizerTrn(nn.Module):
 
															             y.dtype
														
 
															         )
														
 
															         ge = self.ref_enc(y * y_mask, y_mask)
														
 
															-        x, m_p, logs_p, y_mask, quantized = self.enc_p(y, y_lengths, ge)
														
 
															+        x, m_p, logs_p, y_mask, _, _ = self.enc_p(y, y_lengths, ge)
														
 
															         z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
														
 
															         z = self.flow(z_p, y_mask, g=ge, reverse=True)
														
@@ -600,10 +622,10 @@ if __name__ == "__main__":
 
															         spec_channels=1025,
														
 
															         segment_size=20480,
														
 
															         inter_channels=192,
														
 
															-        hidden_channels=192,
														
 
															-        filter_channels=768,
														
 
															-        n_heads=2,
														
 
															-        n_layers=6,
														
 
															+        prior_hidden_channels=384,
														
 
															+        posterior_hidden_channels=192,
														
 
															+        prior_n_layers=16,
														
 
															+        posterior_n_layers=16,
														
 
															         kernel_size=3,
														
 
															         p_dropout=0.1,
														
 
															         resblock="1",
														
@@ -617,18 +639,21 @@ if __name__ == "__main__":
 
															     )
														
 
															     state_dict_g = torch.load("checkpoints/gpt_sovits_g_488k.pth", map_location="cpu")
														
 
															-    # state_dict_d = torch.load("checkpoints/gpt_sovits_d_488k.pth", map_location="cpu")
														
 
															-    # keys = set(model.state_dict().keys())
														
 
															-    # state_dict_g = {k.replace("encoder2.", "decoder."): v for k, v in state_dict_g.items() if k in keys}
														
 
															+    state_dict_d = torch.load("checkpoints/gpt_sovits_d_488k.pth", map_location="cpu")
														
 
															+    keys = set(model.state_dict().keys())
														
 
															+    state_dict_g = {
														
 
															+        k: v for k, v in state_dict_g.items() if k in keys and "enc_p" not in k
														
 
															+    }
														
 
															-    # new_state = {}
														
 
															-    # for k, v in state_dict_g.items():
														
 
															-    #     new_state["generator." + k] = v
														
 
															+    new_state = {}
														
 
															+    for k, v in state_dict_g.items():
														
 
															+        new_state["generator." + k] = v
														
 
															-    # for k, v in state_dict_d.items():
														
 
															-    #     new_state["discriminator." + k] = v
														
 
															+    for k, v in state_dict_d.items():
														
 
															+        new_state["discriminator." + k] = v
														
 
															-    # torch.save(new_state, "checkpoints/gpt_sovits_488k.pth")
														
 
															+    torch.save(new_state, "checkpoints/gpt_sovits_488k.pth")
														
 
															+    exit()
														
 
															     # print(EnsembledDiscriminator().load_state_dict(state_dict_d, strict=False))
														
 
															     print(model.load_state_dict(state_dict_g, strict=False))
														
--- a/fish_speech/models/vqgan/modules/rvq.py
+++ b/fish_speech/models/vqgan/modules/rvq.py
@@ -7,253 +7,10 @@ import torch.nn as nn
 
															 import torch.nn.functional as F
														
 
															 from einops import rearrange
														
 
															 from torch.nn.utils import weight_norm
														
 
															+from vector_quantize_pytorch import LFQ, ResidualVQ
														
 
															-class VectorQuantize(nn.Module):
														
 
															-    """
														
 
															-    Implementation of VQ similar to Karpathy's repo:
														
 
															-    https://github.com/karpathy/deep-vector-quantization
														
 
															-    Additionally uses following tricks from Improved VQGAN
														
 
															-    (https://arxiv.org/pdf/2110.04627.pdf):
														
 
															-        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
														
 
															-            for improved codebook usage
														
 
															-        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
														
 
															-            improves training stability
														
 
															-    """
														
 
															-
														
 
															-    def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int):
														
 
															-        super().__init__()
														
 
															-        self.codebook_size = codebook_size
														
 
															-        self.codebook_dim = codebook_dim
														
 
															-
														
 
															-        self.in_proj = weight_norm(nn.Conv1d(input_dim, codebook_dim, kernel_size=1))
														
 
															-        self.out_proj = weight_norm(nn.Conv1d(codebook_dim, input_dim, kernel_size=1))
														
 
															-        self.codebook = nn.Embedding(codebook_size, codebook_dim)
														
 
															-
														
 
															-    def forward(self, z):
														
 
															-        """Quantized the input tensor using a fixed codebook and returns
														
 
															-        the corresponding codebook vectors
														
 
															-
														
 
															-        Parameters
														
 
															-        ----------
														
 
															-        z : Tensor[B x D x T]
														
 
															-
														
 
															-        Returns
														
 
															-        -------
														
 
															-        Tensor[B x D x T]
														
 
															-            Quantized continuous representation of input
														
 
															-        Tensor[1]
														
 
															-            Commitment loss to train encoder to predict vectors closer to codebook
														
 
															-            entries
														
 
															-        Tensor[1]
														
 
															-            Codebook loss to update the codebook
														
 
															-        Tensor[B x T]
														
 
															-            Codebook indices (quantized discrete representation of input)
														
 
															-        Tensor[B x D x T]
														
 
															-            Projected latents (continuous representation of input before quantization)
														
 
															-        """
														
 
															-
														
 
															-        # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
														
 
															-        z_e = self.in_proj(z)  # z_e : (B x D x T)
														
 
															-        z_q, indices = self.decode_latents(z_e)
														
 
															-
														
 
															-        commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
														
 
															-        codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])
														
 
															-
														
 
															-        z_q = (
														
 
															-            z_e + (z_q - z_e).detach()
														
 
															-        )  # noop in forward pass, straight-through gradient estimator in backward pass
														
 
															-
														
 
															-        z_q = self.out_proj(z_q)
														
 
															-
														
 
															-        return z_q, commitment_loss, codebook_loss, indices, z_e
														
 
															-
														
 
															-    def embed_code(self, embed_id):
														
 
															-        return F.embedding(embed_id, self.codebook.weight)
														
 
															-
														
 
															-    def decode_code(self, embed_id):
														
 
															-        return self.embed_code(embed_id).transpose(1, 2)
														
 
															-
														
 
															-    def decode_latents(self, latents):
														
 
															-        encodings = rearrange(latents, "b d t -> (b t) d")
														
 
															-        codebook = self.codebook.weight  # codebook: (N x D)
														
 
															-
														
 
															-        # L2 normalize encodings and codebook (ViT-VQGAN)
														
 
															-        encodings = F.normalize(encodings)
														
 
															-        codebook = F.normalize(codebook)
														
 
															-
														
 
															-        # Compute euclidean distance with codebook
														
 
															-        dist = (
														
 
															-            encodings.pow(2).sum(1, keepdim=True)
														
 
															-            - 2 * encodings @ codebook.t()
														
 
															-            + codebook.pow(2).sum(1, keepdim=True).t()
														
 
															-        )
														
 
															-        indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
														
 
															-        z_q = self.decode_code(indices)
														
 
															-        return z_q, indices
														
 
															-
														
 
															-
														
 
															-@dataclass
														
 
															-class VQResult:
														
 
															-    z: torch.Tensor
														
 
															-    codes: torch.Tensor
														
 
															-    latents: torch.Tensor
														
 
															-    commitment_loss: torch.Tensor
														
 
															-    codebook_loss: torch.Tensor
														
 
															-
														
 
															-
														
 
															-class ResidualVectorQuantize(nn.Module):
														
 
															-    """
														
 
															-    Introduced in SoundStream: An end2end neural audio codec
														
 
															-    https://arxiv.org/abs/2107.03312
														
 
															-    """
														
 
															-
														
 
															-    def __init__(
														
 
															-        self,
														
 
															-        input_dim: int = 512,
														
 
															-        n_codebooks: int = 9,
														
 
															-        codebook_size: int = 1024,
														
 
															-        codebook_dim: Union[int, list] = 8,
														
 
															-        quantizer_dropout: float = 0.0,
														
 
															-        min_quantizers: int = 4,
														
 
															-    ):
														
 
															-        super().__init__()
														
 
															-        if isinstance(codebook_dim, int):
														
 
															-            codebook_dim = [codebook_dim for _ in range(n_codebooks)]
														
 
															-
														
 
															-        self.n_codebooks = n_codebooks
														
 
															-        self.codebook_dim = codebook_dim
														
 
															-        self.codebook_size = codebook_size
														
 
															-
														
 
															-        self.quantizers = nn.ModuleList(
														
 
															-            [
														
 
															-                VectorQuantize(input_dim, codebook_size, codebook_dim[i])
														
 
															-                for i in range(n_codebooks)
														
 
															-            ]
														
 
															-        )
														
 
															-        self.quantizer_dropout = quantizer_dropout
														
 
															-        self.min_quantizers = min_quantizers
														
 
															-
														
 
															-    def forward(self, z, n_quantizers: int = None) -> VQResult:
														
 
															-        """Quantized the input tensor using a fixed set of `n` codebooks and returns
														
 
															-        the corresponding codebook vectors
														
 
															-        Parameters
														
 
															-        ----------
														
 
															-        z : Tensor[B x D x T]
														
 
															-        n_quantizers : int, optional
														
 
															-            No. of quantizers to use
														
 
															-            (n_quantizers < self.n_codebooks ex: for quantizer dropout)
														
 
															-            Note: if `self.quantizer_dropout` is True, this argument is ignored
														
 
															-                when in training mode, and a random number of quantizers is used.
														
 
															-        Returns
														
 
															-        -------
														
 
															-        """
														
 
															-        z_q = 0
														
 
															-        residual = z
														
 
															-        commitment_loss = 0
														
 
															-        codebook_loss = 0
														
 
															-
														
 
															-        codebook_indices = []
														
 
															-        latents = []
														
 
															-
														
 
															-        if n_quantizers is None:
														
 
															-            n_quantizers = self.n_codebooks
														
 
															-
														
 
															-        if self.training:
														
 
															-            n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
														
 
															-            dropout = torch.randint(
														
 
															-                self.min_quantizers, self.n_codebooks + 1, (z.shape[0],)
														
 
															-            )
														
 
															-            n_dropout = int(z.shape[0] * self.quantizer_dropout)
														
 
															-            n_quantizers[:n_dropout] = dropout[:n_dropout]
														
 
															-            n_quantizers = n_quantizers.to(z.device)
														
 
															-
														
 
															-        for i, quantizer in enumerate(self.quantizers):
														
 
															-            if self.training is False and i >= n_quantizers:
														
 
															-                break
														
 
															-
														
 
															-            z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
														
 
															-                residual
														
 
															-            )
														
 
															-
														
 
															-            # Create mask to apply quantizer dropout
														
 
															-            mask = (
														
 
															-                torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
														
 
															-            )
														
 
															-            z_q = z_q + z_q_i * mask[:, None, None]
														
 
															-            residual = residual - z_q_i
														
 
															-
														
 
															-            # Sum losses
														
 
															-            commitment_loss += (commitment_loss_i * mask).mean()
														
 
															-            codebook_loss += (codebook_loss_i * mask).mean()
														
 
															-
														
 
															-            codebook_indices.append(indices_i)
														
 
															-            latents.append(z_e_i)
														
 
															-
														
 
															-        codes = torch.stack(codebook_indices, dim=1)
														
 
															-        latents = torch.cat(latents, dim=1)
														
 
															-
														
 
															-        return VQResult(z_q, codes, latents, commitment_loss, codebook_loss)
														
 
															-
														
 
															-    def from_codes(self, codes: torch.Tensor):
														
 
															-        """Given the quantized codes, reconstruct the continuous representation
														
 
															-        Parameters
														
 
															-        ----------
														
 
															-        codes : Tensor[B x N x T]
														
 
															-            Quantized discrete representation of input
														
 
															-        Returns
														
 
															-        -------
														
 
															-        Tensor[B x D x T]
														
 
															-            Quantized continuous representation of input
														
 
															-        """
														
 
															-        z_q = 0.0
														
 
															-        z_p = []
														
 
															-        n_codebooks = codes.shape[1]
														
 
															-        for i in range(n_codebooks):
														
 
															-            z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
														
 
															-            z_p.append(z_p_i)
														
 
															-
														
 
															-            z_q_i = self.quantizers[i].out_proj(z_p_i)
														
 
															-            z_q = z_q + z_q_i
														
 
															-        return z_q, torch.cat(z_p, dim=1), codes
														
 
															-
														
 
															-    def from_latents(self, latents: torch.Tensor):
														
 
															-        """Given the unquantized latents, reconstruct the
														
 
															-        continuous representation after quantization.
														
 
															-
														
 
															-        Parameters
														
 
															-        ----------
														
 
															-        latents : Tensor[B x N x T]
														
 
															-            Continuous representation of input after projection
														
 
															-
														
 
															-        Returns
														
 
															-        -------
														
 
															-        Tensor[B x D x T]
														
 
															-            Quantized representation of full-projected space
														
 
															-        Tensor[B x D x T]
														
 
															-            Quantized representation of latent space
														
 
															-        """
														
 
															-        z_q = 0
														
 
															-        z_p = []
														
 
															-        codes = []
														
 
															-        dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])
														
 
															-
														
 
															-        n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
														
 
															-            0
														
 
															-        ]
														
 
															-        for i in range(n_codebooks):
														
 
															-            j, k = dims[i], dims[i + 1]
														
 
															-            z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
														
 
															-            z_p.append(z_p_i)
														
 
															-            codes.append(codes_i)
														
 
															-
														
 
															-            z_q_i = self.quantizers[i].out_proj(z_p_i)
														
 
															-            z_q = z_q + z_q_i
														
 
															-
														
 
															-        return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)
														
 
															-
														
 
															-
														
 
															-class DownsampleResidualVectorQuantizer(ResidualVectorQuantize):
														
 
															+class DownsampleResidualVectorQuantizer(nn.Module):
														
 
															     """
														
 
															     Downsampled version of ResidualVectorQuantize
														
 
															     """
														
@@ -269,18 +26,26 @@ class DownsampleResidualVectorQuantizer(ResidualVectorQuantize):
 
															         downsample_factor: tuple[int] = (2, 2),
														
 
															         downsample_dims: tuple[int] | None = None,
														
 
															     ):
														
 
															+        super().__init__()
														
 
															         if downsample_dims is None:
														
 
															             downsample_dims = [input_dim for _ in range(len(downsample_factor))]
														
 
															         all_dims = (input_dim,) + tuple(downsample_dims)
														
 
															-        super().__init__(
														
 
															-            all_dims[-1],
														
 
															-            n_codebooks,
														
 
															-            codebook_size,
														
 
															-            codebook_dim,
														
 
															-            quantizer_dropout,
														
 
															-            min_quantizers,
														
 
															+        # self.vq = ResidualVQ(
														
 
															+        #     dim=all_dims[-1],
														
 
															+        #     num_quantizers=n_codebooks,
														
 
															+        #     codebook_dim=codebook_dim,
														
 
															+        #     threshold_ema_dead_code=2,
														
 
															+        #     codebook_size=codebook_size,
														
 
															+        #     kmeans_init=False,
														
 
															+        # )
														
 
															+
														
 
															+        self.vq = LFQ(
														
 
															+            dim=all_dims[-1],
														
 
															+            codebook_size=2**14,
														
 
															+            entropy_loss_weight=0.1,
														
 
															+            diversity_gamma=1.0,
														
 
															         )
														
 
															         self.downsample_factor = downsample_factor
														
@@ -310,33 +75,34 @@ class DownsampleResidualVectorQuantizer(ResidualVectorQuantize):
 
															             ]
														
 
															         )
														
 
															-    def forward(self, z, n_quantizers: int = None) -> VQResult:
														
 
															+    def forward(self, z):
														
 
															         original_shape = z.shape
														
 
															         z = self.downsample(z)
														
 
															-        result = super().forward(z, n_quantizers)
														
 
															-        result.z = self.upsample(result.z)
														
 
															+        z, indices, loss = self.vq(z.mT)
														
 
															+        z = self.upsample(z.mT)
														
 
															+        loss = loss.mean()
														
 
															         # Pad or crop z to match original shape
														
 
															-        diff = original_shape[-1] - result.z.shape[-1]
														
 
															+        diff = original_shape[-1] - z.shape[-1]
														
 
															         left = diff // 2
														
 
															         right = diff - left
														
 
															         if diff > 0:
														
 
															-            result.z = F.pad(result.z, (left, right))
														
 
															+            z = F.pad(z, (left, right))
														
 
															         elif diff < 0:
														
 
															-            result.z = result.z[..., left:-right]
														
 
															+            z = z[..., left:-right]
														
 
															-        return result
														
 
															+        return z, indices, loss
														
 
															-    def from_codes(self, codes: torch.Tensor):
														
 
															-        z_q, z_p, codes = super().from_codes(codes)
														
 
															-        z_q = self.upsample(z_q)
														
 
															-        return z_q, z_p, codes
														
 
															+    # def from_codes(self, codes: torch.Tensor):
														
 
															+    #     z_q, z_p, codes = super().from_codes(codes)
														
 
															+    #     z_q = self.upsample(z_q)
														
 
															+    #     return z_q, z_p, codes
														
 
															-    def from_latents(self, latents: torch.Tensor):
														
 
															-        z_q, z_p, codes = super().from_latents(latents)
														
 
															-        z_q = self.upsample(z_q)
														
 
															-        return z_q, z_p, codes
														
 
															+    # def from_latents(self, latents: torch.Tensor):
														
 
															+    #     z_q, z_p, codes = super().from_latents(latents)
														
 
															+    #     z_q = self.upsample(z_q)
														
 
															+    #     return z_q, z_p, codes
														
 
															 if __name__ == "__main__":