Lengyue 2 года назад
Родитель
Commit
78c9747b4c

+ 100 - 0
fish_speech/configs/hubert_vq.yaml

@@ -0,0 +1,100 @@
+defaults:
+  - base
+  - _self_
+
+project: hubert_vq
+
+# Lightning Trainer
+trainer:
+  accumulate_grad_batches: 2
+  gradient_clip_val: 1000.0  # For safety
+  gradient_clip_algorithm: 'norm'
+  precision: 32
+  max_steps: 1_000_000
+
+# Tokenizer Configuration
+tokenizer:
+  _target_: transformers.AutoTokenizer.from_pretrained
+  pretrained_model_name_or_path: fishaudio/speech-lm-300m
+  revision: text-pretrain-10k
+
+# Dataset Configuration
+train_dataset:
+  - _target_: fish_speech.datasets.text.TextDataset
+    repo: fishaudio/cn-hubert-25hz-vq
+    prefix: 'data/train'
+
+val_dataset:
+  _target_: fish_speech.datasets.text.TextDataset
+  repo: fishaudio/cn-hubert-25hz-vq
+  prefix: 'data/test'
+
+data:
+  _target_: fish_speech.datasets.text.TextDataModule
+  train_dataset: ${train_dataset}
+  val_dataset: ${val_dataset}
+  num_workers: 4
+  batch_size: 8
+  tokenizer: ${tokenizer}
+
+# Model Configuration
+model:
+  _target_: fish_speech.models.vqgan.VQGAN
+
+  encoder:
+    _target_: fish_speech.models.modules.VQEncoder
+    in_channels: 1024
+    channels: 192
+    num_heads: 2
+    num_feature_layers: 2
+    num_speaker_layers: 4
+    num_mixin_layers: 4
+    input_downsample: true
+    code_book_size: 2048
+    freeze_vq: false
+
+  generator:
+    _target_: fish_speech.models.modules.Generator
+    initial_channel: 192
+    resblock: "1"
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: 
+      - [1, 3, 5]
+      - [1, 3, 5]
+      - [1, 3, 5]
+    upsample_rates: [10, 8, 2, 2, 2]
+    upsample_initial_channel: 512
+    upsample_kernel_sizes: [16, 16, 8, 2, 2]
+
+  discriminator:
+    _target_: fish_speech.models.modules.EnsembleDiscriminator
+
+  mel_transform:
+    _target_: fish_speech.models.spectrogram.LogMelSpectrogram
+    sample_rate: 32000
+    n_fft: 2048
+    hop_length: 640
+    win_length: 2048
+    n_mels: 128
+
+  optimizer:
+    _target_: torch.optim.AdamW
+    _partial_: true
+    lr: 1e-4
+    betas: [0.8, 0.99]
+    eps: 1e-5
+
+  lr_scheduler:
+    _target_: torch.optim.lr_scheduler.LambdaLR
+    _partial_: true
+    lr_lambda:
+      _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
+      _partial_: true
+      num_warmup_steps: 2000
+      num_training_steps: ${trainer.max_steps}
+      final_lr_ratio: 0.05
+  
+  # Restore from old checkpoint
+  generator_ckpt: results/hubert-vq-pretrain/rcell/G_23000.pth
+  discriminator_ckpt: results/hubert-vq-pretrain/rcell/D_23000.pth
+  kmeans_ckpt: results/hubert-vq-pretrain/rcell/kmeans_23000.pth

+ 0 - 89
fish_speech/configs/whisper_vq.yaml

@@ -1,89 +0,0 @@
-paths:
-  run_dir: results/whisper-vq
-  checkpoint_dir: ${paths.run_dir}/checkpoints
-
-hydra:
-  run:
-    dir: ${paths.run_dir}
-
-trainer:
-  _target_: lightning.fabric.Fabric
-  accelerator: gpu
-  strategy: 
-    _target_: lightning.fabric.strategies.DDPStrategy
-    static_graph: true
-
-  devices: auto
-  precision: bf16-mixed
-  loggers:
-    _target_: pytorch_lightning.loggers.TensorBoardLogger
-    save_dir: ${paths.run_dir}
-    name: tensorboard
-    version: null
-
-model:
-  _target_: fish_speech.models.whisper_vq.WhisperVQ
-  model_name_or_path: "openai/whisper-medium"
-
-  # Quantization
-  codebook_dim: 32
-  codebook_size: 4096
-  codebook_decay: 0.9
-  threshold_ema_dead_code: 0
-  use_cosine_similarity: true
-  downsample: true
-
-  # Attention
-  post_attention_depth: 2
-
-schedule:
-  batch_size: 64
-  micro_batch_size: 32
-  max_steps: 10000
-  save_interval: 2000
-  gradient_accumulation_steps: "${eval: ${schedule.batch_size} // ${schedule.micro_batch_size}}"
-  clip_grad_norm: 2.0
-  log_interval: 50
-  eval_interval: 2000
-
-train_dataloader:
-  _target_: torch.utils.data.DataLoader
-  dataset:
-    _target_: fish_speech.datasets.whisper_vq.WhisperVQDataset
-    filelist: filelists/whisper-vq.train.filelist
-  batch_size: ${schedule.micro_batch_size}
-  num_workers: 16
-  prefetch_factor: 4
-  pin_memory: true
-  persistent_workers: true
-  shuffle: true
-  collate_fn:
-    _target_: fish_speech.datasets.whisper_vq.WhisperVQCollator
-
-valid_dataloader:
-  _target_: torch.utils.data.DataLoader
-  dataset:
-    _target_: fish_speech.datasets.whisper_vq.WhisperVQDataset
-    filelist: filelists/whisper-vq.test.filelist
-  batch_size: 16
-  num_workers: 8
-  prefetch_factor: 4
-  pin_memory: true
-  shuffle: false
-  collate_fn:
-    _target_: fish_speech.datasets.whisper_vq.WhisperVQCollator
-
-optimizer:
-  _target_: torch.optim.AdamW
-  lr: 3e-4
-  weight_decay: 0.1
-  betas: [0.9, 0.95]
-  eps: 1e-5
-
-scheduler:
-  _target_: torch.optim.lr_scheduler.LambdaLR
-  lr_lambda:
-    _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
-    _partial_: true
-    num_warmup_steps: 1000
-    num_training_steps: ${schedule.max_steps}

+ 8 - 35
fish_speech/datasets/whisper_vq.py → fish_speech/datasets/vqgan.py

@@ -5,53 +5,26 @@ import librosa
 import torch
 from torch.utils.data import Dataset
 from transformers import WhisperProcessor
-from whisper.audio import HOP_LENGTH, load_audio, log_mel_spectrogram, pad_or_trim
 
 
-class WhisperVQDataset(Dataset):
+class VQGANDataset(Dataset):
     def __init__(
-        self, filelist: str, model_name_or_path: str = "openai/whisper-medium"
+        self,
+        filelist: str,
+        sample_rate: int = 32000,
     ):
         super().__init__()
 
-        self.files = [
-            Path(line.strip()) for line in Path(filelist).read_text().splitlines()
-        ]
-        self.processor = WhisperProcessor.from_pretrained(model_name_or_path)
+        filelist = Path(filelist)
+        root = filelist.parent
+
+        self.files = [root / line.strip() for line in filelist.read_text().splitlines()]
 
     def __len__(self):
         return len(self.files)
 
     def __getitem__(self, idx):
         file = self.files[idx]
-        wav = load_audio(file)
-        wav_length = wav.shape[-1]
-        mel_length = wav_length // HOP_LENGTH + 1
-
-        wav = pad_or_trim(wav)
-        wav = torch.from_numpy(wav).float()
-        input_features = log_mel_spectrogram(wav)
-        mel_mask = torch.zeros(input_features.shape[1], dtype=torch.float)
-        mel_mask[:mel_length] = 1
-
-        input_ids = file.with_suffix(".whisper.txt").read_text().strip().split("\t")[0]
-        input_ids = [int(x) for x in input_ids.split(",")]
-
-        while input_ids[-1] in [
-            self.processor.tokenizer.pad_token_id,
-            self.processor.tokenizer.eos_token_id,
-        ]:
-            input_ids.pop()
-
-        input_ids.append(self.processor.tokenizer.eos_token_id)
-        input_ids = torch.tensor(input_ids, dtype=torch.long)
-
-        return {
-            "input_values": wav,
-            "input_features": input_features,
-            "input_ids": input_ids,
-            "mel_mask": mel_mask,
-        }
 
 
 @dataclass

+ 0 - 163
fish_speech/models/hubert_vq/utils.py

@@ -1,163 +0,0 @@
-import torch
-import torch.utils.data
-from librosa.filters import mel as librosa_mel_fn
-
-
-def convert_pad_shape(pad_shape):
-    l = pad_shape[::-1]
-    pad_shape = [item for sublist in l for item in sublist]
-    return pad_shape
-
-
-def sequence_mask(length, max_length=None):
-    if max_length is None:
-        max_length = length.max()
-    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
-    return x.unsqueeze(0) < length.unsqueeze(1)
-
-
-def init_weights(m, mean=0.0, std=0.01):
-    classname = m.__class__.__name__
-    if classname.find("Conv") != -1:
-        m.weight.data.normal_(mean, std)
-
-
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size * dilation - dilation) / 2)
-
-
-def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
-    """
-    PARAMS
-    ------
-    C: compression factor
-    """
-    return torch.log(torch.clamp(x, min=clip_val) * C)
-
-
-def dynamic_range_decompression_torch(x, C=1):
-    """
-    PARAMS
-    ------
-    C: compression factor used to compress
-    """
-    return torch.exp(x) / C
-
-
-def spectral_normalize_torch(magnitudes):
-    output = dynamic_range_compression_torch(magnitudes)
-    return output
-
-
-def spectral_de_normalize_torch(magnitudes):
-    output = dynamic_range_decompression_torch(magnitudes)
-    return output
-
-
-mel_basis = {}
-hann_window = {}
-
-
-def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
-    if torch.min(y) < -1.0:
-        print("min value is ", torch.min(y))
-    if torch.max(y) > 1.0:
-        print("max value is ", torch.max(y))
-
-    global hann_window
-    dtype_device = str(y.dtype) + "_" + str(y.device)
-    wnsize_dtype_device = str(win_size) + "_" + dtype_device
-    if wnsize_dtype_device not in hann_window:
-        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
-            dtype=y.dtype, device=y.device
-        )
-
-    y = torch.nn.functional.pad(
-        y.unsqueeze(1),
-        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
-        mode="reflect",
-    )
-    y = y.squeeze(1)
-    spec = torch.stft(
-        y,
-        n_fft,
-        hop_length=hop_size,
-        win_length=win_size,
-        window=hann_window[wnsize_dtype_device],
-        center=center,
-        pad_mode="reflect",
-        normalized=False,
-        onesided=True,
-        return_complex=False,
-    )
-
-    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
-    return spec
-
-
-def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
-    global mel_basis
-    dtype_device = str(spec.dtype) + "_" + str(spec.device)
-    fmax_dtype_device = str(fmax) + "_" + dtype_device
-    if fmax_dtype_device not in mel_basis:
-        mel = librosa_mel_fn(
-            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
-        )
-        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
-            dtype=spec.dtype, device=spec.device
-        )
-    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
-    spec = spectral_normalize_torch(spec)
-    return spec
-
-
-def mel_spectrogram_torch(
-    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
-):
-    if torch.min(y) < -1.0:
-        print("min value is ", torch.min(y))
-    if torch.max(y) > 1.0:
-        print("max value is ", torch.max(y))
-
-    global mel_basis, hann_window
-    dtype_device = str(y.dtype) + "_" + str(y.device)
-    fmax_dtype_device = str(fmax) + "_" + dtype_device
-    wnsize_dtype_device = str(win_size) + "_" + dtype_device
-    if fmax_dtype_device not in mel_basis:
-        mel = librosa_mel_fn(
-            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
-        )
-        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
-            dtype=y.dtype, device=y.device
-        )
-    if wnsize_dtype_device not in hann_window:
-        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
-            dtype=y.dtype, device=y.device
-        )
-
-    y = torch.nn.functional.pad(
-        y.unsqueeze(1),
-        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
-        mode="reflect",
-    )
-    y = y.squeeze(1)
-
-    spec = torch.stft(
-        y,
-        n_fft,
-        hop_length=hop_size,
-        win_length=win_size,
-        window=hann_window[wnsize_dtype_device],
-        center=center,
-        pad_mode="reflect",
-        normalized=False,
-        onesided=True,
-        return_complex=False,
-    )
-
-    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
-
-    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
-    spec = spectral_normalize_torch(spec)
-
-    return spec

+ 3 - 0
fish_speech/models/vqgan/__init__.py

@@ -0,0 +1,3 @@
+from .lit_module import VQGAN
+
+__all__ = ["VQGAN"]

+ 11 - 53
fish_speech/models/hubert_vq/lit_module.py → fish_speech/models/vqgan/lit_module.py

@@ -1,70 +1,41 @@
 from typing import Any, Callable
 
+import lightning as L
 import torch
 import torch.nn.functional as F
-from fish_vocoder.models.vocoder import VocoderModel
-from fish_vocoder.modules.losses.stft import MultiResolutionSTFTLoss
-from fish_vocoder.utils.grad_norm import grad_norm
-from fish_vocoder.utils.mask import sequence_mask
 from torch import nn
 from torch.utils.checkpoint import checkpoint as gradient_checkpointing
 
 
-class GANModel(VocoderModel):
+class VQGAN(L.LightningModule):
     def __init__(
         self,
-        sampling_rate: int,
-        n_fft: int,
-        hop_length: int,
-        win_length: int,
-        num_mels: int,
         optimizer: Callable,
         lr_scheduler: Callable,
-        mel_transforms: nn.ModuleDict,
+        encoder: nn.Module,
         generator: nn.Module,
-        discriminators: nn.ModuleDict,
-        multi_resolution_stft_loss: MultiResolutionSTFTLoss,
-        num_frames: int,
-        crop_length: int | None = None,
-        checkpointing: bool = False,
-        feature_matching: bool = False,
+        discriminator: nn.Module,
+        mel_transform: nn.Module,
+        segment_size: int = 20480,
     ):
-        super().__init__(
-            sampling_rate=sampling_rate,
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            num_mels=num_mels,
-        )
+        super().__init__()
 
         # Model parameters
         self.optimizer_builder = optimizer
         self.lr_scheduler_builder = lr_scheduler
 
-        # Spectrogram transforms
-        self.mel_transforms = mel_transforms
-
         # Generator and discriminators
         # Compile generator so that snake can save memory
         self.generator = generator
-        self.discriminators = discriminators
-
-        # Loss
-        self.multi_resolution_stft_loss = multi_resolution_stft_loss
+        self.discriminator = discriminator
+        self.mel_transform = mel_transform
 
         # Crop length for saving memory
-        self.num_frames = num_frames
-        self.crop_length = crop_length
+        self.segment_size = segment_size
 
         # Disable automatic optimization
         self.automatic_optimization = False
 
-        # Gradient checkpointing
-        self.checkpointing = checkpointing
-
-        # Feature matching
-        self.feature_matching = feature_matching
-
     def configure_optimizers(self):
         # Need two optimizers and two schedulers
         optimizer_generator = self.optimizer_builder(self.generator.parameters())
@@ -95,12 +66,7 @@ class GANModel(VocoderModel):
         )
 
     def training_generator(self, audio, audio_mask):
-        if self.training and self.checkpointing:
-            fake_audio, base_loss = gradient_checkpointing(
-                self.forward, audio, audio_mask, use_reentrant=False
-            )
-        else:
-            fake_audio, base_loss = self.forward(audio, audio_mask)
+        # fake_audio, base_loss = self.forward(audio, audio_mask)
 
         assert fake_audio.shape == audio.shape
 
@@ -308,14 +274,6 @@ class GANModel(VocoderModel):
         scheduler_g.step()
         scheduler_d.step()
 
-    def forward(self, audio, mask=None, input_spec=None):
-        if input_spec is None:
-            input_spec = self.mel_transforms.input(audio.squeeze(1))
-
-        fake_audio = self.generator(input_spec)
-
-        return fake_audio, 0
-
     def validation_step(self, batch: Any, batch_idx: int):
         audio, lengths = batch["audio"], batch["lengths"]
         audio_mask = sequence_mask(lengths)[:, None, :].to(audio.device, torch.float32)

+ 2 - 12
fish_speech/models/hubert_vq/modules.py → fish_speech/models/vqgan/modules.py

@@ -8,11 +8,7 @@ from torch.nn import functional as F
 from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
 from vector_quantize_pytorch import VectorQuantize
 
-from fish_speech.models.hubert_vq.utils import (
-    convert_pad_shape,
-    get_padding,
-    init_weights,
-)
+from fish_speech.models.vqgan.utils import convert_pad_shape, get_padding, init_weights
 
 LRELU_SLOPE = 0.1
 
@@ -603,7 +599,6 @@ class Generator(nn.Module):
         upsample_rates,
         upsample_initial_channel,
         upsample_kernel_sizes,
-        gin_channels=0,
     ):
         super(Generator, self).__init__()
         self.num_kernels = len(resblock_kernel_sizes)
@@ -638,13 +633,8 @@ class Generator(nn.Module):
         self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
         self.ups.apply(init_weights)
 
-        if gin_channels != 0:
-            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-
-    def forward(self, x, g=None):
+    def forward(self, x):
         x = self.conv_pre(x)
-        if g is not None:
-            x = x + self.cond(g)
 
         for i in range(self.num_upsamples):
             x = F.leaky_relu(x, LRELU_SLOPE)

+ 104 - 0
fish_speech/models/vqgan/spectrogram.py

@@ -0,0 +1,104 @@
+import torch
+from torch import Tensor, nn
+from torchaudio.transforms import MelScale
+
+
+class LinearSpectrogram(nn.Module):
+    def __init__(
+        self,
+        n_fft=2048,
+        win_length=2048,
+        hop_length=512,
+        center=False,
+        mode="pow2_sqrt",
+    ):
+        super().__init__()
+
+        self.n_fft = n_fft
+        self.win_length = win_length
+        self.hop_length = hop_length
+        self.center = center
+        self.mode = mode
+
+        self.register_buffer("window", torch.hann_window(win_length))
+
+    def forward(self, y: Tensor) -> Tensor:
+        if y.ndim == 3:
+            y = y.squeeze(1)
+
+        y = torch.nn.functional.pad(
+            y.unsqueeze(1),
+            (
+                (self.win_length - self.hop_length) // 2,
+                (self.win_length - self.hop_length + 1) // 2,
+            ),
+            mode="reflect",
+        ).squeeze(1)
+
+        spec = torch.stft(
+            y,
+            self.n_fft,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            window=self.window,
+            center=self.center,
+            pad_mode="reflect",
+            normalized=False,
+            onesided=True,
+            return_complex=True,
+        )
+
+        spec = torch.view_as_real(spec)
+
+        if self.mode == "pow2_sqrt":
+            spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+
+        return spec
+
+
+class LogMelSpectrogram(nn.Module):
+    def __init__(
+        self,
+        sample_rate=44100,
+        n_fft=2048,
+        win_length=2048,
+        hop_length=512,
+        n_mels=128,
+        center=False,
+        f_min=0.0,
+        f_max=None,
+    ):
+        super().__init__()
+
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.win_length = win_length
+        self.hop_length = hop_length
+        self.center = center
+        self.n_mels = n_mels
+        self.f_min = f_min
+        self.f_max = f_max or sample_rate // 2
+
+        self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center)
+        self.mel_scale = MelScale(
+            self.n_mels,
+            self.sample_rate,
+            self.f_min,
+            self.f_max,
+            self.n_fft // 2 + 1,
+            "slaney",
+            "slaney",
+        )
+
+    def compress(self, x: Tensor) -> Tensor:
+        return torch.log(torch.clamp(x, min=1e-5))
+
+    def decompress(self, x: Tensor) -> Tensor:
+        return torch.exp(x)
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.spectrogram(x)
+        x = self.mel_scale(x)
+        x = self.compress(x)
+
+        return x

+ 26 - 0
fish_speech/models/vqgan/utils.py

@@ -0,0 +1,26 @@
+import torch
+import torch.utils.data
+from librosa.filters import mel as librosa_mel_fn
+
+
+def convert_pad_shape(pad_shape):
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape
+
+
+def sequence_mask(length, max_length=None):
+    if max_length is None:
+        max_length = length.max()
+    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+    return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)

+ 2 - 2
tools/calculate_hubert_features.py → tools/vqgan/calculate_hubert_features.py

@@ -145,8 +145,8 @@ def main(folder: str, num_workers: int):
     begin_time = time.time()
     processed_files = 0
 
-    for n_batch, idx in enumerate(range(0, len(files), 64)):
-        batch = files[idx : idx + 64]
+    for n_batch, idx in enumerate(range(0, len(files), 32)):
+        batch = files[idx : idx + 32]
         batch_time = process_batch(batch)
         total_time += batch_time
         processed_files += len(batch)