%!s(int64=2) %!d(string=hai) anos · 38aa5ea106
--- a/fish_speech/configs/vq_diffusion.yaml
+++ b/fish_speech/configs/vq_diffusion.yaml
@@ -12,7 +12,7 @@ trainer:
 
															   gradient_clip_val: 1.0
														
 
															   gradient_clip_algorithm: 'norm'
														
 
															   precision: 16-mixed
														
 
															-  max_steps: 1_000_000
														
 
															+  max_steps: 300_000
														
 
															   val_check_interval: 5000
														
 
															 sample_rate: 24000
														
@@ -84,7 +84,7 @@ model:
 
															     _target_: fish_speech.models.vq_diffusion.wavenet.WaveNet
														
 
															     d_encoder: 128
														
 
															     mel_channels: 100
														
 
															-    residual_channels: 384
														
 
															+    residual_channels: 512
														
 
															     residual_layers: 20
														
 
															   vocoder:
														
@@ -104,7 +104,7 @@ model:
 
															     _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
														
 
															     sample_rate: 32000
														
 
															     n_fft: 2048
														
 
															-    hop_length: 640
														
 
															+    hop_length: 1280
														
 
															     win_length: 2048
														
 
															     n_mels: 128
														
--- a/fish_speech/models/vq_diffusion/bigvgan/bigvgan.py
+++ b/fish_speech/models/vq_diffusion/bigvgan/bigvgan.py
@@ -366,7 +366,9 @@ class BigVGAN(nn.Module):
 
															     @torch.no_grad()
														
 
															     def decode(self, mel):
														
 
															+        mel = F.pad(mel, (0, 10), "reflect")
														
 
															         y = self.model(mel)
														
 
															+        y = y[:, :, : -self.h.hop_size * 10]
														
 
															         return y
														
 
															     @torch.no_grad()