|
@@ -7,6 +7,7 @@ from fish_speech.models.vqgan.modules.encoders import (
|
|
|
SpeakerEncoder,
|
|
SpeakerEncoder,
|
|
|
TextEncoder,
|
|
TextEncoder,
|
|
|
)
|
|
)
|
|
|
|
|
+from fish_speech.models.vqgan.modules.flow import TransformerCouplingBlock
|
|
|
from fish_speech.models.vqgan.utils import rand_slice_segments
|
|
from fish_speech.models.vqgan.utils import rand_slice_segments
|
|
|
|
|
|
|
|
|
|
|
|
@@ -17,6 +18,7 @@ class SynthesizerTrn(nn.Module):
|
|
|
|
|
|
|
|
def __init__(
|
|
def __init__(
|
|
|
self,
|
|
self,
|
|
|
|
|
+ *,
|
|
|
in_channels,
|
|
in_channels,
|
|
|
spec_channels,
|
|
spec_channels,
|
|
|
segment_size,
|
|
segment_size,
|
|
@@ -25,8 +27,10 @@ class SynthesizerTrn(nn.Module):
|
|
|
filter_channels,
|
|
filter_channels,
|
|
|
n_heads,
|
|
n_heads,
|
|
|
n_layers,
|
|
n_layers,
|
|
|
|
|
+ n_flows,
|
|
|
n_layers_q,
|
|
n_layers_q,
|
|
|
n_layers_spk,
|
|
n_layers_spk,
|
|
|
|
|
+ n_layers_flow,
|
|
|
kernel_size,
|
|
kernel_size,
|
|
|
p_dropout,
|
|
p_dropout,
|
|
|
speaker_cond_layer,
|
|
speaker_cond_layer,
|
|
@@ -62,6 +66,17 @@ class SynthesizerTrn(nn.Module):
|
|
|
num_layers=n_layers_spk,
|
|
num_layers=n_layers_spk,
|
|
|
p_dropout=p_dropout,
|
|
p_dropout=p_dropout,
|
|
|
)
|
|
)
|
|
|
|
|
+ self.flow = TransformerCouplingBlock(
|
|
|
|
|
+ channels=inter_channels,
|
|
|
|
|
+ hidden_channels=hidden_channels,
|
|
|
|
|
+ filter_channels=filter_channels,
|
|
|
|
|
+ n_heads=n_heads,
|
|
|
|
|
+ n_layers=n_layers_flow,
|
|
|
|
|
+ kernel_size=5,
|
|
|
|
|
+ p_dropout=p_dropout,
|
|
|
|
|
+ n_flows=n_flows,
|
|
|
|
|
+ gin_channels=gin_channels,
|
|
|
|
|
+ )
|
|
|
self.enc_q = PosteriorEncoder(
|
|
self.enc_q = PosteriorEncoder(
|
|
|
spec_channels,
|
|
spec_channels,
|
|
|
inter_channels,
|
|
inter_channels,
|
|
@@ -84,8 +99,11 @@ class SynthesizerTrn(nn.Module):
|
|
|
|
|
|
|
|
def forward(self, x, x_lengths, y):
|
|
def forward(self, x, x_lengths, y):
|
|
|
g = self.enc_spk(y, x_lengths)
|
|
g = self.enc_spk(y, x_lengths)
|
|
|
- z_p, m_p, logs_p, _, x_mask = self.enc_p(x, x_lengths, g=g)
|
|
|
|
|
|
|
+
|
|
|
|
|
+ _, m_p, logs_p, _, x_mask = self.enc_p(x, x_lengths, g=g)
|
|
|
z_q, m_q, logs_q, y_mask = self.enc_q(y, x_lengths, g=g)
|
|
z_q, m_q, logs_q, y_mask = self.enc_q(y, x_lengths, g=g)
|
|
|
|
|
+ z_p = self.flow(z_q, y_mask, g=g, reverse=False)
|
|
|
|
|
+
|
|
|
z_slice, ids_slice = rand_slice_segments(z_q, x_lengths, self.segment_size)
|
|
z_slice, ids_slice = rand_slice_segments(z_q, x_lengths, self.segment_size)
|
|
|
o = self.dec(z_slice, g=g)
|
|
o = self.dec(z_slice, g=g)
|
|
|
|
|
|
|
@@ -99,17 +117,21 @@ class SynthesizerTrn(nn.Module):
|
|
|
(m_q, logs_q),
|
|
(m_q, logs_q),
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- def infer(self, x, x_lengths, y, max_len=None):
|
|
|
|
|
|
|
+ def infer(self, x, x_lengths, y, max_len=None, noise_scale=0.35):
|
|
|
g = self.enc_spk(y, x_lengths)
|
|
g = self.enc_spk(y, x_lengths)
|
|
|
- z_p, m_p, logs_p, h_text, x_mask = self.enc_p(x, x_lengths, g=g)
|
|
|
|
|
- # z_p_audio, m_p_audio, logs_p_audio = self.flow(z_p_text, m_p_text, logs_p_text, x_mask, g=g, reverse=True)
|
|
|
|
|
|
|
+ z_p, m_p, logs_p, h_text, x_mask = self.enc_p(
|
|
|
|
|
+ x, x_lengths, g=g, noise_scale=noise_scale
|
|
|
|
|
+ )
|
|
|
|
|
+ z_p = self.flow(z_p, x_mask, g=g, reverse=True)
|
|
|
|
|
|
|
|
o = self.dec((z_p * x_mask)[:, :, :max_len], g=g)
|
|
o = self.dec((z_p * x_mask)[:, :, :max_len], g=g)
|
|
|
return o
|
|
return o
|
|
|
|
|
|
|
|
- def reconstruct(self, x, x_lengths, max_len=None):
|
|
|
|
|
|
|
+ def reconstruct(self, x, x_lengths, max_len=None, noise_scale=0.35):
|
|
|
g = self.enc_spk(x, x_lengths)
|
|
g = self.enc_spk(x, x_lengths)
|
|
|
- z_q, m_q, logs_q, x_mask = self.enc_q(x, x_lengths, g=g)
|
|
|
|
|
|
|
+ z_q, m_q, logs_q, x_mask = self.enc_q(
|
|
|
|
|
+ x, x_lengths, g=g, noise_scale=noise_scale
|
|
|
|
|
+ )
|
|
|
o = self.dec((z_q * x_mask)[:, :, :max_len], g=g)
|
|
o = self.dec((z_q * x_mask)[:, :, :max_len], g=g)
|
|
|
|
|
|
|
|
return o
|
|
return o
|