import copy
import torch
from glow import Invertible1x1Conv, remove
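

# Fused WaveNet-style gated activation: the first n_channels channels of the
# summed input feed a tanh gate, the remaining channels feed a sigmoid gate,
# and the element-wise product is returned. Scripting lets the PyTorch JIT
# fuse the pointwise add/tanh/sigmoid/multiply sequence.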
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


class WN(torch.nn.Module):
    """
    WaveNet-like layer used inside the affine coupling blocks. The main
    differences from WaveNet are that the convolutions need not be causal
    and that there is no dilation-cycle reset: the dilation simply doubles
    at each layer.
    """
    def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
                 kernel_size):
        super(WN, self).__init__()
        assert(kernel_size % 2 == 1)
        assert(n_channels % 2 == 0)
        self.n_layers = n_layers
        self.n_channels = n_channels
        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.cond_layers = torch.nn.ModuleList()

        start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
        start = torch.nn.utils.weight_norm(start, name='weight')
        self.start = start

        # Initializing the last layer to zero makes the affine coupling layers
        # do nothing at first, which helps training stability.
        end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
        end.weight.data.zero_()
        end.bias.data.zero_()
        self.end = end

        for i in range(n_layers):
            dilation = 2 ** i
            padding = int((kernel_size*dilation - dilation)/2)
            in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1)
            cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
            self.cond_layers.append(cond_layer)

            # The last layer needs no residual output, only the skip path.
            if i < n_layers - 1:
                res_skip_channels = 2*n_channels
            else:
                res_skip_channels = n_channels
            res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer,
                                                        name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, forward_input):
        audio, spect = forward_input
        audio = self.start(audio)

        for i in range(self.n_layers):
            acts = fused_add_tanh_sigmoid_multiply(
                self.in_layers[i](audio),
                self.cond_layers[i](spect),
                torch.IntTensor([self.n_channels]))

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                audio = res_skip_acts[:, :self.n_channels, :] + audio
                skip_acts = res_skip_acts[:, self.n_channels:, :]
            else:
                skip_acts = res_skip_acts

            if i == 0:
                output = skip_acts
            else:
                output = skip_acts + output
        return self.end(output)
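    # Shape sketch (illustrative values): a WN built as in WaveGlow below with
    # n_in_channels=4 and n_mel_channels=640 maps audio of shape (B, 4, T')
    # and spect of shape (B, 640, T') to a (B, 8, T') tensor, which the
    # coupling step splits into a bias b and a log-scale s of 4 channels each.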


class WaveGlow(torch.nn.Module):
    def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
                 n_early_size, WN_config):
        super(WaveGlow, self).__init__()

        self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
                                                 n_mel_channels,
                                                 1024, stride=256)
        assert(n_group % 2 == 0)
        self.n_flows = n_flows
        self.n_group = n_group
        self.n_early_every = n_early_every
        self.n_early_size = n_early_size
        self.WN = torch.nn.ModuleList()
        self.convinv = torch.nn.ModuleList()

        n_half = int(n_group/2)

        # Set up layers with the right sizes based on how many dimensions
        # have already been output early; a worked example follows the loop.
        n_remaining_channels = n_group
        for k in range(n_flows):
            if k % self.n_early_every == 0 and k > 0:
                n_half = n_half - int(self.n_early_size/2)
                n_remaining_channels = n_remaining_channels - self.n_early_size
            self.convinv.append(Invertible1x1Conv(n_remaining_channels))
            self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
        self.n_remaining_channels = n_remaining_channels  # useful during inference
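        # Worked example (illustrative values): with n_group=8, n_early_every=4,
        # n_early_size=2 and n_flows=12, flows 0-3 see 8 channels (n_half=4);
        # at k=4 two channels are set aside for early output, leaving 6
        # (n_half=3); at k=8 two more leave, so flows 8-11 see
        # n_remaining_channels=4 (n_half=2).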

    def forward(self, forward_input):
        """
        forward_input[0] = spect: batch x n_mel_channels x frames
        forward_input[1] = audio: batch x time
        """
        # The training-time forward pass is disabled in this inference-only
        # version of the model; the original implementation is kept below for
        # reference.
        return None
        """
        spect, audio = forward_input

        # Upsample spectrogram to the size of the audio
        spect = self.upsample(spect)
        assert(spect.size(2) >= audio.size(1))
        if spect.size(2) > audio.size(1):
            spect = spect[:, :, :audio.size(1)]

        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

        audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
        output_audio = []
        s_list = []
        s_conv_list = []

        for k in range(self.n_flows):
            if k % self.n_early_every == 0 and k > 0:
                output_audio.append(audio[:, :self.n_early_size, :])
                audio = audio[:, self.n_early_size:, :]

            # project onto a new basis
            audio, s = self.convinv[k](audio)
            s_conv_list.append(s)

            n_half = int(audio.size(1)/2)
            if k % 2 == 0:
                audio_0 = audio[:, :n_half, :]
                audio_1 = audio[:, n_half:, :]
            else:
                audio_1 = audio[:, :n_half, :]
                audio_0 = audio[:, n_half:, :]

            output = self.WN[k]((audio_0, spect))
            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = torch.exp(s)*audio_1 + b
            s_list.append(s)

            if k % 2 == 0:
                audio = torch.cat([audio[:, :n_half, :], audio_1], 1)
            else:
                audio = torch.cat([audio_1, audio[:, n_half:, :]], 1)
        output_audio.append(audio)
        return torch.cat(output_audio, 1), s_list, s_conv_list
        """

    def infer(self, spect, sigma=1.0):
        spect = self.upsample(spect)
        # Trim conv-transpose artifacts; alternatively the spectrogram could
        # be padded to a multiple of the kernel size.
        time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
        spect = spect[:, :, :-time_cutoff]

        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

        if spect.type() == 'torch.cuda.HalfTensor':
            audio = torch.cuda.HalfTensor(spect.size(0),
                                          self.n_remaining_channels,
                                          spect.size(2)).normal_()
        else:
            audio = torch.cuda.FloatTensor(spect.size(0),
                                           self.n_remaining_channels,
                                           spect.size(2)).normal_()

        audio = sigma*audio

        for k in reversed(range(self.n_flows)):
            n_half = int(audio.size(1)/2)
            if k % 2 == 0:
                audio_0 = audio[:, :n_half, :]
                audio_1 = audio[:, n_half:, :]
            else:
                audio_1 = audio[:, :n_half, :]
                audio_0 = audio[:, n_half:, :]

            output = self.WN[k]((audio_0, spect))
            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1 - b)/torch.exp(s)

            if k % 2 == 0:
                audio = torch.cat([audio[:, :n_half, :], audio_1], 1)
            else:
                audio = torch.cat([audio_1, audio[:, n_half:, :]], 1)

            audio = self.convinv[k](audio, reverse=True)

            # Re-insert noise for the channels that were output early, matching
            # the schedule used in __init__.
            if k % self.n_early_every == 0 and k > 0:
                if spect.type() == 'torch.cuda.HalfTensor':
                    z = torch.cuda.HalfTensor(spect.size(0),
                                              self.n_early_size,
                                              spect.size(2)).normal_()
                else:
                    z = torch.cuda.FloatTensor(spect.size(0),
                                               self.n_early_size,
                                               spect.size(2)).normal_()
                audio = torch.cat((sigma*z, audio), 1)

        return audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data

    @staticmethod
    def remove_weightnorm(model):
        waveglow = model
        for WN in waveglow.WN:
            WN.start = torch.nn.utils.remove_weight_norm(WN.start)
            WN.in_layers = remove(WN.in_layers)
            WN.cond_layers = remove(WN.cond_layers)
            WN.res_skip_layers = remove(WN.res_skip_layers)
        return waveglow
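

# Minimal usage sketch: the constructor arguments and the WN_config keys follow
# the signatures defined above, but the concrete values are only illustrative,
# and a CUDA device is assumed because infer() allocates torch.cuda tensors.
if __name__ == "__main__":
    wn_config = {"n_layers": 8, "n_channels": 256, "kernel_size": 3}
    model = WaveGlow(n_mel_channels=80, n_flows=12, n_group=8,
                     n_early_every=4, n_early_size=2,
                     WN_config=wn_config).cuda().eval()
    model = WaveGlow.remove_weightnorm(model)

    # Dummy mel spectrogram: batch x n_mel_channels x frames.
    spect = torch.randn(1, 80, 100).cuda()
    with torch.no_grad():
        audio = model.infer(spect, sigma=0.6)
    print(audio.shape)  # (1, frames * 256) samples after upsampling and trimming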