layers.py

import torch
from librosa.filters import mel as librosa_mel_fn

from audio_processing import dynamic_range_compression, dynamic_range_decompression
from stft import STFT


class LinearNorm(torch.nn.Module):
    """Linear layer with Xavier-uniform weight initialization."""

    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)


class ConvNorm(torch.nn.Module):
    """1D convolution with Xavier-uniform weight initialization."""

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super(ConvNorm, self).__init__()
        if padding is None:
            # Default to 'same' padding, which requires an odd kernel size.
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                    kernel_size=kernel_size, stride=stride,
                                    padding=padding, dilation=dilation,
                                    bias=bias)

        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal


class ConvNorm2D(torch.nn.Module):
    """2D convolution with Xavier-uniform weight initialization."""

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super(ConvNorm2D, self).__init__()
        self.conv = torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                    kernel_size=kernel_size, stride=stride,
                                    padding=padding, dilation=dilation,
                                    groups=1, bias=bias)

        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal


class TacotronSTFT(torch.nn.Module):
    """Wraps an STFT and a mel filterbank to turn waveforms into mel spectrograms."""

    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
                 mel_fmax=8000.0):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # Keyword arguments keep this call compatible with librosa >= 0.10,
        # where librosa.filters.mel no longer accepts positional arguments.
        mel_basis = librosa_mel_fn(
            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels,
            fmin=mel_fmin, fmax=mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y, ref_level_db=20, magnitude_power=1.5):
        """Computes mel-spectrograms from a batch of waves.

        PARAMS
        ------
        y: torch.FloatTensor with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        # Note: ref_level_db and magnitude_power are currently unused in this method.
        assert torch.min(y.data) >= -1
        assert torch.max(y.data) <= 1

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output)
        return mel_output
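

# Usage sketch (not part of the original module): build a TacotronSTFT with its
# default hyperparameters and convert a batch of dummy waveforms into mel
# spectrograms. This assumes the repository's stft.STFT and audio_processing
# helpers are importable, as in the imports above.
if __name__ == '__main__':
    taco_stft = TacotronSTFT(filter_length=1024, hop_length=256, win_length=1024,
                             n_mel_channels=80, sampling_rate=22050)

    # Two seconds of silence for two utterances, shaped (B, T), values in [-1, 1].
    waveforms = torch.zeros(2, 2 * 22050)

    mels = taco_stft.mel_spectrogram(waveforms)
    print(mels.shape)  # (2, 80, n_frames), with n_frames roughly T / hop_length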