123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161 |
- import random
- import os
- import re
- import numpy as np
- import torch
- import torch.utils.data
- import librosa
- import layers
- from utils import load_wav_to_torch, load_filepaths_and_text
- from text import text_to_sequence, cmudict
- from yin import compute_yin
class TextMelLoader(torch.utils.data.Dataset):
    """Dataset that yields ``(text, mel, speaker_id, f0)`` for one utterance.

    For every entry of the file list it:
        1) reads the audio path, the transcript and the speaker label,
        2) turns the transcript into an integer tensor via
           ``text_to_sequence``,
        3) computes the mel-spectrogram and the f0 contour of the audio file.
    """

    def __init__(self, audiopaths_and_text, hparams, speaker_ids=None):
        """Read the file list and cache the hyper-parameters used per item.

        PARAMS
        ------
        audiopaths_and_text: passed unchanged to ``load_filepaths_and_text``;
            each resulting entry is unpacked as (audiopath, text, speaker).
        hparams: object exposing the attributes read below (text cleaners,
            STFT/mel settings, f0 settings, ``p_arpabet``, ``cmudict_path``).
        speaker_ids: optional mapping from ``int(speaker)`` to the index
            returned by ``get_speaker_id``. When ``None`` it is built from
            the speaker labels found in the file list.
        """
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.f0_min = hparams.f0_min
        self.f0_max = hparams.f0_max
        self.harm_thresh = hparams.harm_thresh
        self.p_arpabet = hparams.p_arpabet

        # The pronunciation dictionary is optional.
        self.cmudict = None
        if hparams.cmudict_path is not None:
            self.cmudict = cmudict.CMUDict(hparams.cmudict_path)

        self.speaker_ids = speaker_ids
        if speaker_ids is None:
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)

        # Fixed seed so the shuffled order is the same on every run. This
        # deliberately seeds the module-level ``random`` generator, exactly
        # as before; later users of ``random`` see the seeded state.
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def create_speaker_lookup_table(self, audiopaths_and_text):
        """Map every distinct speaker label to a contiguous index.

        The labels are the third field of each entry. ``np.unique`` returns
        them de-duplicated and sorted, and indices follow that order.
        NOTE(review): if the labels are strings the order is lexicographic
        ("10" sorts before "2") -- confirm this is the intended mapping.

        RETURNS
        -------
        dict: ``{int(label): index}``
        """
        unique_labels = np.unique([entry[2] for entry in audiopaths_and_text])
        return {int(label): index for index, label in enumerate(unique_labels)}

    # NOTE(review): a "sampling_rate = 22050" remark used to sit here, while
    # the default below is 16000. Inside this class the method is always
    # called with ``self.sampling_rate``, so the default is not used.
    def get_f0(self, audio, sampling_rate=16000, frame_length=1024,
               hop_length=256, f0_min=100, f0_max=300, harm_thresh=0.1):
        """Compute a zero-padded f0 contour with ``compute_yin``.

        All arguments are forwarded to ``compute_yin`` in the order given.
        Only the first of its four return values is used; it must be a list
        because it is concatenated with list padding.

        RETURNS
        -------
        np.ndarray (float32): the f0 values with
            ``int((frame_length / hop_length) / 2)`` zeros added on each side.
        """
        f0, _harmonic_rates, _argmins, _times = compute_yin(
            audio, sampling_rate, frame_length, hop_length, f0_min, f0_max,
            harm_thresh)
        pad = int((frame_length / hop_length) / 2)
        zeros = [0.0] * pad
        return np.array(zeros + f0 + zeros, dtype=np.float32)

    def get_data(self, audiopath_and_text):
        """Build one item from an (audiopath, text, speaker) entry.

        RETURNS
        -------
        tuple: ``(text, mel, speaker_id, f0)``
        """
        audiopath, text, speaker = audiopath_and_text
        text = self.get_text(text)
        mel, f0 = self.get_mel_and_f0(audiopath)
        speaker_id = self.get_speaker_id(speaker)
        return (text, mel, speaker_id, f0)

    def get_speaker_id(self, speaker_id):
        """Return the lookup-table index of a speaker as a 1-element IntTensor.

        Raises ``KeyError`` when ``int(speaker_id)`` is not in the table.
        """
        return torch.IntTensor([self.speaker_ids[int(speaker_id)]])

    def get_mel_and_f0(self, filepath):
        """Load an audio file and compute its mel-spectrogram and f0.

        RETURNS
        -------
        tuple: ``(melspec, f0)`` where ``f0`` has a leading dimension of 1
            and is cut to at most ``melspec.size(1)`` frames.

        RAISES
        ------
        ValueError: the file's sampling rate differs from the STFT's.
        """
        audio, sampling_rate = load_wav_to_torch(filepath)
        if sampling_rate != self.stft.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.stft.sampling_rate))

        # The mel-spectrogram is computed from the scaled signal with a
        # leading dimension added, which is squeezed away again afterwards.
        audio_norm = (audio / self.max_wav_value).unsqueeze(0)
        melspec = torch.squeeze(self.stft.mel_spectrogram(audio_norm), 0)

        # f0 is computed from the unscaled samples.
        f0 = self.get_f0(audio.cpu().numpy(), self.sampling_rate,
                         self.filter_length, self.hop_length, self.f0_min,
                         self.f0_max, self.harm_thresh)
        f0 = torch.from_numpy(f0)[None]
        # Never keep more f0 frames than there are mel frames.
        f0 = f0[:, :melspec.size(1)]
        return melspec, f0

    def get_text(self, text):
        """Convert a transcript to an IntTensor using ``text_to_sequence``."""
        return torch.IntTensor(
            text_to_sequence(text, self.text_cleaners, self.cmudict,
                             self.p_arpabet))

    def __getitem__(self, index):
        """Return the item built from the ``index``-th file-list entry."""
        return self.get_data(self.audiopaths_and_text[index])

    def __len__(self):
        """Return the number of file-list entries."""
        return len(self.audiopaths_and_text)
class TextMelCollate():
    """Zero-pads model inputs and targets based on number of frames per step.
    """

    def __init__(self, n_frames_per_step):
        """Store the step size that padded target lengths must be a multiple of."""
        self.n_frames_per_step = n_frames_per_step

    def __call__(self, batch):
        """Collate a training batch into padded tensors.

        PARAMS
        ------
        batch: non-empty list of ``(text, mel, speaker_id, f0)`` items where
            ``text`` is 1-D, ``mel`` is (num_mels, frames), ``speaker_id``
            holds a single value and ``f0`` is (1, frames).

        RETURNS
        -------
        tuple: ``(text_padded, input_lengths, mel_padded, gate_padded,
        output_lengths, speaker_ids, f0_padded)``. Rows are ordered by
        decreasing text length. ``gate_padded`` is 1 from each item's last
        real frame onward and 0 before it.
        """
        batch_size = len(batch)

        # Order the items by decreasing text length.
        input_lengths, ids_sorted_decreasing = torch.sort(
            torch.tensor([len(item[0]) for item in batch], dtype=torch.long),
            dim=0, descending=True)
        # Plain ints for indexing the Python list and for tensor sizes.
        order = ids_sorted_decreasing.tolist()
        max_input_len = int(input_lengths[0])

        # Round the longest target up to a multiple of n_frames_per_step.
        num_mels = batch[0][1].size(0)
        max_target_len = max(item[1].size(1) for item in batch)
        remainder = max_target_len % self.n_frames_per_step
        if remainder != 0:
            max_target_len += self.n_frames_per_step - remainder
            assert max_target_len % self.n_frames_per_step == 0

        # Every buffer starts zeroed, so untouched positions are the padding.
        text_padded = torch.zeros(batch_size, max_input_len, dtype=torch.long)
        mel_padded = torch.zeros(batch_size, num_mels, max_target_len,
                                 dtype=torch.float32)
        gate_padded = torch.zeros(batch_size, max_target_len,
                                  dtype=torch.float32)
        f0_padded = torch.zeros(batch_size, 1, max_target_len,
                                dtype=torch.float32)
        output_lengths = torch.zeros(batch_size, dtype=torch.long)
        speaker_ids = torch.zeros(batch_size, dtype=torch.long)

        for row, source in enumerate(order):
            text, mel, speaker_id, f0 = batch[source][:4]
            n_frames = mel.size(1)

            text_padded[row, :text.size(0)] = text
            mel_padded[row, :, :n_frames] = mel
            # The gate is raised on the last real frame and stays raised.
            gate_padded[row, n_frames - 1:] = 1
            output_lengths[row] = n_frames
            speaker_ids[row] = speaker_id
            f0_padded[row, :, :f0.size(1)] = f0

        model_inputs = (text_padded, input_lengths, mel_padded, gate_padded,
                        output_lengths, speaker_ids, f0_padded)
        return model_inputs
|