
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib
import matplotlib.pyplot as plt
import IPython.display as ipd

import sys
sys.path.append('waveglow/')

from itertools import cycle

import numpy as np
import scipy as sp
from scipy.io.wavfile import write
import pandas as pd
import librosa
import torch

from hparams import create_hparams
from model import Tacotron2, load_model
from waveglow.denoiser import Denoiser
from layers import TacotronSTFT
from data_utils import TextMelLoader, TextMelCollate
from text import cmudict, text_to_sequence
from mellotron_utils import get_data_from_musicxml
# In[2]:

def panner(signal, angle):
    """Constant-power pan a mono signal into stereo; angle in degrees (negative = left, positive = right)."""
    angle = np.radians(angle)
    left = np.sqrt(2) / 2.0 * (np.cos(angle) - np.sin(angle)) * signal
    right = np.sqrt(2) / 2.0 * (np.cos(angle) + np.sin(angle)) * signal
    return np.dstack((left, right))[0]
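# Usage sketch (not part of the original notebook): constant-power panning keeps
# left^2 + right^2 equal to the input power, so loudness stays roughly constant across
# the stereo field; e.g. -45 degrees lands fully in the left channel.
# tone = np.sin(2 * np.pi * 440 * np.arange(22050) / 22050).astype(np.float32)
# stereo = panner(tone, -45)   # stereo.shape == (22050, 2), right channel ~ 0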
# In[3]:


def plot_mel_f0_alignment(mel_source, mel_outputs_postnet, f0s, alignments, figsize=(16, 16)):
    """Plot the source mel, predicted mel, source pitch contour and attention (rhythm) map."""
    fig, axes = plt.subplots(4, 1, figsize=figsize)
    axes = axes.flatten()
    axes[0].imshow(mel_source, aspect='auto', origin='lower', interpolation='none')
    axes[1].imshow(mel_outputs_postnet, aspect='auto', origin='lower', interpolation='none')
    axes[2].scatter(range(len(f0s)), f0s, alpha=0.5, color='red', marker='.', s=1)
    axes[2].set_xlim(0, len(f0s))
    axes[3].imshow(alignments, aspect='auto', origin='lower', interpolation='none')
    axes[0].set_title("Source Mel")
    axes[1].set_title("Predicted Mel")
    axes[2].set_title("Source pitch contour")
    axes[3].set_title("Source rhythm")
    plt.tight_layout()


# In[4]:
def load_mel(path):
    """Load an audio file and return its mel spectrogram as a tensor on `device`."""
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, hparams.sampling_rate))
    audio_norm = audio.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = melspec.to(device)
    return melspec
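# Usage sketch: once `hparams`, `stft` and `device` are defined below, any wav can be
# turned into a (1, n_mel_channels, T) mel tensor; the path here is only a placeholder.
# reference_mel = load_mel('data/example.wav')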
# In[5]:


hparams = create_hparams()


# In[6]:


stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length,
                    hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
                    hparams.mel_fmax)
# ## Load Models

# In[7]:


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# In[8]:


checkpoint_path = "models/mellotron_libritts.pt"
# mellotron = load_model(hparams).cuda().eval()
mellotron = load_model(hparams).to(device).eval()
mellotron.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cpu'))['state_dict'])


# In[9]:


waveglow_path = 'models/waveglow_256channels_universal_v4.pt'
waveglow = torch.load(waveglow_path, map_location=torch.device('cpu'))['model'].to(device).eval()
denoiser = Denoiser(waveglow).to(device).eval()
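# Vocoding sketch (mirrors the synthesis cells further down): a (1, n_mel_channels, T)
# mel on `device` can be turned into audio with WaveGlow and lightly denoised;
# `some_mel` is a placeholder name.
# with torch.no_grad():
#     audio = denoiser(waveglow.infer(some_mel, sigma=0.8), 0.01)[:, 0]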
# ## Setup dataloaders

# In[10]:


arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
audio_paths = 'data/examples_filelist.txt'
dataloader = TextMelLoader(audio_paths, hparams)
datacollate = TextMelCollate(1)


# ## Load data

# In[11]:


file_idx = 0
audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

# get audio path, encoded text, pitch contour and mel for gst
text_encoded = torch.LongTensor(text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].to(device)
pitch_contour = dataloader[file_idx][3][None].to(device)
mel = load_mel(audio_path)
print(audio_path, text)

# load source data to obtain rhythm using tacotron 2 as a forced aligner
# x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))


# In[12]:


# In[14]:
# ## Define Speakers Set

# In[15]:


speaker_ids = TextMelLoader("filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt", hparams).speaker_ids
speakers = pd.read_csv('filelists/libritts_speakerinfo.txt', engine='python', header=None, comment=';',
                       sep=r' *\| *', names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
speakers['MELLOTRON_ID'] = speakers['ID'].apply(lambda x: speaker_ids[x] if x in speaker_ids else -1)

# endless, shuffled iterators over female/male speakers with enough data and a Mellotron ID
female_speakers = cycle(
    speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())
male_speakers = cycle(
    speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())
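# Sketch: the cycles above never run out; each next() call yields the Mellotron ID of
# another speaker of the requested sex, which is how the synthesis loop below picks voices.
# example_speaker_id = torch.LongTensor([next(female_speakers)]).to(device)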
# In[ ]:


# # Style Transfer (Rhythm and Pitch Contour)

# In[16]:
# with torch.no_grad():
#     # get rhythm (alignment map) using tacotron 2
#     mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(x)
#     rhythm = rhythm.permute(1, 0, 2)
#
#
# # In[17]:
#
#
# speaker_id = next(female_speakers) if np.random.randint(2) else next(male_speakers)
# speaker_id = torch.LongTensor([speaker_id]).to(device)
#
# with torch.no_grad():
#     mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
#         (text_encoded, mel, speaker_id, pitch_contour, rhythm))
#
# plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
#                       mel_outputs_postnet.data.cpu().numpy()[0],
#                       pitch_contour.data.cpu().numpy()[0, 0],
#                       rhythm.data.cpu().numpy()[:, 0].T)
#
#
# # In[18]:
#
#
# with torch.no_grad():
#     audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]
# ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
# # Singing Voice from Music Score

# In[49]:


data = get_data_from_musicxml('data/haendel_hallelujah_1.musicxml', 132, convert_stress=True)
# data = get_data_from_musicxml('data/Dream_a_little_dream_of_me.musicxml', 132, convert_stress=True)
panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}


# In[ ]:


torch.LongTensor([next(female_speakers)]).to(device)


# In[18]:
n_speakers_per_part = 4
frequency_scaling = 0.4
n_seconds = 90
audio_stereo = np.zeros((hparams.sampling_rate * n_seconds, 2), dtype=np.float32)

# synthesize each choir part with several speakers, pan each voice, and mix into a stereo buffer
for i, (part, v) in enumerate(data.items()):
    rhythm = data[part]['rhythm'].to(device)
    pitch_contour = data[part]['pitch_contour'].to(device)
    text_encoded = data[part]['text_encoded'].to(device)

    for k in range(n_speakers_per_part):
        pan = np.random.randint(panning[part][0], panning[part][1])
        if any(x in part.lower() for x in ('soprano', 'alto', 'female')):
            speaker_id = torch.LongTensor([next(female_speakers)]).to(device)
        else:
            speaker_id = torch.LongTensor([next(male_speakers)]).to(device)
        print("{} MellotronID {} pan {}".format(part, speaker_id.item(), pan))

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour * frequency_scaling, rhythm))

            audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
            audio = audio.cpu().numpy()
            audio = panner(audio, pan)
            audio_stereo[:audio.shape[0]] += audio
            write("{} {}.wav".format(part, speaker_id.item()), hparams.sampling_rate, audio)
# In[19]:


audio_stereo = audio_stereo / np.max(np.abs(audio_stereo))
write("audio_stereo.wav", hparams.sampling_rate, audio_stereo)
ipd.Audio([audio_stereo[:, 0], audio_stereo[:, 1]], rate=hparams.sampling_rate)


# In[20]:


# mellotron.inference_noattention(text_encoded)


# In[21]: