# smart_pad.py — pad audio files whose trailing silence is too short.
  1. import random
  2. from multiprocessing import Pool
  3. from pathlib import Path
  4. import click
  5. import librosa
  6. import torch.nn.functional as F
  7. import torchaudio
  8. from tqdm import tqdm
  9. from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files
  10. threshold = 10 ** (-50 / 20.0)
  11. def process(file):
  12. waveform, sample_rate = torchaudio.load(str(file), backend="sox")
  13. loudness = librosa.feature.rms(
  14. y=waveform.numpy().squeeze(), frame_length=2048, hop_length=512, center=True
  15. )[0]
  16. for i in range(len(loudness) - 1, 0, -1):
  17. if loudness[i] > threshold:
  18. break
  19. silent_time = (len(loudness) - i) * 512 / sample_rate
  20. if silent_time <= 0.3:
  21. random_time = random.uniform(0.3, 0.7)
  22. waveform = F.pad(
  23. waveform, (0, int(random_time * sample_rate)), mode="constant", value=0
  24. )
  25. torchaudio.save(uri=str(file), src=waveform, sample_rate=sample_rate)
  26. @click.command()
  27. @click.argument("source", type=Path)
  28. @click.option("--num-workers", type=int, default=12)
  29. def main(source, num_workers):
  30. files = list(list_files(source, AUDIO_EXTENSIONS, recursive=True))
  31. with Pool(num_workers) as p:
  32. list(tqdm(p.imap_unordered(process, files), total=len(files)))
  33. if __name__ == "__main__":
  34. main()