to_flac.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. import random
  2. import subprocess
  3. from multiprocessing import Pool, cpu_count
  4. from pathlib import Path
  5. from tqdm import tqdm
  6. def convert_to_flac(src_file_path):
  7. dst_file_path = src_file_path.with_suffix(".flac")
  8. dst_file_path.parent.mkdir(parents=True, exist_ok=True)
  9. try:
  10. subprocess.check_call(
  11. [
  12. "ffmpeg",
  13. "-y",
  14. "-i",
  15. str(src_file_path),
  16. "-acodec",
  17. "flac",
  18. "-threads",
  19. "0",
  20. str(dst_file_path),
  21. ],
  22. stdout=subprocess.DEVNULL,
  23. stderr=subprocess.DEVNULL,
  24. )
  25. # remove the input file
  26. src_file_path.unlink()
  27. return True
  28. except subprocess.CalledProcessError:
  29. return False
  30. if __name__ == "__main__":
  31. src_dir = Path("dataset/tts/WenetSpeech/cleaned")
  32. wav_files = list(src_dir.rglob("*.wav"))
  33. random.shuffle(wav_files)
  34. print(f"Found {len(wav_files)} wav files")
  35. success_counter = 0
  36. fail_counter = 0
  37. with Pool(processes=cpu_count(), maxtasksperchild=100) as pool:
  38. with tqdm(
  39. pool.imap_unordered(convert_to_flac, wav_files), total=len(wav_files)
  40. ) as pbar:
  41. for success in pbar:
  42. if success:
  43. success_counter += 1
  44. else:
  45. fail_counter += 1
  46. pbar.set_description(f"Success: {success_counter}, Fail: {fail_counter}")
  47. print(f"Successfully converted: {success_counter}")
  48. print(f"Failed conversions: {fail_counter}")