# split_protos.py (1.2 KB)
  1. from pathlib import Path
  2. import click
  3. from loguru import logger
  4. from fish_speech.datasets.protos.text_data_stream import split_pb_stream
  5. @click.command()
  6. @click.argument("input", type=click.Path(exists=True, path_type=Path))
  7. @click.argument("output", type=click.Path(path_type=Path))
  8. @click.option("--chunk-size", type=int, default=1024**3) # 1GB
  9. def main(input, output, chunk_size):
  10. chunk_idx = 0
  11. current_size = 0
  12. current_file = None
  13. if output.exists() is False:
  14. output.mkdir(parents=True)
  15. with open(input, "rb") as f:
  16. for chunk in split_pb_stream(f):
  17. if current_file is None or current_size + len(chunk) > chunk_size:
  18. if current_file is not None:
  19. current_file.close()
  20. current_file = open(
  21. output / f"{input.stem}.{chunk_idx:04d}.protos", "wb"
  22. )
  23. chunk_idx += 1
  24. current_size = 0
  25. logger.info(f"Writing to {current_file.name}")
  26. current_file.write(chunk)
  27. current_size += len(chunk)
  28. if current_file is not None:
  29. current_file.close()
  30. logger.info(f"Split {input} into {chunk_idx} files")
  31. if __name__ == "__main__":
  32. main()