| 1234567891011121314151617181920212223242526272829303132333435363738394041424344 |
- from pathlib import Path
- import click
- from loguru import logger
- from fish_speech.datasets.protos.text_data_stream import split_pb_stream
@click.command()
@click.argument("input", type=click.Path(exists=True, path_type=Path))
@click.argument("output", type=click.Path(path_type=Path))
@click.option("--chunk-size", type=int, default=1024**3)  # 1GB
def main(input, output, chunk_size):
    """Split one protobuf stream file into several size-bounded files.

    Every item yielded by ``split_pb_stream`` is written, unmodified and in
    order, to files named ``<input stem>.<NNNN>.protos`` inside ``output``.
    A new file is started whenever appending the next item would push the
    current file past ``chunk_size`` bytes. Items are never split, so an
    item larger than ``chunk_size`` ends up alone in a file that exceeds
    the limit.

    Args:
        input: Path of the existing file to split (opened in binary mode).
        output: Directory that receives the shard files; created, together
            with missing parents, when it does not exist yet.
        chunk_size: Soft upper bound, in bytes, for each shard file.
    """
    # exist_ok avoids the check-then-create race and also fails early with
    # a clear error if ``output`` exists but is not a directory.
    output.mkdir(parents=True, exist_ok=True)

    chunk_idx = 0  # number of shard files opened so far
    current_size = 0  # bytes written to the currently open shard
    current_file = None

    try:
        with open(input, "rb") as f:
            for chunk in split_pb_stream(f):
                needs_new_file = (
                    current_file is None
                    or current_size + len(chunk) > chunk_size
                )
                if needs_new_file:
                    if current_file is not None:
                        current_file.close()

                    current_file = open(
                        output / f"{input.stem}.{chunk_idx:04d}.protos", "wb"
                    )
                    chunk_idx += 1
                    current_size = 0
                    logger.info(f"Writing to {current_file.name}")

                current_file.write(chunk)
                current_size += len(chunk)
    finally:
        # Always release the shard handle, even when reading, parsing or
        # writing fails part-way; closing an already-closed file is a no-op.
        if current_file is not None:
            current_file.close()

    logger.info(f"Split {input} into {chunk_idx} files")
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|