# build_vq_text.py
  1. from pathlib import Path
  2. from datasets import Dataset
  3. def parse_data(wav_dir, item):
  4. text_file = (wav_dir / item["item_name"]).with_suffix(".txt")
  5. text = text_file.read_text().strip()
  6. semantic = item["semantic_audio"]
  7. semantic = [f"<semantic_{x}>" for x in semantic.split(" ")]
  8. semantic = " ".join(semantic)
  9. text = f"[INST] {text} [/INST] {semantic} </s>"
  10. return {
  11. "text": text,
  12. }
  13. if __name__ == "__main__":
  14. # dataset = WenetVQDataset()
  15. # dataset = list(dataset)
  16. # print("Initialized dataset.")
  17. dataset = Dataset.from_csv("data/cn-hubert-wenet-25hz-semantic.tsv", delimiter="\t")
  18. dataset = dataset.map(
  19. lambda item: parse_data(Path("data/WenetSpeech"), item), num_proc=64
  20. )
  21. dataset = dataset.remove_columns(["item_name", "semantic_audio"])
  22. dataset = dataset.train_test_split(test_size=0.01)
  23. print(dataset["test"][0])
  24. dataset.push_to_hub("fishaudio/wenet-vq", private=True)