# build_vq_text.py

  1. from functools import partial
  2. from pathlib import Path
  3. import numpy as np
  4. from datasets import Dataset
  5. def parse_data(phones, items):
  6. results = []
  7. for item_name, semantic_audio in zip(items["item_name"], items["semantic_audio"]):
  8. wav_file = Path(item_name)
  9. text_file = wav_file.with_suffix(".txt")
  10. if not text_file.exists():
  11. text_file = wav_file.with_suffix(".lab")
  12. if not text_file.exists():
  13. print(f"Missing {text_file}")
  14. return None
  15. text = text_file.read_text().strip()
  16. semantic = [f"<semantic_{x}>" for x in semantic_audio.split(" ")]
  17. semantic = " ".join(semantic)
  18. results.append(f"[INST] {text} [/INST] {semantic} </s>")
  19. results.append(f"[INST] {phones[item_name]} [/INST] {semantic} </s>")
  20. return {
  21. "text": results,
  22. }
  23. if __name__ == "__main__":
  24. phones = np.load("dump/phoneme_train.npy", allow_pickle=True).item()
  25. phones1 = np.load(
  26. "/home/fish/hubert-vq-vits/dump/phoneme_train.npy", allow_pickle=True
  27. ).item()
  28. phones.update(phones1)
  29. print(len(phones))
  30. dataset = Dataset.from_csv(
  31. [
  32. "dump/semantic_train.tsv",
  33. "/home/fish/hubert-vq-vits/dump/semantic_train.tsv",
  34. ],
  35. delimiter="\t",
  36. split="train",
  37. )
  38. dataset = dataset.map(
  39. partial(parse_data, phones),
  40. num_proc=32,
  41. remove_columns=dataset.column_names,
  42. batched=True,
  43. )
  44. print(len(dataset), dataset[0])
  45. dataset.push_to_hub("fishaudio/cn-hubert-25hz-vq", private=True)