from transformers import AutoModelForCausalLM, AutoTokenizer from fish_speech.text.symbols import en_symbols, jp_symbols, zh_symbols # reuse the tokenizer from the llama model_type = "meta-llama/Llama-2-7b-hf" tokenizer = AutoTokenizer.from_pretrained(model_type) # new tokens new_tokens = list(set(zh_symbols + jp_symbols + en_symbols)) new_tokens = [f"" for token in new_tokens] + [ f"" for i in range(4096) ] tokenizer.add_tokens(new_tokens) tokenizer.add_special_tokens({"pad_token": ""}) # pad token tokenizer.padding_side = "right" tokenizer.truncation_side = "right" length = len(tokenizer) if length % 8 != 0: length += 8 - (length % 8) print(f"Vocab size: {len(tokenizer)}, padded to {length}") # model = AutoModelForCausalLM.from_pretrained( # "fishaudio/speech-lm-300m", revision="mqtts-proto" # ) # Resize the token embeddings to include the new tokens # Make sure it's a multiple of 8 for faster training # model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8) # total_params = sum(p.numel() for p in model.parameters()) # print(f"Total parameters: {total_params / 1e6:.2f}M") # Try tokenizing a new sequence sequence = "All around, too, lay vast quantities of the costliest merchandise, and treasures were heaped in every cranny of the rocks, but all these things only added to the desolation of the scene." encoded = tokenizer.encode(sequence) print("Test encoding....") print(f"\tSentence: {sequence}") print(f"\tEncoded: {encoded}") print(f"\tDecoded: {tokenizer.batch_decode(encoded)}") # model.push_to_hub( # "fishaudio/speech-lm-300m", private=True, revision="text-pretrain-10k-phones" # ) tokenizer.push_to_hub("fishaudio/speech-lm-v1", private=True)