rebuild_tokenizer.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. from transformers import AutoModelForCausalLM, AutoTokenizer
  2. from fish_speech.text.symbols import en_symbols, jp_symbols, zh_symbols
  # Start from the Llama-2 tokenizer and extend its vocabulary.
model_type = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_type)

# New tokens: one "<p:...>" token per distinct entry of the zh/jp/en symbol
# lists, followed by 4096 "<s:i>" tokens.
# The symbols are de-duplicated through a set, and the iteration order of a
# set of strings can differ between interpreter runs (hash randomization).
# Sorting makes the order of `new_tokens` -- and therefore the ids handed out
# by `add_tokens` -- the same on every rebuild.
unique_symbols = sorted(set(zh_symbols + jp_symbols + en_symbols))
new_tokens = [f"<p:{token}>" for token in unique_symbols] + [
    f"<s:{i}>" for i in range(4096)
]
tokenizer.add_tokens(new_tokens)

# Register a dedicated pad token, then pad and truncate on the right.
tokenizer.add_special_tokens({"pad_token": "<pad>"})
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"
  # Report the vocabulary size and the next multiple of 8 at or above it.
length = len(tokenizer)
length += -length % 8  # gap up to the next multiple of 8 (0 if already aligned)
print(f"Vocab size: {len(tokenizer)}, padded to {length}")

# Disabled: loading a model and resizing its embeddings for the new tokens.
# model = AutoModelForCausalLM.from_pretrained(
#     "fishaudio/speech-lm-300m", revision="mqtts-proto"
# )
# Resize the token embeddings to include the new tokens
# Make sure it's a multiple of 8 for faster training
# model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
# total_params = sum(p.numel() for p in model.parameters())
# print(f"Total parameters: {total_params / 1e6:.2f}M")
  # Smoke test: run one English sentence through the extended tokenizer and
# print the sentence, its token ids, and the decoded result.
sequence = (
    "All around, too, lay vast quantities of the costliest merchandise, "
    "and treasures were heaped in every cranny of the rocks, "
    "but all these things only added to the desolation of the scene."
)
encoded = tokenizer.encode(sequence)

print("Test encoding....")
print(f"\tSentence: {sequence}")
print(f"\tEncoded: {encoded}")
# NOTE(review): `encoded` is passed to batch_decode as-is; if it is a flat
# list of ids this presumably decodes each id on its own -- confirm intended.
print(f"\tDecoded: {tokenizer.batch_decode(encoded)}")
  # Disabled: pushing the model itself.
# model.push_to_hub(
#     "fishaudio/speech-lm-300m", private=True, revision="text-pretrain-10k-phones"
# )

# Upload the rebuilt tokenizer; `private=True` is passed so the repository
# is created as private.
hub_repo = "fishaudio/speech-lm-v1"
tokenizer.push_to_hub(hub_repo, private=True)