japanese.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
  2. import re
  3. import pyopenjtalk
  4. # Regular expression matching Japanese without punctuation marks:
  5. _japanese_characters = re.compile(
  6. r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
  7. )
  8. # Regular expression matching non-Japanese characters or punctuation marks:
  9. _japanese_marks = re.compile(
  10. r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
  11. )
  12. # List of (symbol, Japanese) pairs for marks:
  13. _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
  14. # List of (consonant, sokuon) pairs:
  15. _real_sokuon = [
  16. (re.compile("%s" % x[0]), x[1])
  17. for x in [
  18. (r"Q([↑↓]*[kg])", r"k#\1"),
  19. (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
  20. (r"Q([↑↓]*[sʃ])", r"s\1"),
  21. (r"Q([↑↓]*[pb])", r"p#\1"),
  22. ]
  23. ]
  24. # List of (consonant, hatsuon) pairs:
  25. _real_hatsuon = [
  26. (re.compile("%s" % x[0]), x[1])
  27. for x in [
  28. (r"N([↑↓]*[pbm])", r"m\1"),
  29. (r"N([↑↓]*[ʧʥj])", r"n^\1"),
  30. (r"N([↑↓]*[tdn])", r"n\1"),
  31. (r"N([↑↓]*[kg])", r"ŋ\1"),
  32. ]
  33. ]
  34. def symbols_to_japanese(text):
  35. for regex, replacement in _symbols_to_japanese:
  36. text = re.sub(regex, replacement, text)
  37. return text
  38. def g2p(text):
  39. """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
  40. text = symbols_to_japanese(text)
  41. sentences = re.split(_japanese_marks, text)
  42. marks = re.findall(_japanese_marks, text)
  43. ct = text
  44. text = []
  45. for i, sentence in enumerate(sentences):
  46. if re.match(_japanese_characters, sentence):
  47. p = pyopenjtalk.g2p(sentence)
  48. text += p.split(" ")
  49. if i < len(marks):
  50. text += [marks[i].replace(" ", "")]
  51. # Clean empty strings
  52. text = [t for t in text if t.strip() != ""]
  53. text = ["-" if t == "pau" else t for t in text]
  54. return text