# chinese.py — Chinese grapheme-to-phoneme frontend.
  1. import os
  2. import re
  3. import jieba.posseg as psg
  4. from loguru import logger
  5. from pypinyin import Style, lazy_pinyin
  6. from fish_speech.text.symbols import punctuation
  7. from fish_speech.text.tone_sandhi import ToneSandhi
  8. try:
  9. from tn.chinese.normalizer import Normalizer
  10. normalizer = Normalizer().normalize
  11. except ImportError:
  12. import cn2an
  13. logger.warning("tn.chinese.normalizer not found, use cn2an normalizer")
  14. normalizer = lambda x: cn2an.transform(x, "an2cn")
  15. current_file_path = os.path.dirname(__file__)
  16. OPENCPOP_DICT_PATH = os.path.join(current_file_path, "opencpop-strict.txt")
  17. pinyin_to_symbol_map = {
  18. line.split("\t")[0]: line.strip().split("\t")[1]
  19. for line in open(OPENCPOP_DICT_PATH).readlines()
  20. }
  21. tone_modifier = ToneSandhi()
  22. def replace_punctuation(text):
  23. text = text.replace("嗯", "恩").replace("呣", "母")
  24. replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", text)
  25. return replaced_text
  26. def g2p(text):
  27. text = text_normalize(text)
  28. text = replace_punctuation(text)
  29. pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
  30. sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
  31. phones = _g2p(sentences)
  32. return phones
  33. def _get_initials_finals(word):
  34. initials = []
  35. finals = []
  36. orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
  37. orig_finals = lazy_pinyin(
  38. word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
  39. )
  40. for c, v in zip(orig_initials, orig_finals):
  41. initials.append(c)
  42. finals.append(v)
  43. return initials, finals
  44. def _g2p(segments):
  45. phones_list = []
  46. for seg in segments:
  47. pinyins = []
  48. # Replace all English words in the sentence
  49. seg = re.sub("[a-zA-Z]+", "", seg)
  50. seg_cut = psg.lcut(seg)
  51. initials = []
  52. finals = []
  53. seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
  54. for word, pos in seg_cut:
  55. if pos == "eng":
  56. continue
  57. sub_initials, sub_finals = _get_initials_finals(word)
  58. sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
  59. initials.append(sub_initials)
  60. finals.append(sub_finals)
  61. # assert len(sub_initials) == len(sub_finals) == len(word)
  62. initials = sum(initials, [])
  63. finals = sum(finals, [])
  64. #
  65. for c, v in zip(initials, finals):
  66. raw_pinyin = c + v
  67. # NOTE: post process for pypinyin outputs
  68. # we discriminate i, ii and iii
  69. if c == v:
  70. assert c in punctuation
  71. phone = [c]
  72. else:
  73. v_without_tone = v[:-1]
  74. tone = v[-1]
  75. pinyin = c + v_without_tone
  76. assert tone in "12345"
  77. if c:
  78. # 多音节
  79. v_rep_map = {
  80. "uei": "ui",
  81. "iou": "iu",
  82. "uen": "un",
  83. }
  84. if v_without_tone in v_rep_map.keys():
  85. pinyin = c + v_rep_map[v_without_tone]
  86. else:
  87. # 单音节
  88. pinyin_rep_map = {
  89. "ing": "ying",
  90. "i": "yi",
  91. "in": "yin",
  92. "u": "wu",
  93. }
  94. if pinyin in pinyin_rep_map.keys():
  95. pinyin = pinyin_rep_map[pinyin]
  96. else:
  97. single_rep_map = {
  98. "v": "yu",
  99. "e": "e",
  100. "i": "y",
  101. "u": "w",
  102. }
  103. if pinyin[0] in single_rep_map.keys():
  104. pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
  105. assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
  106. new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
  107. new_v = new_v + tone
  108. phone = [new_c, new_v]
  109. phones_list += phone
  110. return phones_list
  111. def text_normalize(text):
  112. return normalizer(text)
  113. if __name__ == "__main__":
  114. text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
  115. text = "呣呣呣~就是…大人的鼹鼠党吧?"
  116. # text = "你好"
  117. text = text_normalize(text)
  118. print(g2p(text))