text.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
# -*- coding: utf-8 -*-
"""
TEXT class.

Defines ``Text``, which rewrites the numeric expressions found in a piece of
text (dates, money, phone numbers, fractions, percentages, plain numbers)
by delegating each kind to the converter classes imported below.
"""
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
# NOTE(review): ``__data__`` is presumably a typo for ``__date__``; the name is
# kept unchanged so that anything reading this attribute keeps working.
__data__ = "2019-05-03"
  7. import re
  8. from fish_speech.text.chn_text_norm.cardinal import Cardinal
  9. from fish_speech.text.chn_text_norm.date import Date
  10. from fish_speech.text.chn_text_norm.digit import Digit
  11. from fish_speech.text.chn_text_norm.fraction import Fraction
  12. from fish_speech.text.chn_text_norm.money import Money
  13. from fish_speech.text.chn_text_norm.percentage import Percentage
  14. from fish_speech.text.chn_text_norm.telephone import TelePhone
  15. CURRENCY_NAMES = (
  16. "(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|"
  17. "里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)"
  18. )
  19. CURRENCY_UNITS = "((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)"
  20. COM_QUANTIFIERS = (
  21. "(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|"
  22. "砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|"
  23. "针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|"
  24. "毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|"
  25. "盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|"
  26. "纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|人|抽)"
  27. )
  28. class Text:
  29. """
  30. Text类
  31. """
  32. def __init__(self, raw_text, norm_text=None):
  33. self.raw_text = "^" + raw_text + "$"
  34. self.norm_text = norm_text
  35. def _particular(self):
  36. text = self.norm_text
  37. pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
  38. matchers = pattern.findall(text)
  39. if matchers:
  40. # print('particular')
  41. for matcher in matchers:
  42. text = text.replace(matcher[0], matcher[1] + "2" + matcher[2], 1)
  43. self.norm_text = text
  44. return self.norm_text
  45. def normalize(self):
  46. text = self.raw_text
  47. # 规范化日期
  48. pattern = re.compile(
  49. r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)"
  50. )
  51. matchers = pattern.findall(text)
  52. if matchers:
  53. # print('date')
  54. for matcher in matchers:
  55. text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
  56. # 规范化金钱
  57. pattern = re.compile(
  58. r"\D+((\d+(\.\d+)?)[多余几]?"
  59. + CURRENCY_UNITS
  60. + "(\d"
  61. + CURRENCY_UNITS
  62. + "?)?)"
  63. )
  64. matchers = pattern.findall(text)
  65. if matchers:
  66. # print('money')
  67. for matcher in matchers:
  68. text = text.replace(
  69. matcher[0], Money(money=matcher[0]).money2chntext(), 1
  70. )
  71. # 规范化固话/手机号码
  72. # 手机
  73. # http://www.jihaoba.com/news/show/13680
  74. # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
  75. # 联通:130、131、132、156、155、186、185、176
  76. # 电信:133、153、189、180、181、177
  77. pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
  78. matchers = pattern.findall(text)
  79. if matchers:
  80. # print('telephone')
  81. for matcher in matchers:
  82. text = text.replace(
  83. matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1
  84. )
  85. # 固话
  86. pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
  87. matchers = pattern.findall(text)
  88. if matchers:
  89. # print('fixed telephone')
  90. for matcher in matchers:
  91. text = text.replace(
  92. matcher[0],
  93. TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True),
  94. 1,
  95. )
  96. # 规范化分数
  97. pattern = re.compile(r"(\d+/\d+)")
  98. matchers = pattern.findall(text)
  99. if matchers:
  100. # print('fraction')
  101. for matcher in matchers:
  102. text = text.replace(
  103. matcher, Fraction(fraction=matcher).fraction2chntext(), 1
  104. )
  105. # 规范化百分数
  106. text = text.replace("%", "%")
  107. pattern = re.compile(r"(\d+(\.\d+)?%)")
  108. matchers = pattern.findall(text)
  109. if matchers:
  110. # print('percentage')
  111. for matcher in matchers:
  112. text = text.replace(
  113. matcher[0],
  114. Percentage(percentage=matcher[0]).percentage2chntext(),
  115. 1,
  116. )
  117. # 规范化纯数+量词
  118. pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
  119. matchers = pattern.findall(text)
  120. if matchers:
  121. # print('cardinal+quantifier')
  122. for matcher in matchers:
  123. text = text.replace(
  124. matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1
  125. )
  126. # 规范化数字编号
  127. pattern = re.compile(r"(\d{4,32})")
  128. matchers = pattern.findall(text)
  129. if matchers:
  130. # print('digit')
  131. for matcher in matchers:
  132. text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)
  133. # 规范化纯数
  134. pattern = re.compile(r"(\d+(\.\d+)?)")
  135. matchers = pattern.findall(text)
  136. if matchers:
  137. # print('cardinal')
  138. for matcher in matchers:
  139. text = text.replace(
  140. matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1
  141. )
  142. self.norm_text = text
  143. self._particular()
  144. return self.norm_text.lstrip("^").rstrip("$")
  145. if __name__ == "__main__":
  146. # 测试程序
  147. print(Text(raw_text="固话:0595-23865596或23880880。").normalize())
  148. print(Text(raw_text="手机:+86 19859213959或15659451527。").normalize())
  149. print(Text(raw_text="分数:32477/76391。").normalize())
  150. print(Text(raw_text="百分数:80.03%。").normalize())
  151. print(Text(raw_text="编号:31520181154418。").normalize())
  152. print(Text(raw_text="纯数:2983.07克或12345.60米。").normalize())
  153. print(Text(raw_text="日期:1999年2月20日或09年3月15号。").normalize())
  154. print(Text(raw_text="金钱:12块5,34.5元,20.1万").normalize())
  155. print(Text(raw_text="特殊:O2O或B2C。").normalize())