# tone_sandhi.py
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple

import jieba
from pypinyin import Style, lazy_pinyin
  17. class ToneSandhi:
  18. def __init__(self):
  19. # fmt: off
  20. self.must_neural_tone_words = {
  21. '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
  22. '难为', '队伍', '阔气', '闺女', '门道', '锄头', '铺盖', '铃铛', '铁匠', '钥匙', '里脊',
  23. '里头', '部分', '那么', '道士', '造化', '迷糊', '连累', '这么', '这个', '运气', '过去',
  24. '软和', '转悠', '踏实', '跳蚤', '跟头', '趔趄', '财主', '豆腐', '讲究', '记性', '记号',
  25. '认识', '规矩', '见识', '裁缝', '补丁', '衣裳', '衣服', '衙门', '街坊', '行李', '行当',
  26. '蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻',
  27. '舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂',
  28. '胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆',
  29. '老头', '老太', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂',
  30. '精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿',
  31. '窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台',
  32. '码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算',
  33. '白净', '痢疾', '痛快', '疟疾', '疙瘩', '疏忽', '畜生', '生意', '甘蔗', '琵琶', '琢磨',
  34. '琉璃', '玻璃', '玫瑰', '玄乎', '狐狸', '状元', '特务', '牲口', '牙碜', '牌楼', '爽快',
  35. '爱人', '热闹', '烧饼', '烟筒', '烂糊', '点心', '炊帚', '灯笼', '火候', '漂亮', '滑溜',
  36. '溜达', '温和', '清楚', '消息', '浪头', '活泼', '比方', '正经', '欺负', '模糊', '槟榔',
  37. '棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事',
  38. '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾',
  39. '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼',
  40. '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打点', '打扮', '打听', '打发', '扎实',
  41. '扁担', '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头',
  42. '念叨', '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼',
  43. '干事', '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数',
  44. '屁股', '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气',
  45. '实在', '官司', '学问', '学生', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈',
  46. '姑娘', '姐夫', '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方',
  47. '大意', '大夫', '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴',
  48. '嘱咐', '嘟囔', '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦',
  49. '咳嗽', '和尚', '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝',
  50. '叫唤', '口袋', '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹',
  51. '功夫', '力气', '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息',
  52. '凑合', '凉快', '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤',
  53. '佩服', '作坊', '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家',
  54. '交情', '云彩', '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故',
  55. '不由', '不在', '下水', '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨',
  56. '父亲', '母亲', '咕噜', '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅',
  57. '幸福', '熟悉', '计划', '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱',
  58. '凤凰', '拖沓', '寒碜', '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱',
  59. '扫把', '惦记'
  60. }
  61. self.must_not_neural_tone_words = {
  62. "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎"
  63. }
  64. self.punc = ":,;。?!“”‘’':,;.?!"
  65. # fmt: on
  66. # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
  67. # e.g.
  68. # word: "家里"
  69. # pos: "s"
  70. # finals: ['ia1', 'i3']
  71. def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
  72. # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
  73. for j, item in enumerate(word):
  74. if (
  75. j - 1 >= 0
  76. and item == word[j - 1]
  77. and pos[0] in {"n", "v", "a"}
  78. and word not in self.must_not_neural_tone_words
  79. ):
  80. finals[j] = finals[j][:-1] + "5"
  81. ge_idx = word.find("个")
  82. if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
  83. finals[-1] = finals[-1][:-1] + "5"
  84. elif len(word) >= 1 and word[-1] in "的地得":
  85. finals[-1] = finals[-1][:-1] + "5"
  86. # e.g. 走了, 看着, 去过
  87. elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
  88. finals[-1] = finals[-1][:-1] + "5"
  89. elif (
  90. len(word) > 1
  91. and word[-1] in "们子"
  92. and pos in {"r", "n"}
  93. and word not in self.must_not_neural_tone_words
  94. ):
  95. finals[-1] = finals[-1][:-1] + "5"
  96. # e.g. 桌上, 地下, 家里
  97. elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
  98. finals[-1] = finals[-1][:-1] + "5"
  99. # e.g. 上来, 下去
  100. elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
  101. finals[-1] = finals[-1][:-1] + "5"
  102. # 个做量词
  103. elif (
  104. ge_idx >= 1
  105. and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
  106. ) or word == "个":
  107. finals[ge_idx] = finals[ge_idx][:-1] + "5"
  108. else:
  109. if (
  110. word in self.must_neural_tone_words
  111. or word[-2:] in self.must_neural_tone_words
  112. ):
  113. finals[-1] = finals[-1][:-1] + "5"
  114. word_list = self._split_word(word)
  115. finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
  116. for i, word in enumerate(word_list):
  117. # conventional neural in Chinese
  118. if (
  119. word in self.must_neural_tone_words
  120. or word[-2:] in self.must_neural_tone_words
  121. ):
  122. finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
  123. finals = sum(finals_list, [])
  124. return finals
  125. def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
  126. # e.g. 看不懂
  127. if len(word) == 3 and word[1] == "不":
  128. finals[1] = finals[1][:-1] + "5"
  129. else:
  130. for i, char in enumerate(word):
  131. # "不" before tone4 should be bu2, e.g. 不怕
  132. if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
  133. finals[i] = finals[i][:-1] + "2"
  134. return finals
  135. def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
  136. # "一" in number sequences, e.g. 一零零, 二一零
  137. if word.find("一") != -1 and all(
  138. [item.isnumeric() for item in word if item != "一"]
  139. ):
  140. return finals
  141. # "一" between reduplication words should be yi5, e.g. 看一看
  142. elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
  143. finals[1] = finals[1][:-1] + "5"
  144. # when "一" is ordinal word, it should be yi1
  145. elif word.startswith("第一"):
  146. finals[1] = finals[1][:-1] + "1"
  147. else:
  148. for i, char in enumerate(word):
  149. if char == "一" and i + 1 < len(word):
  150. # "一" before tone4 should be yi2, e.g. 一段
  151. if finals[i + 1][-1] == "4":
  152. finals[i] = finals[i][:-1] + "2"
  153. # "一" before non-tone4 should be yi4, e.g. 一天
  154. else:
  155. # "一" 后面如果是标点,还读一声
  156. if word[i + 1] not in self.punc:
  157. finals[i] = finals[i][:-1] + "4"
  158. return finals
  159. def _split_word(self, word: str) -> List[str]:
  160. word_list = jieba.cut_for_search(word)
  161. word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
  162. first_subword = word_list[0]
  163. first_begin_idx = word.find(first_subword)
  164. if first_begin_idx == 0:
  165. second_subword = word[len(first_subword) :]
  166. new_word_list = [first_subword, second_subword]
  167. else:
  168. second_subword = word[: -len(first_subword)]
  169. new_word_list = [second_subword, first_subword]
  170. return new_word_list
  171. def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
  172. if len(word) == 2 and self._all_tone_three(finals):
  173. finals[0] = finals[0][:-1] + "2"
  174. elif len(word) == 3:
  175. word_list = self._split_word(word)
  176. if self._all_tone_three(finals):
  177. # disyllabic + monosyllabic, e.g. 蒙古/包
  178. if len(word_list[0]) == 2:
  179. finals[0] = finals[0][:-1] + "2"
  180. finals[1] = finals[1][:-1] + "2"
  181. # monosyllabic + disyllabic, e.g. 纸/老虎
  182. elif len(word_list[0]) == 1:
  183. finals[1] = finals[1][:-1] + "2"
  184. else:
  185. finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
  186. if len(finals_list) == 2:
  187. for i, sub in enumerate(finals_list):
  188. # e.g. 所有/人
  189. if self._all_tone_three(sub) and len(sub) == 2:
  190. finals_list[i][0] = finals_list[i][0][:-1] + "2"
  191. # e.g. 好/喜欢
  192. elif (
  193. i == 1
  194. and not self._all_tone_three(sub)
  195. and finals_list[i][0][-1] == "3"
  196. and finals_list[0][-1][-1] == "3"
  197. ):
  198. finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
  199. finals = sum(finals_list, [])
  200. # split idiom into two words who's length is 2
  201. elif len(word) == 4:
  202. finals_list = [finals[:2], finals[2:]]
  203. finals = []
  204. for sub in finals_list:
  205. if self._all_tone_three(sub):
  206. sub[0] = sub[0][:-1] + "2"
  207. finals += sub
  208. return finals
  209. def _all_tone_three(self, finals: List[str]) -> bool:
  210. return all(x[-1] == "3" for x in finals)
  211. # merge "不" and the word behind it
  212. # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
  213. def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
  214. new_seg = []
  215. last_word = ""
  216. for word, pos in seg:
  217. if last_word == "不":
  218. word = last_word + word
  219. if word != "不":
  220. new_seg.append((word, pos))
  221. last_word = word[:]
  222. if last_word == "不":
  223. new_seg.append((last_word, "d"))
  224. last_word = ""
  225. return new_seg
  226. # function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听"
  227. # function 2: merge single "一" and the word behind it
  228. # if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error
  229. # e.g.
  230. # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
  231. # output seg: [['听一听', 'v']]
  232. def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
  233. new_seg = []
  234. # function 1
  235. for i, (word, pos) in enumerate(seg):
  236. if (
  237. i - 1 >= 0
  238. and word == "一"
  239. and i + 1 < len(seg)
  240. and seg[i - 1][0] == seg[i + 1][0]
  241. and seg[i - 1][1] == "v"
  242. ):
  243. new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
  244. else:
  245. if (
  246. i - 2 >= 0
  247. and seg[i - 1][0] == "一"
  248. and seg[i - 2][0] == word
  249. and pos == "v"
  250. ):
  251. continue
  252. else:
  253. new_seg.append([word, pos])
  254. seg = new_seg
  255. new_seg = []
  256. # function 2
  257. for i, (word, pos) in enumerate(seg):
  258. if new_seg and new_seg[-1][0] == "一":
  259. new_seg[-1][0] = new_seg[-1][0] + word
  260. else:
  261. new_seg.append([word, pos])
  262. return new_seg
  263. # the first and the second words are all_tone_three
  264. def _merge_continuous_three_tones(
  265. self, seg: List[Tuple[str, str]]
  266. ) -> List[Tuple[str, str]]:
  267. new_seg = []
  268. sub_finals_list = [
  269. lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
  270. for (word, pos) in seg
  271. ]
  272. assert len(sub_finals_list) == len(seg)
  273. merge_last = [False] * len(seg)
  274. for i, (word, pos) in enumerate(seg):
  275. if (
  276. i - 1 >= 0
  277. and self._all_tone_three(sub_finals_list[i - 1])
  278. and self._all_tone_three(sub_finals_list[i])
  279. and not merge_last[i - 1]
  280. ):
  281. # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
  282. if (
  283. not self._is_reduplication(seg[i - 1][0])
  284. and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
  285. ):
  286. new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
  287. merge_last[i] = True
  288. else:
  289. new_seg.append([word, pos])
  290. else:
  291. new_seg.append([word, pos])
  292. return new_seg
  293. def _is_reduplication(self, word: str) -> bool:
  294. return len(word) == 2 and word[0] == word[1]
  295. # the last char of first word and the first char of second word is tone_three
  296. def _merge_continuous_three_tones_2(
  297. self, seg: List[Tuple[str, str]]
  298. ) -> List[Tuple[str, str]]:
  299. new_seg = []
  300. sub_finals_list = [
  301. lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
  302. for (word, pos) in seg
  303. ]
  304. assert len(sub_finals_list) == len(seg)
  305. merge_last = [False] * len(seg)
  306. for i, (word, pos) in enumerate(seg):
  307. if (
  308. i - 1 >= 0
  309. and sub_finals_list[i - 1][-1][-1] == "3"
  310. and sub_finals_list[i][0][-1] == "3"
  311. and not merge_last[i - 1]
  312. ):
  313. # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
  314. if (
  315. not self._is_reduplication(seg[i - 1][0])
  316. and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
  317. ):
  318. new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
  319. merge_last[i] = True
  320. else:
  321. new_seg.append([word, pos])
  322. else:
  323. new_seg.append([word, pos])
  324. return new_seg
  325. def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
  326. new_seg = []
  327. for i, (word, pos) in enumerate(seg):
  328. if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
  329. new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
  330. else:
  331. new_seg.append([word, pos])
  332. return new_seg
  333. def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
  334. new_seg = []
  335. for i, (word, pos) in enumerate(seg):
  336. if new_seg and word == new_seg[-1][0]:
  337. new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
  338. else:
  339. new_seg.append([word, pos])
  340. return new_seg
  341. def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
  342. seg = self._merge_bu(seg)
  343. try:
  344. seg = self._merge_yi(seg)
  345. except:
  346. print("_merge_yi failed")
  347. seg = self._merge_reduplication(seg)
  348. try:
  349. seg = self._merge_continuous_three_tones(seg)
  350. except:
  351. print("_merge_continuous_three_tones failed")
  352. try:
  353. seg = self._merge_continuous_three_tones_2(seg)
  354. except:
  355. print("_merge_continuous_three_tones_2 failed")
  356. seg = self._merge_er(seg)
  357. return seg
  358. def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
  359. finals = self._bu_sandhi(word, finals)
  360. finals = self._yi_sandhi(word, finals)
  361. finals = self._neural_sandhi(word, pos, finals)
  362. finals = self._three_sandhi(word, finals)
  363. return finals