# clean.py
  1. import itertools
  2. import re
  3. import string
  4. LANGUAGE_UNICODE_RANGE_MAP = {
  5. "ZH": [(0x4E00, 0x9FFF)],
  6. "JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)],
  7. "EN": [(0x0000, 0x007F)],
  8. }
  9. SYMBOLS_MAPPING = {
  10. ":": ",",
  11. ";": ",",
  12. ",": ",",
  13. "。": ".",
  14. "!": "!",
  15. "?": "?",
  16. "\n": ".",
  17. "·": ",",
  18. "、": ",",
  19. "...": "…",
  20. "$": ".",
  21. "“": "'",
  22. "”": "'",
  23. "‘": "'",
  24. "’": "'",
  25. "(": "'",
  26. ")": "'",
  27. "(": "'",
  28. ")": "'",
  29. "《": "'",
  30. "》": "'",
  31. "【": "'",
  32. "】": "'",
  33. "[": "'",
  34. "]": "'",
  35. "—": "-",
  36. "~": "-",
  37. "~": "-",
  38. "・": "-",
  39. "「": "'",
  40. "」": "'",
  41. ";": ",",
  42. ":": ",",
  43. }
  44. REPLACE_SYMBOL_REGEX = re.compile(
  45. "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
  46. )
  47. ALL_KNOWN_UTF8_RANGE = list(
  48. itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
  49. )
  50. REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
  51. "[^"
  52. + "".join(
  53. f"{re.escape(chr(start))}-{re.escape(chr(end))}"
  54. for start, end in ALL_KNOWN_UTF8_RANGE
  55. )
  56. + "]"
  57. )
  58. def clean_text(text):
  59. # Clean the text
  60. text = text.strip()
  61. # Replace all chinese symbols with their english counterparts
  62. text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
  63. text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
  64. return text