clean.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. import itertools
  2. import re
  3. LANGUAGE_UNICODE_RANGE_MAP = {
  4. "ZH": [(0x4E00, 0x9FFF)],
  5. "JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)],
  6. "EN": [(0x0000, 0x007F)],
  7. }
  8. SYMBOLS_MAPPING = {
  9. ":": ",",
  10. ";": ",",
  11. ",": ",",
  12. "。": ".",
  13. "!": "!",
  14. "?": "?",
  15. "\n": ".",
  16. "·": ",",
  17. "、": ",",
  18. "...": "…",
  19. "“": "'",
  20. "”": "'",
  21. "‘": "'",
  22. "’": "'",
  23. "(": "'",
  24. ")": "'",
  25. "(": "'",
  26. ")": "'",
  27. "《": "'",
  28. "》": "'",
  29. "【": "'",
  30. "】": "'",
  31. "[": "'",
  32. "]": "'",
  33. "—": "-",
  34. "~": "-",
  35. "~": "-",
  36. "・": "-",
  37. "「": "'",
  38. "」": "'",
  39. ";": ",",
  40. ":": ",",
  41. }
  42. REPLACE_SYMBOL_REGEX = re.compile(
  43. "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
  44. )
  45. ALL_KNOWN_UTF8_RANGE = list(
  46. itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
  47. )
  48. REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
  49. "[^"
  50. + "".join(
  51. f"{re.escape(chr(start))}-{re.escape(chr(end))}"
  52. for start, end in ALL_KNOWN_UTF8_RANGE
  53. )
  54. + "]"
  55. )
  56. def clean_text(text):
  57. # Clean the text
  58. text = text.strip()
  59. # Replace all chinese symbols with their english counterparts
  60. text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
  61. text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
  62. return text