clean.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. import itertools
  2. import re
  3. import string
  4. LANGUAGE_UNICODE_RANGE_MAP = {
  5. "ZH": [(0x4E00, 0x9FFF)],
  6. "JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)],
  7. "EN": [(0x0000, 0x007F)],
  8. }
  9. SYMBOLS_MAPPING = {
  10. ":": ",",
  11. ";": ",",
  12. ",": ",",
  13. "。": ".",
  14. "!": "!",
  15. "?": "?",
  16. "\n": ".",
  17. "·": ",",
  18. "、": ",",
  19. "...": "…",
  20. "“": "'",
  21. "”": "'",
  22. "‘": "'",
  23. "’": "'",
  24. "(": "'",
  25. ")": "'",
  26. "(": "'",
  27. ")": "'",
  28. "《": "'",
  29. "》": "'",
  30. "【": "'",
  31. "】": "'",
  32. "[": "'",
  33. "]": "'",
  34. "—": "-",
  35. "~": "-",
  36. "~": "-",
  37. "・": "-",
  38. "「": "'",
  39. "」": "'",
  40. ";": ",",
  41. ":": ",",
  42. }
  43. REPLACE_SYMBOL_REGEX = re.compile(
  44. "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
  45. )
  46. ALL_KNOWN_UTF8_RANGE = list(
  47. itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
  48. )
  49. REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
  50. "[^"
  51. + "".join(
  52. f"{re.escape(chr(start))}-{re.escape(chr(end))}"
  53. for start, end in ALL_KNOWN_UTF8_RANGE
  54. )
  55. + "]"
  56. )
  57. def clean_text(text):
  58. # Clean the text
  59. text = text.strip()
  60. # Replace all chinese symbols with their english counterparts
  61. text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
  62. text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
  63. return text