clean.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. import re
  2. SYMBOLS_MAPPING = {
  3. "\n": "",
  4. "…": ".",
  5. "“": "'",
  6. "”": "'",
  7. "‘": "'",
  8. "’": "'",
  9. "【": "",
  10. "】": "",
  11. "[": "",
  12. "]": "",
  13. "(": "",
  14. ")": "",
  15. "(": "",
  16. ")": "",
  17. "・": "",
  18. "·": "",
  19. "「": "'",
  20. "」": "'",
  21. "《": "'",
  22. "》": "'",
  23. "—": "",
  24. "~": "",
  25. "~": "",
  26. ":": ",",
  27. ";": ",",
  28. ";": ",",
  29. ":": ",",
  30. }
  31. REPLACE_SYMBOL_REGEX = re.compile(
  32. "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
  33. )
  34. EMOJI_REGEX = re.compile(
  35. "["
  36. "\U0001F600-\U0001F64F" # emoticons
  37. "\U0001F300-\U0001F5FF" # symbols & pictographs
  38. "\U0001F680-\U0001F6FF" # transport & map symbols
  39. "\U0001F1E0-\U0001F1FF" # flags (iOS)
  40. "]+",
  41. flags=re.UNICODE,
  42. )
  43. def clean_text(text):
  44. # Clean the text
  45. text = text.strip()
  46. # Replace all chinese symbols with their english counterparts
  47. text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
  48. # Remove emojis
  49. text = EMOJI_REGEX.sub(r"", text)
  50. # Remove continuous periods (...) and commas (,,,)
  51. text = re.sub(r"[.,]{2,}", lambda m: m.group()[0], text)
  52. return text