clean.py 766 B

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. import re
  2. SYMBOLS_MAPPING = {
  3. "\n": ".",
  4. "…": ".",
  5. "“": "'",
  6. "”": "'",
  7. "‘": "'",
  8. "’": "'",
  9. "【": "",
  10. "】": "",
  11. "[": "",
  12. "]": "",
  13. "(": "",
  14. ")": "",
  15. "(": "",
  16. ")": "",
  17. "・": "",
  18. "·": "",
  19. "「": "'",
  20. "」": "'",
  21. "《": "'",
  22. "》": "'",
  23. "—": "",
  24. "~": "",
  25. "~": "",
  26. ":": ",",
  27. ";": ",",
  28. ";": ",",
  29. ":": ",",
  30. }
  31. REPLACE_SYMBOL_REGEX = re.compile(
  32. "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
  33. )
  34. def clean_text(text):
  35. # Clean the text
  36. text = text.strip()
  37. # Replace all chinese symbols with their english counterparts
  38. text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
  39. return text