# clean.py (556 B)
  1. import re
  2. SYMBOLS_MAPPING = {
  3. "“": "'",
  4. "”": "'",
  5. "‘": "'",
  6. "’": "'",
  7. "【": "",
  8. "】": "",
  9. "[": "",
  10. "]": "",
  11. "(": "",
  12. ")": "",
  13. "(": "",
  14. ")": "",
  15. "・": "·",
  16. }
  17. REPLACE_SYMBOL_REGEX = re.compile(
  18. "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
  19. )
  20. def clean_text(text):
  21. # Clean the text
  22. text = text.strip()
  23. # Replace all chinese symbols with their english counterparts
  24. text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
  25. return text