# clean.py (832 B)

  1. import re
  2. SYMBOLS_MAPPING = {
  3. "‘": "'",
  4. "’": "'",
  5. }
  6. REPLACE_SYMBOL_REGEX = re.compile(
  7. "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
  8. )
  9. EMOJI_REGEX = re.compile(
  10. "["
  11. "\U0001f600-\U0001f64f" # emoticons
  12. "\U0001f300-\U0001f5ff" # symbols & pictographs
  13. "\U0001f680-\U0001f6ff" # transport & map symbols
  14. "\U0001f1e0-\U0001f1ff" # flags (iOS)
  15. "]+",
  16. flags=re.UNICODE,
  17. )
  18. def clean_text(text):
  19. # Clean the text
  20. text = text.strip()
  21. # Replace all chinese symbols with their english counterparts
  22. text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
  23. # Remove emojis
  24. text = EMOJI_REGEX.sub(r"", text)
  25. # Remove continuous periods (...) and commas (,,,)
  26. text = re.sub(r"[,]{2,}", lambda m: m.group()[0], text)
  27. return text