| 12345678910111213141516171819202122232425262728293031323334353637 |
- import re
- SYMBOLS_MAPPING = {
- "‘": "'",
- "’": "'",
- }
- REPLACE_SYMBOL_REGEX = re.compile(
- "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
- )
- EMOJI_REGEX = re.compile(
- "["
- "\U0001f600-\U0001f64f" # emoticons
- "\U0001f300-\U0001f5ff" # symbols & pictographs
- "\U0001f680-\U0001f6ff" # transport & map symbols
- "\U0001f1e0-\U0001f1ff" # flags (iOS)
- "]+",
- flags=re.UNICODE,
- )
- def clean_text(text):
- # Clean the text
- text = text.strip()
- # Replace all chinese symbols with their english counterparts
- text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
- # Remove emojis
- text = EMOJI_REGEX.sub(r"", text)
- # Remove continuous periods (...) and commas (,,,)
- text = re.sub(r"[,]{2,}", lambda m: m.group()[0], text)
- return text
|