|
|
@@ -1,7 +1,7 @@
|
|
|
import re
|
|
|
|
|
|
SYMBOLS_MAPPING = {
|
|
|
- "\n": ".",
|
|
|
+ "\n": "",
|
|
|
"…": ".",
|
|
|
"“": "'",
|
|
|
"”": "'",
|
|
|
@@ -35,6 +35,17 @@ REPLACE_SYMBOL_REGEX = re.compile(
|
|
|
)
|
|
|
|
|
|
|
|
|
+EMOJI_REGEX = re.compile(
|
|
|
+ "["
|
|
|
+ "\U0001F600-\U0001F64F" # emoticons
|
|
|
+ "\U0001F300-\U0001F5FF" # symbols & pictographs
|
|
|
+ "\U0001F680-\U0001F6FF" # transport & map symbols
|
|
|
+ "\U0001F1E0-\U0001F1FF" # flags (iOS)
|
|
|
+ "]+",
|
|
|
+ flags=re.UNICODE,
|
|
|
+)
|
|
|
+
|
|
|
+
|
|
|
def clean_text(text):
|
|
|
# Clean the text
|
|
|
text = text.strip()
|
|
|
@@ -42,4 +53,10 @@ def clean_text(text):
|
|
|
# Replace all chinese symbols with their english counterparts
|
|
|
text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
|
|
|
|
|
|
+ # Remove emojis
|
|
|
+ text = EMOJI_REGEX.sub(r"", text)
|
|
|
+
|
|
|
+ # Collapse each run of two or more periods/commas ("...", ",,,", ".,,") into the run's first character
|
|
|
+ text = re.sub(r"[.,]{2,}", lambda m: m.group()[0], text)
|
|
|
+
|
|
|
return text
|