|
|
@@ -1,61 +1,24 @@
|
|
|
-import itertools
|
|
|
import re
|
|
|
|
|
|
-LANGUAGE_UNICODE_RANGE_MAP = {
|
|
|
- "ZH": [(0x4E00, 0x9FFF)],
|
|
|
- "JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)],
|
|
|
- "EN": [(0x0000, 0x007F)],
|
|
|
-}
|
|
|
-
|
|
|
SYMBOLS_MAPPING = {
|
|
|
- ":": ",",
|
|
|
- ";": ",",
|
|
|
- ",": ",",
|
|
|
- "。": ".",
|
|
|
- "!": "!",
|
|
|
- "?": "?",
|
|
|
- "\n": ".",
|
|
|
- "·": ",",
|
|
|
- "、": ",",
|
|
|
- "...": "…",
|
|
|
"“": "'",
|
|
|
"”": "'",
|
|
|
"‘": "'",
|
|
|
"’": "'",
|
|
|
- "(": "'",
|
|
|
- ")": "'",
|
|
|
- "(": "'",
|
|
|
- ")": "'",
|
|
|
- "《": "'",
|
|
|
- "》": "'",
|
|
|
- "【": "'",
|
|
|
- "】": "'",
|
|
|
- "[": "'",
|
|
|
- "]": "'",
|
|
|
- "—": "-",
|
|
|
- "~": "-",
|
|
|
- "~": "-",
|
|
|
- "・": "-",
|
|
|
- "「": "'",
|
|
|
- "」": "'",
|
|
|
- ";": ",",
|
|
|
- ":": ",",
|
|
|
+ "【": "",
|
|
|
+ "】": "",
|
|
|
+ "[": "",
|
|
|
+ "]": "",
|
|
|
+ "(": "",
|
|
|
+ ")": "",
|
|
|
+ "(": "",
|
|
|
+ ")": "",
|
|
|
+ "・": "·",
|
|
|
}
|
|
|
|
|
|
REPLACE_SYMBOL_REGEX = re.compile(
|
|
|
"|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
|
|
|
)
|
|
|
-ALL_KNOWN_UTF8_RANGE = list(
|
|
|
- itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
|
|
|
-)
|
|
|
-REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
|
|
|
- "[^"
|
|
|
- + "".join(
|
|
|
- f"{re.escape(chr(start))}-{re.escape(chr(end))}"
|
|
|
- for start, end in ALL_KNOWN_UTF8_RANGE
|
|
|
- )
|
|
|
- + "]"
|
|
|
-)
|
|
|
|
|
|
|
|
|
def clean_text(text):
|
|
|
@@ -64,6 +27,5 @@ def clean_text(text):
|
|
|
|
|
|
# Replace all Chinese symbols with their English counterparts
|
|
|
text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
|
|
|
- # text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
|
|
|
|
|
|
return text
|