| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071 |
- import itertools
- import re
- import string
# Inclusive (first, last) Unicode code-point ranges associated with each
# language tag.
LANGUAGE_UNICODE_RANGE_MAP = dict(
    ZH=[
        (0x4E00, 0x9FFF),  # CJK Unified Ideographs
    ],
    JP=[
        (0x4E00, 0x9FFF),  # CJK Unified Ideographs (kanji)
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x31F0, 0x31FF),  # Katakana Phonetic Extensions
    ],
    EN=[
        (0x0000, 0x007F),  # Basic Latin (ASCII)
    ],
)
# Maps punctuation found in Chinese/Japanese text (and a few ASCII symbols)
# to a small ASCII punctuation set plus the ellipsis: , . ! ? ' - …
#
# Every key appears exactly once: a repeated key in a dict literal is
# silently overwritten, which hides mistakes. Full-width forms are written
# as \uXXXX escapes so they cannot be confused with, or collapsed into,
# their ASCII look-alikes. The ASCII entries (including the identity ones)
# are kept so that existing lookups keep working.
SYMBOLS_MAPPING = {
    # Pause-like separators -> comma
    ":": ",",
    "\uff1a": ",",  # FULLWIDTH COLON
    ";": ",",
    "\uff1b": ",",  # FULLWIDTH SEMICOLON
    ",": ",",
    "\uff0c": ",",  # FULLWIDTH COMMA
    "·": ",",
    "、": ",",
    # Sentence terminators
    "。": ".",
    "\n": ".",
    "$": ".",
    "!": "!",
    "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
    "?": "?",
    "\uff1f": "?",  # FULLWIDTH QUESTION MARK
    "...": "…",
    # Quotes and brackets -> apostrophe
    "“": "'",
    "”": "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "\uff08": "'",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": "'",  # FULLWIDTH RIGHT PARENTHESIS
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "「": "'",
    "」": "'",
    # Dashes and dash-like marks -> hyphen
    "—": "-",
    "~": "-",
    "\uff5e": "-",  # FULLWIDTH TILDE
    "・": "-",
}
# Alternation that matches any key of SYMBOLS_MAPPING. Keys are escaped so
# that regex metacharacters in them (e.g. "(", "?", "$", "...") match
# literally.
REPLACE_SYMBOL_REGEX = re.compile("|".join(map(re.escape, SYMBOLS_MAPPING)))
# Flat list of every (start, end) pair in LANGUAGE_UNICODE_RANGE_MAP, in
# dict order. A range shared by several languages appears once per language.
# NOTE: despite the name, the values are Unicode code points (they are fed
# to chr() when the removal regex is built), not UTF-8 byte values.
ALL_KNOWN_UTF8_RANGE = [
    code_range
    for language_ranges in LANGUAGE_UNICODE_RANGE_MAP.values()
    for code_range in language_ranges
]
# Negated character class: matches any single character that is neither
# inside one of the ALL_KNOWN_UTF8_RANGE code-point ranges nor one of the
# characters SYMBOLS_MAPPING can produce.
#
# The second part matters for the ellipsis: SYMBOLS_MAPPING turns "..." into
# "…" (U+2026), which lies outside every language range. Without allowing
# it here, the replacement would be inserted and then immediately stripped,
# silently deleting every "..." from the text.
REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
    "[^"
    + "".join(
        f"{re.escape(chr(start))}-{re.escape(chr(end))}"
        for start, end in ALL_KNOWN_UTF8_RANGE
    )
    # sorted() keeps the compiled pattern deterministic across runs.
    + "".join(
        re.escape(char) for char in sorted(set("".join(SYMBOLS_MAPPING.values())))
    )
    + "]"
)
def clean_text(text):
    """Normalize punctuation in *text* and drop unrecognized characters.

    Steps, in order:

    1. Strip leading and trailing whitespace (so only interior newlines
       reach the symbol replacement step).
    2. Replace every match of ``REPLACE_SYMBOL_REGEX`` with the value stored
       for it in ``SYMBOLS_MAPPING``.
    3. Delete every character matched by ``REMOVE_UNKNOWN_SYMBOL_REGEX``.

    Returns the cleaned string.
    """

    def _replacement_for(match):
        # The regex is built from the mapping's keys, so the lookup succeeds
        # for every match.
        return SYMBOLS_MAPPING[match.group()]

    stripped = text.strip()
    normalized = REPLACE_SYMBOL_REGEX.sub(_replacement_for, stripped)
    return REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", normalized)
|