| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
- import itertools
- import re
# Inclusive (first, last) code-point bounds of the Unicode blocks that the
# cleaner treats as known text.
_CJK_UNIFIED_IDEOGRAPHS = (0x4E00, 0x9FFF)
_HIRAGANA = (0x3040, 0x309F)
_KATAKANA = (0x30A0, 0x30FF)
_KATAKANA_PHONETIC_EXTENSIONS = (0x31F0, 0x31FF)
_BASIC_LATIN = (0x0000, 0x007F)

# Language tag -> list of inclusive code-point ranges accepted for that
# language. Insertion order is significant: the ranges are later flattened in
# this order to build the "unknown character" pattern.
LANGUAGE_UNICODE_RANGE_MAP = {
    "ZH": [_CJK_UNIFIED_IDEOGRAPHS],
    "JP": [
        _CJK_UNIFIED_IDEOGRAPHS,
        _HIRAGANA,
        _KATAKANA,
        _KATAKANA_PHONETIC_EXTENSIONS,
    ],
    "EN": [_BASIC_LATIN],
}
# Punctuation normalisation table: each key is a symbol searched for in the
# input, each value is the replacement substituted for it.
#
# The previous literal listed several ASCII keys twice (":", ";", "(", ")",
# "~") and mapped ",", "!" and "?" to themselves, which is what remains when
# full-width punctuation is folded to ASCII. The full-width forms are spelled
# here as \uXXXX escapes so that Unicode normalisation of this file cannot
# collapse them again.
SYMBOLS_MAPPING = {
    # Full-width (U+FFxx) punctuation -> ASCII.
    "\uff1a": ",",  # full-width colon
    "\uff1b": ",",  # full-width semicolon
    "\uff0c": ",",  # full-width comma
    "\uff01": "!",  # full-width exclamation mark
    "\uff1f": "?",  # full-width question mark
    "\uff08": "'",  # full-width left parenthesis
    "\uff09": "'",  # full-width right parenthesis
    "\uff5e": "-",  # full-width tilde
    # Sentence and clause separators.
    "。": ".",
    "\n": ".",
    "·": ",",
    "、": ",",
    ";": ",",
    ":": ",",
    # NOTE(review): U+2026 lies outside every range in
    # LANGUAGE_UNICODE_RANGE_MAP, so REMOVE_UNKNOWN_SYMBOL_REGEX deletes it
    # again inside clean_text -- confirm whether the ellipsis should survive.
    "...": "…",
    # Quotation marks and every kind of bracket collapse to an apostrophe.
    "“": "'",
    "”": "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "「": "'",
    "」": "'",
    # Dashes and dash-like separators.
    "—": "-",
    "~": "-",
    "・": "-",
    # Identity entries kept so this table remains a superset of the previous
    # one for any caller that indexes it directly.
    ",": ",",
    "!": "!",
    "?": "?",
}
# Pattern matching any single key of SYMBOLS_MAPPING.
# Alternatives in a regex are tried left to right, so the keys are ordered
# longest-first: a multi-character key such as "..." can then never be
# shadowed by a shorter key that happens to be its prefix. sorted() is stable,
# so keys of equal length keep their dictionary order.
REPLACE_SYMBOL_REGEX = re.compile(
    "|".join(
        re.escape(symbol)
        for symbol in sorted(SYMBOLS_MAPPING, key=len, reverse=True)
    )
)
# Every (start, end) pair from LANGUAGE_UNICODE_RANGE_MAP flattened into one
# list, in the mapping's iteration order. Pairs shared by several languages
# appear once per language; nothing is de-duplicated or merged.
ALL_KNOWN_UTF8_RANGE = [
    bounds
    for language_ranges in LANGUAGE_UNICODE_RANGE_MAP.values()
    for bounds in language_ranges
]
# Body of a character class listing each known range as "<first>-<last>",
# with both end characters regex-escaped.
_KNOWN_CHAR_SPANS = "".join(
    "{}-{}".format(re.escape(chr(first)), re.escape(chr(last)))
    for first, last in ALL_KNOWN_UTF8_RANGE
)

# Negated character class: matches any single character that falls outside
# all of the ranges in ALL_KNOWN_UTF8_RANGE.
REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(f"[^{_KNOWN_CHAR_SPANS}]")
def clean_text(text):
    """Normalise punctuation in *text* and drop characters outside the known ranges.

    The steps run in this order:

    1. Leading and trailing whitespace is stripped.
    2. Every match of ``REPLACE_SYMBOL_REGEX`` is replaced by the value stored
       under the matched text in ``SYMBOLS_MAPPING``.
    3. Every character matched by ``REMOVE_UNKNOWN_SYMBOL_REGEX`` is deleted.

    Returns the resulting string.
    """

    def _replacement_for(match):
        # The pattern is built from the mapping's keys, so the matched text
        # is looked up directly.
        return SYMBOLS_MAPPING[match.group()]

    trimmed = text.strip()
    normalised = REPLACE_SYMBOL_REGEX.sub(_replacement_for, trimmed)
    return REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", normalised)
|