| 12345678910111213141516171819202122232425262728293031 |
- import re
- SYMBOLS_MAPPING = {
- "“": "'",
- "”": "'",
- "‘": "'",
- "’": "'",
- "【": "",
- "】": "",
- "[": "",
- "]": "",
- "(": "",
- ")": "",
- "(": "",
- ")": "",
- "・": "·",
- }
- REPLACE_SYMBOL_REGEX = re.compile(
- "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
- )
- def clean_text(text):
- # Clean the text
- text = text.strip()
- # Replace all chinese symbols with their english counterparts
- text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
- return text
|