|
|
@@ -1,5 +1,6 @@
|
|
|
import itertools
|
|
|
import re
|
|
|
+import string
|
|
|
|
|
|
LANGUAGE_UNICODE_RANGE_MAP = {
|
|
|
"ZH": [(0x4E00, 0x9FFF)],
|
|
|
@@ -62,12 +63,9 @@ REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
|
|
|
def clean_text(text):
|
|
|
# Clean the text
|
|
|
text = text.strip()
|
|
|
- # Replace <p:(.*?)> with <PPP(.*?)PPP>
|
|
|
- text = re.sub(r"<p:(.*?)>", r"<PPP\1PPP>", text)
|
|
|
+
|
|
|
# Replace all chinese symbols with their english counterparts
|
|
|
text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
|
|
|
text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
|
|
|
- # Replace <PPP(.*?)PPP> with <p:(.*?)>
|
|
|
- text = re.sub(r"<PPP(.*?)PPP>", r"<p:\1>", text)
|
|
|
|
|
|
return text
|