|
@@ -98,9 +98,13 @@ REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
|
|
|
def clean_text(text):
    """Normalize the symbols in *text* while keeping ``<p:...>`` tags intact.

    Steps, in order:

    1. Strip leading and trailing whitespace.
    2. Rewrite every ``<p:X>`` tag to the temporary form ``<PPPXPPP>``.
    3. Replace each match of ``REPLACE_SYMBOL_REGEX`` with its entry in
       ``SYMBOLS_MAPPING`` (Chinese symbols to their English counterparts).
    4. Delete every match of ``REMOVE_UNKNOWN_SYMBOL_REGEX``.
    5. Rewrite every ``<PPPXPPP>`` placeholder back to ``<p:X>``.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """

    def _mapped_symbol(match):
        # Look up the replacement for the exact symbol that was matched.
        return SYMBOLS_MAPPING[match.group()]

    # Shield the tags before symbol normalization runs.
    # NOTE(review): presumably the "p:" prefix would otherwise be altered by
    # the symbol passes below -- confirm against SYMBOLS_MAPPING.
    shielded = re.sub(r"<p:(.*?)>", r"<PPP\1PPP>", text.strip())

    # Map known symbols, then drop whatever the "unknown" pattern matches.
    normalized = REPLACE_SYMBOL_REGEX.sub(_mapped_symbol, shielded)
    filtered = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", normalized)

    # Turn the placeholders back into their original tag form.
    return re.sub(r"<PPP(.*?)PPP>", r"<p:\1>", filtered)
|
|
|
|
|
|
|
@@ -231,3 +235,5 @@ if __name__ == "__main__":
|
|
|
"测试一下 Hugging face, BGM声音很大吗?那我改一下. 世界、こんにちは。" # noqa: E501
|
|
"测试一下 Hugging face, BGM声音很大吗?那我改一下. 世界、こんにちは。" # noqa: E501
|
|
|
)
|
|
)
|
|
|
print(segments)
|
|
print(segments)
|
|
|
|
|
+
|
|
|
|
|
+ print(clean_text("测试一下 Hugging face, BGM声音很大吗?那我改一下. 世界、こんにちは。<p:123> <p:aH>"))
|