Lengyue 1 год назад
Родитель
Коммит
031599f2ee
1 изменённый файл: 9 добавлений и 47 удалений
  1. 9 47
      fish_speech/text/clean.py

+ 9 - 47
fish_speech/text/clean.py

@@ -1,61 +1,24 @@
-import itertools
 import re
 
-LANGUAGE_UNICODE_RANGE_MAP = {
-    "ZH": [(0x4E00, 0x9FFF)],
-    "JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)],
-    "EN": [(0x0000, 0x007F)],
-}
-
 SYMBOLS_MAPPING = {
-    ":": ",",
-    ";": ",",
-    ",": ",",
-    "。": ".",
-    "!": "!",
-    "?": "?",
-    "\n": ".",
-    "·": ",",
-    "、": ",",
-    "...": "…",
     "“": "'",
     "”": "'",
     "‘": "'",
     "’": "'",
-    "(": "'",
-    ")": "'",
-    "(": "'",
-    ")": "'",
-    "《": "'",
-    "》": "'",
-    "【": "'",
-    "】": "'",
-    "[": "'",
-    "]": "'",
-    "—": "-",
-    "~": "-",
-    "~": "-",
-    "・": "-",
-    "「": "'",
-    "」": "'",
-    ";": ",",
-    ":": ",",
+    "【": "",
+    "】": "",
+    "[": "",
+    "]": "",
+    "(": "",
+    ")": "",
+    "(": "",
+    ")": "",
+    "・": "·",
 }
 
 REPLACE_SYMBOL_REGEX = re.compile(
     "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
 )
-ALL_KNOWN_UTF8_RANGE = list(
-    itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
-)
-REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
-    "[^"
-    + "".join(
-        f"{re.escape(chr(start))}-{re.escape(chr(end))}"
-        for start, end in ALL_KNOWN_UTF8_RANGE
-    )
-    + "]"
-)
 
 
 def clean_text(text):
@@ -64,6 +27,5 @@ def clean_text(text):
 
     # Replace all chinese symbols with their english counterparts
     text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
-    # text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
 
     return text