Просмотр исходного кода

update clean text for new model

Lengyue 1 год назад
Родитель
Коммит
711209e541
1 изменённый файл: 18 добавлений и 1 удаление
  1. 18 1
      fish_speech/text/clean.py

+ 18 - 1
fish_speech/text/clean.py

@@ -1,7 +1,7 @@
 import re
 
 SYMBOLS_MAPPING = {
-    "\n": ".",
+    "\n": "",
     "…": ".",
     "“": "'",
     "”": "'",
@@ -35,6 +35,17 @@ REPLACE_SYMBOL_REGEX = re.compile(
 )
 
 
+EMOJI_REGEX = re.compile(  # matches a run of one or more characters from the ranges below
+    "["
+    "\U0001F600-\U0001F64F"  # emoticons
+    "\U0001F300-\U0001F5FF"  # symbols & pictographs
+    "\U0001F680-\U0001F6FF"  # transport & map symbols
+    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols; a pair renders as one flag)
+    "]+",
+    flags=re.UNICODE,  # already the default for str patterns in Python 3; kept for explicitness
+)
+
+
 def clean_text(text):
     # Clean the text
     text = text.strip()
@@ -42,4 +53,10 @@ def clean_text(text):
     # Replace all chinese symbols with their english counterparts
     text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
 
+    # Remove emojis
+    text = EMOJI_REGEX.sub(r"", text)
+
+    # Remove continuous periods (...) and commas (,,,)
+    text = re.sub(r"[.,]{2,}", lambda m: m.group()[0], text)
+
     return text