Browse Source

Optimize split text logic

Lengyue 1 year ago
parent
commit
23cd0c2aa5
2 changed files with 11 additions and 7 deletions
  1. +2 -4
      fish_speech/text/clean.py
  2. +9 -3
      tools/llama/generate.py

+ 2 - 4
fish_speech/text/clean.py

@@ -1,5 +1,6 @@
 import itertools
 import re
+import string
 
 LANGUAGE_UNICODE_RANGE_MAP = {
     "ZH": [(0x4E00, 0x9FFF)],
@@ -62,12 +63,9 @@ REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
 def clean_text(text):
     # Clean the text
     text = text.strip()
-    # Replace <p:(.*?)> with <PPP(.*?)PPP>
-    text = re.sub(r"<p:(.*?)>", r"<PPP\1PPP>", text)
+
     # Replace all chinese symbols with their english counterparts
     text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
     text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
-    # Replace <PPP(.*?)PPP> with <p:(.*?)>
-    text = re.sub(r"<PPP(.*?)PPP>", r"<p:\1>", text)
 
     return text

+ 9 - 3
tools/llama/generate.py

@@ -1,5 +1,6 @@
 import os
 import queue
+import string
 import threading
 import time
 from pathlib import Path
@@ -418,17 +419,22 @@ def split_text(text, min_length):
     text = clean_text(text)
     segments = []
     curr = ""
+
+    def clean_add(curr):
+        curr = curr.strip()
+        if curr and not all(c.isspace() or c in string.punctuation for c in curr):
+            segments.append(curr)
+
     for char in text:
         curr += char
         if char not in [".", "!", "?"]:
             continue
 
         if len(curr) >= min_length:
-            segments.append(curr)
+            clean_add(curr)
             curr = ""
 
-    if curr:
-        segments.append(curr)
+    clean_add(curr)
 
     return segments