2 years ago · 23cd0c2aa5
--- a/fish_speech/text/clean.py
+++ b/fish_speech/text/clean.py
@@ -1,5 +1,6 @@
 
				 import itertools
			
 
				 import re
			
 
				+import string
			
 
				 
			
 
				 LANGUAGE_UNICODE_RANGE_MAP = {
			
 
				     "ZH": [(0x4E00, 0x9FFF)],
			
@@ -62,12 +63,9 @@ REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
 
				 def clean_text(text):
			
 
				     # Clean the text
			
 
				     text = text.strip()
			
 
				-    # Replace <p:(.*?)> with <PPP(.*?)PPP>
			
 
				-    text = re.sub(r"<p:(.*?)>", r"<PPP\1PPP>", text)
			
 
				+
			
 
				     # Replace all chinese symbols with their english counterparts
			
 
				     text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
			
 
				     text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
			
 
				-    # Replace <PPP(.*?)PPP> with <p:(.*?)>
			
 
				-    text = re.sub(r"<PPP(.*?)PPP>", r"<p:\1>", text)
			
 
				 
			
 
				     return text
			
--- a/tools/llama/generate.py
+++ b/tools/llama/generate.py
@@ -1,5 +1,6 @@
 
				 import os
			
 
				 import queue
			
 
				+import string
			
 
				 import threading
			
 
				 import time
			
 
				 from pathlib import Path
			
@@ -418,17 +419,22 @@ def split_text(text, min_length):
 
				     text = clean_text(text)
			
 
				     segments = []
			
 
				     curr = ""
			
 
				+
			
 
    def clean_add(curr):
        """Append ``curr`` to ``segments`` unless it has no real content.

        The candidate is stripped first. It is discarded when nothing is
        left, or when every remaining character is whitespace or ASCII
        punctuation (``string.punctuation``).
        """
        candidate = curr.strip()
        if not candidate:
            return

        # Keep the segment only if at least one character is neither
        # whitespace nor punctuation.
        has_content = any(
            not (ch.isspace() or ch in string.punctuation) for ch in candidate
        )
        if has_content:
            segments.append(candidate)
			
 
				+
			
 
				     for char in text:
			
 
				         curr += char
			
 
				         if char not in [".", "!", "?"]:
			
 
				             continue
			
 
				 
			
 
				         if len(curr) >= min_length:
			
 
				-            segments.append(curr)
			
 
				+            clean_add(curr)
			
 
				             curr = ""
			
 
				 
			
 
				-    if curr:
			
 
				-        segments.append(curr)
			
 
				+    clean_add(curr)
			
 
				 
			
 
				     return segments