Lengyue 2 года назад
Родитель
Commit
c9e0ad993c
3 изменённых файла: 115 добавлений и 22 удаления
  1. 2 2
      fish_speech/text/__init__.py
  2. 19 10
      fish_speech/text/parser.py
  3. 94 10
      fish_speech/webui/app.py

+ 2 - 2
fish_speech/text/__init__.py

@@ -1,3 +1,3 @@
-from .parser import g2p, parse_text_to_segments, segments_to_phones
+from .parser import clean_text, g2p, parse_text_to_segments, segments_to_phones
 
-__all__ = ["g2p", "parse_text_to_segments", "segments_to_phones"]
+__all__ = ["g2p", "parse_text_to_segments", "segments_to_phones", "clean_text"]

+ 19 - 10
fish_speech/text/parser.py

@@ -94,6 +94,16 @@ REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
 )
 
 
+def clean_text(text):
+    # Clean the text
+    text = text.strip()
+    # Replace all chinese symbols with their english counterparts
+    text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
+    text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
+
+    return text
+
+
 def parse_text_to_segments(text, order=None):
     """
     Parse the text and return a list of segments.
@@ -108,12 +118,7 @@ def parse_text_to_segments(text, order=None):
     order = [language.upper() for language in order]
     assert all(language in language_id_map for language in order)
 
-    # Clean the text
-    text = text.strip()
-    # Replace all chinese symbols with their english counterparts
-    text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
-    text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
-
+    text = clean_text(text)
     texts = re.split(r"(<.*?>)", text)
     texts = [text for text in texts if text.strip() != ""]
 
@@ -123,7 +128,9 @@ def parse_text_to_segments(text, order=None):
         if text.startswith("<") and text.endswith(">") and text[1] != "/":
             current_language = text[1:-1]
             # The following line should be updated later
-            assert current_language.upper() in language_id_map
+            assert (
+                current_language.upper() in language_id_map
+            ), f"Unknown language: {current_language}"
             stack.append(current_language)
         elif text.startswith("</") and text.endswith(">"):
             language = stack.pop()
@@ -132,7 +139,9 @@ def parse_text_to_segments(text, order=None):
         elif stack:
             segments.append(Segment(text, stack[-1]))
         else:
-            segments.extend(parse_unknown_segment(text, order))
+            segments.extend(
+                [i for i in parse_unknown_segment(text, order) if len(i.phones) > 0]
+            )
 
     return segments
 
@@ -210,11 +219,11 @@ def g2p(text, order=None):
 
 if __name__ == "__main__":
     segments = parse_text_to_segments(
-        "毕业然后复活卡b站推荐bug<zh>加流量。<en>Hugging face, B GM</en>声音很大吗</zh>?那我改一下Ё。 <jp>君の虜になってしまえばきっと</jp>"  # noqa: E501
+        "测试一下 Hugging face, BGM声音很大吗?那我改一下. <jp>世界、こんにちは。</jp>"  # noqa: E501
     )
     print(segments)
 
     segments = parse_text_to_segments(
-        "毕业然后复活卡b站推荐bug加流量。Hugging face, BGM 声音很大吗?那我改一下Ё。君の虜になってしまえばきっと"  # noqa: E501
+        "测试一下 Hugging face, BGM声音很大吗?那我改一下. 世界、こんにちは。"  # noqa: E501
     )
     print(segments)

+ 94 - 10
fish_speech/webui/app.py

@@ -1,21 +1,94 @@
+import html
+import traceback
+
 import gradio as gr
 
+from fish_speech.text import parse_text_to_segments, segments_to_phones
+
 HEADER_MD = """
 # Fish Speech
 
 基于 VITS 和 GPT 的多语种语音合成. 项目很大程度上基于 Rcell 的 GPT-VITS.
 """
 
+TEXTBOX_PLACEHOLDER = """在启用自动音素的情况下, 模型默认会全自动将输入文本转换为音素. 例如:
+测试一下 Hugging face, BGM声音很大吗?那我改一下. 世界、こんにちは。
+
+会被转换为:
+<Segment ZH: '测试一下' -> 'c e4 sh ir4 y i2 x ia4'>
+<Segment EN: ' Hugging face, BGM' -> 'HH AH1 G IH0 NG F EY1 S , B AE1 G M'>
+<Segment ZH: '声音很大吗?那我改一下.' -> 'sh eng1 y in1 h en3 d a4 m a5 ? n a4 w o2 g ai3 y i2 x ia4 .'>
+<Segment ZH: '世界,' -> 'sh ir4 j ie4 ,'>
+<Segment JP: 'こんにちは.' -> 'k o N n i ch i w a .'>
+
+如你所见, 最后的句子被分割为了两个部分, 因为该日文包含了汉字, 你可以使用 <jp>...</jp> 标签来指定日文优先级. 例如:
+测试一下 Hugging face, BGM声音很大吗?那我改一下. <jp>世界、こんにちは。</jp>
+
+可以看到, 日文部分被正确地分割了出来:
+...
+<Segment JP: '世界,こんにちは.' -> 's e k a i , k o N n i ch i w a .'>
+"""
+
+
+def build_html_error_message(error):
+    return f"""
+    <div style="color: red; font-weight: bold;">
+        {html.escape(error)}
+    </div>
+    """
+
+
+def prepare_text(text, input_mode, language0, language1, language2):
+    lines = text.splitlines()
+    languages = [language0, language1, language2]
+    languages = [
+        {
+            "中文": "ZH",
+            "日文": "JP",
+            "英文": "EN",
+        }[language]
+        for language in languages
+    ]
+
+    if len(set(languages)) != len(languages):
+        return [], build_html_error_message("语言优先级不能重复.")
+
+    if input_mode != "自动音素转换":
+        return [
+            [idx, line, "-", "-"]
+            for idx, line in enumerate(lines)
+            if line.strip() != ""
+        ], None
+
+    rows = []
+
+    for idx, line in enumerate(lines):
+        if line.strip() == "":
+            continue
+
+        try:
+            segments = parse_text_to_segments(line, order=languages)
+        except Exception:
+            traceback.print_exc()
+            err = traceback.format_exc()
+            return [], build_html_error_message(f"解析 '{line}' 时发生错误. \n\n{err}")
+
+        for segment in segments:
+            rows.append([idx, segment.text, segment.language, segment.phones])
+
+    return rows, None
+
+
 with gr.Blocks(theme=gr.themes.Base()) as app:
     gr.Markdown(HEADER_MD)
 
     with gr.Row():
-        with gr.Column(scale=5):
-            text = gr.Textbox(lines=5, label="输入文本")
+        with gr.Column(scale=3):
+            text = gr.Textbox(label="输入文本", placeholder=TEXTBOX_PLACEHOLDER, lines=3)
 
             with gr.Row():
                 with gr.Tab(label="合成参数"):
-                    gr.Markdown("配置常见的合成参数.")
+                    gr.Markdown("配置常见合成参数.")
 
                     input_mode = gr.Dropdown(
                         choices=["手动输入音素/文本", "自动音素转换"],
@@ -28,23 +101,23 @@ with gr.Blocks(theme=gr.themes.Base()) as app:
 
                     with gr.Column(scale=1):
                         language0 = gr.Dropdown(
-                            choices=["中文", "日文", "英文", "无"],
+                            choices=["中文", "日文", "英文"],
                             label="语言 1",
                             value="中文",
                         )
 
                     with gr.Column(scale=1):
                         language1 = gr.Dropdown(
-                            choices=["中文", "日文", "英文", "无"],
+                            choices=["中文", "日文", "英文"],
                             label="语言 2",
-                            value="文",
+                            value="日文",
                         )
 
                     with gr.Column(scale=1):
                         language2 = gr.Dropdown(
-                            choices=["中文", "日文", "英文", "无"],
+                            choices=["中文", "日文", "英文"],
                             label="语言 3",
-                            value="",
+                            value="英文",
                         )
 
             with gr.Row():
@@ -54,10 +127,21 @@ with gr.Blocks(theme=gr.themes.Base()) as app:
                     clear = gr.Button(value="清空")
 
         with gr.Column(scale=3):
+            error = gr.HTML(label="错误信息")
+            parsed_text = gr.Dataframe(label="解析结果", headers=["ID", "文本", "语言", "音素"])
             audio = gr.Audio(label="合成音频")
 
-    generate.click(lambda: None, [input_mode], [audio])
-    # dark_mode.link(lambda val: app.set_theme(gr.themes.Dark() if val else gr.themes.Default()))
+    # Language & Text Parsing
+    kwargs = dict(
+        inputs=[text, input_mode, language0, language1, language2],
+        outputs=[parsed_text, error],
+        trigger_mode="always_last",
+    )
+    text.change(prepare_text, **kwargs)
+    input_mode.change(prepare_text, **kwargs)
+    language0.change(prepare_text, **kwargs)
+    language1.change(prepare_text, **kwargs)
+    language2.change(prepare_text, **kwargs)
 
 if __name__ == "__main__":
     app.launch(show_api=False)