2 jaren geleden · cc4ce60295
--- a/fish_speech/webui/app.py
+++ b/fish_speech/webui/app.py
@@ -1,10 +1,13 @@
 
															 import html
														
 
															 import io
														
 
															+import os
														
 
															 import traceback
														
 
															+import wave
														
 
															 from pathlib import Path
														
 
															 import gradio as gr
														
 
															 import librosa
														
 
															+import numpy as np
														
 
															 import requests
														
 
															 from fish_speech.text import parse_text_to_segments
														
@@ -158,8 +161,10 @@ def build_model_config_block():
 
															     llama_ckpt_path = gr.Dropdown(
														
 
															         label="Llama 模型路径",
														
 
															         value=str(Path("checkpoints/text2semantic-400m-v0.3-4k.pth")),
														
 
															-        choices=[str(pth_file) for pth_file in Path("results").rglob("*text*/*.ckpt")]
														
 
															-        + [str(pth_file) for pth_file in Path("checkpoints").rglob("*text*.pth")],
														
 
															+        choices=[
														
 
															+            str(pth_file) for pth_file in Path("results").rglob("**/text*/**/*.ckpt")
														
 
															+        ]
														
 
															+        + [str(pth_file) for pth_file in Path("checkpoints").rglob("**/*text*.pth")],
														
 
															         allow_custom_value=True,
														
 
															     )
														
 
															     llama_config_name = gr.Textbox(label="Llama 配置文件", value="text2semantic_finetune")
														
@@ -172,8 +177,10 @@ def build_model_config_block():
 
															     vqgan_ckpt_path = gr.Dropdown(
														
 
															         label="VQGAN 模型路径",
														
 
															         value=str(Path("checkpoints/vqgan-v1.pth")),
														
 
															-        choices=[str(pth_file) for pth_file in Path("results").rglob("*vqgan*/*.ckpt")]
														
 
															-        + [str(pth_file) for pth_file in Path("checkpoints").rglob("*vqgan*.pth")],
														
 
															+        choices=[
														
 
															+            str(pth_file) for pth_file in Path("results").rglob("**/vqgan*/**/*.ckpt")
														
 
															+        ]
														
 
															+        + [str(pth_file) for pth_file in Path("checkpoints").rglob("**/*vqgan*.pth")],
														
 
															         allow_custom_value=True,
														
 
															     )
														
 
															     vqgan_config_name = gr.Dropdown(
														
@@ -265,6 +272,81 @@ def inference(
 
															     return (sr, content), None
														
 
															+def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=22050):
														
 
															+    # copy and paste
														
 
															+    wav_buf = io.BytesIO()
														
 
															+    with wave.open(wav_buf, "wb") as vfout:
														
 
															+        vfout.setnchannels(channels)
														
 
															+        vfout.setsampwidth(sample_width)
														
 
															+        vfout.setframerate(sample_rate)
														
 
															+        vfout.writeframes(frame_input)
														
 
															+
														
 
															+    wav_buf.seek(0)
														
 
															+    return wav_buf.read()
														
 
															+
														
 
															+
														
 
															+def inference_stream(
														
 
															+    server_url,
														
 
															+    text,
														
 
															+    input_mode,
														
 
															+    language0,
														
 
															+    language1,
														
 
															+    language2,
														
 
															+    enable_reference_audio,
														
 
															+    reference_audio,
														
 
															+    reference_text,
														
 
															+    max_new_tokens,
														
 
															+    top_k,
														
 
															+    top_p,
														
 
															+    repetition_penalty,
														
 
															+    temperature,
														
 
															+    speaker,
														
 
															+):
														
 
															+    languages = [language0, language1, language2]
														
 
															+    languages = [
														
 
															+        {
														
 
															+            "中文": "zh",
														
 
															+            "日文": "jp",
														
 
															+            "英文": "en",
														
 
															+        }[language]
														
 
															+        for language in languages
														
 
															+    ]
														
 
															+
														
 
															+    if len(set(languages)) != len(languages):
														
 
															+        return []
														
 
															+
														
 
															+    order = ",".join(languages)
														
 
															+    payload = {
														
 
															+        "text": text,
														
 
															+        "prompt_text": reference_text if enable_reference_audio else None,
														
 
															+        "prompt_tokens": reference_audio if enable_reference_audio else None,
														
 
															+        "max_new_tokens": int(max_new_tokens),
														
 
															+        "top_k": int(top_k) if top_k > 0 else None,
														
 
															+        "top_p": top_p,
														
 
															+        "repetition_penalty": repetition_penalty,
														
 
															+        "temperature": temperature,
														
 
															+        "order": order,
														
 
															+        "use_g2p": input_mode == "自动音素",
														
 
															+        "seed": None,
														
 
															+        "speaker": speaker if speaker.strip() != "" else None,
														
 
															+    }
														
 
															+
														
 
															+    resp = requests.post(
														
 
															+        f"{server_url}/v1/models/default/invoke_stream", json=payload, stream=True
														
 
															+    )
														
 
															+    resp.raise_for_status()
														
 
															+
														
 
															+    yield wave_header_chunk(), None
														
 
															+
														
 
															+    for chunk in resp.iter_content(chunk_size=None):
														
 
															+        if chunk:
														
 
															+            content = io.BytesIO(chunk)
														
 
															+            content.seek(0)
														
 
															+            audio, sr = librosa.load(content, sr=None, mono=True)
														
 
															+            print(audio.shape, sr)
														
 
															+            yield (np.concatenate([audio], 0) * 32768).astype(np.int16).tobytes(), None
														
 
															+
														
 
															+
														
 
															 with gr.Blocks(theme=gr.themes.Base()) as app:
														
 
															     gr.Markdown(HEADER_MD)
														
@@ -368,18 +450,34 @@ with gr.Blocks(theme=gr.themes.Base()) as app:
 
															                             value="在一无所知中，梦里的一天结束了，一个新的「轮回」便会开始。",
														
 
															                         )
														
 
															-                with gr.Row():
														
 
															-                    with gr.Column(scale=2):
														
 
															-                        generate = gr.Button(value="合成", variant="primary")
														
 
															-                    with gr.Column(scale=1):
														
 
															-                        clear = gr.Button(value="清空")
														
 
															-
														
 
															         with gr.Column(scale=3):
														
 
															-            error = gr.HTML(label="错误信息")
														
 
															-            parsed_text = gr.Dataframe(
														
 
															-                label="解析结果 (仅参考)", headers=["ID", "文本", "语言", "音素"]
														
 
															-            )
														
 
															-            audio = gr.Audio(label="合成音频", type="numpy")
														
 
															+            with gr.Row():
														
 
															+                error = gr.HTML(label="错误信息")
														
 
															+            with gr.Row():
														
 
															+                parsed_text = gr.Dataframe(
														
 
															+                    label="解析结果 (仅参考)", headers=["ID", "文本", "语言", "音素"]
														
 
															+                )
														
 
															+            with gr.Row():
														
 
															+                audio_once = gr.Audio(label="一次合成音频", type="numpy")
														
 
															+            with gr.Row():
														
 
															+                audio_stream = gr.Audio(
														
 
															+                    label="流式合成音频",
														
 
															+                    autoplay=True,
														
 
															+                    streaming=True,
														
 
															+                    show_label=True,
														
 
															+                    interactive=False,
														
 
															+                )
														
 
															+            with gr.Row():
														
 
															+                with gr.Column(scale=3):
														
 
															+                    generate = gr.Button(value="\U0001F3A7 合成", variant="primary")
														
 
															+                    stream_generate = gr.Button(
														
 
															+                        value="\U0001F4A7 流式合成", variant="primary"
														
 
															+                    )
														
 
															+                with gr.Column(scale=1):
														
 
															+                    audio_download = gr.Button(
														
 
															+                        value="\U0001F449 下载流式音频", elem_id="audio_download"
														
 
															+                    )
														
 
															+                    clear = gr.Button(value="清空")
														
 
															     # Language & Text Parsing
														
 
															     kwargs = dict(
														
@@ -422,7 +520,42 @@ with gr.Blocks(theme=gr.themes.Base()) as app:
 
															             temperature,
														
 
															             speaker,
														
 
															         ],
														
 
															-        [audio, error],
														
 
															+        [audio_once, error],
														
 
															+    )
														
 
															+
														
 
															+    stream_generate.click(
														
 
															+        inference_stream,
														
 
															+        [
														
 
															+            server_url,
														
 
															+            text,
														
 
															+            input_mode,
														
 
															+            language0,
														
 
															+            language1,
														
 
															+            language2,
														
 
															+            enable_reference_audio,
														
 
															+            reference_audio,
														
 
															+            reference_text,
														
 
															+            max_new_tokens,
														
 
															+            top_k,
														
 
															+            top_p,
														
 
															+            repetition_penalty,
														
 
															+            temperature,
														
 
															+            speaker,
														
 
															+        ],
														
 
															+        [audio_stream, error],
														
 
															+    ).then(lambda: gr.update(interactive=True), None, [text], queue=False)
														
 
															+
														
 
															+    audio_download.click(
														
 
															+        None,
														
 
															+        js="() => { "
														
 
															+        'var btn = document.getElementById("audio_download"); '
														
 
															+        "btn.disabled = true; "
														
 
															+        "setTimeout(() => { btn.disabled = false; }, 1000); "
														
 
															+        'var win = window.open("http://localhost:8000/v1/models/default/download", '
														
 
															+        '"newwindow", "height=100, width=400, toolbar=no, menubar=no, scrollbars=no, '
														
 
															+        'resizable=no, location=no, status=no"); '
														
 
															+        "setTimeout(function() { win.close(); }, 1000);"
														
 
															+        "}",
														
 
															     )