1 tahun lalu · ca182bcdf8
--- a/fish_speech/webui/__main__.py
+++ b/fish_speech/webui/__main__.py
@@ -1,4 +0,0 @@
 
				-from fish_speech.webui.app import app
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    app.launch(show_api=False)
			
--- a/fish_speech/webui/app.py
+++ b/fish_speech/webui/app.py
@@ -1,563 +0,0 @@
 
				-import html
			
 
				-import io
			
 
				-import os
			
 
				-import traceback
			
 
				-import wave
			
 
				-from pathlib import Path
			
 
				-
			
 
				-import gradio as gr
			
 
				-import librosa
			
 
				-import numpy as np
			
 
				-import requests
			
 
				-
			
 
				-from fish_speech.text import parse_text_to_segments
			
 
				-
			
 
				-HEADER_MD = """
			
 
				-# Fish Speech
			
 
				-
			
 
				-基于 VQ-GAN 和 Llama 的多语种语音合成. 感谢 Rcell 的 GPT-VITS 提供的思路.
			
 
				-"""
			
 
				-
			
 
				-TEXTBOX_PLACEHOLDER = """在启用自动音素的情况下, 模型默认会全自动将输入文本转换为音素. 例如:
			
 
				-测试一下 Hugging face, BGM声音很大吗？那我改一下. 世界、こんにちは。
			
 
				-
			
 
				-会被转换为:
			
 
				-<Segment ZH: '测试一下' -> 'c e4 sh ir4 y i2 x ia4'>
			
 
				-<Segment EN: ' Hugging face, BGM' -> 'HH AH1 G IH0 NG F EY1 S , B AE1 G M'>
			
 
				-<Segment ZH: '声音很大吗?那我改一下.' -> 'sh eng1 y in1 h en3 d a4 m a5 ? n a4 w o2 g ai3 y i2 x ia4 .'>
			
 
				-<Segment ZH: '世界,' -> 'sh ir4 j ie4 ,'>
			
 
				-<Segment JP: 'こんにちは.' -> 'k o N n i ch i w a .'>
			
 
				-
			
 
				-如你所见, 最后的句子被分割为了两个部分, 因为该日文包含了汉字, 你可以使用 <jp>...</jp> 标签来指定日文优先级. 例如:
			
 
				-测试一下 Hugging face, BGM声音很大吗？那我改一下. <jp>世界、こんにちは。</jp>
			
 
				-
			
 
				-可以看到, 日文部分被正确地分割了出来:
			
 
				-...
			
 
				-<Segment JP: '世界,こんにちは.' -> 's e k a i , k o N n i ch i w a .'>
			
 
				-"""
			
 
				-
			
 
				-
			
 
				-def build_html_error_message(error):
			
 
				-    return f"""
			
 
				-    <div style="color: red; font-weight: bold;">
			
 
				-        {html.escape(error)}
			
 
				-    </div>
			
 
				-    """
			
 
				-
			
 
				-
			
 
				-def prepare_text(
			
 
				-    text,
			
 
				-    input_mode,
			
 
				-    language0,
			
 
				-    language1,
			
 
				-    language2,
			
 
				-    enable_reference_audio,
			
 
				-    reference_text,
			
 
				-):
			
 
				-    lines = text.splitlines()
			
 
				-    languages = [language0, language1, language2]
			
 
				-    languages = [
			
 
				-        {
			
 
				-            "中文": "ZH",
			
 
				-            "日文": "JP",
			
 
				-            "英文": "EN",
			
 
				-        }[language]
			
 
				-        for language in languages
			
 
				-    ]
			
 
				-
			
 
				-    if len(set(languages)) != len(languages):
			
 
				-        return [], build_html_error_message("语言优先级不能重复.")
			
 
				-
			
 
				-    if enable_reference_audio:
			
 
				-        reference_text = reference_text.strip() + " "
			
 
				-    else:
			
 
				-        reference_text = ""
			
 
				-
			
 
				-    if input_mode != "自动音素":
			
 
				-        return [
			
 
				-            [idx, reference_text + line, "-", "-"]
			
 
				-            for idx, line in enumerate(lines)
			
 
				-            if line.strip() != ""
			
 
				-        ], None
			
 
				-
			
 
				-    rows = []
			
 
				-
			
 
				-    for idx, line in enumerate(lines):
			
 
				-        if line.strip() == "":
			
 
				-            continue
			
 
				-
			
 
				-        try:
			
 
				-            segments = parse_text_to_segments(reference_text + line, order=languages)
			
 
				-        except Exception:
			
 
				-            traceback.print_exc()
			
 
				-            err = traceback.format_exc()
			
 
				-            return [], build_html_error_message(f"解析 '{line}' 时发生错误. \n\n{err}")
			
 
				-
			
 
				-        for segment in segments:
			
 
				-            rows.append([idx, segment.text, segment.language, " ".join(segment.phones)])
			
 
				-
			
 
				-    return rows, None
			
 
				-
			
 
				-
			
 
				-def load_model(
			
 
				-    server_url,
			
 
				-    llama_ckpt_path,
			
 
				-    llama_config_name,
			
 
				-    tokenizer,
			
 
				-    vqgan_ckpt_path,
			
 
				-    vqgan_config_name,
			
 
				-    device,
			
 
				-    precision,
			
 
				-    compile_model,
			
 
				-):
			
 
				-    payload = {
			
 
				-        "device": device,
			
 
				-        "llama": {
			
 
				-            "config_name": llama_config_name,
			
 
				-            "checkpoint_path": llama_ckpt_path,
			
 
				-            "precision": precision,
			
 
				-            "tokenizer": tokenizer,
			
 
				-            "compile": compile_model,
			
 
				-        },
			
 
				-        "vqgan": {
			
 
				-            "config_name": vqgan_config_name,
			
 
				-            "checkpoint_path": vqgan_ckpt_path,
			
 
				-        },
			
 
				-    }
			
 
				-
			
 
				-    try:
			
 
				-        resp = requests.put(f"{server_url}/v1/models/default", json=payload)
			
 
				-        resp.raise_for_status()
			
 
				-    except Exception:
			
 
				-        traceback.print_exc()
			
 
				-        err = traceback.format_exc()
			
 
				-        return build_html_error_message(f"加载模型时发生错误. \n\n{err}")
			
 
				-
			
 
				-    return "模型加载成功."
			
 
				-
			
 
				-
			
 
				-def build_model_config_block():
			
 
				-    server_url = gr.Textbox(label="服务器地址", value="http://localhost:8000")
			
 
				-
			
 
				-    with gr.Row():
			
 
				-        with gr.Column(scale=1):
			
 
				-            device = gr.Dropdown(
			
 
				-                label="设备",
			
 
				-                choices=["cpu", "cuda"],
			
 
				-                value="cuda",
			
 
				-            )
			
 
				-        with gr.Column(scale=1):
			
 
				-            precision = gr.Dropdown(
			
 
				-                label="精度",
			
 
				-                choices=["bfloat16", "float16"],
			
 
				-                value="float16",
			
 
				-            )
			
 
				-        with gr.Column(scale=1):
			
 
				-            compile_model = gr.Checkbox(
			
 
				-                label="编译模型",
			
 
				-                value=True,
			
 
				-            )
			
 
				-
			
 
				-    llama_ckpt_path = gr.Dropdown(
			
 
				-        label="Llama 模型路径",
			
 
				-        value=str(Path("checkpoints/text2semantic-400m-v0.3-4k.pth")),
			
 
				-        choices=[
			
 
				-            str(pth_file) for pth_file in Path("results").rglob("**/text*/**/*.ckpt")
			
 
				-        ]
			
 
				-        + [str(pth_file) for pth_file in Path("checkpoints").rglob("**/*text*.pth")],
			
 
				-        allow_custom_value=True,
			
 
				-    )
			
 
				-    llama_config_name = gr.Textbox(label="Llama 配置文件", value="text2semantic_finetune")
			
 
				-    tokenizer = gr.Dropdown(
			
 
				-        label="Tokenizer",
			
 
				-        value="fishaudio/speech-lm-v1",
			
 
				-        choices=["fishaudio/speech-lm-v1", "checkpoints"],
			
 
				-    )
			
 
				-
			
 
				-    vqgan_ckpt_path = gr.Dropdown(
			
 
				-        label="VQGAN 模型路径",
			
 
				-        value=str(Path("checkpoints/vqgan-v1.pth")),
			
 
				-        choices=[
			
 
				-            str(pth_file) for pth_file in Path("results").rglob("**/vqgan*/**/*.ckpt")
			
 
				-        ]
			
 
				-        + [str(pth_file) for pth_file in Path("checkpoints").rglob("**/*vqgan*.pth")],
			
 
				-        allow_custom_value=True,
			
 
				-    )
			
 
				-    vqgan_config_name = gr.Dropdown(
			
 
				-        label="VQGAN 配置文件",
			
 
				-        value="vqgan_pretrain",
			
 
				-        choices=["vqgan_pretrain", "vqgan_finetune"],
			
 
				-    )
			
 
				-
			
 
				-    load_model_btn = gr.Button(value="加载模型", variant="primary")
			
 
				-    error = gr.HTML(label="错误信息")
			
 
				-
			
 
				-    load_model_btn.click(
			
 
				-        load_model,
			
 
				-        [
			
 
				-            server_url,
			
 
				-            llama_ckpt_path,
			
 
				-            llama_config_name,
			
 
				-            tokenizer,
			
 
				-            vqgan_ckpt_path,
			
 
				-            vqgan_config_name,
			
 
				-            device,
			
 
				-            precision,
			
 
				-            compile_model,
			
 
				-        ],
			
 
				-        [error],
			
 
				-    )
			
 
				-
			
 
				-    return server_url
			
 
				-
			
 
				-
			
 
				-def inference(
			
 
				-    server_url,
			
 
				-    text,
			
 
				-    input_mode,
			
 
				-    language0,
			
 
				-    language1,
			
 
				-    language2,
			
 
				-    enable_reference_audio,
			
 
				-    reference_audio,
			
 
				-    reference_text,
			
 
				-    max_new_tokens,
			
 
				-    top_k,
			
 
				-    top_p,
			
 
				-    repetition_penalty,
			
 
				-    temperature,
			
 
				-    speaker,
			
 
				-):
			
 
				-    languages = [language0, language1, language2]
			
 
				-    languages = [
			
 
				-        {
			
 
				-            "中文": "zh",
			
 
				-            "日文": "jp",
			
 
				-            "英文": "en",
			
 
				-        }[language]
			
 
				-        for language in languages
			
 
				-    ]
			
 
				-
			
 
				-    if len(set(languages)) != len(languages):
			
 
				-        return [], build_html_error_message("语言优先级不能重复.")
			
 
				-
			
 
				-    order = ",".join(languages)
			
 
				-    payload = {
			
 
				-        "text": text,
			
 
				-        "prompt_text": reference_text if enable_reference_audio else None,
			
 
				-        "prompt_tokens": reference_audio if enable_reference_audio else None,
			
 
				-        "max_new_tokens": int(max_new_tokens),
			
 
				-        "top_k": int(top_k) if top_k > 0 else None,
			
 
				-        "top_p": top_p,
			
 
				-        "repetition_penalty": repetition_penalty,
			
 
				-        "temperature": temperature,
			
 
				-        "order": order,
			
 
				-        "use_g2p": input_mode == "自动音素",
			
 
				-        "seed": None,
			
 
				-        "speaker": speaker if speaker.strip() != "" else None,
			
 
				-    }
			
 
				-
			
 
				-    try:
			
 
				-        resp = requests.post(f"{server_url}/v1/models/default/invoke", json=payload)
			
 
				-        resp.raise_for_status()
			
 
				-    except Exception:
			
 
				-        traceback.print_exc()
			
 
				-        err = traceback.format_exc()
			
 
				-        return [], build_html_error_message(f"推理时发生错误. \n\n{err}")
			
 
				-
			
 
				-    content = io.BytesIO(resp.content)
			
 
				-    content.seek(0)
			
 
				-    content, sr = librosa.load(content, sr=None, mono=True)
			
 
				-
			
 
				-    return (sr, content), None
			
 
				-
			
 
				-
			
 
				-def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=22050):
			
 
				-    # copy and paste
			
 
				-    wav_buf = io.BytesIO()
			
 
				-    with wave.open(wav_buf, "wb") as vfout:
			
 
				-        vfout.setnchannels(channels)
			
 
				-        vfout.setsampwidth(sample_width)
			
 
				-        vfout.setframerate(sample_rate)
			
 
				-        vfout.writeframes(frame_input)
			
 
				-
			
 
				-    wav_buf.seek(0)
			
 
				-    return wav_buf.read()
			
 
				-
			
 
				-
			
 
				-def inference_stream(
			
 
				-    server_url,
			
 
				-    text,
			
 
				-    input_mode,
			
 
				-    language0,
			
 
				-    language1,
			
 
				-    language2,
			
 
				-    enable_reference_audio,
			
 
				-    reference_audio,
			
 
				-    reference_text,
			
 
				-    max_new_tokens,
			
 
				-    top_k,
			
 
				-    top_p,
			
 
				-    repetition_penalty,
			
 
				-    temperature,
			
 
				-    speaker,
			
 
				-):
			
 
				-    languages = [language0, language1, language2]
			
 
				-    languages = [
			
 
				-        {
			
 
				-            "中文": "zh",
			
 
				-            "日文": "jp",
			
 
				-            "英文": "en",
			
 
				-        }[language]
			
 
				-        for language in languages
			
 
				-    ]
			
 
				-
			
 
				-    if len(set(languages)) != len(languages):
			
 
				-        return []
			
 
				-
			
 
				-    order = ",".join(languages)
			
 
				-    payload = {
			
 
				-        "text": text,
			
 
				-        "prompt_text": reference_text if enable_reference_audio else None,
			
 
				-        "prompt_tokens": reference_audio if enable_reference_audio else None,
			
 
				-        "max_new_tokens": int(max_new_tokens),
			
 
				-        "top_k": int(top_k) if top_k > 0 else None,
			
 
				-        "top_p": top_p,
			
 
				-        "repetition_penalty": repetition_penalty,
			
 
				-        "temperature": temperature,
			
 
				-        "order": order,
			
 
				-        "use_g2p": input_mode == "自动音素",
			
 
				-        "seed": None,
			
 
				-        "speaker": speaker if speaker.strip() != "" else None,
			
 
				-    }
			
 
				-
			
 
				-    resp = requests.post(
			
 
				-        f"{server_url}/v1/models/default/invoke_stream", json=payload, stream=True
			
 
				-    )
			
 
				-    resp.raise_for_status()
			
 
				-
			
 
				-    yield wave_header_chunk(), None
			
 
				-
			
 
				-    for chunk in resp.iter_content(chunk_size=None):
			
 
				-        if chunk:
			
 
				-            content = io.BytesIO(chunk)
			
 
				-            content.seek(0)
			
 
				-            audio, sr = librosa.load(content, sr=None, mono=True)
			
 
				-            print(audio.shape, sr)
			
 
				-            yield (np.concatenate([audio], 0) * 32768).astype(np.int16).tobytes(), None
			
 
				-
			
 
				-
			
 
				-with gr.Blocks(theme=gr.themes.Base()) as app:
			
 
				-    gr.Markdown(HEADER_MD)
			
 
				-
			
 
				-    # Use light theme by default
			
 
				-    app.load(
			
 
				-        None,
			
 
				-        None,
			
 
				-        js="() => {const params = new URLSearchParams(window.location.search);if (!params.has('__theme')) {params.set('__theme', 'light');window.location.search = params.toString();}}",
			
 
				-    )
			
 
				-
			
 
				-    # Inference
			
 
				-    with gr.Row():
			
 
				-        with gr.Column(scale=3):
			
 
				-            with gr.Tab(label="模型配置"):
			
 
				-                server_url = build_model_config_block()
			
 
				-
			
 
				-            with gr.Tab(label="推理配置"):
			
 
				-                text = gr.Textbox(
			
 
				-                    label="输入文本", placeholder=TEXTBOX_PLACEHOLDER, lines=15
			
 
				-                )
			
 
				-
			
 
				-                with gr.Row():
			
 
				-                    with gr.Tab(label="合成参数"):
			
 
				-                        gr.Markdown("配置常见合成参数. 自动音素会在推理时自动将文本转换为音素.")
			
 
				-
			
 
				-                        input_mode = gr.Dropdown(
			
 
				-                            choices=["文本", "自动音素"],
			
 
				-                            value="文本",
			
 
				-                            label="输入模式",
			
 
				-                        )
			
 
				-
			
 
				-                        max_new_tokens = gr.Slider(
			
 
				-                            label="最大生成 Token 数",
			
 
				-                            minimum=0,
			
 
				-                            maximum=4096,
			
 
				-                            value=0,  # 0 means no limit
			
 
				-                            step=8,
			
 
				-                        )
			
 
				-
			
 
				-                        top_k = gr.Slider(
			
 
				-                            label="Top-K", minimum=0, maximum=100, value=0, step=1
			
 
				-                        )
			
 
				-
			
 
				-                        top_p = gr.Slider(
			
 
				-                            label="Top-P", minimum=0, maximum=1, value=0.5, step=0.01
			
 
				-                        )
			
 
				-
			
 
				-                        repetition_penalty = gr.Slider(
			
 
				-                            label="重复惩罚", minimum=0, maximum=2, value=1.5, step=0.01
			
 
				-                        )
			
 
				-
			
 
				-                        temperature = gr.Slider(
			
 
				-                            label="温度", minimum=0, maximum=2, value=0.7, step=0.01
			
 
				-                        )
			
 
				-
			
 
				-                        speaker = gr.Textbox(
			
 
				-                            label="说话人",
			
 
				-                            placeholder="说话人",
			
 
				-                            lines=1,
			
 
				-                        )
			
 
				-
			
 
				-                    with gr.Tab(label="语言优先级"):
			
 
				-                        gr.Markdown("该参数只在自动音素转换时生效.")
			
 
				-
			
 
				-                        with gr.Column(scale=1):
			
 
				-                            language0 = gr.Dropdown(
			
 
				-                                choices=["中文", "日文", "英文"],
			
 
				-                                label="语言 1",
			
 
				-                                value="中文",
			
 
				-                            )
			
 
				-
			
 
				-                        with gr.Column(scale=1):
			
 
				-                            language1 = gr.Dropdown(
			
 
				-                                choices=["中文", "日文", "英文"],
			
 
				-                                label="语言 2",
			
 
				-                                value="日文",
			
 
				-                            )
			
 
				-
			
 
				-                        with gr.Column(scale=1):
			
 
				-                            language2 = gr.Dropdown(
			
 
				-                                choices=["中文", "日文", "英文"],
			
 
				-                                label="语言 3",
			
 
				-                                value="英文",
			
 
				-                            )
			
 
				-
			
 
				-                    with gr.Tab(label="参考音频"):
			
 
				-                        gr.Markdown("5-10 秒的参考音频, 适用于指定音色.")
			
 
				-
			
 
				-                        enable_reference_audio = gr.Checkbox(
			
 
				-                            label="启用参考音频", value=False
			
 
				-                        )
			
 
				-                        reference_audio = gr.Audio(
			
 
				-                            label="参考音频",
			
 
				-                            value="docs/assets/audios/0_input.wav",
			
 
				-                            type="filepath",
			
 
				-                        )
			
 
				-                        reference_text = gr.Textbox(
			
 
				-                            label="参考文本",
			
 
				-                            placeholder="参考文本",
			
 
				-                            lines=1,
			
 
				-                            value="在一无所知中，梦里的一天结束了，一个新的「轮回」便会开始。",
			
 
				-                        )
			
 
				-
			
 
				-        with gr.Column(scale=3):
			
 
				-            with gr.Row():
			
 
				-                error = gr.HTML(label="错误信息")
			
 
				-            with gr.Row():
			
 
				-                parsed_text = gr.Dataframe(
			
 
				-                    label="解析结果 (仅参考)", headers=["ID", "文本", "语言", "音素"]
			
 
				-                )
			
 
				-            with gr.Row():
			
 
				-                audio_once = gr.Audio(label="一次合成音频", type="numpy")
			
 
				-            with gr.Row():
			
 
				-                audio_stream = gr.Audio(
			
 
				-                    label="流式合成音频",
			
 
				-                    autoplay=True,
			
 
				-                    streaming=True,
			
 
				-                    show_label=True,
			
 
				-                    interactive=False,
			
 
				-                )
			
 
				-            with gr.Row():
			
 
				-                with gr.Column(scale=3):
			
 
				-                    generate = gr.Button(value="\U0001F3A7 合成", variant="primary")
			
 
				-                    stream_generate = gr.Button(
			
 
				-                        value="\U0001F4A7 流式合成", variant="primary"
			
 
				-                    )
			
 
				-                with gr.Column(scale=1):
			
 
				-                    audio_download = gr.Button(
			
 
				-                        value="\U0001F449 下载流式音频", elem_id="audio_download"
			
 
				-                    )
			
 
				-                    clear = gr.Button(value="清空")
			
 
				-
			
 
				-    # Language & Text Parsing
			
 
				-    kwargs = dict(
			
 
				-        inputs=[
			
 
				-            text,
			
 
				-            input_mode,
			
 
				-            language0,
			
 
				-            language1,
			
 
				-            language2,
			
 
				-            enable_reference_audio,
			
 
				-            reference_text,
			
 
				-        ],
			
 
				-        outputs=[parsed_text, error],
			
 
				-        trigger_mode="always_last",
			
 
				-    )
			
 
				-    text.change(prepare_text, **kwargs)
			
 
				-    input_mode.change(prepare_text, **kwargs)
			
 
				-    language0.change(prepare_text, **kwargs)
			
 
				-    language1.change(prepare_text, **kwargs)
			
 
				-    language2.change(prepare_text, **kwargs)
			
 
				-    enable_reference_audio.change(prepare_text, **kwargs)
			
 
				-
			
 
				-    # Submit
			
 
				-    generate.click(
			
 
				-        inference,
			
 
				-        [
			
 
				-            server_url,
			
 
				-            text,
			
 
				-            input_mode,
			
 
				-            language0,
			
 
				-            language1,
			
 
				-            language2,
			
 
				-            enable_reference_audio,
			
 
				-            reference_audio,
			
 
				-            reference_text,
			
 
				-            max_new_tokens,
			
 
				-            top_k,
			
 
				-            top_p,
			
 
				-            repetition_penalty,
			
 
				-            temperature,
			
 
				-            speaker,
			
 
				-        ],
			
 
				-        [audio_once, error],
			
 
				-    )
			
 
				-
			
 
				-    stream_generate.click(
			
 
				-        inference_stream,
			
 
				-        [
			
 
				-            server_url,
			
 
				-            text,
			
 
				-            input_mode,
			
 
				-            language0,
			
 
				-            language1,
			
 
				-            language2,
			
 
				-            enable_reference_audio,
			
 
				-            reference_audio,
			
 
				-            reference_text,
			
 
				-            max_new_tokens,
			
 
				-            top_k,
			
 
				-            top_p,
			
 
				-            repetition_penalty,
			
 
				-            temperature,
			
 
				-            speaker,
			
 
				-        ],
			
 
				-        [audio_stream, error],
			
 
				-    ).then(lambda: gr.update(interactive=True), None, [text], queue=False)
			
 
				-
			
 
				-    audio_download.click(
			
 
				-        None,
			
 
				-        js="() => { "
			
 
				-        'var btn = document.getElementById("audio_download"); '
			
 
				-        "btn.disabled = true; "
			
 
				-        "setTimeout(() => { btn.disabled = false; }, 1000); "
			
 
				-        'var win = window.open("http://localhost:8000/v1/models/default/download", '
			
 
				-        '"newwindow", "height=100, width=400, toolbar=no, menubar=no, scrollbars=no, '
			
 
				-        'resizable=no, location=no, status=no"); '
			
 
				-        "setTimeout(function() { win.close(); }, 1000);"
			
 
				-        "}",
			
 
				-    )
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    app.launch(show_api=False)
			
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dependencies = [
 
				     "vector_quantize_pytorch>=1.14.7",
			
 
				     "samplerate>=0.2.1",
			
 
				     "resampy>=0.4.3",
			
 
				+    "spaces>=0.26.1"
			
 
				 ]
			
 
				 
			
 
				 [project.optional-dependencies]
			
--- a/tools/llama/generate.py
+++ b/tools/llama/generate.py
@@ -219,7 +219,6 @@ def generate(
 
				     eos_token_id: int = 2,
			
 
				     im_end_id: int = 4,
			
 
				     decode_one_token=decode_one_token_naive,
			
 
				-    precision: torch.dtype = torch.bfloat16,
			
 
				     **sampling_kwargs,
			
 
				 ) -> torch.Tensor:
			
 
				     """
			
@@ -241,7 +240,9 @@ def generate(
 
				 
			
 
				     device, dtype = prompt.device, prompt.dtype
			
 
				     with torch.device(device):
			
 
				-        model.setup_caches(max_batch_size=1, max_seq_len=T_new, dtype=precision)
			
 
				+        model.setup_caches(
			
 
				+            max_batch_size=1, max_seq_len=T_new, dtype=next(model.parameters()).dtype
			
 
				+        )
			
 
				 
			
 
				     codebook_dim = 1 + model.config.num_codebooks
			
 
				     # create an empty tensor of the expected final shape and fill in the current tokens
			
@@ -250,7 +251,13 @@ def generate(
 
				     seq = empty
			
 
				     input_pos = torch.arange(0, T, device=device)
			
 
				 
			
 
				-    next_token = decode_one_token(
			
 
				+    # Use non-accelerated version for now, to avoid compilation overhead
			
 
				+    prefill_decode = (
			
 
				+        decode_one_token_naive
			
 
				+        if isinstance(model, NaiveTransformer)
			
 
				+        else decode_one_token_ar
			
 
				+    )
			
 
				+    next_token = prefill_decode(
			
 
				         model, prompt.view(1, codebook_dim, -1), input_pos, **sampling_kwargs
			
 
				     )
			
 
				     seq[:, T : T + 1] = next_token
			
@@ -338,7 +345,9 @@ def encode_tokens(
 
				     return prompt
			
 
				 
			
 
				 
			
 
				-def load_model(config_name, checkpoint_path, device, precision, max_length):
			
 
				+def load_model(
			
 
				+    config_name, checkpoint_path, device, precision, max_length, compile=False
			
 
				+):
			
 
				     with initialize(version_base="1.3", config_path="../../fish_speech/configs/model"):
			
 
				         cfg = compose(
			
 
				             config_name=config_name, overrides=[f"config.max_seq_len={max_length}"]
			
@@ -379,7 +388,20 @@ def load_model(config_name, checkpoint_path, device, precision, max_length):
 
				     model = model.to(device=device, dtype=precision)
			
 
				     logger.info("Restored model from checkpoint")
			
 
				 
			
 
				-    return model.eval(), cfg
			
 
				+    if isinstance(model, DualARTransformer):
			
 
				+        decode_one_token = decode_one_token_ar
			
 
				+        logger.info("Using DualARTransformer")
			
 
				+    else:
			
 
				+        decode_one_token = decode_one_token_naive
			
 
				+        logger.info("Using NaiveTransformer")
			
 
				+
			
 
				+    if compile:
			
 
				+        logger.info("Compiling function...")
			
 
				+        decode_one_token = torch.compile(
			
 
				+            decode_one_token, mode="reduce-overhead", fullgraph=True
			
 
				+        )
			
 
				+
			
 
				+    return model.eval(), decode_one_token
			
 
				 
			
 
				 
			
 
				 def split_text(text, min_length):
			
@@ -401,76 +423,28 @@ def split_text(text, min_length):
 
				     return segments
			
 
				 
			
 
				 
			
 
				-@click.command()
			
 
				-@click.option(
			
 
				-    "--text",
			
 
				-    type=str,
			
 
				-    default="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
			
 
				-)
			
 
				-@click.option("--prompt-text", type=str, default=None)
			
 
				-@click.option(
			
 
				-    "--prompt-tokens", type=click.Path(path_type=Path, exists=True), default=None
			
 
				-)
			
 
				-@click.option("--num-samples", type=int, default=1)
			
 
				-@click.option("--max-new-tokens", type=int, default=0)
			
 
				-@click.option("--top-k", type=int, default=None)
			
 
				-@click.option("--top-p", type=float, default=0.7)
			
 
				-@click.option("--repetition-penalty", type=float, default=1.5)
			
 
				-@click.option("--temperature", type=float, default=0.7)
			
 
				-@click.option(
			
 
				-    "--checkpoint-path",
			
 
				-    type=click.Path(path_type=Path, exists=True),
			
 
				-    default="results/text2semantic_400m_finetune/step_000002000.pth",
			
 
				-)
			
 
				-@click.option("--config-name", type=str, default="dual_ar_8_codebook_small")
			
 
				-@click.option("--tokenizer", type=str, default="fishaudio/fish-speech-1")
			
 
				-@click.option("--compile/--no-compile", default=False)
			
 
				-@click.option("--seed", type=int, default=42)
			
 
				-@click.option("--speaker", type=str, default=None)
			
 
				-@click.option("--half/--no-half", default=False)
			
 
				-@click.option("--iterative-prompt/--no-iterative-prompt", default=False)
			
 
				-@click.option("--max-length", type=int, default=2048)
			
 
				-@click.option("--chunk-length", type=int, default=30)
			
 
				-def main(
			
 
				+def generate_long(
			
 
				+    *,
			
 
				+    model,
			
 
				+    tokenizer: callable,
			
 
				+    device: str | torch.device,
			
 
				+    decode_one_token: callable,
			
 
				     text: str,
			
 
				-    prompt_text: Optional[str],
			
 
				-    prompt_tokens: Optional[Path],
			
 
				-    num_samples: int,
			
 
				-    max_new_tokens: int,
			
 
				-    top_k: int,
			
 
				-    top_p: int,
			
 
				-    repetition_penalty: float,
			
 
				-    temperature: float,
			
 
				-    checkpoint_path: Path,
			
 
				-    config_name: str,
			
 
				-    tokenizer: str,
			
 
				-    compile: bool,
			
 
				-    seed: int,
			
 
				-    speaker: Optional[str],
			
 
				-    half: bool,
			
 
				-    iterative_prompt: bool,
			
 
				-    max_length: int,
			
 
				-    chunk_length: int,
			
 
				-) -> None:
			
 
				-    device = "cuda"
			
 
				-
			
 
				-    precision = torch.half if half else torch.bfloat16
			
 
				-
			
 
				-    logger.info("Loading model ...")
			
 
				-    t0 = time.time()
			
 
				-    model, cfg = load_model(config_name, checkpoint_path, device, precision, max_length)
			
 
				+    num_samples: int = 1,
			
 
				+    max_new_tokens: int = 0,
			
 
				+    top_k: int = None,
			
 
				+    top_p: int = 0.7,
			
 
				+    repetition_penalty: float = 1.5,
			
 
				+    temperature: float = 0.7,
			
 
				+    compile: bool = False,
			
 
				+    iterative_prompt: bool = True,
			
 
				+    max_length: int = 2048,
			
 
				+    chunk_length: int = 30,
			
 
				+    speaker: Optional[str] = None,
			
 
				+    prompt_text: Optional[str] = None,
			
 
				+    prompt_tokens: Optional[torch.Tensor] = None,
			
 
				+):
			
 
				     model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
			
 
				-
			
 
				-    torch.cuda.synchronize()
			
 
				-    logger.info(f"Time to load model: {time.time() - t0:.02f} seconds")
			
 
				-
			
 
				-    tokenizer = AutoTokenizer.from_pretrained(tokenizer)
			
 
				-    prompt_tokens = (
			
 
				-        torch.from_numpy(np.load(prompt_tokens)).to(device)
			
 
				-        if prompt_tokens is not None
			
 
				-        else None
			
 
				-    )
			
 
				-
			
 
				     im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
			
 
				 
			
 
				     use_prompt = prompt_text is not None and prompt_tokens is not None
			
@@ -502,29 +476,17 @@ def main(
 
				 
			
 
				         encoded[0] = torch.cat((encoded_prompt, encoded[0]), dim=1)
			
 
				 
			
 
				-    torch.manual_seed(seed)
			
 
				-    torch.cuda.manual_seed(seed)
			
 
				-
			
 
				-    if isinstance(model, DualARTransformer):
			
 
				-        decode_one_token = decode_one_token_ar
			
 
				-        logger.info("Using DualARTransformer")
			
 
				-    else:
			
 
				-        decode_one_token = decode_one_token_naive
			
 
				-        logger.info("Using NaiveTransformer")
			
 
				-
			
 
				-    if compile:
			
 
				-        logger.info("Compiling function...")
			
 
				-        decode_one_token = torch.compile(
			
 
				-            decode_one_token, mode="reduce-overhead", fullgraph=True
			
 
				-        )
			
 
				-
			
 
				-    for idx in range(num_samples):
			
 
				+    for sample_idx in range(num_samples):
			
 
				         torch.cuda.synchronize()
			
 
				         global_encoded = []
			
 
				         all_codes = []
			
 
				         seg_idx = 0
			
 
				 
			
 
				         while seg_idx < len(encoded):
			
 
				+            logger.info(
			
 
				+                f"Generating sentence {seg_idx + 1}/{len(encoded)} of sample {sample_idx + 1}/{num_samples}"
			
 
				+            )
			
 
				+
			
 
				             seg = encoded[seg_idx]
			
 
				             global_encoded.append(seg)
			
 
				 
			
@@ -557,14 +519,13 @@ def main(
 
				                 eos_token_id=tokenizer.eos_token_id,
			
 
				                 im_end_id=im_end_id,
			
 
				                 decode_one_token=decode_one_token,
			
 
				-                precision=precision,
			
 
				                 temperature=temperature,
			
 
				                 top_k=top_k,
			
 
				                 top_p=top_p,
			
 
				                 repetition_penalty=repetition_penalty,
			
 
				             )
			
 
				 
			
 
				-            if idx == 0 and seg_idx == 0 and compile:
			
 
				+            if sample_idx == 0 and seg_idx == 0 and compile:
			
 
				                 logger.info(f"Compilation time: {time.perf_counter() - t0:.2f} seconds")
			
 
				 
			
 
				             torch.cuda.synchronize()
			
@@ -607,6 +568,104 @@ def main(
 
				         codes = torch.cat(all_codes, dim=1)
			
 
				         assert (codes >= 0).all(), f"Negative code found: {codes}"
			
 
				 
			
 
				+        yield codes
			
 
				+
			
 
				+
			
 
				+@click.command()
			
 
				+@click.option(
			
 
				+    "--text",
			
 
				+    type=str,
			
 
				+    default="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
			
 
				+)
			
 
				+@click.option("--prompt-text", type=str, default=None)
			
 
				+@click.option(
			
 
				+    "--prompt-tokens", type=click.Path(path_type=Path, exists=True), default=None
			
 
				+)
			
 
				+@click.option("--num-samples", type=int, default=1)
			
 
				+@click.option("--max-new-tokens", type=int, default=0)
			
 
				+@click.option("--top-k", type=int, default=None)
			
 
				+@click.option("--top-p", type=float, default=0.7)
			
 
				+@click.option("--repetition-penalty", type=float, default=1.5)
			
 
				+@click.option("--temperature", type=float, default=0.7)
			
 
				+@click.option(
			
 
				+    "--checkpoint-path",
			
 
				+    type=click.Path(path_type=Path, exists=True),
			
 
				+    default="results/text2semantic_400m_finetune/step_000002000.pth",
			
 
				+)
			
 
				+@click.option("--config-name", type=str, default="dual_ar_8_codebook_small")
			
 
				+@click.option("--tokenizer", type=str, default="fishaudio/fish-speech-1")
			
 
				+@click.option("--compile/--no-compile", default=False)
			
 
				+@click.option("--seed", type=int, default=42)
			
 
				+@click.option("--speaker", type=str, default=None)
			
 
				+@click.option("--half/--no-half", default=False)
			
 
				+@click.option("--iterative-prompt/--no-iterative-prompt", default=True)
			
 
				+@click.option("--max-length", type=int, default=2048)
			
 
				+@click.option("--chunk-length", type=int, default=30)
			
 
				+def main(
			
 
				+    text: str,
			
 
				+    prompt_text: Optional[str],
			
 
				+    prompt_tokens: Optional[Path],
			
 
				+    num_samples: int,
			
 
				+    max_new_tokens: int,
			
 
				+    top_k: int,
			
 
				+    top_p: int,
			
 
				+    repetition_penalty: float,
			
 
				+    temperature: float,
			
 
				+    checkpoint_path: Path,
			
 
				+    config_name: str,
			
 
				+    tokenizer: str,
			
 
				+    compile: bool,
			
 
				+    seed: int,
			
 
				+    speaker: Optional[str],
			
 
				+    half: bool,
			
 
				+    iterative_prompt: bool,
			
 
				+    max_length: int,
			
 
				+    chunk_length: int,
			
 
				+) -> None:
			
 
				+    device = "cuda"
			
 
				+
			
 
				+    precision = torch.half if half else torch.bfloat16
			
 
				+
			
 
				+    logger.info("Loading model ...")
			
 
				+    t0 = time.time()
			
 
				+    model, decode_one_token = load_model(
			
 
				+        config_name, checkpoint_path, device, precision, max_length, compile=compile
			
 
				+    )
			
 
				+    torch.cuda.synchronize()
			
 
				+    logger.info(f"Time to load model: {time.time() - t0:.02f} seconds")
			
 
				+
			
 
				+    prompt_tokens = (
			
 
				+        torch.from_numpy(np.load(prompt_tokens)).to(device)
			
 
				+        if prompt_tokens is not None
			
 
				+        else None
			
 
				+    )
			
 
				+
			
 
				+    tokenizer = AutoTokenizer.from_pretrained(tokenizer)
			
 
				+    torch.manual_seed(seed)
			
 
				+    torch.cuda.manual_seed(seed)
			
 
				+
			
 
				+    generator = generate_long(
			
 
				+        model=model,
			
 
				+        device=device,
			
 
				+        decode_one_token=decode_one_token,
			
 
				+        text=text,
			
 
				+        num_samples=num_samples,
			
 
				+        max_new_tokens=max_new_tokens,
			
 
				+        top_k=top_k,
			
 
				+        top_p=top_p,
			
 
				+        repetition_penalty=repetition_penalty,
			
 
				+        temperature=temperature,
			
 
				+        tokenizer=tokenizer,
			
 
				+        compile=compile,
			
 
				+        speaker=speaker,
			
 
				+        iterative_prompt=iterative_prompt,
			
 
				+        max_length=max_length,
			
 
				+        chunk_length=chunk_length,
			
 
				+        prompt_text=prompt_text,
			
 
				+        prompt_tokens=prompt_tokens,
			
 
				+    )
			
 
				+
			
 
				+    for idx, codes in enumerate(generator):
			
 
				         np.save(f"codes_{idx}.npy", codes.cpu().numpy())
			
 
				         logger.info(f"Saved codes to codes_{idx}.npy")
			
 
				 
			
--- a/tools/split_protos.py
+++ b/tools/split_protos.py
@@ -1,44 +0,0 @@
 
				-from pathlib import Path
			
 
				-
			
 
				-import click
			
 
				-from loguru import logger
			
 
				-
			
 
				-from fish_speech.datasets.protos.text_data_stream import split_pb_stream
			
 
				-
			
 
				-
			
 
				-@click.command()
			
 
				-@click.argument("input", type=click.Path(exists=True, path_type=Path))
			
 
				-@click.argument("output", type=click.Path(path_type=Path))
			
 
				-@click.option("--chunk-size", type=int, default=1024**3)  # 1GB
			
 
				-def main(input, output, chunk_size):
			
 
				-    chunk_idx = 0
			
 
				-    current_size = 0
			
 
				-    current_file = None
			
 
				-
			
 
				-    if output.exists() is False:
			
 
				-        output.mkdir(parents=True)
			
 
				-
			
 
				-    with open(input, "rb") as f:
			
 
				-        for chunk in split_pb_stream(f):
			
 
				-            if current_file is None or current_size + len(chunk) > chunk_size:
			
 
				-                if current_file is not None:
			
 
				-                    current_file.close()
			
 
				-
			
 
				-                current_file = open(
			
 
				-                    output / f"{input.stem}.{chunk_idx:04d}.protos", "wb"
			
 
				-                )
			
 
				-                chunk_idx += 1
			
 
				-                current_size = 0
			
 
				-                logger.info(f"Writing to {current_file.name}")
			
 
				-
			
 
				-            current_file.write(chunk)
			
 
				-            current_size += len(chunk)
			
 
				-
			
 
				-    if current_file is not None:
			
 
				-        current_file.close()
			
 
				-
			
 
				-    logger.info(f"Split {input} into {chunk_idx} files")
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    main()
			
--- a/tools/to_flac.py
+++ b/tools/to_flac.py
@@ -1,60 +0,0 @@
 
				-import random
			
 
				-import subprocess
			
 
				-from multiprocessing import Pool, cpu_count
			
 
				-from pathlib import Path
			
 
				-
			
 
				-from tqdm import tqdm
			
 
				-
			
 
				-
			
 
				-def convert_to_flac(src_file_path):
			
 
				-    dst_file_path = src_file_path.with_suffix(".flac")
			
 
				-    dst_file_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				-
			
 
				-    try:
			
 
				-        subprocess.check_call(
			
 
				-            [
			
 
				-                "ffmpeg",
			
 
				-                "-y",
			
 
				-                "-i",
			
 
				-                str(src_file_path),
			
 
				-                "-acodec",
			
 
				-                "flac",
			
 
				-                "-threads",
			
 
				-                "0",
			
 
				-                str(dst_file_path),
			
 
				-            ],
			
 
				-            stdout=subprocess.DEVNULL,
			
 
				-            stderr=subprocess.DEVNULL,
			
 
				-        )
			
 
				-
			
 
				-        # remove the input file
			
 
				-        src_file_path.unlink()
			
 
				-        return True
			
 
				-    except subprocess.CalledProcessError:
			
 
				-        return False
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    src_dir = Path("dataset/tts/WenetSpeech/cleaned")
			
 
				-
			
 
				-    wav_files = list(src_dir.rglob("*.wav"))
			
 
				-    random.shuffle(wav_files)
			
 
				-    print(f"Found {len(wav_files)} wav files")
			
 
				-
			
 
				-    success_counter = 0
			
 
				-    fail_counter = 0
			
 
				-
			
 
				-    with Pool(processes=cpu_count(), maxtasksperchild=100) as pool:
			
 
				-        with tqdm(
			
 
				-            pool.imap_unordered(convert_to_flac, wav_files), total=len(wav_files)
			
 
				-        ) as pbar:
			
 
				-            for success in pbar:
			
 
				-                if success:
			
 
				-                    success_counter += 1
			
 
				-                else:
			
 
				-                    fail_counter += 1
			
 
				-
			
 
				-            pbar.set_description(f"Success: {success_counter}, Fail: {fail_counter}")
			
 
				-
			
 
				-    print(f"Successfully converted: {success_counter}")
			
 
				-    print(f"Failed conversions: {fail_counter}")
			
--- a/tools/vqgan/inference.py
+++ b/tools/vqgan/inference.py
@@ -17,8 +17,28 @@ from fish_speech.utils.file import AUDIO_EXTENSIONS
 
				 OmegaConf.register_new_resolver("eval", eval)
			
 
				 
			
 
				 
			
 
				+def load_model(config_name, checkpoint_path, device="cuda"):
			
 
				+    with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
			
 
				+        cfg = compose(config_name=config_name)
			
 
				+
			
 
				+    model: LightningModule = instantiate(cfg.model)
			
 
				+    state_dict = torch.load(
			
 
				+        checkpoint_path,
			
 
				+        map_location=model.device,
			
 
				+    )
			
 
				+
			
 
				+    if "state_dict" in state_dict:
			
 
				+        state_dict = state_dict["state_dict"]
			
 
				+
			
 
				+    model.load_state_dict(state_dict, strict=False)
			
 
				+    model.eval()
			
 
				+    model.to(device)
			
 
				+    logger.info("Restored model from checkpoint")
			
 
				+
			
 
				+    return model
			
 
				+
			
 
				+
			
 
				 @torch.no_grad()
			
 
				-@torch.autocast(device_type="cuda", enabled=True)
			
 
				 @click.command()
			
 
				 @click.option(
			
 
				     "--input-path",
			
@@ -35,21 +55,13 @@ OmegaConf.register_new_resolver("eval", eval)
 
				     "-ckpt",
			
 
				     default="checkpoints/vq-gan-group-fsq-2x1024.pth",
			
 
				 )
			
 
				-def main(input_path, output_path, config_name, checkpoint_path):
			
 
				-    with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
			
 
				-        cfg = compose(config_name=config_name)
			
 
				-
			
 
				-    model: LightningModule = instantiate(cfg.model)
			
 
				-    state_dict = torch.load(
			
 
				-        checkpoint_path,
			
 
				-        map_location=model.device,
			
 
				-    )
			
 
				-    if "state_dict" in state_dict:
			
 
				-        state_dict = state_dict["state_dict"]
			
 
				-    model.load_state_dict(state_dict, strict=False)
			
 
				-    model.eval()
			
 
				-    model.cuda()
			
 
				-    logger.info("Restored model from checkpoint")
			
 
				+@click.option(
			
 
				+    "--device",
			
 
				+    "-d",
			
 
				+    default="cuda",
			
 
				+)
			
 
				+def main(input_path, output_path, config_name, checkpoint_path, device):
			
 
				+    model = load_model(config_name, checkpoint_path, device=device)
			
 
				 
			
 
				     if input_path.suffix in AUDIO_EXTENSIONS:
			
 
				         logger.info(f"Processing in-place reconstruction of {input_path}")
			
@@ -94,7 +106,7 @@ def main(input_path, output_path, config_name, checkpoint_path):
 
				     )
			
 
				 
			
 
				     # Save audio
			
 
				-    fake_audio = fake_audios[0, 0].cpu().numpy().astype(np.float32)
			
 
				+    fake_audio = fake_audios[0, 0].float().cpu().numpy()
			
 
				     sf.write(output_path, fake_audio, model.sampling_rate)
			
 
				     logger.info(f"Saved audio to {output_path}")
			
 
				 
			
--- a/tools/webui.py
+++ b/tools/webui.py
@@ -0,0 +1,304 @@
 
				+import html
			
 
				+import os
			
 
				+from argparse import ArgumentParser
			
 
				+from io import BytesIO
			
 
				+from pathlib import Path
			
 
				+
			
 
				+import gradio as gr
			
 
				+import librosa
			
 
				+import spaces
			
 
				+import torch
			
 
				+from loguru import logger
			
 
				+from torchaudio import functional as AF
			
 
				+from transformers import AutoTokenizer
			
 
				+
			
 
				+from tools.llama.generate import generate_long
			
 
				+from tools.llama.generate import load_model as load_llama_model
			
 
				+from tools.vqgan.inference import load_model as load_vqgan_model
			
 
				+
			
 
				+# Make einx happy
			
 
				+os.environ["EINX_FILTER_TRACEBACK"] = "false"
			
 
				+
			
 
				+
			
 
				+HEADER_MD = """# Fish Speech
			
 
				+
			
 
				+A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).  
			
 
				+由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成. 
			
 
				+
			
 
				+You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).  
			
 
				+你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.  
			
 
				+
			
 
				+Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.  
			
 
				+相关代码使用 BSD-3-Clause 许可证发布，权重使用 CC BY-NC-SA 4.0 许可证发布.
			
 
				+
			
 
				+We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.  
			
 
				+我们不对模型的任何滥用负责，请在使用之前考虑您当地的法律法规.
			
 
				+"""
			
 
				+
			
 
				+TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
			
 
				+
			
 
				+
			
 
				+def build_html_error_message(error):
			
 
				+    return f"""
			
 
				+    <div style="color: red; font-weight: bold;">
			
 
				+        {html.escape(error)}
			
 
				+    </div>
			
 
				+    """
			
 
				+
			
 
				+
			
 
				+@spaces.GPU
			
 
				+def inference(
			
 
				+    text,
			
 
				+    enable_reference_audio,
			
 
				+    reference_audio,
			
 
				+    reference_text,
			
 
				+    max_new_tokens,
			
 
				+    chunk_length,
			
 
				+    top_k,
			
 
				+    top_p,
			
 
				+    repetition_penalty,
			
 
				+    temperature,
			
 
				+    speaker,
			
 
				+):
			
 
				+    # Parse reference audio aka prompt
			
 
				+    if enable_reference_audio and reference_audio is not None:
			
 
				+        # reference_audio_sr, reference_audio_content = reference_audio
			
 
				+        reference_audio_content, _ = librosa.load(
			
 
				+            reference_audio, sr=vqgan_model.sampling_rate, mono=True
			
 
				+        )
			
 
				+        audios = torch.from_numpy(reference_audio_content).to(vqgan_model.device)[
			
 
				+            None, None, :
			
 
				+        ]
			
 
				+
			
 
				+        logger.info(
			
 
				+            f"Loaded audio with {audios.shape[2] / vqgan_model.sampling_rate:.2f} seconds"
			
 
				+        )
			
 
				+
			
 
				+        # VQ Encoder
			
 
				+        audio_lengths = torch.tensor(
			
 
				+            [audios.shape[2]], device=vqgan_model.device, dtype=torch.long
			
 
				+        )
			
 
				+        prompt_tokens = vqgan_model.encode(audios, audio_lengths)[0][0]
			
 
				+
			
 
				+    # LLAMA Inference
			
 
				+    result = generate_long(
			
 
				+        model=llama_model,
			
 
				+        tokenizer=llama_tokenizer,
			
 
				+        device=vqgan_model.device,
			
 
				+        decode_one_token=decode_one_token,
			
 
				+        max_new_tokens=max_new_tokens,
			
 
				+        text=text,
			
 
				+        top_k=int(top_k) if top_k > 0 else None,
			
 
				+        top_p=top_p,
			
 
				+        repetition_penalty=repetition_penalty,
			
 
				+        temperature=temperature,
			
 
				+        compile=args.compile,
			
 
				+        iterative_prompt=chunk_length > 0,
			
 
				+        chunk_length=chunk_length,
			
 
				+        max_length=args.max_length,
			
 
				+        speaker=speaker if speaker else None,
			
 
				+        prompt_tokens=prompt_tokens if enable_reference_audio else None,
			
 
				+        prompt_text=reference_text if enable_reference_audio else None,
			
 
				+    )
			
 
				+
			
 
				+    codes = next(result)
			
 
				+
			
 
				+    # VQGAN Inference
			
 
				+    feature_lengths = torch.tensor([codes.shape[1]], device=vqgan_model.device)
			
 
				+    fake_audios = vqgan_model.decode(
			
 
				+        indices=codes[None], feature_lengths=feature_lengths, return_audios=True
			
 
				+    )[0, 0]
			
 
				+
			
 
				+    fake_audios = fake_audios.float().cpu().numpy()
			
 
				+
			
 
				+    return (vqgan_model.sampling_rate, fake_audios), None
			
 
				+
			
 
				+
			
 
				+def build_app():
			
 
				+    with gr.Blocks(theme=gr.themes.Base()) as app:
			
 
				+        gr.Markdown(HEADER_MD)
			
 
				+
			
 
				+        # Use light theme by default
			
 
				+        app.load(
			
 
				+            None,
			
 
				+            None,
			
 
				+            js="() => {const params = new URLSearchParams(window.location.search);if (!params.has('__theme')) {params.set('__theme', 'light');window.location.search = params.toString();}}",
			
 
				+        )
			
 
				+
			
 
				+        # Inference
			
 
				+        with gr.Row():
			
 
				+            with gr.Column(scale=3):
			
 
				+                text = gr.Textbox(
			
 
				+                    label="Input Text / 输入文本", placeholder=TEXTBOX_PLACEHOLDER, lines=15
			
 
				+                )
			
 
				+
			
 
				+                with gr.Row():
			
 
				+                    with gr.Tab(label="Advanced Config / 高级参数"):
			
 
				+                        chunk_length = gr.Slider(
			
 
				+                            label="Iterative Prompt Length, 0 means off / 迭代提示长度，0 表示关闭",
			
 
				+                            minimum=0,
			
 
				+                            maximum=500,
			
 
				+                            value=30,
			
 
				+                            step=8,
			
 
				+                        )
			
 
				+
			
 
				+                        max_new_tokens = gr.Slider(
			
 
				+                            label="Maximum tokens per batch, 0 means no limit / 每批最大令牌数，0 表示无限制",
			
 
				+                            minimum=0,
			
 
				+                            maximum=args.max_length,
			
 
				+                            value=0,  # 0 means no limit
			
 
				+                            step=8,
			
 
				+                        )
			
 
				+
			
 
				+                        top_k = gr.Slider(
			
 
				+                            label="Top-K", minimum=0, maximum=100, value=0, step=1
			
 
				+                        )
			
 
				+
			
 
				+                        top_p = gr.Slider(
			
 
				+                            label="Top-P", minimum=0, maximum=1, value=0.7, step=0.01
			
 
				+                        )
			
 
				+
			
 
				+                        repetition_penalty = gr.Slider(
			
 
				+                            label="Repetition Penalty",
			
 
				+                            minimum=0,
			
 
				+                            maximum=2,
			
 
				+                            value=1.5,
			
 
				+                            step=0.01,
			
 
				+                        )
			
 
				+
			
 
				+                        temperature = gr.Slider(
			
 
				+                            label="Temperature",
			
 
				+                            minimum=0,
			
 
				+                            maximum=2,
			
 
				+                            value=0.7,
			
 
				+                            step=0.01,
			
 
				+                        )
			
 
				+
			
 
				+                        speaker = gr.Textbox(
			
 
				+                            label="Speaker / 说话人",
			
 
				+                            placeholder="Type name of the speaker / 输入说话人的名称",
			
 
				+                            lines=1,
			
 
				+                        )
			
 
				+
			
 
				+                    with gr.Tab(label="Reference Audio / 参考音频"):
			
 
				+                        gr.Markdown(
			
 
				+                            "5 to 10 seconds of reference audio, useful for specifying speaker. \n5 到 10 秒的参考音频，适用于指定音色。"
			
 
				+                        )
			
 
				+
			
 
				+                        enable_reference_audio = gr.Checkbox(
			
 
				+                            label="Enable Reference Audio / 启用参考音频",
			
 
				+                        )
			
 
				+                        reference_audio = gr.Audio(
			
 
				+                            label="Reference Audio / 参考音频",
			
 
				+                            value="docs/assets/audios/0_input.wav",
			
 
				+                            type="filepath",
			
 
				+                        )
			
 
				+                        reference_text = gr.Textbox(
			
 
				+                            label="Reference Text / 参考文本",
			
 
				+                            placeholder="参考文本",
			
 
				+                            lines=1,
			
 
				+                            value="在一无所知中，梦里的一天结束了，一个新的「轮回」便会开始。",
			
 
				+                        )
			
 
				+
			
 
				+            with gr.Column(scale=3):
			
 
				+                with gr.Row():
			
 
				+                    error = gr.HTML(label="Error Message / 错误信息")
			
 
				+                with gr.Row():
			
 
				+                    audio = gr.Audio(label="Generated Audio / 音频", type="numpy")
			
 
				+
			
 
				+                with gr.Row():
			
 
				+                    with gr.Column(scale=3):
			
 
				+                        generate = gr.Button(
			
 
				+                            value="\U0001F3A7 Generate / 合成", variant="primary"
			
 
				+                        )
			
 
				+
			
 
				+        # # Submit
			
 
				+        generate.click(
			
 
				+            inference,
			
 
				+            [
			
 
				+                text,
			
 
				+                enable_reference_audio,
			
 
				+                reference_audio,
			
 
				+                reference_text,
			
 
				+                max_new_tokens,
			
 
				+                chunk_length,
			
 
				+                top_k,
			
 
				+                top_p,
			
 
				+                repetition_penalty,
			
 
				+                temperature,
			
 
				+                speaker,
			
 
				+            ],
			
 
				+            [audio, error],
			
 
				+        )
			
 
				+
			
 
				+    return app
			
 
				+
			
 
				+
			
 
				+def parse_args():
			
 
				+    parser = ArgumentParser()
			
 
				+    parser.add_argument(
			
 
				+        "--llama-checkpoint-path",
			
 
				+        type=Path,
			
 
				+        default="checkpoints/text2semantic-medium-v1-2k.pth",
			
 
				+    )
			
 
				+    parser.add_argument(
			
 
				+        "--llama-config-name", type=str, default="dual_ar_2_codebook_medium"
			
 
				+    )
			
 
				+    parser.add_argument(
			
 
				+        "--vqgan-checkpoint-path",
			
 
				+        type=Path,
			
 
				+        default="checkpoints/vq-gan-group-fsq-2x1024.pth",
			
 
				+    )
			
 
				+    parser.add_argument("--vqgan-config-name", type=str, default="vqgan_pretrain")
			
 
				+    parser.add_argument("--tokenizer", type=str, default="fishaudio/fish-speech-1")
			
 
				+    parser.add_argument("--device", type=str, default="cuda")
			
 
				+    parser.add_argument("--half", action="store_true")
			
 
				+    parser.add_argument("--max-length", type=int, default=2048)
			
 
				+    parser.add_argument("--compile", action="store_true")
			
 
				+
			
 
				+    return parser.parse_args()
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    args = parse_args()
			
 
				+    args.precision = torch.half if args.half else torch.bfloat16
			
 
				+
			
 
				+    logger.info("Loading Llama model...")
			
 
				+    llama_model, decode_one_token = load_llama_model(
			
 
				+        config_name=args.llama_config_name,
			
 
				+        checkpoint_path=args.llama_checkpoint_path,
			
 
				+        device=args.device,
			
 
				+        precision=args.precision,
			
 
				+        max_length=args.max_length,
			
 
				+        compile=args.compile,
			
 
				+    )
			
 
				+    llama_tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
			
 
				+    logger.info("Llama model loaded, loading VQ-GAN model...")
			
 
				+
			
 
				+    vqgan_model = load_vqgan_model(
			
 
				+        config_name=args.vqgan_config_name,
			
 
				+        checkpoint_path=args.vqgan_checkpoint_path,
			
 
				+        device=args.device,
			
 
				+    )
			
 
				+
			
 
				+    logger.info("VQ-GAN model loaded, warming up...")
			
 
				+
			
 
				+    # Dry run to check if the model is loaded correctly and avoid the first-time latency
			
 
				+    inference(
			
 
				+        text="Hello, world!",
			
 
				+        enable_reference_audio=False,
			
 
				+        reference_audio=None,
			
 
				+        reference_text="",
			
 
				+        max_new_tokens=0,
			
 
				+        chunk_length=0,
			
 
				+        top_k=0,  # 0 means no limit
			
 
				+        top_p=0.7,
			
 
				+        repetition_penalty=1.5,
			
 
				+        temperature=0.7,
			
 
				+        speaker=None,
			
 
				+    )
			
 
				+
			
 
				+    logger.info("Warming up done, launching the web UI...")
			
 
				+
			
 
				+    app = build_app()
			
 
				+    app.launch(show_api=False)