il y a 2 ans · 9ebe18a5c1
--- a/fish_speech/i18n/locale/en_US.json
+++ b/fish_speech/i18n/locale/en_US.json
@@ -1,4 +1,5 @@
 
				 {
			
 
				+    "16-mixed is recommended for 10+ series GPU": "16-mixed is recommended for 10+ series GPU",
			
 
				     "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
			
 
				     "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
			
 
				     "Accumulate Gradient Batches": "Accumulate Gradient Batches",
			
@@ -6,7 +7,9 @@
 
				     "Added path successfully!": "Added path successfully!",
			
 
				     "Advanced Config": "Advanced Config",
			
 
				     "Base LLAMA Model": "Base LLAMA Model",
			
 
				+    "Batch Inference": "Batch Inference",
			
 
				     "Batch Size": "Batch Size",
			
 
				+    "Changing with the Model Path": "Changing with the Model Path",
			
 
				     "Chinese": "Chinese",
			
 
				     "Compile Model": "Compile Model",
			
 
				     "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compile the model can significantly reduce the inference time, but will increase cold start time",
			
--- a/fish_speech/i18n/locale/es_ES.json
+++ b/fish_speech/i18n/locale/es_ES.json
@@ -1,4 +1,5 @@
 
				 {
			
 
				+    "16-mixed is recommended for 10+ series GPU": "se recomienda 16-mixed para GPU de la serie 10+",
			
 
				     "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de audio de referencia, útil para especificar el hablante.",
			
 
				     "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).",
			
 
				     "Accumulate Gradient Batches": "Acumular lotes de gradientes",
			
@@ -6,7 +7,9 @@
 
				     "Added path successfully!": "¡Ruta agregada exitosamente!",
			
 
				     "Advanced Config": "Configuración Avanzada",
			
 
				     "Base LLAMA Model": "Modelo Base LLAMA",
			
 
				+    "Batch Inference": "Inferencia por Lote",
			
 
				     "Batch Size": "Tamaño del Lote",
			
 
				+    "Changing with the Model Path": "Cambiando con la Ruta del Modelo",
			
 
				     "Chinese": "Chino",
			
 
				     "Compile Model": "Compilar Modelo",
			
 
				     "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío",
			
--- a/fish_speech/i18n/locale/ja_JP.json
+++ b/fish_speech/i18n/locale/ja_JP.json
@@ -1,4 +1,5 @@
 
				 {
			
 
				+    "16-mixed is recommended for 10+ series GPU": "10シリーズ以降のGPUには16-mixedをお勧めします",
			
 
				     "5 to 10 seconds of reference audio, useful for specifying speaker.": "話者を指定するのに役立つ、5～10秒のリファレンスオーディオ。",
			
 
				     "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)が開発したVQ-GANとLlamaに基づくテキスト音声合成モデル。",
			
 
				     "Accumulate Gradient Batches": "勾配バッチの累積",
			
@@ -6,7 +7,9 @@
 
				     "Added path successfully!": "パスの追加に成功しました！",
			
 
				     "Advanced Config": "詳細設定",
			
 
				     "Base LLAMA Model": "基本LLAMAモデル",
			
 
				+    "Batch Inference": "バッチ推論",
			
 
				     "Batch Size": "バッチサイズ",
			
 
				+    "Changing with the Model Path": "モデルのパスに伴って変化する",
			
 
				     "Chinese": "中国語",
			
 
				     "Compile Model": "モデルのコンパイル",
			
 
				     "Compile the model can significantly reduce the inference time, but will increase cold start time": "モデルをコンパイルすると推論時間を大幅に短縮できますが、コールドスタート時間が長くなります",
			
--- a/fish_speech/i18n/locale/zh_CN.json
+++ b/fish_speech/i18n/locale/zh_CN.json
@@ -1,4 +1,5 @@
 
				 {
			
 
				+    "16-mixed is recommended for 10+ series GPU": "10+ 系列 GPU 建议使用 16-mixed",
			
 
				     "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 到 10 秒的参考音频，适用于指定音色。",
			
 
				     "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.",
			
 
				     "Accumulate Gradient Batches": "梯度累积批次",
			
@@ -6,7 +7,9 @@
 
				     "Added path successfully!": "添加路径成功!",
			
 
				     "Advanced Config": "高级参数",
			
 
				     "Base LLAMA Model": "基础 LLAMA 模型",
			
 
				+    "Batch Inference": "批量推理",
			
 
				     "Batch Size": "批次大小",
			
 
				+    "Changing with the Model Path": "随模型路径变化",
			
 
				     "Chinese": "中文",
			
 
				     "Compile Model": "编译模型",
			
 
				     "Compile the model can significantly reduce the inference time, but will increase cold start time": "编译模型可以显著减少推理时间，但会增加冷启动时间",
			
--- a/fish_speech/webui/manage.py
+++ b/fish_speech/webui/manage.py
@@ -16,14 +16,15 @@ import yaml
 
				 from loguru import logger
			
 
				 from tqdm import tqdm
			
 
				 
			
 
				-from fish_speech.i18n import i18n
			
 
				-from fish_speech.webui.launch_utils import Seafoam, is_module_installed, versions_html
			
 
				-
			
 
				 PYTHON = os.path.join(os.environ.get("PYTHON_FOLDERPATH", ""), "python")
			
 
				 sys.path.insert(0, "")
			
 
				 print(sys.path)
			
 
				 cur_work_dir = Path(os.getcwd()).resolve()
			
 
				 print("You are in ", str(cur_work_dir))
			
 
				+
			
 
				+from fish_speech.i18n import i18n
			
 
				+from fish_speech.webui.launch_utils import Seafoam, is_module_installed, versions_html
			
 
				+
			
 
				 config_path = cur_work_dir / "fish_speech" / "configs"
			
 
				 vqgan_yml_path = config_path / "vqgan_finetune.yaml"
			
 
				 llama_yml_path = config_path / "text2semantic_finetune.yaml"
			
@@ -131,6 +132,26 @@ def change_label(if_label):
 
				         yield build_html_ok_message("Nothing")
			
 
				 
			
 
				 
			
 
				+def change_decoder_config(decoder_model_path):
			
 
				+    if "vits" in decoder_model_path:
			
 
				+        choices = ["vits_decoder_finetune", "vits_decoder_pretrain"]
			
 
				+        return gr.Dropdown(choices=choices, value=choices[0])
			
 
				+    elif "vqgan" in decoder_model_path or "vq-gan" in decoder_model_path:
			
 
				+        choices = ["vqgan_finetune", "vqgan_pretrain"]
			
 
				+        return gr.Dropdown(choices=choices, value=choices[0])
			
 
				+    else:
			
 
				+        raise ValueError("Invalid decoder name")
			
 
				+
			
 
				+
			
 
				+def change_llama_config(llama_model_path):
			
 
				+    if "large" in llama_model_path:
			
 
				+        return gr.Dropdown(value="dual_ar_2_codebook_large", interactive=False)
			
 
				+    elif "medium" in llama_model_path:
			
 
				+        return gr.Dropdown(value="dual_ar_2_codebook_medium", interactive=False)
			
 
				+    else:
			
 
				+        raise ValueError("Invalid model size")
			
 
				+
			
 
				+
			
 
				 def clean_infer_cache():
			
 
				     import tempfile
			
 
				 
			
@@ -685,12 +706,25 @@ def fresh_tb_dir():
 
				 
			
 
				 
			
 
				 def list_decoder_models():
			
 
				-    return (
			
 
				+    paths = (
			
 
				         [str(p) for p in Path("checkpoints").glob("vits*.*")]
			
 
				         + [str(p) for p in Path("checkpoints").glob("vq*.*")]
			
 
				         + [str(p) for p in Path("results").glob("vqgan*/**/*.ckpt")]
			
 
				         + [str(p) for p in Path("results").glob("vits*/**/*.ckpt")]
			
 
				     )
			
 
				+    if not paths:
			
 
				+        logger.warning("No decoder model found")
			
 
				+    return paths
			
 
				+
			
 
				+
			
 
				+def list_llama_models():
			
 
				+    choices = [
			
 
				+        str(p).replace("\\", "/") for p in Path("checkpoints").glob("text2sem*.*")
			
 
				+    ]
			
 
				+    choices += [str(p) for p in Path("results").glob("text2sem*/**/*.ckpt")]
			
 
				+    if not choices:
			
 
				+        logger.warning("No LLaMA model found")
			
 
				+    return choices
			
 
				 
			
 
				 
			
 
				 def fresh_decoder_model():
			
@@ -720,10 +754,11 @@ def fresh_llama_ckpt():
 
				 
			
 
				 
			
 
				 def fresh_llama_model():
			
 
				-    return gr.Dropdown(
			
 
				-        choices=[init_llama_yml["ckpt_path"]]
			
 
				-        + [str(p) for p in Path("results").glob("text2sem*/**/*.ckpt")]
			
 
				-    )
			
 
				+    choices = [
			
 
				+        str(p).replace("\\", "/") for p in Path("checkpoints").glob("text2sem*.*")
			
 
				+    ]
			
 
				+    choices += [str(p) for p in Path("results").glob("text2sem*/**/*.ckpt")]
			
 
				+    return gr.Dropdown(choices=choices)
			
 
				 
			
 
				 
			
 
				 def llama_lora_merge(llama_weight, lora_llama_config, lora_weight, llama_lora_output):
			
@@ -1207,9 +1242,7 @@ with gr.Blocks(
 
				                                 )
			
 
				                                 infer_decoder_config = gr.Dropdown(
			
 
				                                     label=i18n("Decoder Model Config"),
			
 
				-                                    info=i18n(
			
 
				-                                        "Type the path or select from the dropdown"
			
 
				-                                    ),
			
 
				+                                    info=i18n("Changing with the Model Path"),
			
 
				                                     value="vits_decoder_finetune",
			
 
				                                     choices=[
			
 
				                                         "vits_decoder_finetune",
			
@@ -1226,20 +1259,12 @@ with gr.Blocks(
 
				                                         "Type the path or select from the dropdown"
			
 
				                                     ),
			
 
				                                     value=init_llama_yml["ckpt_path"],
			
 
				-                                    choices=[init_llama_yml["ckpt_path"]]
			
 
				-                                    + [
			
 
				-                                        str(p)
			
 
				-                                        for p in Path("results").glob(
			
 
				-                                            "text2sem*/**/*.ckpt"
			
 
				-                                        )
			
 
				-                                    ],
			
 
				+                                    choices=list_llama_models(),
			
 
				                                     allow_custom_value=True,
			
 
				                                 )
			
 
				                                 infer_llama_config = gr.Dropdown(
			
 
				                                     label=i18n("LLAMA Model Config"),
			
 
				-                                    info=i18n(
			
 
				-                                        "Type the path or select from the dropdown"
			
 
				-                                    ),
			
 
				+                                    info=i18n("Changing with the Model Path"),
			
 
				                                     choices=[
			
 
				                                         "dual_ar_2_codebook_large",
			
 
				                                         "dual_ar_2_codebook_medium",
			
@@ -1333,6 +1358,14 @@ with gr.Blocks(
 
				         'toolbar=no, menubar=no, scrollbars=no, resizable=no, location=no, status=no")}',
			
 
				     )
			
 
				     if_label.change(fn=change_label, inputs=[if_label], outputs=[error])
			
 
				+    infer_decoder_model.change(
			
 
				+        fn=change_decoder_config,
			
 
				+        inputs=[infer_decoder_model],
			
 
				+        outputs=[infer_decoder_config],
			
 
				+    )
			
 
				+    infer_llama_model.change(
			
 
				+        fn=change_llama_config, inputs=[infer_llama_model], outputs=[infer_llama_config]
			
 
				+    )
			
 
				     train_btn.click(
			
 
				         fn=train_process,
			
 
				         inputs=[