1 year ago · b8473a9ee5
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ ffmpeg.exe
 
				 asr-label-win-x64.exe
			
 
				 /.cache
			
 
				 /fishenv
			
 
				+/.locale
			
--- a/fish_speech/i18n/__init__.py
+++ b/fish_speech/i18n/__init__.py
@@ -0,0 +1,3 @@
 
				+from .core import i18n
			
 
				+
			
 
				+__all__ = ["i18n"]
			
--- a/fish_speech/i18n/core.py
+++ b/fish_speech/i18n/core.py
@@ -0,0 +1,40 @@
 
				+import json
			
 
				+import locale
			
 
				+from pathlib import Path
			
 
				+
			
 
				+I18N_FILE_PATH = Path(__file__).parent / "locale"
			
 
				+DEFAULT_LANGUAGE = "en_US"
			
 
				+
			
 
				+
			
 
				+def load_language_list(language):
			
 
				+    with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
			
 
				+        language_list = json.load(f)
			
 
				+
			
 
				+    return language_list
			
 
				+
			
 
				+
			
 
				+class I18nAuto:
			
 
				+    def __init__(self):
			
 
				+        i18n_file = Path(".locale")
			
 
				+
			
 
				+        if i18n_file.exists():
			
 
				+            with open(i18n_file, "r", encoding="utf-8") as f:
			
 
				+                language = f.read().strip()
			
 
				+        else:
			
 
				+            # getlocale can't identify the system's language ((None, None))
			
 
				+            language = locale.getdefaultlocale()[0]
			
 
				+
			
 
				+        if (I18N_FILE_PATH / f"{language}.json").exists() is False:
			
 
				+            language = DEFAULT_LANGUAGE
			
 
				+
			
 
				+        self.language = language
			
 
				+        self.language_map = load_language_list(language)
			
 
				+
			
 
				+    def __call__(self, key):
			
 
				+        return self.language_map.get(key, key)
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "Use Language: " + self.language
			
 
				+
			
 
				+
			
 
				+i18n = I18nAuto()
			
--- a/fish_speech/i18n/locale/en_US.json
+++ b/fish_speech/i18n/locale/en_US.json
@@ -0,0 +1,25 @@
 
				+{
			
 
				+    "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
			
 
				+    "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
			
 
				+    "Advanced Config": "Advanced Config",
			
 
				+    "Enable Reference Audio": "Enable Reference Audio",
			
 
				+    "Error Message": "Error Message",
			
 
				+    "Generate": "Generate",
			
 
				+    "Generated Audio": "Generated Audio",
			
 
				+    "Infer interface is closed": "Infer interface is closed",
			
 
				+    "Inferring interface is launched at {}": "Inferring interface is launched at {}",
			
 
				+    "Input Text": "Input Text",
			
 
				+    "Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
			
 
				+    "Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
			
 
				+    "Opened labeler in browser": "Opened labeler in browser",
			
 
				+    "Put your text here.": "Put your text here.",
			
 
				+    "Reference Audio": "Reference Audio",
			
 
				+    "Reference Text": "Reference Text",
			
 
				+    "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.",
			
 
				+    "Repetition Penalty": "Repetition Penalty",
			
 
				+    "Speaker": "Speaker",
			
 
				+    "Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.",
			
 
				+    "Type name of the speaker": "Type name of the speaker",
			
 
				+    "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.",
			
 
				+    "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1)."
			
 
				+}
			
--- a/fish_speech/i18n/locale/zh_CN.json
+++ b/fish_speech/i18n/locale/zh_CN.json
@@ -0,0 +1,25 @@
 
				+{
			
 
				+    "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 到 10 秒的参考音频，适用于指定音色。",
			
 
				+    "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.",
			
 
				+    "Advanced Config": "高级参数",
			
 
				+    "Enable Reference Audio": "启用参考音频",
			
 
				+    "Error Message": "错误信息",
			
 
				+    "Generate": "生成",
			
 
				+    "Generated Audio": "音频",
			
 
				+    "Infer interface is closed": "推理界面已关闭",
			
 
				+    "Inferring interface is launched at {}": "推理界面已在 {} 上启动",
			
 
				+    "Input Text": "输入文本",
			
 
				+    "Iterative Prompt Length, 0 means off": "迭代提示长度，0 表示关闭",
			
 
				+    "Maximum tokens per batch, 0 means no limit": "每批最大令牌数，0 表示无限制",
			
 
				+    "Opened labeler in browser": "在浏览器中打开标注工具",
			
 
				+    "Put your text here.": "在此处输入文本.",
			
 
				+    "Reference Audio": "参考音频",
			
 
				+    "Reference Text": "参考文本",
			
 
				+    "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "相关代码使用 BSD-3-Clause 许可证发布，权重使用 CC BY-NC-SA 4.0 许可证发布.",
			
 
				+    "Repetition Penalty": "重复惩罚",
			
 
				+    "Speaker": "说话人",
			
 
				+    "Text is too long, please keep it under {} characters.": "文本太长，请保持在 {} 个字符以内.",
			
 
				+    "Type name of the speaker": "输入说话人的名称",
			
 
				+    "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "我们不对模型的任何滥用负责，请在使用之前考虑您当地的法律法规.",
			
 
				+    "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型."
			
 
				+}
			
--- a/fish_speech/i18n/scan.py
+++ b/fish_speech/i18n/scan.py
@@ -0,0 +1,122 @@
 
				+import ast
			
 
				+import glob
			
 
				+import json
			
 
				+from collections import OrderedDict
			
 
				+from pathlib import Path
			
 
				+
			
 
				+from loguru import logger
			
 
				+
			
 
				+from .core import DEFAULT_LANGUAGE, I18N_FILE_PATH
			
 
				+
			
 
				+
			
 
				+def extract_i18n_strings(node):
			
 
				+    i18n_strings = []
			
 
				+
			
 
				+    if (
			
 
				+        isinstance(node, ast.Call)
			
 
				+        and isinstance(node.func, ast.Name)
			
 
				+        and node.func.id == "i18n"
			
 
				+    ):
			
 
				+        for arg in node.args:
			
 
				+            if isinstance(arg, ast.Str):
			
 
				+                i18n_strings.append(arg.s)
			
 
				+
			
 
				+    for child_node in ast.iter_child_nodes(node):
			
 
				+        i18n_strings.extend(extract_i18n_strings(child_node))
			
 
				+
			
 
				+    return i18n_strings
			
 
				+
			
 
				+
			
 
				+# scan the directory for all .py files (recursively)
			
 
				+# for each file, parse the code into an AST
			
 
				+# for each AST, extract the i18n strings
			
 
				+
			
 
				+strings = []
			
 
				+folders = ["fish_speech", "tools"]
			
 
				+# for filename in glob.iglob("**/*.py", recursive=True):
			
 
				+for folder in folders:
			
 
				+    for f in Path(folder).rglob("*.py"):
			
 
				+        code = f.read_text(encoding="utf-8")
			
 
				+        if "i18n(" in code:
			
 
				+            tree = ast.parse(code)
			
 
				+            i18n_strings = extract_i18n_strings(tree)
			
 
				+            logger.info(f"Found {len(i18n_strings)} i18n strings in {f}")
			
 
				+            strings.extend(i18n_strings)
			
 
				+
			
 
				+code_keys = set(strings)
			
 
				+logger.info(f"Total unique: {len(code_keys)}")
			
 
				+
			
 
				+
			
 
				+standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
			
 
				+with open(standard_file, "r", encoding="utf-8") as f:
			
 
				+    standard_data = json.load(f, object_pairs_hook=OrderedDict)
			
 
				+standard_keys = set(standard_data.keys())
			
 
				+
			
 
				+# Define the standard file name
			
 
				+unused_keys = standard_keys - code_keys
			
 
				+logger.info(f"Found {len(unused_keys)} unused keys in {standard_file}")
			
 
				+for unused_key in unused_keys:
			
 
				+    logger.info(f"\t{unused_key}")
			
 
				+
			
 
				+missing_keys = code_keys - standard_keys
			
 
				+logger.info(f"Found {len(missing_keys)} missing keys in {standard_file}")
			
 
				+for missing_key in missing_keys:
			
 
				+    logger.info(f"\t{missing_key}")
			
 
				+
			
 
				+code_keys_dict = OrderedDict()
			
 
				+for s in strings:
			
 
				+    code_keys_dict[s] = s
			
 
				+
			
 
				+# write back
			
 
				+with open(standard_file, "w", encoding="utf-8") as f:
			
 
				+    json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
			
 
				+    f.write("\n")
			
 
				+
			
 
				+logger.info(f"Updated {standard_file}")
			
 
				+
			
 
				+
			
 
				+# Define the standard file name
			
 
				+standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
			
 
				+
			
 
				+# Find all JSON files in the directory
			
 
				+dir_path = I18N_FILE_PATH
			
 
				+languages = [f for f in dir_path.glob("*.json") if f.stem != DEFAULT_LANGUAGE]
			
 
				+
			
 
				+# Load the standard file
			
 
				+with open(standard_file, "r", encoding="utf-8") as f:
			
 
				+    standard_data = json.load(f, object_pairs_hook=OrderedDict)
			
 
				+
			
 
				+# Loop through each language file
			
 
				+for lang_file in languages:
			
 
				+    # Load the language file
			
 
				+    with open(lang_file, "r", encoding="utf-8") as f:
			
 
				+        lang_data = json.load(f, object_pairs_hook=OrderedDict)
			
 
				+
			
 
				+    # Find the difference between the language file and the standard file
			
 
				+    diff = set(standard_data.keys()) - set(lang_data.keys())
			
 
				+
			
 
				+    miss = set(lang_data.keys()) - set(standard_data.keys())
			
 
				+
			
 
				+    # Add any missing keys to the language file
			
 
				+    for key in diff:
			
 
				+        lang_data[key] = "#!" + key
			
 
				+        logger.info(f"Added missing key: {key} to {lang_file}")
			
 
				+
			
 
				+    # Del any extra keys to the language file
			
 
				+    for key in miss:
			
 
				+        del lang_data[key]
			
 
				+        logger.info(f"Del extra key: {key} from {lang_file}")
			
 
				+
			
 
				+    # Sort the keys of the language file to match the order of the standard file
			
 
				+    lang_data = OrderedDict(
			
 
				+        sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
			
 
				+    )
			
 
				+
			
 
				+    # Save the updated language file
			
 
				+    with open(lang_file, "w", encoding="utf-8") as f:
			
 
				+        json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
			
 
				+        f.write("\n")
			
 
				+
			
 
				+    logger.info(f"Updated {lang_file}")
			
 
				+
			
 
				+logger.info("Done")
			
--- a/fish_speech/webui/manage.py
+++ b/fish_speech/webui/manage.py
@@ -17,6 +17,7 @@ import yaml
 
				 from loguru import logger
			
 
				 from tqdm import tqdm
			
 
				 
			
 
				+from fish_speech.i18n import i18n
			
 
				 from fish_speech.webui.launch_utils import Seafoam, versions_html
			
 
				 
			
 
				 PYTHON = os.path.join(os.environ.get("PYTHON_FOLDERPATH", ""), "python")
			
@@ -97,7 +98,7 @@ def change_label(if_label):
 
				         # 设置要访问的URL
			
 
				         url = "https://text-labeler.pages.dev/"
			
 
				         webbrowser.open(url)
			
 
				-        yield f"已打开网址"
			
 
				+        yield i18n("Opened labeler in browser")
			
 
				     elif if_label == False:
			
 
				         p_label = None
			
 
				         yield "Nothing"
			
@@ -119,7 +120,10 @@ def change_infer(
 
				         env["GRADIO_SERVER_NAME"] = host
			
 
				         env["GRADIO_SERVER_PORT"] = port
			
 
				         # 启动第二个进程
			
 
				-        yield build_html_ok_message(f"推理界面已开启, 访问 http://{host}:{port}")
			
 
				+        url = f"http://{host}:{port}"
			
 
				+        yield build_html_ok_message(
			
 
				+            i18n("Inferring interface is launched at {}").format(url)
			
 
				+        )
			
 
				         p_infer = subprocess.Popen(
			
 
				             [
			
 
				                 PYTHON,
			
@@ -140,7 +144,7 @@ def change_infer(
 
				     elif if_infer == False and p_infer != None:
			
 
				         kill_process(p_infer.pid)
			
 
				         p_infer = None
			
 
				-        yield build_html_error_message("推理界面已关闭")
			
 
				+        yield build_html_error_message(i18n("Infer interface is closed"))
			
 
				 
			
 
				 
			
 
				 js = load_data_in_raw("fish_speech/webui/js/animate.js")
			
--- a/tools/webui.py
+++ b/tools/webui.py
@@ -12,6 +12,7 @@ import torch
 
				 from loguru import logger
			
 
				 from transformers import AutoTokenizer
			
 
				 
			
 
				+from fish_speech.i18n import i18n
			
 
				 from tools.llama.generate import launch_thread_safe_queue
			
 
				 from tools.vqgan.inference import load_model as load_vqgan_model
			
 
				 
			
@@ -19,22 +20,18 @@ from tools.vqgan.inference import load_model as load_vqgan_model
 
				 os.environ["EINX_FILTER_TRACEBACK"] = "false"
			
 
				 
			
 
				 
			
 
				-HEADER_MD = """# Fish Speech
			
 
				+HEADER_MD = f"""# Fish Speech
			
 
				 
			
 
				-A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).  
			
 
				-由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成. 
			
 
				+{i18n("A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).")}  
			
 
				 
			
 
				-You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).  
			
 
				-你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.  
			
 
				+{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).")}  
			
 
				 
			
 
				-Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.  
			
 
				-相关代码使用 BSD-3-Clause 许可证发布，权重使用 CC BY-NC-SA 4.0 许可证发布.
			
 
				+{i18n("Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.")}  
			
 
				 
			
 
				-We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.  
			
 
				-我们不对模型的任何滥用负责，请在使用之前考虑您当地的法律法规.
			
 
				+{i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")}  
			
 
				 """
			
 
				 
			
 
				-TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
			
 
				+TEXTBOX_PLACEHOLDER = i18n("Put your text here.")
			
 
				 
			
 
				 try:
			
 
				     import spaces
			
@@ -76,7 +73,9 @@ def inference(
 
				     if args.max_gradio_length > 0 and len(text) > args.max_gradio_length:
			
 
				         return (
			
 
				             None,
			
 
				-            f"Text is too long, please keep it under {args.max_gradio_length} characters.",
			
 
				+            i18n("Text is too long, please keep it under {} characters.").format(
			
 
				+                args.max_gradio_length
			
 
				+            ),
			
 
				         )
			
 
				 
			
 
				     # Parse reference audio aka prompt
			
@@ -171,13 +170,13 @@ def build_app():
 
				         with gr.Row():
			
 
				             with gr.Column(scale=3):
			
 
				                 text = gr.Textbox(
			
 
				-                    label="Input Text / 输入文本", placeholder=TEXTBOX_PLACEHOLDER, lines=15
			
 
				+                    label=i18n("Input Text"), placeholder=TEXTBOX_PLACEHOLDER, lines=15
			
 
				                 )
			
 
				 
			
 
				                 with gr.Row():
			
 
				-                    with gr.Tab(label="Advanced Config / 高级参数"):
			
 
				+                    with gr.Tab(label=i18n("Advanced Config")):
			
 
				                         chunk_length = gr.Slider(
			
 
				-                            label="Iterative Prompt Length, 0 means off / 迭代提示长度，0 表示关闭",
			
 
				+                            label=i18n("Iterative Prompt Length, 0 means off"),
			
 
				                             minimum=0,
			
 
				                             maximum=500,
			
 
				                             value=30,
			
@@ -185,7 +184,7 @@ def build_app():
 
				                         )
			
 
				 
			
 
				                         max_new_tokens = gr.Slider(
			
 
				-                            label="Maximum tokens per batch, 0 means no limit / 每批最大令牌数，0 表示无限制",
			
 
				+                            label=i18n("Maximum tokens per batch, 0 means no limit"),
			
 
				                             minimum=0,
			
 
				                             maximum=args.max_length,
			
 
				                             value=0,  # 0 means no limit
			
@@ -201,7 +200,7 @@ def build_app():
 
				                         )
			
 
				 
			
 
				                         repetition_penalty = gr.Slider(
			
 
				-                            label="Repetition Penalty",
			
 
				+                            label=i18n("Repetition Penalty"),
			
 
				                             minimum=0,
			
 
				                             maximum=2,
			
 
				                             value=1.5,
			
@@ -217,40 +216,42 @@ def build_app():
 
				                         )
			
 
				 
			
 
				                         speaker = gr.Textbox(
			
 
				-                            label="Speaker / 说话人",
			
 
				-                            placeholder="Type name of the speaker / 输入说话人的名称",
			
 
				+                            label=i18n("Speaker"),
			
 
				+                            placeholder=i18n("Type name of the speaker"),
			
 
				                             lines=1,
			
 
				                         )
			
 
				 
			
 
				-                    with gr.Tab(label="Reference Audio / 参考音频"):
			
 
				+                    with gr.Tab(label=i18n("Reference Audio")):
			
 
				                         gr.Markdown(
			
 
				-                            "5 to 10 seconds of reference audio, useful for specifying speaker. \n5 到 10 秒的参考音频，适用于指定音色。"
			
 
				+                            i18n(
			
 
				+                                "5 to 10 seconds of reference audio, useful for specifying speaker."
			
 
				+                            )
			
 
				                         )
			
 
				 
			
 
				                         enable_reference_audio = gr.Checkbox(
			
 
				-                            label="Enable Reference Audio / 启用参考音频",
			
 
				+                            label=i18n("Enable Reference Audio"),
			
 
				                         )
			
 
				                         reference_audio = gr.Audio(
			
 
				-                            label="Reference Audio / 参考音频",
			
 
				+                            label=i18n("Reference Audio"),
			
 
				                             type="filepath",
			
 
				                         )
			
 
				                         reference_text = gr.Textbox(
			
 
				-                            label="Reference Text / 参考文本",
			
 
				-                            placeholder="参考文本",
			
 
				+                            label=i18n("Reference Text"),
			
 
				+                            placeholder=i18n("Reference Text"),
			
 
				                             lines=1,
			
 
				                             value="在一无所知中，梦里的一天结束了，一个新的「轮回」便会开始。",
			
 
				                         )
			
 
				 
			
 
				             with gr.Column(scale=3):
			
 
				                 with gr.Row():
			
 
				-                    error = gr.HTML(label="Error Message / 错误信息")
			
 
				+                    error = gr.HTML(label=i18n("Error Message"))
			
 
				                 with gr.Row():
			
 
				-                    audio = gr.Audio(label="Generated Audio / 音频", type="numpy")
			
 
				+                    audio = gr.Audio(label=i18n("Generated Audio"), type="numpy")
			
 
				 
			
 
				                 with gr.Row():
			
 
				                     with gr.Column(scale=3):
			
 
				                         generate = gr.Button(
			
 
				-                            value="\U0001F3A7 Generate / 合成", variant="primary"
			
 
				+                            value="\U0001F3A7 " + i18n("Generate"), variant="primary"
			
 
				                         )
			
 
				 
			
 
				         # # Submit