2 лет назад · cee143d213
--- a/docs/en/index.md
+++ b/docs/en/index.md
@@ -18,7 +18,7 @@ We assume no responsibility for any illegal use of the codebase. Please refer to
 
															 This codebase is released under the `BSD-3-Clause` license, and all models are released under the CC-BY-NC-SA-4.0 license.
														
 
															 <p align="center">
														
 
															-   <img src="/docs/assets/figs/diagram.png" width="75%">
														
 
															+   <img src="../assets/figs/diagram.png" width="75%">
														
 
															 </p>
														
 
															 ## Requirements
														
@@ -63,7 +63,7 @@ Non-professional Windows users can consider the following methods to run the cod
 
															                   <li>After installing Visual Studio Installer, download Visual Studio Community 2022.</li>
														
 
															                   <li>Click the <code>Modify</code> button as shown below, find the <code>Desktop development with C++</code> option, and check it for download.</li>
														
 
															                   <p align="center">
														
 
															-                     <img src="/docs/assets/figs/VS_1.jpg" width="75%">
														
 
															+                     <img src="../assets/figs/VS_1.jpg" width="75%">
														
 
															                   </p>
														
 
															                </ul>
														
 
															             </li>
														
--- a/docs/ja/index.md
+++ b/docs/ja/index.md
@@ -18,7 +18,7 @@
 
															 このコードベースは `BSD-3-Clause` ライセンスの下でリリースされており、すべてのモデルは CC-BY-NC-SA-4.0 ライセンスの下でリリースされています。
														
 
															 <p align="center">
														
 
															-   <img src="/docs/assets/figs/diagram.png" width="75%">
														
 
															+   <img src="../assets/figs/diagram.png" width="75%">
														
 
															 </p>
														
 
															 ## 要件
														
@@ -63,7 +63,7 @@ Windows のプロユーザーは、コードベースを実行するために WS
 
															                   <li>Visual Studio Installerをインストールした後、Visual Studio Community 2022をダウンロードします。</li>
														
 
															                   <li>以下の図のように<code>Modify</code>ボタンをクリックし、<code>Desktop development with C++</code>オプションを見つけてチェックしてダウンロードします。</li>
														
 
															                   <p align="center">
														
 
															-                     <img src="/docs/assets/figs/VS_1.jpg" width="75%">
														
 
															+                     <img src="../assets/figs/VS_1.jpg" width="75%">
														
 
															                   </p>
														
 
															                </ul>
														
 
															             </li>
														
--- a/docs/zh/index.md
+++ b/docs/zh/index.md
@@ -18,7 +18,7 @@
 
															 此代码库根据 `BSD-3-Clause` 许可证发布, 所有模型根据 CC-BY-NC-SA-4.0 许可证发布.
														
 
															 <p align="center">
														
 
															-  <img src="https://s2.loli.net/2024/05/11/h9qSpRboTs5dGMQ.png" width="75%">
														
 
															+   <img src="../assets/figs/diagram.png" width="75%">
														
 
															 </p>
														
 
															 ## 要求
														
@@ -32,6 +32,7 @@ Windows 专业用户可以考虑 WSL2 或 docker 来运行代码库。
 
															 Windows 非专业用户可考虑以下为免 Linux 环境的基础运行方法（附带模型编译功能，即 `torch.compile`）：
														
 
															+
														
 
															 1. 解压项目压缩包。
														
 
															 2. 点击 `install_env.bat` 安装环境。
														
 
															     - 可以通过编辑 `install_env.bat` 的 `USE_MIRROR` 项来决定是否使用镜像站下载。
														
--- a/fish_speech/webui/manage.py
+++ b/fish_speech/webui/manage.py
@@ -251,7 +251,13 @@ def new_explorer(data_path, max_depth):
 
															     )
														
 
															-def add_item(folder: str, method: str, label_lang: str):
														
 
															+def add_item(
														
 
															+    folder: str,
														
 
															+    method: str,
														
 
															+    label_lang: str,
														
 
															+    if_initial_prompt: bool,
														
 
															+    initial_prompt: str | None,
														
 
															+):
														
 
															     folder = folder.strip(" ").strip('"')
														
 
															     folder_path = Path(folder)
														
@@ -260,7 +266,10 @@ def add_item(folder: str, method: str, label_lang: str):
 
															         if folder_path.is_dir():
														
 
															             items.append(folder)
														
 
															             dict_items[folder] = dict(
														
 
															-                type="folder", method=method, label_lang=label_lang
														
 
															+                type="folder",
														
 
															+                method=method,
														
 
															+                label_lang=label_lang,
														
 
															+                initial_prompt=initial_prompt if if_initial_prompt else None,
														
 
															             )
														
 
															         elif folder:
														
 
															             err = folder
														
@@ -269,7 +278,8 @@ def add_item(folder: str, method: str, label_lang: str):
 
															             )
														
 
															     formatted_data = json.dumps(dict_items, ensure_ascii=False, indent=4)
														
 
															-    logger.info(formatted_data)
														
 
															+    logger.info("After Adding: " + formatted_data)
														
 
															+    gr.Info(formatted_data)
														
 
															     return gr.Checkboxgroup(choices=items), build_html_ok_message(
														
 
															         i18n("Added path successfully!")
														
 
															     )
														
@@ -283,6 +293,7 @@ def remove_items(selected_items):
 
															     items = [item for item in items if item in dict_items.keys()]
														
 
															     formatted_data = json.dumps(dict_items, ensure_ascii=False, indent=4)
														
 
															     logger.info(formatted_data)
														
 
															+    gr.Warning("After Removing: " + formatted_data)
														
 
															     return gr.Checkboxgroup(choices=items, value=[]), build_html_ok_message(
														
 
															         i18n("Removed path successfully!")
														
 
															     )
														
@@ -351,6 +362,7 @@ def list_copy(list_file_path, method):
 
															 def check_files(data_path: str, max_depth: int, label_model: str, label_device: str):
														
 
															     global dict_items
														
 
															     data_path = Path(data_path)
														
 
															+    gr.Warning("Pre-processing begins...")
														
 
															     for item, content in dict_items.items():
														
 
															         item_path = Path(item)
														
 
															         tar_path = data_path / item_path.name
														
@@ -369,23 +381,31 @@ def check_files(data_path: str, max_depth: int, label_model: str, label_device:
 
															                     convert_to_mono_in_place(audio_path)
														
 
															             cur_lang = content["label_lang"]
														
 
															+            initial_prompt = content["initial_prompt"]
														
 
															+
														
 
															+            transcribe_cmd = [
														
 
															+                PYTHON,
														
 
															+                "tools/whisper_asr.py",
														
 
															+                "--model-size",
														
 
															+                label_model,
														
 
															+                "--device",
														
 
															+                label_device,
														
 
															+                "--audio-dir",
														
 
															+                tar_path,
														
 
															+                "--save-dir",
														
 
															+                tar_path,
														
 
															+                "--language",
														
 
															+                cur_lang,
														
 
															+            ]
														
 
															+
														
 
															+            if initial_prompt is not None:
														
 
															+                transcribe_cmd += ["--initial-prompt", initial_prompt]
														
 
															+
														
 
															             if cur_lang != "IGNORE":
														
 
															                 try:
														
 
															+                    gr.Warning("Begin To Transcribe")
														
 
															                     subprocess.run(
														
 
															-                        [
														
 
															-                            PYTHON,
														
 
															-                            "tools/whisper_asr.py",
														
 
															-                            "--model-size",
														
 
															-                            label_model,
														
 
															-                            "--device",
														
 
															-                            label_device,
														
 
															-                            "--audio-dir",
														
 
															-                            tar_path,
														
 
															-                            "--save-dir",
														
 
															-                            tar_path,
														
 
															-                            "--language",
														
 
															-                            cur_lang,
														
 
															-                        ],
														
 
															+                        transcribe_cmd,
														
 
															                         env=env,
														
 
															                     )
														
 
															                 except Exception:
														
@@ -408,8 +428,6 @@ def generate_folder_name():
 
															 def train_process(
														
 
															     data_path: str,
														
 
															     option: str,
														
 
															-    min_duration: float,
														
 
															-    max_duration: float,
														
 
															     # llama config
														
 
															     llama_ckpt,
														
 
															     llama_base_config,
														
@@ -428,13 +446,17 @@ def train_process(
 
															     backend = "nccl" if sys.platform == "linux" else "gloo"
														
 
															     new_project = generate_folder_name()
														
 
															-
														
 
															     print("New Project Name: ", new_project)
														
 
															-    if min_duration > max_duration:
														
 
															-        min_duration, max_duration = max_duration, min_duration
														
 
															+    if option == "VQGAN":
														
 
															+        msg = "Skipped VQGAN Training."
														
 
															+        gr.Warning(msg)
														
 
															+        logger.info(msg)
														
 
															     if option == "LLAMA":
														
 
															+        msg = "LLAMA Training begins..."
														
 
															+        gr.Warning(msg)
														
 
															+        logger.info(msg)
														
 
															         subprocess.run(
														
 
															             [
														
 
															                 PYTHON,
														
@@ -565,13 +587,16 @@ def list_llama_models():
 
															     choices = [str(p.parent) for p in Path("checkpoints").glob("merged*/*model*.pth")]
														
 
															     choices += [str(p.parent) for p in Path("checkpoints").glob("fish*/*model*.pth")]
														
 
															     choices += [str(p.parent) for p in Path("checkpoints").glob("fs*/*model*.pth")]
														
 
															+    choices = sorted(choices, reverse=True)
														
 
															     if not choices:
														
 
															         logger.warning("No LLaMA model found")
														
 
															     return choices
														
 
															 def list_lora_llama_models():
														
 
															-    choices = [str(p) for p in Path("results").glob("lora*/**/*.ckpt")]
														
 
															+    choices = sorted(
														
 
															+        [str(p) for p in Path("results").glob("lora*/**/*.ckpt")], reverse=True
														
 
															+    )
														
 
															     if not choices:
														
 
															         logger.warning("No LoRA LLaMA model found")
														
 
															     return choices
														
@@ -607,7 +632,7 @@ def llama_lora_merge(llama_weight, lora_llama_config, lora_weight, llama_lora_ou
 
															                 "Path error, please check the model file exists in the corresponding path"
														
 
															             )
														
 
															         )
														
 
															-
														
 
															+    gr.Warning("Merging begins...")
														
 
															     merge_cmd = [
														
 
															         PYTHON,
														
 
															         "tools/llama/merge_lora.py",
														
@@ -630,6 +655,9 @@ def llama_quantify(llama_weight, quantify_mode):
 
															                 "Path error, please check the model file exists in the corresponding path"
														
 
															             )
														
 
															         )
														
 
															+
														
 
															+    gr.Warning("Quantifying begins...")
														
 
															+
														
 
															     now = generate_folder_name()
														
 
															     quantify_cmd = [
														
 
															         PYTHON,
														
@@ -690,30 +718,6 @@ with gr.Blocks(
 
															                         if_label = gr.Checkbox(
														
 
															                             label=i18n("Open Labeler WebUI"), scale=0, show_label=True
														
 
															                         )
														
 
															-                with gr.Row():
														
 
															-                    min_duration = gr.Slider(
														
 
															-                        label=i18n("Minimum Audio Duration"),
														
 
															-                        value=1.5,
														
 
															-                        step=0.1,
														
 
															-                        minimum=0.4,
														
 
															-                        maximum=30,
														
 
															-                    )
														
 
															-                    max_duration = gr.Slider(
														
 
															-                        label=i18n("Maximum Audio Duration"),
														
 
															-                        value=30,
														
 
															-                        step=0.1,
														
 
															-                        minimum=0.4,
														
 
															-                        maximum=30,
														
 
															-                    )
														
 
															-
														
 
															-                with gr.Row():
														
 
															-                    add_button = gr.Button(
														
 
															-                        "\U000027A1 " + i18n("Add to Processing Area"),
														
 
															-                        variant="primary",
														
 
															-                    )
														
 
															-                    remove_button = gr.Button(
														
 
															-                        "\U000026D4 " + i18n("Remove Selected Data")
														
 
															-                    )
														
 
															                 with gr.Row():
														
 
															                     label_device = gr.Dropdown(
														
@@ -728,9 +732,9 @@ with gr.Blocks(
 
															                     label_model = gr.Dropdown(
														
 
															                         label=i18n("Whisper Model"),
														
 
															                         info=i18n("Faster Whisper, Up to 5g GPU memory usage"),
														
 
															-                        choices=["large-v3"],
														
 
															+                        choices=["large-v3", "medium"],
														
 
															                         value="large-v3",
														
 
															-                        interactive=False,
														
 
															+                        interactive=True,
														
 
															                     )
														
 
															                     label_radio = gr.Dropdown(
														
 
															                         label=i18n("Optional Label Language"),
														
@@ -738,9 +742,9 @@ with gr.Blocks(
 
															                             "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format"
														
 
															                         ),
														
 
															                         choices=[
														
 
															-                            (i18n("Chinese"), "ZH"),
														
 
															-                            (i18n("English"), "EN"),
														
 
															-                            (i18n("Japanese"), "JA"),
														
 
															+                            (i18n("Chinese"), "zh"),
														
 
															+                            (i18n("English"), "en"),
														
 
															+                            (i18n("Japanese"), "ja"),
														
 
															                             (i18n("Disabled"), "IGNORE"),
														
 
															                             (i18n("auto"), "auto"),
														
 
															                         ],
														
@@ -748,6 +752,31 @@ with gr.Blocks(
 
															                         interactive=True,
														
 
															                     )
														
 
															+                with gr.Row():
														
 
															+                    if_initial_prompt = gr.Checkbox(
														
 
															+                        value=False,
														
 
															+                        label=i18n("Enable Initial Prompt"),
														
 
															+                        min_width=120,
														
 
															+                        scale=0,
														
 
															+                    )
														
 
															+                    initial_prompt = gr.Textbox(
														
 
															+                        label=i18n("Initial Prompt"),
														
 
															+                        info=i18n(
														
 
															+                            "Initial prompt can provide contextual or vocabulary-specific guidance to the model."
														
 
															+                        ),
														
 
															+                        placeholder="This audio introduces the basic concepts and applications of artificial intelligence and machine learning.",
														
 
															+                        interactive=False,
														
 
															+                    )
														
 
															+
														
 
															+                with gr.Row():
														
 
															+                    add_button = gr.Button(
														
 
															+                        "\U000027A1 " + i18n("Add to Processing Area"),
														
 
															+                        variant="primary",
														
 
															+                    )
														
 
															+                    remove_button = gr.Button(
														
 
															+                        "\U000026D4 " + i18n("Remove Selected Data")
														
 
															+                    )
														
 
															+
														
 
															             with gr.Tab("\U0001F6E0 " + i18n("Training Configuration")):
														
 
															                 with gr.Row():
														
 
															                     model_type_radio = gr.Radio(
														
@@ -1103,7 +1132,7 @@ with gr.Blocks(
 
															     llama_page.select(lambda: "LLAMA", None, model_type_radio)
														
 
															     add_button.click(
														
 
															         fn=add_item,
														
 
															-        inputs=[textbox, output_radio, label_radio],
														
 
															+        inputs=[textbox, output_radio, label_radio, if_initial_prompt, initial_prompt],
														
 
															         outputs=[checkbox_group, error],
														
 
															     )
														
 
															     remove_button.click(
														
@@ -1116,14 +1145,16 @@ with gr.Blocks(
 
															         'toolbar=no, menubar=no, scrollbars=no, resizable=no, location=no, status=no")}',
														
 
															     )
														
 
															     if_label.change(fn=change_label, inputs=[if_label], outputs=[error])
														
 
															-
														
 
															+    if_initial_prompt.change(
														
 
															+        fn=lambda x: gr.Textbox(value="", interactive=x),
														
 
															+        inputs=[if_initial_prompt],
														
 
															+        outputs=[initial_prompt],
														
 
															+    )
														
 
															     train_btn.click(
														
 
															         fn=train_process,
														
 
															         inputs=[
														
 
															             train_box,
														
 
															             model_type_radio,
														
 
															-            min_duration,
														
 
															-            max_duration,
														
 
															             # llama config
														
 
															             llama_ckpt,
														
 
															             llama_base_config,
														
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -33,12 +33,12 @@ theme:
 
															       toggle:
														
 
															         icon: material/brightness-auto
														
 
															         name: Switch to light mode
														
 
															-  
														
 
															+
														
 
															     # Palette toggle for light mode
														
 
															     - media: "(prefers-color-scheme: light)"
														
 
															       scheme: default
														
 
															       toggle:
														
 
															-        icon: material/brightness-7 
														
 
															+        icon: material/brightness-7
														
 
															         name: Switch to dark mode
														
 
															       primary: black
														
 
															       font:
														
--- a/tools/whisper_asr.py
+++ b/tools/whisper_asr.py
@@ -54,7 +54,17 @@ from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files
 
															 )
														
 
															 @click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
														
 
															 @click.option("--language", default="auto", help="Language of the transcription")
														
 
															-def main(model_size, compute_type, audio_dir, save_dir, sample_rate, device, language):
														
 
															+@click.option("--initial-prompt", default=None, help="Initial prompt for transcribing")
														
 
															+def main(
														
 
															+    model_size,
														
 
															+    compute_type,
														
 
															+    audio_dir,
														
 
															+    save_dir,
														
 
															+    sample_rate,
														
 
															+    device,
														
 
															+    language,
														
 
															+    initial_prompt,
														
 
															+):
														
 
															     logger.info("Loading / Downloading Faster Whisper model...")
														
 
															     model = WhisperModel(
														
@@ -97,7 +107,10 @@ def main(model_size, compute_type, audio_dir, save_dir, sample_rate, device, lan
 
															         audio = AudioSegment.from_file(file_path)
														
 
															         segments, info = model.transcribe(
														
 
															-            file_path, beam_size=5, language=None if language == "auto" else language
														
 
															+            file_path,
														
 
															+            beam_size=5,
														
 
															+            language=None if language == "auto" else language,
														
 
															+            initial_prompt=initial_prompt,
														
 
															         )
														
 
															         print(