1 年間前 · 979b0e5523
--- a/fish_speech/webui/manage.py
+++ b/fish_speech/webui/manage.py
@@ -510,6 +510,10 @@ def train_process(
 
				             )
			
 
				         )
			
 
				         logger.info(project)
			
 
				+
			
 
				+        if llama_check_interval > llama_maxsteps:
			
 
				+            llama_check_interval = llama_maxsteps
			
 
				+
			
 
				         train_cmd = [
			
 
				             PYTHON,
			
 
				             "fish_speech/train.py",
			
@@ -800,7 +804,7 @@ with gr.Blocks(
 
				                                         "Use LoRA can save GPU memory, but may reduce the quality of the model"
			
 
				                                     ),
			
 
				                                     value=True,
			
 
				-                                    interactive=False,
			
 
				+                                    interactive=True,
			
 
				                                 )
			
 
				                                 llama_ckpt = gr.Dropdown(
			
 
				                                     label=i18n("Select LLAMA ckpt"),
			
@@ -816,19 +820,25 @@ with gr.Blocks(
 
				                             with gr.Row(equal_height=False):
			
 
				                                 llama_lr_slider = gr.Slider(
			
 
				                                     label=i18n("Initial Learning Rate"),
			
 
				+                                    info=i18n(
			
 
				+                                        "lr smaller -> usually train slower but more stable"
			
 
				+                                    ),
			
 
				                                     interactive=True,
			
 
				                                     minimum=1e-5,
			
 
				                                     maximum=1e-4,
			
 
				                                     step=1e-5,
			
 
				-                                    value=init_llama_yml["model"]["optimizer"]["lr"],
			
 
				+                                    value=5e-5,
			
 
				                                 )
			
 
				                                 llama_maxsteps_slider = gr.Slider(
			
 
				                                     label=i18n("Maximum Training Steps"),
			
 
				+                                    info=i18n(
			
 
				+                                        "recommend: max_steps = num_audios // batch_size * (2 to 5)"
			
 
				+                                    ),
			
 
				                                     interactive=True,
			
 
				-                                    minimum=50,
			
 
				+                                    minimum=1,
			
 
				                                     maximum=10000,
			
 
				-                                    step=50,
			
 
				-                                    value=init_llama_yml["trainer"]["max_steps"],
			
 
				+                                    step=1,
			
 
				+                                    value=50,
			
 
				                                 )
			
 
				                             with gr.Row(equal_height=False):
			
 
				                                 llama_base_config = gr.Dropdown(
			
@@ -841,13 +851,9 @@ with gr.Blocks(
 
				                                 llama_data_num_workers_slider = gr.Slider(
			
 
				                                     label=i18n("Number of Workers"),
			
 
				                                     minimum=1,
			
 
				-                                    maximum=16,
			
 
				+                                    maximum=32,
			
 
				                                     step=1,
			
 
				-                                    value=(
			
 
				-                                        init_llama_yml["data"]["num_workers"]
			
 
				-                                        if sys.platform == "linux"
			
 
				-                                        else 1
			
 
				-                                    ),
			
 
				+                                    value=4,
			
 
				                                 )
			
 
				                             with gr.Row(equal_height=False):
			
 
				                                 llama_data_batch_size_slider = gr.Slider(
			
@@ -856,7 +862,7 @@ with gr.Blocks(
 
				                                     minimum=1,
			
 
				                                     maximum=32,
			
 
				                                     step=1,
			
 
				-                                    value=init_llama_yml["data"]["batch_size"],
			
 
				+                                    value=4,
			
 
				                                 )
			
 
				                                 llama_data_max_length_slider = gr.Slider(
			
 
				                                     label=i18n("Maximum Length per Sample"),
			
@@ -864,7 +870,7 @@ with gr.Blocks(
 
				                                     minimum=1024,
			
 
				                                     maximum=4096,
			
 
				                                     step=128,
			
 
				-                                    value=init_llama_yml["max_length"],
			
 
				+                                    value=1024,
			
 
				                                 )
			
 
				                             with gr.Row(equal_height=False):
			
 
				                                 llama_precision_dropdown = gr.Dropdown(
			
@@ -878,13 +884,14 @@ with gr.Blocks(
 
				                                 )
			
 
				                                 llama_check_interval_slider = gr.Slider(
			
 
				                                     label=i18n("Save model every n steps"),
			
 
				+                                    info=i18n(
			
 
				+                                        "make sure that it's not greater than max_steps"
			
 
				+                                    ),
			
 
				                                     interactive=True,
			
 
				-                                    minimum=50,
			
 
				+                                    minimum=1,
			
 
				                                     maximum=1000,
			
 
				-                                    step=50,
			
 
				-                                    value=init_llama_yml["trainer"][
			
 
				-                                        "val_check_interval"
			
 
				-                                    ],
			
 
				+                                    step=1,
			
 
				+                                    value=50,
			
 
				                                 )
			
 
				                             with gr.Row(equal_height=False):
			
 
				                                 llama_grad_batches = gr.Slider(
			
--- a/start.bat
+++ b/start.bat
@@ -3,7 +3,11 @@ chcp 65001
 
				 
			
 
				 set USE_MIRROR=true
			
 
				 set PYTHONPATH=%~dp0
			
 
				-set PYTHON_CMD=%cd%\fishenv\env\python
			
 
				+set PYTHON_CMD=python
			
 
				+if exist "fishenv" (
			
 
				+    set PYTHON_CMD=%cd%\fishenv\env\python
			
 
				+)
			
 
				+
			
 
				 set API_FLAG_PATH=%~dp0API_FLAGS.txt
			
 
				 set KMP_DUPLICATE_LIB_OK=TRUE
			
 
				 
			
--- a/tools/auto_rerank.py
+++ b/tools/auto_rerank.py
@@ -40,13 +40,16 @@ def batch_asr_internal(model: WhisperModel, audios, sr):
 
				         assert audio.dim() == 1
			
 
				         audio_np = audio.numpy()
			
 
				         resampled_audio = librosa.resample(audio_np, orig_sr=sr, target_sr=16000)
			
 
				-        resampled_audios.append(torch.from_numpy(resampled_audio))
			
 
				+        resampled_audios.append(resampled_audio)
			
 
				 
			
 
				     trans_results = []
			
 
				 
			
 
				     for resampled_audio in resampled_audios:
			
 
				         segments, info = model.transcribe(
			
 
				-            resampled_audio.numpy(), language=None, beam_size=5
			
 
				+            resampled_audio,
			
 
				+            language=None,
			
 
				+            beam_size=5,
			
 
				+            initial_prompt="Punctuation is needed in any language.",
			
 
				         )
			
 
				         trans_results.append(list(segments))
			
 
				 
			
@@ -71,6 +74,7 @@ def batch_asr_internal(model: WhisperModel, audios, sr):
 
				             last_tr = tr
			
 
				             if max_gap > 3.0:
			
 
				                 huge_gap = True
			
 
				+                break
			
 
				 
			
 
				         sim_text = t2s_converter.convert(text)
			
 
				         results.append(
			
@@ -95,34 +99,37 @@ def is_chinese(text):
 
				     return True
			
 
				 
			
 
				 
			
 
				-def calculate_wer(text1, text2):
			
 
				-    # 将文本分割成字符列表
			
 
				+def calculate_wer(text1, text2, debug=False):
			
 
				     chars1 = remove_punctuation(text1)
			
 
				     chars2 = remove_punctuation(text2)
			
 
				 
			
 
				-    # 计算编辑距离
			
 
				     m, n = len(chars1), len(chars2)
			
 
				-    dp = [[0] * (n + 1) for _ in range(m + 1)]
			
 
				 
			
 
				-    for i in range(m + 1):
			
 
				-        dp[i][0] = i
			
 
				-    for j in range(n + 1):
			
 
				-        dp[0][j] = j
			
 
				+    if m > n:
			
 
				+        chars1, chars2 = chars2, chars1
			
 
				+        m, n = n, m
			
 
				 
			
 
				-    for i in range(1, m + 1):
			
 
				-        for j in range(1, n + 1):
			
 
				+    prev = list(range(m + 1))  # row 0 distance: [0, 1, 2, ...]
			
 
				+    curr = [0] * (m + 1)
			
 
				+
			
 
				+    for j in range(1, n + 1):
			
 
				+        curr[0] = j
			
 
				+        for i in range(1, m + 1):
			
 
				             if chars1[i - 1] == chars2[j - 1]:
			
 
				-                dp[i][j] = dp[i - 1][j - 1]
			
 
				+                curr[i] = prev[i - 1]
			
 
				             else:
			
 
				-                dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1
			
 
				+                curr[i] = min(prev[i], curr[i - 1], prev[i - 1]) + 1
			
 
				+        prev, curr = curr, prev
			
 
				 
			
 
				-    # WER
			
 
				-    edits = dp[m][n]
			
 
				+    edits = prev[m]
			
 
				     tot = max(len(chars1), len(chars2))
			
 
				     wer = edits / tot
			
 
				-    print("            gt:   ", chars1)
			
 
				-    print("          pred:   ", chars2)
			
 
				-    print(" edits/tot = wer: ", edits, "/", tot, "=", wer)
			
 
				+
			
 
				+    if debug:
			
 
				+        print("            gt:   ", chars1)
			
 
				+        print("          pred:   ", chars2)
			
 
				+        print(" edits/tot = wer: ", edits, "/", tot, "=", wer)
			
 
				+
			
 
				     return wer
			
 
				 
			
 
				 
			
--- a/tools/webui.py
+++ b/tools/webui.py
@@ -9,6 +9,7 @@ from functools import partial
 
				 from pathlib import Path
			
 
				 
			
 
				 import gradio as gr
			
 
				+import librosa
			
 
				 import numpy as np
			
 
				 import pyrootutils
			
 
				 import torch
			
@@ -323,6 +324,23 @@ def change_if_load_asr_model(if_load):
 
				         return gr.Checkbox(label="Load faster whisper model", value=if_load)
			
 
				 
			
 
				 
			
 
				+def change_if_auto_label(if_load, if_auto_label, enable_ref, ref_audio, ref_text):
			
 
				+    if if_load and asr_model is not None:
			
 
				+        if (
			
 
				+            if_auto_label
			
 
				+            and enable_ref
			
 
				+            and ref_audio is not None
			
 
				+            and ref_text.strip() == ""
			
 
				+        ):
			
 
				+            data, sample_rate = librosa.load(ref_audio)
			
 
				+            res = batch_asr(asr_model, [data], sample_rate)[0]
			
 
				+            ref_text = res["text"]
			
 
				+    else:
			
 
				+        gr.Warning("Whisper model not loaded!")
			
 
				+
			
 
				+    return gr.Textbox(value=ref_text)
			
 
				+
			
 
				+
			
 
				 def build_app():
			
 
				     with gr.Blocks(theme=gr.themes.Base()) as app:
			
 
				         gr.Markdown(HEADER_MD)
			
@@ -419,12 +437,19 @@ def build_app():
 
				                             label=i18n("Reference Audio"),
			
 
				                             type="filepath",
			
 
				                         )
			
 
				-                        reference_text = gr.Textbox(
			
 
				-                            label=i18n("Reference Text"),
			
 
				-                            placeholder=i18n("Reference Text"),
			
 
				-                            lines=1,
			
 
				-                            value="在一无所知中，梦里的一天结束了，一个新的「轮回」便会开始。",
			
 
				-                        )
			
 
				+                        with gr.Row():
			
 
				+                            if_auto_label = gr.Checkbox(
			
 
				+                                label=i18n("Auto Labeling"),
			
 
				+                                min_width=100,
			
 
				+                                scale=0,
			
 
				+                                value=False,
			
 
				+                            )
			
 
				+                            reference_text = gr.Textbox(
			
 
				+                                label=i18n("Reference Text"),
			
 
				+                                lines=1,
			
 
				+                                placeholder="在一无所知中，梦里的一天结束了，一个新的「轮回」便会开始。",
			
 
				+                                value="",
			
 
				+                            )
			
 
				                     with gr.Tab(label=i18n("Batch Inference")):
			
 
				                         batch_infer_num = gr.Slider(
			
 
				                             label="Batch infer nums",
			
@@ -479,6 +504,22 @@ def build_app():
 
				             outputs=[if_load_asr_model],
			
 
				         )
			
 
				 
			
 
				+        if_auto_label.change(
			
 
				+            fn=lambda: gr.Textbox(value=""),
			
 
				+            inputs=[],
			
 
				+            outputs=[reference_text],
			
 
				+        ).then(
			
 
				+            fn=change_if_auto_label,
			
 
				+            inputs=[
			
 
				+                if_load_asr_model,
			
 
				+                if_auto_label,
			
 
				+                enable_reference_audio,
			
 
				+                reference_audio,
			
 
				+                reference_text,
			
 
				+            ],
			
 
				+            outputs=[reference_text],
			
 
				+        )
			
 
				+
			
 
				         # # Submit
			
 
				         generate.click(
			
 
				             inference_wrapper,