|
@@ -28,7 +28,6 @@ from fish_speech.webui.launch_utils import Seafoam, is_module_installed, version
|
|
|
config_path = cur_work_dir / "fish_speech" / "configs"
|
|
config_path = cur_work_dir / "fish_speech" / "configs"
|
|
|
vqgan_yml_path = config_path / "firefly_gan_vq.yaml"
|
|
vqgan_yml_path = config_path / "firefly_gan_vq.yaml"
|
|
|
llama_yml_path = config_path / "text2semantic_finetune.yaml"
|
|
llama_yml_path = config_path / "text2semantic_finetune.yaml"
|
|
|
-vits_yml_path = config_path / "vits_decoder_finetune.yaml"
|
|
|
|
|
|
|
|
|
|
env = os.environ.copy()
|
|
env = os.environ.copy()
|
|
|
env["no_proxy"] = "127.0.0.1, localhost, 0.0.0.0"
|
|
env["no_proxy"] = "127.0.0.1, localhost, 0.0.0.0"
|
|
@@ -132,26 +131,6 @@ def change_label(if_label):
|
|
|
yield build_html_ok_message("Nothing")
|
|
yield build_html_ok_message("Nothing")
|
|
|
|
|
|
|
|
|
|
|
|
|
-def change_decoder_config(decoder_model_path):
|
|
|
|
|
- if "vits" in decoder_model_path:
|
|
|
|
|
- choices = ["vits_decoder_finetune", "vits_decoder_pretrain"]
|
|
|
|
|
- return gr.Dropdown(choices=choices, value=choices[0])
|
|
|
|
|
- elif "vqgan" in decoder_model_path or "vq-gan" in decoder_model_path:
|
|
|
|
|
- choices = ["firefly_gan_vq", "firefly_gan_vq"]
|
|
|
|
|
- return gr.Dropdown(choices=choices, value=choices[0])
|
|
|
|
|
- else:
|
|
|
|
|
- raise ValueError("Invalid decoder name")
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-def change_llama_config(llama_model_path):
|
|
|
|
|
- if "large" in llama_model_path:
|
|
|
|
|
- return gr.Dropdown(value="dual_ar_2_codebook_large", interactive=False)
|
|
|
|
|
- elif "medium" in llama_model_path:
|
|
|
|
|
- return gr.Dropdown(value="dual_ar_2_codebook_medium", interactive=False)
|
|
|
|
|
- else:
|
|
|
|
|
- raise ValueError("Invalid model size")
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
def clean_infer_cache():
|
|
def clean_infer_cache():
|
|
|
import tempfile
|
|
import tempfile
|
|
|
|
|
|
|
@@ -175,7 +154,6 @@ def change_infer(
|
|
|
infer_decoder_model,
|
|
infer_decoder_model,
|
|
|
infer_decoder_config,
|
|
infer_decoder_config,
|
|
|
infer_llama_model,
|
|
infer_llama_model,
|
|
|
- infer_llama_config,
|
|
|
|
|
infer_compile,
|
|
infer_compile,
|
|
|
):
|
|
):
|
|
|
global p_infer
|
|
global p_infer
|
|
@@ -202,10 +180,8 @@ def change_infer(
|
|
|
infer_decoder_config,
|
|
infer_decoder_config,
|
|
|
"--llama-checkpoint-path",
|
|
"--llama-checkpoint-path",
|
|
|
infer_llama_model,
|
|
infer_llama_model,
|
|
|
- "--llama-config-name",
|
|
|
|
|
- infer_llama_config,
|
|
|
|
|
"--tokenizer",
|
|
"--tokenizer",
|
|
|
- "checkpoints",
|
|
|
|
|
|
|
+ "checkpoints/fish-speech-1.2",
|
|
|
]
|
|
]
|
|
|
+ (["--compile"] if infer_compile == "Yes" else []),
|
|
+ (["--compile"] if infer_compile == "Yes" else []),
|
|
|
env=env,
|
|
env=env,
|
|
@@ -429,24 +405,6 @@ def train_process(
|
|
|
option: str,
|
|
option: str,
|
|
|
min_duration: float,
|
|
min_duration: float,
|
|
|
max_duration: float,
|
|
max_duration: float,
|
|
|
- # vq-gan config
|
|
|
|
|
- vqgan_ckpt,
|
|
|
|
|
- vqgan_lr,
|
|
|
|
|
- vqgan_maxsteps,
|
|
|
|
|
- vqgan_data_num_workers,
|
|
|
|
|
- vqgan_data_batch_size,
|
|
|
|
|
- vqgan_data_val_batch_size,
|
|
|
|
|
- vqgan_precision,
|
|
|
|
|
- vqgan_check_interval,
|
|
|
|
|
- # vits config
|
|
|
|
|
- vits_ckpt,
|
|
|
|
|
- vits_lr,
|
|
|
|
|
- vits_maxsteps,
|
|
|
|
|
- vits_data_num_workers,
|
|
|
|
|
- vits_data_batch_size,
|
|
|
|
|
- vits_data_val_batch_size,
|
|
|
|
|
- vits_precision,
|
|
|
|
|
- vits_check_interval,
|
|
|
|
|
# llama config
|
|
# llama config
|
|
|
llama_ckpt,
|
|
llama_ckpt,
|
|
|
llama_base_config,
|
|
llama_base_config,
|
|
@@ -477,108 +435,6 @@ def train_process(
|
|
|
if min_duration > max_duration:
|
|
if min_duration > max_duration:
|
|
|
min_duration, max_duration = max_duration, min_duration
|
|
min_duration, max_duration = max_duration, min_duration
|
|
|
|
|
|
|
|
- if option == "VQGAN" or option == "VITS":
|
|
|
|
|
- subprocess.run(
|
|
|
|
|
- [
|
|
|
|
|
- PYTHON,
|
|
|
|
|
- "tools/vqgan/create_train_split.py",
|
|
|
|
|
- str(data_pre_output.relative_to(cur_work_dir)),
|
|
|
|
|
- "--min-duration",
|
|
|
|
|
- str(min_duration),
|
|
|
|
|
- "--max-duration",
|
|
|
|
|
- str(max_duration),
|
|
|
|
|
- ]
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- if option == "VQGAN":
|
|
|
|
|
- latest = next(
|
|
|
|
|
- iter(
|
|
|
|
|
- sorted(
|
|
|
|
|
- [
|
|
|
|
|
- str(p.relative_to("results"))
|
|
|
|
|
- for p in Path("results").glob("vqgan_*/")
|
|
|
|
|
- ],
|
|
|
|
|
- reverse=True,
|
|
|
|
|
- )
|
|
|
|
|
- ),
|
|
|
|
|
- ("vqgan_" + new_project),
|
|
|
|
|
- )
|
|
|
|
|
- project = (
|
|
|
|
|
- ("vqgan_" + new_project)
|
|
|
|
|
- if vqgan_ckpt == i18n("new")
|
|
|
|
|
- else (
|
|
|
|
|
- latest
|
|
|
|
|
- if vqgan_ckpt == i18n("latest")
|
|
|
|
|
- else Path(vqgan_ckpt).relative_to("results")
|
|
|
|
|
- )
|
|
|
|
|
- )
|
|
|
|
|
- logger.info(project)
|
|
|
|
|
- train_cmd = [
|
|
|
|
|
- PYTHON,
|
|
|
|
|
- "fish_speech/train.py",
|
|
|
|
|
- "--config-name",
|
|
|
|
|
- "firefly_gan_vq",
|
|
|
|
|
- f"project={project}",
|
|
|
|
|
- f"trainer.strategy.process_group_backend={backend}",
|
|
|
|
|
- f"model.optimizer.lr={vqgan_lr}",
|
|
|
|
|
- f"trainer.max_steps={vqgan_maxsteps}",
|
|
|
|
|
- f"data.num_workers={vqgan_data_num_workers}",
|
|
|
|
|
- f"data.batch_size={vqgan_data_batch_size}",
|
|
|
|
|
- f"data.val_batch_size={vqgan_data_val_batch_size}",
|
|
|
|
|
- f"trainer.precision={vqgan_precision}",
|
|
|
|
|
- f"trainer.val_check_interval={vqgan_check_interval}",
|
|
|
|
|
- f"train_dataset.filelist={str(data_pre_output / 'vq_train_filelist.txt')}",
|
|
|
|
|
- f"val_dataset.filelist={str(data_pre_output / 'vq_val_filelist.txt')}",
|
|
|
|
|
- ]
|
|
|
|
|
- logger.info(train_cmd)
|
|
|
|
|
- subprocess.run(train_cmd)
|
|
|
|
|
-
|
|
|
|
|
- if option == "VITS":
|
|
|
|
|
- latest = next(
|
|
|
|
|
- iter(
|
|
|
|
|
- sorted(
|
|
|
|
|
- [
|
|
|
|
|
- str(p.relative_to("results"))
|
|
|
|
|
- for p in Path("results").glob("vits_*/")
|
|
|
|
|
- ],
|
|
|
|
|
- reverse=True,
|
|
|
|
|
- )
|
|
|
|
|
- ),
|
|
|
|
|
- ("vits_" + new_project),
|
|
|
|
|
- )
|
|
|
|
|
- project = (
|
|
|
|
|
- ("vits_" + new_project)
|
|
|
|
|
- if vits_ckpt == i18n("new")
|
|
|
|
|
- else (
|
|
|
|
|
- latest
|
|
|
|
|
- if vits_ckpt == i18n("latest")
|
|
|
|
|
- else Path(vits_ckpt).relative_to("results")
|
|
|
|
|
- )
|
|
|
|
|
- )
|
|
|
|
|
- ckpt_path = str(Path("checkpoints/vits_decoder_v1.1.ckpt"))
|
|
|
|
|
- logger.info(project)
|
|
|
|
|
- train_cmd = [
|
|
|
|
|
- PYTHON,
|
|
|
|
|
- "fish_speech/train.py",
|
|
|
|
|
- "--config-name",
|
|
|
|
|
- "vits_decoder_finetune",
|
|
|
|
|
- f"project={project}",
|
|
|
|
|
- f"ckpt_path={ckpt_path}",
|
|
|
|
|
- f"trainer.strategy.process_group_backend={backend}",
|
|
|
|
|
- "tokenizer.pretrained_model_name_or_path=checkpoints",
|
|
|
|
|
- f"model.optimizer.lr={vits_lr}",
|
|
|
|
|
- f"trainer.max_steps={vits_maxsteps}",
|
|
|
|
|
- f"data.num_workers={vits_data_num_workers}",
|
|
|
|
|
- f"data.batch_size={vits_data_batch_size}",
|
|
|
|
|
- f"data.val_batch_size={vits_data_val_batch_size}",
|
|
|
|
|
- f"trainer.precision={vits_precision}",
|
|
|
|
|
- f"trainer.val_check_interval={vits_check_interval}",
|
|
|
|
|
- f"train_dataset.filelist={str(data_pre_output / 'vq_train_filelist.txt')}",
|
|
|
|
|
- f"val_dataset.filelist={str(data_pre_output / 'vq_val_filelist.txt')}",
|
|
|
|
|
- ]
|
|
|
|
|
- logger.info(train_cmd)
|
|
|
|
|
- subprocess.run(train_cmd)
|
|
|
|
|
-
|
|
|
|
|
if option == "LLAMA":
|
|
if option == "LLAMA":
|
|
|
subprocess.run(
|
|
subprocess.run(
|
|
|
[
|
|
[
|
|
@@ -708,12 +564,9 @@ def fresh_tb_dir():
|
|
|
|
|
|
|
|
|
|
|
|
|
def list_decoder_models():
|
|
def list_decoder_models():
|
|
|
- paths = (
|
|
|
|
|
- [str(p) for p in Path("checkpoints").glob("vits*.*")]
|
|
|
|
|
- + [str(p) for p in Path("checkpoints").glob("vq*.*")]
|
|
|
|
|
- + [str(p) for p in Path("results").glob("vqgan*/**/*.ckpt")]
|
|
|
|
|
- + [str(p) for p in Path("results").glob("vits*/**/*.ckpt")]
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ paths = [str(p) for p in Path("checkpoints").glob("vq*.*")] + [
|
|
|
|
|
+ str(p) for p in Path("results").glob("vqgan*/**/*.ckpt")
|
|
|
|
|
+ ]
|
|
|
if not paths:
|
|
if not paths:
|
|
|
logger.warning("No decoder model found")
|
|
logger.warning("No decoder model found")
|
|
|
return paths
|
|
return paths
|
|
@@ -740,20 +593,6 @@ def fresh_decoder_model():
|
|
|
return gr.Dropdown(choices=list_decoder_models())
|
|
return gr.Dropdown(choices=list_decoder_models())
|
|
|
|
|
|
|
|
|
|
|
|
|
-def fresh_vqgan_ckpt():
|
|
|
|
|
- return gr.Dropdown(
|
|
|
|
|
- choices=[i18n("latest"), i18n("new")]
|
|
|
|
|
- + [str(p) for p in Path("results").glob("vqgan_*/")]
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-def fresh_vits_ckpt():
|
|
|
|
|
- return gr.Dropdown(
|
|
|
|
|
- choices=[i18n("latest"), i18n("new")]
|
|
|
|
|
- + [str(p) for p in Path("results").glob("vits_*/")]
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
def fresh_llama_ckpt(llama_use_lora):
|
|
def fresh_llama_ckpt(llama_use_lora):
|
|
|
return gr.Dropdown(
|
|
return gr.Dropdown(
|
|
|
choices=[i18n("latest"), i18n("new")]
|
|
choices=[i18n("latest"), i18n("new")]
|
|
@@ -806,7 +645,6 @@ def llama_lora_merge(llama_weight, lora_llama_config, lora_weight, llama_lora_ou
|
|
|
|
|
|
|
|
init_vqgan_yml = load_yaml_data_in_fact(vqgan_yml_path)
|
|
init_vqgan_yml = load_yaml_data_in_fact(vqgan_yml_path)
|
|
|
init_llama_yml = load_yaml_data_in_fact(llama_yml_path)
|
|
init_llama_yml = load_yaml_data_in_fact(llama_yml_path)
|
|
|
-init_vits_yml = load_yaml_data_in_fact(vits_yml_path)
|
|
|
|
|
|
|
|
|
|
with gr.Blocks(
|
|
with gr.Blocks(
|
|
|
head="<style>\n" + css + "\n</style>",
|
|
head="<style>\n" + css + "\n</style>",
|
|
@@ -905,166 +743,15 @@ with gr.Blocks(
|
|
|
"Select the model to be trained (Depending on the Tab page you are on)"
|
|
"Select the model to be trained (Depending on the Tab page you are on)"
|
|
|
),
|
|
),
|
|
|
interactive=False,
|
|
interactive=False,
|
|
|
- choices=["VQGAN", "VITS", "LLAMA"],
|
|
|
|
|
|
|
+ choices=["VQGAN", "LLAMA"],
|
|
|
value="VQGAN",
|
|
value="VQGAN",
|
|
|
)
|
|
)
|
|
|
with gr.Row():
|
|
with gr.Row():
|
|
|
with gr.Tabs():
|
|
with gr.Tabs():
|
|
|
with gr.Tab(label=i18n("VQGAN Configuration")) as vqgan_page:
|
|
with gr.Tab(label=i18n("VQGAN Configuration")) as vqgan_page:
|
|
|
- with gr.Row(equal_height=False):
|
|
|
|
|
- vqgan_ckpt = gr.Dropdown(
|
|
|
|
|
- label=i18n("Select VQGAN ckpt"),
|
|
|
|
|
- choices=[i18n("latest"), i18n("new")]
|
|
|
|
|
- + [
|
|
|
|
|
- str(p) for p in Path("results").glob("vqgan_*/")
|
|
|
|
|
- ],
|
|
|
|
|
- value=i18n("latest"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- )
|
|
|
|
|
- with gr.Row(equal_height=False):
|
|
|
|
|
- vqgan_lr_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Initial Learning Rate"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=1e-5,
|
|
|
|
|
- maximum=1e-4,
|
|
|
|
|
- step=1e-5,
|
|
|
|
|
- value=init_vqgan_yml["model"]["optimizer"]["lr"],
|
|
|
|
|
- )
|
|
|
|
|
- vqgan_maxsteps_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Maximum Training Steps"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=1000,
|
|
|
|
|
- maximum=100000,
|
|
|
|
|
- step=1000,
|
|
|
|
|
- value=init_vqgan_yml["trainer"]["max_steps"],
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ gr.HTML("You don't need to train this model!")
|
|
|
|
|
|
|
|
- with gr.Row(equal_height=False):
|
|
|
|
|
- vqgan_data_num_workers_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Number of Workers"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=1,
|
|
|
|
|
- maximum=16,
|
|
|
|
|
- step=1,
|
|
|
|
|
- value=init_vqgan_yml["data"]["num_workers"],
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- vqgan_data_batch_size_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Batch Size"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=1,
|
|
|
|
|
- maximum=32,
|
|
|
|
|
- step=1,
|
|
|
|
|
- value=init_vqgan_yml["data"]["batch_size"],
|
|
|
|
|
- )
|
|
|
|
|
- with gr.Row(equal_height=False):
|
|
|
|
|
- vqgan_data_val_batch_size_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Validation Batch Size"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=1,
|
|
|
|
|
- maximum=32,
|
|
|
|
|
- step=1,
|
|
|
|
|
- value=init_vqgan_yml["data"]["val_batch_size"],
|
|
|
|
|
- )
|
|
|
|
|
- vqgan_precision_dropdown = gr.Dropdown(
|
|
|
|
|
- label=i18n("Precision"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- choices=["32", "bf16-true", "bf16-mixed"],
|
|
|
|
|
- info=i18n(
|
|
|
|
|
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU"
|
|
|
|
|
- ),
|
|
|
|
|
- value=str(init_vqgan_yml["trainer"]["precision"]),
|
|
|
|
|
- )
|
|
|
|
|
- with gr.Row(equal_height=False):
|
|
|
|
|
- vqgan_check_interval_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Save model every n steps"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=500,
|
|
|
|
|
- maximum=10000,
|
|
|
|
|
- step=500,
|
|
|
|
|
- value=init_vqgan_yml["trainer"][
|
|
|
|
|
- "val_check_interval"
|
|
|
|
|
- ],
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- with gr.Tab(label=i18n("VITS Configuration")) as vits_page:
|
|
|
|
|
- with gr.Row(equal_height=False):
|
|
|
|
|
- vits_ckpt = gr.Dropdown(
|
|
|
|
|
- label=i18n("Select VITS ckpt"),
|
|
|
|
|
- choices=[i18n("latest"), i18n("new")]
|
|
|
|
|
- + [str(p) for p in Path("results").glob("vits_*/")],
|
|
|
|
|
- value=i18n("latest"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- )
|
|
|
|
|
- with gr.Row(equal_height=False):
|
|
|
|
|
- vits_lr_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Initial Learning Rate"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=1e-5,
|
|
|
|
|
- maximum=1e-4,
|
|
|
|
|
- step=1e-5,
|
|
|
|
|
- value=init_vits_yml["model"]["optimizer"]["lr"],
|
|
|
|
|
- )
|
|
|
|
|
- vits_maxsteps_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Maximum Training Steps"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=1000,
|
|
|
|
|
- maximum=100000,
|
|
|
|
|
- step=1000,
|
|
|
|
|
- value=init_vits_yml["trainer"]["max_steps"],
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- with gr.Row(equal_height=False):
|
|
|
|
|
- vits_data_num_workers_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Number of Workers"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=1,
|
|
|
|
|
- maximum=16,
|
|
|
|
|
- step=1,
|
|
|
|
|
- value=init_vits_yml["data"]["num_workers"],
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- vits_data_batch_size_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Batch Size"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=1,
|
|
|
|
|
- maximum=32,
|
|
|
|
|
- step=1,
|
|
|
|
|
- value=init_vits_yml["data"]["batch_size"],
|
|
|
|
|
- )
|
|
|
|
|
- with gr.Row(equal_height=False):
|
|
|
|
|
- vits_data_val_batch_size_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Validation Batch Size"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=1,
|
|
|
|
|
- maximum=32,
|
|
|
|
|
- step=1,
|
|
|
|
|
- value=init_vits_yml["data"]["val_batch_size"],
|
|
|
|
|
- )
|
|
|
|
|
- vits_precision_dropdown = gr.Dropdown(
|
|
|
|
|
- label=i18n("Precision"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- choices=["32", "bf16-mixed"],
|
|
|
|
|
- info=i18n(
|
|
|
|
|
- "16-mixed is recommended for 10+ series GPU"
|
|
|
|
|
- ),
|
|
|
|
|
- value=str(init_vits_yml["trainer"]["precision"]),
|
|
|
|
|
- )
|
|
|
|
|
- with gr.Row(equal_height=False):
|
|
|
|
|
- vits_check_interval_slider = gr.Slider(
|
|
|
|
|
- label=i18n("Save model every n steps"),
|
|
|
|
|
- interactive=True,
|
|
|
|
|
- minimum=1,
|
|
|
|
|
- maximum=2000,
|
|
|
|
|
- step=1,
|
|
|
|
|
- value=init_vits_yml["trainer"][
|
|
|
|
|
- "val_check_interval"
|
|
|
|
|
- ],
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- with gr.Tab(
|
|
|
|
|
- label=i18n("LLAMA Configuration"), id=3
|
|
|
|
|
- ) as llama_page:
|
|
|
|
|
|
|
+ with gr.Tab(label=i18n("LLAMA Configuration")) as llama_page:
|
|
|
with gr.Row(equal_height=False):
|
|
with gr.Row(equal_height=False):
|
|
|
llama_use_lora = gr.Checkbox(
|
|
llama_use_lora = gr.Checkbox(
|
|
|
label=i18n("Use LoRA"),
|
|
label=i18n("Use LoRA"),
|
|
@@ -1105,10 +792,10 @@ with gr.Blocks(
|
|
|
llama_base_config = gr.Dropdown(
|
|
llama_base_config = gr.Dropdown(
|
|
|
label=i18n("Model Size"),
|
|
label=i18n("Model Size"),
|
|
|
choices=[
|
|
choices=[
|
|
|
- "dual_ar_2_codebook_large",
|
|
|
|
|
- "dual_ar_2_codebook_medium",
|
|
|
|
|
|
|
+ "text2semantic_agent",
|
|
|
|
|
+ "text2semantic_finetune",
|
|
|
],
|
|
],
|
|
|
- value="dual_ar_2_codebook_medium",
|
|
|
|
|
|
|
+ value="text2semantic_finetune",
|
|
|
)
|
|
)
|
|
|
llama_data_num_workers_slider = gr.Slider(
|
|
llama_data_num_workers_slider = gr.Slider(
|
|
|
label=i18n("Number of Workers"),
|
|
label=i18n("Number of Workers"),
|
|
@@ -1190,8 +877,7 @@ with gr.Blocks(
|
|
|
"Type the path or select from the dropdown"
|
|
"Type the path or select from the dropdown"
|
|
|
),
|
|
),
|
|
|
choices=[
|
|
choices=[
|
|
|
- "checkpoints/text2semantic-sft-large-v1.1-4k.pth",
|
|
|
|
|
- "checkpoints/text2semantic-sft-medium-v1.1-4k.pth",
|
|
|
|
|
|
|
+ "checkpoints/fish-speech-1.2/model.pth",
|
|
|
],
|
|
],
|
|
|
value=init_llama_yml["ckpt_path"],
|
|
value=init_llama_yml["ckpt_path"],
|
|
|
allow_custom_value=True,
|
|
allow_custom_value=True,
|
|
@@ -1216,10 +902,10 @@ with gr.Blocks(
|
|
|
"Type the path or select from the dropdown"
|
|
"Type the path or select from the dropdown"
|
|
|
),
|
|
),
|
|
|
choices=[
|
|
choices=[
|
|
|
- "dual_ar_2_codebook_large",
|
|
|
|
|
- "dual_ar_2_codebook_medium",
|
|
|
|
|
|
|
+ "text2semantic_agent",
|
|
|
|
|
+ "text2semantic_finetune",
|
|
|
],
|
|
],
|
|
|
- value="dual_ar_2_codebook_medium",
|
|
|
|
|
|
|
+ value="text2semantic_agent",
|
|
|
allow_custom_value=True,
|
|
allow_custom_value=True,
|
|
|
)
|
|
)
|
|
|
with gr.Row(equal_height=False):
|
|
with gr.Row(equal_height=False):
|
|
@@ -1282,17 +968,14 @@ with gr.Blocks(
|
|
|
"Type the path or select from the dropdown"
|
|
"Type the path or select from the dropdown"
|
|
|
),
|
|
),
|
|
|
choices=list_decoder_models(),
|
|
choices=list_decoder_models(),
|
|
|
- value=init_vits_yml["ckpt_path"],
|
|
|
|
|
|
|
+ value="checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
|
|
|
allow_custom_value=True,
|
|
allow_custom_value=True,
|
|
|
)
|
|
)
|
|
|
infer_decoder_config = gr.Dropdown(
|
|
infer_decoder_config = gr.Dropdown(
|
|
|
label=i18n("Decoder Model Config"),
|
|
label=i18n("Decoder Model Config"),
|
|
|
info=i18n("Changing with the Model Path"),
|
|
info=i18n("Changing with the Model Path"),
|
|
|
- value="vits_decoder_finetune",
|
|
|
|
|
|
|
+ value="firefly_gan_vq",
|
|
|
choices=[
|
|
choices=[
|
|
|
- "vits_decoder_finetune",
|
|
|
|
|
- "vits_decoder_pretrain",
|
|
|
|
|
- "firefly_gan_vq",
|
|
|
|
|
"firefly_gan_vq",
|
|
"firefly_gan_vq",
|
|
|
],
|
|
],
|
|
|
allow_custom_value=True,
|
|
allow_custom_value=True,
|
|
@@ -1303,20 +986,11 @@ with gr.Blocks(
|
|
|
info=i18n(
|
|
info=i18n(
|
|
|
"Type the path or select from the dropdown"
|
|
"Type the path or select from the dropdown"
|
|
|
),
|
|
),
|
|
|
- value=init_llama_yml["ckpt_path"],
|
|
|
|
|
|
|
+ value="checkpoints/fish-speech-1.2",
|
|
|
choices=list_llama_models(),
|
|
choices=list_llama_models(),
|
|
|
allow_custom_value=True,
|
|
allow_custom_value=True,
|
|
|
)
|
|
)
|
|
|
- infer_llama_config = gr.Dropdown(
|
|
|
|
|
- label=i18n("LLAMA Model Config"),
|
|
|
|
|
- info=i18n("Changing with the Model Path"),
|
|
|
|
|
- choices=[
|
|
|
|
|
- "dual_ar_2_codebook_large",
|
|
|
|
|
- "dual_ar_2_codebook_medium",
|
|
|
|
|
- ],
|
|
|
|
|
- value="dual_ar_2_codebook_medium",
|
|
|
|
|
- allow_custom_value=True,
|
|
|
|
|
- )
|
|
|
|
|
|
|
+
|
|
|
with gr.Row():
|
|
with gr.Row():
|
|
|
infer_compile = gr.Radio(
|
|
infer_compile = gr.Radio(
|
|
|
label=i18n("Compile Model"),
|
|
label=i18n("Compile Model"),
|
|
@@ -1388,7 +1062,6 @@ with gr.Blocks(
|
|
|
)
|
|
)
|
|
|
gr.HTML(footer, elem_id="footer")
|
|
gr.HTML(footer, elem_id="footer")
|
|
|
vqgan_page.select(lambda: "VQGAN", None, model_type_radio)
|
|
vqgan_page.select(lambda: "VQGAN", None, model_type_radio)
|
|
|
- vits_page.select(lambda: "VITS", None, model_type_radio)
|
|
|
|
|
llama_page.select(lambda: "LLAMA", None, model_type_radio)
|
|
llama_page.select(lambda: "LLAMA", None, model_type_radio)
|
|
|
add_button.click(
|
|
add_button.click(
|
|
|
fn=add_item,
|
|
fn=add_item,
|
|
@@ -1413,24 +1086,6 @@ with gr.Blocks(
|
|
|
model_type_radio,
|
|
model_type_radio,
|
|
|
min_duration,
|
|
min_duration,
|
|
|
max_duration,
|
|
max_duration,
|
|
|
- # vq-gan config
|
|
|
|
|
- vqgan_ckpt,
|
|
|
|
|
- vqgan_lr_slider,
|
|
|
|
|
- vqgan_maxsteps_slider,
|
|
|
|
|
- vqgan_data_num_workers_slider,
|
|
|
|
|
- vqgan_data_batch_size_slider,
|
|
|
|
|
- vqgan_data_val_batch_size_slider,
|
|
|
|
|
- vqgan_precision_dropdown,
|
|
|
|
|
- vqgan_check_interval_slider,
|
|
|
|
|
- # vits config
|
|
|
|
|
- vits_ckpt,
|
|
|
|
|
- vits_lr_slider,
|
|
|
|
|
- vits_maxsteps_slider,
|
|
|
|
|
- vits_data_num_workers_slider,
|
|
|
|
|
- vits_data_batch_size_slider,
|
|
|
|
|
- vits_data_val_batch_size_slider,
|
|
|
|
|
- vits_precision_dropdown,
|
|
|
|
|
- vits_check_interval_slider,
|
|
|
|
|
# llama config
|
|
# llama config
|
|
|
llama_ckpt,
|
|
llama_ckpt,
|
|
|
llama_base_config,
|
|
llama_base_config,
|
|
@@ -1453,14 +1108,6 @@ with gr.Blocks(
|
|
|
outputs=[train_error],
|
|
outputs=[train_error],
|
|
|
)
|
|
)
|
|
|
tb_dir.change(fn=fresh_tb_dir, inputs=[], outputs=[tb_dir])
|
|
tb_dir.change(fn=fresh_tb_dir, inputs=[], outputs=[tb_dir])
|
|
|
- infer_decoder_model.change(
|
|
|
|
|
- fn=change_decoder_config,
|
|
|
|
|
- inputs=[infer_decoder_model],
|
|
|
|
|
- outputs=[infer_decoder_config],
|
|
|
|
|
- )
|
|
|
|
|
- infer_llama_model.change(
|
|
|
|
|
- fn=change_llama_config, inputs=[infer_llama_model], outputs=[infer_llama_config]
|
|
|
|
|
- )
|
|
|
|
|
infer_decoder_model.change(
|
|
infer_decoder_model.change(
|
|
|
fn=fresh_decoder_model, inputs=[], outputs=[infer_decoder_model]
|
|
fn=fresh_decoder_model, inputs=[], outputs=[infer_decoder_model]
|
|
|
)
|
|
)
|
|
@@ -1476,17 +1123,12 @@ with gr.Blocks(
|
|
|
fresh_btn.click(
|
|
fresh_btn.click(
|
|
|
fn=new_explorer, inputs=[train_box, tree_slider], outputs=[file_markdown]
|
|
fn=new_explorer, inputs=[train_box, tree_slider], outputs=[file_markdown]
|
|
|
)
|
|
)
|
|
|
- vqgan_ckpt.change(fn=fresh_vqgan_ckpt, inputs=[], outputs=[vqgan_ckpt])
|
|
|
|
|
- vits_ckpt.change(fn=fresh_vits_ckpt, inputs=[], outputs=[vits_ckpt])
|
|
|
|
|
llama_use_lora.change(
|
|
llama_use_lora.change(
|
|
|
fn=fresh_llama_ckpt, inputs=[llama_use_lora], outputs=[llama_ckpt]
|
|
fn=fresh_llama_ckpt, inputs=[llama_use_lora], outputs=[llama_ckpt]
|
|
|
)
|
|
)
|
|
|
llama_ckpt.change(
|
|
llama_ckpt.change(
|
|
|
fn=fresh_llama_ckpt, inputs=[llama_use_lora], outputs=[llama_ckpt]
|
|
fn=fresh_llama_ckpt, inputs=[llama_use_lora], outputs=[llama_ckpt]
|
|
|
)
|
|
)
|
|
|
- lora_weight.change(
|
|
|
|
|
- fn=change_llama_config, inputs=[lora_weight], outputs=[lora_llama_config]
|
|
|
|
|
- )
|
|
|
|
|
lora_weight.change(
|
|
lora_weight.change(
|
|
|
fn=lambda: gr.Dropdown(choices=list_lora_llama_models()),
|
|
fn=lambda: gr.Dropdown(choices=list_lora_llama_models()),
|
|
|
inputs=[],
|
|
inputs=[],
|
|
@@ -1506,7 +1148,6 @@ with gr.Blocks(
|
|
|
infer_decoder_model,
|
|
infer_decoder_model,
|
|
|
infer_decoder_config,
|
|
infer_decoder_config,
|
|
|
infer_llama_model,
|
|
infer_llama_model,
|
|
|
- infer_llama_config,
|
|
|
|
|
infer_compile,
|
|
infer_compile,
|
|
|
],
|
|
],
|
|
|
outputs=[infer_error],
|
|
outputs=[infer_error],
|