app.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. import html
  2. import traceback
  3. import gradio as gr
  4. from fish_speech.text import parse_text_to_segments, segments_to_phones
# Markdown rendered at the top of the page (passed to gr.Markdown in the UI).
HEADER_MD = """
# Fish Speech
基于 VITS 和 GPT 的多语种语音合成. 项目很大程度上基于 Rcell 的 GPT-VITS.
"""
# Placeholder shown in the input textbox. It explains (in Chinese) how automatic
# phoneme conversion splits mixed-language text into segments, with sample
# output, and how the <jp>...</jp> tag gives Japanese priority for a span.
TEXTBOX_PLACEHOLDER = """在启用自动音素的情况下, 模型默认会全自动将输入文本转换为音素. 例如:
测试一下 Hugging face, BGM声音很大吗?那我改一下. 世界、こんにちは。
会被转换为:
<Segment ZH: '测试一下' -> 'c e4 sh ir4 y i2 x ia4'>
<Segment EN: ' Hugging face, BGM' -> 'HH AH1 G IH0 NG F EY1 S , B AE1 G M'>
<Segment ZH: '声音很大吗?那我改一下.' -> 'sh eng1 y in1 h en3 d a4 m a5 ? n a4 w o2 g ai3 y i2 x ia4 .'>
<Segment ZH: '世界,' -> 'sh ir4 j ie4 ,'>
<Segment JP: 'こんにちは.' -> 'k o N n i ch i w a .'>
如你所见, 最后的句子被分割为了两个部分, 因为该日文包含了汉字, 你可以使用 <jp>...</jp> 标签来指定日文优先级. 例如:
测试一下 Hugging face, BGM声音很大吗?那我改一下. <jp>世界、こんにちは。</jp>
可以看到, 日文部分被正确地分割了出来:
...
<Segment JP: '世界,こんにちは.' -> 's e k a i , k o N n i ch i w a .'>
"""
  23. def build_html_error_message(error):
  24. return f"""
  25. <div style="color: red; font-weight: bold;">
  26. {html.escape(error)}
  27. </div>
  28. """
  29. def prepare_text(
  30. text,
  31. input_mode,
  32. language0,
  33. language1,
  34. language2,
  35. enable_reference_audio,
  36. reference_text,
  37. ):
  38. lines = text.splitlines()
  39. languages = [language0, language1, language2]
  40. languages = [
  41. {
  42. "中文": "ZH",
  43. "日文": "JP",
  44. "英文": "EN",
  45. }[language]
  46. for language in languages
  47. ]
  48. if len(set(languages)) != len(languages):
  49. return [], build_html_error_message("语言优先级不能重复.")
  50. if enable_reference_audio:
  51. reference_text = reference_text.strip() + " "
  52. else:
  53. reference_text = ""
  54. if input_mode != "自动音素转换":
  55. return [
  56. [idx, reference_text + line, "-", "-"]
  57. for idx, line in enumerate(lines)
  58. if line.strip() != ""
  59. ], None
  60. rows = []
  61. for idx, line in enumerate(lines):
  62. if line.strip() == "":
  63. continue
  64. try:
  65. segments = parse_text_to_segments(reference_text + line, order=languages)
  66. except Exception:
  67. traceback.print_exc()
  68. err = traceback.format_exc()
  69. return [], build_html_error_message(f"解析 '{line}' 时发生错误. \n\n{err}")
  70. for segment in segments:
  71. rows.append([idx, segment.text, segment.language, " ".join(segment.phones)])
  72. return rows, None
  73. with gr.Blocks(theme=gr.themes.Base()) as app:
  74. gr.Markdown(HEADER_MD)
  75. with gr.Row():
  76. with gr.Column(scale=3):
  77. text = gr.Textbox(label="输入文本", placeholder=TEXTBOX_PLACEHOLDER, lines=3)
  78. with gr.Row():
  79. with gr.Tab(label="合成参数"):
  80. gr.Markdown("配置常见合成参数.")
  81. input_mode = gr.Dropdown(
  82. choices=["手动输入音素/文本", "自动音素转换"],
  83. value="手动输入音素/文本",
  84. label="输入模式",
  85. )
  86. with gr.Tab(label="语言优先级"):
  87. gr.Markdown("该参数只在自动音素转换时生效.")
  88. with gr.Column(scale=1):
  89. language0 = gr.Dropdown(
  90. choices=["中文", "日文", "英文"],
  91. label="语言 1",
  92. value="中文",
  93. )
  94. with gr.Column(scale=1):
  95. language1 = gr.Dropdown(
  96. choices=["中文", "日文", "英文"],
  97. label="语言 2",
  98. value="日文",
  99. )
  100. with gr.Column(scale=1):
  101. language2 = gr.Dropdown(
  102. choices=["中文", "日文", "英文"],
  103. label="语言 3",
  104. value="英文",
  105. )
  106. with gr.Tab(label="参考音频"):
  107. gr.Markdown("3 秒左右的参考音频, 适用于无微调直接推理.")
  108. enable_reference_audio = gr.Checkbox(label="启用参考音频", value=False)
  109. reference_audio = gr.Audio(label="参考音频")
  110. reference_text = gr.Textbox(
  111. label="参考文本",
  112. placeholder="参考文本",
  113. lines=1,
  114. value="万一他很崇拜我们呢? 嘿嘿.",
  115. )
  116. with gr.Row():
  117. with gr.Column(scale=2):
  118. generate = gr.Button(value="合成", variant="primary")
  119. with gr.Column(scale=1):
  120. clear = gr.Button(value="清空")
  121. with gr.Column(scale=3):
  122. error = gr.HTML(label="错误信息")
  123. parsed_text = gr.Dataframe(label="解析结果", headers=["ID", "文本", "语言", "音素"])
  124. audio = gr.Audio(label="合成音频")
  125. # Language & Text Parsing
  126. kwargs = dict(
  127. inputs=[
  128. text,
  129. input_mode,
  130. language0,
  131. language1,
  132. language2,
  133. enable_reference_audio,
  134. reference_text,
  135. ],
  136. outputs=[parsed_text, error],
  137. trigger_mode="always_last",
  138. )
  139. text.change(prepare_text, **kwargs)
  140. input_mode.change(prepare_text, **kwargs)
  141. language0.change(prepare_text, **kwargs)
  142. language1.change(prepare_text, **kwargs)
  143. language2.change(prepare_text, **kwargs)
  144. enable_reference_audio.change(prepare_text, **kwargs)
  145. # Submit
  146. generate.click(lambda: None, outputs=[audio])
# Start the Gradio app only when this file is executed as a script.
# NOTE(review): show_api=False presumably hides the auto-generated API
# page — confirm against the Gradio launch() documentation.
if __name__ == "__main__":
    app.launch(show_api=False)