2 лет назад · 5dd593fef3
--- a/API_FLAGS.txt
+++ b/API_FLAGS.txt
@@ -1,6 +1,6 @@
 
				 # --infer
			
 
				 # --api
			
 
				---listen 0.0.0.0:8000 \
			
 
				+--listen 0.0.0.0:8080 \
			
 
				 --llama-checkpoint-path "checkpoints/fish-speech-1.2" \
			
 
				 --decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
			
 
				 --decoder-config-name firefly_gan_vq
			
--- a/README.md
+++ b/README.md
@@ -28,6 +28,9 @@ We do not hold any responsibility for any illegal usage of the codebase. Please
 
				 ## Online Demo
			
 
				 [Fish Audio](https://fish.audio)   
			
 
				 
			
 
				+## Quick Start
			
 
				+[inference.ipynb](https://nbviewer.org/github/AnyaCoder/fish-speech/blob/main/inference.ipynb)
			
 
				+
			
 
				 ## Videos
			
 
				 #### Demo Video: https://www.bilibili.com/video/BV1wz421B71D
			
 
				 #### Tech slides Video: https://www.bilibili.com/video/BV1zJ4m1K7cj
			
@@ -35,10 +38,12 @@ We do not hold any responsibility for any illegal usage of the codebase. Please
 
				 ## Documents / 文档
			
 
				 - [English](https://speech.fish.audio/en/)
			
 
				 - [中文](https://speech.fish.audio/)
			
 
				+- [日本語](https://speech.fish.audio/ja/)
			
 
				 
			
 
				 ## Samples / 例子
			
 
				 - [English](https://speech.fish.audio/en/samples/)
			
 
				 - [中文](https://speech.fish.audio/samples/)
			
 
				+- [日本語](https://speech.fish.audio/ja/samples/)
			
 
				 
			
 
				 ## Credits / 鸣谢
			
 
				 - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
			
--- a/docs/en/index.md
+++ b/docs/en/index.md
@@ -23,7 +23,7 @@ This codebase is released under the `BSD-3-Clause` license, and all models are r
 
				 
			
 
				 ## Requirements
			
 
				 
			
 
				-- GPU Memory: 4GB (for inference), 16GB (for fine-tuning)
			
 
				+- GPU Memory: 4GB (for inference), 8GB (for fine-tuning)
			
 
				 - System: Linux, Windows
			
 
				 
			
 
				 ## Windows Setup
			
@@ -67,6 +67,7 @@ Non-professional Windows users can consider the following methods to run the cod
 
				                   </p>
			
 
				                </ul>
			
 
				             </li>
			
 
				+            <li>Install <a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a></li>
			
 
				       </ol>
			
 
				    </li>
			
 
				    <li>Double-click <code>start.bat</code> to enter the Fish-Speech training inference configuration WebUI page.
			
--- a/docs/en/inference.md
+++ b/docs/en/inference.md
@@ -68,7 +68,7 @@ We provide a HTTP API for inference. You can use the following command to start
 
				 
			
 
				 ```bash
			
 
				 python -m tools.api \
			
 
				-    --listen 0.0.0.0:8000 \
			
 
				+    --listen 0.0.0.0:8080 \
			
 
				     --llama-checkpoint-path "checkpoints/fish-speech-1.2" \
			
 
				     --decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
			
 
				     --decoder-config-name firefly_gan_vq
			
@@ -76,7 +76,7 @@ python -m tools.api \
 
				 
			
 
				 If you want to speed up inference, you can add the --compile parameter.
			
 
				 
			
 
				-After that, you can view and test the API at http://127.0.0.1:8000/.
			
 
				+After that, you can view and test the API at http://127.0.0.1:8080/.
			
 
				 
			
 
				 Below is an example of sending a request using `tools/post_api.py`.
			
 
				 
			
@@ -84,7 +84,7 @@ Below is an example of sending a request using `tools/post_api.py`.
 
				 python -m tools.post_api \
			
 
				     --text "Text to be input" \
			
 
				     --reference_audio "Path to reference audio" \
			
 
				-    --reference_text "Text content of the reference audio"
			
 
				+    --reference_text "Text content of the reference audio" \
			
 
				     --streaming True
			
 
				 ```
			
 
				 
			
@@ -130,7 +130,7 @@ python tools/gen_ref.py
 
				 python -m tools.post_api \
			
 
				     --text "Text to be input" \
			
 
				     --speaker "${SPEAKER1}" \
			
 
				-    --emotion "${EMOTION1}"
			
 
				+    --emotion "${EMOTION1}" \
			
 
				     --streaming True
			
 
				 ```
			
 
				 
			
--- a/docs/ja/index.md
+++ b/docs/ja/index.md
@@ -23,7 +23,7 @@
 
				 
			
 
				 ## 要件
			
 
				 
			
 
				-- GPU メモリ: 4GB（推論用）、16GB（微調整用）
			
 
				+- GPU メモリ: 4GB（推論用）、8GB（微調整用）
			
 
				 - システム: Linux、Windows
			
 
				 
			
 
				 ## Windows セットアップ
			
@@ -67,6 +67,7 @@ Windows のプロユーザーは、コードベースを実行するために WS
 
				                   </p>
			
 
				                </ul>
			
 
				             </li>
			
 
				+            <li><a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a> をインストールします</li>
			
 
				       </ol>
			
 
				    </li>
			
 
				    <li><code>start.bat</code>をダブルクリックして、Fish-Speechトレーニング推論設定WebUIページに入ります。
			
--- a/docs/ja/inference.md
+++ b/docs/ja/inference.md
@@ -68,7 +68,7 @@ python tools/vqgan/inference.py \
 
				 
			
 
				 ```bash
			
 
				 python -m tools.api \
			
 
				-    --listen 0.0.0.0:8000 \
			
 
				+    --listen 0.0.0.0:8080 \
			
 
				     --llama-checkpoint-path "checkpoints/fish-speech-1.2" \
			
 
				     --decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
			
 
				     --decoder-config-name firefly_gan_vq
			
@@ -76,14 +76,16 @@ python -m tools.api \
 
				 
			
 
				 推論を高速化したい場合は、--compile パラメータを追加できます。
			
 
				 
			
 
				-その後、`http://127.0.0.1:8000/`で API を表示およびテストできます。
			
 
				+その後、`http://127.0.0.1:8080/`で API を表示およびテストできます。
			
 
				 
			
 
				 以下は、`tools/post_api.py` を使用してリクエストを送信する例です。
			
 
				 
			
 
				 ```bash
			
 
				-python tools/vqgan/inference.py \
			
 
				-    -i "paimon.wav" \
			
 
				-    --checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
			
 
				+python -m tools.post_api \
			
 
				+    --text "入力するテキスト" \
			
 
				+    --reference_audio "参照音声へのパス" \
			
 
				+    --reference_text "参照音声テキスト" \
			
 
				+    --streaming True
			
 
				 ```
			
 
				 
			
 
				 上記のコマンドは、参照音声の情報に基づいて必要な音声を合成し、ストリーミング方式で返すことを示しています。
			
@@ -131,7 +133,7 @@ python tools/gen_ref.py
 
				 python -m tools.post_api \
			
 
				     --text "入力するテキスト" \
			
 
				     --speaker "${SPEAKER1}" \
			
 
				-    --emotion "${EMOTION1}"
			
 
				+    --emotion "${EMOTION1}" \
			
 
				     --streaming True
			
 
				 
			
 
				 ```
			
--- a/docs/zh/index.md
+++ b/docs/zh/index.md
@@ -23,7 +23,7 @@
 
				 
			
 
				 ## 要求
			
 
				 
			
 
				-- GPU 内存: 4GB (用于推理), 16GB (用于微调)
			
 
				+- GPU 内存: 4GB (用于推理), 8GB (用于微调)
			
 
				 - 系统: Linux, Windows
			
 
				 
			
 
				 ## Windows 配置
			
@@ -67,6 +67,7 @@ Windows 非专业用户可考虑以下为免 Linux 环境的基础运行方法
 
				                   </p>
			
 
				                </ul>
			
 
				             </li>
			
 
				+            <li>安装 <a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a></li>
			
 
				       </ol>
			
 
				    </li>
			
 
				    <li>双击 start.bat, 进入 Fish-Speech 训练推理配置 WebUI 页面。
			
--- a/docs/zh/inference.md
+++ b/docs/zh/inference.md
@@ -74,7 +74,7 @@ python tools/vqgan/inference.py \
 
				 
			
 
				 ```bash
			
 
				 python -m tools.api \
			
 
				-    --listen 0.0.0.0:8000 \
			
 
				+    --listen 0.0.0.0:8080 \
			
 
				     --llama-checkpoint-path "checkpoints/fish-speech-1.2" \
			
 
				     --decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
			
 
				     --decoder-config-name firefly_gan_vq
			
@@ -85,7 +85,7 @@ python -m tools.api \
 
				 HF_ENDPOINT=https://hf-mirror.com python -m ...
			
 
				 ```
			
 
				 
			
 
				-随后, 你可以在 `http://127.0.0.1:8000/` 中查看并测试 API.
			
 
				+随后, 你可以在 `http://127.0.0.1:8080/` 中查看并测试 API.
			
 
				 
			
 
				 下面是使用`tools/post_api.py`发送请求的示例。
			
 
				 
			
@@ -93,7 +93,7 @@ HF_ENDPOINT=https://hf-mirror.com python -m ...
 
				 python -m tools.post_api \
			
 
				     --text "要输入的文本" \
			
 
				     --reference_audio "参考音频路径" \
			
 
				-    --reference_text "参考音频的文本内容"
			
 
				+    --reference_text "参考音频的文本内容" \
			
 
				     --streaming True
			
 
				 ```
			
 
				 
			
--- a/inference.ipynb
+++ b/inference.ipynb
@@ -0,0 +1,211 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## 命令行推理"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### For Windows"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {
			
 
				+    "vscode": {
			
 
				+     "languageId": "bat"
			
 
				+    }
			
 
				+   },
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!chcp 65001"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### For Linux"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "import locale\n",
			
 
				+    "locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## API Client\n",
			
 
				+    "\n",
			
 
				+    "需要在终端开启API Server\n",
			
 
				+    "\n",
			
 
				+    "> 音频用本地路径\n",
			
 
				+    "\n",
			
 
				+    "> 文本可以直接用路径，也可以用内容"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {
			
 
				+    "vscode": {
			
 
				+     "languageId": "shellscript"
			
 
				+    }
			
 
				+   },
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!python -m tools.post_api \\\n",
			
 
				+    "    --text \"Hello everyone, I am an open-source text-to-speech model developed by Fish Audio.\" \\\n",
			
 
				+    "    --reference_audio \"D:\\PythonProject\\原神语音中文\\胡桃\\vo_hutao_draw_appear.wav\" \\\n",
			
 
				+    "    --reference_text \"D:\\PythonProject\\原神语音中文\\胡桃\\vo_hutao_draw_appear.lab\" \\\n",
			
 
				+    "    --streaming True"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## For Test"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### 0. 下载模型"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!set HF_ENDPOINT=https://hf-mirror.com\n",
			
 
				+    "# !export HF_ENDPOINT=https://hf-mirror.com\n",
			
 
				+    "!huggingface-cli download fishaudio/fish-speech-1.2 --local-dir checkpoints/fish-speech-1.2/"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### 1. 从语音生成 prompt:\n",
			
 
				+    ">  如果你打算让模型随机选择音色, 你可以跳过这一步.\n",
			
 
				+    "\n",
			
 
				+    "你应该能得到一个 `fake.npy` 文件."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {
			
 
				+    "vscode": {
			
 
				+     "languageId": "shellscript"
			
 
				+    }
			
 
				+   },
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "## 在此输入你的语音路径:\n",
			
 
				+    "src_audio = r\"D:\\PythonProject\\原神语音中文\\胡桃\\vo_hutao_draw_appear.wav\"\n",
			
 
				+    "\n",
			
 
				+    "!python tools/vqgan/inference.py \\\n",
			
 
				+    "    -i {src_audio} \\\n",
			
 
				+    "    --checkpoint-path \"checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth\"\n",
			
 
				+    "\n",
			
 
				+    "from IPython.display import Audio, display\n",
			
 
				+    "audio = Audio(filename=\"fake.wav\")\n",
			
 
				+    "display(audio)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### 2. 从文本生成语义 token:\n",
			
 
				+    "> 该命令会在工作目录下创建 codes_N 文件, 其中 N 是从 0 开始的整数.\n",
			
 
				+    "\n",
			
 
				+    "> 您可以使用 --compile 来融合 cuda 内核以实现更快的推理"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {
			
 
				+    "vscode": {
			
 
				+     "languageId": "shellscript"
			
 
				+    }
			
 
				+   },
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!python tools/llama/generate.py \\\n",
			
 
				+    "    --text \"人间灯火倒映湖中，她的渴望让静水泛起涟漪。若代价只是孤独，那就让这份愿望肆意流淌。流入她所注视的世间，也流入她如湖水般澄澈的目光。\" \\\n",
			
 
				+    "    --prompt-text \"唷，找本堂主有何贵干呀？嗯？你不知道吗，往生堂第七十七代堂主就是胡桃我啦！嘶，不过瞧你的模样，容光焕发，身体健康，嗯…想必是为了工作以外的事来找我，对吧？\" \\\n",
			
 
				+    "    --prompt-tokens \"fake.npy\" \\\n",
			
 
				+    "    --checkpoint-path \"checkpoints/fish-speech-1.2\" \\\n",
			
 
				+    "    --num-samples 2\n",
			
 
				+    "    # --compile"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### 3. 从语义 token 生成人声:"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {
			
 
				+    "vscode": {
			
 
				+     "languageId": "shellscript"
			
 
				+    }
			
 
				+   },
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!python tools/vqgan/inference.py \\\n",
			
 
				+    "    -i \"codes_0.npy\" \\\n",
			
 
				+    "    --checkpoint-path \"checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth\"\n",
			
 
				+    "\n",
			
 
				+    "from IPython.display import Audio, display\n",
			
 
				+    "audio = Audio(filename=\"fake.wav\")\n",
			
 
				+    "display(audio)"
			
 
				+   ]
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.10.14"
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 2
			
 
				+}
			
--- a/tools/api.py
+++ b/tools/api.py
@@ -77,12 +77,25 @@ async def other_exception_handler(exc: "Exception"):
 
				     )
			
 
				 
			
 
				 
			
 
				+def load_audio(reference_audio, sr):
			
 
				+    if len(reference_audio) > 255 or not Path(reference_audio).exists():
			
 
				+        try:
			
 
				+            audio_data = base64.b64decode(reference_audio)
			
 
				+            reference_audio = io.BytesIO(audio_data)
			
 
				+        except base64.binascii.Error:
			
 
				+            raise ValueError("Invalid path or base64 string")
			
 
				+
			
 
				+    audio, _ = librosa.load(reference_audio, sr=sr, mono=True)
			
 
				+    return audio
			
 
				+
			
 
				+
			
 
				 def encode_reference(*, decoder_model, reference_audio, enable_reference_audio):
			
 
				     if enable_reference_audio and reference_audio is not None:
			
 
				         # Load audios, and prepare basic info here
			
 
				-        reference_audio_content, _ = librosa.load(
			
 
				-            reference_audio, sr=decoder_model.spec_transform.sample_rate, mono=True
			
 
				+        reference_audio_content = load_audio(
			
 
				+            reference_audio, decoder_model.spec_transform.sample_rate
			
 
				         )
			
 
				+
			
 
				         audios = torch.from_numpy(reference_audio_content).to(decoder_model.device)[
			
 
				             None, None, :
			
 
				         ]
			
@@ -213,7 +226,7 @@ def inference(req: InvokeRequest):
 
				         reference_audio=req.reference_audio,
			
 
				         enable_reference_audio=req.reference_audio is not None,
			
 
				     )
			
 
				-
			
 
				+    logger.info(f"ref_text: {req.reference_text}")
			
 
				     # LLAMA Inference
			
 
				     request = dict(
			
 
				         device=decoder_model.device,
			
--- a/tools/post_api.py
+++ b/tools/post_api.py
@@ -1,11 +1,29 @@
 
				 import argparse
			
 
				 import base64
			
 
				 import json
			
 
				+from pathlib import Path
			
 
				 
			
 
				 import pyaudio
			
 
				 import requests
			
 
				 
			
 
				 
			
 
				+def wav_to_base64(file_path):
			
 
				+    if not file_path or not Path(file_path).exists():
			
 
				+        return None
			
 
				+    with open(file_path, "rb") as wav_file:
			
 
				+        wav_content = wav_file.read()
			
 
				+        base64_encoded = base64.b64encode(wav_content)
			
 
				+        return base64_encoded.decode("utf-8")
			
 
				+
			
 
				+
			
 
				+def read_ref_text(ref_text):
			
 
				+    path = Path(ref_text)
			
 
				+    if path.exists() and path.is_file():
			
 
				+        with path.open("r", encoding="utf-8") as file:
			
 
				+            return file.read()
			
 
				+    return ref_text
			
 
				+
			
 
				+
			
 
				 def play_audio(audio_content, format, channels, rate):
			
 
				     p = pyaudio.PyAudio()
			
 
				     stream = p.open(format=format, channels=channels, rate=rate, output=True)
			
@@ -24,7 +42,7 @@ if __name__ == "__main__":
 
				         "--url",
			
 
				         "-u",
			
 
				         type=str,
			
 
				-        default="http://127.0.0.1:8000/v1/invoke",
			
 
				+        default="http://127.0.0.1:8080/v1/invoke",
			
 
				         help="URL of the server",
			
 
				     )
			
 
				     parser.add_argument(
			
@@ -34,14 +52,14 @@ if __name__ == "__main__":
 
				         "--reference_audio",
			
 
				         "-ra",
			
 
				         type=str,
			
 
				-        required=False,
			
 
				+        default=None,
			
 
				         help="Path to the WAV file",
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         "--reference_text",
			
 
				         "-rt",
			
 
				         type=str,
			
 
				-        required=False,
			
 
				+        default=None,
			
 
				         help="Reference text for voice synthesis",
			
 
				     )
			
 
				     parser.add_argument(
			
@@ -80,10 +98,16 @@ if __name__ == "__main__":
 
				 
			
 
				     args = parser.parse_args()
			
 
				 
			
 
				+    base64_audio = wav_to_base64(args.reference_audio)
			
 
				+
			
 
				+    ref_text = args.reference_text
			
 
				+    if ref_text:
			
 
				+        ref_text = read_ref_text(ref_text)
			
 
				+
			
 
				     data = {
			
 
				         "text": args.text,
			
 
				-        "reference_text": args.reference_text,
			
 
				-        "reference_audio": args.reference_audio,
			
 
				+        "reference_text": ref_text,
			
 
				+        "reference_audio": base64_audio,
			
 
				         "max_new_tokens": args.max_new_tokens,
			
 
				         "chunk_length": args.chunk_length,
			
 
				         "top_p": args.top_p,