فهرست منبع

Add Quick Start (#363)

spicysama 1 سال پیش
والد
کامیت
5dd593fef3
11 فایلهای تغییر یافته به همراه 283 افزوده شده و 25 حذف شده
  1. 1 1
      API_FLAGS.txt
  2. 5 0
      README.md
  3. 2 1
      docs/en/index.md
  4. 4 4
      docs/en/inference.md
  5. 2 1
      docs/ja/index.md
  6. 8 6
      docs/ja/inference.md
  7. 2 1
      docs/zh/index.md
  8. 3 3
      docs/zh/inference.md
  9. 211 0
      inference.ipynb
  10. 16 3
      tools/api.py
  11. 29 5
      tools/post_api.py

+ 1 - 1
API_FLAGS.txt

@@ -1,6 +1,6 @@
 # --infer
 # --api
---listen 0.0.0.0:8000 \
+--listen 0.0.0.0:8080 \
 --llama-checkpoint-path "checkpoints/fish-speech-1.2" \
 --decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
 --decoder-config-name firefly_gan_vq

+ 5 - 0
README.md

@@ -28,6 +28,9 @@ We do not hold any responsibility for any illegal usage of the codebase. Please
 ## Online Demo
 [Fish Audio](https://fish.audio)   
 
+## Quick Start
+[inference.ipynb](https://nbviewer.org/github/AnyaCoder/fish-speech/blob/main/inference.ipynb)
+
 ## Videos
 #### Demo Video: https://www.bilibili.com/video/BV1wz421B71D
 #### Tech slides Video: https://www.bilibili.com/video/BV1zJ4m1K7cj
@@ -35,10 +38,12 @@ We do not hold any responsibility for any illegal usage of the codebase. Please
 ## Documents / 文档
 - [English](https://speech.fish.audio/en/)
 - [中文](https://speech.fish.audio/)
+- [日本語](https://speech.fish.audio/ja/)
 
 ## Samples / 例子
 - [English](https://speech.fish.audio/en/samples/)
 - [中文](https://speech.fish.audio/samples/)
+- [日本語](https://speech.fish.audio/ja/samples/)
 
 ## Credits / 鸣谢
 - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)

+ 2 - 1
docs/en/index.md

@@ -23,7 +23,7 @@ This codebase is released under the `BSD-3-Clause` license, and all models are r
 
 ## Requirements
 
-- GPU Memory: 4GB (for inference), 16GB (for fine-tuning)
+- GPU Memory: 4GB (for inference), 8GB (for fine-tuning)
 - System: Linux, Windows
 
 ## Windows Setup
@@ -67,6 +67,7 @@ Non-professional Windows users can consider the following methods to run the cod
                   </p>
                </ul>
             </li>
+            <li>Install <a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a></li>
       </ol>
    </li>
    <li>Double-click <code>start.bat</code> to enter the Fish-Speech training inference configuration WebUI page.

+ 4 - 4
docs/en/inference.md

@@ -68,7 +68,7 @@ We provide a HTTP API for inference. You can use the following command to start
 
 ```bash
 python -m tools.api \
-    --listen 0.0.0.0:8000 \
+    --listen 0.0.0.0:8080 \
     --llama-checkpoint-path "checkpoints/fish-speech-1.2" \
     --decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
     --decoder-config-name firefly_gan_vq
@@ -76,7 +76,7 @@ python -m tools.api \
 
 If you want to speed up inference, you can add the --compile parameter.
 
-After that, you can view and test the API at http://127.0.0.1:8000/.
+After that, you can view and test the API at http://127.0.0.1:8080/.
 
 Below is an example of sending a request using `tools/post_api.py`.
 
@@ -84,7 +84,7 @@ Below is an example of sending a request using `tools/post_api.py`.
 python -m tools.post_api \
     --text "Text to be input" \
     --reference_audio "Path to reference audio" \
-    --reference_text "Text content of the reference audio"
+    --reference_text "Text content of the reference audio" \
     --streaming True
 ```
 
@@ -130,7 +130,7 @@ python tools/gen_ref.py
 python -m tools.post_api \
     --text "Text to be input" \
     --speaker "${SPEAKER1}" \
-    --emotion "${EMOTION1}"
+    --emotion "${EMOTION1}" \
     --streaming True
 ```
 

+ 2 - 1
docs/ja/index.md

@@ -23,7 +23,7 @@
 
 ## 要件
 
-- GPU メモリ: 4GB(推論用)、16GB(微調整用)
+- GPU メモリ: 4GB(推論用)、8GB(微調整用)
 - システム: Linux、Windows
 
 ## Windows セットアップ
@@ -67,6 +67,7 @@ Windows のプロユーザーは、コードベースを実行するために WS
                   </p>
                </ul>
             </li>
+            <li><a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a> をインストールします</li>
       </ol>
    </li>
    <li><code>start.bat</code>をダブルクリックして、Fish-Speechトレーニング推論設定WebUIページに入ります。

+ 8 - 6
docs/ja/inference.md

@@ -68,7 +68,7 @@ python tools/vqgan/inference.py \
 
 ```bash
 python -m tools.api \
-    --listen 0.0.0.0:8000 \
+    --listen 0.0.0.0:8080 \
     --llama-checkpoint-path "checkpoints/fish-speech-1.2" \
     --decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
     --decoder-config-name firefly_gan_vq
@@ -76,14 +76,16 @@ python -m tools.api \
 
 推論を高速化したい場合は、--compile パラメータを追加できます。
 
-その後、`http://127.0.0.1:8000/`で API を表示およびテストできます。
+その後、`http://127.0.0.1:8080/`で API を表示およびテストできます。
 
 以下は、`tools/post_api.py` を使用してリクエストを送信する例です。
 
 ```bash
-python tools/vqgan/inference.py \
-    -i "paimon.wav" \
-    --checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
+python -m tools.post_api \
+    --text "入力するテキスト" \
+    --reference_audio "参照音声へのパス" \
+    --reference_text "参照音声テキスト" \
+    --streaming True
 ```
 
 上記のコマンドは、参照音声の情報に基づいて必要な音声を合成し、ストリーミング方式で返すことを示しています。
@@ -131,7 +133,7 @@ python tools/gen_ref.py
 python -m tools.post_api \
     --text "入力するテキスト" \
     --speaker "${SPEAKER1}" \
-    --emotion "${EMOTION1}"
+    --emotion "${EMOTION1}" \
     --streaming True
 
 ```

+ 2 - 1
docs/zh/index.md

@@ -23,7 +23,7 @@
 
 ## 要求
 
-- GPU 内存: 4GB (用于推理), 16GB (用于微调)
+- GPU 内存: 4GB (用于推理), 8GB (用于微调)
 - 系统: Linux, Windows
 
 ## Windows 配置
@@ -67,6 +67,7 @@ Windows 非专业用户可考虑以下为免 Linux 环境的基础运行方法
                   </p>
                </ul>
             </li>
+            <li>安装 <a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a></li>
       </ol>
    </li>
    <li>双击 start.bat, 进入 Fish-Speech 训练推理配置 WebUI 页面。

+ 3 - 3
docs/zh/inference.md

@@ -74,7 +74,7 @@ python tools/vqgan/inference.py \
 
 ```bash
 python -m tools.api \
-    --listen 0.0.0.0:8000 \
+    --listen 0.0.0.0:8080 \
     --llama-checkpoint-path "checkpoints/fish-speech-1.2" \
     --decoder-checkpoint-path "checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
     --decoder-config-name firefly_gan_vq
@@ -85,7 +85,7 @@ python -m tools.api \
 HF_ENDPOINT=https://hf-mirror.com python -m ...
 ```
 
-随后, 你可以在 `http://127.0.0.1:8000/` 中查看并测试 API.
+随后, 你可以在 `http://127.0.0.1:8080/` 中查看并测试 API.
 
 下面是使用`tools/post_api.py`发送请求的示例。
 
@@ -93,7 +93,7 @@ HF_ENDPOINT=https://hf-mirror.com python -m ...
 python -m tools.post_api \
     --text "要输入的文本" \
     --reference_audio "参考音频路径" \
-    --reference_text "参考音频的文本内容"
+    --reference_text "参考音频的文本内容" \
     --streaming True
 ```
 

+ 211 - 0
inference.ipynb

@@ -0,0 +1,211 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 命令行推理"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### For Windows"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "bat"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!chcp 65001"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### For Linux"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import locale\n",
+    "locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## API Client\n",
+    "\n",
+    "需要在终端开启API Server\n",
+    "\n",
+    "> 音频用本地路径\n",
+    "\n",
+    "> 文本可以直接用路径,也可以用内容"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!python -m tools.post_api \\\n",
+    "    --text \"Hello everyone, I am an open-source text-to-speech model developed by Fish Audio.\" \\\n",
+    "    --reference_audio \"D:\\PythonProject\\原神语音中文\\胡桃\\vo_hutao_draw_appear.wav\" \\\n",
+    "    --reference_text \"D:\\PythonProject\\原神语音中文\\胡桃\\vo_hutao_draw_appear.lab\" \\\n",
+    "    --streaming True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## For Test"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 0. 下载模型"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!set HF_ENDPOINT=https://hf-mirror.com\n",
+    "# !export HF_ENDPOINT=https://hf-mirror.com\n",
+    "!huggingface-cli download fishaudio/fish-speech-1.2 --local-dir checkpoints/fish-speech-1.2/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. 从语音生成 prompt:\n",
+    ">  如果你打算让模型随机选择音色, 你可以跳过这一步.\n",
+    "\n",
+    "你应该能得到一个 `fake.npy` 文件."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "## 在此输入你的语音路径:\n",
+    "src_audio = r\"D:\\PythonProject\\原神语音中文\\胡桃\\vo_hutao_draw_appear.wav\"\n",
+    "\n",
+    "!python tools/vqgan/inference.py \\\n",
+    "    -i {src_audio} \\\n",
+    "    --checkpoint-path \"checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth\"\n",
+    "\n",
+    "from IPython.display import Audio, display\n",
+    "audio = Audio(filename=\"fake.wav\")\n",
+    "display(audio)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. 从文本生成语义 token:\n",
+    "> 该命令会在工作目录下创建 codes_N 文件, 其中 N 是从 0 开始的整数.\n",
+    "\n",
+    "> 您可以使用 --compile 来融合 cuda 内核以实现更快的推理"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!python tools/llama/generate.py \\\n",
+    "    --text \"人间灯火倒映湖中,她的渴望让静水泛起涟漪。若代价只是孤独,那就让这份愿望肆意流淌。流入她所注视的世间,也流入她如湖水般澄澈的目光。\" \\\n",
+    "    --prompt-text \"唷,找本堂主有何贵干呀?嗯?你不知道吗,往生堂第七十七代堂主就是胡桃我啦!嘶,不过瞧你的模样,容光焕发,身体健康,嗯…想必是为了工作以外的事来找我,对吧?\" \\\n",
+    "    --prompt-tokens \"fake.npy\" \\\n",
+    "    --checkpoint-path \"checkpoints/fish-speech-1.2\" \\\n",
+    "    --num-samples 2\n",
+    "    # --compile"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. 从语义 token 生成人声:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!python tools/vqgan/inference.py \\\n",
+    "    -i \"codes_0.npy\" \\\n",
+    "    --checkpoint-path \"checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-4x1024-42hz-generator.pth\"\n",
+    "\n",
+    "from IPython.display import Audio, display\n",
+    "audio = Audio(filename=\"fake.wav\")\n",
+    "display(audio)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

+ 16 - 3
tools/api.py

@@ -77,12 +77,25 @@ async def other_exception_handler(exc: "Exception"):
     )
 
 
def load_audio(reference_audio, sr):
    """Load reference audio given either a file path or a base64 string.

    Args:
        reference_audio: Path to an existing audio file, or the base64
            encoding of an audio file's bytes.
        sr: Target sample rate, forwarded to ``librosa.load``.

    Returns:
        The first element of the ``librosa.load(..., sr=sr, mono=True)``
        result (the loaded samples).

    Raises:
        ValueError: If ``reference_audio`` is neither an existing path nor
            valid base64.
    """
    # Strings longer than 255 characters skip the filesystem probe entirely
    # and are treated as base64 payloads.
    # NOTE(review): 255 presumably mirrors the usual file-name length limit.
    if len(reference_audio) > 255 or not Path(reference_audio).exists():
        # Drop whitespace so line-wrapped base64 is still accepted under
        # strict validation.
        payload = "".join(reference_audio.split())
        try:
            # validate=True rejects non-alphabet characters instead of
            # silently discarding them, so a mistyped path such as
            # "audio/abc.wav" is reported here rather than decoded to garbage.
            audio_data = base64.b64decode(payload, validate=True)
        except ValueError as err:
            # binascii.Error is a ValueError subclass; non-ASCII input raises
            # a plain ValueError. Both get the same message.
            raise ValueError("Invalid path or base64 string") from err
        reference_audio = io.BytesIO(audio_data)

    audio, _ = librosa.load(reference_audio, sr=sr, mono=True)
    return audio
+
+
 def encode_reference(*, decoder_model, reference_audio, enable_reference_audio):
     if enable_reference_audio and reference_audio is not None:
         # Load audios, and prepare basic info here
-        reference_audio_content, _ = librosa.load(
-            reference_audio, sr=decoder_model.spec_transform.sample_rate, mono=True
+        reference_audio_content = load_audio(
+            reference_audio, decoder_model.spec_transform.sample_rate
         )
+
         audios = torch.from_numpy(reference_audio_content).to(decoder_model.device)[
             None, None, :
         ]
@@ -213,7 +226,7 @@ def inference(req: InvokeRequest):
         reference_audio=req.reference_audio,
         enable_reference_audio=req.reference_audio is not None,
     )
-
+    logger.info(f"ref_text: {req.reference_text}")
     # LLAMA Inference
     request = dict(
         device=decoder_model.device,

+ 29 - 5
tools/post_api.py

@@ -1,11 +1,29 @@
 import argparse
 import base64
 import json
+from pathlib import Path
 
 import pyaudio
 import requests
 
 
def wav_to_base64(file_path):
    """Return the contents of ``file_path`` as a base64 text string.

    Args:
        file_path: Path to the audio file; may be ``None`` or empty.

    Returns:
        The base64 encoding of the file's bytes decoded as UTF-8 text, or
        ``None`` when ``file_path`` is falsy or is not an existing regular
        file.
    """
    # is_file() rather than exists(): a directory path would pass an
    # existence check and then fail in open().
    if not file_path or not Path(file_path).is_file():
        return None
    with open(file_path, "rb") as wav_file:
        wav_content = wav_file.read()
    return base64.b64encode(wav_content).decode("utf-8")
+
+
def read_ref_text(ref_text):
    """Resolve a reference-text argument that may be a file path.

    Args:
        ref_text: Either the reference text itself or the path of a UTF-8
            text file containing it.

    Returns:
        The file's contents when ``ref_text`` names an existing regular
        file; otherwise ``ref_text`` unchanged.
    """
    try:
        path = Path(ref_text)
        is_file = path.is_file()
    except (OSError, ValueError):
        # A long literal text can exceed the OS file-name limit and make the
        # probe raise (e.g. ENAMETOOLONG); such input cannot be a path, so
        # treat it as the text itself.
        return ref_text

    if is_file:
        return path.read_text(encoding="utf-8")
    return ref_text
+
+
 def play_audio(audio_content, format, channels, rate):
     p = pyaudio.PyAudio()
     stream = p.open(format=format, channels=channels, rate=rate, output=True)
@@ -24,7 +42,7 @@ if __name__ == "__main__":
         "--url",
         "-u",
         type=str,
-        default="http://127.0.0.1:8000/v1/invoke",
+        default="http://127.0.0.1:8080/v1/invoke",
         help="URL of the server",
     )
     parser.add_argument(
@@ -34,14 +52,14 @@ if __name__ == "__main__":
         "--reference_audio",
         "-ra",
         type=str,
-        required=False,
+        default=None,
         help="Path to the WAV file",
     )
     parser.add_argument(
         "--reference_text",
         "-rt",
         type=str,
-        required=False,
+        default=None,
         help="Reference text for voice synthesis",
     )
     parser.add_argument(
@@ -80,10 +98,16 @@ if __name__ == "__main__":
 
     args = parser.parse_args()
 
+    base64_audio = wav_to_base64(args.reference_audio)
+
+    ref_text = args.reference_text
+    if ref_text:
+        ref_text = read_ref_text(ref_text)
+
     data = {
         "text": args.text,
-        "reference_text": args.reference_text,
-        "reference_audio": args.reference_audio,
+        "reference_text": ref_text,
+        "reference_audio": base64_audio,
         "max_new_tokens": args.max_new_tokens,
         "chunk_length": args.chunk_length,
         "top_p": args.top_p,