| 1 |
- {"config":{"lang":["en","zh","ja","pt","ko","ar"],"separator":"[\\s\\-,:!=\\[\\]()\"`/]+|\\.(?!\\d)|&[lg]t;|(?!\\b)(?=[A-Z][a-z])","pipeline":["stopWordFilter"],"fields":{"title":{"boost":1000.0},"text":{"boost":1.0},"tags":{"boost":1000000.0}}},"docs":[{"location":"README.ar/","title":"README.ar","text":"Fish Speech [English](../README.md) | [\u7b80\u4f53\u4e2d\u6587](README.zh.md) | [Portuguese](README.pt-BR.md) | [\u65e5\u672c\u8a9e](README.ja.md) | [\ud55c\uad6d\uc5b4](README.ko.md) | **\u0627\u0644\u0639\u0631\u0628\u064a\u0629** | [Espa\u00f1ol](docs/README.es.md) <p>[!IMPORTANT] \u0625\u0634\u0639\u0627\u0631 \u0627\u0644\u062a\u0631\u062e\u064a\u0635 \u064a\u062a\u0645 \u0625\u0635\u062f\u0627\u0631 \u0642\u0627\u0639\u062f\u0629 \u0627\u0644\u0623\u0643\u0648\u0627\u062f \u0647\u0630\u0647 \u0648\u0623\u0648\u0632\u0627\u0646 \u0627\u0644\u0646\u0645\u0627\u0630\u062c \u0627\u0644\u0645\u0631\u062a\u0628\u0637\u0629 \u0628\u0647\u0627 \u062a\u062d\u062a FISH AUDIO RESEARCH LICENSE. \u064a\u0631\u062c\u0649 \u0627\u0644\u0631\u062c\u0648\u0639 \u0625\u0644\u0649 \u0645\u0644\u0641 LICENSE \u0644\u0645\u0632\u064a\u062f \u0645\u0646 \u0627\u0644\u062a\u0641\u0627\u0635\u064a\u0644.</p> <p>[!WARNING] \u0625\u062e\u0644\u0627\u0621 \u0627\u0644\u0645\u0633\u0624\u0648\u0644\u064a\u0629 \u0627\u0644\u0642\u0627\u0646\u0648\u0646\u064a\u0629 \u0646\u062d\u0646 \u0644\u0627 \u0646\u062a\u062d\u0645\u0644 \u0623\u064a \u0645\u0633\u0624\u0648\u0644\u064a\u0629 \u0639\u0646 \u0623\u064a \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u063a\u064a\u0631 \u0642\u0627\u0646\u0648\u0646\u064a \u0644\u0642\u0627\u0639\u062f\u0629 \u0627\u0644\u0623\u0643\u0648\u0627\u062f. 
\u064a\u0631\u062c\u0649 \u0627\u0644\u0631\u062c\u0648\u0639 \u0625\u0644\u0649 \u0627\u0644\u0642\u0648\u0627\u0646\u064a\u0646 \u0627\u0644\u0645\u062d\u0644\u064a\u0629 \u0627\u0644\u0645\u062a\u0639\u0644\u0642\u0629 \u0628\u0640 DMCA \u0648\u0627\u0644\u0642\u0648\u0627\u0646\u064a\u0646 \u0627\u0644\u0623\u062e\u0631\u0649 \u0630\u0627\u062a \u0627\u0644\u0635\u0644\u0629.</p>"},{"location":"README.ar/#_1","title":"\u0627\u0644\u0628\u062f\u0627\u064a\u0629 \u0627\u0644\u0633\u0631\u064a\u0639\u0629","text":""},{"location":"README.ar/#_2","title":"\u0631\u0648\u0627\u0628\u0637 \u0627\u0644\u062a\u0648\u062b\u064a\u0642","text":"<p>\u0647\u0630\u0627 \u0647\u0648 \u0627\u0644\u062a\u0648\u062b\u064a\u0642 \u0627\u0644\u0631\u0633\u0645\u064a \u0644\u0640 Fish Audio S2\u060c \u064a\u0631\u062c\u0649 \u0627\u062a\u0628\u0627\u0639 \u0627\u0644\u062a\u0639\u0644\u064a\u0645\u0627\u062a \u0644\u0644\u0628\u062f\u0621 \u0628\u0633\u0647\u0648\u0644\u0629.</p> <ul> <li>\u0627\u0644\u062a\u062b\u0628\u064a\u062a</li> <li>\u0627\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644 \u0639\u0628\u0631 \u062e\u0637 \u0627\u0644\u0623\u0648\u0627\u0645\u0631</li> <li>\u0627\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644 \u0639\u0628\u0631 \u0648\u0627\u062c\u0647\u0629 \u0627\u0644\u0648\u064a\u0628</li> <li>\u0627\u0633\u062a\u062f\u0644\u0627\u0644 \u0627\u0644\u062e\u0627\u062f\u0645</li> <li>\u0646\u0634\u0631 Docker</li> </ul> <p>[!IMPORTANT] \u0625\u0630\u0627 \u0643\u0646\u062a \u062a\u0631\u063a\u0628 \u0641\u064a \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u062e\u0627\u062f\u0645 SGLang\u060c \u0641\u064a\u0631\u062c\u0649 \u0627\u0644\u0631\u062c\u0648\u0639 \u0625\u0644\u0649 SGLang-Omni README.</p>"},{"location":"README.ar/#llm","title":"\u062f\u0644\u064a\u0644 \u0648\u0643\u064a\u0644 LLM","text":"<pre><code>\u064a\u0631\u062c\u0649 \u0642\u0631\u0627\u0621\u0629 https://speech.fish.audio/ar/install/ \u0623\u0648\u0644\u0627\u064b\u060c 
\u0648\u062a\u062b\u0628\u064a\u062a \u0648\u062a\u0643\u0648\u064a\u0646 Fish Audio S2 \u0648\u0641\u0642\u0627\u064b \u0644\u0644\u0648\u062b\u0627\u0626\u0642.\n</code></pre>"},{"location":"README.ar/#fish-audio-s2-pro","title":"Fish Audio S2 Pro","text":"<p>\u0646\u0638\u0627\u0645 \u062a\u062d\u0648\u064a\u0644 \u0627\u0644\u0646\u0635 \u0625\u0644\u0649 \u0643\u0644\u0627\u0645 (TTS) \u0645\u062a\u0639\u062f\u062f \u0627\u0644\u0644\u063a\u0627\u062a \u0627\u0644\u0631\u0627\u0626\u062f \u0641\u064a \u0627\u0644\u0635\u0646\u0627\u0639\u0629\u060c \u0648\u0627\u0644\u0630\u064a \u064a\u0639\u064a\u062f \u062a\u0639\u0631\u064a\u0641 \u062d\u062f\u0648\u062f \u062a\u0648\u0644\u064a\u062f \u0627\u0644\u0635\u0648\u062a.</p> <p>Fish Audio S2 Pro \u0647\u0648 \u0623\u062d\u062f\u062b \u0637\u0631\u0627\u0632 \u0645\u062a\u0639\u062f\u062f \u0627\u0644\u0648\u0633\u0627\u0626\u0637 \u062a\u0645 \u062a\u0637\u0648\u064a\u0631\u0647 \u0628\u0648\u0627\u0633\u0637\u0629 Fish Audio. \u062a\u0645 \u062a\u062f\u0631\u064a\u0628\u0647 \u0639\u0644\u0649 \u0623\u0643\u062b\u0631 \u0645\u0646 10 \u0645\u0644\u0627\u064a\u064a\u0646 \u0633\u0627\u0639\u0629 \u0645\u0646 \u0627\u0644\u0628\u064a\u0627\u0646\u0627\u062a \u0627\u0644\u0635\u0648\u062a\u064a\u0629 \u0627\u0644\u0647\u0627\u0626\u0644\u0629\u060c \u0627\u0644\u062a\u064a \u062a\u063a\u0637\u064a \u0623\u0643\u062b\u0631 \u0645\u0646 80 \u0644\u063a\u0629 \u062d\u0648\u0644 \u0627\u0644\u0639\u0627\u0644\u0645. 
\u0645\u0646 \u062e\u0644\u0627\u0644 \u0628\u0646\u064a\u0629 \u062b\u0646\u0627\u0626\u064a\u0629 \u0627\u0644\u0627\u0646\u062d\u062f\u0627\u0631 \u0627\u0644\u0630\u0627\u062a\u064a (Dual-AR) \u0627\u0644\u0645\u0628\u062a\u0643\u0631\u0629 \u0648\u062a\u0642\u0646\u064a\u0629 \u062a\u0648\u0627\u0641\u0642 \u0627\u0644\u062a\u0639\u0644\u0645 \u0627\u0644\u062a\u0639\u0632\u064a\u0632\u064a (RL)\u060c \u064a\u0645\u0643\u0646 \u0644\u0640 S2 Pro \u062a\u0648\u0644\u064a\u062f \u0643\u0644\u0627\u0645 \u064a\u062a\u0645\u062a\u0639 \u0628\u0625\u062d\u0633\u0627\u0633 \u0637\u0628\u064a\u0639\u064a \u0648\u0648\u0627\u0642\u0639\u064a \u0648\u0639\u0645\u0642 \u0639\u0627\u0637\u0641\u064a \u0643\u0628\u064a\u0631\u060c \u0645\u0645\u0627 \u064a\u062c\u0639\u0644\u0647 \u0631\u0627\u0626\u062f\u0627\u064b \u0641\u064a \u0627\u0644\u0645\u0646\u0627\u0641\u0633\u0629 \u0628\u064a\u0646 \u0627\u0644\u0623\u0646\u0638\u0645\u0629 \u0627\u0644\u0645\u0641\u062a\u0648\u062d\u0629 \u0648\u0627\u0644\u0645\u063a\u0644\u0642\u0629 \u0627\u0644\u0645\u0635\u062f\u0631.</p> <p>\u062a\u0643\u0645\u0646 \u0627\u0644\u0642\u0648\u0629 \u0627\u0644\u0636\u0627\u0631\u0628\u0629 \u0644\u0640 S2 Pro \u0641\u064a \u062f\u0639\u0645\u0647 \u0644\u0644\u062a\u062d\u0643\u0645 \u0627\u0644\u062f\u0642\u064a\u0642 \u0644\u0644\u063a\u0627\u064a\u0629 \u0641\u064a \u0627\u0644\u0646\u0628\u0631\u0629 \u0648\u0627\u0644\u0639\u0627\u0637\u0641\u0629 \u0639\u0644\u0649 \u0645\u0633\u062a\u0648\u0649 \u0645\u0627 \u062f\u0648\u0646 \u0627\u0644\u0643\u0644\u0645\u0629 (Sub-word Level) \u0645\u0646 \u062e\u0644\u0627\u0644 \u0648\u0633\u0648\u0645 \u0627\u0644\u0644\u063a\u0629 \u0627\u0644\u0637\u0628\u064a\u0639\u064a\u0629 (\u0645\u062b\u0644 <code>[whisper]</code> \u0648 <code>[excited]</code> \u0648 <code>[angry]</code>) \u060c \u0645\u0639 \u062f\u0639\u0645 \u0623\u0635\u0644\u064a \u0644\u062a\u0648\u0644\u064a\u062f \u0645\u062a\u062d\u062f\u062b\u064a\u0646 
\u0645\u062a\u0639\u062f\u062f\u064a\u0646 \u0648\u062d\u0648\u0627\u0631\u0627\u062a \u0645\u062a\u0639\u062f\u062f\u0629 \u0627\u0644\u062c\u0648\u0644\u0627\u062a \u0628\u0633\u064a\u0627\u0642 \u0637\u0648\u064a\u0644 \u062c\u062f\u0627\u064b.</p> <p>\u062a\u0641\u0636\u0644 \u0628\u0632\u064a\u0627\u0631\u0629 \u0645\u0648\u0642\u0639 Fish Audio \u0627\u0644\u0631\u0633\u0645\u064a \u0627\u0644\u0622\u0646 \u0644\u062a\u062c\u0631\u0628\u0629 \u0627\u0644\u0639\u0631\u0636 \u0627\u0644\u0645\u0628\u0627\u0634\u0631\u060c \u0623\u0648 \u0627\u0642\u0631\u0623 \u062a\u0642\u0631\u064a\u0631\u0646\u0627 \u0627\u0644\u0641\u0646\u064a \u0648\u0645\u0642\u0627\u0644 \u0627\u0644\u0645\u062f\u0648\u0646\u0629 \u0644\u0644\u062a\u0639\u0631\u0641 \u0639\u0644\u0649 \u0627\u0644\u0645\u0632\u064a\u062f.</p>"},{"location":"README.ar/#_3","title":"\u0645\u062a\u063a\u064a\u0631\u0627\u062a \u0627\u0644\u0646\u0645\u0648\u0630\u062c","text":"\u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0627\u0644\u062d\u062c\u0645 \u0627\u0644\u062a\u0648\u0641\u0631 \u0627\u0644\u0648\u0635\u0641 S2-Pro 4 \u0645\u0644\u064a\u0627\u0631 \u0645\u0639\u0644\u0645\u0629 HuggingFace \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0627\u0644\u0631\u0627\u0626\u062f \u0643\u0627\u0645\u0644 \u0627\u0644\u0645\u064a\u0632\u0627\u062a\u060c \u0645\u0639 \u0623\u0639\u0644\u0649 \u062c\u0648\u062f\u0629 \u0648\u0627\u0633\u062a\u0642\u0631\u0627\u0631 <p>\u0644\u0645\u0632\u064a\u062f \u0645\u0646 \u0627\u0644\u062a\u0641\u0627\u0635\u064a\u0644 \u062d\u0648\u0644 \u0627\u0644\u0646\u0645\u0627\u0630\u062c\u060c \u064a\u0631\u062c\u0649 \u0645\u0631\u0627\u062c\u0639\u0629 \u0627\u0644\u062a\u0642\u0631\u064a\u0631 \u0627\u0644\u0641\u0646\u064a.</p>"},{"location":"README.ar/#benchmarks","title":"\u0646\u062a\u0627\u0626\u062c \u0627\u0644\u0627\u062e\u062a\u0628\u0627\u0631\u0627\u062a \u0627\u0644\u0645\u0631\u062c\u0639\u064a\u0629 
(Benchmarks)","text":"\u0627\u0644\u0627\u062e\u062a\u0628\u0627\u0631 Fish Audio S2 Seed-TTS Eval \u2014 WER (\u0627\u0644\u0635\u064a\u0646\u064a\u0629) 0.54% (\u0627\u0644\u0623\u0641\u0636\u0644 \u0625\u062c\u0645\u0627\u0644\u0627\u064b) Seed-TTS Eval \u2014 WER (\u0627\u0644\u0625\u0646\u062c\u0644\u064a\u0632\u064a\u0629) 0.99% (\u0627\u0644\u0623\u0641\u0636\u0644 \u0625\u062c\u0645\u0627\u0644\u0627\u064b) Audio Turing Test (\u0645\u0639 \u0627\u0644\u062a\u0639\u0644\u064a\u0645\u0627\u062a) 0.515 \u0645\u062a\u0648\u0633\u0637 \u062e\u0644\u0641\u064a (Posterior mean) EmergentTTS-Eval \u2014 \u0645\u0639\u062f\u0644 \u0627\u0644\u0641\u0648\u0632 81.88% (\u0627\u0644\u0623\u0639\u0644\u0649 \u0625\u062c\u0645\u0627\u0644\u0627\u064b) Fish Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 \u0627\u0644\u062c\u0648\u062f\u0629 4.51 / 5.0 \u0645\u062a\u0639\u062f\u062f \u0627\u0644\u0644\u063a\u0627\u062a (MiniMax Testset) \u2014 \u0623\u0641\u0636\u0644 WER 11 \u0644\u063a\u0629 \u0645\u0646 \u0623\u0635\u0644 24 \u0645\u062a\u0639\u062f\u062f \u0627\u0644\u0644\u063a\u0627\u062a (MiniMax Testset) \u2014 \u0623\u0641\u0636\u0644 SIM 17 \u0644\u063a\u0629 \u0645\u0646 \u0623\u0635\u0644 24 <p>\u0641\u064a \u062a\u0642\u064a\u064a\u0645 Seed-TTS\u060c \u062d\u0642\u0642 S2 \u0623\u0642\u0644 \u0645\u0639\u062f\u0644 \u062e\u0637\u0623 \u0641\u064a \u0627\u0644\u0643\u0644\u0645\u0627\u062a (WER) \u0628\u064a\u0646 \u062c\u0645\u064a\u0639 \u0627\u0644\u0646\u0645\u0627\u0630\u062c \u0627\u0644\u062a\u064a \u062a\u0645 \u062a\u0642\u064a\u064a\u0645\u0647\u0627 (\u0628\u0645\u0627 \u0641\u064a \u0630\u0644\u0643 \u0627\u0644\u0623\u0646\u0638\u0645\u0629 \u0645\u063a\u0644\u0642\u0629 \u0627\u0644\u0645\u0635\u062f\u0631): Qwen3-TTS (0.77/1.24)\u060c \u0648 MiniMax Speech-02 (0.99/1.90)\u060c \u0648 Seed-TTS (1.12/2.25). 
\u0648\u0641\u064a \u0627\u062e\u062a\u0628\u0627\u0631 Audio Turing Test\u060c \u0633\u062c\u0644 S2 \u0642\u064a\u0645\u0629 0.515 \u0628\u0632\u064a\u0627\u062f\u0629 \u0642\u062f\u0631\u0647\u0627 24% \u0645\u0642\u0627\u0631\u0646\u0629 \u0628\u0640 Seed-TTS (0.417) \u0648 33% \u0645\u0642\u0627\u0631\u0646\u0629 \u0628\u0640 MiniMax-Speech (0.387). \u0648\u0641\u064a EmergentTTS-Eval\u060c \u062a\u0645\u064a\u0632 S2 \u0628\u0634\u0643\u0644 \u062e\u0627\u0635 \u0641\u064a \u0623\u0628\u0639\u0627\u062f \u0645\u062b\u0644 \u0627\u0644\u0644\u063a\u0648\u064a\u0627\u062a \u0627\u0644\u0645\u0635\u0627\u062d\u0628\u0629 (\u0645\u0639\u062f\u0644 \u0641\u0648\u0632 91.61%)\u060c \u0648\u0627\u0644\u062c\u0645\u0644 \u0627\u0644\u0627\u0633\u062a\u0641\u0647\u0627\u0645\u064a\u0629 (84.41%)\u060c \u0648\u0627\u0644\u062a\u0639\u0642\u064a\u062f \u0627\u0644\u0646\u062d\u0648\u064a (83.39%).</p>"},{"location":"README.ar/#_4","title":"\u0623\u0628\u0631\u0632 \u0627\u0644\u0645\u0645\u064a\u0632\u0627\u062a","text":""},{"location":"README.ar/#_5","title":"\u062a\u062d\u0643\u0645 \u062f\u0642\u064a\u0642 \u0644\u0644\u063a\u0627\u064a\u0629 \u0639\u0628\u0631 \u0627\u0644\u0644\u063a\u0629 \u0627\u0644\u0637\u0628\u064a\u0639\u064a\u0629","text":"<p>\u064a\u0645\u0646\u062d S2 Pro \u0627\u0644\u0635\u0648\u062a \"\u0631\u0648\u062d\u0627\u064b\" \u0644\u0627 \u0645\u062b\u064a\u0644 \u0644\u0647\u0627. \u0645\u0646 \u062e\u0644\u0627\u0644 \u0635\u064a\u063a\u0629 <code>[tag]</code> \u0627\u0644\u0628\u0633\u064a\u0637\u0629\u060c \u064a\u0645\u0643\u0646\u0643 \u062a\u0636\u0645\u064a\u0646 \u062a\u0639\u0644\u064a\u0645\u0627\u062a \u0639\u0627\u0637\u0641\u064a\u0629 \u0628\u062f\u0642\u0629 \u0641\u064a \u0623\u064a \u0645\u0648\u0636\u0639 \u0645\u0646 \u0627\u0644\u0646\u0635. 
- \u062f\u0639\u0645 \u0623\u0643\u062b\u0631 \u0645\u0646 15,000 \u0648\u0633\u0645 \u0641\u0631\u064a\u062f: \u0644\u0627 \u064a\u0642\u062a\u0635\u0631 \u0639\u0644\u0649 \u0627\u0644\u0625\u0639\u062f\u0627\u062f\u0627\u062a \u0627\u0644\u0645\u0633\u0628\u0642\u0629 \u0627\u0644\u062b\u0627\u0628\u062a\u0629\u060c \u0628\u0644 \u064a\u062f\u0639\u0645 \u0623\u0648\u0635\u0627\u0641 \u0627\u0644\u0646\u0635 \u0627\u0644\u062d\u0631. \u064a\u0645\u0643\u0646\u0643 \u062a\u062c\u0631\u0628\u0629 <code>[whisper in small voice]</code> (\u0647\u0645\u0633 \u0628\u0635\u0648\u062a \u0645\u0646\u062e\u0641\u0636)\u060c \u0623\u0648 <code>[professional broadcast tone]</code> (\u0646\u0628\u0631\u0629 \u0625\u0630\u0627\u0639\u064a\u0629 \u0627\u062d\u062a\u0631\u0627\u0641\u064a\u0629)\u060c \u0623\u0648 <code>[pitch up]</code> (\u0631\u0641\u0639 \u0637\u0628\u0642\u0629 \u0627\u0644\u0635\u0648\u062a). - \u0645\u0643\u062a\u0628\u0629 \u0639\u0648\u0627\u0637\u0641 \u063a\u0646\u064a\u0629: <code>[pause]</code> <code>[emphasis]</code> <code>[laughing]</code> <code>[inhale]</code> <code>[chuckle]</code> <code>[tsk]</code> <code>[singing]</code> <code>[excited]</code> <code>[laughing tone]</code> <code>[interrupting]</code> <code>[chuckling]</code> <code>[excited tone]</code> <code>[volume up]</code> <code>[echo]</code> <code>[angry]</code> <code>[low volume]</code> <code>[sigh]</code> <code>[low voice]</code> <code>[whisper]</code> <code>[screaming]</code> <code>[shouting]</code> <code>[loud]</code> <code>[surprised]</code> <code>[short pause]</code> <code>[exhale]</code> <code>[delight]</code> <code>[panting]</code> <code>[audience laughter]</code> <code>[with strong accent]</code> <code>[volume down]</code> <code>[clearing throat]</code> <code>[sad]</code> <code>[moaning]</code> <code>[shocked]</code></p>"},{"location":"README.ar/#dual-autoregressive","title":"\u0628\u0646\u064a\u0629 \u0645\u0628\u062a\u0643\u0631\u0629 \u062b\u0646\u0627\u0626\u064a\u0629 
\u0627\u0644\u0627\u0646\u062d\u062f\u0627\u0631 \u0627\u0644\u0630\u0627\u062a\u064a (Dual-Autoregressive)","text":"<p>\u064a\u0639\u062a\u0645\u062f S2 Pro \u0628\u0646\u064a\u0629 Dual-AR \u0628\u0646\u0638\u0627\u0645 \"\u0631\u0626\u064a\u0633\u064a-\u062a\u0627\u0628\u0639\"\u060c \u062a\u062a\u0643\u0648\u0646 \u0645\u0646 Decoder-only Transformer \u0648\u062a\u0631\u0645\u064a\u0632 \u0635\u0648\u062a\u064a RVQ (10 \u0642\u0648\u0627\u0645\u064a\u0633 \u0623\u0643\u0648\u0627\u062f\u060c \u0628\u0645\u0639\u062f\u0644 \u0625\u0637\u0627\u0631\u0627\u062a \u064a\u0628\u0644\u063a \u062d\u0648\u0627\u0644\u064a 21 \u0647\u0631\u062a\u0632):</p> <ul> <li>Slow AR (4 \u0645\u0644\u064a\u0627\u0631 \u0645\u0639\u0644\u0645\u0629): \u064a\u0639\u0645\u0644 \u0639\u0644\u0649 \u0637\u0648\u0644 \u0627\u0644\u0645\u062d\u0648\u0631 \u0627\u0644\u0632\u0645\u0646\u064a\u060c \u0648\u064a\u062a\u0646\u0628\u0623 \u0628\u0642\u0627\u0645\u0648\u0633 \u0627\u0644\u0623\u0643\u0648\u0627\u062f \u0627\u0644\u062f\u0644\u0627\u0644\u064a \u0627\u0644\u0623\u0633\u0627\u0633\u064a.</li> <li>Fast AR (400 \u0645\u0644\u064a\u0648\u0646 \u0645\u0639\u0644\u0645\u0629): \u064a\u0648\u0644\u062f \u0627\u0644\u0640 9 \u0642\u0648\u0627\u0645\u064a\u0633 \u0627\u0644\u0645\u062a\u0628\u0642\u064a\u0629 \u0641\u064a \u0643\u0644 \u062e\u0637\u0648\u0629 \u0632\u0645\u0646\u064a\u0629\u060c \u0644\u0627\u0633\u062a\u0639\u0627\u062f\u0629 \u0623\u062f\u0642 \u0627\u0644\u062a\u0641\u0627\u0635\u064a\u0644 \u0627\u0644\u0635\u0648\u062a\u064a\u0629 \u0628\u0628\u0631\u0627\u0639\u0629.</li> </ul> <p>\u064a\u062d\u0642\u0642 \u0647\u0630\u0627 \u0627\u0644\u062a\u0635\u063a\u064a\u0631 \u063a\u064a\u0631 \u0627\u0644\u0645\u062a\u0645\u0627\u062b\u0644 \u0623\u0642\u0635\u0649 \u062f\u0631\u062c\u0627\u062a \u0627\u0644\u062f\u0642\u0629 \u0627\u0644\u0635\u0648\u062a\u064a\u0629 \u0645\u0639 \u0632\u064a\u0627\u062f\u0629 \u0633\u0631\u0639\u0629 
\u0627\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644 \u0628\u0634\u0643\u0644 \u0643\u0628\u064a\u0631.</p>"},{"location":"README.ar/#rl-alignment","title":"\u062a\u0648\u0627\u0641\u0642 \u0627\u0644\u062a\u0639\u0644\u0645 \u0627\u0644\u062a\u0639\u0632\u064a\u0632\u064a (RL Alignment)","text":"<p>\u064a\u0633\u062a\u062e\u062f\u0645 S2 Pro \u062a\u0642\u0646\u064a\u0629 Group Relative Policy Optimization (GRPO) \u0644\u0644\u062a\u0648\u0627\u0641\u0642 \u0628\u0639\u062f \u0627\u0644\u062a\u062f\u0631\u064a\u0628. \u0646\u0633\u062a\u062e\u062f\u0645 \u0646\u0641\u0633 \u0645\u062c\u0645\u0648\u0639\u0629 \u0627\u0644\u0646\u0645\u0627\u0630\u062c \u0627\u0644\u0645\u0633\u062a\u062e\u062f\u0645\u0629 \u0641\u064a \u062a\u0646\u0638\u064a\u0641 \u0627\u0644\u0628\u064a\u0627\u0646\u0627\u062a \u0648\u062a\u0635\u0646\u064a\u0641\u0647\u0627 \u0645\u0628\u0627\u0634\u0631\u0629 \u0643\u0646\u0645\u0627\u0630\u062c \u0645\u0643\u0627\u0641\u0623\u0629 (Reward Model)\u060c \u0645\u0645\u0627 \u064a\u062d\u0644 \u0628\u0634\u0643\u0644 \u0645\u062b\u0627\u0644\u064a \u0645\u0634\u0643\u0644\u0629 \u0639\u062f\u0645 \u0627\u0644\u062a\u0637\u0627\u0628\u0642 \u0628\u064a\u0646 \u062a\u0648\u0632\u064a\u0639 \u0628\u064a\u0627\u0646\u0627\u062a \u0645\u0627 \u0642\u0628\u0644 \u0627\u0644\u062a\u062f\u0631\u064a\u0628 \u0648\u0623\u0647\u062f\u0627\u0641 \u0645\u0627 \u0628\u0639\u062f \u0627\u0644\u062a\u062f\u0631\u064a\u0628. 
- \u0625\u0634\u0627\u0631\u0627\u062a \u0645\u0643\u0627\u0641\u0623\u0629 \u0645\u062a\u0639\u062f\u062f\u0629 \u0627\u0644\u0623\u0628\u0639\u0627\u062f: \u062a\u0642\u064a\u064a\u0645 \u0634\u0627\u0645\u0644 \u0644\u0644\u062f\u0642\u0629 \u0627\u0644\u062f\u0644\u0627\u0644\u064a\u0629\u060c \u0648\u0627\u0644\u0642\u062f\u0631\u0629 \u0639\u0644\u0649 \u0627\u062a\u0628\u0627\u0639 \u0627\u0644\u062a\u0639\u0644\u064a\u0645\u0627\u062a\u060c \u0648\u062a\u0633\u062c\u064a\u0644 \u0627\u0644\u062a\u0641\u0636\u064a\u0644 \u0627\u0644\u0635\u0648\u062a\u064a\u060c \u0648\u062a\u0645\u0627\u062b\u0644 \u0646\u0628\u0631\u0629 \u0627\u0644\u0635\u0648\u062a\u060c \u0644\u0636\u0645\u0627\u0646 \u0623\u0646 \u0643\u0644 \u062b\u0627\u0646\u064a\u0629 \u0645\u0646 \u0627\u0644\u0643\u0644\u0627\u0645 \u0627\u0644\u0645\u0648\u0644\u062f \u062a\u062a\u0648\u0627\u0641\u0642 \u0645\u0639 \u0627\u0644\u062d\u062f\u0633 \u0627\u0644\u0628\u0634\u0631\u064a.</p>"},{"location":"README.ar/#sglang","title":"\u0623\u062f\u0627\u0621 \u0627\u0633\u062a\u062f\u0644\u0627\u0644 \u062a\u062f\u0641\u0642\u064a \u0641\u0627\u0626\u0642 (\u064a\u0639\u062a\u0645\u062f \u0639\u0644\u0649 SGLang)","text":"<p>\u0646\u0638\u0631\u0627\u064b \u0644\u0623\u0646 \u0628\u0646\u064a\u0629 Dual-AR \u062a\u062a\u0645\u0627\u062b\u0644 \u0647\u064a\u0643\u0644\u064a\u0627\u064b \u0645\u0639 \u0628\u0646\u064a\u0629 LLM \u0627\u0644\u0642\u064a\u0627\u0633\u064a\u0629\u060c \u0641\u0625\u0646 S2 Pro \u064a\u062f\u0639\u0645 \u0623\u0635\u0644\u0627\u064b \u062c\u0645\u064a\u0639 \u0645\u064a\u0632\u0627\u062a \u062a\u0633\u0631\u064a\u0639 \u0627\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644 \u0641\u064a SGLang\u060c \u0628\u0645\u0627 \u0641\u064a \u0630\u0644\u0643 \u0627\u0644\u062f\u0641\u0639\u0627\u062a \u0627\u0644\u0645\u0633\u062a\u0645\u0631\u0629 (Continuous Batching)\u060c \u0648 Paged KV Cache\u060c \u0648 CUDA Graph\u060c \u0648\u0627\u0644\u062a\u062e\u0632\u064a\u0646 
\u0627\u0644\u0645\u0624\u0642\u062a \u0644\u0644\u0628\u0627\u062f\u0626\u0629 \u0627\u0644\u0642\u0627\u0626\u0645 \u0639\u0644\u0649 RadixAttention.</p> <p>\u0623\u062f\u0627\u0621 \u0648\u062d\u062f\u0629 \u0645\u0639\u0627\u0644\u062c\u0629 \u0631\u0633\u0648\u0645\u0627\u062a NVIDIA H200 \u0648\u0627\u062d\u062f\u0629: - \u0639\u0627\u0645\u0644 \u0627\u0644\u0648\u0642\u062a \u0627\u0644\u062d\u0642\u064a\u0642\u064a (RTF): 0.195 - \u062a\u0623\u062e\u0631 \u0627\u0644\u0635\u0648\u062a \u0627\u0644\u0623\u0648\u0644 (TTFA): \u062d\u0648\u0627\u0644\u064a 100 \u0645\u0644\u0644\u064a \u062b\u0627\u0646\u064a\u0629 - \u0625\u0646\u062a\u0627\u062c\u064a\u0629 \u0641\u0627\u0626\u0642\u0629 \u0627\u0644\u0633\u0631\u0639\u0629: \u062a\u0635\u0644 \u0625\u0644\u0649 3000+ \u0648\u0633\u0645 \u0635\u0648\u062a\u064a/\u062b\u0627\u0646\u064a\u0629 \u0645\u0639 \u0627\u0644\u062d\u0641\u0627\u0638 \u0639\u0644\u0649 RTF < 0.5</p>"},{"location":"README.ar/#_6","title":"\u062f\u0639\u0645 \u0642\u0648\u064a \u0644\u0644\u063a\u0627\u062a \u0627\u0644\u0645\u062a\u0639\u062f\u062f\u0629","text":"<p>\u064a\u062f\u0639\u0645 S2 Pro \u0623\u0643\u062b\u0631 \u0645\u0646 80 \u0644\u063a\u0629\u060c \u0645\u0645\u0627 \u064a\u062a\u064a\u062d \u062a\u0631\u0643\u064a\u0628\u0627\u064b \u0639\u0627\u0644\u064a\u0627\u064b \u0627\u0644\u062c\u0648\u062f\u0629 \u062f\u0648\u0646 \u0627\u0644\u062d\u0627\u062c\u0629 \u0625\u0644\u0649 \u0648\u062d\u062f\u0627\u062a \u0635\u0648\u062a\u064a\u0629 (phonemes) \u0623\u0648 \u0645\u0639\u0627\u0644\u062c\u0629 \u0645\u062d\u062f\u062f\u0629 \u0644\u0643\u0644 \u0644\u063a\u0629:</p> <ul> <li>\u0627\u0644\u0645\u0633\u062a\u0648\u0649 \u0627\u0644\u0623\u0648\u0644 (Tier 1): \u0627\u0644\u064a\u0627\u0628\u0627\u0646\u064a\u0629 (ja)\u060c \u0627\u0644\u0625\u0646\u062c\u0644\u064a\u0632\u064a\u0629 (en)\u060c \u0627\u0644\u0635\u064a\u0646\u064a\u0629 (zh)</li> <li>\u0627\u0644\u0645\u0633\u062a\u0648\u0649 
\u0627\u0644\u062b\u0627\u0646\u064a (Tier 2): \u0627\u0644\u0643\u0648\u0631\u064a\u0629 (ko)\u060c \u0627\u0644\u0625\u0633\u0628\u0627\u0646\u064a\u0629 (es)\u060c \u0627\u0644\u0628\u0631\u062a\u063a\u0627\u0644\u064a\u0629 (pt)\u060c \u0627\u0644\u0639\u0631\u0628\u064a\u0629 (ar)\u060c \u0627\u0644\u0631\u0648\u0633\u064a\u0629 (ru)\u060c \u0627\u0644\u0641\u0631\u0646\u0633\u064a\u0629 (fr)\u060c \u0627\u0644\u0623\u0644\u0645\u0627\u0646\u064a\u0629 (de)</li> <li>\u062a\u063a\u0637\u064a\u0629 \u0639\u0627\u0644\u0645\u064a\u0629: sv, it, tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo \u0648\u0627\u0644\u0645\u0632\u064a\u062f.</li> </ul>"},{"location":"README.ar/#_7","title":"\u062a\u0648\u0644\u064a\u062f \u0645\u062a\u062d\u062f\u062b\u064a\u0646 \u0645\u062a\u0639\u062f\u062f\u064a\u0646 \u0623\u0635\u0644\u064a","text":"<p>\u064a\u0633\u0645\u062d Fish Audio S2 \u0644\u0644\u0645\u0633\u062a\u062e\u062f\u0645\u064a\u0646 \u0628\u062a\u062d\u0645\u064a\u0644 \u0639\u064a\u0646\u0629 \u0645\u0631\u062c\u0639\u064a\u0629 \u062a\u062d\u062a\u0648\u064a \u0639\u0644\u0649 \u0645\u062a\u062d\u062f\u062b\u064a\u0646 \u0645\u062a\u0639\u062f\u062f\u064a\u0646\u060c \u0648\u0633\u064a\u0642\u0648\u0645 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0628\u0645\u0639\u0627\u0644\u062c\u0629 \u0645\u064a\u0632\u0627\u062a \u0643\u0644 \u0645\u062a\u062d\u062f\u062b \u0639\u0628\u0631 \u0648\u0633\u0645 <code><|speaker:i|></code>. 
\u0628\u0639\u062f \u0630\u0644\u0643\u060c \u064a\u0645\u0643\u0646\u0643 \u0627\u0644\u062a\u062d\u0643\u0645 \u0641\u064a \u0623\u062f\u0627\u0621 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0639\u0628\u0631 \u0648\u0633\u0645 \u0645\u0639\u0631\u0641 \u0627\u0644\u0645\u062a\u062d\u062f\u062b\u060c \u0645\u0645\u0627 \u064a\u062a\u064a\u062d \u0644\u062a\u0648\u0644\u064a\u062f \u0648\u0627\u062d\u062f \u0623\u0646 \u064a\u062a\u0636\u0645\u0646 \u0645\u062a\u062d\u062f\u062b\u064a\u0646 \u0645\u062a\u0639\u062f\u062f\u064a\u0646. \u0644\u0645 \u062a\u0639\u062f \u0647\u0646\u0627\u0643 \u062d\u0627\u062c\u0629 \u0644\u062a\u062d\u0645\u064a\u0644 \u0639\u064a\u0646\u0629 \u0645\u0631\u062c\u0639\u064a\u0629 \u0645\u0646\u0641\u0635\u0644\u0629 \u0648\u062a\u0648\u0644\u064a\u062f \u0635\u0648\u062a \u0644\u0643\u0644 \u0645\u062a\u062d\u062f\u062b \u0639\u0644\u0649 \u062d\u062f\u0629 \u0643\u0645\u0627 \u0643\u0627\u0646 \u0641\u064a \u0627\u0644\u0633\u0627\u0628\u0642.</p>"},{"location":"README.ar/#_8","title":"\u062a\u0648\u0644\u064a\u062f \u062d\u0648\u0627\u0631\u0627\u062a \u0645\u062a\u0639\u062f\u062f\u0629 \u0627\u0644\u062c\u0648\u0644\u0627\u062a","text":"<p>\u0628\u0641\u0636\u0644 \u062a\u0648\u0633\u064a\u0639 \u0633\u064a\u0627\u0642 \u0627\u0644\u0646\u0645\u0648\u0630\u062c\u060c \u064a\u0645\u0643\u0646 \u0644\u0646\u0645\u0648\u0630\u062c\u0646\u0627 \u0627\u0644\u0622\u0646 \u0627\u0644\u0627\u0633\u062a\u0641\u0627\u062f\u0629 \u0645\u0646 \u0627\u0644\u0645\u0639\u0644\u0648\u0645\u0627\u062a \u0627\u0644\u0633\u0627\u0628\u0642\u0629 \u0644\u062a\u062d\u0633\u064a\u0646 \u0627\u0644\u062a\u0639\u0628\u064a\u0631 \u0641\u064a \u0627\u0644\u0645\u062d\u062a\u0648\u0649 \u0627\u0644\u0645\u0648\u0644\u062f \u0644\u0627\u062d\u0642\u0627\u064b\u060c \u0645\u0645\u0627 \u064a\u0639\u0632\u0632 \u0645\u0646 \u0637\u0628\u064a\u0639\u064a\u0629 
\u0627\u0644\u0645\u062d\u062a\u0648\u0649.</p>"},{"location":"README.ar/#_9","title":"\u0627\u0633\u062a\u0646\u0633\u0627\u062e \u0627\u0644\u0635\u0648\u062a \u0627\u0644\u0633\u0631\u064a\u0639","text":"<p>\u064a\u062f\u0639\u0645 Fish Audio S2 \u0627\u0633\u062a\u0646\u0633\u0627\u062e\u0627\u064b \u062f\u0642\u064a\u0642\u0627\u064b \u0644\u0644\u0635\u0648\u062a \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0639\u064a\u0646\u0627\u062a \u0645\u0631\u062c\u0639\u064a\u0629 \u0642\u0635\u064a\u0631\u0629 (\u0639\u0627\u062f\u0629\u064b 10-30 \u062b\u0627\u0646\u064a\u0629). \u064a\u0644\u062a\u0642\u0637 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0646\u0628\u0631\u0629 \u0627\u0644\u0635\u0648\u062a \u0648\u0623\u0633\u0644\u0648\u0628 \u0627\u0644\u0643\u0644\u0627\u0645 \u0648\u0627\u0644\u0645\u064a\u0648\u0644 \u0627\u0644\u0639\u0627\u0637\u0641\u064a\u0629\u060c \u0645\u0645\u0627 \u064a\u0648\u0644\u062f \u0623\u0635\u0648\u0627\u062a\u0627\u064b \u0645\u0633\u062a\u0646\u0633\u062e\u0629 \u0648\u0627\u0642\u0639\u064a\u0629 \u0648\u0645\u062a\u0633\u0642\u0629 \u062f\u0648\u0646 \u0627\u0644\u062d\u0627\u062c\u0629 \u0625\u0644\u0649 \u0636\u0628\u0637 \u062f\u0642\u064a\u0642 \u0625\u0636\u0627\u0641\u064a. 
\u0644\u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u062e\u0627\u062f\u0645 SGLang\u060c \u064a\u0631\u062c\u0649 \u0627\u0644\u0631\u062c\u0648\u0639 \u0625\u0644\u0649 SGLang-Omni README.</p>"},{"location":"README.ar/#_10","title":"\u0634\u0643\u0631 \u0648\u062a\u0642\u062f\u064a\u0631","text":"<ul> <li>VITS2 (daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"README.ar/#_11","title":"\u0627\u0644\u062a\u0642\u0631\u064a\u0631 \u0627\u0644\u0641\u0646\u064a","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2603.08823}, \n}\n</code></pre>"},{"location":"README.es/","title":"README.es","text":"Fish Speech [English](../README.md) | [\u7b80\u4f53\u4e2d\u6587](docs/README.zh.md) | [Portuguese](docs/README.pt-BR.md) | [\u65e5\u672c\u8a9e](docs/README.ja.md) | [\ud55c\uad6d\uc5b4](docs/README.ko.md) | [\u0627\u0644\u0639\u0631\u0628\u064a\u0629](docs/README.ar.md) | **Espa\u00f1ol** <p>[!IMPORTANT] Aviso de Licencia Este c\u00f3digo y los pesos de modelo asociados se publican bajo la FISH AUDIO RESEARCH LICENSE. Consulta LICENSE para m\u00e1s detalles. 
Se tomar\u00e1n acciones ante cualquier violaci\u00f3n de la licencia.</p> <p>[!WARNING] Descargo de Responsabilidad Legal No asumimos ninguna responsabilidad por el uso ilegal de este c\u00f3digo. Consulta las leyes locales relacionadas con DMCA y otras normativas aplicables.</p>"},{"location":"README.es/#inicio-rapido","title":"Inicio R\u00e1pido","text":""},{"location":"README.es/#para-humanos","title":"Para humanos","text":"<p>Aqu\u00ed tienes la documentaci\u00f3n oficial de Fish Audio S2. Sigue las instrucciones para comenzar f\u00e1cilmente.</p> <ul> <li>Instalaci\u00f3n</li> <li>Inferencia por l\u00ednea de comandos</li> <li>Inferencia con WebUI</li> <li>Inferencia en servidor</li> <li>Configuraci\u00f3n de Docker</li> </ul> <p>[!IMPORTANT] Para el servidor SGLang, consulta SGLang-Omni README.</p>"},{"location":"README.es/#para-agentes-llm","title":"Para agentes LLM","text":"<pre><code>Instala y configura Fish-Audio S2 siguiendo las instrucciones aqu\u00ed: https://speech.fish.audio/install/\n</code></pre>"},{"location":"README.es/#fish-audio-s2-pro","title":"Fish Audio S2 Pro","text":"<p>Sistema de texto a voz (TTS) multiling\u00fce de \u00faltima generaci\u00f3n, redefiniendo los l\u00edmites de la generaci\u00f3n de voz.</p> <p>Fish Audio S2 Pro es el modelo multimodal m\u00e1s avanzado desarrollado por Fish Audio. 
Entrenado con m\u00e1s de 10 millones de horas de datos de audio que abarcan m\u00e1s de 80 idiomas, S2 Pro combina una arquitectura Dual-Autoregressive (Dual-AR) con alineaci\u00f3n mediante aprendizaje por refuerzo (RL) para generar voz extremadamente natural, realista y emocionalmente rica, liderando tanto sistemas open-source como closed-source.</p> <p>La principal fortaleza de S2 Pro es su soporte para control fino a nivel sub-palabra (sub-word level) de prosodia y emoci\u00f3n usando etiquetas en lenguaje natural (por ejemplo <code>[whisper]</code>, <code>[excited]</code>, <code>[angry]</code>), adem\u00e1s de soportar de forma nativa generaci\u00f3n multi-speaker y conversaciones multi-turno.</p> <p>Visita el sitio web de Fish Audio para probarlo en vivo, o lee el informe t\u00e9cnico y el blog para m\u00e1s detalles.</p>"},{"location":"README.es/#variantes-del-modelo","title":"Variantes del modelo","text":"Modelo Tama\u00f1o Disponibilidad Descripci\u00f3n S2-Pro 4B par\u00e1metros HuggingFace Modelo insignia completo con m\u00e1xima calidad y estabilidad <p>M\u00e1s detalles pueden encontrarse en el informe t\u00e9cnico.</p>"},{"location":"README.es/#resultados-de-benchmarks","title":"Resultados de benchmarks","text":"Benchmark Fish Audio S2 Seed-TTS Eval \u2014 WER (Chino) 0.54% (mejor global) Seed-TTS Eval \u2014 WER (Ingl\u00e9s) 0.99% (mejor global) Audio Turing Test (con instrucciones) 0.515 media posterior EmergentTTS-Eval \u2014 Tasa de victoria 81.88% (m\u00e1ximo global) Fish Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 Calidad 4.51 / 5.0 Multiling\u00fce (MiniMax Testset) \u2014 Mejor WER 11 de 24 idiomas Multiling\u00fce (MiniMax Testset) \u2014 Mejor SIM 17 de 24 idiomas <p>En Seed-TTS Eval, S2 logra el menor WER entre todos los modelos evaluados, incluyendo sistemas cerrados: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). 
En el Audio Turing Test, 0.515 supera a Seed-TTS (0.417) en un 24% y a MiniMax-Speech (0.387) en un 33%. En EmergentTTS-Eval, S2 destaca especialmente en paraling\u00fc\u00edstica (91.61%), preguntas (84.41%) y complejidad sint\u00e1ctica (83.39%).</p>"},{"location":"README.es/#highlights","title":"Highlights","text":""},{"location":"README.es/#control-fino-inline-mediante-lenguaje-natural","title":"Control fino inline mediante lenguaje natural","text":"<p>S2 Pro aporta un nivel de \u201calma\u201d sin precedentes a la voz. Usando sintaxis <code>[tag]</code>, puedes insertar instrucciones emocionales con precisi\u00f3n en cualquier parte del texto.</p> <ul> <li>M\u00e1s de 15,000 tags \u00fanicos soportados</li> <li>Soporta descripciones libres como <code>[whisper in small voice]</code>, <code>[professional broadcast tone]</code>, <code>[pitch up]</code></li> </ul>"},{"location":"README.es/#arquitectura-dual-autoregressive-dual-ar","title":"Arquitectura Dual-Autoregressive (Dual-AR)","text":"<ul> <li>Slow AR (4B par\u00e1metros): modela la estructura temporal</li> <li>Fast AR (400M par\u00e1metros): reconstruye detalles ac\u00fasticos finos</li> </ul>"},{"location":"README.es/#alineacion-mediante-rl","title":"Alineaci\u00f3n mediante RL","text":"<ul> <li>Usa GRPO</li> <li>Se\u00f1ales de recompensa multidimensionales</li> </ul>"},{"location":"README.es/#rendimiento-extremo-en-streaming","title":"Rendimiento extremo en streaming","text":"<ul> <li>RTF: 0.195</li> <li>TTFA: ~100 ms</li> <li>+3000 tokens/s</li> </ul>"},{"location":"README.es/#soporte-multilingue-robusto","title":"Soporte multiling\u00fce robusto","text":"<ul> <li>M\u00e1s de 80 idiomas</li> <li>Sin necesidad de phonemes espec\u00edficos</li> </ul>"},{"location":"README.es/#generacion-multi-speaker-nativa","title":"Generaci\u00f3n multi-speaker nativa","text":"<p>Permite m\u00faltiples hablantes usando <code><|speaker:i|></code> en una sola 
generaci\u00f3n.</p>"},{"location":"README.es/#generacion-multi-turno","title":"Generaci\u00f3n multi-turno","text":"<p>Mantiene contexto para mejorar la naturalidad.</p>"},{"location":"README.es/#clonacion-de-voz-rapida","title":"Clonaci\u00f3n de voz r\u00e1pida","text":"<ul> <li>Solo 10\u201330 segundos de audio</li> <li>Alta fidelidad de timbre y estilo</li> </ul> <p>Para usar con SGLang Server, consulta el README correspondiente.</p>"},{"location":"README.es/#creditos","title":"Cr\u00e9ditos","text":"<ul> <li>VITS2 (daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"README.es/#informe-tecnico","title":"Informe T\u00e9cnico","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2603.08823}, \n}\n</code></pre>"},{"location":"README.ja/","title":"README.ja","text":"Fish Speech [English](../README.md) | [\u7b80\u4f53\u4e2d\u6587](README.zh.md) | [Portuguese](README.pt-BR.md) | **\u65e5\u672c\u8a9e** | [\ud55c\uad6d\uc5b4](README.ko.md) | [\u0627\u0644\u0639\u0631\u0628\u064a\u0629](README.ar.md) | [Espa\u00f1ol](docs/README.es.md) <p>[!IMPORTANT] \u30e9\u30a4\u30bb\u30f3\u30b9\u6ce8\u610f\u4e8b\u9805 
\u3053\u306e\u30b3\u30fc\u30c9\u30d9\u30fc\u30b9\u304a\u3088\u3073\u95a2\u9023\u3059\u308b\u30e2\u30c7\u30eb\u30a6\u30a7\u30a4\u30c8\u306f FISH AUDIO RESEARCH LICENSE \u306e\u4e0b\u3067\u30ea\u30ea\u30fc\u30b9\u3055\u308c\u3066\u3044\u307e\u3059\u3002\u8a73\u7d30\u306b\u3064\u3044\u3066\u306f LICENSE \u3092\u3054\u53c2\u7167\u304f\u3060\u3055\u3044\u3002</p> <p>[!WARNING] \u6cd5\u7684\u514d\u8cac\u4e8b\u9805 \u79c1\u305f\u3061\u306f\u30b3\u30fc\u30c9\u30d9\u30fc\u30b9\u306e\u4e0d\u6cd5\u306a\u4f7f\u7528\u306b\u3064\u3044\u3066\u4e00\u5207\u306e\u8cac\u4efb\u3092\u8ca0\u3044\u307e\u305b\u3093\u3002DMCA \u53ca\u3073\u305d\u306e\u4ed6\u306e\u95a2\u9023\u6cd5\u5f8b\u306b\u3064\u3044\u3066\u3001\u73fe\u5730\u306e\u6cd5\u5f8b\u3092\u3054\u53c2\u7167\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"README.ja/#_1","title":"\u30af\u30a4\u30c3\u30af\u30b9\u30bf\u30fc\u30c8","text":""},{"location":"README.ja/#_2","title":"\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8\u5165\u53e3","text":"<p>Fish Audio S2 \u306e\u516c\u5f0f\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8\u3067\u3059\u3002\u4ee5\u4e0b\u304b\u3089\u3059\u3050\u306b\u59cb\u3081\u3089\u308c\u307e\u3059\u3002</p> <ul> <li>\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb</li> <li>\u30b3\u30de\u30f3\u30c9\u30e9\u30a4\u30f3\u63a8\u8ad6</li> <li>WebUI \u63a8\u8ad6</li> <li>\u30b5\u30fc\u30d0\u30fc\u63a8\u8ad6</li> <li>Docker \u30c7\u30d7\u30ed\u30a4</li> </ul> <p>[!IMPORTANT] SGLang \u30b5\u30fc\u30d0\u30fc\u306b\u3064\u3044\u3066\u306f SGLang-Omni README \u3092\u53c2\u7167\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"README.ja/#llm-agent","title":"LLM Agent \u6307\u5357","text":"<pre><code>https://speech.fish.audio/ja/install/ \u306e\u624b\u9806\u306b\u5f93\u3063\u3066\u3001Fish Audio S2 \u3092\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u30fb\u8a2d\u5b9a\u3057\u3066\u304f\u3060\u3055\u3044\u3002\n</code></pre>"},{"location":"README.ja/#fish-audio-s2-pro","title":"Fish Audio S2 
Pro","text":"<p>\u696d\u754c\u6700\u5148\u7aef\u306e\u591a\u8a00\u8a9e\u30c6\u30ad\u30b9\u30c8\u8aad\u307f\u4e0a\u3052 (TTS) \u30b7\u30b9\u30c6\u30e0\u3002\u97f3\u58f0\u751f\u6210\u306e\u9650\u754c\u3092\u518d\u5b9a\u7fa9\u3057\u307e\u3059\u3002</p> <p>Fish Audio S2 Pro \u306f Fish Audio \u304c\u958b\u767a\u3057\u305f\u6700\u9ad8\u5cf0\u306e\u30de\u30eb\u30c1\u30e2\u30fc\u30c0\u30eb\u30e2\u30c7\u30eb\u3067\u3059\u3002\u4e16\u754c 80 \u8a00\u8a9e\u4ee5\u4e0a\u30011,000 \u4e07\u6642\u9593 \u3092\u8d85\u3048\u308b\u81a8\u5927\u306a\u97f3\u58f0\u30c7\u30fc\u30bf\u3067\u5b66\u7fd2\u3055\u308c\u3066\u3044\u307e\u3059\u3002\u9769\u65b0\u7684\u306a \u4e8c\u91cd\u81ea\u5df1\u56de\u5e30 (Dual-AR) \u30a2\u30fc\u30ad\u30c6\u30af\u30c1\u30e3\u3068\u5f37\u5316\u5b66\u7fd2 (RL) \u30a2\u30e9\u30a4\u30e1\u30f3\u30c8\u6280\u8853\u3092\u7d44\u307f\u5408\u308f\u305b\u308b\u3053\u3068\u3067\u3001\u6975\u3081\u3066\u81ea\u7136\u3067\u30ea\u30a2\u30eb\u3001\u304b\u3064\u611f\u60c5\u8c4a\u304b\u306a\u97f3\u58f0\u3092\u751f\u6210\u3057\u3001\u30aa\u30fc\u30d7\u30f3\u30bd\u30fc\u30b9\u304a\u3088\u3073\u30af\u30ed\u30fc\u30ba\u30c9\u30bd\u30fc\u30b9\u306e\u53cc\u65b9\u3067\u30ea\u30fc\u30c0\u30fc\u30b7\u30c3\u30d7\u3092\u767a\u63ee\u3057\u3066\u3044\u307e\u3059\u3002</p> <p>S2 Pro \u306e\u6700\u5927\u306e\u7279\u5fb4\u306f\u3001\u81ea\u7136\u8a00\u8a9e\u30bf\u30b0\uff08\u4f8b\uff1a<code>[whisper]</code>\u3001<code>[excited]</code>\u3001<code>[angry]</code>\uff09\u306b\u3088\u308b\u97fb\u5f8b\u3084\u611f\u60c5\u306e \u30b5\u30d6\u30ef\u30fc\u30c9\u30ec\u30d9\u30eb (Sub-word Level) 
\u3067\u306e\u6975\u3081\u3066\u7d30\u3084\u304b\u306a\u30a4\u30f3\u30e9\u30a4\u30f3\u5236\u5fa1\u304c\u53ef\u80fd\u3067\u3042\u308b\u70b9\u3067\u3059\u3002\u307e\u305f\u3001\u30de\u30eb\u30c1\u30b9\u30d4\u30fc\u30ab\u30fc\u751f\u6210\u3084\u9577\u6587\u30b3\u30f3\u30c6\u30ad\u30b9\u30c8\u306e\u30de\u30eb\u30c1\u30bf\u30fc\u30f3\u5bfe\u8a71\u751f\u6210\u306b\u3082\u30cd\u30a4\u30c6\u30a3\u30d6\u5bfe\u5fdc\u3057\u3066\u3044\u307e\u3059\u3002</p> <p>\u4eca\u3059\u3050 Fish Audio \u516c\u5f0f\u30b5\u30a4\u30c8 \u3067\u30d7\u30ec\u30a4\u30b0\u30e9\u30a6\u30f3\u30c9\u3092\u4f53\u9a13\u3059\u308b\u304b\u3001\u6280\u8853\u30ec\u30dd\u30fc\u30c8 \u3084 \u30d6\u30ed\u30b0\u8a18\u4e8b \u3092\u8aad\u3093\u3067\u8a73\u7d30\u3092\u78ba\u8a8d\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"README.ja/#_3","title":"\u30e2\u30c7\u30eb\u30d0\u30ea\u30a2\u30f3\u30c8","text":"\u30e2\u30c7\u30eb \u30b5\u30a4\u30ba \u5229\u7528\u53ef\u80fd\u6027 \u8aac\u660e S2-Pro 4B \u30d1\u30e9\u30e1\u30fc\u30bf HuggingFace \u54c1\u8cea\u3068\u5b89\u5b9a\u6027\u3092\u6700\u5927\u5316\u3057\u305f\u3001\u30d5\u30eb\u6a5f\u80fd\u306e\u30d5\u30e9\u30c3\u30b0\u30b7\u30c3\u30d7\u30e2\u30c7\u30eb <p>\u30e2\u30c7\u30eb\u306e\u8a73\u7d30\u306f\u6280\u8853\u30ec\u30dd\u30fc\u30c8\u3092\u3054\u53c2\u7167\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"README.ja/#_4","title":"\u30d9\u30f3\u30c1\u30de\u30fc\u30af\u7d50\u679c","text":"\u30d9\u30f3\u30c1\u30de\u30fc\u30af Fish Audio S2 Seed-TTS Eval \u2014 WER\uff08\u4e2d\u56fd\u8a9e\uff09 0.54%\uff08\u5168\u4f53\u6700\u826f\uff09 Seed-TTS Eval \u2014 WER\uff08\u82f1\u8a9e\uff09 0.99%\uff08\u5168\u4f53\u6700\u826f\uff09 Audio Turing Test\uff08\u6307\u793a\u3042\u308a\uff09 0.515 \u4e8b\u5f8c\u5e73\u5747\u5024 EmergentTTS-Eval \u2014 \u52dd\u7387 81.88%\uff08\u5168\u4f53\u6700\u9ad8\uff09 Fish Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 \u54c1\u8cea 4.51 / 5.0 \u591a\u8a00\u8a9e\uff08MiniMax Testset\uff09\u2014 \u6700\u826f 
WER 24 \u8a00\u8a9e\u4e2d 11 \u8a00\u8a9e \u591a\u8a00\u8a9e\uff08MiniMax Testset\uff09\u2014 \u6700\u826f SIM 24 \u8a00\u8a9e\u4e2d 17 \u8a00\u8a9e <p>Seed-TTS Eval \u3067\u306f\u3001S2 \u306f\u30af\u30ed\u30fc\u30ba\u30c9\u30bd\u30fc\u30b9\u3092\u542b\u3080\u5168\u8a55\u4fa1\u30e2\u30c7\u30eb\u306e\u4e2d\u3067\u6700\u5c0f WER \u3092\u9054\u6210\u3057\u307e\u3057\u305f\uff1aQwen3-TTS\uff080.77/1.24\uff09\u3001MiniMax Speech-02\uff080.99/1.90\uff09\u3001Seed-TTS\uff081.12/2.25\uff09\u3002Audio Turing Test \u3067\u306f 0.515 \u3092\u8a18\u9332\u3057\u3001Seed-TTS\uff080.417\uff09\u6bd4\u3067 24%\u3001MiniMax-Speech\uff080.387\uff09\u6bd4\u3067 33% \u4e0a\u56de\u308a\u307e\u3057\u305f\u3002EmergentTTS-Eval \u3067\u306f\u3001\u526f\u8a00\u8a9e\u60c5\u5831\uff0891.61%\uff09\u3001\u7591\u554f\u6587\uff0884.41%\uff09\u3001\u7d71\u8a9e\u7684\u8907\u96d1\u6027\uff0883.39%\uff09\u3067\u7279\u306b\u9ad8\u3044\u6210\u7e3e\u3092\u793a\u3057\u3066\u3044\u307e\u3059\u3002</p>"},{"location":"README.ja/#_5","title":"\u30cf\u30a4\u30e9\u30a4\u30c8","text":""},{"location":"README.ja/#_6","title":"\u81ea\u7136\u8a00\u8a9e\u306b\u3088\u308b\u7d30\u7c92\u5ea6\u30a4\u30f3\u30e9\u30a4\u30f3\u5236\u5fa1","text":"<p>S2 Pro \u306f\u97f3\u58f0\u306b\u3053\u308c\u307e\u3067\u306b\u306a\u3044\u300c\u9b42\u300d\u3092\u5bbf\u3089\u305b\u307e\u3059\u3002\u30b7\u30f3\u30d7\u30eb\u306a <code>[tag]</code> \u69cb\u6587\u3092\u4f7f\u7528\u3057\u3066\u3001\u30c6\u30ad\u30b9\u30c8\u5185\u306e\u4efb\u610f\u306e\u5834\u6240\u306b\u611f\u60c5\u306e\u6307\u793a\u3092\u6b63\u78ba\u306b\u57cb\u3081\u8fbc\u3080\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u3002 - 1\u4e075,000\u4ee5\u4e0a\u306e\u30e6\u30cb\u30fc\u30af\u30bf\u30b0\u306b\u5bfe\u5fdc\uff1a\u56fa\u5b9a\u306e\u30d7\u30ea\u30bb\u30c3\u30c8\u306b\u9650\u5b9a\u3055\u308c\u305a\u3001\u81ea\u7531\u5f62\u5f0f\u306e\u30c6\u30ad\u30b9\u30c8\u8a18\u8ff0 \u3092\u30b5\u30dd\u30fc\u30c8\u3057\u307e\u3059\u3002<code>[whisper in small voice]</code> 
(\u3055\u3055\u3084\u304d\u58f0\u3067), <code>[professional broadcast tone]</code> (\u30d7\u30ed\u306e\u30ca\u30ec\u30fc\u30b7\u30e7\u30f3\u98a8), <code>[pitch up]</code> (\u30d4\u30c3\u30c1\u3092\u4e0a\u3052\u308b) \u306a\u3069\u3092\u8a66\u3057\u3066\u307f\u3066\u304f\u3060\u3055\u3044\u3002 - \u8c4a\u5bcc\u306a\u611f\u60c5\u30e9\u30a4\u30d6\u30e9\u30ea: <code>[pause]</code> <code>[emphasis]</code> <code>[laughing]</code> <code>[inhale]</code> <code>[chuckle]</code> <code>[tsk]</code> <code>[singing]</code> <code>[excited]</code> <code>[laughing tone]</code> <code>[interrupting]</code> <code>[chuckling]</code> <code>[excited tone]</code> <code>[volume up]</code> <code>[echo]</code> <code>[angry]</code> <code>[low volume]</code> <code>[sigh]</code> <code>[low voice]</code> <code>[whisper]</code> <code>[screaming]</code> <code>[shouting]</code> <code>[loud]</code> <code>[surprised]</code> <code>[short pause]</code> <code>[exhale]</code> <code>[delight]</code> <code>[panting]</code> <code>[audience laughter]</code> <code>[with strong accent]</code> <code>[volume down]</code> <code>[clearing throat]</code> <code>[sad]</code> <code>[moaning]</code> <code>[shocked]</code></p>"},{"location":"README.ja/#dual-autoregressive","title":"\u9769\u65b0\u7684\u306a\u4e8c\u91cd\u81ea\u5df1\u56de\u5e30 (Dual-Autoregressive) \u30a2\u30fc\u30ad\u30c6\u30af\u30c1\u30e3","text":"<p>S2 Pro \u306f\u3001Decoder-only Transformer \u3068 RVQ \u30aa\u30fc\u30c7\u30a3\u30aa\u30b3\u30fc\u30c7\u30c3\u30af\uff0810 \u30b3\u30fc\u30c9\u30d6\u30c3\u30af\u3001\u7d04 21 Hz\uff09\u3067\u69cb\u6210\u3055\u308c\u308b\u30de\u30b9\u30bf\u30fc\u30fb\u30b9\u30ec\u30fc\u30d6\u578b\u306e Dual-AR \u30a2\u30fc\u30ad\u30c6\u30af\u30c1\u30e3\u3092\u63a1\u7528\u3057\u3066\u3044\u307e\u3059\uff1a</p> <ul> <li>Slow AR (4B \u30d1\u30e9\u30e1\u30fc\u30bf): 
\u6642\u9593\u8ef8\u65b9\u5411\u306b\u52d5\u4f5c\u3057\u3001\u6838\u3068\u306a\u308b\u30bb\u30de\u30f3\u30c6\u30a3\u30c3\u30af\u30b3\u30fc\u30c9\u30d6\u30c3\u30af\u3092\u4e88\u6e2c\u3002</li> <li>Fast AR (400M \u30d1\u30e9\u30e1\u30fc\u30bf): \u5404\u6642\u9593\u30b9\u30c6\u30c3\u30d7\u3067\u6b8b\u308a 9 \u500b\u306e\u6b8b\u5dee\u30b3\u30fc\u30c9\u30d6\u30c3\u30af\u3092\u751f\u6210\u3057\u3001\u6975\u3081\u3066\u7e4a\u7d30\u306a\u97f3\u97ff\u30c7\u30a3\u30c6\u30fc\u30eb\u3092\u5fa9\u5143\u3002</li> </ul> <p>\u3053\u306e\u975e\u5bfe\u79f0\u8a2d\u8a08\u306b\u3088\u308a\u3001\u7a76\u6975\u306e\u30aa\u30fc\u30c7\u30a3\u30aa\u5fe0\u5b9f\u5ea6\u3092\u7dad\u6301\u3057\u306a\u304c\u3089\u3001\u63a8\u8ad6\u901f\u5ea6\u3092\u5927\u5e45\u306b\u5411\u4e0a\u3055\u305b\u3066\u3044\u307e\u3059\u3002</p>"},{"location":"README.ja/#rl","title":"\u5f37\u5316\u5b66\u7fd2 (RL) \u30a2\u30e9\u30a4\u30e1\u30f3\u30c8","text":"<p>S2 Pro \u306f\u3001\u4e8b\u5f8c\u5b66\u7fd2\u30a2\u30e9\u30a4\u30e1\u30f3\u30c8\u306b Group Relative Policy Optimization (GRPO) \u6280\u8853\u3092\u63a1\u7528\u3057\u3066\u3044\u307e\u3059\u3002\u30c7\u30fc\u30bf\u306e\u30af\u30ea\u30fc\u30cb\u30f3\u30b0\u3068\u30a2\u30ce\u30c6\u30fc\u30b7\u30e7\u30f3\u306b\u4f7f\u7528\u3057\u305f\u30e2\u30c7\u30eb\u30bb\u30c3\u30c8\u3092\u305d\u306e\u307e\u307e\u5831\u916c\u30e2\u30c7\u30eb (Reward Model) \u3068\u3057\u3066\u4f7f\u7528\u3059\u308b\u3053\u3068\u3067\u3001\u4e8b\u524d\u5b66\u7fd2\u30c7\u30fc\u30bf\u306e\u5206\u5e03\u3068\u4e8b\u5f8c\u5b66\u7fd2\u306e\u76ee\u6a19\u3068\u306e\u9593\u306e\u30df\u30b9\u30de\u30c3\u30c1\u3092\u5b8c\u74a7\u306b\u89e3\u6c7a\u3057\u307e\u3057\u305f\u3002 - \u591a\u6b21\u5143\u306e\u5831\u916c\u4fe1\u53f7: 
\u610f\u5473\u306e\u6b63\u78ba\u6027\u3001\u6307\u793a\u8ffd\u5f93\u6027\u3001\u97f3\u97ff\u7684\u306a\u597d\u307f\u3001\u97f3\u8272\u306e\u985e\u4f3c\u6027\u3092\u7dcf\u5408\u7684\u306b\u8a55\u4fa1\u3057\u3001\u751f\u6210\u3055\u308c\u308b\u4e00\u79d2\u4e00\u79d2\u306e\u97f3\u58f0\u304c\u4eba\u9593\u306e\u76f4\u611f\u306b\u6cbf\u3046\u3088\u3046\u306b\u3057\u3066\u3044\u307e\u3059\u3002</p>"},{"location":"README.ja/#sglang","title":"SGLang \u306b\u3088\u308b\u7a76\u6975\u306e\u30b9\u30c8\u30ea\u30fc\u30df\u30f3\u30b0\u63a8\u8ad6\u6027\u80fd","text":"<p>Dual-AR \u30a2\u30fc\u30ad\u30c6\u30af\u30c1\u30e3\u306f\u6a19\u6e96\u7684\u306a LLM \u69cb\u9020\u3068\u540c\u578b\u3067\u3042\u308b\u305f\u3081\u3001S2 Pro \u306f SGLang \u306e\u3059\u3079\u3066\u306e\u63a8\u8ad6\u52a0\u901f\u6a5f\u80fd\u3092\u30cd\u30a4\u30c6\u30a3\u30d6\u306b\u30b5\u30dd\u30fc\u30c8\u3057\u3066\u3044\u307e\u3059\u3002\u3053\u308c\u306b\u306f\u3001Continuous Batching\u3001Paged KV Cache\u3001CUDA Graph\u3001RadixAttention \u30d9\u30fc\u30b9\u306e Prefix Caching \u304c\u542b\u307e\u308c\u307e\u3059\u3002</p> <p>NVIDIA H200 GPU 1\u679a\u3067\u306e\u30d1\u30d5\u30a9\u30fc\u30de\u30f3\u30b9\u8868\u73fe: - \u30ea\u30a2\u30eb\u30bf\u30a4\u30e0\u4fc2\u6570 (RTF): 0.195 - \u521d\u56de\u97f3\u58f0\u51fa\u529b\u307e\u3067\u306e\u6642\u9593 (TTFA): \u7d04 100 ms - \u6975\u901f\u30b9\u30eb\u30fc\u30d7\u30c3\u30c8: RTF < 0.5 \u3092\u7dad\u6301\u3057\u3064\u3064 3,000+ acoustic tokens/s</p>"},{"location":"README.ja/#_7","title":"\u5f37\u529b\u306a\u591a\u8a00\u8a9e\u30b5\u30dd\u30fc\u30c8","text":"<p>S2 Pro \u306f 80 \u4ee5\u4e0a\u306e\u8a00\u8a9e\u3092\u30b5\u30dd\u30fc\u30c8\u3057\u3066\u304a\u308a\u3001\u97f3\u7d20\u3084\u7279\u5b9a\u306e\u8a00\u8a9e\u306b\u5bfe\u3059\u308b\u524d\u51e6\u7406\u306a\u3057\u3067\u9ad8\u54c1\u8cea\u306a\u5408\u6210\u3092\u5b9f\u73fe\u3057\u307e\u3059\uff1a</p> <ul> <li>\u7b2c1\u5c64 (Tier 1): \u65e5\u672c\u8a9e (ja), \u82f1\u8a9e (en), \u4e2d\u56fd\u8a9e (zh)</li> 
<li>\u7b2c2\u5c64 (Tier 2): \u97d3\u56fd\u8a9e (ko), \u30b9\u30da\u30a4\u30f3\u8a9e (es), \u30dd\u30eb\u30c8\u30ac\u30eb\u8a9e (pt), \u30a2\u30e9\u30d3\u30a2\u8a9e (ar), \u30ed\u30b7\u30a2\u8a9e (ru), \u30d5\u30e9\u30f3\u30b9\u8a9e (fr), \u30c9\u30a4\u30c4\u8a9e (de)</li> <li>\u30b0\u30ed\u30fc\u30d0\u30eb\u30ab\u30d0\u30ec\u30c3\u30b8: sv, it, tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo \u306a\u3069\u3002</li> </ul>"},{"location":"README.ja/#_8","title":"\u30cd\u30a4\u30c6\u30a3\u30d6\u306a\u30de\u30eb\u30c1\u30b9\u30d4\u30fc\u30ab\u30fc\u751f\u6210","text":"<p>Fish Audio S2 \u3067\u306f\u3001\u8907\u6570\u306e\u30b9\u30d4\u30fc\u30ab\u30fc\u3092\u542b\u3080\u53c2\u7167\u30aa\u30fc\u30c7\u30a3\u30aa\u3092\u30a2\u30c3\u30d7\u30ed\u30fc\u30c9\u3067\u304d\u3001\u30e2\u30c7\u30eb\u306f <code><|speaker:i|></code> \u30c8\u30fc\u30af\u30f3\u3092\u4ecb\u3057\u3066\u5404\u30b9\u30d4\u30fc\u30ab\u30fc\u306e\u7279\u5fb4\u3092\u51e6\u7406\u3057\u307e\u3059\u3002\u30b9\u30d4\u30fc\u30ab\u30fc ID 
\u30c8\u30fc\u30af\u30f3\u3092\u4f7f\u7528\u3057\u3066\u30e2\u30c7\u30eb\u306e\u51fa\u529b\u3092\u5236\u5fa1\u3059\u308b\u3053\u3068\u3067\u30011\u56de\u306e\u751f\u6210\u306b\u8907\u6570\u306e\u30b9\u30d4\u30fc\u30ab\u30fc\u3092\u6df7\u5728\u3055\u305b\u308b\u3053\u3068\u304c\u53ef\u80fd\u3067\u3059\u3002\u500b\u5225\u306e\u30b9\u30d4\u30fc\u30ab\u30fc\u3054\u3068\u306b\u53c2\u7167\u30aa\u30fc\u30c7\u30a3\u30aa\u3092\u30a2\u30c3\u30d7\u30ed\u30fc\u30c9\u3057\u76f4\u3059\u624b\u9593\u306f\u3082\u3046\u4e0d\u8981\u3067\u3059\u3002</p>"},{"location":"README.ja/#_9","title":"\u30de\u30eb\u30c1\u30bf\u30fc\u30f3\u5bfe\u8a71\u751f\u6210","text":"<p>\u30b3\u30f3\u30c6\u30ad\u30b9\u30c8\u306e\u62e1\u5f35\u306b\u3088\u308a\u3001\u4ee5\u524d\u306e\u30bf\u30fc\u30f3\u306e\u60c5\u5831\u3092\u5229\u7528\u3057\u3066\u5f8c\u7d9a\u306e\u751f\u6210\u5185\u5bb9\u306e\u8868\u73fe\u529b\u3092\u9ad8\u3081\u308b\u3053\u3068\u304c\u3067\u304d\u3001\u5bfe\u8a71\u3068\u3057\u3066\u306e\u81ea\u7136\u3055\u304c\u5927\u5e45\u306b\u5411\u4e0a\u3057\u307e\u3057\u305f\u3002</p>"},{"location":"README.ja/#_10","title":"\u9ad8\u901f\u97f3\u58f0\u30af\u30ed\u30fc\u30cb\u30f3\u30b0","text":"<p>Fish Audio S2 \u306f\u3001\u77ed\u3044\u53c2\u7167\u30b5\u30f3\u30d7\u30eb\uff08\u901a\u5e38 10\u301c30 \u79d2\uff09\u3092\u4f7f\u7528\u3057\u305f\u6b63\u78ba\u306a\u97f3\u58f0\u30af\u30ed\u30fc\u30cb\u30f3\u30b0\u3092\u30b5\u30dd\u30fc\u30c8\u3057\u3066\u3044\u307e\u3059\u3002\u30e2\u30c7\u30eb\u306f\u97f3\u8272\u3001\u8a71\u3057\u65b9\u3001\u611f\u60c5\u3092\u6349\u3048\u3001\u8ffd\u52a0\u306e\u5fae\u8abf\u6574\u306a\u3057\u3067\u30ea\u30a2\u30eb\u3067\u4e00\u8cab\u3057\u305f\u30af\u30ed\u30fc\u30f3\u97f3\u58f0\u3092\u751f\u6210\u3057\u307e\u3059\u3002 SGLang \u30b5\u30fc\u30d0\u30fc\u306e\u5229\u7528\u306b\u3064\u3044\u3066\u306f\u3001SGLang-Omni README \u3092\u53c2\u7167\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"README.ja/#_11","title":"\u8b1d\u8f9e","text":"<ul> <li>VITS2 
(daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"README.ja/#_12","title":"\u6280\u8853\u30ec\u30dd\u30fc\u30c8","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2603.08823}, \n}\n</code></pre>"},{"location":"README.ko/","title":"README.ko","text":"Fish Speech [English](../README.md) | [\u7b80\u4f53\u4e2d\u6587](README.zh.md) | [Portuguese](README.pt-BR.md) | [\u65e5\u672c\u8a9e](README.ja.md) | **\ud55c\uad6d\uc5b4** | [\u0627\u0644\u0639\u0631\u0628\u064a\u0629](README.ar.md) | [Espa\u00f1ol](docs/README.es.md) <p>[!IMPORTANT] \ub77c\uc774\uc120\uc2a4 \uace0\uc9c0 \uc774 \ucf54\ub4dc\ubca0\uc774\uc2a4 \ubc0f \uad00\ub828 \ubaa8\ub378 \uac00\uc911\uce58\ub294 FISH AUDIO RESEARCH LICENSE \uc5d0 \ub530\ub77c \ubc30\ud3ec\ub429\ub2c8\ub2e4. \uc790\uc138\ud55c \ub0b4\uc6a9\uc740 LICENSE\ub97c \ucc38\uc870\ud558\uc2ed\uc2dc\uc624.</p> <p>[!WARNING] \ubc95\uc801 \uba74\ucc45 \uc870\ud56d \ub2f9\uc0ac\ub294 \ucf54\ub4dc\ubca0\uc774\uc2a4\uc758 \ubd88\ubc95\uc801\uc778 \uc0ac\uc6a9\uc5d0 \ub300\ud574 \uc5b4\ub5a0\ud55c \ucc45\uc784\ub3c4 \uc9c0\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. 
\ud574\ub2f9 \uc9c0\uc5ed\uc758 DMCA \ubc0f \uae30\ud0c0 \uad00\ub828 \ubc95\ub960\uc744 \ucc38\uc870\ud558\uc2ed\uc2dc\uc624.</p>"},{"location":"README.ko/#_1","title":"\ube60\ub978 \uc2dc\uc791","text":""},{"location":"README.ko/#_2","title":"\ubb38\uc11c \uc785\uad6c","text":"<p>Fish Audio S2\uc758 \uacf5\uc2dd \ubb38\uc11c\uc785\ub2c8\ub2e4. \uc9c0\uce68\uc5d0 \ub530\ub77c \uc27d\uac8c \uc2dc\uc791\ud558\uc2ed\uc2dc\uc624.</p> <ul> <li>\uc124\uce58</li> <li>\uba85\ub839\uc904 \ucd94\ub860</li> <li>WebUI \ucd94\ub860</li> <li>\uc11c\ubc84 \ucd94\ub860</li> <li>Docker \ubc30\ud3ec</li> </ul> <p>[!IMPORTANT] SGLang \uc11c\ubc84\ub97c \uc0ac\uc6a9\ud558\ub824\uba74 SGLang-Omni README\ub97c \ucc38\uc870\ud558\uc2ed\uc2dc\uc624.</p>"},{"location":"README.ko/#llm-agent","title":"LLM Agent \uac00\uc774\ub4dc","text":"<pre><code>\uba3c\uc800 https://speech.fish.audio/ko/install/ \uc744 \uc77d\uace0 \ubb38\uc11c\uc5d0 \ub530\ub77c Fish Audio S2\ub97c \uc124\uce58 \ubc0f \uad6c\uc131\ud558\uc2ed\uc2dc\uc624.\n</code></pre>"},{"location":"README.ko/#fish-audio-s2-pro","title":"Fish Audio S2 Pro","text":"<p>\uc74c\uc131 \uc0dd\uc131\uc758 \uacbd\uacc4\ub97c \uc7ac\uc815\uc758\ud558\ub294 \uc5c5\uacc4 \ucd5c\uace0\uc758 \ub2e4\uad6d\uc5b4 \ud14d\uc2a4\ud2b8 \uc74c\uc131 \ubcc0\ud658(TTS) \uc2dc\uc2a4\ud15c.</p> <p>Fish Audio S2 Pro\ub294 Fish Audio\uc5d0\uc11c \uac1c\ubc1c\ud55c \ucd5c\ucca8\ub2e8 \uba40\ud2f0\ubaa8\ub2ec \ubaa8\ub378\uc785\ub2c8\ub2e4. \uc804 \uc138\uacc4 80\uac1c \uc774\uc0c1\uc758 \uc5b8\uc5b4\ub97c \uc544\uc6b0\ub974\ub294 1,000\ub9cc \uc2dc\uac04 \uc774\uc0c1\uc758 \ubc29\ub300\ud55c \uc624\ub514\uc624 \ub370\uc774\ud130\ub85c \ud559\uc2b5\ub418\uc5c8\uc2b5\ub2c8\ub2e4. 
\ud601\uc2e0\uc801\uc778 \uc774\uc911 \uc790\uae30\ud68c\uadc0(Dual-AR) \uc544\ud0a4\ud14d\ucc98\uc640 \uac15\ud654 \ud559\uc2b5(RL) \uc815\ub82c \uae30\uc220\uc744 \ud1b5\ud574 S2 Pro\ub294 \uadf9\ub3c4\ub85c \uc790\uc5f0\uc2a4\ub7fd\uace0 \uc0ac\uc2e4\uc801\uc774\uba70 \uac10\uc815\uc774 \ud48d\ubd80\ud55c \uc74c\uc131\uc744 \uc0dd\uc131\ud558\uba70, \uc624\ud508 \uc18c\uc2a4\uc640 \ud074\ub85c\uc988\ub4dc \uc18c\uc2a4 \uacbd\uc7c1 \ubaa8\ub450\uc5d0\uc11c \uc120\ub450\ub97c \ub2ec\ub9ac\uace0 \uc788\uc2b5\ub2c8\ub2e4.</p> <p>S2 Pro\uc758 \ud575\uc2ec \uac15\uc810\uc740 \uc790\uc5f0\uc5b4 \ud0dc\uadf8(\uc608: <code>[whisper]</code>, <code>[excited]</code>, <code>[angry]</code>)\ub97c \ud1b5\ud574 \uc6b4\uc728\uacfc \uac10\uc815\uc744 \ud558\uc704 \ub2e8\uc5b4 \uc218\uc900(Sub-word Level)\uc5d0\uc11c \ub9e4\uc6b0 \uc138\ubc00\ud558\uac8c \uc778\ub77c\uc778 \uc81c\uc5b4\ud560 \uc218 \uc788\ub2e4\ub294 \uc810\uc785\ub2c8\ub2e4. \ub610\ud55c \ub2e4\uc911 \ud654\uc790 \uc0dd\uc131 \ubc0f \uae34 \ucee8\ud14d\uc2a4\ud2b8\uc758 \ub2e4\uc911 \ud134 \ub300\ud654 \uc0dd\uc131\uc744 \uae30\ubcf8\uc801\uc73c\ub85c \uc9c0\uc6d0\ud569\ub2c8\ub2e4.</p> <p>\uc9c0\uae08 \ubc14\ub85c Fish Audio \uacf5\uc2dd \uc6f9\uc0ac\uc774\ud2b8\uc5d0\uc11c \uc628\ub77c\uc778 \ub370\ubaa8\ub97c \uccb4\ud5d8\ud558\uac70\ub098, \uae30\uc220 \ubcf4\uace0\uc11c \ubc0f \ube14\ub85c\uadf8 \uac8c\uc2dc\ubb3c\uc744 \ud1b5\ud574 \uc790\uc138\ud788 \uc54c\uc544\ubcf4\uc2ed\uc2dc\uc624.</p>"},{"location":"README.ko/#_3","title":"\ubaa8\ub378 \ubcc0\uccb4","text":"\ubaa8\ub378 \ud06c\uae30 \uac00\uc6a9\uc131 \uc124\uba85 S2-Pro 4B \ud30c\ub77c\ubbf8\ud130 HuggingFace \ucd5c\uace0\uc758 \ud488\uc9c8\uacfc \uc548\uc815\uc131\uc744 \uac16\ucd98 \ubaa8\ub4e0 \uae30\ub2a5\uc744 \uac16\ucd98 \ud50c\ub798\uadf8\uc2ed \ubaa8\ub378 <p>\ubaa8\ub378\uc5d0 \ub300\ud55c \uc790\uc138\ud55c \ub0b4\uc6a9\uc740 \uae30\uc220 \ubcf4\uace0\uc11c\ub97c 
\ucc38\uc870\ud558\uc2ed\uc2dc\uc624.</p>"},{"location":"README.ko/#_4","title":"\ubca4\uce58\ub9c8\ud06c \uacb0\uacfc","text":"\ubca4\uce58\ub9c8\ud06c Fish Audio S2 Seed-TTS Eval \u2014 WER(\uc911\uad6d\uc5b4) 0.54% (\uc804\uccb4 \ucd5c\uace0) Seed-TTS Eval \u2014 WER(\uc601\uc5b4) 0.99% (\uc804\uccb4 \ucd5c\uace0) Audio Turing Test (\uc9c0\uce68 \ud3ec\ud568) 0.515 \ud6c4\ud5d8 \ud3c9\uade0 EmergentTTS-Eval \u2014 \uc2b9\ub960 81.88% (\uc804\uccb4 \ucd5c\uace0) Fish Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 \ud488\uc9c8 4.51 / 5.0 \ub2e4\uad6d\uc5b4 (MiniMax Testset) \u2014 \ucd5c\uace0 WER 24\uac1c \uc5b8\uc5b4 \uc911 11\uac1c \ub2e4\uad6d\uc5b4 (MiniMax Testset) \u2014 \ucd5c\uace0 SIM 24\uac1c \uc5b8\uc5b4 \uc911 17\uac1c <p>Seed-TTS Eval\uc5d0\uc11c S2\ub294 \ud074\ub85c\uc988\ub4dc \uc18c\uc2a4 \uc2dc\uc2a4\ud15c\uc744 \ud3ec\ud568\ud55c \ubaa8\ub4e0 \ud3c9\uac00 \ubaa8\ub378 \uc911 \uac00\uc7a5 \ub0ae\uc740 WER\uc744 \ub2ec\uc131\ud588\uc2b5\ub2c8\ub2e4: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). Audio Turing Test\uc5d0\uc11c S2\uc758 0.515\ub294 Seed-TTS (0.417) \ub300\ube44 24%, MiniMax-Speech (0.387) \ub300\ube44 33% \ud5a5\uc0c1\ub41c \uc218\uce58\uc785\ub2c8\ub2e4. EmergentTTS-Eval\uc5d0\uc11c S2\ub294 \ubd80\ucc28 \uc5b8\uc5b4\ud559(91.61% \uc2b9\ub960), \uc758\ubb38\ubb38(84.41%), \uad6c\ubb38 \ubcf5\uc7a1\uc131(83.39%) \ub4f1\uc758 \uce21\uba74\uc5d0\uc11c \ud2b9\ud788 \ub450\ub4dc\ub7ec\uc9c4 \uc131\uacfc\ub97c \ubcf4\uc600\uc2b5\ub2c8\ub2e4.</p>"},{"location":"README.ko/#_5","title":"\ud558\uc774\ub77c\uc774\ud2b8","text":""},{"location":"README.ko/#_6","title":"\uc790\uc5f0\uc5b4\ub97c \ud1b5\ud55c \ucd08\ubbf8\uc138 \uc778\ub77c\uc778 \uc81c\uc5b4","text":"<p>S2 Pro\ub294 \uc74c\uc131\uc5d0 \uc804\ub840 \uc5c6\ub294 \"\uc601\ud63c\"\uc744 \ubd80\uc5ec\ud569\ub2c8\ub2e4. 
\uac04\ub2e8\ud55c <code>[tag]</code> \uad6c\ubb38\uc744 \uc0ac\uc6a9\ud558\uc5ec \ud14d\uc2a4\ud2b8\uc758 \uc5b4\ub290 \uc704\uce58\uc5d0\ub098 \uac10\uc815 \uc9c0\uce68\uc744 \uc815\ud655\ud558\uac8c \uc0bd\uc785\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. - 15,000\uac1c \uc774\uc0c1\uc758 \uace0\uc720 \ud0dc\uadf8 \uc9c0\uc6d0: \uace0\uc815\ub41c \uc0ac\uc804 \uc124\uc815\uc5d0 \uad6d\ud55c\ub418\uc9c0 \uc54a\uace0 \uc790\uc720 \ud615\uc2dd\uc758 \ud14d\uc2a4\ud2b8 \uc124\uba85\uc744 \uc9c0\uc6d0\ud569\ub2c8\ub2e4. <code>[whisper in small voice]</code> (\uc791\uc740 \ubaa9\uc18c\ub9ac\ub85c \uc18d\uc0ad\uc784), <code>[professional broadcast tone]</code> (\uc804\ubb38 \ubc29\uc1a1 \ud1a4), <code>[pitch up]</code> (\uc74c\ub192\uc774 \ub192\uc784) \ub4f1\uc744 \uc2dc\ub3c4\ud574 \ubcf4\uc2ed\uc2dc\uc624. - \ud48d\ubd80\ud55c \uac10\uc815 \ub77c\uc774\ube0c\ub7ec\ub9ac: <code>[pause]</code> <code>[emphasis]</code> <code>[laughing]</code> <code>[inhale]</code> <code>[chuckle]</code> <code>[tsk]</code> <code>[singing]</code> <code>[excited]</code> <code>[laughing tone]</code> <code>[interrupting]</code> <code>[chuckling]</code> <code>[excited tone]</code> <code>[volume up]</code> <code>[echo]</code> <code>[angry]</code> <code>[low volume]</code> <code>[sigh]</code> <code>[low voice]</code> <code>[whisper]</code> <code>[screaming]</code> <code>[shouting]</code> <code>[loud]</code> <code>[surprised]</code> <code>[short pause]</code> <code>[exhale]</code> <code>[delight]</code> <code>[panting]</code> <code>[audience laughter]</code> <code>[with strong accent]</code> <code>[volume down]</code> <code>[clearing throat]</code> <code>[sad]</code> <code>[moaning]</code> <code>[shocked]</code></p>"},{"location":"README.ko/#dual-autoregressive","title":"\ud601\uc2e0\uc801\uc778 \uc774\uc911 \uc790\uae30\ud68c\uadc0 (Dual-Autoregressive) \uc544\ud0a4\ud14d\ucc98","text":"<p>S2 Pro\ub294 Decoder-only Transformer\uc640 RVQ \uc624\ub514\uc624 \ucf54\ub371(10\uac1c \ucf54\ub4dc\ubd81, 
\uc57d 21Hz \ud504\ub808\uc784 \uc18d\ub3c4)\uc73c\ub85c \uad6c\uc131\ub41c \ub9c8\uc2a4\ud130-\uc2ac\ub808\uc774\ube0c \ubc29\uc2dd\uc758 Dual-AR \uc544\ud0a4\ud14d\ucc98\ub97c \ucc44\ud0dd\ud588\uc2b5\ub2c8\ub2e4.</p> <ul> <li>Slow AR (4B \ud30c\ub77c\ubbf8\ud130): \uc2dc\uac04 \ucd95\uc744 \ub530\ub77c \uc791\ub3d9\ud558\uba70 \ud575\uc2ec \uc758\ubbf8 \ucf54\ub4dc\ubd81\uc744 \uc608\uce21\ud569\ub2c8\ub2e4.</li> <li>Fast AR (400M \ud30c\ub77c\ubbf8\ud130): \uac01 \ud0c0\uc784\uc2a4\ud15d\uc5d0\uc11c \ub098\uba38\uc9c0 9\uac1c\uc758 \uc794\ucc28 \ucf54\ub4dc\ubd81\uc744 \uc0dd\uc131\ud558\uc5ec \uadf9\ub3c4\ub85c \uc815\uad50\ud55c \uc74c\ud5a5 \uc138\ubd80 \uc0ac\ud56d\uc744 \ubcf5\uc6d0\ud569\ub2c8\ub2e4.</li> </ul> <p>\uc774\ub7ec\ud55c \ube44\ub300\uce6d \uc124\uacc4\ub294 \uc624\ub514\uc624\uc758 \ucd5c\uace0 \ucda9\uc2e4\ub3c4\ub97c \ubcf4\uc7a5\ud558\ub294 \ub3d9\uc2dc\uc5d0 \ucd94\ub860 \uc18d\ub3c4\ub97c \ub300\ud3ed \ud5a5\uc0c1\uc2dc\ud0b5\ub2c8\ub2e4.</p>"},{"location":"README.ko/#rl","title":"\uac15\ud654 \ud559\uc2b5 (RL) \uc815\ub82c","text":"<p>S2 Pro\ub294 \uc0ac\ud6c4 \ud559\uc2b5 \uc815\ub82c\uc744 \uc704\ud574 Group Relative Policy Optimization (GRPO) \uae30\uc220\uc744 \ucc44\ud0dd\ud588\uc2b5\ub2c8\ub2e4. \ub370\uc774\ud130 \uc815\uc81c \ubc0f \uc8fc\uc11d \ucc98\ub9ac\uc5d0 \uc0ac\uc6a9\ub41c \uac83\uacfc \ub3d9\uc77c\ud55c \ubaa8\ub378 \uc138\ud2b8\ub97c \ubcf4\uc0c1 \ubaa8\ub378(Reward Model)\ub85c \uc9c1\uc811 \uc0ac\uc6a9\ud568\uc73c\ub85c\uc368 \uc0ac\uc804 \ud559\uc2b5 \ub370\uc774\ud130 \ubd84\ud3ec\uc640 \uc0ac\ud6c4 \ud559\uc2b5 \ubaa9\ud45c \uac04\uc758 \ubd88\uc77c\uce58 \ubb38\uc81c\ub97c \uc644\ubcbd\ud558\uac8c \ud574\uacb0\ud588\uc2b5\ub2c8\ub2e4. 
- \ub2e4\ucc28\uc6d0 \ubcf4\uc0c1 \uc2e0\ud638: \uc758\ubbf8 \uccb4\uacc4\uc758 \uc815\ud655\uc131, \uc9c0\uce68 \uc900\uc218 \ub2a5\ub825, \uc74c\ud5a5 \uc120\ud638\ub3c4 \uc810\uc218 \ubc0f \uc74c\uc0c9 \uc720\uc0ac\uc131\uc744 \uc885\ud569\uc801\uc73c\ub85c \ud3c9\uac00\ud558\uc5ec \uc0dd\uc131\ub41c \uc74c\uc131\uc758 \ub9e4\ucd08\uac00 \uc778\uac04\uc758 \uc9c1\uad00\uc5d0 \ubd80\ud569\ud558\ub3c4\ub85d \ubcf4\uc7a5\ud569\ub2c8\ub2e4.</p>"},{"location":"README.ko/#sglang","title":"SGLang \uae30\ubc18\uc758 \uadf9\ud55c \uc2a4\ud2b8\ub9ac\ubc0d \ucd94\ub860 \uc131\ub2a5","text":"<p>Dual-AR \uc544\ud0a4\ud14d\ucc98\ub294 \ud45c\uc900 LLM \uad6c\uc870\uc640 \ub3d9\ud615\uc774\ubbc0\ub85c S2 Pro\ub294 Continuous Batching, Paged KV Cache, CUDA Graph \ubc0f RadixAttention \uae30\ubc18 Prefix Caching\uc744 \ud3ec\ud568\ud55c SGLang\uc758 \ubaa8\ub4e0 \ucd94\ub860 \uac00\uc18d \uae30\ub2a5\uc744 \uae30\ubcf8\uc801\uc73c\ub85c \uc9c0\uc6d0\ud569\ub2c8\ub2e4.</p> <p>\ub2e8\uc77c NVIDIA H200 GPU \uc131\ub2a5 \uc9c0\ud45c: - \uc2e4\uc2dc\uac04 \uacc4\uc218 (RTF): 0.195 - \uccab \uc74c\uc131 \uc9c0\uc5f0 (TTFA): \uc57d 100 ms - \ucd08\uace0\uc18d \ucc98\ub9ac\ub7c9: RTF < 0.5 \uc720\uc9c0 \uc2dc \ucc98\ub9ac\ub7c9 3,000+ acoustic tokens/s \ub2ec\uc131</p>"},{"location":"README.ko/#_7","title":"\uac15\ub825\ud55c \ub2e4\uad6d\uc5b4 \uc9c0\uc6d0","text":"<p>S2 Pro\ub294 \uc74c\uc18c\ub098 \ud2b9\uc815 \uc5b8\uc5b4 \ucc98\ub9ac\uac00 \ud544\uc694 \uc5c6\ub294 \uace0\ud488\uc9c8 \ud569\uc131\uc744 80\uac1c \uc774\uc0c1\uc758 \uc5b8\uc5b4\uc5d0\uc11c \uc9c0\uc6d0\ud569\ub2c8\ub2e4.</p> <ul> <li>1\uacc4\uce35 (Tier 1): \uc77c\ubcf8\uc5b4 (ja), \uc601\uc5b4 (en), \uc911\uad6d\uc5b4 (zh)</li> <li>2\uacc4\uce35 (Tier 2): \ud55c\uad6d\uc5b4 (ko), \uc2a4\ud398\uc778\uc5b4 (es), \ud3ec\ub974\ud22c\uac08\uc5b4 (pt), \uc544\ub78d\uc5b4 (ar), \ub7ec\uc2dc\uc544\uc5b4 (ru), \ud504\ub791\uc2a4\uc5b4 (fr), \ub3c5\uc77c\uc5b4 (de)</li> <li>\uae00\ub85c\ubc8c \ucee4\ubc84\ub9ac\uc9c0: sv, it, 
tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo \ub4f1.</li> </ul>"},{"location":"README.ko/#_8","title":"\ub124\uc774\ud2f0\ube0c \ub2e4\uc911 \ud654\uc790 \uc0dd\uc131","text":"<p>Fish Audio S2\ub97c \uc0ac\uc6a9\ud558\uba74 \uc0ac\uc6a9\uc790\uac00 \uc5ec\ub7ec \ud654\uc790\uac00 \ud3ec\ud568\ub41c \ucc38\uc870 \uc624\ub514\uc624\ub97c \uc5c5\ub85c\ub4dc\ud560 \uc218 \uc788\uc73c\uba70, \ubaa8\ub378\uc740 <code><|speaker:i|></code> \ud1a0\ud070\uc744 \ud1b5\ud574 \uac01 \ud654\uc790\uc758 \ud2b9\uc9d5\uc744 \ucc98\ub9ac\ud569\ub2c8\ub2e4. \uc774\ud6c4 \ud654\uc790 ID \ud1a0\ud070\uc744 \uc0ac\uc6a9\ud558\uc5ec \ubaa8\ub378\uc758 \ud45c\ud604\uc744 \uc81c\uc5b4\ud568\uc73c\ub85c\uc368 \ud55c \ubc88\uc758 \uc0dd\uc131\uc5d0 \uc5ec\ub7ec \ud654\uc790\ub97c \ud3ec\ud568\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. 
\ub354 \uc774\uc0c1 \ud654\uc790\ub9c8\ub2e4 \ubcc4\ub3c4\uc758 \ucc38\uc870 \uc624\ub514\uc624\ub97c \uc5c5\ub85c\ub4dc\ud558\uace0 \uc74c\uc131\uc744 \uc0dd\uc131\ud560 \ud544\uc694\uac00 \uc5c6\uc2b5\ub2c8\ub2e4.</p>"},{"location":"README.ko/#_9","title":"\ub2e4\uc911 \ud134 \ub300\ud654 \uc0dd\uc131","text":"<p>\ubaa8\ub378 \ucee8\ud14d\uc2a4\ud2b8 \ud655\uc7a5\uc5d0 \ud798\uc785\uc5b4 \uc774\uc81c \uc774\uc804 \uc815\ubcf4\uc758 \ub3c4\uc6c0\uc744 \ubc1b\uc544 \ud6c4\uc18d \uc0dd\uc131 \ub0b4\uc6a9\uc758 \ud45c\ud604\ub825\uc744 \ub192\uc774\uace0 \ucf58\ud150\uce20\uc758 \uc790\uc5f0\uc2a4\ub7ec\uc6c0\uc744 \ud5a5\uc0c1\uc2dc\ud0ac \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p>"},{"location":"README.ko/#_10","title":"\uace0\uc18d \uc74c\uc131 \ubcf5\uc81c","text":"<p>Fish Audio S2\ub294 \uc9e7\uc740 \ucc38\uc870 \uc0d8\ud50c(\ubcf4\ud1b5 10-30\ucd08)\uc744 \uc0ac\uc6a9\ud55c \uc815\ud655\ud55c \uc74c\uc131 \ubcf5\uc81c\ub97c \uc9c0\uc6d0\ud569\ub2c8\ub2e4. \ubaa8\ub378\uc740 \uc74c\uc0c9, \ub9d0\ud558\uae30 \uc2a4\ud0c0\uc77c \ubc0f \uac10\uc815\uc801 \uacbd\ud5a5\uc744 \ud3ec\ucc29\ud558\uc5ec \ucd94\uac00\uc801\uc778 \ubbf8\uc138 \uc870\uc815 \uc5c6\uc774\ub3c4 \uc0ac\uc2e4\uc801\uc774\uace0 \uc77c\uad00\ub41c \ubcf5\uc81c \uc74c\uc131\uc744 \uc0dd\uc131\ud569\ub2c8\ub2e4. 
SGLang \uc11c\ubc84 \uc0ac\uc6a9\uc5d0 \ub300\ud574\uc11c\ub294 SGLang-Omni README\ub97c \ucc38\uc870\ud558\uc2ed\uc2dc\uc624.</p>"},{"location":"README.ko/#_11","title":"\uac10\uc0ac\uc758 \ub9d0","text":"<ul> <li>VITS2 (daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"README.ko/#_12","title":"\uae30\uc220 \ubcf4\uace0\uc11c","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2603.08823}, \n}\n</code></pre>"},{"location":"README.pt-BR/","title":"README.pt BR","text":"Fish Speech [English](../README.md) | [\u7b80\u4f53\u4e2d\u6587](README.zh.md) | **Portuguese** | [\u65e5\u672c\u8a9e](README.ja.md) | [\ud55c\uad6d\uc5b4](README.ko.md) | [\u0627\u0644\u0639\u0631\u0628\u064a\u0629](README.ar.md) | [Espa\u00f1ol](docs/README.es.md) <p>[!IMPORTANT] Aviso de Licen\u00e7a Este reposit\u00f3rio de c\u00f3digo e seus pesos de modelo associados s\u00e3o lan\u00e7ados sob a FISH AUDIO RESEARCH LICENSE. Consulte LICENSE para obter mais detalhes.</p> <p>[!WARNING] Aviso Legal N\u00e3o nos responsabilizamos por qualquer uso ilegal deste reposit\u00f3rio. 
Consulte as leis locais sobre DMCA e outras regulamenta\u00e7\u00f5es relevantes.</p>"},{"location":"README.pt-BR/#inicio-rapido","title":"In\u00edcio R\u00e1pido","text":""},{"location":"README.pt-BR/#links-da-documentacao","title":"Links da Documenta\u00e7\u00e3o","text":"<p>Esta \u00e9 a documenta\u00e7\u00e3o oficial do Fish Audio S2, siga as instru\u00e7\u00f5es para come\u00e7ar facilmente.</p> <ul> <li>Instala\u00e7\u00e3o</li> <li>Infer\u00eancia por Linha de Comando</li> <li>Infer\u00eancia por WebUI</li> <li>Infer\u00eancia por Servidor</li> <li>Implanta\u00e7\u00e3o Docker</li> </ul> <p>[!IMPORTANT] Caso deseje utilizar o SGLang Server, consulte o SGLang-Omni README.</p>"},{"location":"README.pt-BR/#guia-para-agentes-de-llm","title":"Guia para Agentes de LLM","text":"<pre><code>Leia primeiro https://speech.fish.audio/install/ e siga a documenta\u00e7\u00e3o para instalar e configurar o Fish Audio S2.\n</code></pre>"},{"location":"README.pt-BR/#fish-audio-s2-pro","title":"Fish Audio S2 Pro","text":"<p>O sistema de convers\u00e3o de texto em fala (TTS) multil\u00edngue l\u00edder do setor, redefinindo as fronteiras da gera\u00e7\u00e3o de voz.</p> <p>Fish Audio S2 Pro \u00e9 o modelo multimodal mais avan\u00e7ado desenvolvido pela Fish Audio. Treinado em mais de 10 milh\u00f5es de horas de dados de \u00e1udio massivos, cobrindo mais de 80 idiomas globais. 
Atrav\u00e9s de uma arquitetura inovadora de Dual-Autoregressive (Dual-AR) e tecnologia de alinhamento por aprendizado por refor\u00e7o (RL), o S2 Pro \u00e9 capaz de gerar fala com um senso de naturalidade, realismo e riqueza emocional extremos, liderando tanto em competi\u00e7\u00f5es de c\u00f3digo aberto quanto propriet\u00e1rio.</p> <p>O grande diferencial do S2 Pro reside em seu suporte para controle inline de granularidade ultra-fina de pros\u00f3dia e emo\u00e7\u00e3o ao n\u00edvel de sub-palavra (Sub-word Level) via tags de linguagem natural (como <code>[whisper]</code>, <code>[excited]</code>, <code>[angry]</code>), al\u00e9m de suporte nativo para m\u00faltiplos falantes e gera\u00e7\u00e3o de di\u00e1logos de m\u00faltiplos turnos com contexto ultra-longo.</p> <p>Visite agora o site oficial da Fish Audio para experimentar a demonstra\u00e7\u00e3o online, ou leia nosso relat\u00f3rio t\u00e9cnico e artigo no blog para saber mais.</p>"},{"location":"README.pt-BR/#variantes-de-modelo","title":"Variantes de Modelo","text":"Modelo Tamanho Disponibilidade Descri\u00e7\u00e3o S2-Pro 4B par\u00e2metros HuggingFace Modelo flagship completo, com m\u00e1xima qualidade e estabilidade <p>Para mais detalhes sobre os modelos, consulte o relat\u00f3rio t\u00e9cnico.</p>"},{"location":"README.pt-BR/#resultados-de-benchmark","title":"Resultados de Benchmark","text":"Benchmark Fish Audio S2 Seed-TTS Eval \u2014 WER (Chin\u00eas) 0.54% (Melhor geral) Seed-TTS Eval \u2014 WER (Ingl\u00eas) 0.99% (Melhor geral) Audio Turing Test (Com instru\u00e7\u00e3o) 0.515 M\u00e9dia posterior EmergentTTS-Eval \u2014 Taxa de Vit\u00f3ria 81.88% (Maior geral) Fish Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 Qualidade 4.51 / 5.0 Multil\u00edngue (MiniMax Testset) \u2014 Melhor WER 11 de 24 idiomas Multil\u00edngue (MiniMax Testset) \u2014 Melhor SIM 17 de 24 idiomas <p>No Seed-TTS Eval, o S2 alcan\u00e7ou o menor WER entre todos os modelos avaliados (incluindo 
sistemas propriet\u00e1rios): Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). No Audio Turing Test, o valor de 0.515 do S2 representa um aumento de 24% em rela\u00e7\u00e3o ao Seed-TTS (0.417) e 33% em rela\u00e7\u00e3o ao MiniMax-Speech (0.387). No EmergentTTS-Eval, o S2 destacou-se especialmente em dimens\u00f5es como paralingu\u00edstica (taxa de vit\u00f3ria de 91.61%), frases interrogativas (84.41%) e complexidade sint\u00e1tica (83.39%).</p>"},{"location":"README.pt-BR/#destaques","title":"Destaques","text":""},{"location":"README.pt-BR/#controle-inline-de-granularidade-ultra-fina-via-linguagem-natural","title":"Controle Inline de Granularidade Ultra-Fina via Linguagem Natural","text":"<p>S2 Pro confere \u00e0 voz uma \"espiritualidade\" sem precedentes. Atrav\u00e9s de uma sintaxe simples de <code>[tag]</code>, voc\u00ea pode inserir instru\u00e7\u00f5es emocionais precisamente em qualquer posi\u00e7\u00e3o do texto. - Suporte para mais de 15.000 tags \u00fanicas: N\u00e3o limitado a predefini\u00e7\u00f5es fixas, suporta descri\u00e7\u00f5es textuais de formato livre. Voc\u00ea pode tentar <code>[whisper in small voice]</code> (sussurrando), <code>[professional broadcast tone]</code> (tom de locu\u00e7\u00e3o profissional) ou <code>[pitch up]</code> (aumentar o tom). 
- Rica biblioteca de emo\u00e7\u00f5es: <code>[pause]</code> <code>[emphasis]</code> <code>[laughing]</code> <code>[inhale]</code> <code>[chuckle]</code> <code>[tsk]</code> <code>[singing]</code> <code>[excited]</code> <code>[laughing tone]</code> <code>[interrupting]</code> <code>[chuckling]</code> <code>[excited tone]</code> <code>[volume up]</code> <code>[echo]</code> <code>[angry]</code> <code>[low volume]</code> <code>[sigh]</code> <code>[low voice]</code> <code>[whisper]</code> <code>[screaming]</code> <code>[shouting]</code> <code>[loud]</code> <code>[surprised]</code> <code>[short pause]</code> <code>[exhale]</code> <code>[delight]</code> <code>[panting]</code> <code>[audience laughter]</code> <code>[with strong accent]</code> <code>[volume down]</code> <code>[clearing throat]</code> <code>[sad]</code> <code>[moaning]</code> <code>[shocked]</code></p>"},{"location":"README.pt-BR/#arquitetura-inovadora-dual-autoregressive-dual-ar","title":"Arquitetura Inovadora Dual-Autoregressive (Dual-AR)","text":"<p>S2 Pro adota uma arquitetura Dual-AR mestre-escravo, consistindo de um Decoder-only Transformer e um codec de \u00e1udio RVQ (10 codebooks, cerca de 21 Hz de taxa de frames):</p> <ul> <li>Slow AR (4B par\u00e2metros): Atua ao longo do eixo temporal, prevendo o codebook sem\u00e2ntico central.</li> <li>Fast AR (400M par\u00e2metros): Gera os 9 codebooks residuais restantes em cada passo de tempo, restaurando detalhes ac\u00fasticos extremos com delicadeza.</li> </ul> <p>Este design assim\u00e9trico garante fidelidade extrema ao \u00e1udio enquanto aumenta significativamente a velocidade de infer\u00eancia.</p>"},{"location":"README.pt-BR/#alinhamento-por-aprendizado-por-reforco-rl-alignment","title":"Alinhamento por Aprendizado por Refor\u00e7o (RL Alignment)","text":"<p>S2 Pro utiliza a tecnologia Group Relative Policy Optimization (GRPO) para o alinhamento p\u00f3s-treinamento. 
Utilizamos o mesmo conjunto de modelos para limpeza e anota\u00e7\u00e3o de dados diretamente como modelos de recompensa (Reward Model), resolvendo perfeitamente o problema de descasamento entre a distribui\u00e7\u00e3o dos dados de pr\u00e9-treinamento e os objetivos de p\u00f3s-treinamento. - Sinais de recompensa multidimensionais: Avalia de forma abrangente a precis\u00e3o sem\u00e2ntica, a capacidade de seguir instru\u00e7\u00f5es, a pontua\u00e7\u00e3o de prefer\u00eancia ac\u00fastica e a similaridade de timbre, garantindo que cada segundo de fala gerada esteja alinhado com a intui\u00e7\u00e3o humana.</p>"},{"location":"README.pt-BR/#desempenho-de-inferencia-de-streaming-extremo-baseado-em-sglang","title":"Desempenho de Infer\u00eancia de Streaming Extremo (Baseado em SGLang)","text":"<p>Como a arquitetura Dual-AR \u00e9 estruturalmente isomorfa \u00e0 estrutura padr\u00e3o de LLMs, o S2 Pro suporta nativamente todos os recursos de acelera\u00e7\u00e3o de infer\u00eancia do SGLang, incluindo loteamento cont\u00ednuo (Continuous Batching), Paged KV Cache, CUDA Graph e cache de prefixo baseado em RadixAttention.</p> <p>Desempenho em uma \u00fanica GPU NVIDIA H200: - Fator em Tempo Real (RTF): 0.195 - Lat\u00eancia do Primeiro \u00c1udio (TTFA): aprox. 
100 ms - Taxa de Transfer\u00eancia Ultrarr\u00e1pida: Alcance de 3.000+ acoustic tokens/s mantendo RTF < 0.5</p>"},{"location":"README.pt-BR/#poderoso-suporte-multilingue","title":"Poderoso Suporte Multil\u00edngue","text":"<p>S2 Pro suporta mais de 80 idiomas, possibilitando s\u00edntese de alta qualidade sem a necessidade de fonemas ou processamento espec\u00edfico por idioma:</p> <ul> <li>Tier 1: Japon\u00eas (ja), Ingl\u00eas (en), Chin\u00eas (zh)</li> <li>Tier 2: Coreano (ko), Espanhol (es), Portugu\u00eas (pt), \u00c1rabe (ar), Russo (ru), Franc\u00eas (fr), Alem\u00e3o (de)</li> <li>Cobertura Global: sv, it, tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo, etc.</li> </ul>"},{"location":"README.pt-BR/#geracao-nativa-multi-falante","title":"Gera\u00e7\u00e3o Nativa Multi-falante","text":"<p>O Fish Audio S2 permite que os usu\u00e1rios enviem \u00e1udio de refer\u00eancia contendo m\u00faltiplos falantes, e o modelo processar\u00e1 as caracter\u00edsticas de cada falante via o token <code><|speaker:i|></code>. Em seguida, voc\u00ea pode controlar o desempenho do modelo atrav\u00e9s do token de ID do falante, permitindo incluir m\u00faltiplos falantes em uma \u00fanica gera\u00e7\u00e3o. 
N\u00e3o \u00e9 mais necess\u00e1rio enviar \u00e1udios de refer\u00eancia separadamente para cada falante.</p>"},{"location":"README.pt-BR/#geracao-de-dialogos-multiturnos","title":"Gera\u00e7\u00e3o de Di\u00e1logos Multiturnos","text":"<p>Gra\u00e7as \u00e0 expans\u00e3o do contexto do modelo, nosso modelo agora pode aproveitar as informa\u00e7\u00f5es pr\u00e9vias para aumentar a expressividade dos conte\u00fados gerados subsequentemente, elevando assim a naturalidade dos di\u00e1logos.</p>"},{"location":"README.pt-BR/#clonagem-de-voz-rapida","title":"Clonagem de Voz R\u00e1pida","text":"<p>O Fish Audio S2 suporta clonagem de voz precisa usando curtas amostras de refer\u00eancia (normalmente 10-30 segundos). O modelo captura o timbre, o estilo de fala e as tend\u00eancias emocionais, gerando vozes clonadas realistas e consistentes sem necessidade de ajustes finos adicionais. Caso deseje utilizar o SGLang Server, consulte o SGLang-Omni README.</p>"},{"location":"README.pt-BR/#agradecimentos","title":"Agradecimentos","text":"<ul> <li>VITS2 (daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"README.pt-BR/#relatorio-tecnico","title":"Relat\u00f3rio T\u00e9cnico","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n 
year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2603.08823}, \n}\n</code></pre>"},{"location":"README.zh/","title":"README.zh","text":"Fish Speech [English](../README.md) | **\u7b80\u4f53\u4e2d\u6587** | [Portuguese](README.pt-BR.md) | [\u65e5\u672c\u8a9e](README.ja.md) | [\ud55c\uad6d\uc5b4](README.ko.md) | [\u0627\u0644\u0639\u0631\u0628\u064a\u0629](README.ar.md) | [Espa\u00f1ol](docs/README.es.md) <p>[!IMPORTANT] \u8bb8\u53ef\u8bc1\u58f0\u660e \u6b64\u4ee3\u7801\u5e93\u53ca\u5176\u76f8\u5173\u7684\u6a21\u578b\u6743\u91cd\u5747\u5728 FISH AUDIO RESEARCH LICENSE \u4e0b\u53d1\u5e03\u3002\u66f4\u591a\u8be6\u60c5\u8bf7\u53c2\u8003 LICENSE\u3002</p> <p>[!WARNING] \u6cd5\u5f8b\u514d\u8d23\u58f0\u660e \u6211\u4eec\u4e0d\u5bf9\u4ee3\u7801\u5e93\u7684\u4efb\u4f55\u975e\u6cd5\u4f7f\u7528\u627f\u62c5\u8d23\u4efb\u3002\u8bf7\u53c2\u8003\u60a8\u5f53\u5730\u5173\u4e8e DMCA \u548c\u5176\u4ed6\u76f8\u5173\u6cd5\u5f8b\u7684\u6cd5\u89c4\u3002</p>"},{"location":"README.zh/#_1","title":"\u5feb\u901f\u5f00\u59cb","text":""},{"location":"README.zh/#_2","title":"\u6587\u6863\u5165\u53e3","text":"<p>\u8fd9\u91cc\u662f Fish Audio S2 \u7684\u5b98\u65b9\u6587\u6863\uff0c\u8bf7\u6309\u7167\u8bf4\u660e\u8f7b\u677e\u5165\u95e8\u3002</p> <ul> <li>\u5b89\u88c5</li> <li>\u547d\u4ee4\u884c\u63a8\u7406</li> <li>WebUI \u63a8\u7406</li> <li>\u670d\u52a1\u7aef\u63a8\u7406</li> <li>Docker \u90e8\u7f72</li> </ul> <p>[!IMPORTANT] \u5982\u9700\u4f7f\u7528 SGLang Server\uff0c\u8bf7\u53c2\u8003 SGLang-Omni README\u3002</p>"},{"location":"README.zh/#llm-agent","title":"LLM Agent \u6307\u5357","text":"<pre><code>\u8bf7\u5148\u9605\u8bfb https://speech.fish.audio/zh/install/ \uff0c\u5e76\u6309\u6587\u6863\u5b89\u88c5\u548c\u914d\u7f6e Fish Audio S2\u3002\n</code></pre>"},{"location":"README.zh/#fish-audio-s2-pro","title":"Fish Audio S2 Pro","text":"<p>\u884c\u4e1a\u9876\u5c16\u7684\u591a\u8bed\u8a00\u6587\u672c\u8f6c\u8bed\u97f3 (TTS) 
\u7cfb\u7edf\uff0c\u91cd\u65b0\u5b9a\u4e49\u58f0\u97f3\u751f\u6210\u7684\u8fb9\u754c\u3002</p> <p>Fish Audio S2 Pro \u662f Fish Audio \u5f00\u53d1\u7684\u6700\u5148\u8fdb\u7684\u591a\u6a21\u6001\u6a21\u578b\u3002S2 Pro \u8bad\u7ec3\u81ea\u8d85\u8fc7 1000 \u4e07\u5c0f\u65f6 \u7684\u6d77\u91cf\u97f3\u9891\u6570\u636e\uff0c\u8986\u76d6\u5168\u7403 80 \u591a\u79cd\u8bed\u8a00\u3002\u901a\u8fc7\u521b\u65b0\u7684 \u53cc\u81ea\u56de\u5f52 (Dual-AR) \u67b6\u6784\u4e0e\u5f3a\u5316\u5b66\u4e60 (RL) \u5bf9\u9f50\u6280\u672f\uff0cS2 Pro \u80fd\u751f\u6210\u6781\u5177\u81ea\u7136\u611f\u3001\u771f\u5b9e\u611f\u4e14\u60c5\u611f\u9971\u6ee1\u7684\u8bed\u97f3\uff0c\u5728\u5f00\u6e90\u4e0e\u95ed\u6e90\u7ade\u4e89\u4e2d\u5747\u5904\u4e8e\u9886\u5148\u5730\u4f4d\u3002</p> <p>S2 Pro \u7684\u6740\u624b\u950f\u5728\u4e8e\u652f\u6301\u901a\u8fc7\u81ea\u7136\u8bed\u8a00\u6807\u7b7e\uff08\u5982 <code>[whisper]</code>\u3001<code>[excited]</code>\u3001<code>[angry]</code>\uff09\u5bf9\u97f5\u5f8b\u4e0e\u60c5\u7eea\u8fdb\u884c \u4e9a\u8bcd\u7ea7\uff08Sub-word Level\uff09 \u7684\u6781\u7ec6\u7c92\u5ea6\u884c\u5185\u63a7\u5236\uff0c\u540c\u65f6\u539f\u751f\u652f\u6301\u591a\u8bf4\u8bdd\u4eba\u4e0e\u8d85\u957f\u4e0a\u4e0b\u6587\u7684\u591a\u8f6e\u5bf9\u8bdd\u751f\u6210\u3002</p> <p>\u7acb\u5373\u8bbf\u95ee Fish Audio \u5b98\u7f51 \u4f53\u9a8c\u5728\u7ebf\u6f14\u793a\uff0c\u6216\u9605\u8bfb\u6211\u4eec\u7684\u6280\u672f\u62a5\u544a\u4e0e\u535a\u5ba2\u6587\u7ae0\u6df1\u5165\u4e86\u89e3\u3002</p>"},{"location":"README.zh/#_3","title":"\u6a21\u578b\u53d8\u4f53","text":"\u6a21\u578b \u5927\u5c0f \u53ef\u7528\u6027 \u63cf\u8ff0 S2-Pro 4B \u53c2\u6570 HuggingFace \u529f\u80fd\u9f50\u5168\u7684\u65d7\u8230\u6a21\u578b\uff0c\u5177\u6709\u6700\u9ad8\u8d28\u91cf\u548c\u7a33\u5b9a\u6027 <p>\u6709\u5173\u6a21\u578b\u7684\u66f4\u591a\u8be6\u60c5\uff0c\u8bf7\u53c2\u89c1\u6280\u672f\u62a5\u544a\u3002</p>"},{"location":"README.zh/#_4","title":"\u57fa\u51c6\u6d4b\u8bd5\u7ed3\u679c","text":"\u57fa\u51c6 Fish Audio 
S2 Seed-TTS Eval \u2014 WER\uff08\u4e2d\u6587\uff09 0.54%\uff08\u603b\u4f53\u6700\u4f73\uff09 Seed-TTS Eval \u2014 WER\uff08\u82f1\u6587\uff09 0.99%\uff08\u603b\u4f53\u6700\u4f73\uff09 Audio Turing Test\uff08\u542b\u6307\u4ee4\uff09 0.515 \u540e\u9a8c\u5747\u503c EmergentTTS-Eval \u2014 \u80dc\u7387 81.88%\uff08\u603b\u4f53\u6700\u9ad8\uff09 Fish Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 \u8d28\u91cf 4.51 / 5.0 \u591a\u8bed\u8a00\uff08MiniMax Testset\uff09\u2014 \u6700\u4f73 WER 24 \u79cd\u8bed\u8a00\u4e2d\u7684 11 \u79cd \u591a\u8bed\u8a00\uff08MiniMax Testset\uff09\u2014 \u6700\u4f73 SIM 24 \u79cd\u8bed\u8a00\u4e2d\u7684 17 \u79cd <p>\u5728 Seed-TTS Eval \u4e0a\uff0cS2 \u5728\u6240\u6709\u5df2\u8bc4\u4f30\u6a21\u578b\uff08\u5305\u62ec\u95ed\u6e90\u7cfb\u7edf\uff09\u4e2d\u5b9e\u73b0\u4e86\u6700\u4f4e WER\uff1aQwen3-TTS\uff080.77/1.24\uff09\u3001MiniMax Speech-02\uff080.99/1.90\uff09\u3001Seed-TTS\uff081.12/2.25\uff09\u3002\u5728 Audio Turing Test \u4e0a\uff0cS2 \u7684 0.515 \u76f8\u6bd4 Seed-TTS\uff080.417\uff09\u63d0\u5347 24%\uff0c\u76f8\u6bd4 MiniMax-Speech\uff080.387\uff09\u63d0\u5347 33%\u3002\u5728 EmergentTTS-Eval \u4e2d\uff0cS2 \u5728\u526f\u8bed\u8a00\u5b66\uff0891.61% \u80dc\u7387\uff09\u3001\u7591\u95ee\u53e5\uff0884.41%\uff09\u548c\u53e5\u6cd5\u590d\u6742\u5ea6\uff0883.39%\uff09\u7b49\u7ef4\u5ea6\u8868\u73b0\u5c24\u4e3a\u7a81\u51fa\u3002</p>"},{"location":"README.zh/#_5","title":"\u4eae\u70b9","text":""},{"location":"README.zh/#_6","title":"\u901a\u8fc7\u81ea\u7136\u8bed\u8a00\u8fdb\u884c\u6781\u7ec6\u7c92\u5ea6\u884c\u5185\u63a7\u5236","text":"<p>S2 Pro \u8d4b\u4e88\u4e86\u8bed\u97f3\u524d\u6240\u672a\u6709\u7684\u201c\u7075\u6027\u201d\u3002\u901a\u8fc7\u7b80\u5355\u7684 <code>[tag]</code> \u8bed\u6cd5\uff0c\u4f60\u53ef\u4ee5\u5728\u6587\u672c\u7684\u4efb\u4f55\u4f4d\u7f6e\u7cbe\u51c6\u5d4c\u5165\u60c5\u611f\u6307\u4ee4\u3002 - 15,000+ 
\u72ec\u7279\u6807\u7b7e\u652f\u6301\uff1a\u4e0d\u5c40\u9650\u4e8e\u56fa\u5b9a\u7684\u9884\u8bbe\uff0c\u652f\u6301 \u81ea\u7531\u683c\u5f0f\u7684\u6587\u672c\u63cf\u8ff0\u3002\u4f60\u53ef\u4ee5\u5c1d\u8bd5 <code>[whisper in small voice]</code> (\u4f4e\u58f0\u8033\u8bed), <code>[professional broadcast tone]</code> (\u4e13\u4e1a\u64ad\u97f3\u8154), \u6216 <code>[pitch up]</code> (\u63d0\u9ad8\u97f3\u8c03)\u3002 - \u4e30\u5bcc\u7684\u60c5\u7eea\u5e93\uff1a <code>[pause]</code> <code>[emphasis]</code> <code>[laughing]</code> <code>[inhale]</code> <code>[chuckle]</code> <code>[tsk]</code> <code>[singing]</code> <code>[excited]</code> <code>[laughing tone]</code> <code>[interrupting]</code> <code>[chuckling]</code> <code>[excited tone]</code> <code>[volume up]</code> <code>[echo]</code> <code>[angry]</code> <code>[low volume]</code> <code>[sigh]</code> <code>[low voice]</code> <code>[whisper]</code> <code>[screaming]</code> <code>[shouting]</code> <code>[loud]</code> <code>[surprised]</code> <code>[short pause]</code> <code>[exhale]</code> <code>[delight]</code> <code>[panting]</code> <code>[audience laughter]</code> <code>[with strong accent]</code> <code>[volume down]</code> <code>[clearing throat]</code> <code>[sad]</code> <code>[moaning]</code> <code>[shocked]</code></p>"},{"location":"README.zh/#dual-autoregressive","title":"\u521b\u65b0\u7684\u53cc\u81ea\u56de\u5f52 (Dual-Autoregressive) \u67b6\u6784","text":"<p>S2 Pro \u91c7\u7528\u4e86\u4e3b\u4ece\u5f0f Dual-AR \u67b6\u6784\uff0c\u7531 Decoder-only Transformer \u4e0e RVQ \u97f3\u9891\u7f16\u89e3\u7801\u5668\uff0810 \u4e2a\u7801\u672c\uff0c\u7ea6 21 Hz \u5e27\u7387\uff09\u7ec4\u6210\uff1a</p> <ul> <li>Slow AR (4B \u53c2\u6570)\uff1a\u6cbf\u65f6\u95f4\u8f74\u5de5\u4f5c\uff0c\u9884\u6d4b\u6838\u5fc3\u7684\u8bed\u4e49\u7801\u672c\u3002</li> <li>Fast AR (400M \u53c2\u6570)\uff1a\u5728\u6bcf\u4e2a\u65f6\u95f4\u6b65\u751f\u6210\u5269\u4f59 9 
\u4e2a\u6b8b\u5dee\u7801\u672c\uff0c\u7ec6\u817b\u8fd8\u539f\u6781\u81f4\u7684\u97f3\u9891\u7ec6\u8282\u3002</li> </ul> <p>\u8fd9\u79cd\u975e\u5bf9\u79f0\u8bbe\u8ba1\u5728\u4fdd\u8bc1\u97f3\u9891\u6781\u81f4\u4fdd\u771f\u5ea6\u7684\u540c\u65f6\uff0c\u5927\u5e45\u63d0\u5347\u4e86\u63a8\u7406\u901f\u5ea6\u3002</p>"},{"location":"README.zh/#rl-alignment","title":"\u5f3a\u5316\u5b66\u4e60\u5bf9\u9f50 (RL Alignment)","text":"<p>S2 Pro \u91c7\u7528\u4e86 Group Relative Policy Optimization (GRPO) \u6280\u672f\u8fdb\u884c\u540e\u8bad\u7ec3\u5bf9\u9f50\u3002\u6211\u4eec\u5c06\u7528\u4e8e\u6570\u636e\u6e05\u6d17\u4e0e\u6807\u6ce8\u7684\u540c\u4e00\u5957\u6a21\u578b\u76f4\u63a5\u4f5c\u4e3a\u5956\u52b1\u6a21\u578b (Reward Model)\uff0c\u5b8c\u7f8e\u89e3\u51b3\u4e86\u9884\u8bad\u7ec3\u6570\u636e\u5206\u5e03\u4e0e\u540e\u8bad\u7ec3\u76ee\u6807\u4e4b\u95f4\u7684\u4e0d\u5339\u914d\u95ee\u9898\u3002 - \u591a\u7ef4\u5956\u52b1\u4fe1\u53f7\uff1a\u7efc\u5408\u8bc4\u4f30\u8bed\u4e49\u51c6\u786e\u6027\u3001\u6307\u4ee4\u9075\u5faa\u80fd\u529b\u3001\u58f0\u5b66\u504f\u597d\u8bc4\u5206\u4ee5\u53ca\u97f3\u8272\u76f8\u4f3c\u5ea6\uff0c\u786e\u4fdd\u751f\u6210\u7684\u6bcf\u4e00\u79d2\u8bed\u97f3\u90fd\u7b26\u5408\u4eba\u7c7b\u76f4\u89c9\u3002</p>"},{"location":"README.zh/#sglang","title":"\u6781\u81f4\u7684\u6d41\u5f0f\u63a8\u7406\u6027\u80fd (\u57fa\u4e8e SGLang)","text":"<p>\u7531\u4e8e Dual-AR \u67b6\u6784\u4e0e\u6807\u51c6 LLM \u7ed3\u6784\u540c\u6784\uff0cS2 Pro \u539f\u751f\u652f\u6301 SGLang \u7684\u6240\u6709\u63a8\u7406\u52a0\u901f\u7279\u6027\uff0c\u5305\u62ec\u8fde\u7eed\u6279\u5904\u7406 (Continuous Batching)\u3001\u5206\u9875 KV Cache\u3001CUDA Graph \u4e0e\u57fa\u4e8e RadixAttention \u7684\u524d\u7f00\u7f13\u5b58\u3002</p> <p>\u5355\u5f20 NVIDIA H200 GPU \u6027\u80fd\u8868\u73b0\uff1a - \u5b9e\u65f6\u56e0\u5b50 (RTF)\uff1a0.195 - \u9996\u97f3\u5ef6\u8fdf (TTFA)\uff1a\u7ea6 100 ms - \u6781\u901f\u541e\u5410\uff1a\u5728\u4fdd\u6301 RTF < 0.5 
\u65f6\uff0c\u541e\u5410\u91cf\u8fbe\u5230 3,000+ acoustic tokens/s</p>"},{"location":"README.zh/#_7","title":"\u5f3a\u5927\u7684\u591a\u8bed\u8a00\u652f\u6301","text":"<p>S2 Pro \u652f\u6301 80 \u591a\u79cd\u8bed\u8a00\uff0c\u65e0\u9700\u97f3\u7d20\u6216\u7279\u5b9a\u8bed\u8a00\u7684\u5904\u7406\u5373\u53ef\u5b9e\u73b0\u9ad8\u8d28\u91cf\u5408\u6210\uff1a</p> <ul> <li>\u7b2c\u4e00\u68af\u961f (Tier 1)\uff1a\u65e5\u8bed (ja), \u82f1\u8bed (en), \u4e2d\u6587 (zh)</li> <li>\u7b2c\u4e8c\u68af\u961f (Tier 2)\uff1a\u97e9\u8bed (ko), \u897f\u73ed\u7259\u8bed (es), \u8461\u8404\u7259\u8bed (pt), \u963f\u62c9\u4f2f\u8bed (ar), \u4fc4\u8bed (ru), \u6cd5\u8bed (fr), \u5fb7\u8bed (de)</li> <li>\u5168\u7403\u8986\u76d6\uff1asv, it, tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo \u7b49\u3002</li> </ul>"},{"location":"README.zh/#_8","title":"\u539f\u751f\u591a\u8bf4\u8bdd\u4eba\u751f\u6210","text":"<p>Fish Audio S2 \u5141\u8bb8\u7528\u6237\u4e0a\u4f20\u5305\u542b\u591a\u4e2a\u8bf4\u8bdd\u4eba\u7684\u53c2\u8003\u97f3\u9891\uff0c\u6a21\u578b\u5c06\u901a\u8fc7 <code><|speaker:i|></code> \u4ee4\u724c\u5904\u7406\u6bcf\u4e2a\u8bf4\u8bdd\u4eba\u7684\u7279\u5f81\u3002\u4e4b\u540e\u60a8\u53ef\u4ee5\u901a\u8fc7\u8bf4\u8bdd\u4eba ID 
\u4ee4\u724c\u63a7\u5236\u6a21\u578b\u7684\u8868\u73b0\uff0c\u4ece\u800c\u5b9e\u73b0\u4e00\u6b21\u751f\u6210\u4e2d\u5305\u542b\u591a\u4e2a\u8bf4\u8bdd\u4eba\u3002\u518d\u4e5f\u4e0d\u9700\u8981\u50cf\u4ee5\u524d\u90a3\u6837\u9488\u5bf9\u6bcf\u4e2a\u8bf4\u8bdd\u4eba\u90fd\u5355\u72ec\u4e0a\u4f20\u53c2\u8003\u97f3\u9891\u4e0e\u751f\u6210\u8bed\u97f3\u4e86\u3002</p>"},{"location":"README.zh/#_9","title":"\u591a\u8f6e\u5bf9\u8bdd\u751f\u6210","text":"<p>\u5f97\u76ca\u4e8e\u6a21\u578b\u4e0a\u4e0b\u6587\u7684\u6269\u5c55\uff0c\u6211\u4eec\u7684\u6a21\u578b\u73b0\u5728\u53ef\u4ee5\u501f\u52a9\u4e0a\u6587\u7684\u4fe1\u606f\u63d0\u9ad8\u540e\u7eed\u751f\u6210\u5185\u5bb9\u7684\u8868\u73b0\u529b\uff0c\u4ece\u800c\u63d0\u5347\u5185\u5bb9\u7684\u81ea\u7136\u5ea6\u3002</p>"},{"location":"README.zh/#_10","title":"\u5feb\u901f\u8bed\u97f3\u514b\u9686","text":"<p>Fish Audio S2 \u652f\u6301\u4f7f\u7528\u77ed\u53c2\u8003\u6837\u672c\uff08\u901a\u5e38\u4e3a 10-30 \u79d2\uff09\u8fdb\u884c\u51c6\u786e\u7684\u8bed\u97f3\u514b\u9686\u3002\u6a21\u578b\u53ef\u4ee5\u6355\u6349\u97f3\u8272\u3001\u8bf4\u8bdd\u98ce\u683c\u548c\u60c5\u611f\u503e\u5411\uff0c\u65e0\u9700\u989d\u5916\u5fae\u8c03\u5373\u53ef\u751f\u6210\u903c\u771f\u4e14\u4e00\u81f4\u7684\u514b\u9686\u8bed\u97f3\u3002 \u5982\u9700\u4f7f\u7528 SGLang Server\uff0c\u8bf7\u53c2\u8003 SGLang-Omni README \u3002</p>"},{"location":"README.zh/#_11","title":"\u81f4\u8c22","text":"<ul> <li>VITS2 (daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"README.zh/#_12","title":"\u6280\u672f\u62a5\u544a","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n 
url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2603.08823}, \n}\n</code></pre>"},{"location":"","title":"Introduction","text":"Fish Speech <p>English | \u7b80\u4f53\u4e2d\u6587 | Portuguese | \u65e5\u672c\u8a9e | \ud55c\uad6d\uc5b4 | \u0627\u0644\u0639\u0631\u0628\u064a\u0629 | Espa\u00f1ol</p> <p></p> <p></p> <p>License Notice</p> <p>This codebase and its associated model weights are released under FISH AUDIO RESEARCH LICENSE. Please refer to LICENSE for more details. We will take action against any violation of the license.</p> <p>Legal Disclaimer</p> <p>We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws.</p>"},{"location":"#quick-start","title":"Quick Start","text":""},{"location":"#for-human","title":"For Human","text":"<p>Here are the official documents for Fish Audio S2, follow the instructions to get started easily.</p> <ul> <li>Installation</li> <li>Command Line Inference</li> <li>WebUI Inference</li> <li>Server Inference</li> <li>Docker Setup</li> </ul> <p>[!IMPORTANT] For SGLang server, please read SGLang-Omni README.</p>"},{"location":"#for-llm-agent","title":"For LLM Agent","text":"<pre><code>Install and configure Fish-Audio S2 by following the instructions here: https://speech.fish.audio/install/\n</code></pre>"},{"location":"#fish-audio-s2","title":"Fish Audio S2","text":"<p>Best text-to-speech system among both open source and closed source</p> <p>Fish Audio S2 is the latest model developed by Fish Audio. 
Trained on over 10 million hours of audio across approximately 50 languages, S2 combines reinforcement learning alignment with a Dual-Autoregressive architecture to generate speech that sounds natural, realistic, and emotionally rich.</p> <p>S2 supports fine-grained inline control of prosody and emotion using natural-language tags like <code>[laugh]</code>, <code>[whispers]</code>, and <code>[super happy]</code>, as well as native multi-speaker and multi-turn generation.</p> <p>Visit the Fish Audio website for live playground. Read the blog post and technical report for more details.</p>"},{"location":"#model-variants","title":"Model Variants","text":"Model Size Availability Description S2-Pro 4B parameters HuggingFace Full-featured flagship model with maximum quality and stability <p>More details of the model can be found in the technical report.</p>"},{"location":"#benchmark-results","title":"Benchmark Results","text":"Benchmark Fish Audio S2 Seed-TTS Eval \u2014 WER (Chinese) 0.54% (best overall) Seed-TTS Eval \u2014 WER (English) 0.99% (best overall) Audio Turing Test (with instruction) 0.515 posterior mean EmergentTTS-Eval \u2014 Win Rate 81.88% (highest overall) Fish Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 Quality 4.51 / 5.0 Multilingual (MiniMax Testset) \u2014 Best WER 11 of 24 languages Multilingual (MiniMax Testset) \u2014 Best SIM 17 of 24 languages <p>On Seed-TTS Eval, S2 achieves the lowest WER among all evaluated models including closed-source systems: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). On the Audio Turing Test, 0.515 surpasses Seed-TTS (0.417) by 24% and MiniMax-Speech (0.387) by 33%. 
On EmergentTTS-Eval, S2 achieves particularly strong results in paralinguistics (91.61% win rate), questions (84.41%), and syntactic complexity (83.39%).</p>"},{"location":"#highlights","title":"Highlights","text":""},{"location":"#fine-grained-inline-control-via-natural-language","title":"Fine-Grained Inline Control via Natural Language","text":"<p>S2 enables localized control over speech generation by embedding natural-language instructions directly at specific word or phrase positions within the text. Rather than relying on a fixed set of predefined tags, S2 accepts free-form textual descriptions \u2014 such as <code>[whisper in small voice]</code>, <code>[professional broadcast tone]</code>, or <code>[pitch up]</code> \u2014 allowing open-ended expression control at the word level.</p>"},{"location":"#dual-autoregressive-architecture","title":"Dual-Autoregressive Architecture","text":"<p>S2 builds on a decoder-only transformer combined with an RVQ-based audio codec (10 codebooks, ~21 Hz frame rate). The Dual-AR architecture splits generation into two stages:</p> <ul> <li>Slow AR operates along the time axis and predicts the primary semantic codebook.</li> <li>Fast AR generates the remaining 9 residual codebooks at each time step, reconstructing fine-grained acoustic detail.</li> </ul> <p>This asymmetric design \u2014 4B parameters along the time axis, 400M parameters along the depth axis \u2014 keeps inference efficient while preserving audio fidelity.</p>"},{"location":"#reinforcement-learning-alignment","title":"Reinforcement Learning Alignment","text":"<p>S2 uses Group Relative Policy Optimization (GRPO) for post-training alignment. The same models used to filter and annotate training data are directly reused as reward models during RL \u2014 eliminating distribution mismatch between pre-training data and post-training objectives. 
The reward signal combines semantic accuracy, instruction adherence, acoustic preference scoring, and timbre similarity.</p>"},{"location":"#production-streaming-via-sglang","title":"Production Streaming via SGLang","text":"<p>Because the Dual-AR architecture is structurally isomorphic to standard autoregressive LLMs, S2 directly inherits all LLM-native serving optimizations from SGLang \u2014 including continuous batching, paged KV cache, CUDA graph replay, and RadixAttention-based prefix caching.</p> <p>On a single NVIDIA H200 GPU:</p> <ul> <li>Real-Time Factor (RTF): 0.195</li> <li>Time-to-first-audio: ~100 ms</li> <li>Throughput: 3,000+ acoustic tokens/s while maintaining RTF below 0.5</li> </ul>"},{"location":"#multilingual-support","title":"Multilingual Support","text":"<p>S2 supports high-quality multilingual text-to-speech without requiring phonemes or language-specific preprocessing. Supported languages include:</p> <p>English, Chinese, Japanese, Korean, Arabic, German, French...</p> <p>AND MORE!</p> <p>The list is constantly expanding; check Fish Audio for the latest releases.</p>"},{"location":"#native-multi-speaker-generation","title":"Native Multi-Speaker Generation","text":"<p>Fish Audio S2 allows users to upload reference audio containing multiple speakers; the model handles each speaker's features via the <code><|speaker:i|></code> token. You can then control the model's performance with the speaker ID token, allowing a single generation to include multiple speakers.
You no longer need to upload reference audio separately for each speaker.</p>"},{"location":"#multi-turn-generation","title":"Multi-Turn Generation","text":"<p>Thanks to the expansion of the model context, our model can now use previous information to improve the expressiveness of subsequent generated content, thereby increasing the naturalness of the content.</p>"},{"location":"#rapid-voice-cloning","title":"Rapid Voice Cloning","text":"<p>Fish Audio S2 supports accurate voice cloning using a short reference sample (typically 10\u201330 seconds). The model captures timbre, speaking style, and emotional tendencies, producing realistic and consistent cloned voices without additional fine-tuning. Please refer to SGLang-Omni README to use the SGLang server.</p>"},{"location":"#credits","title":"Credits","text":"<ul> <li>VITS2 (daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"#tech-report","title":"Tech Report","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2603.08823}, \n}\n</code></pre>"},{"location":"finetune/","title":"Fine-tuning","text":"<p>Warning</p> <p>We strongly advise users not to do
fine-tuning on an RL trained model. Fine-tuning a model after RL can shift the model distribution, which may lead to degraded performance.</p> <p>In the current version, you only need to finetune the 'LLAMA' part.</p>"},{"location":"finetune/#fine-tuning-llama","title":"Fine-tuning LLAMA","text":""},{"location":"finetune/#1-prepare-the-dataset","title":"1. Prepare the dataset","text":"<pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u2514\u2500\u2500 30.1-32.71.mp3\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u2514\u2500\u2500 38.79-40.85.mp3\n</code></pre> <p>You need to convert your dataset into the above format and place it under <code>data</code>. The audio file can have the extensions <code>.mp3</code>, <code>.wav</code>, or <code>.flac</code>, and the annotation file should have the extension <code>.lab</code>.</p> <p>Info</p> <p>The <code>.lab</code> annotation file only needs to contain the transcription of the audio, with no special formatting required. For example, if <code>hi.mp3</code> says \"Hello, goodbye,\" then the <code>hi.lab</code> file would contain a single line of text: \"Hello, goodbye.\"</p> <p>Warning</p> <p>It's recommended to apply loudness normalization to the dataset. You can use fish-audio-preprocess to do this.</p> <pre><code>fap loudness-norm data-raw data --clean\n</code></pre>"},{"location":"finetune/#2-batch-extraction-of-semantic-tokens","title":"2. Batch extraction of semantic tokens","text":"<p>Make sure you have downloaded the VQGAN weights. 
If not, run the following command:</p> <pre><code>huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n</code></pre> <p>You can then run the following command to extract semantic tokens:</p> <pre><code>python tools/vqgan/extract_vq.py data \\\n --num-workers 1 --batch-size 16 \\\n --config-name \"modded_dac_vq\" \\\n --checkpoint-path \"checkpoints/openaudio-s1-mini/codec.pth\"\n</code></pre> <p>Note</p> <p>You can adjust <code>--num-workers</code> and <code>--batch-size</code> to increase extraction speed, but please make sure not to exceed your GPU memory limit.</p> <p>This command will create <code>.npy</code> files in the <code>data</code> directory, as shown below:</p> <pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 21.15-26.44.npy\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.npy\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u251c\u2500\u2500 30.1-32.71.mp3\n\u2502 \u2514\u2500\u2500 30.1-32.71.npy\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u251c\u2500\u2500 38.79-40.85.mp3\n \u2514\u2500\u2500 38.79-40.85.npy\n</code></pre>"},{"location":"finetune/#3-pack-the-dataset-into-protobuf","title":"3. Pack the dataset into protobuf","text":"<pre><code>python tools/llama/build_dataset.py \\\n --input \"data\" \\\n --output \"data/protos\" \\\n --text-extension .lab \\\n --num-workers 16\n</code></pre> <p>After the command finishes executing, you should see the <code>protos</code> file in the <code>data</code> directory.</p>"},{"location":"finetune/#4-finally-fine-tuning-with-lora","title":"4. Finally, fine-tuning with LoRA","text":"<p>Similarly, make sure you have downloaded the <code>LLAMA</code> weights. 
If not, run the following command:</p> <pre><code>huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n</code></pre> <p>Finally, you can start the fine-tuning by running the following command:</p> <pre><code>python fish_speech/train.py --config-name text2semantic_finetune \\\n project=$project \\\n +lora@model.model.lora_config=r_8_alpha_16\n</code></pre> <p>Note</p> <p>You can modify the training parameters such as <code>batch_size</code>, <code>gradient_accumulation_steps</code>, etc. to fit your GPU memory by modifying <code>fish_speech/configs/text2semantic_finetune.yaml</code>.</p> <p>Note</p> <p>For Windows users, you can use <code>trainer.strategy.process_group_backend=gloo</code> to avoid <code>nccl</code> issues.</p> <p>After training is complete, you can refer to the inference section to test your model.</p> <p>Info</p> <p>By default, the model will only learn the speaker's speech patterns and not the timbre. You still need to use prompts to ensure timbre stability. If you want to learn the timbre, you can increase the number of training steps, but this may lead to overfitting.</p> <p>After training, you need to convert the LoRA weights to regular weights before performing inference.</p> <pre><code>python tools/llama/merge_lora.py \\\n --lora-config r_8_alpha_16 \\\n --base-weight checkpoints/openaudio-s1-mini \\\n --lora-weight results/$project/checkpoints/step_000000010.ckpt \\\n --output checkpoints/openaudio-s1-mini-yth-lora/\n</code></pre> <p>Note</p> <p>You may also try other checkpoints. We suggest using the earliest checkpoint that meets your requirements, as they often perform better on out-of-distribution (OOD) data.</p>"},{"location":"inference/","title":"Inference","text":"<p>The Fish Audio S2 model requires a large amount of VRAM. 
We recommend using a GPU with at least 24GB for inference.</p>"},{"location":"inference/#download-weights","title":"Download Weights","text":"<p>First, you need to download the model weights:</p> <pre><code>hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n</code></pre>"},{"location":"inference/#command-line-inference","title":"Command Line Inference","text":"<p>Note</p> <p>If you plan to let the model randomly choose a voice timbre, you can skip this step.</p>"},{"location":"inference/#1-get-vq-tokens-from-reference-audio","title":"1. Get VQ tokens from reference audio","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"test.wav\" \\\n --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n</code></pre> <p>You should get a <code>fake.npy</code> and a <code>fake.wav</code>.</p>"},{"location":"inference/#2-generate-semantic-tokens-from-text","title":"2. Generate Semantic tokens from text:","text":"<pre><code>python fish_speech/models/text2semantic/inference.py \\\n --text \"The text you want to convert\" \\\n --prompt-text \"Your reference text\" \\\n --prompt-tokens \"fake.npy\" \\\n # --compile\n</code></pre> <p>This command will create a <code>codes_N</code> file in the working directory, where N is an integer starting from 0.</p> <p>Note</p> <p>You may want to use <code>--compile</code> to fuse CUDA kernels for faster inference. However, we recommend using our sglang inference acceleration optimization. Correspondingly, if you do not plan to use acceleration, you can comment out the <code>--compile</code> parameter.</p> <p>Info</p> <p>For GPUs that do not support bf16, you may need to use the <code>--half</code> parameter.</p>"},{"location":"inference/#3-generate-vocals-from-semantic-tokens","title":"3. 
Generate vocals from semantic tokens:","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"codes_0.npy\" \\\n</code></pre> <p>After that, you will get a <code>fake.wav</code> file.</p>"},{"location":"inference/#webui-inference","title":"WebUI Inference","text":""},{"location":"inference/#1-gradio-webui","title":"1. Gradio WebUI","text":"<p>For compatibility, we still maintain the Gradio WebUI.</p> <pre><code>python tools/run_webui.py # --compile if you need acceleration\n</code></pre>"},{"location":"inference/#2-awesome-webui","title":"2. Awesome WebUI","text":"<p>Awesome WebUI is a modernized Web interface built with TypeScript, offering richer features and a better user experience.</p> <p>Build WebUI:</p> <p>You need to have Node.js and npm installed on your local machine or server.</p> <ol> <li>Enter the <code>awesome_webui</code> directory: <pre><code>cd awesome_webui\n</code></pre></li> <li>Install dependencies: <pre><code>npm install\n</code></pre></li> <li>Build the WebUI: <pre><code>npm run build\n</code></pre></li> </ol> <p>Start Backend Server:</p> <p>After building the WebUI, return to the project root and start the API server:</p> <pre><code>python tools/api_server.py --listen 0.0.0.0:8888 --compile\n</code></pre> <p>Access:</p> <p>Once the server is running, you can access it via your browser: <code>http://localhost:8888/ui</code></p>"},{"location":"install/","title":"Installation","text":""},{"location":"install/#requirements","title":"Requirements","text":"<ul> <li>GPU Memory: 24GB (Inference)</li> <li>System: Linux, WSL</li> </ul>"},{"location":"install/#system-setup","title":"System Setup","text":"<p>Fish Audio S2 supports multiple installation methods. 
Choose the one that best fits your development environment.</p> <p>Prerequisites: Install system dependencies for audio processing: <pre><code>apt install portaudio19-dev libsox-dev ffmpeg\n</code></pre></p>"},{"location":"install/#conda","title":"Conda","text":"<pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# GPU installation (choose your CUDA version: cu126, cu128, cu129)\npip install -e .[cu129]\n\n# CPU-only installation\npip install -e .[cpu]\n\n# Default installation (uses PyTorch default index)\npip install -e .\n\n# If you encounter an error during installation due to pyaudio, consider using the following command:\n# conda install pyaudio\n# Then run pip install -e . again\n</code></pre>"},{"location":"install/#uv","title":"UV","text":"<p>UV provides faster dependency resolution and installation:</p> <pre><code># GPU installation (choose your CUDA version: cu126, cu128, cu129)\nuv sync --python 3.12 --extra cu129\n\n# CPU-only installation\nuv sync --python 3.12 --extra cpu\n</code></pre>"},{"location":"install/#intel-arc-xpu-support","title":"Intel Arc XPU support","text":"<p>For Intel Arc GPU users, install with XPU support:</p> <pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# Install required C++ standard library\nconda install libstdcxx -c conda-forge\n\n# Install PyTorch with Intel XPU support\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# Install Fish Speech\npip install -e .\n</code></pre> <p>Warning</p> <p>The <code>compile</code> option is not supported on Windows and macOS. If you want to run with compile, you need to install Triton manually.</p>"},{"location":"install/#docker-setup","title":"Docker Setup","text":"<p>Fish Audio S2 series model provides multiple Docker deployment options to suit different needs. 
You can use pre-built images from Docker Hub, build locally with Docker Compose, or manually build custom images.</p> <p>We provide Docker images for both WebUI and API server on both GPU (CUDA126 by default) and CPU. You can use the pre-built images from Docker Hub, build locally with Docker Compose, or manually build custom images. If you want to build locally, follow the instructions below. If you only want to use pre-built images, follow the inference guide.</p>"},{"location":"install/#prerequisites","title":"Prerequisites","text":"<ul> <li>Docker and Docker Compose installed</li> <li>NVIDIA Docker runtime (for GPU support)</li> <li>At least 24GB GPU memory for CUDA inference</li> </ul>"},{"location":"install/#use-docker-compose","title":"Use docker compose","text":"<p>For development or customization, you can use Docker Compose to build and run locally:</p> <pre><code># Clone the repository first\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# Start WebUI with CUDA\ndocker compose --profile webui up\n\n# Start WebUI with compile optimization\nCOMPILE=1 docker compose --profile webui up\n\n# Start API server\ndocker compose --profile server up\n\n# Start API server with compile optimization \nCOMPILE=1 docker compose --profile server up\n\n# For CPU-only deployment\nBACKEND=cpu docker compose --profile webui up\n</code></pre>"},{"location":"install/#environment-variables-for-docker-compose","title":"Environment Variables for Docker Compose","text":"<p>You can customize the deployment using environment variables:</p> <pre><code># .env file example\nBACKEND=cuda # or cpu\nCOMPILE=1 # Enable compile optimization\nGRADIO_PORT=7860 # WebUI port\nAPI_PORT=8080 # API server port\nUV_VERSION=0.8.15 # UV package manager version\nCUDA_VER=12.9.0 # CUDA base image version (e.g. 
12.6.0 for older drivers)\nUV_EXTRA=cu129 # PyTorch CUDA variant (cu126, cu128, cu129) \u2014 must match CUDA_VER\n</code></pre> <p>The command will build the image and run the container. You can access the WebUI at <code>http://localhost:7860</code> and the API server at <code>http://localhost:8080</code>.</p>"},{"location":"install/#manual-docker-build","title":"Manual Docker Build","text":"<p>For advanced users who want to customize the build process:</p> <pre><code># Build WebUI image with CUDA support\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.9.0 \\\n --build-arg UV_EXTRA=cu129 \\\n --target webui \\\n -t fish-speech-webui:cuda .\n\n# Build API server image with CUDA support\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.9.0 \\\n --build-arg UV_EXTRA=cu129 \\\n --target server \\\n -t fish-speech-server:cuda .\n\n# Build CPU-only images (supports multi-platform)\ndocker build \\\n --platform linux/amd64,linux/arm64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cpu \\\n --target webui \\\n -t fish-speech-webui:cpu .\n\n# Build development image\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --target dev \\\n -t fish-speech-dev:cuda .\n</code></pre>"},{"location":"install/#build-arguments","title":"Build Arguments","text":"<ul> <li><code>BACKEND</code>: <code>cuda</code> or <code>cpu</code> (default: <code>cuda</code>)</li> <li><code>CUDA_VER</code>: CUDA version (default: <code>12.6.0</code>)</li> <li><code>UV_EXTRA</code>: UV extra for CUDA (default: <code>cu126</code>)</li> <li><code>UBUNTU_VER</code>: Ubuntu version (default: <code>24.04</code>)</li> <li><code>PY_VER</code>: Python version (default: <code>3.12</code>)</li> </ul>"},{"location":"install/#volume-mounts","title":"Volume Mounts","text":"<p>Both methods require mounting these 
directories:</p> <ul> <li><code>./checkpoints:/app/checkpoints</code> - Model weights directory</li> <li><code>./references:/app/references</code> - Reference audio files directory</li> </ul>"},{"location":"install/#environment-variables","title":"Environment Variables","text":"<ul> <li><code>COMPILE=1</code> - Enable torch.compile for faster inference (~10x speedup)</li> <li><code>GRADIO_SERVER_NAME=0.0.0.0</code> - WebUI server host</li> <li><code>GRADIO_SERVER_PORT=7860</code> - WebUI server port</li> <li><code>API_SERVER_NAME=0.0.0.0</code> - API server host </li> <li><code>API_SERVER_PORT=8080</code> - API server port</li> </ul> <p>Note</p> <p>The Docker containers expect model weights to be mounted at <code>/app/checkpoints</code>. Make sure to download the required model weights before starting the containers.</p> <p>Warning</p> <p>GPU support requires NVIDIA Docker runtime. For CPU-only deployment, remove the <code>--gpus all</code> flag and use CPU images.</p>"},{"location":"server/","title":"Server","text":"<p>This page covers server-side inference for Fish Audio S2, plus quick links for WebUI inference and Docker deployment.</p>"},{"location":"server/#api-server-inference","title":"API Server Inference","text":"<p>Fish Speech provides an HTTP API server entrypoint at <code>tools/api_server.py</code>.</p>"},{"location":"server/#start-the-server-locally","title":"Start the server locally","text":"<pre><code>python tools/api_server.py \\\n --llama-checkpoint-path checkpoints/s2-pro \\\n --decoder-checkpoint-path checkpoints/s2-pro/codec.pth \\\n --listen 0.0.0.0:8080\n</code></pre> <p>Common options:</p> <ul> <li><code>--compile</code>: enable <code>torch.compile</code> optimization</li> <li><code>--half</code>: use fp16 mode</li> <li><code>--api-key</code>: require bearer token authentication</li> <li><code>--workers</code>: set worker process count</li> </ul>"},{"location":"server/#health-check","title":"Health check","text":"<pre><code>curl -X GET 
http://127.0.0.1:8080/v1/health\n</code></pre> <p>Expected response:</p> <pre><code>{\"status\":\"ok\"}\n</code></pre>"},{"location":"server/#main-api-endpoint","title":"Main API endpoint","text":"<ul> <li><code>POST /v1/tts</code> for text-to-speech generation</li> <li><code>POST /v1/vqgan/encode</code> for VQ encode</li> <li><code>POST /v1/vqgan/decode</code> for VQ decode</li> </ul>"},{"location":"server/#python-client-example","title":"Python client example","text":"<p>The base TTS model is selected when the server starts. In the example above, the server is started with the <code>checkpoints/s2-pro</code> weights, so every request sent to <code>http://127.0.0.1:8080/v1/tts</code> will use S2-Pro automatically. There is no separate per-request <code>model</code> field in <code>tools/api_client.py</code> for local server calls.</p> <pre><code>python tools/api_client.py \\\n --url http://127.0.0.1:8080/v1/tts \\\n --text \"Hello from Fish Speech\" \\\n --output s2-pro-demo\n</code></pre> <p>If you want to select a saved reference voice, use <code>--reference_id</code>. 
This chooses the voice reference, not the base TTS model:</p> <pre><code>python tools/api_client.py \\\n --url http://127.0.0.1:8080/v1/tts \\\n --text \"Hello from Fish Speech\" \\\n --reference_id my-speaker \\\n --output s2-pro-demo\n</code></pre>"},{"location":"server/#webui-inference","title":"WebUI Inference","text":"<p>For WebUI usage, see:</p> <ul> <li>WebUI Inference</li> </ul>"},{"location":"server/#docker","title":"Docker","text":"<p>For Docker-based server or WebUI deployment, see:</p> <ul> <li>Docker Setup</li> </ul> <p>You can also start the server profile directly with Docker Compose:</p> <pre><code>docker compose --profile server up\n</code></pre>"},{"location":"zh/","title":"\u4ecb\u7ecd","text":"Fish Speech <p>English | \u7b80\u4f53\u4e2d\u6587 | Portuguese | \u65e5\u672c\u8a9e | \ud55c\uad6d\uc5b4 | \u0627\u0644\u0639\u0631\u0628\u064a\u0629 | Espa\u00f1ol</p> <p></p> <p></p> <p>\u8bb8\u53ef\u58f0\u660e</p> <p>\u6b64\u4ee3\u7801\u5e93\u53ca\u5176\u76f8\u5173\u7684\u6a21\u578b\u6743\u91cd\u5747\u5728 FISH AUDIO RESEARCH LICENSE \u4e0b\u53d1\u5e03\u3002\u66f4\u591a\u8be6\u60c5\u8bf7\u53c2\u8003 LICENSE\u3002</p> <p>\u6cd5\u5f8b\u514d\u8d23\u58f0\u660e</p> <p>\u6211\u4eec\u4e0d\u5bf9\u4ee3\u7801\u5e93\u7684\u4efb\u4f55\u975e\u6cd5\u4f7f\u7528\u627f\u62c5\u8d23\u4efb\u3002\u8bf7\u53c2\u8003\u60a8\u5f53\u5730\u5173\u4e8e DMCA \u548c\u5176\u4ed6\u76f8\u5173\u6cd5\u5f8b\u7684\u6cd5\u89c4\u3002</p>"},{"location":"zh/#_1","title":"\u5feb\u901f\u5f00\u59cb","text":""},{"location":"zh/#_2","title":"\u6587\u6863\u5165\u53e3","text":"<p>\u8fd9\u91cc\u662f Fish Audio S2 \u7684\u5b98\u65b9\u6587\u6863\uff0c\u8bf7\u6309\u7167\u8bf4\u660e\u8f7b\u677e\u5165\u95e8\u3002</p> <ul> <li>\u5b89\u88c5</li> <li>\u547d\u4ee4\u884c\u63a8\u7406</li> <li>WebUI \u63a8\u7406</li> <li>\u670d\u52a1\u7aef\u63a8\u7406</li> <li>Docker \u90e8\u7f72</li> </ul> <p>[!IMPORTANT] \u5982\u9700\u4f7f\u7528 SGLang Server\uff0c\u8bf7\u53c2\u8003 SGLang-Omni 
README\u3002</p>"},{"location":"zh/#llm-agent","title":"LLM Agent \u6307\u5357","text":"<pre><code>\u8bf7\u5148\u9605\u8bfb https://speech.fish.audio/zh/install/ \uff0c\u5e76\u6309\u6587\u6863\u5b89\u88c5\u548c\u914d\u7f6e Fish Audio S2\u3002\n</code></pre>"},{"location":"zh/#fish-audio-s2","title":"Fish Audio S2","text":"<p>\u5728\u5f00\u6e90\u4e0e\u95ed\u6e90\u65b9\u6848\u4e2d\u90fd\u5904\u4e8e\u9886\u5148\u6c34\u5e73\u7684\u6587\u672c\u8f6c\u8bed\u97f3\u7cfb\u7edf</p> <p>Fish Audio S2 \u662f\u7531 Fish Audio \u5f00\u53d1\u7684\u6700\u65b0\u6a21\u578b\u3002S2 \u5728\u7ea6 50 \u79cd\u8bed\u8a00\u3001\u8d85\u8fc7 1000 \u4e07\u5c0f\u65f6\u97f3\u9891\u6570\u636e\u4e0a\u5b8c\u6210\u8bad\u7ec3\uff0c\u5e76\u7ed3\u5408\u5f3a\u5316\u5b66\u4e60\u5bf9\u9f50\u4e0e\u53cc\u81ea\u56de\u5f52\u67b6\u6784\uff0c\u80fd\u591f\u751f\u6210\u81ea\u7136\u3001\u771f\u5b9e\u4e14\u60c5\u611f\u4e30\u5bcc\u7684\u8bed\u97f3\u3002</p> <p>S2 \u652f\u6301\u901a\u8fc7\u81ea\u7136\u8bed\u8a00\u6807\u7b7e\uff08\u5982 <code>[laugh]</code>\u3001<code>[whispers]</code>\u3001<code>[super happy]</code>\uff09\u5bf9\u97f5\u5f8b\u548c\u60c5\u7eea\u8fdb\u884c\u7ec6\u7c92\u5ea6\u884c\u5185\u63a7\u5236\uff0c\u540c\u65f6\u539f\u751f\u652f\u6301\u591a\u8bf4\u8bdd\u4eba\u548c\u591a\u8f6e\u751f\u6210\u3002</p> <p>\u8bf7\u8bbf\u95ee Fish Audio \u7f51\u7ad9 \u4f53\u9a8c\u5728\u7ebf\u6f14\u793a\uff0c\u5e76\u9605\u8bfb\u535a\u5ba2\u6587\u7ae0\u548c\u6280\u672f\u62a5\u544a\u4e86\u89e3\u66f4\u591a\u7ec6\u8282\u3002</p>"},{"location":"zh/#_3","title":"\u6a21\u578b\u53d8\u4f53","text":"\u6a21\u578b \u5927\u5c0f \u53ef\u7528\u6027 \u63cf\u8ff0 S2-Pro 4B \u53c2\u6570 HuggingFace \u529f\u80fd\u9f50\u5168\u7684\u65d7\u8230\u6a21\u578b\uff0c\u5177\u6709\u6700\u9ad8\u8d28\u91cf\u548c\u7a33\u5b9a\u6027 <p>\u6709\u5173\u6a21\u578b\u7684\u66f4\u591a\u8be6\u60c5\uff0c\u8bf7\u53c2\u89c1\u6280\u672f\u62a5\u544a\u3002</p>"},{"location":"zh/#_4","title":"\u57fa\u51c6\u6d4b\u8bd5\u7ed3\u679c","text":"\u57fa\u51c6 Fish Audio S2 Seed-TTS 
Eval \u2014 WER\uff08\u4e2d\u6587\uff09 0.54%\uff08\u603b\u4f53\u6700\u4f73\uff09 Seed-TTS Eval \u2014 WER\uff08\u82f1\u6587\uff09 0.99%\uff08\u603b\u4f53\u6700\u4f73\uff09 Audio Turing Test\uff08\u542b\u6307\u4ee4\uff09 0.515 \u540e\u9a8c\u5747\u503c EmergentTTS-Eval \u2014 \u80dc\u7387 81.88%\uff08\u603b\u4f53\u6700\u9ad8\uff09 Fish Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 \u8d28\u91cf 4.51 / 5.0 \u591a\u8bed\u8a00\uff08MiniMax Testset\uff09\u2014 \u6700\u4f73 WER 24 \u79cd\u8bed\u8a00\u4e2d\u7684 11 \u79cd \u591a\u8bed\u8a00\uff08MiniMax Testset\uff09\u2014 \u6700\u4f73 SIM 24 \u79cd\u8bed\u8a00\u4e2d\u7684 17 \u79cd <p>\u5728 Seed-TTS Eval \u4e0a\uff0cS2 \u5728\u6240\u6709\u5df2\u8bc4\u4f30\u6a21\u578b\uff08\u5305\u62ec\u95ed\u6e90\u7cfb\u7edf\uff09\u4e2d\u5b9e\u73b0\u4e86\u6700\u4f4e WER\uff1aQwen3-TTS\uff080.77/1.24\uff09\u3001MiniMax Speech-02\uff080.99/1.90\uff09\u3001Seed-TTS\uff081.12/2.25\uff09\u3002\u5728 Audio Turing Test \u4e0a\uff0cS2 \u7684 0.515 \u76f8\u6bd4 Seed-TTS\uff080.417\uff09\u63d0\u5347 24%\uff0c\u76f8\u6bd4 MiniMax-Speech\uff080.387\uff09\u63d0\u5347 33%\u3002\u5728 EmergentTTS-Eval \u4e2d\uff0cS2 \u5728\u526f\u8bed\u8a00\u5b66\uff0891.61% \u80dc\u7387\uff09\u3001\u7591\u95ee\u53e5\uff0884.41%\uff09\u548c\u53e5\u6cd5\u590d\u6742\u5ea6\uff0883.39%\uff09\u7b49\u7ef4\u5ea6\u8868\u73b0\u5c24\u4e3a\u7a81\u51fa\u3002</p>"},{"location":"zh/#_5","title":"\u4eae\u70b9","text":""},{"location":"zh/#_6","title":"\u901a\u8fc7\u81ea\u7136\u8bed\u8a00\u8fdb\u884c\u7ec6\u7c92\u5ea6\u884c\u5185\u63a7\u5236","text":"<p>Fish Audio S2 \u652f\u6301\u5728\u6587\u672c\u4e2d\u7684\u7279\u5b9a\u8bcd\u6216\u77ed\u8bed\u4f4d\u7f6e\u76f4\u63a5\u5d4c\u5165\u81ea\u7136\u8bed\u8a00\u6307\u4ee4\uff0c\u4ece\u800c\u5bf9\u8bed\u97f3\u751f\u6210\u8fdb\u884c\u5c40\u90e8\u63a7\u5236\u3002\u4e0e\u4f9d\u8d56\u56fa\u5b9a\u9884\u8bbe\u6807\u7b7e\u4e0d\u540c\uff0cS2 \u63a5\u53d7\u81ea\u7531\u5f62\u5f0f\u7684\u6587\u672c\u63cf\u8ff0\uff0c\u4f8b\u5982 
[whisper in small voice]\u3001[professional broadcast tone] \u6216 [pitch up]\uff0c\u5b9e\u73b0\u8bcd\u7ea7\u522b\u7684\u5f00\u653e\u5f0f\u8868\u8fbe\u63a7\u5236\u3002</p>"},{"location":"zh/#dual-autoregressive","title":"\u53cc\u81ea\u56de\u5f52\u67b6\u6784\uff08Dual-Autoregressive\uff09","text":"<p>S2 \u57fa\u4e8e\u4ec5\u89e3\u7801\u5668 Transformer\uff0c\u5e76\u7ed3\u5408 RVQ \u97f3\u9891\u7f16\u89e3\u7801\u5668\uff0810 \u4e2a\u7801\u672c\uff0c\u7ea6 21 Hz \u5e27\u7387\uff09\u3002Dual-AR \u67b6\u6784\u5c06\u751f\u6210\u62c6\u5206\u4e3a\u4e24\u4e2a\u9636\u6bb5\uff1a</p> <ul> <li>Slow AR \u6cbf\u65f6\u95f4\u8f74\u8fd0\u884c\uff0c\u9884\u6d4b\u4e3b\u8bed\u4e49\u7801\u672c\u3002</li> <li>Fast AR \u5728\u6bcf\u4e2a\u65f6\u95f4\u6b65\u751f\u6210\u5269\u4f59 9 \u4e2a\u6b8b\u5dee\u7801\u672c\uff0c\u7528\u4e8e\u91cd\u5efa\u7ec6\u7c92\u5ea6\u58f0\u5b66\u7ec6\u8282\u3002</li> </ul> <p>\u8fd9\u79cd\u975e\u5bf9\u79f0\u8bbe\u8ba1\uff08\u65f6\u95f4\u8f74 4B \u53c2\u6570\u3001\u6df1\u5ea6\u8f74 400M \u53c2\u6570\uff09\u5728\u4fdd\u6301\u97f3\u9891\u4fdd\u771f\u5ea6\u7684\u540c\u65f6\uff0c\u63d0\u9ad8\u4e86\u63a8\u7406\u6548\u7387\u3002</p>"},{"location":"zh/#_7","title":"\u5f3a\u5316\u5b66\u4e60\u5bf9\u9f50","text":"<p>S2 \u4f7f\u7528 Group Relative Policy Optimization\uff08GRPO\uff09\u8fdb\u884c\u540e\u8bad\u7ec3\u5bf9\u9f50\u3002\u7528\u4e8e\u8fc7\u6ee4\u548c\u6807\u6ce8\u8bad\u7ec3\u6570\u636e\u7684\u540c\u4e00\u6279\u6a21\u578b\u88ab\u76f4\u63a5\u590d\u7528\u4e3a RL \u7684\u5956\u52b1\u6a21\u578b\uff0c\u4ece\u800c\u907f\u514d\u4e86\u9884\u8bad\u7ec3\u6570\u636e\u5206\u5e03\u4e0e\u540e\u8bad\u7ec3\u76ee\u6807\u4e4b\u95f4\u7684\u4e0d\u5339\u914d\u3002\u5956\u52b1\u4fe1\u53f7\u7efc\u5408\u4e86\u8bed\u4e49\u51c6\u786e\u6027\u3001\u6307\u4ee4\u9075\u5faa\u3001\u58f0\u5b66\u504f\u597d\u8bc4\u5206\u4e0e\u97f3\u8272\u76f8\u4f3c\u5ea6\u3002</p>"},{"location":"zh/#sglang","title":"\u57fa\u4e8e SGLang \u7684\u751f\u4ea7\u7ea7\u6d41\u5f0f\u63a8\u7406","text":"<p>\u7531\u4e8e Dual-AR 
\u67b6\u6784\u5728\u7ed3\u6784\u4e0a\u4e0e\u6807\u51c6\u81ea\u56de\u5f52 LLM \u540c\u6784\uff0cS2 \u53ef\u4ee5\u76f4\u63a5\u7ee7\u627f SGLang \u63d0\u4f9b\u7684 LLM \u539f\u751f\u670d\u52a1\u4f18\u5316\u80fd\u529b\uff0c\u5305\u62ec\u8fde\u7eed\u6279\u5904\u7406\u3001\u5206\u9875 KV Cache\u3001CUDA Graph Replay \u4e0e\u57fa\u4e8e RadixAttention \u7684\u524d\u7f00\u7f13\u5b58\u3002</p> <p>\u5728\u5355\u5f20 NVIDIA H200 GPU \u4e0a\uff1a</p> <ul> <li>\u5b9e\u65f6\u56e0\u5b50\uff08RTF\uff09\uff1a 0.195</li> <li>\u9996\u97f3\u9891\u5ef6\u8fdf\uff1a \u7ea6 100 ms</li> <li>\u541e\u5410\uff1a \u5728 RTF \u4f4e\u4e8e 0.5 \u7684\u60c5\u51b5\u4e0b\u8fbe\u5230 3,000+ acoustic tokens/s</li> </ul>"},{"location":"zh/#_8","title":"\u591a\u8bed\u8a00\u652f\u6301","text":"<p>Fish Audio S2 \u652f\u6301\u9ad8\u8d28\u91cf\u7684\u591a\u8bed\u8a00\u6587\u672c\u8f6c\u8bed\u97f3\uff0c\u65e0\u9700\u97f3\u7d20\u6216\u7279\u5b9a\u8bed\u8a00\u7684\u9884\u5904\u7406\u3002\u5305\u62ec\uff1a</p> <p>\u82f1\u8bed\u3001\u4e2d\u6587\u3001\u65e5\u8bed\u3001\u97e9\u8bed\u3001\u963f\u62c9\u4f2f\u8bed\u3001\u5fb7\u8bed\u3001\u6cd5\u8bed...</p> <p>\u4ee5\u53ca\u66f4\u591a\uff01</p> <p>\u5217\u8868\u6b63\u5728\u4e0d\u65ad\u6269\u5927\uff0c\u8bf7\u67e5\u770b Fish Audio \u83b7\u53d6\u6700\u65b0\u53d1\u5e03\u3002</p>"},{"location":"zh/#_9","title":"\u539f\u751f\u591a\u8bf4\u8bdd\u4eba\u751f\u6210","text":"<p>Fish Audio S2 \u5141\u8bb8\u7528\u6237\u4e0a\u4f20\u5305\u542b\u591a\u4e2a\u8bf4\u8bdd\u4eba\u7684\u53c2\u8003\u97f3\u9891\uff0c\u6a21\u578b\u5c06\u901a\u8fc7 <code><|speaker:i|></code> \u4ee4\u724c\u5904\u7406\u6bcf\u4e2a\u8bf4\u8bdd\u4eba\u7684\u7279\u5f81\u3002\u4e4b\u540e\u60a8\u53ef\u4ee5\u901a\u8fc7\u8bf4\u8bdd\u4eba ID 
\u4ee4\u724c\u63a7\u5236\u6a21\u578b\u7684\u8868\u73b0\uff0c\u4ece\u800c\u5b9e\u73b0\u4e00\u6b21\u751f\u6210\u4e2d\u5305\u542b\u591a\u4e2a\u8bf4\u8bdd\u4eba\u3002\u518d\u4e5f\u4e0d\u9700\u8981\u50cf\u4ee5\u524d\u90a3\u6837\u9488\u5bf9\u6bcf\u4e2a\u8bf4\u8bdd\u4eba\u90fd\u5355\u72ec\u4e0a\u4f20\u53c2\u8003\u97f3\u9891\u4e0e\u751f\u6210\u8bed\u97f3\u4e86\u3002</p>"},{"location":"zh/#_10","title":"\u591a\u8f6e\u5bf9\u8bdd\u751f\u6210","text":"<p>\u5f97\u76ca\u4e8e\u6a21\u578b\u4e0a\u4e0b\u6587\u7684\u6269\u5c55\uff0c\u6211\u4eec\u7684\u6a21\u578b\u73b0\u5728\u53ef\u4ee5\u501f\u52a9\u4e0a\u6587\u7684\u4fe1\u606f\u63d0\u9ad8\u540e\u7eed\u751f\u6210\u5185\u5bb9\u7684\u8868\u73b0\u529b\uff0c\u4ece\u800c\u63d0\u5347\u5185\u5bb9\u7684\u81ea\u7136\u5ea6\u3002</p>"},{"location":"zh/#_11","title":"\u5feb\u901f\u8bed\u97f3\u514b\u9686","text":"<p>Fish Audio S2 \u652f\u6301\u4f7f\u7528\u77ed\u53c2\u8003\u6837\u672c\uff08\u901a\u5e38\u4e3a 10-30 \u79d2\uff09\u8fdb\u884c\u51c6\u786e\u7684\u8bed\u97f3\u514b\u9686\u3002\u6a21\u578b\u53ef\u4ee5\u6355\u6349\u97f3\u8272\u3001\u8bf4\u8bdd\u98ce\u683c\u548c\u60c5\u611f\u503e\u5411\uff0c\u65e0\u9700\u989d\u5916\u5fae\u8c03\u5373\u53ef\u751f\u6210\u903c\u771f\u4e14\u4e00\u81f4\u7684\u514b\u9686\u8bed\u97f3\u3002 \u5982\u9700\u4f7f\u7528 SGLang Server\uff0c\u8bf7\u53c2\u8003 SGLang-Omni README \u3002</p>"},{"location":"zh/#_12","title":"\u81f4\u8c22","text":"<ul> <li>VITS2 (daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"zh/#_13","title":"\u6280\u672f\u62a5\u544a","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n 
url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2603.08823}, \n}\n</code></pre>"},{"location":"zh/finetune/","title":"\u5fae\u8c03","text":"<p>\u663e\u7136, \u5f53\u4f60\u6253\u5f00\u8fd9\u4e2a\u9875\u9762\u7684\u65f6\u5019, \u4f60\u5df2\u7ecf\u5bf9\u9884\u8bad\u7ec3\u6a21\u578b zero-shot \u7684\u6548\u679c\u4e0d\u7b97\u6ee1\u610f. \u4f60\u60f3\u8981\u5fae\u8c03\u4e00\u4e2a\u6a21\u578b, \u4f7f\u5f97\u5b83\u5728\u4f60\u7684\u6570\u636e\u96c6\u4e0a\u8868\u73b0\u66f4\u597d. </p> <p>\u5728\u76ee\u524d\u7248\u672c\uff0c\u4f60\u53ea\u9700\u8981\u5fae\u8c03'LLAMA'\u90e8\u5206\u5373\u53ef.</p>"},{"location":"zh/finetune/#llama","title":"LLAMA \u5fae\u8c03","text":""},{"location":"zh/finetune/#1","title":"1. 
\u51c6\u5907\u6570\u636e\u96c6","text":"<pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u2514\u2500\u2500 30.1-32.71.mp3\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u2514\u2500\u2500 38.79-40.85.mp3\n</code></pre> <p>\u4f60\u9700\u8981\u5c06\u6570\u636e\u96c6\u8f6c\u4e3a\u4ee5\u4e0a\u683c\u5f0f, \u5e76\u653e\u5230 <code>data</code> \u4e0b, \u97f3\u9891\u540e\u7f00\u53ef\u4ee5\u4e3a <code>.mp3</code>, <code>.wav</code> \u6216 <code>.flac</code>, \u6807\u6ce8\u6587\u4ef6\u540e\u7f00\u5efa\u8bae\u4e3a <code>.lab</code>.</p> <p>Info</p> <p>\u6807\u6ce8\u6587\u4ef6 <code>.lab</code> \u4ec5\u9700\u5305\u542b\u97f3\u9891\u7684\u8f6c\u5199\u6587\u672c\uff0c\u65e0\u9700\u9075\u5faa\u7279\u6b8a\u683c\u5f0f\u8981\u6c42\u3002\u4f8b\u5982\uff0c\u5982\u679c <code>hi.mp3</code> \u4e2d\u7684\u5185\u5bb9\u662f\u201c\u4f60\u597d\uff0c\u518d\u89c1\u3002\u201d\uff0c\u90a3\u4e48 <code>hi.lab</code> \u6587\u4ef6\u4e2d\u53ea\u9700\u5305\u542b\u4e00\u884c\u6587\u672c\uff1a\u201c\u4f60\u597d\uff0c\u518d\u89c1\u201d\u3002 </p> <p>Warning</p> <p>\u5efa\u8bae\u5148\u5bf9\u6570\u636e\u96c6\u8fdb\u884c\u54cd\u5ea6\u5339\u914d, \u4f60\u53ef\u4ee5\u4f7f\u7528 fish-audio-preprocess \u6765\u5b8c\u6210\u8fd9\u4e00\u6b65\u9aa4. <pre><code>fap loudness-norm data-raw data --clean\n</code></pre></p>"},{"location":"zh/finetune/#2-token","title":"2. 
\u6279\u91cf\u63d0\u53d6\u8bed\u4e49 token","text":"<p>\u786e\u4fdd\u4f60\u5df2\u7ecf\u4e0b\u8f7d\u4e86 vqgan \u6743\u91cd, \u5982\u679c\u6ca1\u6709, \u8bf7\u8fd0\u884c\u4ee5\u4e0b\u547d\u4ee4:</p> <pre><code>huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n</code></pre> <p>\u968f\u540e\u53ef\u8fd0\u884c\u4ee5\u4e0b\u547d\u4ee4\u6765\u63d0\u53d6\u8bed\u4e49 token:</p> <pre><code>python tools/vqgan/extract_vq.py data \\\n --num-workers 1 --batch-size 16 \\\n --config-name \"modded_dac_vq\" \\\n --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n</code></pre> <p>Note</p> <p>\u4f60\u53ef\u4ee5\u8c03\u6574 <code>--num-workers</code> \u548c <code>--batch-size</code> \u6765\u63d0\u9ad8\u63d0\u53d6\u901f\u5ea6, \u4f46\u662f\u8bf7\u6ce8\u610f\u4e0d\u8981\u8d85\u8fc7\u4f60\u7684\u663e\u5b58\u9650\u5236. </p> <p>\u8be5\u547d\u4ee4\u4f1a\u5728 <code>data</code> \u76ee\u5f55\u4e0b\u521b\u5efa <code>.npy</code> \u6587\u4ef6, \u5982\u4e0b\u6240\u793a:</p> <pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 21.15-26.44.npy\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.npy\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u251c\u2500\u2500 30.1-32.71.mp3\n\u2502 \u2514\u2500\u2500 30.1-32.71.npy\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u251c\u2500\u2500 38.79-40.85.mp3\n \u2514\u2500\u2500 38.79-40.85.npy\n</code></pre>"},{"location":"zh/finetune/#3-protobuf","title":"3. 
\u6253\u5305\u6570\u636e\u96c6\u4e3a protobuf","text":"<pre><code>python tools/llama/build_dataset.py \\\n --input \"data\" \\\n --output \"data/protos\" \\\n --text-extension .lab \\\n --num-workers 16\n</code></pre> <p>\u547d\u4ee4\u6267\u884c\u5b8c\u6bd5\u540e, \u4f60\u5e94\u8be5\u80fd\u5728 <code>data</code> \u76ee\u5f55\u4e0b\u770b\u5230 <code>protos</code> \u6587\u4ef6.</p>"},{"location":"zh/finetune/#4-lora","title":"4. \u6700\u540e, \u4f7f\u7528 LoRA \u8fdb\u884c\u5fae\u8c03","text":"<p>\u540c\u6837\u7684, \u8bf7\u786e\u4fdd\u4f60\u5df2\u7ecf\u4e0b\u8f7d\u4e86 <code>LLAMA</code> \u6743\u91cd, \u5982\u679c\u6ca1\u6709, \u8bf7\u8fd0\u884c\u4ee5\u4e0b\u547d\u4ee4:</p> <pre><code>huggingface-cli download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n</code></pre> <p>\u6700\u540e, \u4f60\u53ef\u4ee5\u8fd0\u884c\u4ee5\u4e0b\u547d\u4ee4\u6765\u542f\u52a8\u5fae\u8c03:</p> <pre><code>python fish_speech/train.py --config-name text2semantic_finetune \\\n project=$project \\\n +lora@model.model.lora_config=r_8_alpha_16\n</code></pre> <p>Note</p> <p>\u4f60\u53ef\u4ee5\u901a\u8fc7\u4fee\u6539 <code>fish_speech/configs/text2semantic_finetune.yaml</code> \u6765\u4fee\u6539\u8bad\u7ec3\u53c2\u6570\u5982 <code>batch_size</code>, <code>gradient_accumulation_steps</code> \u7b49, \u6765\u9002\u5e94\u4f60\u7684\u663e\u5b58.</p> <p>Note</p> <p>\u5bf9\u4e8e Windows \u7528\u6237, \u4f60\u53ef\u4ee5\u4f7f\u7528 <code>trainer.strategy.process_group_backend=gloo</code> \u6765\u907f\u514d <code>nccl</code> \u7684\u95ee\u9898.</p> <p>\u8bad\u7ec3\u7ed3\u675f\u540e, \u4f60\u53ef\u4ee5\u53c2\u8003 \u63a8\u7406 \u90e8\u5206\u6765\u6d4b\u8bd5\u4f60\u7684\u6a21\u578b.</p> <p>Info</p> <p>\u9ed8\u8ba4\u914d\u7f6e\u4e0b, \u57fa\u672c\u53ea\u4f1a\u5b66\u5230\u8bf4\u8bdd\u4eba\u7684\u53d1\u97f3\u65b9\u5f0f, \u800c\u4e0d\u5305\u542b\u97f3\u8272, \u4f60\u4f9d\u7136\u9700\u8981\u4f7f\u7528 prompt \u6765\u4fdd\u8bc1\u97f3\u8272\u7684\u7a33\u5b9a\u6027. 
\u5982\u679c\u4f60\u60f3\u8981\u5b66\u5230\u97f3\u8272, \u8bf7\u5c06\u8bad\u7ec3\u6b65\u6570\u8c03\u5927, \u4f46\u8fd9\u6709\u53ef\u80fd\u4f1a\u5bfc\u81f4\u8fc7\u62df\u5408. </p> <p>\u8bad\u7ec3\u5b8c\u6210\u540e, \u4f60\u9700\u8981\u5148\u5c06 loRA \u7684\u6743\u91cd\u8f6c\u4e3a\u666e\u901a\u6743\u91cd, \u7136\u540e\u518d\u8fdb\u884c\u63a8\u7406.</p> <pre><code>python tools/llama/merge_lora.py \\\n --lora-config r_8_alpha_16 \\\n --base-weight checkpoints/s2-pro \\\n --lora-weight results/$project/checkpoints/step_000000010.ckpt \\\n --output checkpoints/s2-pro-yth-lora/\n</code></pre> <p>Note</p> <p>\u4f60\u4e5f\u53ef\u4ee5\u5c1d\u8bd5\u5176\u4ed6\u7684 checkpoint, \u6211\u4eec\u5efa\u8bae\u4f60\u4f7f\u7528\u6700\u65e9\u7684\u6ee1\u8db3\u4f60\u8981\u6c42\u7684 checkpoint, \u4ed6\u4eec\u901a\u5e38\u5728 OOD \u4e0a\u8868\u73b0\u66f4\u597d.</p>"},{"location":"zh/inference/","title":"\u63a8\u7406","text":"<p>Fish Audio S2 \u6a21\u578b\u9700\u8981\u8f83\u5927\u7684\u663e\u5b58\uff0c\u6211\u4eec\u63a8\u8350\u60a8\u4f7f\u7528\u81f3\u5c1124GB\u7684GPU\u8fdb\u884c\u63a8\u7406\u3002</p>"},{"location":"zh/inference/#_2","title":"\u4e0b\u8f7d\u6743\u91cd","text":"<p>\u9996\u5148\u60a8\u9700\u8981\u4e0b\u8f7d\u6a21\u578b\u6743\u91cd\uff1a</p> <pre><code>hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n</code></pre>"},{"location":"zh/inference/#_3","title":"\u547d\u4ee4\u884c\u63a8\u7406","text":"<p>Note</p> <p>\u5982\u679c\u60a8\u8ba1\u5212\u8ba9\u6a21\u578b\u968f\u673a\u9009\u62e9\u97f3\u8272\uff0c\u53ef\u4ee5\u8df3\u8fc7\u6b64\u6b65\u9aa4\u3002</p>"},{"location":"zh/inference/#1-vq-tokens","title":"1. 
\u4ece\u53c2\u8003\u97f3\u9891\u83b7\u53d6 VQ tokens","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"test.wav\" \\\n --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n</code></pre> <p>\u60a8\u5e94\u8be5\u4f1a\u5f97\u5230\u4e00\u4e2a <code>fake.npy</code> \u548c\u4e00\u4e2a <code>fake.wav</code>\u3002</p>"},{"location":"zh/inference/#2-semantic-tokens","title":"2. \u4ece\u6587\u672c\u751f\u6210 Semantic tokens\uff1a","text":"<pre><code>python fish_speech/models/text2semantic/inference.py \\\n --text \"\u60a8\u60f3\u8981\u8f6c\u6362\u7684\u6587\u672c\" \\\n --prompt-text \"\u60a8\u7684\u53c2\u8003\u6587\u672c\" \\\n --prompt-tokens \"fake.npy\" \\\n # --compile\n</code></pre> <p>\u6b64\u547d\u4ee4\u5c06\u5728\u5de5\u4f5c\u76ee\u5f55\u4e2d\u521b\u5efa\u4e00\u4e2a <code>codes_N</code> \u6587\u4ef6\uff0c\u5176\u4e2d N \u662f\u4ece 0 \u5f00\u59cb\u7684\u6574\u6570\u3002</p> <p>Note</p> <p>\u60a8\u53ef\u80fd\u5e0c\u671b\u4f7f\u7528 <code>--compile</code> \u6765\u878d\u5408 CUDA \u5185\u6838\u4ee5\u5b9e\u73b0\u66f4\u5feb\u7684\u63a8\u7406\uff0c\u4f46\u662f\u6211\u4eec\u66f4\u63a8\u8350\u60a8\u4f7f\u7528\u6211\u4eecsglang\u7684\u63a8\u7406\u52a0\u901f\u4f18\u5316\u3002 \u76f8\u5e94\u5730\uff0c\u5982\u679c\u60a8\u4e0d\u8ba1\u5212\u4f7f\u7528\u52a0\u901f\uff0c\u53ef\u4ee5\u6ce8\u91ca\u6389 <code>--compile</code> \u53c2\u6570\u3002</p> <p>Info</p> <p>\u5bf9\u4e8e\u4e0d\u652f\u6301 bf16 \u7684 GPU\uff0c\u60a8\u53ef\u80fd\u9700\u8981\u4f7f\u7528 <code>--half</code> \u53c2\u6570\u3002</p>"},{"location":"zh/inference/#3","title":"3. \u4ece\u8bed\u4e49\u4ee4\u724c\u751f\u6210\u58f0\u97f3\uff1a","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"codes_0.npy\" \\\n</code></pre> <p>\u4e4b\u540e\u4f60\u4f1a\u5f97\u5230\u4e00\u4e2afake.wav\u6587\u4ef6\u3002</p>"},{"location":"zh/inference/#webui","title":"WebUI \u63a8\u7406","text":""},{"location":"zh/inference/#1-gradio-webui","title":"1. 
Gradio WebUI","text":"<p>\u4e3a\u4e86\u4fdd\u6301\u517c\u5bb9\uff0c\u6211\u4eec\u4fdd\u7559\u4e86\u4ee5\u5f80\u7684Gradio WebUI\u3002</p> <pre><code>python tools/run_webui.py # --compile \u5982\u679c\u4f60\u9700\u8981\u52a0\u901f\u7684\u8bdd\n</code></pre>"},{"location":"zh/inference/#2-awesome-webui","title":"2. Awesome WebUI","text":"<p>Awesome WebUI \u662f\u4e00\u4e2a\u57fa\u4e8e TypeScript \u5f00\u53d1\u7684\u73b0\u4ee3\u5316 Web \u754c\u9762\uff0c\u63d0\u4f9b\u66f4\u4e30\u5bcc\u7684\u529f\u80fd\u548c\u66f4\u597d\u7684\u4ea4\u4e92\u4f53\u9a8c\u3002</p> <p>\u6784\u5efa WebUI\uff1a</p> <p>\u60a8\u9700\u8981\u5148\u5728\u672c\u5730\u6216\u8005\u670d\u52a1\u5668\u4e0a\u5b89\u88c5 Node.js \u548c npm\u3002</p> <ol> <li>\u8fdb\u5165 <code>awesome_webui</code> \u76ee\u5f55\uff1a <pre><code>cd awesome_webui\n</code></pre></li> <li>\u5b89\u88c5\u4f9d\u8d56\uff1a <pre><code>npm install\n</code></pre></li> <li>\u6784\u5efa WebUI\uff1a <pre><code>npm run build\n</code></pre></li> </ol> <p>\u542f\u52a8\u540e\u7aef\u670d\u52a1\u5668\uff1a</p> <p>WebUI \u6784\u5efa\u5b8c\u6210\u540e\uff0c\u8fd4\u56de\u9879\u76ee\u6839\u76ee\u5f55\uff0c\u542f\u52a8 API \u670d\u52a1\u5668\uff1a</p> <pre><code>python tools/api_server.py --listen 0.0.0.0:8888 --compile\n</code></pre> <p>\u8bbf\u95ee\uff1a</p> <p>\u5728\u670d\u52a1\u5668\u542f\u52a8\u540e\uff0c\u60a8\u53ef\u4ee5\u901a\u8fc7\u6d4f\u89c8\u5668\u8bbf\u95ee\u4ee5\u4e0b\u5730\u5740\u4f53\u9a8c\uff1a <code>http://localhost:8888/ui</code></p>"},{"location":"zh/install/","title":"\u5b89\u88c5","text":""},{"location":"zh/install/#_1","title":"\u7cfb\u7edf\u8981\u6c42","text":"<ul> <li>GPU \u663e\u5b58\uff1a24GB\uff08\u7528\u4e8e\u63a8\u7406\uff09</li> <li>\u7cfb\u7edf\uff1aLinux\u3001WSL</li> </ul>"},{"location":"zh/install/#_2","title":"\u7cfb\u7edf\u8bbe\u7f6e","text":"<p>Fish Audio S2 
\u652f\u6301\u591a\u79cd\u5b89\u88c5\u65b9\u5f0f\u3002\u8bf7\u9009\u62e9\u6700\u9002\u5408\u4f60\u5f53\u524d\u5f00\u53d1\u73af\u5883\u7684\u65b9\u6848\u3002</p> <p>\u524d\u7f6e\u4f9d\u8d56\uff1a\u5148\u5b89\u88c5\u97f3\u9891\u5904\u7406\u6240\u9700\u7684\u7cfb\u7edf\u4f9d\u8d56\uff1a <pre><code>apt install portaudio19-dev libsox-dev ffmpeg\n</code></pre></p>"},{"location":"zh/install/#conda","title":"Conda","text":"<pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# GPU \u5b89\u88c5\uff08\u9009\u62e9 CUDA \u7248\u672c\uff1acu126\u3001cu128\u3001cu129\uff09\npip install -e .[cu129]\n\n# \u4ec5 CPU \u5b89\u88c5\npip install -e .[cpu]\n\n# \u9ed8\u8ba4\u5b89\u88c5\uff08\u4f7f\u7528 PyTorch \u9ed8\u8ba4\u7d22\u5f15\uff09\npip install -e .\n\n# \u5982\u679c\u56e0 pyaudio \u5bfc\u81f4\u5b89\u88c5\u62a5\u9519\uff0c\u53ef\u4ee5\u5148\u6267\u884c\uff1a\n# conda install pyaudio\n# \u7136\u540e\u91cd\u65b0\u6267\u884c pip install -e .\n</code></pre>"},{"location":"zh/install/#uv","title":"UV","text":"<p>UV \u53ef\u4ee5\u66f4\u5feb\u5730\u5b8c\u6210\u4f9d\u8d56\u89e3\u6790\u4e0e\u5b89\u88c5\uff1a</p> <pre><code># GPU \u5b89\u88c5\uff08\u9009\u62e9 CUDA \u7248\u672c\uff1acu126\u3001cu128\u3001cu129\uff09\nuv sync --python 3.12 --extra cu129\n\n# \u4ec5 CPU \u5b89\u88c5\nuv sync --python 3.12 --extra cpu\n</code></pre>"},{"location":"zh/install/#intel-arc-xpu","title":"Intel Arc XPU \u652f\u6301","text":"<p>\u5982\u679c\u4f60\u4f7f\u7528 Intel Arc GPU\uff0c\u53ef\u6309\u4ee5\u4e0b\u65b9\u5f0f\u5b89\u88c5 XPU \u652f\u6301\uff1a</p> <pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# \u5b89\u88c5\u5fc5\u9700\u7684 C++ \u6807\u51c6\u5e93\nconda install libstdcxx -c conda-forge\n\n# \u5b89\u88c5\u652f\u6301 Intel XPU \u7684 PyTorch\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# \u5b89\u88c5 Fish Speech\npip install -e .\n</code></pre> <p>Warning</p> 
<p><code>compile</code> \u9009\u9879\u6682\u4e0d\u652f\u6301 Windows \u548c macOS\u3002\u82e5\u4f60\u5e0c\u671b\u542f\u7528 compile\uff0c\u8bf7\u624b\u52a8\u5b89\u88c5 Triton\u3002</p>"},{"location":"zh/install/#docker","title":"Docker \u8bbe\u7f6e","text":"<p>Fish Audio S2 \u7cfb\u5217\u6a21\u578b\u63d0\u4f9b\u591a\u79cd Docker \u90e8\u7f72\u65b9\u5f0f\uff0c\u9002\u914d\u4e0d\u540c\u573a\u666f\u3002\u4f60\u53ef\u4ee5\u76f4\u63a5\u4f7f\u7528 Docker Hub \u9884\u6784\u5efa\u955c\u50cf\uff0c\u4e5f\u53ef\u4ee5\u7528 Docker Compose \u672c\u5730\u6784\u5efa\uff0c\u6216\u624b\u52a8\u6784\u5efa\u81ea\u5b9a\u4e49\u955c\u50cf\u3002</p> <p>\u6211\u4eec\u63d0\u4f9b WebUI \u4e0e API Server \u7684 GPU\uff08\u9ed8\u8ba4 CUDA126\uff09\u548c CPU \u955c\u50cf\u3002\u4f60\u53ef\u4ee5\u76f4\u63a5\u7528 Docker Hub \u955c\u50cf\uff0c\u4e5f\u53ef\u4ee5\u5728\u672c\u5730\u6784\u5efa\u3002\u5982\u679c\u4f60\u53ea\u60f3\u4f7f\u7528\u9884\u6784\u5efa\u955c\u50cf\uff0c\u8bf7\u53c2\u8003inference guide\u3002</p>"},{"location":"zh/install/#_3","title":"\u524d\u7f6e\u6761\u4ef6","text":"<ul> <li>\u5df2\u5b89\u88c5 Docker \u548c Docker Compose</li> <li>\uff08GPU \u573a\u666f\uff09\u5df2\u5b89\u88c5 NVIDIA Docker runtime</li> <li>CUDA \u63a8\u7406\u5efa\u8bae\u81f3\u5c11 24GB \u663e\u5b58</li> </ul>"},{"location":"zh/install/#docker-compose","title":"\u4f7f\u7528 Docker Compose","text":"<p>\u5982\u679c\u4f60\u9700\u8981\u5f00\u53d1\u6216\u81ea\u5b9a\u4e49\uff0c\u63a8\u8350\u4f7f\u7528 Docker Compose \u5728\u672c\u5730\u6784\u5efa\u5e76\u8fd0\u884c\uff1a</p> <pre><code># \u5148\u514b\u9686\u4ed3\u5e93\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# \u4f7f\u7528 CUDA \u542f\u52a8 WebUI\ndocker compose --profile webui up\n\n# \u542f\u7528 compile \u4f18\u5316\u542f\u52a8 WebUI\nCOMPILE=1 docker compose --profile webui up\n\n# \u542f\u52a8 API Server\ndocker compose --profile server up\n\n# \u542f\u7528 compile \u4f18\u5316\u542f\u52a8 API Server\nCOMPILE=1 docker compose 
--profile server up\n\n# \u4ec5 CPU \u90e8\u7f72\nBACKEND=cpu docker compose --profile webui up\n</code></pre>"},{"location":"zh/install/#docker-compose_1","title":"Docker Compose \u73af\u5883\u53d8\u91cf","text":"<p>\u4f60\u53ef\u4ee5\u901a\u8fc7\u73af\u5883\u53d8\u91cf\u5b9a\u5236\u90e8\u7f72\u53c2\u6570\uff1a</p> <pre><code># .env \u6587\u4ef6\u793a\u4f8b\nBACKEND=cuda # \u6216 cpu\nCOMPILE=1 # \u542f\u7528 compile \u4f18\u5316\nGRADIO_PORT=7860 # WebUI \u7aef\u53e3\nAPI_PORT=8080 # API Server \u7aef\u53e3\nUV_VERSION=0.8.15 # UV \u5305\u7ba1\u7406\u5668\u7248\u672c\n</code></pre> <p>\u547d\u4ee4\u6267\u884c\u540e\u4f1a\u81ea\u52a8\u6784\u5efa\u955c\u50cf\u5e76\u542f\u52a8\u5bb9\u5668\u3002\u4f60\u53ef\u4ee5\u901a\u8fc7 <code>http://localhost:7860</code> \u8bbf\u95ee WebUI\uff0c\u901a\u8fc7 <code>http://localhost:8080</code> \u8bbf\u95ee API Server\u3002</p>"},{"location":"zh/install/#docker_1","title":"\u624b\u52a8 Docker \u6784\u5efa","text":"<p>\u5982\u679c\u4f60\u9700\u8981\u66f4\u7ec6\u7c92\u5ea6\u7684\u6784\u5efa\u63a7\u5236\uff0c\u53ef\u4ee5\u624b\u52a8\u6784\u5efa\uff1a</p> <pre><code># \u6784\u5efa\u652f\u6301 CUDA \u7684 WebUI \u955c\u50cf\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.6.0 \\\n --build-arg UV_EXTRA=cu126 \\\n --target webui \\\n -t fish-speech-webui:cuda .\n\n# \u6784\u5efa\u652f\u6301 CUDA \u7684 API Server \u955c\u50cf\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.6.0 \\\n --build-arg UV_EXTRA=cu126 \\\n --target server \\\n -t fish-speech-server:cuda .\n\n# \u6784\u5efa\u4ec5 CPU \u955c\u50cf\uff08\u652f\u6301\u591a\u5e73\u53f0\uff09\ndocker build \\\n --platform linux/amd64,linux/arm64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cpu \\\n --target webui \\\n -t fish-speech-webui:cpu .\n\n# \u6784\u5efa\u5f00\u53d1\u955c\u50cf\ndocker build \\\n --platform linux/amd64 
\\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --target dev \\\n -t fish-speech-dev:cuda .\n</code></pre>"},{"location":"zh/install/#_4","title":"\u6784\u5efa\u53c2\u6570","text":"<ul> <li><code>BACKEND</code>\uff1a<code>cuda</code> \u6216 <code>cpu</code>\uff08\u9ed8\u8ba4\uff1a<code>cuda</code>\uff09</li> <li><code>CUDA_VER</code>\uff1aCUDA \u7248\u672c\uff08\u9ed8\u8ba4\uff1a<code>12.6.0</code>\uff09</li> <li><code>UV_EXTRA</code>\uff1aUV \u7684 CUDA \u6269\u5c55\uff08\u9ed8\u8ba4\uff1a<code>cu126</code>\uff09</li> <li><code>UBUNTU_VER</code>\uff1aUbuntu \u7248\u672c\uff08\u9ed8\u8ba4\uff1a<code>24.04</code>\uff09</li> <li><code>PY_VER</code>\uff1aPython \u7248\u672c\uff08\u9ed8\u8ba4\uff1a<code>3.12</code>\uff09</li> </ul>"},{"location":"zh/install/#_5","title":"\u5377\u6302\u8f7d","text":"<p>\u4e24\u79cd\u65b9\u6cd5\u90fd\u9700\u8981\u6302\u8f7d\u4ee5\u4e0b\u76ee\u5f55\uff1a</p> <ul> <li><code>./checkpoints:/app/checkpoints</code> - \u6a21\u578b\u6743\u91cd\u76ee\u5f55</li> <li><code>./references:/app/references</code> - \u53c2\u8003\u97f3\u9891\u76ee\u5f55</li> </ul>"},{"location":"zh/install/#_6","title":"\u73af\u5883\u53d8\u91cf","text":"<ul> <li><code>COMPILE=1</code> - \u542f\u7528 <code>torch.compile</code>\uff0c\u53ef\u63d0\u5347\u63a8\u7406\u901f\u5ea6\uff08\u7ea6 10 \u500d\uff09</li> <li><code>GRADIO_SERVER_NAME=0.0.0.0</code> - WebUI \u670d\u52a1\u5730\u5740</li> <li><code>GRADIO_SERVER_PORT=7860</code> - WebUI \u670d\u52a1\u7aef\u53e3</li> <li><code>API_SERVER_NAME=0.0.0.0</code> - API \u670d\u52a1\u5730\u5740</li> <li><code>API_SERVER_PORT=8080</code> - API \u670d\u52a1\u7aef\u53e3</li> </ul> <p>Note</p> <p>Docker \u5bb9\u5668\u9ed8\u8ba4\u4ece <code>/app/checkpoints</code> \u8bfb\u53d6\u6a21\u578b\u6743\u91cd\u3002\u542f\u52a8\u5bb9\u5668\u524d\u8bf7\u5148\u4e0b\u8f7d\u597d\u6240\u9700\u6743\u91cd\u3002</p> <p>Warning</p> <p>GPU \u652f\u6301\u9700\u8981 NVIDIA Docker runtime\u3002\u82e5\u4ec5\u4f7f\u7528 CPU\uff0c\u8bf7\u79fb\u9664 
<code>--gpus all</code> \u5e76\u4f7f\u7528 CPU \u955c\u50cf\u3002</p>"},{"location":"ja/","title":"\u306f\u3058\u3081\u306b","text":"Fish Speech <p>English | \u7b80\u4f53\u4e2d\u6587 | Portuguese | \u65e5\u672c\u8a9e | \ud55c\uad6d\uc5b4 | \u0627\u0644\u0639\u0631\u0628\u064a\u0629 | Espa\u00f1ol</p> <p></p> <p></p> <p>\u30e9\u30a4\u30bb\u30f3\u30b9\u901a\u77e5</p> <p>\u3053\u306e\u30b3\u30fc\u30c9\u30d9\u30fc\u30b9\u304a\u3088\u3073\u95a2\u9023\u3059\u308b\u30e2\u30c7\u30eb\u306e\u91cd\u307f\u306f FISH AUDIO RESEARCH LICENSE \u306e\u4e0b\u3067\u30ea\u30ea\u30fc\u30b9\u3055\u308c\u3066\u3044\u307e\u3059\u3002\u8a73\u7d30\u306f LICENSE \u3092\u53c2\u7167\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p> <p>\u6cd5\u7684\u514d\u8cac\u4e8b\u9805</p> <p>\u79c1\u305f\u3061\u306f\u3001\u30b3\u30fc\u30c9\u30d9\u30fc\u30b9\u306e\u3044\u304b\u306a\u308b\u9055\u6cd5\u306a\u4f7f\u7528\u306b\u5bfe\u3057\u3066\u3082\u8cac\u4efb\u3092\u8ca0\u3044\u307e\u305b\u3093\u3002DMCA \u304a\u3088\u3073\u305d\u306e\u4ed6\u306e\u95a2\u9023\u6cd5\u306b\u95a2\u3059\u308b\u73fe\u5730\u306e\u898f\u5236\u3092\u53c2\u7167\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"ja/#_1","title":"\u30af\u30a4\u30c3\u30af\u30b9\u30bf\u30fc\u30c8","text":""},{"location":"ja/#_2","title":"\u307e\u305a\u306f\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8\u304b\u3089","text":"<p>Fish Audio S2 \u306e\u516c\u5f0f\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8\u3067\u3059\u3002\u4ee5\u4e0b\u304b\u3089\u3059\u3050\u306b\u59cb\u3081\u3089\u308c\u307e\u3059\u3002</p> <ul> <li>\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb</li> <li>\u30b3\u30de\u30f3\u30c9\u30e9\u30a4\u30f3\u63a8\u8ad6</li> <li>WebUI \u63a8\u8ad6</li> <li>\u30b5\u30fc\u30d0\u30fc\u63a8\u8ad6</li> <li>Docker \u30bb\u30c3\u30c8\u30a2\u30c3\u30d7</li> </ul> <p>[!IMPORTANT] SGLang \u30b5\u30fc\u30d0\u30fc\u306b\u3064\u3044\u3066\u306f SGLang-Omni README \u3092\u53c2\u7167\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"ja/#llm-agent","title":"LLM Agent 
\u5411\u3051","text":"<pre><code>https://speech.fish.audio/ja/install/ \u306e\u624b\u9806\u306b\u5f93\u3063\u3066\u3001Fish Audio S2 \u3092\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u30fb\u8a2d\u5b9a\u3057\u3066\u304f\u3060\u3055\u3044\u3002\n</code></pre>"},{"location":"ja/#fish-audio-s2","title":"Fish Audio S2","text":"<p>\u30aa\u30fc\u30d7\u30f3\u30bd\u30fc\u30b9\u304a\u3088\u3073\u30af\u30ed\u30fc\u30ba\u30c9\u30bd\u30fc\u30b9\u306e\u4e2d\u3067\u6700\u3082\u512a\u308c\u305f\u30c6\u30ad\u30b9\u30c8\u8aad\u307f\u4e0a\u3052\u30b7\u30b9\u30c6\u30e0</p> <p>Fish Audio S2 \u306f Fish Audio \u304c\u958b\u767a\u3057\u305f\u6700\u65b0\u30e2\u30c7\u30eb\u3067\u3059\u3002\u7d04 50 \u8a00\u8a9e\u30fb1,000 \u4e07\u6642\u9593\u8d85\u306e\u97f3\u58f0\u30c7\u30fc\u30bf\u3067\u5b66\u7fd2\u3055\u308c\u3001\u5f37\u5316\u5b66\u7fd2\u30a2\u30e9\u30a4\u30f3\u30e1\u30f3\u30c8\u3068 Dual-Autoregressive \u30a2\u30fc\u30ad\u30c6\u30af\u30c1\u30e3\u3092\u7d44\u307f\u5408\u308f\u305b\u308b\u3053\u3068\u3067\u3001\u81ea\u7136\u3067\u30ea\u30a2\u30eb\u304b\u3064\u611f\u60c5\u8868\u73fe\u8c4a\u304b\u306a\u97f3\u58f0\u3092\u751f\u6210\u3057\u307e\u3059\u3002</p> <p>S2 \u306f <code>[laugh]</code>\u3001<code>[whispers]</code>\u3001<code>[super happy]</code> \u3068\u3044\u3063\u305f\u81ea\u7136\u8a00\u8a9e\u30bf\u30b0\u3067\u3001\u97fb\u5f8b\u3084\u611f\u60c5\u3092\u6587\u4e2d\u306e\u4efb\u610f\u4f4d\u7f6e\u3067\u7d30\u304b\u304f\u5236\u5fa1\u3067\u304d\u307e\u3059\u3002\u3055\u3089\u306b\u3001\u30de\u30eb\u30c1\u30b9\u30d4\u30fc\u30ab\u30fc\u751f\u6210\u3068\u30de\u30eb\u30c1\u30bf\u30fc\u30f3\u751f\u6210\u306b\u3082\u30cd\u30a4\u30c6\u30a3\u30d6\u5bfe\u5fdc\u3057\u3066\u3044\u307e\u3059\u3002</p> <p>\u30e9\u30a4\u30d6\u30c7\u30e2\u306f Fish Audio \u30a6\u30a7\u30d6\u30b5\u30a4\u30c8 \u304b\u3089\u3001\u8a73\u7d30\u306f \u30d6\u30ed\u30b0\u8a18\u4e8b \u3068 \u6280\u8853\u30ec\u30dd\u30fc\u30c8 
\u3092\u3054\u89a7\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"ja/#_3","title":"\u30e2\u30c7\u30eb\u30d0\u30ea\u30a2\u30f3\u30c8","text":"\u30e2\u30c7\u30eb \u30b5\u30a4\u30ba \u5229\u7528\u53ef\u80fd\u6027 \u8aac\u660e S2-Pro 4B \u30d1\u30e9\u30e1\u30fc\u30bf HuggingFace \u54c1\u8cea\u3068\u5b89\u5b9a\u6027\u3092\u6700\u5927\u5316\u3057\u305f\u30d5\u30eb\u6a5f\u80fd\u306e\u30d5\u30e9\u30c3\u30b0\u30b7\u30c3\u30d7\u30e2\u30c7\u30eb <p>\u30e2\u30c7\u30eb\u306e\u8a73\u7d30\u306f\u6280\u8853\u30ec\u30dd\u30fc\u30c8\u3092\u3054\u53c2\u7167\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"ja/#_4","title":"\u30d9\u30f3\u30c1\u30de\u30fc\u30af\u7d50\u679c","text":"\u30d9\u30f3\u30c1\u30de\u30fc\u30af Fish Audio S2 Seed-TTS Eval \u2014 WER\uff08\u4e2d\u56fd\u8a9e\uff09 0.54%\uff08\u5168\u4f53\u6700\u826f\uff09 Seed-TTS Eval \u2014 WER\uff08\u82f1\u8a9e\uff09 0.99%\uff08\u5168\u4f53\u6700\u826f\uff09 Audio Turing Test\uff08\u6307\u793a\u3042\u308a\uff09 0.515 \u4e8b\u5f8c\u5e73\u5747\u5024 EmergentTTS-Eval \u2014 \u52dd\u7387 81.88%\uff08\u5168\u4f53\u6700\u9ad8\uff09 Fish Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 \u54c1\u8cea 4.51 / 5.0 \u591a\u8a00\u8a9e\uff08MiniMax Testset\uff09\u2014 \u6700\u826f WER 24 \u8a00\u8a9e\u4e2d 11 \u8a00\u8a9e \u591a\u8a00\u8a9e\uff08MiniMax Testset\uff09\u2014 \u6700\u826f SIM 24 \u8a00\u8a9e\u4e2d 17 \u8a00\u8a9e <p>Seed-TTS Eval \u3067\u306f\u3001S2 \u306f\u30af\u30ed\u30fc\u30ba\u30c9\u30bd\u30fc\u30b9\u3092\u542b\u3080\u5168\u8a55\u4fa1\u30e2\u30c7\u30eb\u306e\u4e2d\u3067\u6700\u5c0f WER \u3092\u9054\u6210\u3057\u307e\u3057\u305f\uff1aQwen3-TTS\uff080.77/1.24\uff09\u3001MiniMax Speech-02\uff080.99/1.90\uff09\u3001Seed-TTS\uff081.12/2.25\uff09\u3002Audio Turing Test \u3067\u306f 0.515 \u3092\u8a18\u9332\u3057\u3001Seed-TTS\uff080.417\uff09\u6bd4\u3067 24%\u3001MiniMax-Speech\uff080.387\uff09\u6bd4\u3067 33% \u4e0a\u56de\u308a\u307e\u3057\u305f\u3002EmergentTTS-Eval 
\u3067\u306f\u3001\u526f\u8a00\u8a9e\u60c5\u5831\uff0891.61%\uff09\u3001\u7591\u554f\u6587\uff0884.41%\uff09\u3001\u7d71\u8a9e\u7684\u8907\u96d1\u6027\uff0883.39%\uff09\u3067\u7279\u306b\u9ad8\u3044\u6210\u7e3e\u3092\u793a\u3057\u3066\u3044\u307e\u3059\u3002</p>"},{"location":"ja/#_5","title":"\u30cf\u30a4\u30e9\u30a4\u30c8","text":""},{"location":"ja/#_6","title":"\u81ea\u7136\u8a00\u8a9e\u306b\u3088\u308b\u7d30\u7c92\u5ea6\u30a4\u30f3\u30e9\u30a4\u30f3\u5236\u5fa1","text":"<p>Fish Audio S2 \u3067\u306f\u3001\u30c6\u30ad\u30b9\u30c8\u5185\u306e\u7279\u5b9a\u306e\u5358\u8a9e\u3084\u30d5\u30ec\u30fc\u30ba\u4f4d\u7f6e\u306b\u81ea\u7136\u8a00\u8a9e\u306e\u6307\u793a\u3092\u76f4\u63a5\u57cb\u3081\u8fbc\u3080\u3053\u3068\u3067\u3001\u97f3\u58f0\u751f\u6210\u3092\u5c40\u6240\u7684\u306b\u5236\u5fa1\u3067\u304d\u307e\u3059\u3002\u56fa\u5b9a\u306e\u4e8b\u524d\u5b9a\u7fa9\u30bf\u30b0\u306b\u4f9d\u5b58\u3059\u308b\u306e\u3067\u306f\u306a\u304f\u3001S2 \u306f [whisper in small voice]\u3001[professional broadcast tone]\u3001[pitch up] \u306e\u3088\u3046\u306a\u81ea\u7531\u5f62\u5f0f\u306e\u30c6\u30ad\u30b9\u30c8\u8a18\u8ff0\u3092\u53d7\u3051\u4ed8\u3051\u3001\u5358\u8a9e\u30ec\u30d9\u30eb\u3067\u8868\u73fe\u3092\u30aa\u30fc\u30d7\u30f3\u30a8\u30f3\u30c9\u306b\u5236\u5fa1\u3067\u304d\u307e\u3059\u3002</p>"},{"location":"ja/#dual-autoregressive","title":"\u4e8c\u91cd\u81ea\u5df1\u56de\u5e30\uff08Dual-Autoregressive\uff09\u30a2\u30fc\u30ad\u30c6\u30af\u30c1\u30e3","text":"<p>S2 \u306f\u30c7\u30b3\u30fc\u30c0\u30fc\u5c02\u7528 Transformer \u3068 RVQ \u30d9\u30fc\u30b9\u306e\u97f3\u58f0\u30b3\u30fc\u30c7\u30c3\u30af\uff0810 codebooks\u3001\u7d04 21 Hz\uff09\u3092\u7d44\u307f\u5408\u308f\u305b\u3066\u3044\u307e\u3059\u3002Dual-AR \u306f\u751f\u6210\u3092 2 \u6bb5\u968e\u306b\u5206\u5272\u3057\u307e\u3059\u3002</p> <ul> <li>Slow AR \u306f\u6642\u9593\u8ef8\u65b9\u5411\u306b\u52d5\u4f5c\u3057\u3001\u4e3b\u3068\u306a\u308b semantic codebook \u3092\u4e88\u6e2c\u3002</li> <li>Fast AR 
\u306f\u5404\u6642\u523b\u3067\u6b8b\u308a 9 \u500b\u306e residual codebook \u3092\u751f\u6210\u3057\u3001\u7d30\u304b\u306a\u97f3\u97ff\u30c7\u30a3\u30c6\u30fc\u30eb\u3092\u5fa9\u5143\u3002</li> </ul> <p>\u3053\u306e\u975e\u5bfe\u79f0\u8a2d\u8a08\uff08\u6642\u9593\u8ef8 4B \u30d1\u30e9\u30e1\u30fc\u30bf\u3001\u6df1\u3055\u8ef8 400M \u30d1\u30e9\u30e1\u30fc\u30bf\uff09\u306b\u3088\u308a\u3001\u97f3\u8cea\u3092\u4fdd\u3061\u306a\u304c\u3089\u63a8\u8ad6\u52b9\u7387\u3092\u9ad8\u3081\u3066\u3044\u307e\u3059\u3002</p>"},{"location":"ja/#_7","title":"\u5f37\u5316\u5b66\u7fd2\u30a2\u30e9\u30a4\u30f3\u30e1\u30f3\u30c8","text":"<p>S2 \u306f\u5f8c\u5b66\u7fd2\u30a2\u30e9\u30a4\u30f3\u30e1\u30f3\u30c8\u306b Group Relative Policy Optimization\uff08GRPO\uff09\u3092\u63a1\u7528\u3057\u3066\u3044\u307e\u3059\u3002\u5b66\u7fd2\u30c7\u30fc\u30bf\u306e\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u3068\u30a2\u30ce\u30c6\u30fc\u30b7\u30e7\u30f3\u306b\u4f7f\u3063\u305f\u540c\u4e00\u30e2\u30c7\u30eb\u7fa4\u3092\u3001\u305d\u306e\u307e\u307e RL \u306e\u5831\u916c\u30e2\u30c7\u30eb\u3068\u3057\u3066\u518d\u5229\u7528\u3059\u308b\u3053\u3068\u3067\u3001\u4e8b\u524d\u5b66\u7fd2\u30c7\u30fc\u30bf\u5206\u5e03\u3068\u4e8b\u5f8c\u5b66\u7fd2\u76ee\u7684\u306e\u30df\u30b9\u30de\u30c3\u30c1\u3092\u6291\u5236\u3057\u3066\u3044\u307e\u3059\u3002\u5831\u916c\u4fe1\u53f7\u306b\u306f\u3001\u610f\u5473\u7684\u6b63\u78ba\u6027\u3001\u6307\u793a\u8ffd\u5f93\u6027\u3001\u97f3\u97ff\u7684\u9078\u597d\u30b9\u30b3\u30a2\u3001\u97f3\u8272\u985e\u4f3c\u5ea6\u304c\u542b\u307e\u308c\u307e\u3059\u3002</p>"},{"location":"ja/#sglang","title":"SGLang \u306b\u3088\u308b\u672c\u756a\u5411\u3051\u30b9\u30c8\u30ea\u30fc\u30df\u30f3\u30b0","text":"<p>Dual-AR \u306f\u69cb\u9020\u7684\u306b\u6a19\u6e96\u7684\u306a\u81ea\u5df1\u56de\u5e30 LLM \u3068\u540c\u578b\u306e\u305f\u3081\u3001S2 \u306f SGLang \u306e LLM 
\u5411\u3051\u6700\u9069\u5316\u3092\u305d\u306e\u307e\u307e\u6d3b\u7528\u3067\u304d\u307e\u3059\u3002\u305f\u3068\u3048\u3070 continuous batching\u3001paged KV cache\u3001CUDA graph replay\u3001RadixAttention \u30d9\u30fc\u30b9\u306e prefix caching \u3067\u3059\u3002</p> <p>\u5358\u4e00\u306e NVIDIA H200 GPU \u3067\u306e\u5b9f\u6e2c:</p> <ul> <li>RTF\uff08Real-Time Factor\uff09: 0.195</li> <li>\u521d\u56de\u97f3\u58f0\u51fa\u529b\u307e\u3067\u306e\u6642\u9593: \u7d04 100 ms</li> <li>\u30b9\u30eb\u30fc\u30d7\u30c3\u30c8: RTF 0.5 \u672a\u6e80\u3092\u7dad\u6301\u3057\u3064\u3064 3,000+ acoustic tokens/s</li> </ul>"},{"location":"ja/#_8","title":"\u591a\u8a00\u8a9e\u30b5\u30dd\u30fc\u30c8","text":"<p>Fish Audio S2 \u306f\u3001\u97f3\u7d20\u3084\u8a00\u8a9e\u56fa\u6709\u306e\u524d\u51e6\u7406\u3092\u5fc5\u8981\u3068\u305b\u305a\u306b\u3001\u9ad8\u54c1\u8cea\u306a\u591a\u8a00\u8a9e\u30c6\u30ad\u30b9\u30c8\u8aad\u307f\u4e0a\u3052\u3092\u30b5\u30dd\u30fc\u30c8\u3057\u307e\u3059\u3002\u4ee5\u4e0b\u3092\u542b\u307f\u307e\u3059\uff1a</p> <p>\u82f1\u8a9e\u3001\u4e2d\u56fd\u8a9e\u3001\u65e5\u672c\u8a9e\u3001\u97d3\u56fd\u8a9e\u3001\u30a2\u30e9\u30d3\u30a2\u8a9e\u3001\u30c9\u30a4\u30c4\u8a9e\u3001\u30d5\u30e9\u30f3\u30b9\u8a9e...</p> <p>\u3055\u3089\u306b\u591a\u304f\uff01</p> <p>\u30ea\u30b9\u30c8\u306f\u5e38\u306b\u62e1\u5927\u3057\u3066\u3044\u307e\u3059\u3002\u6700\u65b0\u306e\u30ea\u30ea\u30fc\u30b9\u306b\u3064\u3044\u3066\u306f Fish Audio \u3092\u78ba\u8a8d\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"ja/#_9","title":"\u30cd\u30a4\u30c6\u30a3\u30d6\u306a\u30de\u30eb\u30c1\u30b9\u30d4\u30fc\u30ab\u30fc\u751f\u6210","text":"<p>Fish Audio S2 \u3067\u306f\u3001\u30e6\u30fc\u30b6\u30fc\u304c\u8907\u6570\u306e\u30b9\u30d4\u30fc\u30ab\u30fc\u3092\u542b\u3080\u53c2\u7167\u30aa\u30fc\u30c7\u30a3\u30aa\u3092\u30a2\u30c3\u30d7\u30ed\u30fc\u30c9\u3067\u304d\u3001\u30e2\u30c7\u30eb\u306f <code><|speaker:i|></code> 
\u30c8\u30fc\u30af\u30f3\u3092\u4ecb\u3057\u3066\u5404\u30b9\u30d4\u30fc\u30ab\u30fc\u306e\u7279\u5fb4\u3092\u51e6\u7406\u3057\u307e\u3059\u3002\u305d\u306e\u5f8c\u3001\u30b9\u30d4\u30fc\u30ab\u30fcID\u30c8\u30fc\u30af\u30f3\u3092\u4f7f\u7528\u3057\u3066\u30e2\u30c7\u30eb\u306e\u30d1\u30d5\u30a9\u30fc\u30de\u30f3\u30b9\u3092\u5236\u5fa1\u3057\u30011\u56de\u306e\u751f\u6210\u3067\u8907\u6570\u306e\u30b9\u30d4\u30fc\u30ab\u30fc\u3092\u542b\u3081\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u3002\u4ee5\u524d\u306e\u3088\u3046\u306b\u5404\u30b9\u30d4\u30fc\u30ab\u30fc\u306b\u5bfe\u3057\u3066\u500b\u5225\u306b\u53c2\u7167\u30aa\u30fc\u30c7\u30a3\u30aa\u3092\u30a2\u30c3\u30d7\u30ed\u30fc\u30c9\u3057\u3066\u97f3\u58f0\u3092\u751f\u6210\u3059\u308b\u5fc5\u8981\u306f\u3082\u3046\u3042\u308a\u307e\u305b\u3093\u3002</p>"},{"location":"ja/#_10","title":"\u30de\u30eb\u30c1\u30bf\u30fc\u30f3\u5bfe\u8a71\u751f\u6210","text":"<p>\u30e2\u30c7\u30eb\u306e\u30b3\u30f3\u30c6\u30ad\u30b9\u30c8\u306e\u62e1\u5f35\u306b\u3088\u308a\u3001\u4ee5\u524d\u306e\u60c5\u5831\u3092\u4f7f\u7528\u3057\u3066\u5f8c\u7d9a\u306e\u751f\u6210\u3055\u308c\u305f\u30b3\u30f3\u30c6\u30f3\u30c4\u306e\u8868\u73fe\u529b\u3092\u5411\u4e0a\u3055\u305b\u3001\u30b3\u30f3\u30c6\u30f3\u30c4\u306e\u81ea\u7136\u3055\u3092\u9ad8\u3081\u308b\u3053\u3068\u304c\u3067\u304d\u308b\u3088\u3046\u306b\u306a\u308a\u307e\u3057\u305f\u3002</p>"},{"location":"ja/#_11","title":"\u9ad8\u901f\u97f3\u58f0\u30af\u30ed\u30fc\u30cb\u30f3\u30b0","text":"<p>Fish Audio S2 
\u306f\u3001\u77ed\u3044\u53c2\u7167\u30b5\u30f3\u30d7\u30eb\uff08\u901a\u5e3810\u301c30\u79d2\uff09\u3092\u4f7f\u7528\u3057\u305f\u6b63\u78ba\u306a\u97f3\u58f0\u30af\u30ed\u30fc\u30cb\u30f3\u30b0\u3092\u30b5\u30dd\u30fc\u30c8\u3057\u3066\u3044\u307e\u3059\u3002\u30e2\u30c7\u30eb\u306f\u97f3\u8272\u3001\u8a71\u3057\u65b9\u3001\u611f\u60c5\u7684\u306a\u50be\u5411\u3092\u6349\u3048\u3001\u8ffd\u52a0\u306e\u5fae\u8abf\u6574\u306a\u3057\u3067\u30ea\u30a2\u30eb\u3067\u4e00\u8cab\u3057\u305f\u30af\u30ed\u30fc\u30f3\u97f3\u58f0\u3092\u751f\u6210\u3057\u307e\u3059\u3002 SGLang \u30b5\u30fc\u30d0\u30fc\u306e\u5229\u7528\u306b\u3064\u3044\u3066\u306f SGLang-Omni README \u3092\u53c2\u7167\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"ja/#_12","title":"\u30af\u30ec\u30b8\u30c3\u30c8","text":"<ul> <li>VITS2 (daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"ja/#_13","title":"\u6280\u8853\u30ec\u30dd\u30fc\u30c8","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2603.08823}, 
\n}\n</code></pre>"},{"location":"ja/finetune/","title":"\u30d5\u30a1\u30a4\u30f3\u30c1\u30e5\u30fc\u30cb\u30f3\u30b0","text":"<p>\u3053\u306e\u30da\u30fc\u30b8\u3092\u958b\u3044\u305f\u3068\u3044\u3046\u3053\u3068\u306f\u3001\u660e\u3089\u304b\u306b\u3001\u4e8b\u524d\u5b66\u7fd2\u6e08\u307f\u30e2\u30c7\u30eb\u306e\u30bc\u30ed\u30b7\u30e7\u30c3\u30c8\u6027\u80fd\u306b\u6e80\u8db3\u3057\u3066\u3044\u306a\u3044\u3068\u3044\u3046\u3053\u3068\u3067\u3057\u3087\u3046\u3002\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u3067\u3088\u308a\u826f\u3044\u6027\u80fd\u3092\u767a\u63ee\u3059\u308b\u3088\u3046\u306b\u30e2\u30c7\u30eb\u3092\u30d5\u30a1\u30a4\u30f3\u30c1\u30e5\u30fc\u30cb\u30f3\u30b0\u3057\u305f\u3044\u3068\u304a\u8003\u3048\u306e\u306f\u305a\u3067\u3059\u3002</p> <p>\u73fe\u5728\u306e\u30d0\u30fc\u30b8\u30e7\u30f3\u3067\u306f\u3001\u300cLLAMA\u300d\u90e8\u5206\u306e\u307f\u3092\u30d5\u30a1\u30a4\u30f3\u30c1\u30e5\u30fc\u30cb\u30f3\u30b0\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002</p>"},{"location":"ja/finetune/#llama","title":"LLAMA \u306e\u30d5\u30a1\u30a4\u30f3\u30c1\u30e5\u30fc\u30cb\u30f3\u30b0","text":""},{"location":"ja/finetune/#1","title":"1. 
\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u306e\u6e96\u5099","text":"<pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u2514\u2500\u2500 30.1-32.71.mp3\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u2514\u2500\u2500 38.79-40.85.mp3\n</code></pre> <p>\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u3092\u4e0a\u8a18\u306e\u5f62\u5f0f\u306b\u5909\u63db\u3057\u3001<code>data</code> \u30c7\u30a3\u30ec\u30af\u30c8\u30ea\u306b\u914d\u7f6e\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002\u97f3\u58f0\u30d5\u30a1\u30a4\u30eb\u306e\u62e1\u5f35\u5b50\u306f <code>.mp3</code>\u3001<code>.wav</code>\u3001\u307e\u305f\u306f <code>.flac</code> \u304c\u4f7f\u7528\u3067\u304d\u3001\u6ce8\u91c8\u30d5\u30a1\u30a4\u30eb\u306e\u62e1\u5f35\u5b50\u306f <code>.lab</code> \u306b\u3059\u308b\u3053\u3068\u3092\u63a8\u5968\u3057\u307e\u3059\u3002</p> <p>Info</p> <p><code>.lab</code> \u6ce8\u91c8\u30d5\u30a1\u30a4\u30eb\u306b\u306f\u3001\u97f3\u58f0\u306e\u66f8\u304d\u8d77\u3053\u3057\u30c6\u30ad\u30b9\u30c8\u306e\u307f\u3092\u542b\u3081\u308b\u5fc5\u8981\u304c\u3042\u308a\u3001\u7279\u5225\u306a\u30d5\u30a9\u30fc\u30de\u30c3\u30c8\u8981\u4ef6\u306f\u3042\u308a\u307e\u305b\u3093\u3002\u305f\u3068\u3048\u3070\u3001<code>hi.mp3</code> \u306e\u5185\u5bb9\u304c\u300c\u3053\u3093\u306b\u3061\u306f\u3001\u3055\u3088\u3046\u306a\u3089\u3002\u300d\u3067\u3042\u308b\u5834\u5408\u3001<code>hi.lab</code> \u30d5\u30a1\u30a4\u30eb\u306b\u306f\u300c\u3053\u3093\u306b\u3061\u306f\u3001\u3055\u3088\u3046\u306a\u3089\u3002\u300d\u3068\u3044\u3046\u4e00\u884c\u306e\u30c6\u30ad\u30b9\u30c8\u306e\u307f\u304c\u542b\u307e\u308c\u307e\u3059\u3002</p> <p>Warning</p> 
<p>\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u306b\u30e9\u30a6\u30c9\u30cd\u30b9\u6b63\u898f\u5316\u3092\u9069\u7528\u3059\u308b\u3053\u3068\u3092\u304a\u52e7\u3081\u3057\u307e\u3059\u3002\u3053\u308c\u306b\u306f fish-audio-preprocess \u3092\u4f7f\u7528\u3067\u304d\u307e\u3059\u3002 <pre><code>fap loudness-norm data-raw data --clean\n</code></pre></p>"},{"location":"ja/finetune/#2","title":"2. \u30bb\u30de\u30f3\u30c6\u30a3\u30c3\u30af\u30c8\u30fc\u30af\u30f3\u306e\u4e00\u62ec\u62bd\u51fa","text":"<p>VQGAN\u306e\u91cd\u307f\u3092\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u3057\u3066\u3044\u308b\u3053\u3068\u3092\u78ba\u8a8d\u3057\u3066\u304f\u3060\u3055\u3044\u3002\u307e\u3060\u306e\u5834\u5408\u306f\u3001\u6b21\u306e\u30b3\u30de\u30f3\u30c9\u3092\u5b9f\u884c\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p> <pre><code>huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n</code></pre> <p>\u305d\u306e\u5f8c\u3001\u6b21\u306e\u30b3\u30de\u30f3\u30c9\u3092\u5b9f\u884c\u3057\u3066\u30bb\u30de\u30f3\u30c6\u30a3\u30c3\u30af\u30c8\u30fc\u30af\u30f3\u3092\u62bd\u51fa\u3067\u304d\u307e\u3059\u3002</p> <pre><code>python tools/vqgan/extract_vq.py data \\\n --num-workers 1 --batch-size 16 \\\n --config-name \"modded_dac_vq\" \\\n --checkpoint-path \"checkpoints/openaudio-s1-mini/codec.pth\"\n</code></pre> <p>Note</p> <p><code>--num-workers</code> \u3068 <code>--batch-size</code> \u3092\u8abf\u6574\u3057\u3066\u62bd\u51fa\u901f\u5ea6\u3092\u5411\u4e0a\u3055\u305b\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u304c\u3001GPU\u30e1\u30e2\u30ea\u306e\u5236\u9650\u3092\u8d85\u3048\u306a\u3044\u3088\u3046\u306b\u6ce8\u610f\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p> <p>\u3053\u306e\u30b3\u30de\u30f3\u30c9\u306f <code>data</code> \u30c7\u30a3\u30ec\u30af\u30c8\u30ea\u306b <code>.npy</code> \u30d5\u30a1\u30a4\u30eb\u3092\u4f5c\u6210\u3057\u307e\u3059\u3002\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u306a\u308a\u307e\u3059\u3002</p> 
<pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 21.15-26.44.npy\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.npy\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u251c\u2500\u2500 30.1-32.71.mp3\n\u2502 \u2514\u2500\u2500 30.1-32.71.npy\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u251c\u2500\u2500 38.79-40.85.mp3\n \u2514\u2500\u2500 38.79-40.85.npy```\n\n### 3. \u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u3092 protobuf \u306b\u30d1\u30c3\u30af\u3059\u308b\n\n```bash\npython tools/llama/build_dataset.py \\\n --input \"data\" \\\n --output \"data/protos\" \\\n --text-extension .lab \\\n --num-workers 16\n</code></pre> <p>\u30b3\u30de\u30f3\u30c9\u306e\u5b9f\u884c\u304c\u5b8c\u4e86\u3059\u308b\u3068\u3001<code>data</code> \u30c7\u30a3\u30ec\u30af\u30c8\u30ea\u306b <code>protos</code> \u30d5\u30a1\u30a4\u30eb\u304c\u8868\u793a\u3055\u308c\u308b\u306f\u305a\u3067\u3059\u3002</p>"},{"location":"ja/finetune/#4-lora","title":"4. 
\u6700\u5f8c\u306b LoRA \u3067\u30d5\u30a1\u30a4\u30f3\u30c1\u30e5\u30fc\u30cb\u30f3\u30b0","text":"<p>\u540c\u69d8\u306b\u3001<code>LLAMA</code> \u306e\u91cd\u307f\u3092\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u3057\u3066\u3044\u308b\u3053\u3068\u3092\u78ba\u8a8d\u3057\u3066\u304f\u3060\u3055\u3044\u3002\u307e\u3060\u306e\u5834\u5408\u306f\u3001\u6b21\u306e\u30b3\u30de\u30f3\u30c9\u3092\u5b9f\u884c\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p> <pre><code>huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n</code></pre> <p>\u6700\u5f8c\u306b\u3001\u6b21\u306e\u30b3\u30de\u30f3\u30c9\u3092\u5b9f\u884c\u3057\u3066\u30d5\u30a1\u30a4\u30f3\u30c1\u30e5\u30fc\u30cb\u30f3\u30b0\u3092\u958b\u59cb\u3067\u304d\u307e\u3059\u3002</p> <pre><code>python fish_speech/train.py --config-name text2semantic_finetune \\\n project=$project \\\n +lora@model.model.lora_config=r_8_alpha_16\n</code></pre> <p>Note</p> <p><code>fish_speech/configs/text2semantic_finetune.yaml</code> \u3092\u5909\u66f4\u3059\u308b\u3053\u3068\u3067\u3001<code>batch_size</code> \u3084 <code>gradient_accumulation_steps</code> \u306a\u3069\u306e\u30c8\u30ec\u30fc\u30cb\u30f3\u30b0\u30d1\u30e9\u30e1\u30fc\u30bf\u3092GPU\u30e1\u30e2\u30ea\u306b\u5408\u308f\u305b\u3066\u5909\u66f4\u3067\u304d\u307e\u3059\u3002</p> <p>Note</p> <p>Windows \u30e6\u30fc\u30b6\u30fc\u306e\u5834\u5408\u3001<code>trainer.strategy.process_group_backend=gloo</code> \u3092\u4f7f\u7528\u3057\u3066 <code>nccl</code> \u306e\u554f\u984c\u3092\u56de\u907f\u3067\u304d\u307e\u3059\u3002</p> <p>\u30c8\u30ec\u30fc\u30cb\u30f3\u30b0\u304c\u5b8c\u4e86\u3057\u305f\u3089\u3001\u63a8\u8ad6 \u306e\u30bb\u30af\u30b7\u30e7\u30f3\u3092\u53c2\u7167\u3057\u3066\u30e2\u30c7\u30eb\u3092\u30c6\u30b9\u30c8\u3067\u304d\u307e\u3059\u3002</p> <p>Info</p> 
<p>\u30c7\u30d5\u30a9\u30eb\u30c8\u8a2d\u5b9a\u3067\u306f\u3001\u30e2\u30c7\u30eb\u306f\u8a71\u8005\u306e\u767a\u97f3\u65b9\u6cd5\u306e\u307f\u3092\u5b66\u7fd2\u3057\u3001\u97f3\u8272\u306f\u5b66\u7fd2\u3057\u307e\u305b\u3093\u3002\u97f3\u8272\u306e\u5b89\u5b9a\u6027\u3092\u78ba\u4fdd\u3059\u308b\u305f\u3081\u306b\u306f\u3001\u4f9d\u7136\u3068\u3057\u3066\u30d7\u30ed\u30f3\u30d7\u30c8\u3092\u4f7f\u7528\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002 \u97f3\u8272\u3092\u5b66\u7fd2\u3055\u305b\u305f\u3044\u5834\u5408\u306f\u3001\u30c8\u30ec\u30fc\u30cb\u30f3\u30b0\u30b9\u30c6\u30c3\u30d7\u6570\u3092\u5897\u3084\u3057\u3066\u304f\u3060\u3055\u3044\u3002\u305f\u3060\u3057\u3001\u3053\u308c\u306b\u3088\u308a\u904e\u5b66\u7fd2\u304c\u767a\u751f\u3059\u308b\u53ef\u80fd\u6027\u304c\u3042\u308a\u307e\u3059\u3002</p> <p>\u30c8\u30ec\u30fc\u30cb\u30f3\u30b0\u5f8c\u3001\u63a8\u8ad6\u3092\u884c\u3046\u524d\u306b LoRA \u306e\u91cd\u307f\u3092\u901a\u5e38\u306e\u91cd\u307f\u306b\u5909\u63db\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002</p> <pre><code>python tools/llama/merge_lora.py \\\n --lora-config r_8_alpha_16 \\\n --base-weight checkpoints/openaudio-s1-mini \\\n --lora-weight results/$project/checkpoints/step_000000010.ckpt \\\n --output checkpoints/openaudio-s1-mini-yth-lora/\n</code></pre> <p>Note</p> <p>\u4ed6\u306e\u30c1\u30a7\u30c3\u30af\u30dd\u30a4\u30f3\u30c8\u3092\u8a66\u3059\u3053\u3068\u3082\u3067\u304d\u307e\u3059\u3002\u8981\u4ef6\u3092\u6e80\u305f\u3059\u6700\u3082\u65e9\u3044\u30c1\u30a7\u30c3\u30af\u30dd\u30a4\u30f3\u30c8\u3092\u4f7f\u7528\u3059\u308b\u3053\u3068\u3092\u304a\u52e7\u3081\u3057\u307e\u3059\u3002\u3053\u308c\u3089\u306f\u901a\u5e38\u3001OOD\uff08\u5206\u5e03\u5916\uff09\u30c7\u30fc\u30bf\u306b\u5bfe\u3057\u3066\u3088\u308a\u826f\u3044\u30d1\u30d5\u30a9\u30fc\u30de\u30f3\u30b9\u3092\u767a\u63ee\u3057\u307e\u3059\u3002</p>"},{"location":"ja/inference/","title":"\u63a8\u8ad6","text":"<p>Fish Audio S2 
\u30e2\u30c7\u30eb\u306f\u5927\u304d\u306a\u30d3\u30c7\u30aa\u30e1\u30e2\u30ea\u3092\u5fc5\u8981\u3068\u3057\u307e\u3059\u3002\u63a8\u8ad6\u306b\u306f\u5c11\u306a\u304f\u3068\u3082 24GB \u306e GPU \u3092\u4f7f\u7528\u3059\u308b\u3053\u3068\u3092\u304a\u52e7\u3081\u3057\u307e\u3059\u3002</p>"},{"location":"ja/inference/#_2","title":"\u91cd\u307f\u306e\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9","text":"<p>\u307e\u305a\u3001\u30e2\u30c7\u30eb\u306e\u91cd\u307f\u3092\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\uff1a</p> <pre><code>hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n</code></pre>"},{"location":"ja/inference/#_3","title":"\u30b3\u30de\u30f3\u30c9\u30e9\u30a4\u30f3\u63a8\u8ad6","text":"<p>Note</p> <p>\u30e2\u30c7\u30eb\u306b\u97f3\u58f0\u3092\u30e9\u30f3\u30c0\u30e0\u306b\u9078\u629e\u3055\u305b\u308b\u5834\u5408\u306f\u3001\u3053\u306e\u30b9\u30c6\u30c3\u30d7\u3092\u30b9\u30ad\u30c3\u30d7\u3067\u304d\u307e\u3059\u3002</p>"},{"location":"ja/inference/#1-vq","title":"1. \u30ea\u30d5\u30a1\u30ec\u30f3\u30b9\u30aa\u30fc\u30c7\u30a3\u30aa\u304b\u3089 VQ \u30c8\u30fc\u30af\u30f3\u3092\u53d6\u5f97\u3059\u308b","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"test.wav\" \\\n --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n</code></pre> <p><code>fake.npy</code> \u3068 <code>fake.wav</code> \u304c\u751f\u6210\u3055\u308c\u308b\u306f\u305a\u3067\u3059\u3002</p>"},{"location":"ja/inference/#2-semantic","title":"2. 
\u30c6\u30ad\u30b9\u30c8\u304b\u3089 Semantic \u30c8\u30fc\u30af\u30f3\u3092\u751f\u6210\u3059\u308b\uff1a","text":"<pre><code>python fish_speech/models/text2semantic/inference.py \\\n --text \"\u5909\u63db\u3057\u305f\u3044\u30c6\u30ad\u30b9\u30c8\" \\\n --prompt-text \"\u30ea\u30d5\u30a1\u30ec\u30f3\u30b9\u30c6\u30ad\u30b9\u30c8\" \\\n --prompt-tokens \"fake.npy\" \\\n # --compile\n</code></pre> <p>\u3053\u306e\u30b3\u30de\u30f3\u30c9\u306f\u3001\u4f5c\u696d\u30c7\u30a3\u30ec\u30af\u30c8\u30ea\u306b <code>codes_N</code> \u30d5\u30a1\u30a4\u30eb\u3092\u4f5c\u6210\u3057\u307e\u3059\u3002\u3053\u3053\u3067 N \u306f 0 \u304b\u3089\u59cb\u307e\u308b\u6574\u6570\u3067\u3059\u3002</p> <p>Note</p> <p>\u3088\u308a\u9ad8\u901f\u306a\u63a8\u8ad6\u306e\u305f\u3081\u306b CUDA \u30ab\u30fc\u30cd\u30eb\u3092\u878d\u5408\u3059\u308b <code>--compile</code> \u3092\u4f7f\u7528\u3057\u305f\u3044\u5834\u5408\u304c\u3042\u308a\u307e\u3059\u304c\u3001\u79c1\u305f\u3061\u306e sglang \u63a8\u8ad6\u52a0\u901f\u6700\u9069\u5316\u3092\u4f7f\u7528\u3059\u308b\u3053\u3068\u3092\u304a\u52e7\u3081\u3057\u307e\u3059\u3002 \u540c\u69d8\u306b\u3001\u52a0\u901f\u3092\u4f7f\u7528\u3059\u308b\u4e88\u5b9a\u304c\u306a\u3044\u5834\u5408\u306f\u3001<code>--compile</code> \u30d1\u30e9\u30e1\u30fc\u30bf\u3092\u30b3\u30e1\u30f3\u30c8\u30a2\u30a6\u30c8\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p> <p>Info</p> <p>bf16 \u3092\u30b5\u30dd\u30fc\u30c8\u3057\u3066\u3044\u306a\u3044 GPU \u306e\u5834\u5408\u3001<code>--half</code> \u30d1\u30e9\u30e1\u30fc\u30bf\u3092\u4f7f\u7528\u3059\u308b\u5fc5\u8981\u304c\u3042\u308b\u304b\u3082\u3057\u308c\u307e\u305b\u3093\u3002</p>"},{"location":"ja/inference/#3","title":"3. 
\u30bb\u30de\u30f3\u30c6\u30a3\u30c3\u30af\u30c8\u30fc\u30af\u30f3\u304b\u3089\u97f3\u58f0\u3092\u751f\u6210\u3059\u308b\uff1a","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"codes_0.npy\"\n</code></pre> <p>\u305d\u306e\u5f8c\u3001<code>fake.wav</code> \u30d5\u30a1\u30a4\u30eb\u304c\u53d6\u5f97\u3067\u304d\u307e\u3059\u3002</p>"},{"location":"ja/inference/#webui","title":"WebUI \u63a8\u8ad6","text":""},{"location":"ja/inference/#1-gradio-webui","title":"1. Gradio WebUI","text":"<p>\u4e92\u63db\u6027\u3092\u7dad\u6301\u3059\u308b\u305f\u3081\u3001\u4ee5\u524d\u306e Gradio WebUI \u3082\u5f15\u304d\u7d9a\u304d\u5229\u7528\u53ef\u80fd\u3067\u3059\u3002</p> <pre><code>python tools/run_webui.py # \u52a0\u901f\u304c\u5fc5\u8981\u306a\u5834\u5408\u306f --compile\n</code></pre>"},{"location":"ja/inference/#2-awesome-webui","title":"2. Awesome WebUI","text":"<p>Awesome WebUI \u306f TypeScript \u3067\u958b\u767a\u3055\u308c\u305f\u3001\u3088\u308a\u8c4a\u5bcc\u306a\u6a5f\u80fd\u3068\u512a\u308c\u305f\u30e6\u30fc\u30b6\u30fc\u4f53\u9a13\u3092\u63d0\u4f9b\u3059\u308b\u6700\u65b0\u306e Web \u30a4\u30f3\u30bf\u30fc\u30d5\u30a7\u30fc\u30b9\u3067\u3059\u3002</p> <p>WebUI \u306e\u30d3\u30eb\u30c9\uff1a</p> <p>\u30ed\u30fc\u30ab\u30eb\u307e\u305f\u306f\u30b5\u30fc\u30d0\u30fc\u306b Node.js \u3068 npm \u304c\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3055\u308c\u3066\u3044\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002</p> <ol> <li><code>awesome_webui</code> \u30c7\u30a3\u30ec\u30af\u30c8\u30ea\u306b\u79fb\u52d5\u3057\u307e\u3059\uff1a <pre><code>cd awesome_webui\n</code></pre></li> <li>\u4f9d\u5b58\u95a2\u4fc2\u3092\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3057\u307e\u3059\uff1a <pre><code>npm install\n</code></pre></li> <li>WebUI \u3092\u30d3\u30eb\u30c9\u3057\u307e\u3059\uff1a <pre><code>npm run build\n</code></pre></li> </ol> <p>\u30d0\u30c3\u30af\u30a8\u30f3\u30c9\u30b5\u30fc\u30d0\u30fc\u306e\u8d77\u52d5\uff1a</p> <p>WebUI
\u306e\u30d3\u30eb\u30c9\u304c\u5b8c\u4e86\u3057\u305f\u3089\u3001\u30d7\u30ed\u30b8\u30a7\u30af\u30c8\u306e\u30eb\u30fc\u30c8\u306b\u623b\u308a\u3001API \u30b5\u30fc\u30d0\u30fc\u3092\u8d77\u52d5\u3057\u307e\u3059\uff1a</p> <pre><code>python tools/api_server.py --listen 0.0.0.0:8888 --compile\n</code></pre> <p>\u30a2\u30af\u30bb\u30b9\uff1a</p> <p>\u30b5\u30fc\u30d0\u30fc\u304c\u8d77\u52d5\u3057\u305f\u3089\u3001\u30d6\u30e9\u30a6\u30b6\u304b\u3089\u4ee5\u4e0b\u306e\u30a2\u30c9\u30ec\u30b9\u306b\u30a2\u30af\u30bb\u30b9\u3057\u3066\u4f53\u9a13\u3067\u304d\u307e\u3059\uff1a <code>http://localhost:8888/ui</code></p>"},{"location":"ja/install/","title":"\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb","text":""},{"location":"ja/install/#_1","title":"\u5fc5\u8981\u6761\u4ef6","text":"<ul> <li>GPU\u30e1\u30e2\u30ea: 24GB (\u63a8\u8ad6\u6642)</li> <li>\u30b7\u30b9\u30c6\u30e0: Linux, WSL</li> </ul>"},{"location":"ja/install/#_2","title":"\u30b7\u30b9\u30c6\u30e0\u30bb\u30c3\u30c8\u30a2\u30c3\u30d7","text":"<p>Fish Audio S2\u306f\u8907\u6570\u306e\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u65b9\u6cd5\u3092\u30b5\u30dd\u30fc\u30c8\u3057\u3066\u3044\u307e\u3059\u3002\u3054\u81ea\u8eab\u306e\u958b\u767a\u74b0\u5883\u306b\u6700\u3082\u9069\u3057\u305f\u65b9\u6cd5\u3092\u304a\u9078\u3073\u304f\u3060\u3055\u3044\u3002</p> <p>\u524d\u63d0\u6761\u4ef6: \u97f3\u58f0\u51e6\u7406\u306e\u305f\u3081\u306e\u30b7\u30b9\u30c6\u30e0\u4f9d\u5b58\u95a2\u4fc2\u3092\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3057\u307e\u3059: <pre><code>apt install portaudio19-dev libsox-dev ffmpeg\n</code></pre></p>"},{"location":"ja/install/#conda","title":"Conda","text":"<pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# GPU\u7248\u306e\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb (CUDA\u30d0\u30fc\u30b8\u30e7\u30f3\u3092\u9078\u629e: cu126, cu128, cu129)\npip install -e .[cu129]\n\n# CPU\u7248\u306e\u307f\u306e\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\npip install -e .[cpu]\n\n# 
\u30c7\u30d5\u30a9\u30eb\u30c8\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb (PyTorch\u306e\u30c7\u30d5\u30a9\u30eb\u30c8\u30a4\u30f3\u30c7\u30c3\u30af\u30b9\u3092\u4f7f\u7528)\npip install -e .\n\n# pyaudio\u306e\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3067\u30a8\u30e9\u30fc\u304c\u767a\u751f\u3059\u308b\u5834\u5408\u306f\u3001\u4ee5\u4e0b\u306e\u30b3\u30de\u30f3\u30c9\u3092\u8a66\u3057\u3066\u304f\u3060\u3055\u3044\uff1a\n# conda install pyaudio\n# \u305d\u306e\u5f8c\u3001\u518d\u5ea6 pip install -e . \u3092\u5b9f\u884c\u3057\u3066\u304f\u3060\u3055\u3044\n</code></pre>"},{"location":"ja/install/#uv","title":"UV","text":"<p>UV\u306f\u3088\u308a\u9ad8\u901f\u306a\u4f9d\u5b58\u95a2\u4fc2\u306e\u89e3\u6c7a\u3068\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3092\u5b9f\u73fe\u3057\u307e\u3059:</p> <pre><code># GPU\u7248\u306e\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb (CUDA\u30d0\u30fc\u30b8\u30e7\u30f3\u3092\u9078\u629e: cu126, cu128, cu129)\nuv sync --python 3.12 --extra cu129\n\n# CPU\u7248\u306e\u307f\u306e\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\nuv sync --python 3.12 --extra cpu\n</code></pre>"},{"location":"ja/install/#intel-arc-xpu","title":"Intel Arc XPU \u30b5\u30dd\u30fc\u30c8","text":"<p>Intel Arc GPU\u30e6\u30fc\u30b6\u30fc\u306f\u3001\u4ee5\u4e0b\u306e\u624b\u9806\u3067XPU\u30b5\u30dd\u30fc\u30c8\u3092\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3057\u3066\u304f\u3060\u3055\u3044:</p> <pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# \u5fc5\u8981\u306aC++\u6a19\u6e96\u30e9\u30a4\u30d6\u30e9\u30ea\u3092\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\nconda install libstdcxx -c conda-forge\n\n# Intel XPU\u5bfe\u5fdc\u306ePyTorch\u3092\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# Fish Speech\u306e\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\npip install -e .\n</code></pre> <p>Warning</p> 
<p><code>compile</code>\u30aa\u30d7\u30b7\u30e7\u30f3\u306fWindows\u3068macOS\u3067\u306f\u30b5\u30dd\u30fc\u30c8\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u30b3\u30f3\u30d1\u30a4\u30eb\u3092\u6709\u52b9\u306b\u3057\u3066\u5b9f\u884c\u3057\u305f\u3044\u5834\u5408\u306f\u3001\u3054\u81ea\u8eab\u3067Triton\u3092\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002</p>"},{"location":"ja/install/#docker","title":"Docker\u30bb\u30c3\u30c8\u30a2\u30c3\u30d7","text":"<p>Fish Audio S2\u30b7\u30ea\u30fc\u30ba\u30e2\u30c7\u30eb\u306f\u3001\u3055\u307e\u3056\u307e\u306a\u30cb\u30fc\u30ba\u306b\u5fdc\u3048\u308b\u305f\u3081\u8907\u6570\u306eDocker\u30c7\u30d7\u30ed\u30a4\u30e1\u30f3\u30c8\u30aa\u30d7\u30b7\u30e7\u30f3\u3092\u63d0\u4f9b\u3057\u3066\u3044\u307e\u3059\u3002Docker Hub\u306e\u30d3\u30eb\u30c9\u6e08\u307f\u30a4\u30e1\u30fc\u30b8\u3092\u4f7f\u7528\u3059\u308b\u304b\u3001Docker Compose\u3067\u30ed\u30fc\u30ab\u30eb\u30d3\u30eb\u30c9\u3059\u308b\u304b\u3001\u624b\u52d5\u3067\u30ab\u30b9\u30bf\u30e0\u30a4\u30e1\u30fc\u30b8\u3092\u30d3\u30eb\u30c9\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u3002</p> <p>WebUI\u3068API\u30b5\u30fc\u30d0\u30fc\u306e\u4e21\u65b9\u306b\u3064\u3044\u3066\u3001GPU\uff08\u30c7\u30d5\u30a9\u30eb\u30c8\u306fCUDA 12.6\uff09\u7248\u3068CPU\u7248\u306eDocker\u30a4\u30e1\u30fc\u30b8\u3092\u63d0\u4f9b\u3057\u3066\u3044\u307e\u3059\u3002Docker Hub\u306e\u30d3\u30eb\u30c9\u6e08\u307f\u30a4\u30e1\u30fc\u30b8\u3092\u4f7f\u7528\u3059\u308b\u304b\u3001Docker 
Compose\u3067\u30ed\u30fc\u30ab\u30eb\u30d3\u30eb\u30c9\u3059\u308b\u304b\u3001\u624b\u52d5\u3067\u30ab\u30b9\u30bf\u30e0\u30a4\u30e1\u30fc\u30b8\u3092\u30d3\u30eb\u30c9\u3059\u308b\u304b\u3092\u9078\u629e\u3067\u304d\u307e\u3059\u3002\u30ed\u30fc\u30ab\u30eb\u3067\u30d3\u30eb\u30c9\u3059\u308b\u5834\u5408\u306f\u3001\u4ee5\u4e0b\u306e\u624b\u9806\u306b\u5f93\u3063\u3066\u304f\u3060\u3055\u3044\u3002\u30d3\u30eb\u30c9\u6e08\u307f\u30a4\u30e1\u30fc\u30b8\u3092\u4f7f\u7528\u3059\u308b\u3060\u3051\u306e\u5834\u5408\u306f\u3001\u63a8\u8ad6\u30ac\u30a4\u30c9\u3092\u76f4\u63a5\u53c2\u7167\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"ja/install/#_3","title":"\u524d\u63d0\u6761\u4ef6","text":"<ul> <li>Docker\u3068Docker Compose\u304c\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u6e08\u307f\u3067\u3042\u308b\u3053\u3068</li> <li>NVIDIA Docker\u30e9\u30f3\u30bf\u30a4\u30e0\u304c\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u6e08\u307f\u3067\u3042\u308b\u3053\u3068\uff08GPU\u30b5\u30dd\u30fc\u30c8\u7528\uff09</li> <li>CUDA\u306b\u3088\u308b\u63a8\u8ad6\u306e\u305f\u3081\u306b\u3001\u5c11\u306a\u304f\u3068\u308224GB\u306eGPU\u30e1\u30e2\u30ea\u304c\u3042\u308b\u3053\u3068</li> </ul>"},{"location":"ja/install/#docker-compose","title":"Docker Compose\u306e\u4f7f\u7528","text":"<p>\u958b\u767a\u3084\u30ab\u30b9\u30bf\u30de\u30a4\u30ba\u306e\u305f\u3081\u306b\u3001Docker Compose\u3092\u4f7f\u7528\u3057\u3066\u30ed\u30fc\u30ab\u30eb\u3067\u30d3\u30eb\u30c9\u30fb\u5b9f\u884c\u3067\u304d\u307e\u3059:</p> <pre><code># \u307e\u305a\u3001\u30ea\u30dd\u30b8\u30c8\u30ea\u3092\u30af\u30ed\u30fc\u30f3\u3057\u307e\u3059\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# CUDA\u3067WebUI\u3092\u8d77\u52d5\ndocker compose --profile webui up\n\n# \u30b3\u30f3\u30d1\u30a4\u30eb\u6700\u9069\u5316\u3092\u6709\u52b9\u306b\u3057\u3066WebUI\u3092\u8d77\u52d5\nCOMPILE=1 docker compose --profile webui up\n\n# API\u30b5\u30fc\u30d0\u30fc\u3092\u8d77\u52d5\ndocker compose --profile 
server up\n\n# \u30b3\u30f3\u30d1\u30a4\u30eb\u6700\u9069\u5316\u3092\u6709\u52b9\u306b\u3057\u3066API\u30b5\u30fc\u30d0\u30fc\u3092\u8d77\u52d5\nCOMPILE=1 docker compose --profile server up\n\n# CPU\u306e\u307f\u3067\u306e\u30c7\u30d7\u30ed\u30a4\nBACKEND=cpu docker compose --profile webui up\n</code></pre>"},{"location":"ja/install/#docker-compose_1","title":"Docker Compose \u74b0\u5883\u5909\u6570","text":"<p>\u74b0\u5883\u5909\u6570\u3092\u4f7f\u7528\u3057\u3066\u30c7\u30d7\u30ed\u30a4\u30e1\u30f3\u30c8\u3092\u30ab\u30b9\u30bf\u30de\u30a4\u30ba\u3067\u304d\u307e\u3059:</p> <pre><code># .env \u30d5\u30a1\u30a4\u30eb\u306e\u4f8b\nBACKEND=cuda # \u307e\u305f\u306f cpu\nCOMPILE=1 # \u30b3\u30f3\u30d1\u30a4\u30eb\u6700\u9069\u5316\u3092\u6709\u52b9\u5316\nGRADIO_PORT=7860 # WebUI\u306e\u30dd\u30fc\u30c8\nAPI_PORT=8080 # API\u30b5\u30fc\u30d0\u30fc\u306e\u30dd\u30fc\u30c8\nUV_VERSION=0.8.15 # UV\u30d1\u30c3\u30b1\u30fc\u30b8\u30de\u30cd\u30fc\u30b8\u30e3\u30fc\u306e\u30d0\u30fc\u30b8\u30e7\u30f3\n</code></pre> <p>\u3053\u306e\u30b3\u30de\u30f3\u30c9\u306f\u30a4\u30e1\u30fc\u30b8\u3092\u30d3\u30eb\u30c9\u3057\u3001\u30b3\u30f3\u30c6\u30ca\u3092\u5b9f\u884c\u3057\u307e\u3059\u3002WebUI\u306b\u306f<code>http://localhost:7860</code>\u3067\u3001API\u30b5\u30fc\u30d0\u30fc\u306b\u306f<code>http://localhost:8080</code>\u3067\u30a2\u30af\u30bb\u30b9\u3067\u304d\u307e\u3059\u3002</p>"},{"location":"ja/install/#docker_1","title":"\u624b\u52d5\u3067\u306eDocker\u30d3\u30eb\u30c9","text":"<p>\u30d3\u30eb\u30c9\u30d7\u30ed\u30bb\u30b9\u3092\u30ab\u30b9\u30bf\u30de\u30a4\u30ba\u3057\u305f\u3044\u4e0a\u7d1a\u8005\u5411\u3051:</p> <pre><code># CUDA\u30b5\u30dd\u30fc\u30c8\u4ed8\u304d\u306eWebUI\u30a4\u30e1\u30fc\u30b8\u3092\u30d3\u30eb\u30c9\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.6.0 \\\n --build-arg UV_EXTRA=cu126 \\\n --target webui \\\n -t fish-speech-webui:cuda .\n\n# 
CUDA\u30b5\u30dd\u30fc\u30c8\u4ed8\u304d\u306eAPI\u30b5\u30fc\u30d0\u30fc\u30a4\u30e1\u30fc\u30b8\u3092\u30d3\u30eb\u30c9\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.6.0 \\\n --build-arg UV_EXTRA=cu126 \\\n --target server \\\n -t fish-speech-server:cuda .\n\n# CPU\u306e\u307f\u306e\u30a4\u30e1\u30fc\u30b8\u3092\u30d3\u30eb\u30c9\uff08\u30de\u30eb\u30c1\u30d7\u30e9\u30c3\u30c8\u30d5\u30a9\u30fc\u30e0\u5bfe\u5fdc\uff09\ndocker build \\\n --platform linux/amd64,linux/arm64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cpu \\\n --target webui \\\n -t fish-speech-webui:cpu .\n\n# \u958b\u767a\u7528\u30a4\u30e1\u30fc\u30b8\u3092\u30d3\u30eb\u30c9\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --target dev \\\n -t fish-speech-dev:cuda .\n</code></pre>"},{"location":"ja/install/#_4","title":"\u30d3\u30eb\u30c9\u5f15\u6570","text":"<ul> <li><code>BACKEND</code>: <code>cuda</code> \u307e\u305f\u306f <code>cpu</code> (\u30c7\u30d5\u30a9\u30eb\u30c8: <code>cuda</code>)</li> <li><code>CUDA_VER</code>: CUDA\u30d0\u30fc\u30b8\u30e7\u30f3 (\u30c7\u30d5\u30a9\u30eb\u30c8: <code>12.6.0</code>)</li> <li><code>UV_EXTRA</code>: CUDA\u7528\u306eUV\u8ffd\u52a0\u30d1\u30c3\u30b1\u30fc\u30b8 (\u30c7\u30d5\u30a9\u30eb\u30c8: <code>cu126</code>)</li> <li><code>UBUNTU_VER</code>: Ubuntu\u30d0\u30fc\u30b8\u30e7\u30f3 (\u30c7\u30d5\u30a9\u30eb\u30c8: <code>24.04</code>)</li> <li><code>PY_VER</code>: Python\u30d0\u30fc\u30b8\u30e7\u30f3 (\u30c7\u30d5\u30a9\u30eb\u30c8: <code>3.12</code>)</li> </ul>"},{"location":"ja/install/#_5","title":"\u30dc\u30ea\u30e5\u30fc\u30e0\u30de\u30a6\u30f3\u30c8","text":"<p>\u3069\u3061\u3089\u306e\u65b9\u6cd5\u3067\u3082\u3001\u4ee5\u4e0b\u306e\u30c7\u30a3\u30ec\u30af\u30c8\u30ea\u3092\u30de\u30a6\u30f3\u30c8\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059:</p> <ul> <li><code>./checkpoints:/app/checkpoints</code> - 
\u30e2\u30c7\u30eb\u306e\u91cd\u307f\u30d5\u30a1\u30a4\u30eb\u7528\u30c7\u30a3\u30ec\u30af\u30c8\u30ea</li> <li><code>./references:/app/references</code> - \u53c2\u7167\u97f3\u58f0\u30d5\u30a1\u30a4\u30eb\u7528\u30c7\u30a3\u30ec\u30af\u30c8\u30ea</li> </ul>"},{"location":"ja/install/#_6","title":"\u74b0\u5883\u5909\u6570","text":"<ul> <li><code>COMPILE=1</code> - <code>torch.compile</code>\u3092\u6709\u52b9\u306b\u3057\u3066\u63a8\u8ad6\u3092\u9ad8\u901f\u5316\uff08\u7d0410\u500d\uff09</li> <li><code>GRADIO_SERVER_NAME=0.0.0.0</code> - WebUI\u30b5\u30fc\u30d0\u30fc\u306e\u30db\u30b9\u30c8</li> <li><code>GRADIO_SERVER_PORT=7860</code> - WebUI\u30b5\u30fc\u30d0\u30fc\u306e\u30dd\u30fc\u30c8</li> <li><code>API_SERVER_NAME=0.0.0.0</code> - API\u30b5\u30fc\u30d0\u30fc\u306e\u30db\u30b9\u30c8</li> <li><code>API_SERVER_PORT=8080</code> - API\u30b5\u30fc\u30d0\u30fc\u306e\u30dd\u30fc\u30c8</li> </ul> <p>Note</p> <p>Docker\u30b3\u30f3\u30c6\u30ca\u306f\u3001\u30e2\u30c7\u30eb\u306e\u91cd\u307f\u304c<code>/app/checkpoints</code>\u306b\u30de\u30a6\u30f3\u30c8\u3055\u308c\u308b\u3053\u3068\u3092\u60f3\u5b9a\u3057\u3066\u3044\u307e\u3059\u3002\u30b3\u30f3\u30c6\u30ca\u3092\u8d77\u52d5\u3059\u308b\u524d\u306b\u3001\u5fc5\u8981\u306a\u30e2\u30c7\u30eb\u306e\u91cd\u307f\u3092\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p> <p>Warning</p> <p>GPU\u30b5\u30dd\u30fc\u30c8\u306b\u306fNVIDIA Docker\u30e9\u30f3\u30bf\u30a4\u30e0\u304c\u5fc5\u8981\u3067\u3059\u3002CPU\u306e\u307f\u3067\u30c7\u30d7\u30ed\u30a4\u3059\u308b\u5834\u5408\u306f\u3001<code>--gpus all</code>\u30d5\u30e9\u30b0\u3092\u524a\u9664\u3057\u3001CPU\u7528\u306e\u30a4\u30e1\u30fc\u30b8\u3092\u4f7f\u7528\u3057\u3066\u304f\u3060\u3055\u3044\u3002</p>"},{"location":"pt/","title":"Introdu\u00e7\u00e3o","text":"Fish Speech <p>English | \u7b80\u4f53\u4e2d\u6587 | Portuguese | \u65e5\u672c\u8a9e | \ud55c\uad6d\uc5b4 | \u0627\u0644\u0639\u0631\u0628\u064a\u0629 | Espa\u00f1ol</p> <p></p> <p></p> 
<p>Aviso de Licen\u00e7a</p> <p>Este reposit\u00f3rio e todos os pesos de modelo associados s\u00e3o lan\u00e7ados sob a FISH AUDIO RESEARCH LICENSE. Consulte LICENSE para mais detalhes.</p> <p>Isen\u00e7\u00e3o de Responsabilidade Legal</p> <p>N\u00e3o nos responsabilizamos por qualquer uso ilegal da base de c\u00f3digos. Consulte as regulamenta\u00e7\u00f5es locais sobre DMCA e outras leis relacionadas.</p>"},{"location":"pt/#inicio-rapido","title":"In\u00edcio R\u00e1pido","text":""},{"location":"pt/#comece-pela-documentacao","title":"Comece pela documenta\u00e7\u00e3o","text":"<p>Esta \u00e9 a documenta\u00e7\u00e3o oficial do Fish Audio S2. Voc\u00ea pode come\u00e7ar por aqui:</p> <ul> <li>Instala\u00e7\u00e3o</li> <li>Infer\u00eancia por Linha de Comando</li> <li>Infer\u00eancia WebUI</li> <li>Infer\u00eancia via Servidor</li> <li>Configura\u00e7\u00e3o Docker</li> </ul> <p>[!IMPORTANT] Para servidor com SGLang, consulte o SGLang-Omni README.</p>"},{"location":"pt/#guia-para-agentes-llm","title":"Guia para agentes LLM","text":"<pre><code>Instale e configure o Fish Audio S2 seguindo as instru\u00e7\u00f5es em https://speech.fish.audio/pt/install/ .\n</code></pre>"},{"location":"pt/#fish-audio-s2","title":"Fish Audio S2","text":"<p>O melhor sistema de convers\u00e3o de texto em fala entre c\u00f3digo aberto e c\u00f3digo fechado</p> <p>O Fish Audio S2 \u00e9 o modelo mais recente da Fish Audio. 
Treinado com mais de 10 milh\u00f5es de horas de \u00e1udio em cerca de 50 idiomas, o S2 combina alinhamento por refor\u00e7o com uma arquitetura Dual-Autoregressive para gerar fala natural, realista e emocionalmente expressiva.</p> <p>O S2 permite controle fino de pros\u00f3dia e emo\u00e7\u00e3o dentro da pr\u00f3pria frase com tags em linguagem natural, como <code>[laugh]</code>, <code>[whispers]</code> e <code>[super happy]</code>, al\u00e9m de oferecer suporte nativo a m\u00faltiplos falantes e m\u00faltiplos turnos.</p> <p>Visite o site da Fish Audio para demonstra\u00e7\u00f5es ao vivo. Leia a postagem no blog e o relat\u00f3rio t\u00e9cnico para mais detalhes.</p>"},{"location":"pt/#variantes-do-modelo","title":"Variantes do Modelo","text":"Modelo Tamanho Disponibilidade Descri\u00e7\u00e3o S2-Pro 4B par\u00e2metros HuggingFace Modelo carro-chefe completo com m\u00e1xima qualidade e estabilidade <p>Mais detalhes podem ser encontrados no relat\u00f3rio t\u00e9cnico.</p>"},{"location":"pt/#resultados-de-benchmark","title":"Resultados de Benchmark","text":"Benchmark Fish Audio S2 Seed-TTS Eval \u2014 WER (Chin\u00eas) 0.54% (melhor geral) Seed-TTS Eval \u2014 WER (Ingl\u00eas) 0.99% (melhor geral) Audio Turing Test (com instru\u00e7\u00e3o) 0.515 m\u00e9dia a posteriori EmergentTTS-Eval \u2014 Taxa de vit\u00f3ria 81.88% (maior geral) Fish Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 Qualidade 4.51 / 5.0 Multil\u00edngue (MiniMax Testset) \u2014 Melhor WER 11 de 24 idiomas Multil\u00edngue (MiniMax Testset) \u2014 Melhor SIM 17 de 24 idiomas <p>No Seed-TTS Eval, o S2 obteve o menor WER entre todos os modelos avaliados, incluindo sistemas fechados: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90) e Seed-TTS (1.12/2.25). No Audio Turing Test, o valor 0.515 supera o Seed-TTS (0.417) em 24% e o MiniMax-Speech (0.387) em 33%. 
No EmergentTTS-Eval, o S2 se destacou especialmente em paralingu\u00edstica (91.61%), perguntas (84.41%) e complexidade sint\u00e1tica (83.39%).</p>"},{"location":"pt/#destaques","title":"Destaques","text":""},{"location":"pt/#controle-inline-refinado-via-linguagem-natural","title":"Controle Inline Refinado via Linguagem Natural","text":"<p>O Fish Audio S2 permite controle localizado da gera\u00e7\u00e3o de fala ao incorporar instru\u00e7\u00f5es em linguagem natural diretamente em posi\u00e7\u00f5es espec\u00edficas de palavras ou frases no texto. Em vez de depender de um conjunto fixo de tags predefinidas, o S2 aceita descri\u00e7\u00f5es textuais livres, como [whisper in small voice], [professional broadcast tone] ou [pitch up], permitindo controle de express\u00e3o aberto no n\u00edvel da palavra.</p>"},{"location":"pt/#arquitetura-dual-autoregressive","title":"Arquitetura Dual-Autoregressive","text":"<p>O S2 \u00e9 baseado em um transformer apenas decodificador, combinado com um codec de \u00e1udio RVQ (10 codebooks, ~21 Hz de taxa de quadros). A arquitetura Dual-AR divide a gera\u00e7\u00e3o em duas etapas:</p> <ul> <li>Slow AR opera no eixo temporal e prev\u00ea o codebook sem\u00e2ntico principal.</li> <li>Fast AR gera os 9 codebooks residuais restantes em cada passo de tempo, reconstruindo detalhes ac\u00fasticos finos.</li> </ul> <p>Esse desenho assim\u00e9trico (4B par\u00e2metros no eixo temporal e 400M no eixo de profundidade) mant\u00e9m a infer\u00eancia eficiente sem sacrificar fidelidade de \u00e1udio.</p>"},{"location":"pt/#alinhamento-por-reforco","title":"Alinhamento por Refor\u00e7o","text":"<p>O S2 usa Group Relative Policy Optimization (GRPO) no p\u00f3s-treinamento. Os mesmos modelos usados para filtrar e anotar dados de treino s\u00e3o reutilizados diretamente como modelos de recompensa no RL, eliminando o desalinhamento de distribui\u00e7\u00e3o entre os dados de pr\u00e9-treinamento e os objetivos de p\u00f3s-treinamento. 
O sinal de recompensa combina precis\u00e3o sem\u00e2ntica, ader\u00eancia \u00e0 instru\u00e7\u00e3o, prefer\u00eancia ac\u00fastica e similaridade de timbre.</p>"},{"location":"pt/#streaming-em-producao-com-sglang","title":"Streaming em Produ\u00e7\u00e3o com SGLang","text":"<p>Como a arquitetura Dual-AR \u00e9 estruturalmente isom\u00f3rfica a LLMs autoregressivos padr\u00e3o, o S2 herda diretamente as otimiza\u00e7\u00f5es nativas de serving do SGLang, incluindo continuous batching, paged KV cache, CUDA graph replay e prefix caching com RadixAttention.</p> <p>Em uma \u00fanica NVIDIA H200:</p> <ul> <li>RTF (Real-Time Factor): 0.195</li> <li>Tempo at\u00e9 o primeiro \u00e1udio: ~100 ms</li> <li>Throughput: mais de 3.000 acoustic tokens/s mantendo RTF abaixo de 0.5</li> </ul>"},{"location":"pt/#suporte-multilingue","title":"Suporte Multil\u00edngue","text":"<p>O Fish Audio S2 oferece suporte a convers\u00e3o de texto em fala multil\u00edngue de alta qualidade sem a necessidade de fonemas ou processamento espec\u00edfico de idioma. Incluindo:</p> <p>Ingl\u00eas, Chin\u00eas, Japon\u00eas, Coreano, \u00c1rabe, Alem\u00e3o, Franc\u00eas...</p> <p>E MUITO MAIS!</p> <p>A lista est\u00e1 em constante expans\u00e3o, verifique o Fish Audio para os lan\u00e7amentos mais recentes.</p>"},{"location":"pt/#geracao-nativa-de-multiplos-falantes","title":"Gera\u00e7\u00e3o Nativa de M\u00faltiplos Falantes","text":"<p>O Fish Audio S2 permite enviar um \u00e1udio de refer\u00eancia com v\u00e1rios falantes; o modelo processa as caracter\u00edsticas de cada voz por meio do token <code><|speaker:i|></code>. Depois, voc\u00ea controla o comportamento do modelo com o token de ID do falante, permitindo incluir v\u00e1rias vozes em uma \u00fanica gera\u00e7\u00e3o. 
Assim, n\u00e3o \u00e9 mais necess\u00e1rio subir um \u00e1udio de refer\u00eancia separado para cada falante.</p>"},{"location":"pt/#geracao-de-multiplos-turnos","title":"Gera\u00e7\u00e3o de M\u00faltiplos Turnos","text":"<p>Gra\u00e7as \u00e0 extens\u00e3o do contexto do modelo, nosso modelo agora pode usar informa\u00e7\u00f5es anteriores para melhorar a expressividade e a naturalidade dos conte\u00fados gerados subsequentemente.</p>"},{"location":"pt/#clonagem-de-voz-rapida","title":"Clonagem de Voz R\u00e1pida","text":"<p>O Fish Audio S2 suporta clonagem de voz precisa usando uma pequena amostra de refer\u00eancia (tipicamente de 10 a 30 segundos). O modelo captura o timbre, o estilo de fala e as tend\u00eancias emocionais, produzindo vozes clonadas realistas e consistentes sem ajuste fino adicional. Para usar o servidor SGLang, consulte SGLang-Omni README .</p>"},{"location":"pt/#creditos","title":"Cr\u00e9ditos","text":"<ul> <li>VITS2 (daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"pt/#relatorio-tecnico","title":"Relat\u00f3rio T\u00e9cnico","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n 
url={https://arxiv.org/abs/2603.08823}, \n}\n</code></pre>"},{"location":"pt/finetune/","title":"Ajuste Fino (Fine-tuning)","text":"<p>Obviamente, ao abrir esta p\u00e1gina, voc\u00ea n\u00e3o estava satisfeito com o desempenho do modelo pr\u00e9-treinado em modo zero-shot. Voc\u00ea deseja fazer um ajuste fino em um modelo para melhorar seu desempenho em seu conjunto de dados.</p> <p>Na vers\u00e3o atual, voc\u00ea s\u00f3 precisa fazer o ajuste fino da parte 'LLAMA'.</p>"},{"location":"pt/finetune/#ajuste-fino-do-llama","title":"Ajuste Fino do LLAMA","text":""},{"location":"pt/finetune/#1-prepare-o-conjunto-de-dados","title":"1. Prepare o conjunto de dados","text":"<pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u2514\u2500\u2500 30.1-32.71.mp3\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u2514\u2500\u2500 38.79-40.85.mp3\n</code></pre> <p>Voc\u00ea precisa converter seu conjunto de dados para o formato acima e coloc\u00e1-lo no diret\u00f3rio <code>data</code>. O arquivo de \u00e1udio pode ter as extens\u00f5es <code>.mp3</code>, <code>.wav</code> ou <code>.flac</code>, e o arquivo de anota\u00e7\u00e3o deve ter a extens\u00e3o <code>.lab</code>.</p> <p>Info</p> <p>O arquivo de anota\u00e7\u00e3o <code>.lab</code> precisa conter apenas a transcri\u00e7\u00e3o do \u00e1udio, sem necessidade de formata\u00e7\u00e3o especial. Por exemplo, se <code>hi.mp3</code> contiver \"Ol\u00e1, adeus.\", ent\u00e3o o arquivo <code>hi.lab</code> conter\u00e1 uma \u00fanica linha de texto: \"Ol\u00e1, adeus.\".</p> <p>Warning</p> <p>Recomenda-se aplicar a normaliza\u00e7\u00e3o de volume (loudness) ao conjunto de dados. Voc\u00ea pode usar o fish-audio-preprocess para fazer isso. 
<pre><code>fap loudness-norm data-raw data --clean\n</code></pre></p>"},{"location":"pt/finetune/#2-extracao-em-lote-de-tokens-semanticos","title":"2. Extra\u00e7\u00e3o em lote de tokens sem\u00e2nticos","text":"<p>Certifique-se de que voc\u00ea baixou os pesos do VQGAN. Se n\u00e3o, execute o seguinte comando:</p> <pre><code>huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n</code></pre> <p>Em seguida, voc\u00ea pode executar o seguinte comando para extrair os tokens sem\u00e2nticos:</p> <pre><code>python tools/vqgan/extract_vq.py data \\\n --num-workers 1 --batch-size 16 \\\n --config-name \"modded_dac_vq\" \\\n --checkpoint-path \"checkpoints/openaudio-s1-mini/codec.pth\"\n</code></pre> <p>Note</p> <p>Voc\u00ea pode ajustar <code>--num-workers</code> e <code>--batch-size</code> para aumentar a velocidade de extra\u00e7\u00e3o, mas certifique-se de n\u00e3o exceder o limite de mem\u00f3ria da sua GPU.</p> <p>Este comando criar\u00e1 arquivos <code>.npy</code> no diret\u00f3rio <code>data</code>, como mostrado abaixo:</p> <pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 21.15-26.44.npy\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.npy\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u251c\u2500\u2500 30.1-32.71.mp3\n\u2502 \u2514\u2500\u2500 30.1-32.71.npy\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u251c\u2500\u2500 38.79-40.85.mp3\n \u2514\u2500\u2500 38.79-40.85.npy\n</code></pre>"},{"location":"pt/finetune/#3-empacote-o-conjunto-de-dados-em-protobuf","title":"3. 
Empacote o conjunto de dados em protobuf","text":"<pre><code>python tools/llama/build_dataset.py \\\n --input \"data\" \\\n --output \"data/protos\" \\\n --text-extension .lab \\\n --num-workers 16\n</code></pre> <p>Ap\u00f3s a conclus\u00e3o da execu\u00e7\u00e3o do comando, voc\u00ea dever\u00e1 ver o arquivo <code>protos</code> no diret\u00f3rio <code>data</code>.</p>"},{"location":"pt/finetune/#4-finalmente-ajuste-fino-com-lora","title":"4. Finalmente, ajuste fino com LoRA","text":"<p>Da mesma forma, certifique-se de que voc\u00ea baixou os pesos do <code>LLAMA</code>. Se n\u00e3o, execute o seguinte comando:</p> <pre><code>huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n</code></pre> <p>Finalmente, voc\u00ea pode iniciar o ajuste fino executando o seguinte comando:</p> <pre><code>python fish_speech/train.py --config-name text2semantic_finetune \\\n project=$project \\\n +lora@model.model.lora_config=r_8_alpha_16\n</code></pre> <p>Note</p> <p>Voc\u00ea pode modificar os par\u00e2metros de treinamento, como <code>batch_size</code>, <code>gradient_accumulation_steps</code>, etc., para se adequar \u00e0 mem\u00f3ria da sua GPU, modificando <code>fish_speech/configs/text2semantic_finetune.yaml</code>.</p> <p>Note</p> <p>Para usu\u00e1rios do Windows, voc\u00ea pode usar <code>trainer.strategy.process_group_backend=gloo</code> para evitar problemas com <code>nccl</code>.</p> <p>Ap\u00f3s o treinamento ser conclu\u00eddo, voc\u00ea pode consultar a se\u00e7\u00e3o de infer\u00eancia para testar seu modelo.</p> <p>Info</p> <p>Por padr\u00e3o, o modelo aprender\u00e1 apenas os padr\u00f5es de fala do locutor e n\u00e3o o timbre. Voc\u00ea ainda precisar\u00e1 usar prompts para garantir a estabilidade do timbre. 
Se voc\u00ea quiser aprender o timbre, pode aumentar o n\u00famero de passos de treinamento, mas isso pode levar a um sobreajuste (overfitting).</p> <p>Ap\u00f3s o treinamento, voc\u00ea precisa converter os pesos do LoRA para pesos regulares antes de realizar a infer\u00eancia.</p> <p><code>bash python tools/llama/merge_lora.py \\ --lora-config r_8_alpha_16 \\ --base-weight checkpoints/openaudio-s1-mini \\ --lora-weight results/$project/checkpoints/step_000000010.ckpt \\ --output checkpoints/openaudio-s1-mini-yth-lora/</code></p> <p>Note</p> <p>Voc\u00ea tamb\u00e9m pode tentar outros checkpoints. Sugerimos usar o checkpoint mais antigo que atenda aos seus requisitos, pois eles geralmente t\u00eam um desempenho melhor em dados fora de distribui\u00e7\u00e3o (OOD).</p>"},{"location":"pt/inference/","title":"Infer\u00eancia","text":"<p>O modelo Fish Audio S2 requer uma grande quantidade de VRAM. Recomendamos o uso de uma GPU com pelo menos 24GB para infer\u00eancia.</p>"},{"location":"pt/inference/#baixar-pesos","title":"Baixar Pesos","text":"<p>Primeiro, voc\u00ea precisa baixar os pesos do modelo:</p> <pre><code>hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n</code></pre>"},{"location":"pt/inference/#inferencia-por-linha-de-comando","title":"Infer\u00eancia por Linha de Comando","text":"<p>Note</p> <p>Se voc\u00ea planeja deixar o modelo escolher aleatoriamente um timbre de voz, pode pular esta etapa.</p>"},{"location":"pt/inference/#1-obter-tokens-vq-do-audio-de-referencia","title":"1. Obter tokens VQ do \u00e1udio de refer\u00eancia","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"test.wav\" \\\n --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n</code></pre> <p>Voc\u00ea deve obter um <code>fake.npy</code> e um <code>fake.wav</code>.</p>"},{"location":"pt/inference/#2-gerar-tokens-semanticos-a-partir-do-texto","title":"2. 
Gerar tokens Sem\u00e2nticos a partir do texto:","text":"<pre><code>python fish_speech/models/text2semantic/inference.py \\\n --text \"O texto que voc\u00ea deseja converter\" \\\n --prompt-text \"Seu texto de refer\u00eancia\" \\\n --prompt-tokens \"fake.npy\" \\\n # --compile\n</code></pre> <p>Este comando criar\u00e1 um arquivo <code>codes_N</code> no diret\u00f3rio de trabalho, onde N \u00e9 um n\u00famero inteiro come\u00e7ando em 0.</p> <p>Note</p> <p>Voc\u00ea pode querer usar <code>--compile</code> para fundir kernels CUDA para uma infer\u00eancia mais r\u00e1pida. No entanto, recomendamos usar nossa otimiza\u00e7\u00e3o de acelera\u00e7\u00e3o de infer\u00eancia sglang. Da mesma forma, se voc\u00ea n\u00e3o planeja usar acelera\u00e7\u00e3o, pode comentar o par\u00e2metro <code>--compile</code>.</p> <p>Info</p> <p>Para GPUs que n\u00e3o suportam bf16, voc\u00ea pode precisar usar o par\u00e2metro <code>--half</code>.</p>"},{"location":"pt/inference/#3-gerar-vocais-a-partir-de-tokens-semanticos","title":"3. Gerar vocais a partir de tokens sem\u00e2nticos:","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"codes_0.npy\" \\\n</code></pre> <p>Depois disso, voc\u00ea obter\u00e1 um arquivo <code>fake.wav</code>.</p>"},{"location":"pt/inference/#inferencia-webui","title":"Infer\u00eancia WebUI","text":""},{"location":"pt/inference/#1-gradio-webui","title":"1. Gradio WebUI","text":"<p>Para manter a compatibilidade, mantemos a interface Gradio WebUI anterior.</p> <pre><code>python tools/run_webui.py # --compile se voc\u00ea precisar de acelera\u00e7\u00e3o\n</code></pre>"},{"location":"pt/inference/#2-awesome-webui","title":"2. 
Awesome WebUI","text":"<p>A Awesome WebUI \u00e9 uma interface web moderna baseada em TypeScript, oferecendo funcionalidades mais ricas e uma melhor experi\u00eancia do usu\u00e1rio.</p> <p>Construir a WebUI:</p> <p>Voc\u00ea precisa ter o Node.js e o npm instalados em seu computador local ou servidor.</p> <ol> <li>Entre no diret\u00f3rio <code>awesome_webui</code>: <pre><code>cd awesome_webui\n</code></pre></li> <li>Instale as depend\u00eancias: <pre><code>npm install\n</code></pre></li> <li>Construa a WebUI: <pre><code>npm run build\n</code></pre></li> </ol> <p>Iniciar o Servidor Backend:</p> <p>Ap\u00f3s a constru\u00e7\u00e3o da WebUI, retorne ao diret\u00f3rio raiz do projeto e inicie o servidor API:</p> <pre><code>python tools/api_server.py --listen 0.0.0.0:8888 --compile\n</code></pre> <p>Acesso:</p> <p>Ap\u00f3s o servidor ser iniciado, voc\u00ea pode acess\u00e1-lo atrav\u00e9s do navegador no seguinte endere\u00e7o: <code>http://localhost:8888/ui</code></p>"},{"location":"pt/install/","title":"Instala\u00e7\u00e3o","text":""},{"location":"pt/install/#requisitos","title":"Requisitos","text":"<ul> <li>Mem\u00f3ria da GPU: 24GB (Infer\u00eancia)</li> <li>Sistema: Linux, WSL</li> </ul>"},{"location":"pt/install/#configuracao-do-sistema","title":"Configura\u00e7\u00e3o do Sistema","text":"<p>O Fish Audio S2 suporta m\u00faltiplos m\u00e9todos de instala\u00e7\u00e3o. 
Escolha o que melhor se adapta ao seu ambiente de desenvolvimento.</p> <p>Pr\u00e9-requisitos: Instale as depend\u00eancias de sistema para processamento de \u00e1udio: <pre><code>apt install portaudio19-dev libsox-dev ffmpeg\n</code></pre></p>"},{"location":"pt/install/#conda","title":"Conda","text":"<pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# Instala\u00e7\u00e3o com GPU (escolha a sua vers\u00e3o do CUDA: cu126, cu128, cu129)\npip install -e .[cu129]\n\n# Instala\u00e7\u00e3o apenas para CPU\npip install -e .[cpu]\n\n# Instala\u00e7\u00e3o padr\u00e3o (usa o \u00edndice padr\u00e3o do PyTorch)\npip install -e .\n\n# Se encontrar um erro durante a instala\u00e7\u00e3o devido ao pyaudio, considere usar o seguinte comando:\n# conda install pyaudio\n# De seguida, execute pip install -e . novamente\n</code></pre>"},{"location":"pt/install/#uv","title":"UV","text":"<p>O UV oferece uma resolu\u00e7\u00e3o e instala\u00e7\u00e3o de depend\u00eancias mais r\u00e1pidas:</p> <pre><code># Instala\u00e7\u00e3o com GPU (escolha a sua vers\u00e3o do CUDA: cu126, cu128, cu129)\nuv sync --python 3.12 --extra cu129\n\n# Instala\u00e7\u00e3o apenas para CPU\nuv sync --python 3.12 --extra cpu\n</code></pre>"},{"location":"pt/install/#suporte-para-intel-arc-xpu","title":"Suporte para Intel Arc XPU","text":"<p>Para utilizadores de GPUs Intel Arc, instale o suporte XPU da seguinte forma:</p> <pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# Instalar a biblioteca padr\u00e3o C++ necess\u00e1ria\nconda install libstdcxx -c conda-forge\n\n# Instalar o PyTorch com suporte para Intel XPU\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# Instalar o Fish Speech\npip install -e .\n</code></pre> <p>Warning</p> <p>A op\u00e7\u00e3o <code>compile</code> n\u00e3o \u00e9 suportada no Windows e macOS. 
Se desejar executar com compila\u00e7\u00e3o, ter\u00e1 de instalar o Triton manualmente.</p>"},{"location":"pt/install/#configuracao-do-docker","title":"Configura\u00e7\u00e3o do Docker","text":"<p>O modelo da s\u00e9rie Fish Audio S2 oferece m\u00faltiplas op\u00e7\u00f5es de implementa\u00e7\u00e3o com Docker para satisfazer diferentes necessidades. Pode usar imagens pr\u00e9-constru\u00eddas do Docker Hub, construir localmente com o Docker Compose, ou construir manualmente imagens personalizadas.</p> <p>Fornecemos imagens Docker para a WebUI e o servidor API, tanto para GPU (CUDA 12.6 por defeito) como para CPU. Pode usar as imagens pr\u00e9-constru\u00eddas do Docker Hub, construir localmente com o Docker Compose, ou construir manualmente imagens personalizadas. Se quiser construir localmente, siga as instru\u00e7\u00f5es abaixo. Se apenas quiser usar as imagens pr\u00e9-constru\u00eddas, siga diretamente o guia de infer\u00eancia.</p>"},{"location":"pt/install/#pre-requisitos","title":"Pr\u00e9-requisitos","text":"<ul> <li>Docker e Docker Compose instalados</li> <li>NVIDIA Docker runtime instalado (para suporte de GPU)</li> <li>Pelo menos 24GB de mem\u00f3ria de GPU para infer\u00eancia com CUDA</li> </ul>"},{"location":"pt/install/#usar-o-docker-compose","title":"Usar o Docker Compose","text":"<p>Para desenvolvimento ou personaliza\u00e7\u00e3o, pode usar o Docker Compose para construir e executar localmente:</p> <pre><code># Primeiro, clone o reposit\u00f3rio\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# Iniciar a WebUI com CUDA\ndocker compose --profile webui up\n\n# Iniciar a WebUI com otimiza\u00e7\u00e3o de compila\u00e7\u00e3o\nCOMPILE=1 docker compose --profile webui up\n\n# Iniciar o servidor API\ndocker compose --profile server up\n\n# Iniciar o servidor API com otimiza\u00e7\u00e3o de compila\u00e7\u00e3o\nCOMPILE=1 docker compose --profile server up\n\n# Implementa\u00e7\u00e3o apenas com CPU\nBACKEND=cpu docker 
compose --profile webui up\n</code></pre>"},{"location":"pt/install/#variaveis-de-ambiente-para-o-docker-compose","title":"Vari\u00e1veis de Ambiente para o Docker Compose","text":"<p>Pode personalizar a implementa\u00e7\u00e3o usando vari\u00e1veis de ambiente:</p> <pre><code># Exemplo de ficheiro .env\nBACKEND=cuda # ou cpu\nCOMPILE=1 # Ativar otimiza\u00e7\u00e3o de compila\u00e7\u00e3o\nGRADIO_PORT=7860 # Porta da WebUI\nAPI_PORT=8080 # Porta do servidor API\nUV_VERSION=0.8.15 # Vers\u00e3o do gestor de pacotes UV\n</code></pre> <p>O comando ir\u00e1 construir a imagem e executar o contentor. Pode aceder \u00e0 WebUI em <code>http://localhost:7860</code> e ao servidor API em <code>http://localhost:8080</code>.</p>"},{"location":"pt/install/#construcao-manual-com-docker","title":"Constru\u00e7\u00e3o Manual com Docker","text":"<p>Para utilizadores avan\u00e7ados que desejam personalizar o processo de constru\u00e7\u00e3o:</p> <pre><code># Construir imagem da WebUI com suporte CUDA\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.6.0 \\\n --build-arg UV_EXTRA=cu126 \\\n --target webui \\\n -t fish-speech-webui:cuda .\n\n# Construir imagem do servidor API com suporte CUDA\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.6.0 \\\n --build-arg UV_EXTRA=cu126 \\\n --target server \\\n -t fish-speech-server:cuda .\n\n# Construir imagem apenas para CPU (suporta multiplataforma)\ndocker build \\\n --platform linux/amd64,linux/arm64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cpu \\\n --target webui \\\n -t fish-speech-webui:cpu .\n\n# Construir imagem de desenvolvimento\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --target dev \\\n -t fish-speech-dev:cuda .\n</code></pre>"},{"location":"pt/install/#argumentos-de-construcao","title":"Argumentos de 
Constru\u00e7\u00e3o","text":"<ul> <li><code>BACKEND</code>: <code>cuda</code> ou <code>cpu</code> (padr\u00e3o: <code>cuda</code>)</li> <li><code>CUDA_VER</code>: Vers\u00e3o do CUDA (padr\u00e3o: <code>12.6.0</code>)</li> <li><code>UV_EXTRA</code>: Pacote extra do UV para CUDA (padr\u00e3o: <code>cu126</code>)</li> <li><code>UBUNTU_VER</code>: Vers\u00e3o do Ubuntu (padr\u00e3o: <code>24.04</code>)</li> <li><code>PY_VER</code>: Vers\u00e3o do Python (padr\u00e3o: <code>3.12</code>)</li> </ul>"},{"location":"pt/install/#montagem-de-volumes","title":"Montagem de Volumes","text":"<p>Ambos os m\u00e9todos requerem a montagem dos seguintes diret\u00f3rios:</p> <ul> <li><code>./checkpoints:/app/checkpoints</code> - Diret\u00f3rio dos pesos do modelo</li> <li><code>./references:/app/references</code> - Diret\u00f3rio dos ficheiros de \u00e1udio de refer\u00eancia</li> </ul>"},{"location":"pt/install/#variaveis-de-ambiente","title":"Vari\u00e1veis de Ambiente","text":"<ul> <li><code>COMPILE=1</code> - Ativa o <code>torch.compile</code> para uma infer\u00eancia mais r\u00e1pida (cerca de 10x)</li> <li><code>GRADIO_SERVER_NAME=0.0.0.0</code> - Anfitri\u00e3o do servidor WebUI</li> <li><code>GRADIO_SERVER_PORT=7860</code> - Porta do servidor WebUI</li> <li><code>API_SERVER_NAME=0.0.0.0</code> - Anfitri\u00e3o do servidor API</li> <li><code>API_SERVER_PORT=8080</code> - Porta do servidor API</li> </ul> <p>Note</p> <p>Os contentores Docker esperam que os pesos do modelo sejam montados em <code>/app/checkpoints</code>. Certifique-se de que descarregou os pesos do modelo necess\u00e1rios antes de iniciar os contentores.</p> <p>Warning</p> <p>O suporte para GPU requer o NVIDIA Docker runtime. 
Para implementa\u00e7\u00f5es apenas com CPU, remova a flag <code>--gpus all</code> e use as imagens de CPU.</p>"},{"location":"ko/","title":"\uc18c\uac1c","text":"Fish Speech <p>English | \u7b80\u4f53\u4e2d\u6587 | Portuguese | \u65e5\u672c\u8a9e | \ud55c\uad6d\uc5b4 | \u0627\u0644\u0639\u0631\u0628\u064a\u0629 | Espa\u00f1ol</p> <p></p> <p></p> <p>\ub77c\uc774\uc120\uc2a4 \uacf5\uc9c0</p> <p>\uc774 \ucf54\ub4dc\ubca0\uc774\uc2a4 \ubc0f \uad00\ub828 \ubaa8\ub378 \uac00\uc911\uce58\ub294 FISH AUDIO RESEARCH LICENSE \ud558\uc5d0 \ub9b4\ub9ac\uc2a4\ub418\uc5c8\uc2b5\ub2c8\ub2e4. \uc790\uc138\ud55c \ub0b4\uc6a9\uc740 LICENSE\ub97c \ucc38\uc870\ud558\uc2ed\uc2dc\uc624.</p> <p>\ubc95\uc801 \uba74\ucc45 \uc870\ud56d</p> <p>\ucf54\ub4dc\ubca0\uc774\uc2a4\uc758 \ubd88\ubc95\uc801\uc778 \uc0ac\uc6a9\uc5d0 \ub300\ud574 \ub2f9\uc0ac\ub294 \uc5b4\ub5a0\ud55c \ucc45\uc784\ub3c4 \uc9c0\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. DMCA \ubc0f \uae30\ud0c0 \uad00\ub828 \ubc95\ub960\uc5d0 \uad00\ud55c \ud604\uc9c0 \uaddc\uc815\uc744 \ucc38\uc870\ud558\uc2ed\uc2dc\uc624.</p>"},{"location":"ko/#_1","title":"\ube60\ub978 \uc2dc\uc791","text":""},{"location":"ko/#_2","title":"\ubb38\uc11c\ub85c \ubc14\ub85c \uc2dc\uc791\ud558\uae30","text":"<p>Fish Audio S2 \uacf5\uc2dd \ubb38\uc11c\uc785\ub2c8\ub2e4. 
\uc544\ub798 \ub9c1\ud06c\uc5d0\uc11c \ubc14\ub85c \uc2dc\uc791\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p> <ul> <li>\uc124\uce58</li> <li>\ucee4\ub9e8\ub4dc\ub77c\uc778 \ucd94\ub860</li> <li>WebUI \ucd94\ub860</li> <li>\uc11c\ubc84 \ucd94\ub860</li> <li>Docker \uc124\uc815</li> </ul> <p>[!IMPORTANT] SGLang \uc11c\ubc84\ub294 SGLang-Omni README\ub97c \ucc38\uace0\ud558\uc138\uc694.</p>"},{"location":"ko/#llm-agent","title":"LLM Agent \uac00\uc774\ub4dc","text":"<pre><code>https://speech.fish.audio/ko/install/ \ubb38\uc11c\ub97c \ub530\ub77c Fish Audio S2\ub97c \uc124\uce58\ud558\uace0 \uad6c\uc131\ud558\uc138\uc694.\n</code></pre>"},{"location":"ko/#fish-audio-s2","title":"Fish Audio S2","text":"<p>\uc624\ud508 \uc18c\uc2a4\uc640 \ud074\ub85c\uc988\ub4dc \uc18c\uc2a4 \ubaa8\ub450\uc5d0\uc11c \uac00\uc7a5 \ub6f0\uc5b4\ub09c \ud14d\uc2a4\ud2b8 \uc74c\uc131 \ubcc0\ud658 \uc2dc\uc2a4\ud15c</p> <p>Fish Audio S2\ub294 Fish Audio\uac00 \uac1c\ubc1c\ud55c \ucd5c\uc2e0 \ubaa8\ub378\uc785\ub2c8\ub2e4. 
\uc57d 50\uac1c \uc5b8\uc5b4, 1,000\ub9cc \uc2dc\uac04 \uc774\uc0c1\uc758 \uc624\ub514\uc624 \ub370\uc774\ud130\ub85c \ud559\uc2b5\ub418\uc5c8\uace0, \uac15\ud654\ud559\uc2b5 \uc815\ub82c\uacfc Dual-Autoregressive \uc544\ud0a4\ud14d\ucc98\ub97c \uacb0\ud569\ud574 \uc790\uc5f0\uc2a4\ub7fd\uace0 \uc0ac\uc2e4\uc801\uc774\uba70 \uac10\uc815 \ud45c\ud604\uc774 \ud48d\ubd80\ud55c \uc74c\uc131\uc744 \uc0dd\uc131\ud569\ub2c8\ub2e4.</p> <p>S2\ub294 <code>[laugh]</code>, <code>[whispers]</code>, <code>[super happy]</code> \uac19\uc740 \uc790\uc5f0\uc5b4 \ud0dc\uadf8\ub97c \uc0ac\uc6a9\ud574 \uc6b4\uc728\uacfc \uac10\uc815\uc744 \ubb38\uc7a5 \ub0b4\ubd80\uc5d0\uc11c \uc138\ubc00\ud558\uac8c \uc81c\uc5b4\ud560 \uc218 \uc788\uc73c\uba70, \uba40\ud2f0 \ud654\uc790/\uba40\ud2f0 \ud134 \uc0dd\uc131\ub3c4 \ub124\uc774\ud2f0\ube0c\ub85c \uc9c0\uc6d0\ud569\ub2c8\ub2e4.</p> <p>\uc2e4\uc2dc\uac04 \ub370\ubaa8\ub294 Fish Audio \uc6f9\uc0ac\uc774\ud2b8\uc5d0\uc11c, \uc790\uc138\ud55c \ub0b4\uc6a9\uc740 \ube14\ub85c\uadf8 \uae00\uacfc \uae30\uc220 \ubcf4\uace0\uc11c\uc5d0\uc11c \ud655\uc778\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p>"},{"location":"ko/#_3","title":"\ubaa8\ub378 \ubcc0\ud615","text":"\ubaa8\ub378 \ud06c\uae30 \uac00\uc6a9\uc131 \uc124\uba85 S2-Pro 4B \ub9e4\uac1c\ubcc0\uc218 HuggingFace \ucd5c\uace0 \uc218\uc900\uc758 \ud488\uc9c8\uacfc \uc548\uc815\uc131\uc744 \uc81c\uacf5\ud558\ub294 \ud480\uae30\ub2a5 \ud50c\ub798\uadf8\uc2ed \ubaa8\ub378 <p>\ubaa8\ub378 \uc0c1\uc138\ub294 \uae30\uc220 \ubcf4\uace0\uc11c\ub97c \ucc38\uace0\ud558\uc138\uc694.</p>"},{"location":"ko/#_4","title":"\ubca4\uce58\ub9c8\ud06c \uacb0\uacfc","text":"\ubca4\uce58\ub9c8\ud06c Fish Audio S2 Seed-TTS Eval \u2014 WER (\uc911\uad6d\uc5b4) 0.54% (\uc804\uccb4 \ucd5c\uace0) Seed-TTS Eval \u2014 WER (\uc601\uc5b4) 0.99% (\uc804\uccb4 \ucd5c\uace0) Audio Turing Test (\uc9c0\uc2dc \ud3ec\ud568) 0.515 \uc0ac\ud6c4 \ud3c9\uade0 EmergentTTS-Eval \u2014 \uc2b9\ub960 81.88% (\uc804\uccb4 \ucd5c\uace0) Fish 
Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 \ud488\uc9c8 4.51 / 5.0 \ub2e4\uad6d\uc5b4 (MiniMax Testset) \u2014 \ucd5c\uace0 WER 24\uac1c \uc5b8\uc5b4 \uc911 11\uac1c \ub2e4\uad6d\uc5b4 (MiniMax Testset) \u2014 \ucd5c\uace0 SIM 24\uac1c \uc5b8\uc5b4 \uc911 17\uac1c <p>Seed-TTS Eval\uc5d0\uc11c S2\ub294 \ud074\ub85c\uc988\ub4dc \uc18c\uc2a4 \uc2dc\uc2a4\ud15c\uc744 \ud3ec\ud568\ud55c \uc804\uccb4 \ube44\uad50 \ubaa8\ub378 \uc911 \uac00\uc7a5 \ub0ae\uc740 WER\ub97c \uae30\ub85d\ud588\uc2b5\ub2c8\ub2e4: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). Audio Turing Test\uc5d0\uc11c\ub294 0.515\ub97c \uae30\ub85d\ud574 Seed-TTS (0.417) \ub300\ube44 24%, MiniMax-Speech (0.387) \ub300\ube44 33% \ub192\uc558\uc2b5\ub2c8\ub2e4. EmergentTTS-Eval\uc5d0\uc11c\ub294 \ud30c\ub77c\uc5b8\uc5b4 \ud45c\ud604(91.61%), \uc758\ubb38\ubb38(84.41%), \uad6c\ubb38 \ubcf5\uc7a1\ub3c4(83.39%)\uc5d0\uc11c \ud2b9\ud788 \uac15\ud55c \uc131\ub2a5\uc744 \ubcf4\uc600\uc2b5\ub2c8\ub2e4.</p>"},{"location":"ko/#_5","title":"\uc8fc\uc694 \ud2b9\uc9d5","text":""},{"location":"ko/#_6","title":"\uc790\uc5f0\uc5b4 \uae30\ubc18 \uc138\ubc00\ud55c \uc778\ub77c\uc778 \uc81c\uc5b4","text":"<p>Fish Audio S2\ub294 \ud14d\uc2a4\ud2b8\uc758 \ud2b9\uc815 \ub2e8\uc5b4 \ub610\ub294 \uad6c\ubb38 \uc704\uce58\uc5d0 \uc790\uc5f0\uc5b4 \uc9c0\uc2dc\ub97c \uc9c1\uc811 \uc0bd\uc785\ud574 \uc74c\uc131 \uc0dd\uc131\uc744 \uad6d\uc18c\uc801\uc73c\ub85c \uc81c\uc5b4\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. 
\uace0\uc815\ub41c \uc0ac\uc804 \uc815\uc758 \ud0dc\uadf8\uc5d0 \uc758\uc874\ud558\ub294 \ub300\uc2e0, S2\ub294 [whisper in small voice], [professional broadcast tone], [pitch up] \uac19\uc740 \uc790\uc720 \ud615\uc2dd \ud14d\uc2a4\ud2b8 \uc124\uba85\uc744 \ubc1b\uc544 \ub2e8\uc5b4 \uc218\uc900\uc758 \uac1c\ubc29\ud615 \ud45c\ud604 \uc81c\uc5b4\ub97c \uc9c0\uc6d0\ud569\ub2c8\ub2e4.</p>"},{"location":"ko/#dual-autoregressive","title":"Dual-Autoregressive \uc544\ud0a4\ud14d\ucc98","text":"<p>S2\ub294 decoder-only Transformer\uc640 RVQ \uae30\ubc18 \uc624\ub514\uc624 \ucf54\ub371(10 codebooks, \uc57d 21 Hz \ud504\ub808\uc784\ub808\uc774\ud2b8)\uc744 \uacb0\ud569\ud569\ub2c8\ub2e4. Dual-AR\uc740 \uc0dd\uc131 \uacfc\uc815\uc744 \ub450 \ub2e8\uacc4\ub85c \ub098\ub215\ub2c8\ub2e4.</p> <ul> <li>Slow AR: \uc2dc\uac04\ucd95\uc744 \ub530\ub77c \ub3d9\uc791\ud558\uba70 \uc8fc semantic codebook\uc744 \uc608\uce21</li> <li>Fast AR: \uac01 \uc2dc\uc810\uc5d0\uc11c \ub098\uba38\uc9c0 9\uac1c residual codebook\uc744 \uc0dd\uc131\ud574 \uc138\ubc00\ud55c \uc74c\ud5a5 \ub514\ud14c\uc77c\uc744 \ubcf5\uc6d0</li> </ul> <p>\uc774 \ube44\ub300\uce6d \uc124\uacc4(\uc2dc\uac04\ucd95 4B \ud30c\ub77c\ubbf8\ud130, \uae4a\uc774\ucd95 400M \ud30c\ub77c\ubbf8\ud130)\ub294 \uc74c\uc9c8\uc744 \uc720\uc9c0\ud558\uba74\uc11c \ucd94\ub860 \ud6a8\uc728\uc744 \ub192\uc785\ub2c8\ub2e4.</p>"},{"location":"ko/#_7","title":"\uac15\ud654\ud559\uc2b5 \uc815\ub82c","text":"<p>S2\ub294 \ud6c4\ud559\uc2b5 \uc815\ub82c\uc744 \uc704\ud574 Group Relative Policy Optimization(GRPO)\uc744 \uc0ac\uc6a9\ud569\ub2c8\ub2e4. \ud559\uc2b5 \ub370\uc774\ud130 \ud544\ud130\ub9c1/\ub77c\ubca8\ub9c1\uc5d0 \uc4f0\uc778 \ub3d9\uc77c\ud55c \ubaa8\ub378\uc744 RL \ubcf4\uc0c1 \ubaa8\ub378\ub85c \uc7ac\uc0ac\uc6a9\ud574, \uc0ac\uc804\ud559\uc2b5 \ub370\uc774\ud130 \ubd84\ud3ec\uc640 \ud6c4\ud559\uc2b5 \ubaa9\ud45c \uac04\uc758 \ubd84\ud3ec \ubd88\uc77c\uce58\ub97c \uc904\uc600\uc2b5\ub2c8\ub2e4. 
\ubcf4\uc0c1 \uc2e0\ud638\ub294 \uc758\ubbf8 \uc815\ud655\ub3c4, \uc9c0\uc2dc \uc900\uc218\ub3c4, \uc74c\ud5a5 \uc120\ud638 \uc810\uc218, \uc74c\uc0c9 \uc720\uc0ac\ub3c4\ub97c \ud568\uaed8 \ubc18\uc601\ud569\ub2c8\ub2e4.</p>"},{"location":"ko/#sglang","title":"SGLang \uae30\ubc18 \ud504\ub85c\ub355\uc158 \uc2a4\ud2b8\ub9ac\ubc0d","text":"<p>Dual-AR \uad6c\uc870\ub294 \ud45c\uc900 \uc790\uae30\ud68c\uadc0 LLM\uacfc \uad6c\uc870\uc801\uc73c\ub85c \ub3d9\ud615\uc774\uae30 \ub54c\ubb38\uc5d0, S2\ub294 SGLang\uc758 LLM \uc11c\ube59 \ucd5c\uc801\ud654\ub97c \uadf8\ub300\ub85c \ud65c\uc6a9\ud569\ub2c8\ub2e4. \uc608: continuous batching, paged KV cache, CUDA graph replay, RadixAttention \uae30\ubc18 prefix caching.</p> <p>NVIDIA H200 \ub2e8\uc77c GPU \uae30\uc900:</p> <ul> <li>\uc2e4\uc2dc\uac04 \uacc4\uc218(RTF): 0.195</li> <li>\uccab \uc624\ub514\uc624 \ucd9c\ub825\uae4c\uc9c0 \uc2dc\uac04: \uc57d 100 ms</li> <li>\ucc98\ub9ac\ub7c9: RTF 0.5 \ubbf8\ub9cc \uc720\uc9c0 \uc2dc 3,000+ acoustic tokens/s</li> </ul>"},{"location":"ko/#_8","title":"\ub2e4\uad6d\uc5b4 \uc9c0\uc6d0","text":"<p>Fish Audio S2\ub294 \uc74c\uc18c\ub098 \uc5b8\uc5b4\ubcc4 \uc804\ucc98\ub9ac \uc5c6\uc774 \uace0\ud488\uc9c8 \ub2e4\uad6d\uc5b4 \ud14d\uc2a4\ud2b8 \uc74c\uc131 \ubcc0\ud658\uc744 \uc9c0\uc6d0\ud569\ub2c8\ub2e4. \ud3ec\ud568 \uc0ac\ud56d:</p> <p>\uc601\uc5b4, \uc911\uad6d\uc5b4, \uc77c\ubcf8\uc5b4, \ud55c\uad6d\uc5b4, \uc544\ub78d\uc5b4, \ub3c5\uc77c\uc5b4, \ud504\ub791\uc2a4\uc5b4...</p> <p>\uadf8\ub9ac\uace0 \ub354 \ub9ce\uc774!</p> <p>\ubaa9\ub85d\uc740 \uacc4\uc18d \ud655\uc7a5\ub418\uace0 \uc788\uc2b5\ub2c8\ub2e4. 
\ucd5c\uc2e0 \ub9b4\ub9ac\uc2a4\ub294 Fish Audio\ub97c \ud655\uc778\ud558\uc138\uc694.</p>"},{"location":"ko/#_9","title":"\ub124\uc774\ud2f0\ube0c \uba40\ud2f0 \ud654\uc790 \uc0dd\uc131","text":"<p>Fish Audio S2\ub294 \uc0ac\uc6a9\uc790\uac00 \uc5ec\ub7ec \ud654\uc790\uac00 \ud3ec\ud568\ub41c \ucc38\uc870 \uc624\ub514\uc624\ub97c \uc5c5\ub85c\ub4dc\ud560 \uc218 \uc788\ub3c4\ub85d \ud558\uba70, \ubaa8\ub378\uc740 <code><|speaker:i|></code> \ud1a0\ud070\uc744 \ud1b5\ud574 \uac01 \ud654\uc790\uc758 \ud2b9\uc9d5\uc744 \ucc98\ub9ac\ud569\ub2c8\ub2e4. \uadf8\ub7f0 \ub2e4\uc74c \ud654\uc790 ID \ud1a0\ud070\uc73c\ub85c \ubaa8\ub378\uc758 \uc131\ub2a5\uc744 \uc81c\uc5b4\ud558\uc5ec \ud55c \ubc88\uc758 \uc0dd\uc131\uc73c\ub85c \uc5ec\ub7ec \ud654\uc790\ub97c \ud3ec\ud568\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. \uc774\uc804\ucc98\ub7fc \uac01 \ud654\uc790\ub9c8\ub2e4 \ubcc4\ub3c4\ub85c \ucc38\uc870 \uc624\ub514\uc624\ub97c \uc5c5\ub85c\ub4dc\ud558\uace0 \uc74c\uc131\uc744 \uc0dd\uc131\ud560 \ud544\uc694\uac00 \uc5c6\uc2b5\ub2c8\ub2e4.</p>"},{"location":"ko/#_10","title":"\uba40\ud2f0 \ud134 \ub300\ud654 \uc0dd\uc131","text":"<p>\ubaa8\ub378 \ucee8\ud14d\uc2a4\ud2b8\uc758 \ud655\uc7a5 \ub355\ubd84\uc5d0 \uc774\uc81c \uc774\uc804 \uc815\ubcf4\ub97c \ud65c\uc6a9\ud558\uc5ec \ud6c4\uc18d \uc0dd\uc131 \ucf58\ud150\uce20\uc758 \ud45c\ud604\ub825\uc744 \ub192\uc774\uace0 \ucf58\ud150\uce20\uc758 \uc790\uc5f0\uc2a4\ub7ec\uc6c0\uc744 \ud5a5\uc0c1\uc2dc\ud0ac \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p>"},{"location":"ko/#_11","title":"\ube60\ub978 \uc74c\uc131 \ubcf5\uc81c","text":"<p>Fish Audio S2\ub294 \uc9e7\uc740 \ucc38\uc870 \uc0d8\ud50c(\uc77c\ubc18\uc801\uc73c\ub85c 10-30\ucd08)\uc744 \uc0ac\uc6a9\ud558\uc5ec \uc815\ud655\ud55c \uc74c\uc131 \ubcf5\uc81c\ub97c \uc9c0\uc6d0\ud569\ub2c8\ub2e4. 
\ubaa8\ub378\uc740 \uc74c\uc0c9, \ub9d0\ud558\uae30 \uc2a4\ud0c0\uc77c \ubc0f \uac10\uc815\uc801 \uacbd\ud5a5\uc744 \ucea1\ucc98\ud558\uc5ec \ucd94\uac00 \ubbf8\uc138 \uc870\uc815 \uc5c6\uc774 \uc0ac\uc2e4\uc801\uc774\uace0 \uc77c\uad00\ub41c \ubcf5\uc81c \uc74c\uc131\uc744 \uc0dd\uc131\ud569\ub2c8\ub2e4. SGLang \uc11c\ubc84 \uc0ac\uc6a9\uc740 SGLang-Omni README \ub97c \ucc38\uace0\ud558\uc138\uc694.</p>"},{"location":"ko/#_12","title":"\ud06c\ub808\ub527","text":"<ul> <li>VITS2 (daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"ko/#_13","title":"\uae30\uc220 \ubcf4\uace0\uc11c","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2603.08823}, \n}\n</code></pre>"},{"location":"ko/finetune/","title":"\ubbf8\uc138 \uc870\uc815 (Fine-tuning)","text":"<p>\uc774 \ud398\uc774\uc9c0\ub97c \uc5f4\uc5c8\ub2e4\ub294 \uac83\uc740, \uc0ac\uc804 \ud6c8\ub828\ub41c \ubaa8\ub378\uc758 \uc81c\ub85c\uc0f7(zero-shot) \uc131\ub2a5\uc5d0 \ub9cc\uc871\ud558\uc9c0 \ubabb\ud588\ub2e4\ub294 \uc758\ubbf8\uc77c \uac83\uc785\ub2c8\ub2e4. 
\uc5ec\ub7ec\ubd84\uc758 \ub370\uc774\ud130\uc14b\uc5d0\uc11c \ub354 \ub098\uc740 \uc131\ub2a5\uc744 \ub0b4\ub3c4\ub85d \ubaa8\ub378\uc744 \ubbf8\uc138 \uc870\uc815\ud558\uace0 \uc2f6\uc73c\uc2e4 \uac81\ub2c8\ub2e4.</p> <p>\ud604\uc7ac \ubc84\uc804\uc5d0\uc11c\ub294 'LLAMA' \ubd80\ubd84\ub9cc \ubbf8\uc138 \uc870\uc815\ud558\uba74 \ub429\ub2c8\ub2e4.</p>"},{"location":"ko/finetune/#llama","title":"LLAMA \ubbf8\uc138 \uc870\uc815","text":""},{"location":"ko/finetune/#1","title":"1. \ub370\uc774\ud130\uc14b \uc900\ube44","text":"<pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u2514\u2500\u2500 30.1-32.71.mp3\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u2514\u2500\u2500 38.79-40.85.mp3\n</code></pre> <p>\ub370\uc774\ud130\uc14b\uc744 \uc704 \ud615\uc2dd\uc73c\ub85c \ubcc0\ud658\ud558\uc5ec <code>data</code> \ud3f4\ub354 \uc544\ub798\uc5d0 \ubc30\uce58\ud574\uc57c \ud569\ub2c8\ub2e4. \uc624\ub514\uc624 \ud30c\uc77c \ud655\uc7a5\uc790\ub294 <code>.mp3</code>, <code>.wav</code> \ub610\ub294 <code>.flac</code>\uc77c \uc218 \uc788\uc73c\uba70, \uc8fc\uc11d \ud30c\uc77c \ud655\uc7a5\uc790\ub294 <code>.lab</code>\uc744 \uad8c\uc7a5\ud569\ub2c8\ub2e4.</p> <p>Info</p> <p><code>.lab</code> \uc8fc\uc11d \ud30c\uc77c\uc5d0\ub294 \uc624\ub514\uc624\uc758 \uc804\uc0ac \ud14d\uc2a4\ud2b8\ub9cc \ud3ec\ud568\ud558\uba74 \ub418\uba70, \ud2b9\ubcc4\ud55c \ud615\uc2dd \uc694\uad6c\uc0ac\ud56d\uc740 \uc5c6\uc2b5\ub2c8\ub2e4. 
\uc608\ub97c \ub4e4\uc5b4 <code>hi.mp3</code>\uc758 \ub0b4\uc6a9\uc774 \"\uc548\ub155\ud558\uc138\uc694, \uc548\ub155\ud788 \uac00\uc138\uc694.\"\ub77c\uba74, <code>hi.lab</code> \ud30c\uc77c\uc5d0\ub294 \"\uc548\ub155\ud558\uc138\uc694, \uc548\ub155\ud788 \uac00\uc138\uc694.\"\ub77c\ub294 \ud55c \uc904\uc758 \ud14d\uc2a4\ud2b8\ub9cc \ud3ec\ud568\ud558\uba74 \ub429\ub2c8\ub2e4.</p> <p>Warning</p> <p>\ub370\uc774\ud130\uc14b\uc5d0 \uc74c\ub7c9 \uc815\uaddc\ud654\ub97c \uc801\uc6a9\ud558\ub294 \uac83\uc774 \uc88b\uc2b5\ub2c8\ub2e4. \uc774\ub97c \uc704\ud574 fish-audio-preprocess\ub97c \uc0ac\uc6a9\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. <pre><code>fap loudness-norm data-raw data --clean\n</code></pre></p>"},{"location":"ko/finetune/#2","title":"2. \uc2dc\ub9e8\ud2f1 \ud1a0\ud070 \uc77c\uad04 \ucd94\ucd9c","text":"<p>VQGAN \uac00\uc911\uce58\ub97c \ub2e4\uc6b4\ub85c\ub4dc\ud588\ub294\uc9c0 \ud655\uc778\ud558\uc138\uc694. \uadf8\ub807\uc9c0 \uc54a\uc740 \uacbd\uc6b0 \ub2e4\uc74c \uba85\ub839\uc744 \uc2e4\ud589\ud558\uc138\uc694.</p> <pre><code>huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n</code></pre> <p>\uadf8\ub7f0 \ub2e4\uc74c \ub2e4\uc74c \uba85\ub839\uc744 \uc2e4\ud589\ud558\uc5ec \uc2dc\ub9e8\ud2f1 \ud1a0\ud070\uc744 \ucd94\ucd9c\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p> <pre><code>python tools/vqgan/extract_vq.py data \\\n --num-workers 1 --batch-size 16 \\\n --config-name \"modded_dac_vq\" \\\n --checkpoint-path \"checkpoints/openaudio-s1-mini/codec.pth\"\n</code></pre> <p>Note</p> <p><code>--num-workers</code>\uc640 <code>--batch-size</code>\ub97c \uc870\uc815\ud558\uc5ec \ucd94\ucd9c \uc18d\ub3c4\ub97c \ub192\uc77c \uc218 \uc788\uc9c0\ub9cc, GPU \uba54\ubaa8\ub9ac \ud55c\ub3c4\ub97c \ucd08\uacfc\ud558\uc9c0 \uc54a\ub3c4\ub85d \uc8fc\uc758\ud558\uc138\uc694.</p> <p>\uc774 \uba85\ub839\uc740 <code>data</code> \ub514\ub809\ud1a0\ub9ac\uc5d0 <code>.npy</code> \ud30c\uc77c\uc744 \uc0dd\uc131\ud569\ub2c8\ub2e4. 
\uacb0\uacfc\ub294 \ub2e4\uc74c\uacfc \uac19\uc2b5\ub2c8\ub2e4.</p> <pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 21.15-26.44.npy\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.npy\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u251c\u2500\u2500 30.1-32.71.mp3\n\u2502 \u2514\u2500\u2500 30.1-32.71.npy\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u251c\u2500\u2500 38.79-40.85.mp3\n \u2514\u2500\u2500 38.79-40.85.npy\n</code></pre>"},{"location":"ko/finetune/#3-protobuf","title":"3. \ub370\uc774\ud130\uc14b\uc744 protobuf\ub85c \ud328\ud0b9\ud558\uae30","text":"<pre><code>python tools/llama/build_dataset.py \\\n --input \"data\" \\\n --output \"data/protos\" \\\n --text-extension .lab \\\n --num-workers 16\n</code></pre> <p>\uba85\ub839 \uc2e4\ud589\uc774 \uc644\ub8cc\ub418\uba74 <code>data</code> \ub514\ub809\ud1a0\ub9ac\uc5d0\uc11c <code>protos</code> \ud30c\uc77c\uc744 \ubcfc \uc218 \uc788\uc5b4\uc57c \ud569\ub2c8\ub2e4.</p>"},{"location":"ko/finetune/#4-lora","title":"4. \ub9c8\uc9c0\ub9c9\uc73c\ub85c, LoRA\ub85c \ubbf8\uc138 \uc870\uc815\ud558\uae30","text":"<p>\ub9c8\ucc2c\uac00\uc9c0\ub85c, <code>LLAMA</code> \uac00\uc911\uce58\ub97c \ub2e4\uc6b4\ub85c\ub4dc\ud588\ub294\uc9c0 \ud655\uc778\ud558\uc138\uc694. 
\uadf8\ub807\uc9c0 \uc54a\uc740 \uacbd\uc6b0 \ub2e4\uc74c \uba85\ub839\uc744 \uc2e4\ud589\ud558\uc138\uc694.</p> <pre><code>huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n</code></pre> <p>\ub9c8\uc9c0\ub9c9\uc73c\ub85c, \ub2e4\uc74c \uba85\ub839\uc744 \uc2e4\ud589\ud558\uc5ec \ubbf8\uc138 \uc870\uc815\uc744 \uc2dc\uc791\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p> <pre><code>python fish_speech/train.py --config-name text2semantic_finetune \\\n project=$project \\\n +lora@model.model.lora_config=r_8_alpha_16\n</code></pre> <p>Note</p> <p><code>fish_speech/configs/text2semantic_finetune.yaml</code> \ud30c\uc77c\uc744 \uc218\uc815\ud558\uc5ec <code>batch_size</code>, <code>gradient_accumulation_steps</code> \ub4f1 \ud6c8\ub828 \ub9e4\uac1c\ubcc0\uc218\ub97c GPU \uba54\ubaa8\ub9ac\uc5d0 \ub9de\uac8c \uc870\uc815\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p> <p>Note</p> <p>Windows \uc0ac\uc6a9\uc790\uc758 \uacbd\uc6b0, <code>trainer.strategy.process_group_backend=gloo</code>\ub97c \uc0ac\uc6a9\ud558\uc5ec <code>nccl</code> \uad00\ub828 \ubb38\uc81c\ub97c \ud53c\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p> <p>\ud6c8\ub828\uc774 \uc644\ub8cc\ub418\uba74 \ucd94\ub860 \uc139\uc158\uc744 \ucc38\uc870\ud558\uc5ec \ubaa8\ub378\uc744 \ud14c\uc2a4\ud2b8\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p> <p>Info</p> <p>\uae30\ubcf8 \uc124\uc815\uc5d0\uc11c\ub294 \ubaa8\ub378\uc774 \ud654\uc790\uc758 \ubc1c\uc74c \ubc29\uc2dd\ub9cc \ud559\uc2b5\ud558\uace0 \uc74c\uc0c9\uc740 \ud559\uc2b5\ud558\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. \uc74c\uc0c9 \uc548\uc815\uc131\uc744 \ubcf4\uc7a5\ud558\ub824\uba74 \uc5ec\uc804\ud788 \ud504\ub86c\ud504\ud2b8\ub97c \uc0ac\uc6a9\ud574\uc57c \ud569\ub2c8\ub2e4. 
\uc74c\uc0c9\uc744 \ud559\uc2b5\uc2dc\ud0a4\uace0 \uc2f6\ub2e4\uba74 \ud6c8\ub828 \uc2a4\ud15d \uc218\ub97c \ub298\ub9ac\ub418, \uc774\ub294 \uacfc\uc801\ud569(overfitting)\uc73c\ub85c \uc774\uc5b4\uc9c8 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p> <p>\ud6c8\ub828 \ud6c4, \ucd94\ub860\uc744 \uc218\ud589\ud558\uae30 \uc804\uc5d0 LoRA \uac00\uc911\uce58\ub97c \uc77c\ubc18 \uac00\uc911\uce58\ub85c \ubcc0\ud658\ud574\uc57c \ud569\ub2c8\ub2e4.</p> <pre><code>python tools/llama/merge_lora.py \\\n --lora-config r_8_alpha_16 \\\n --base-weight checkpoints/openaudio-s1-mini \\\n --lora-weight results/$project/checkpoints/step_000000010.ckpt \\\n --output checkpoints/openaudio-s1-mini-yth-lora/\n</code></pre> <p>Note</p> <p>\ub2e4\ub978 \uccb4\ud06c\ud3ec\uc778\ud2b8\ub97c \uc2dc\ub3c4\ud574 \ubcfc \uc218\ub3c4 \uc788\uc2b5\ub2c8\ub2e4. \uc694\uad6c \uc0ac\ud56d\uc744 \ucda9\uc871\ud558\ub294 \uac00\uc7a5 \uc774\ub978 \uccb4\ud06c\ud3ec\uc778\ud2b8\ub97c \uc0ac\uc6a9\ud558\ub294 \uac83\uc774 \uc88b\uc2b5\ub2c8\ub2e4. \uc774\ub7ec\ud55c \uccb4\ud06c\ud3ec\uc778\ud2b8\ub294 \ubcf4\ud1b5 OOD(\ubd84\ud3ec \uc678) \ub370\uc774\ud130\uc5d0\uc11c \ub354 \ub098\uc740 \uc131\ub2a5\uc744 \ubcf4\uc785\ub2c8\ub2e4.</p>"},{"location":"ko/inference/","title":"\ucd94\ub860","text":"<p>Fish Audio S2 \ubaa8\ub378\uc740 \ud070 \ube44\ub514\uc624 \uba54\ubaa8\ub9ac(VRAM)\uac00 \ud544\uc694\ud569\ub2c8\ub2e4. 
\ucd94\ub860\uc744 \uc704\ud574 \ucd5c\uc18c 24GB \uc774\uc0c1\uc758 GPU\ub97c \uc0ac\uc6a9\ud558\ub294 \uac83\uc744 \uad8c\uc7a5\ud569\ub2c8\ub2e4.</p>"},{"location":"ko/inference/#_2","title":"\uac00\uc911\uce58 \ub2e4\uc6b4\ub85c\ub4dc","text":"<p>\uba3c\uc800 \ubaa8\ub378 \uac00\uc911\uce58\ub97c \ub2e4\uc6b4\ub85c\ub4dc\ud574\uc57c \ud569\ub2c8\ub2e4:</p> <pre><code>hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n</code></pre>"},{"location":"ko/inference/#_3","title":"\uba85\ub839\uc904 \ucd94\ub860","text":"<p>Note</p> <p>\ubaa8\ub378\uc774 \uc74c\uc0c9\uc744 \ubb34\uc791\uc704\ub85c \uc120\ud0dd\ud558\uac8c \ud558\ub824\uba74 \uc774 \ub2e8\uacc4\ub97c \uac74\ub108\ub6f8 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p>"},{"location":"ko/inference/#1-vq","title":"1. \ucc38\uc870 \uc624\ub514\uc624\uc5d0\uc11c VQ \ud1a0\ud070 \uac00\uc838\uc624\uae30","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"test.wav\" \\\n --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n</code></pre> <p><code>fake.npy</code>\uc640 <code>fake.wav</code> \ud30c\uc77c\uc774 \uc0dd\uc131\ub429\ub2c8\ub2e4.</p>"},{"location":"ko/inference/#2-semantic","title":"2. \ud14d\uc2a4\ud2b8\uc5d0\uc11c Semantic \ud1a0\ud070 \uc0dd\uc131:","text":"<pre><code>python fish_speech/models/text2semantic/inference.py \\\n --text \"\ubcc0\ud658\ud558\ub824\ub294 \ud14d\uc2a4\ud2b8\" \\\n --prompt-text \"\ucc38\uc870 \ud14d\uc2a4\ud2b8\" \\\n --prompt-tokens \"fake.npy\" \\\n # --compile\n</code></pre> <p>\uc774 \uba85\ub839\uc740 \uc791\uc5c5 \ub514\ub809\ud1a0\ub9ac\uc5d0 <code>codes_N</code> \ud30c\uc77c\uc744 \uc0dd\uc131\ud569\ub2c8\ub2e4. 
\uc5ec\uae30\uc11c N\uc740 0\ubd80\ud130 \uc2dc\uc791\ud558\ub294 \uc815\uc218\uc785\ub2c8\ub2e4.</p> <p>Note</p> <p>\ub354 \ube60\ub978 \ucd94\ub860\uc744 \uc704\ud574 CUDA \ucee4\ub110\uc744 \ubcd1\ud569\ud558\ub294 <code>--compile</code>\uc744 \uc0ac\uc6a9\ud558\uace0 \uc2f6\uc744 \uc218 \uc788\uc9c0\ub9cc, \ub2f9\uc0ac\uc758 sglang \ucd94\ub860 \uac00\uc18d \ucd5c\uc801\ud654\ub97c \uc0ac\uc6a9\ud558\ub294 \uac83\uc744 \ub354 \uad8c\uc7a5\ud569\ub2c8\ub2e4. \ub9c8\ucc2c\uac00\uc9c0\ub85c \uac00\uc18d\uc744 \uc0ac\uc6a9\ud560 \uacc4\ud68d\uc774 \uc5c6\ub2e4\uba74 <code>--compile</code> \ub9e4\uac1c\ubcc0\uc218\ub97c \uc8fc\uc11d \ucc98\ub9ac\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p> <p>Info</p> <p>bf16\uc744 \uc9c0\uc6d0\ud558\uc9c0 \uc54a\ub294 GPU\uc758 \uacbd\uc6b0 <code>--half</code> \ub9e4\uac1c\ubcc0\uc218\ub97c \uc0ac\uc6a9\ud574\uc57c \ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p>"},{"location":"ko/inference/#3","title":"3. \uc2dc\ub9e8\ud2f1 \ud1a0\ud070\uc5d0\uc11c \uc74c\uc131 \uc0dd\uc131:","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"codes_0.npy\" \\\n</code></pre> <p>\uc774\ud6c4 <code>fake.wav</code> \ud30c\uc77c\uc744 \uc5bb\uac8c \ub429\ub2c8\ub2e4.</p>"},{"location":"ko/inference/#webui","title":"WebUI \ucd94\ub860","text":""},{"location":"ko/inference/#1-gradio-webui","title":"1. Gradio WebUI","text":"<p>\ud638\ud658\uc131\uc744 \uc720\uc9c0\ud558\uae30 \uc704\ud574 \uae30\uc874\uc758 Gradio WebUI\ub97c \ubcf4\uc874\ud558\uace0 \uc788\uc2b5\ub2c8\ub2e4.</p> <pre><code>python tools/run_webui.py # \uac00\uc18d\uc774 \ud544\uc694\ud55c \uacbd\uc6b0 --compile\n</code></pre>"},{"location":"ko/inference/#2-awesome-webui","title":"2. 
Awesome WebUI","text":"<p>Awesome WebUI\ub294 TypeScript \uae30\ubc18\uc73c\ub85c \uac1c\ubc1c\ub41c \ud604\ub300\uc801\uc778 \uc6f9 \uc778\ud130\ud398\uc774\uc2a4\ub85c, \ub354 \ud48d\ubd80\ud55c \uae30\ub2a5\uacfc \ud5a5\uc0c1\ub41c \uc0ac\uc6a9\uc790 \uacbd\ud5d8\uc744 \uc81c\uacf5\ud569\ub2c8\ub2e4.</p> <p>WebUI \ube4c\ub4dc:</p> <p>\ub85c\uceec \ub610\ub294 \uc11c\ubc84\uc5d0 Node.js\uc640 npm\uc774 \uc124\uce58\ub418\uc5b4 \uc788\uc5b4\uc57c \ud569\ub2c8\ub2e4.</p> <ol> <li><code>awesome_webui</code> \ub514\ub809\ud1a0\ub9ac\ub85c \uc774\ub3d9\ud569\ub2c8\ub2e4: <pre><code>cd awesome_webui\n</code></pre></li> <li>\uc758\uc874\uc131 \uc124\uce58: <pre><code>npm install\n</code></pre></li> <li>WebUI \ube4c\ub4dc: <pre><code>npm run build\n</code></pre></li> </ol> <p>\ubc31\uc5d4\ub4dc \uc11c\ubc84 \uc2e4\ud589:</p> <p>WebUI \ube4c\ub4dc\uac00 \uc644\ub8cc\ub418\uba74 \ud504\ub85c\uc81d\ud2b8 \ub8e8\ud2b8\ub85c \ub3cc\uc544\uac00 API \uc11c\ubc84\ub97c \uc2e4\ud589\ud569\ub2c8\ub2e4:</p> <pre><code>python tools/api_server.py --listen 0.0.0.0:8888 --compile\n</code></pre> <p>\uc811\uc18d:</p> <p>\uc11c\ubc84\uac00 \uc2e4\ud589\ub41c \ud6c4 \ube0c\ub77c\uc6b0\uc800\ub97c \ud1b5\ud574 \ub2e4\uc74c \uc8fc\uc18c\ub85c \uc811\uc18d\ud558\uba74 \uccb4\ud5d8\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4: <code>http://localhost:8888/ui</code></p>"},{"location":"ko/install/","title":"\uc124\uce58","text":""},{"location":"ko/install/#_1","title":"\uc694\uad6c \uc0ac\uc591","text":"<ul> <li>GPU \uba54\ubaa8\ub9ac: 24GB (\ucd94\ub860 \uc2dc)</li> <li>\uc2dc\uc2a4\ud15c: Linux, WSL</li> </ul>"},{"location":"ko/install/#_2","title":"\uc2dc\uc2a4\ud15c \uc124\uc815","text":"<p>Fish Audio S2\ub294 \ub2e4\uc591\ud55c \uc124\uce58 \ubc29\ubc95\uc744 \uc9c0\uc6d0\ud569\ub2c8\ub2e4. 
\uc790\uc2e0\uc758 \uac1c\ubc1c \ud658\uacbd\uc5d0 \uac00\uc7a5 \uc801\ud569\ud55c \ubc29\ubc95\uc744 \uc120\ud0dd\ud558\uc138\uc694.</p> <p>\uc0ac\uc804 \uc694\uad6c\uc0ac\ud56d: \uc624\ub514\uc624 \ucc98\ub9ac\ub97c \uc704\ud55c \uc2dc\uc2a4\ud15c \uc758\uc874\uc131\uc744 \uc124\uce58\ud569\ub2c8\ub2e4: <pre><code>apt install portaudio19-dev libsox-dev ffmpeg\n</code></pre></p>"},{"location":"ko/install/#conda","title":"Conda","text":"<pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# GPU \ubc84\uc804 \uc124\uce58 (CUDA \ubc84\uc804 \uc120\ud0dd: cu126, cu128, cu129)\npip install -e .[cu129]\n\n# CPU \ubc84\uc804\ub9cc \uc124\uce58\npip install -e .[cpu]\n\n# \uae30\ubcf8 \uc124\uce58 (PyTorch \uae30\ubcf8 \uc778\ub371\uc2a4 \uc0ac\uc6a9)\npip install -e .\n\n# pyaudio \uc124\uce58 \uc911 \uc624\ub958\uac00 \ubc1c\uc0dd\ud558\uba74 \ub2e4\uc74c \uba85\ub839\uc744 \uc0ac\uc6a9\ud574 \ubcf4\uc138\uc694:\n# conda install pyaudio\n# \uadf8\ub7f0 \ub2e4\uc74c pip install -e . 
\ub97c \ub2e4\uc2dc \uc2e4\ud589\ud558\uc138\uc694\n</code></pre>"},{"location":"ko/install/#uv","title":"UV","text":"<p>UV\ub294 \ub354 \ube60\ub978 \uc758\uc874\uc131 \ud574\uacb0 \ubc0f \uc124\uce58\ub97c \uc81c\uacf5\ud569\ub2c8\ub2e4:</p> <pre><code># GPU \ubc84\uc804 \uc124\uce58 (CUDA \ubc84\uc804 \uc120\ud0dd: cu126, cu128, cu129)\nuv sync --python 3.12 --extra cu129\n\n# CPU \ubc84\uc804\ub9cc \uc124\uce58\nuv sync --python 3.12 --extra cpu\n</code></pre>"},{"location":"ko/install/#intel-arc-xpu","title":"Intel Arc XPU \uc9c0\uc6d0","text":"<p>Intel Arc GPU \uc0ac\uc6a9\uc790\ub294 \ub2e4\uc74c\uc744 \ud1b5\ud574 XPU \uc9c0\uc6d0\uc744 \uc124\uce58\ud558\uc138\uc694:</p> <pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# \ud544\uc694\ud55c C++ \ud45c\uc900 \ub77c\uc774\ube0c\ub7ec\ub9ac \uc124\uce58\nconda install libstdcxx -c conda-forge\n\n# Intel XPU\ub97c \uc9c0\uc6d0\ud558\ub294 PyTorch \uc124\uce58\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# Fish Speech \uc124\uce58\npip install -e .\n</code></pre> <p>Warning</p> <p><code>compile</code> \uc635\uc158\uc740 Windows\uc640 macOS\uc5d0\uc11c \uc9c0\uc6d0\ub418\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. \ucef4\ud30c\uc77c\uc744 \ud65c\uc131\ud654\ud558\uc5ec \uc2e4\ud589\ud558\ub824\uba74 Triton\uc744 \uc9c1\uc811 \uc124\uce58\ud574\uc57c \ud569\ub2c8\ub2e4.</p>"},{"location":"ko/install/#docker","title":"Docker \uc124\uc815","text":"<p>Fish Audio S2 \uc2dc\ub9ac\uc988 \ubaa8\ub378\uc740 \ub2e4\uc591\ud55c \uc694\uad6c\uc5d0 \ubd80\uc751\ud558\uae30 \uc704\ud574 \uc5ec\ub7ec Docker \ubc30\ud3ec \uc635\uc158\uc744 \uc81c\uacf5\ud569\ub2c8\ub2e4. 
Docker Hub\uc758 \uc0ac\uc804 \ube4c\ub4dc\ub41c \uc774\ubbf8\uc9c0\ub97c \uc0ac\uc6a9\ud558\uac70\ub098, Docker Compose\ub85c \ub85c\uceec\uc5d0\uc11c \ube4c\ub4dc\ud558\uac70\ub098, \uc218\ub3d9\uc73c\ub85c \uc0ac\uc6a9\uc790 \uc815\uc758 \uc774\ubbf8\uc9c0\ub97c \ube4c\ub4dc\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p> <p>WebUI\uc640 API \uc11c\ubc84 \ubaa8\ub450\uc5d0 \ub300\ud574 GPU(\uae30\ubcf8\uac12 CUDA 12.6) \ubc0f CPU \ubc84\uc804\uc758 Docker \uc774\ubbf8\uc9c0\ub97c \uc81c\uacf5\ud569\ub2c8\ub2e4. Docker Hub\uc758 \uc0ac\uc804 \ube4c\ub4dc\ub41c \uc774\ubbf8\uc9c0\ub97c \uc0ac\uc6a9\ud558\uac70\ub098, Docker Compose\ub85c \ub85c\uceec\uc5d0\uc11c \ube4c\ub4dc\ud558\uac70\ub098, \uc218\ub3d9\uc73c\ub85c \uc0ac\uc6a9\uc790 \uc815\uc758 \uc774\ubbf8\uc9c0\ub97c \ube4c\ub4dc\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. \ub85c\uceec\uc5d0\uc11c \ube4c\ub4dc\ud558\ub824\uba74 \uc544\ub798 \uc9c0\uce68\uc744 \ub530\ub974\uc138\uc694. \uc0ac\uc804 \ube4c\ub4dc\ub41c \uc774\ubbf8\uc9c0\ub97c \uc0ac\uc6a9\ud558\ub824\uba74 \ucd94\ub860 \uac00\uc774\ub4dc\ub97c \uc9c1\uc811 \ucc38\uc870\ud558\uc138\uc694.</p>"},{"location":"ko/install/#_3","title":"\uc0ac\uc804 \uc694\uad6c\uc0ac\ud56d","text":"<ul> <li>Docker \ubc0f Docker Compose \uc124\uce58</li> <li>NVIDIA Docker \ub7f0\ud0c0\uc784 \uc124\uce58 (GPU \uc9c0\uc6d0\uc6a9)</li> <li>CUDA \ucd94\ub860\uc744 \uc704\ud55c \ucd5c\uc18c 24GB\uc758 GPU \uba54\ubaa8\ub9ac</li> </ul>"},{"location":"ko/install/#docker-compose","title":"Docker Compose \uc0ac\uc6a9","text":"<p>\uac1c\ubc1c \ub610\ub294 \uc0ac\uc6a9\uc790 \uc815\uc758\ub97c \uc704\ud574 Docker Compose\ub97c \uc0ac\uc6a9\ud558\uc5ec \ub85c\uceec\uc5d0\uc11c \ube4c\ub4dc\ud558\uace0 \uc2e4\ud589\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4:</p> <pre><code># \uba3c\uc800 \ub9ac\ud3ec\uc9c0\ud1a0\ub9ac\ub97c \ud074\ub860\ud569\ub2c8\ub2e4\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# CUDA\ub85c WebUI \uc2dc\uc791\ndocker compose --profile webui 
up\n\n# \ucef4\ud30c\uc77c \ucd5c\uc801\ud654\ub85c WebUI \uc2dc\uc791\nCOMPILE=1 docker compose --profile webui up\n\n# API \uc11c\ubc84 \uc2dc\uc791\ndocker compose --profile server up\n\n# \ucef4\ud30c\uc77c \ucd5c\uc801\ud654\ub85c API \uc11c\ubc84 \uc2dc\uc791\nCOMPILE=1 docker compose --profile server up\n\n# CPU \uc804\uc6a9 \ubc30\ud3ec\nBACKEND=cpu docker compose --profile webui up\n</code></pre>"},{"location":"ko/install/#docker-compose_1","title":"Docker Compose \ud658\uacbd \ubcc0\uc218","text":"<p>\ud658\uacbd \ubcc0\uc218\ub97c \uc0ac\uc6a9\ud558\uc5ec \ubc30\ud3ec\ub97c \uc0ac\uc6a9\uc790 \uc815\uc758\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4:</p> <pre><code># .env \ud30c\uc77c \uc608\uc2dc\nBACKEND=cuda # \ub610\ub294 cpu\nCOMPILE=1 # \ucef4\ud30c\uc77c \ucd5c\uc801\ud654 \ud65c\uc131\ud654\nGRADIO_PORT=7860 # WebUI \ud3ec\ud2b8\nAPI_PORT=8080 # API \uc11c\ubc84 \ud3ec\ud2b8\nUV_VERSION=0.8.15 # UV \ud328\ud0a4\uc9c0 \uad00\ub9ac\uc790 \ubc84\uc804\n</code></pre> <p>\uc774 \uba85\ub839\uc740 \uc774\ubbf8\uc9c0\ub97c \ube4c\ub4dc\ud558\uace0 \ucee8\ud14c\uc774\ub108\ub97c \uc2e4\ud589\ud569\ub2c8\ub2e4. 
WebUI\ub294 <code>http://localhost:7860</code>\uc5d0\uc11c, API \uc11c\ubc84\ub294 <code>http://localhost:8080</code>\uc5d0\uc11c \uc811\uadfc\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.</p>"},{"location":"ko/install/#docker_1","title":"\uc218\ub3d9 Docker \ube4c\ub4dc","text":"<p>\ube4c\ub4dc \ud504\ub85c\uc138\uc2a4\ub97c \uc0ac\uc6a9\uc790 \uc815\uc758\ud558\ub824\ub294 \uace0\uae09 \uc0ac\uc6a9\uc790\ub97c \uc704\ud574:</p> <pre><code># CUDA\ub97c \uc9c0\uc6d0\ud558\ub294 WebUI \uc774\ubbf8\uc9c0 \ube4c\ub4dc\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.6.0 \\\n --build-arg UV_EXTRA=cu126 \\\n --target webui \\\n -t fish-speech-webui:cuda .\n\n# CUDA\ub97c \uc9c0\uc6d0\ud558\ub294 API \uc11c\ubc84 \uc774\ubbf8\uc9c0 \ube4c\ub4dc\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.6.0 \\\n --build-arg UV_EXTRA=cu126 \\\n --target server \\\n -t fish-speech-server:cuda .\n\n# CPU \uc804\uc6a9 \uc774\ubbf8\uc9c0 \ube4c\ub4dc (\uba40\ud2f0 \ud50c\ub7ab\ud3fc \uc9c0\uc6d0)\ndocker build \\\n --platform linux/amd64,linux/arm64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cpu \\\n --target webui \\\n -t fish-speech-webui:cpu .\n\n# \uac1c\ubc1c\uc6a9 \uc774\ubbf8\uc9c0 \ube4c\ub4dc\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --target dev \\\n -t fish-speech-dev:cuda .\n</code></pre>"},{"location":"ko/install/#_4","title":"\ube4c\ub4dc \uc778\uc790","text":"<ul> <li><code>BACKEND</code>: <code>cuda</code> \ub610\ub294 <code>cpu</code> (\uae30\ubcf8\uac12: <code>cuda</code>)</li> <li><code>CUDA_VER</code>: CUDA \ubc84\uc804 (\uae30\ubcf8\uac12: <code>12.6.0</code>)</li> <li><code>UV_EXTRA</code>: CUDA\uc6a9 UV \ucd94\uac00 \ud328\ud0a4\uc9c0 (\uae30\ubcf8\uac12: <code>cu126</code>)</li> <li><code>UBUNTU_VER</code>: Ubuntu \ubc84\uc804 (\uae30\ubcf8\uac12: 
<code>24.04</code>)</li> <li><code>PY_VER</code>: Python \ubc84\uc804 (\uae30\ubcf8\uac12: <code>3.12</code>)</li> </ul>"},{"location":"ko/install/#_5","title":"\ubcfc\ub968 \ub9c8\uc6b4\ud2b8","text":"<p>\ub450 \ubc29\ubc95 \ubaa8\ub450 \ub2e4\uc74c \ub514\ub809\ud1a0\ub9ac\ub97c \ub9c8\uc6b4\ud2b8\ud574\uc57c \ud569\ub2c8\ub2e4:</p> <ul> <li><code>./checkpoints:/app/checkpoints</code> - \ubaa8\ub378 \uac00\uc911\uce58 \ub514\ub809\ud1a0\ub9ac</li> <li><code>./references:/app/references</code> - \ucc38\uc870 \uc624\ub514\uc624 \ud30c\uc77c \ub514\ub809\ud1a0\ub9ac</li> </ul>"},{"location":"ko/install/#_6","title":"\ud658\uacbd \ubcc0\uc218","text":"<ul> <li><code>COMPILE=1</code> - <code>torch.compile</code>\uc744 \ud65c\uc131\ud654\ud558\uc5ec \ucd94\ub860 \uc18d\ub3c4 \ud5a5\uc0c1 (\uc57d 10\ubc30)</li> <li><code>GRADIO_SERVER_NAME=0.0.0.0</code> - WebUI \uc11c\ubc84 \ud638\uc2a4\ud2b8</li> <li><code>GRADIO_SERVER_PORT=7860</code> - WebUI \uc11c\ubc84 \ud3ec\ud2b8</li> <li><code>API_SERVER_NAME=0.0.0.0</code> - API \uc11c\ubc84 \ud638\uc2a4\ud2b8</li> <li><code>API_SERVER_PORT=8080</code> - API \uc11c\ubc84 \ud3ec\ud2b8</li> </ul> <p>Note</p> <p>Docker \ucee8\ud14c\uc774\ub108\ub294 \ubaa8\ub378 \uac00\uc911\uce58\uac00 <code>/app/checkpoints</code>\uc5d0 \ub9c8\uc6b4\ud2b8\ub420 \uac83\uc73c\ub85c \uc608\uc0c1\ud569\ub2c8\ub2e4. \ucee8\ud14c\uc774\ub108\ub97c \uc2dc\uc791\ud558\uae30 \uc804\uc5d0 \ud544\uc694\ud55c \ubaa8\ub378 \uac00\uc911\uce58\ub97c \ub2e4\uc6b4\ub85c\ub4dc\ud588\ub294\uc9c0 \ud655\uc778\ud558\uc138\uc694.</p> <p>Warning</p> <p>GPU \uc9c0\uc6d0\uc5d0\ub294 NVIDIA Docker \ub7f0\ud0c0\uc784\uc774 \ud544\uc694\ud569\ub2c8\ub2e4. 
CPU \uc804\uc6a9 \ubc30\ud3ec\uc758 \uacbd\uc6b0 <code>--gpus all</code> \ud50c\ub798\uadf8\ub97c \uc81c\uac70\ud558\uace0 CPU \uc774\ubbf8\uc9c0\ub97c \uc0ac\uc6a9\ud558\uc138\uc694.</p>"},{"location":"ar/","title":"\u0645\u0642\u062f\u0645\u0629","text":"Fish Speech <p>English | \u7b80\u4f53\u4e2d\u6587 | Portuguese | \u65e5\u672c\u8a9e | \ud55c\uad6d\uc5b4 | \u0627\u0644\u0639\u0631\u0628\u064a\u0629</p> <p></p> <p></p> <p>\u062a\u0646\u0628\u064a\u0647 \u0627\u0644\u062a\u0631\u062e\u064a\u0635</p> <p>\u064a\u062a\u0645 \u0625\u0635\u062f\u0627\u0631 \u0642\u0627\u0639\u062f\u0629 \u0627\u0644\u0623\u0643\u0648\u0627\u062f \u0647\u0630\u0647 \u0648\u0623\u0648\u0632\u0627\u0646 \u0627\u0644\u0646\u0645\u0627\u0630\u062c \u0627\u0644\u0645\u0631\u062a\u0628\u0637\u0629 \u0628\u0647\u0627 \u0628\u0645\u0648\u062c\u0628 \u0631\u062e\u0635\u0629 FISH AUDIO RESEARCH LICENSE. \u064a\u0631\u062c\u0649 \u0627\u0644\u0631\u062c\u0648\u0639 \u0625\u0644\u0649 LICENSE \u0644\u0645\u0632\u064a\u062f \u0645\u0646 \u0627\u0644\u062a\u0641\u0627\u0635\u064a\u0644.</p> <p>\u0625\u062e\u0644\u0627\u0621 \u0627\u0644\u0645\u0633\u0624\u0648\u0644\u064a\u0629 \u0627\u0644\u0642\u0627\u0646\u0648\u0646\u064a\u0629</p> <p>\u0646\u062d\u0646 \u0644\u0627 \u0646\u062a\u062d\u0645\u0644 \u0623\u064a \u0645\u0633\u0624\u0648\u0644\u064a\u0629 \u0639\u0646 \u0623\u064a \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u063a\u064a\u0631 \u0642\u0627\u0646\u0648\u0646\u064a \u0644\u0642\u0627\u0639\u062f\u0629 \u0627\u0644\u0623\u0643\u0648\u0627\u062f. 
\u064a\u0631\u062c\u0649 \u0645\u0631\u0627\u062c\u0639\u0629 \u0627\u0644\u0642\u0648\u0627\u0646\u064a\u0646 \u0627\u0644\u0645\u062d\u0644\u064a\u0629 \u0627\u0644\u0645\u062a\u0639\u0644\u0642\u0629 \u0628\u0640 DMCA \u0648\u0627\u0644\u0642\u0648\u0627\u0646\u064a\u0646 \u0627\u0644\u0623\u062e\u0631\u0649 \u0630\u0627\u062a \u0627\u0644\u0635\u0644\u0629.</p>"},{"location":"ar/#_1","title":"\u0627\u0644\u0628\u062f\u0621 \u0627\u0644\u0633\u0631\u064a\u0639","text":""},{"location":"ar/#_2","title":"\u0627\u0628\u062f\u0623 \u0645\u0646 \u0627\u0644\u0648\u062b\u0627\u0626\u0642","text":"<p>\u0647\u0630\u0647 \u0647\u064a \u0627\u0644\u0648\u062b\u0627\u0626\u0642 \u0627\u0644\u0631\u0633\u0645\u064a\u0629 \u0644\u0640 Fish Audio S2\u060c \u0648\u064a\u0645\u0643\u0646\u0643 \u0627\u0644\u0628\u062f\u0621 \u0645\u0628\u0627\u0634\u0631\u0629 \u0639\u0628\u0631 \u0627\u0644\u0631\u0648\u0627\u0628\u0637 \u0627\u0644\u062a\u0627\u0644\u064a\u0629:</p> <ul> <li>\u0627\u0644\u062a\u062b\u0628\u064a\u062a</li> <li>\u0627\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644 \u0639\u0628\u0631 \u0633\u0637\u0631 \u0627\u0644\u0623\u0648\u0627\u0645\u0631</li> <li>\u0627\u0633\u062a\u062f\u0644\u0627\u0644 WebUI</li> <li>\u0627\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644 \u0639\u0628\u0631 \u0627\u0644\u062e\u0627\u062f\u0645</li> <li>\u0625\u0639\u062f\u0627\u062f Docker</li> </ul> <p>[!IMPORTANT] \u0628\u0627\u0644\u0646\u0633\u0628\u0629 \u0644\u062e\u0627\u062f\u0645 SGLang\u060c \u0631\u0627\u062c\u0639 SGLang-Omni README.</p>"},{"location":"ar/#llm","title":"\u062f\u0644\u064a\u0644 \u0648\u0643\u0644\u0627\u0621 LLM","text":"<pre><code>\u0642\u0645 \u0628\u062a\u062b\u0628\u064a\u062a \u0648\u0625\u0639\u062f\u0627\u062f Fish Audio S2 \u0628\u0627\u062a\u0628\u0627\u0639 \u0627\u0644\u062a\u0639\u0644\u064a\u0645\u0627\u062a \u0641\u064a https://speech.fish.audio/ar/install/ .\n</code></pre>"},{"location":"ar/#fish-audio-s2","title":"Fish Audio 
S2","text":"<p>\u0623\u0641\u0636\u0644 \u0646\u0638\u0627\u0645 \u0644\u062a\u062d\u0648\u064a\u0644 \u0627\u0644\u0646\u0635 \u0625\u0644\u0649 \u0643\u0644\u0627\u0645 \u0628\u064a\u0646 \u0627\u0644\u0623\u0646\u0638\u0645\u0629 \u0645\u0641\u062a\u0648\u062d\u0629 \u0627\u0644\u0645\u0635\u062f\u0631 \u0648\u0645\u063a\u0644\u0642\u0629 \u0627\u0644\u0645\u0635\u062f\u0631</p> <p>Fish Audio S2 \u0647\u0648 \u0623\u062d\u062f\u062b \u0646\u0645\u0648\u0630\u062c \u0645\u0646 Fish Audio. \u062a\u0645 \u062a\u062f\u0631\u064a\u0628\u0647 \u0639\u0644\u0649 \u0623\u0643\u062b\u0631 \u0645\u0646 10 \u0645\u0644\u0627\u064a\u064a\u0646 \u0633\u0627\u0639\u0629 \u0635\u0648\u062a\u064a\u0629 \u0639\u0628\u0631 \u0646\u062d\u0648 50 \u0644\u063a\u0629\u060c \u0648\u064a\u062c\u0645\u0639 \u0628\u064a\u0646 \u0627\u0644\u0645\u0648\u0627\u0621\u0645\u0629 \u0628\u0627\u0644\u062a\u0639\u0644\u0645 \u0627\u0644\u0645\u0639\u0632\u0632 \u0648\u0628\u0646\u064a\u0629 Dual-Autoregressive \u0644\u0625\u0646\u062a\u0627\u062c \u0643\u0644\u0627\u0645 \u0637\u0628\u064a\u0639\u064a \u0648\u0648\u0627\u0642\u0639\u064a \u0648\u063a\u0646\u064a \u0628\u0627\u0644\u062a\u0639\u0628\u064a\u0631 \u0627\u0644\u0639\u0627\u0637\u0641\u064a.</p> <p>\u064a\u062f\u0639\u0645 S2 \u0627\u0644\u062a\u062d\u0643\u0645 \u0627\u0644\u062f\u0642\u064a\u0642 \u0641\u064a \u0627\u0644\u0646\u0628\u0631\u0629 \u0648\u0627\u0644\u0639\u0627\u0637\u0641\u0629 \u062f\u0627\u062e\u0644 \u0627\u0644\u0646\u0635 \u0646\u0641\u0633\u0647 \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0648\u0633\u0648\u0645 \u0628\u0627\u0644\u0644\u063a\u0629 \u0627\u0644\u0637\u0628\u064a\u0639\u064a\u0629 \u0645\u062b\u0644 <code>[laugh]</code> \u0648<code>[whispers]</code> \u0648<code>[super happy]</code>\u060c \u0643\u0645\u0627 \u064a\u062f\u0639\u0645 \u0628\u0634\u0643\u0644 \u0623\u0635\u064a\u0644 \u062a\u0648\u0644\u064a\u062f \u0645\u062a\u062d\u062f\u062b\u064a\u0646 
\u0645\u062a\u0639\u062f\u062f\u064a\u0646 \u0648\u062d\u0648\u0627\u0631\u0627\u062a \u0645\u062a\u0639\u062f\u062f\u0629 \u0627\u0644\u0623\u062f\u0648\u0627\u0631.</p> <p>\u064a\u0645\u0643\u0646\u0643 \u062a\u062c\u0631\u0628\u0629 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0645\u0628\u0627\u0634\u0631\u0629 \u0639\u0628\u0631 \u0645\u0648\u0642\u0639 Fish Audio\u060c \u0648\u0642\u0631\u0627\u0621\u0629 \u0627\u0644\u0645\u0632\u064a\u062f \u0641\u064a \u0645\u0646\u0634\u0648\u0631 \u0627\u0644\u0645\u062f\u0648\u0646\u0629 \u0648\u0627\u0644\u062a\u0642\u0631\u064a\u0631 \u0627\u0644\u062a\u0642\u0646\u064a.</p>"},{"location":"ar/#_3","title":"\u0625\u0635\u062f\u0627\u0631\u0627\u062a \u0627\u0644\u0646\u0645\u0648\u0630\u062c","text":"\u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0627\u0644\u062d\u062c\u0645 \u0627\u0644\u062a\u0648\u0641\u0631 \u0627\u0644\u0648\u0635\u0641 S2-Pro 4B \u0645\u0639\u0644\u0645\u0629 HuggingFace \u0646\u0645\u0648\u0630\u062c \u0631\u0627\u0626\u062f \u0643\u0627\u0645\u0644 \u0627\u0644\u0645\u064a\u0632\u0627\u062a \u0628\u0623\u0639\u0644\u0649 \u0645\u0633\u062a\u0648\u0649 \u0645\u0646 \u0627\u0644\u062c\u0648\u062f\u0629 \u0648\u0627\u0644\u0627\u0633\u062a\u0642\u0631\u0627\u0631 <p>\u064a\u0645\u0643\u0646 \u0627\u0644\u0639\u062b\u0648\u0631 \u0639\u0644\u0649 \u0645\u0632\u064a\u062f \u0645\u0646 \u0627\u0644\u062a\u0641\u0627\u0635\u064a\u0644 \u0641\u064a \u0627\u0644\u062a\u0642\u0631\u064a\u0631 \u0627\u0644\u062a\u0642\u0646\u064a.</p>"},{"location":"ar/#_4","title":"\u0646\u062a\u0627\u0626\u062c \u0627\u0644\u0642\u064a\u0627\u0633 \u0627\u0644\u0645\u0639\u064a\u0627\u0631\u064a","text":"\u0627\u0644\u0645\u0639\u064a\u0627\u0631 Fish Audio S2 Seed-TTS Eval \u2014 WER (\u0627\u0644\u0635\u064a\u0646\u064a\u0629) 0.54% (\u0627\u0644\u0623\u0641\u0636\u0644 \u0625\u062c\u0645\u0627\u0644\u0627\u064b) Seed-TTS Eval \u2014 WER (\u0627\u0644\u0625\u0646\u062c\u0644\u064a\u0632\u064a\u0629) 0.99% 
(\u0627\u0644\u0623\u0641\u0636\u0644 \u0625\u062c\u0645\u0627\u0644\u0627\u064b) Audio Turing Test (\u0645\u0639 \u0627\u0644\u062a\u0639\u0644\u064a\u0645\u0627\u062a) 0.515 \u0627\u0644\u0645\u062a\u0648\u0633\u0637 \u0627\u0644\u0628\u0639\u062f\u064a EmergentTTS-Eval \u2014 \u0645\u0639\u062f\u0644 \u0627\u0644\u0641\u0648\u0632 81.88% (\u0627\u0644\u0623\u0639\u0644\u0649 \u0625\u062c\u0645\u0627\u0644\u0627\u064b) Fish Instruction Benchmark \u2014 TAR 93.3% Fish Instruction Benchmark \u2014 \u0627\u0644\u062c\u0648\u062f\u0629 4.51 / 5.0 \u0645\u062a\u0639\u062f\u062f \u0627\u0644\u0644\u063a\u0627\u062a (MiniMax Testset) \u2014 \u0623\u0641\u0636\u0644 WER 11 \u0645\u0646 24 \u0644\u063a\u0629 \u0645\u062a\u0639\u062f\u062f \u0627\u0644\u0644\u063a\u0627\u062a (MiniMax Testset) \u2014 \u0623\u0641\u0636\u0644 SIM 17 \u0645\u0646 24 \u0644\u063a\u0629 <p>\u0641\u064a Seed-TTS Eval\u060c \u062d\u0642\u0642 S2 \u0623\u0642\u0644 WER \u0628\u064a\u0646 \u062c\u0645\u064a\u0639 \u0627\u0644\u0646\u0645\u0627\u0630\u062c \u0627\u0644\u062a\u064a \u062a\u0645 \u062a\u0642\u064a\u064a\u0645\u0647\u0627\u060c \u0628\u0645\u0627 \u0641\u064a \u0630\u0644\u0643 \u0627\u0644\u0623\u0646\u0638\u0645\u0629 \u0627\u0644\u0645\u063a\u0644\u0642\u0629: Qwen3-TTS \u200f(0.77/1.24)\u060c \u0648MiniMax Speech-02 \u200f(0.99/1.90)\u060c \u0648Seed-TTS \u200f(1.12/2.25). \u0648\u0641\u064a Audio Turing Test\u060c \u062a\u0641\u0648\u0642\u062a \u0642\u064a\u0645\u0629 0.515 \u0639\u0644\u0649 Seed-TTS \u200f(0.417) \u0628\u0646\u0633\u0628\u0629 24% \u0648\u0639\u0644\u0649 MiniMax-Speech \u200f(0.387) \u0628\u0646\u0633\u0628\u0629 33%. 
\u0648\u0641\u064a EmergentTTS-Eval\u060c \u062d\u0642\u0642 S2 \u0646\u062a\u0627\u0626\u062c \u0642\u0648\u064a\u0629 \u0628\u0634\u0643\u0644 \u062e\u0627\u0635 \u0641\u064a \u0627\u0644\u062e\u0635\u0627\u0626\u0635 \u0634\u0628\u0647 \u0627\u0644\u0644\u063a\u0648\u064a\u0629 (91.61%)\u060c \u0648\u0627\u0644\u0623\u0633\u0626\u0644\u0629 (84.41%)\u060c \u0648\u0627\u0644\u062a\u0639\u0642\u064a\u062f \u0627\u0644\u0646\u062d\u0648\u064a (83.39%).</p>"},{"location":"ar/#_5","title":"\u0623\u0628\u0631\u0632 \u0627\u0644\u0645\u0645\u064a\u0632\u0627\u062a","text":""},{"location":"ar/#_6","title":"\u062a\u062d\u0643\u0645 \u0645\u0636\u0645\u0651\u0646 \u062f\u0642\u064a\u0642 \u0639\u0628\u0631 \u0627\u0644\u0644\u063a\u0629 \u0627\u0644\u0637\u0628\u064a\u0639\u064a\u0629","text":"<p>\u064a\u062a\u064a\u062d Fish Audio S2 \u062a\u062d\u0643\u0645\u064b\u0627 \u0645\u0648\u0636\u0639\u064a\u064b\u0627 \u0641\u064a \u062a\u0648\u0644\u064a\u062f \u0627\u0644\u0643\u0644\u0627\u0645 \u0645\u0646 \u062e\u0644\u0627\u0644 \u062a\u0636\u0645\u064a\u0646 \u062a\u0639\u0644\u064a\u0645\u0627\u062a \u0628\u0627\u0644\u0644\u063a\u0629 \u0627\u0644\u0637\u0628\u064a\u0639\u064a\u0629 \u0645\u0628\u0627\u0634\u0631\u0629 \u0639\u0646\u062f \u0645\u0648\u0627\u0642\u0639 \u0643\u0644\u0645\u0627\u062a \u0623\u0648 \u0639\u0628\u0627\u0631\u0627\u062a \u0645\u062d\u062f\u062f\u0629 \u062f\u0627\u062e\u0644 \u0627\u0644\u0646\u0635. 
\u0648\u0628\u062f\u0644\u064b\u0627 \u0645\u0646 \u0627\u0644\u0627\u0639\u062a\u0645\u0627\u062f \u0639\u0644\u0649 \u0645\u062c\u0645\u0648\u0639\u0629 \u062b\u0627\u0628\u062a\u0629 \u0645\u0646 \u0627\u0644\u0648\u0633\u0648\u0645 \u0627\u0644\u0645\u064f\u0639\u0631\u0651\u0641\u0629 \u0645\u0633\u0628\u0642\u064b\u0627\u060c \u064a\u0642\u0628\u0644 S2 \u0623\u0648\u0635\u0627\u0641\u064b\u0627 \u0646\u0635\u064a\u0629 \u062d\u0631\u0629 \u0645\u062b\u0644 [whisper in small voice] \u0623\u0648 [professional broadcast tone] \u0623\u0648 [pitch up]\u060c \u0645\u0645\u0627 \u064a\u062a\u064a\u062d \u062a\u062d\u0643\u0645\u064b\u0627 \u0645\u0641\u062a\u0648\u062d\u064b\u0627 \u0641\u064a \u0627\u0644\u062a\u0639\u0628\u064a\u0631 \u0639\u0644\u0649 \u0645\u0633\u062a\u0648\u0649 \u0627\u0644\u0643\u0644\u0645\u0629.</p>"},{"location":"ar/#dual-autoregressive","title":"\u0628\u0646\u064a\u0629 Dual-Autoregressive","text":"<p>\u064a\u0639\u062a\u0645\u062f S2 \u0639\u0644\u0649 Transformer \u0623\u062d\u0627\u062f\u064a \u0627\u0644\u0627\u062a\u062c\u0627\u0647 (Decoder-only) \u0645\u0639 \u0645\u064f\u0631\u0645\u0651\u0632 \u0635\u0648\u062a\u064a \u0642\u0627\u0626\u0645 \u0639\u0644\u0649 RVQ (\u0639\u062f\u062f 10 codebooks \u0648\u0628\u0645\u0639\u062f\u0644 \u0625\u0637\u0627\u0631\u0627\u062a \u064a\u0642\u0627\u0631\u0628 21 \u0647\u0631\u062a\u0632). 
\u0648\u062a\u064f\u0642\u0633\u0651\u0645 \u0628\u0646\u064a\u0629 Dual-AR \u0639\u0645\u0644\u064a\u0629 \u0627\u0644\u062a\u0648\u0644\u064a\u062f \u0625\u0644\u0649 \u0645\u0631\u062d\u0644\u062a\u064a\u0646:</p> <ul> <li>Slow AR \u064a\u0639\u0645\u0644 \u0639\u0644\u0649 \u0627\u0644\u0645\u062d\u0648\u0631 \u0627\u0644\u0632\u0645\u0646\u064a \u0648\u064a\u062a\u0646\u0628\u0623 \u0628\u0627\u0644\u0640 semantic codebook \u0627\u0644\u0623\u0633\u0627\u0633\u064a.</li> <li>Fast AR \u064a\u0648\u0644\u0651\u062f \u0627\u0644\u0640 9 residual codebooks \u0627\u0644\u0645\u062a\u0628\u0642\u064a\u0629 \u0641\u064a \u0643\u0644 \u062e\u0637\u0648\u0629 \u0632\u0645\u0646\u064a\u0629 \u0644\u0625\u0639\u0627\u062f\u0629 \u0628\u0646\u0627\u0621 \u0627\u0644\u062a\u0641\u0627\u0635\u064a\u0644 \u0627\u0644\u0635\u0648\u062a\u064a\u0629 \u0627\u0644\u062f\u0642\u064a\u0642\u0629.</li> </ul> <p>\u0647\u0630\u0627 \u0627\u0644\u062a\u0635\u0645\u064a\u0645 \u063a\u064a\u0631 \u0627\u0644\u0645\u062a\u0645\u0627\u062b\u0644 (4B \u0645\u0639\u0644\u0645\u0629 \u0639\u0644\u0649 \u0627\u0644\u0645\u062d\u0648\u0631 \u0627\u0644\u0632\u0645\u0646\u064a \u0648400M \u0639\u0644\u0649 \u0645\u062d\u0648\u0631 \u0627\u0644\u0639\u0645\u0642) \u064a\u0631\u0641\u0639 \u0643\u0641\u0627\u0621\u0629 \u0627\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644 \u0645\u0639 \u0627\u0644\u062d\u0641\u0627\u0638 \u0639\u0644\u0649 \u062c\u0648\u062f\u0629 \u0627\u0644\u0635\u0648\u062a.</p>"},{"location":"ar/#_7","title":"\u0627\u0644\u0645\u0648\u0627\u0621\u0645\u0629 \u0628\u0627\u0644\u062a\u0639\u0644\u0645 \u0627\u0644\u0645\u0639\u0632\u0632","text":"<p>\u064a\u0633\u062a\u062e\u062f\u0645 S2 \u062e\u0648\u0627\u0631\u0632\u0645\u064a\u0629 Group Relative Policy Optimization (GRPO) \u0644\u0644\u0645\u0648\u0627\u0621\u0645\u0629 \u0628\u0639\u062f \u0627\u0644\u062a\u062f\u0631\u064a\u0628. 
\u0648\u064a\u062a\u0645 \u0625\u0639\u0627\u062f\u0629 \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0646\u0641\u0633 \u0627\u0644\u0646\u0645\u0627\u0630\u062c \u0627\u0644\u062a\u064a \u0627\u0633\u062a\u064f\u062e\u062f\u0645\u062a \u0644\u062a\u0635\u0641\u064a\u0629 \u0628\u064a\u0627\u0646\u0627\u062a \u0627\u0644\u062a\u062f\u0631\u064a\u0628 \u0648\u062a\u0639\u0644\u064a\u0642\u0647\u0627 \u0643\u0646\u0645\u0627\u0630\u062c \u0645\u0643\u0627\u0641\u0623\u0629 \u0641\u064a \u0627\u0644\u062a\u0639\u0644\u0645 \u0627\u0644\u0645\u0639\u0632\u0632 \u0645\u0628\u0627\u0634\u0631\u0629\u060c \u0645\u0645\u0627 \u064a\u0644\u063a\u064a \u0639\u062f\u0645 \u062a\u0637\u0627\u0628\u0642 \u0627\u0644\u062a\u0648\u0632\u064a\u0639 \u0628\u064a\u0646 \u0628\u064a\u0627\u0646\u0627\u062a \u0645\u0627 \u0642\u0628\u0644 \u0627\u0644\u062a\u062f\u0631\u064a\u0628 \u0648\u0623\u0647\u062f\u0627\u0641 \u0645\u0627 \u0628\u0639\u062f \u0627\u0644\u062a\u062f\u0631\u064a\u0628. \u0648\u062a\u062c\u0645\u0639 \u0625\u0634\u0627\u0631\u0629 \u0627\u0644\u0645\u0643\u0627\u0641\u0623\u0629 \u0628\u064a\u0646 \u0627\u0644\u062f\u0642\u0629 \u0627\u0644\u062f\u0644\u0627\u0644\u064a\u0629\u060c \u0648\u0627\u0644\u0627\u0644\u062a\u0632\u0627\u0645 \u0628\u0627\u0644\u062a\u0639\u0644\u064a\u0645\u0627\u062a\u060c \u0648\u062a\u0642\u064a\u064a\u0645 \u0627\u0644\u062a\u0641\u0636\u064a\u0644 \u0627\u0644\u0635\u0648\u062a\u064a\u060c \u0648\u062a\u0634\u0627\u0628\u0647 \u0627\u0644\u0646\u0628\u0631\u0629.</p>"},{"location":"ar/#sglang","title":"\u0627\u0644\u0628\u062b \u0627\u0644\u0625\u0646\u062a\u0627\u062c\u064a \u0639\u0628\u0631 SGLang","text":"<p>\u0644\u0623\u0646 \u0628\u0646\u064a\u0629 Dual-AR \u0645\u062a\u0645\u0627\u062b\u0644\u0629 \u0628\u0646\u064a\u0648\u064a\u064b\u0627 \u0645\u0639 \u0646\u0645\u0627\u0630\u062c LLM autoregressive \u0627\u0644\u0642\u064a\u0627\u0633\u064a\u0629\u060c \u0641\u0625\u0646 S2 \u064a\u0631\u062b 
\u0645\u0628\u0627\u0634\u0631\u0629 \u062a\u062d\u0633\u064a\u0646\u0627\u062a \u0627\u0644\u062e\u062f\u0645\u0629 \u0627\u0644\u0623\u0635\u0644\u064a\u0629 \u0641\u064a SGLang\u060c \u0628\u0645\u0627 \u0641\u064a \u0630\u0644\u0643: continuous batching\u060c \u0648paged KV cache\u060c \u0648CUDA graph replay\u060c \u0648prefix caching \u0627\u0644\u0645\u0639\u062a\u0645\u062f \u0639\u0644\u0649 RadixAttention.</p> <p>\u0639\u0644\u0649 \u0628\u0637\u0627\u0642\u0629 NVIDIA H200 \u0648\u0627\u062d\u062f\u0629:</p> <ul> <li>\u0639\u0627\u0645\u0644 \u0627\u0644\u0632\u0645\u0646 \u0627\u0644\u062d\u0642\u064a\u0642\u064a (RTF): 0.195</li> <li>\u0627\u0644\u0632\u0645\u0646 \u062d\u062a\u0649 \u0623\u0648\u0644 \u0645\u0642\u0637\u0639 \u0635\u0648\u062a\u064a: \u062d\u0648\u0627\u0644\u064a 100 \u0645\u0644\u0644\u064a \u062b\u0627\u0646\u064a\u0629</li> <li>\u0645\u0639\u062f\u0644 \u0627\u0644\u0645\u0639\u0627\u0644\u062c\u0629: \u0623\u0643\u062b\u0631 \u0645\u0646 3,000 acoustic tokens/s \u0645\u0639 \u0627\u0644\u062d\u0641\u0627\u0638 \u0639\u0644\u0649 RTF \u0623\u0642\u0644 \u0645\u0646 0.5</li> </ul>"},{"location":"ar/#_8","title":"\u062f\u0639\u0645 \u0644\u063a\u0627\u062a \u0645\u062a\u0639\u062f\u062f\u0629","text":"<p>\u064a\u062f\u0639\u0645 Fish Audio S2 \u062a\u062d\u0648\u064a\u0644 \u0627\u0644\u0646\u0635 \u0625\u0644\u0649 \u0643\u0644\u0627\u0645 \u0628\u062c\u0648\u062f\u0629 \u0639\u0627\u0644\u064a\u0629 \u0648\u0644\u063a\u0627\u062a \u0645\u062a\u0639\u062f\u062f\u0629 \u062f\u0648\u0646 \u0627\u0644\u062d\u0627\u062c\u0629 \u0625\u0644\u0649 \u0631\u0645\u0648\u0632 \u0635\u0648\u062a\u064a\u0629 \u0623\u0648 \u0645\u0639\u0627\u0644\u062c\u0629 \u0645\u0633\u0628\u0642\u0629 \u062e\u0627\u0635\u0629 \u0628\u0643\u0644 \u0644\u063a\u0629. 
\u0628\u0645\u0627 \u0641\u064a \u0630\u0644\u0643:</p> <p>\u0627\u0644\u0625\u0646\u062c\u0644\u064a\u0632\u064a\u0629\u060c \u0627\u0644\u0635\u064a\u0646\u064a\u0629\u060c \u0627\u0644\u064a\u0627\u0628\u0627\u0646\u064a\u0629\u060c \u0627\u0644\u0643\u0648\u0631\u064a\u0629\u060c \u0627\u0644\u0639\u0631\u0628\u064a\u0629\u060c \u0627\u0644\u0623\u0644\u0645\u0627\u0646\u064a\u0629\u060c \u0627\u0644\u0641\u0631\u0646\u0633\u064a\u0629...</p> <p>\u0648\u0623\u0643\u062b\u0631 \u0645\u0646 \u0630\u0644\u0643 \u0628\u0643\u062b\u064a\u0631!</p> <p>\u0627\u0644\u0642\u0627\u0626\u0645\u0629 \u0641\u064a \u062a\u0648\u0633\u0639 \u0645\u0633\u062a\u0645\u0631\u060c \u062a\u062d\u0642\u0642 \u0645\u0646 Fish Audio \u0644\u0645\u0639\u0631\u0641\u0629 \u0623\u062d\u062f\u062b \u0627\u0644\u0625\u0635\u062f\u0627\u0631\u0627\u062a.</p>"},{"location":"ar/#_9","title":"\u062a\u0648\u0644\u064a\u062f \u0623\u0635\u0644\u064a \u0644\u0645\u062a\u062d\u062f\u062b\u064a\u0646 \u0645\u062a\u0639\u062f\u062f\u064a\u0646","text":"<p>\u064a\u0633\u0645\u062d Fish Audio S2 \u0644\u0644\u0645\u0633\u062a\u062e\u062f\u0645\u064a\u0646 \u0628\u0631\u0641\u0639 \u0635\u0648\u062a \u0645\u0631\u062c\u0639\u064a \u064a\u062d\u062a\u0648\u064a \u0639\u0644\u0649 \u0645\u062a\u062d\u062f\u062b\u064a\u0646 \u0645\u062a\u0639\u062f\u062f\u064a\u0646\u060c \u0648\u0633\u064a\u062a\u0639\u0627\u0645\u0644 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0645\u0639 \u0645\u064a\u0632\u0627\u062a \u0643\u0644 \u0645\u062a\u062d\u062f\u062b \u0639\u0628\u0631 \u0631\u0645\u0632 <code><|speaker:i|></code>. 
\u064a\u0645\u0643\u0646\u0643 \u0628\u0639\u062f \u0630\u0644\u0643 \u0627\u0644\u062a\u062d\u0643\u0645 \u0641\u064a \u0623\u062f\u0627\u0621 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0631\u0645\u0632 \u0645\u0639\u0631\u0641 \u0627\u0644\u0645\u062a\u062d\u062f\u062b\u060c \u0645\u0645\u0627 \u064a\u0633\u0645\u062d \u0628\u062a\u0648\u0644\u064a\u062f \u0648\u0627\u062d\u062f \u064a\u062a\u0636\u0645\u0646 \u0645\u062a\u062d\u062f\u062b\u064a\u0646 \u0645\u062a\u0639\u062f\u062f\u064a\u0646. \u0644\u0645 \u062a\u0639\u062f \u0628\u062d\u0627\u062c\u0629 \u0644\u0631\u0641\u0639 \u0645\u0644\u0641\u0627\u062a \u0645\u0631\u062c\u0639\u064a\u0629 \u0645\u0646\u0641\u0635\u0644\u0629 \u0644\u0643\u0644 \u0645\u062a\u062d\u062f\u062b.</p>"},{"location":"ar/#_10","title":"\u062a\u0648\u0644\u064a\u062f \u062d\u0648\u0627\u0631\u0627\u062a \u0645\u062a\u0639\u062f\u062f\u0629 \u0627\u0644\u0623\u062f\u0648\u0627\u0631","text":"<p>\u0628\u0641\u0636\u0644 \u062a\u0648\u0633\u064a\u0639 \u0633\u064a\u0627\u0642 \u0627\u0644\u0646\u0645\u0648\u0630\u062c\u060c \u064a\u0645\u0643\u0646 \u0644\u0646\u0645\u0648\u0630\u062c\u0646\u0627 \u0627\u0644\u0622\u0646 \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0627\u0644\u0645\u0639\u0644\u0648\u0645\u0627\u062a \u0627\u0644\u0633\u0627\u0628\u0642\u0629 \u0644\u062a\u062d\u0633\u064a\u0646 \u0627\u0644\u062a\u0639\u0628\u064a\u0631 \u0641\u064a \u0627\u0644\u0645\u062d\u062a\u0648\u0649 \u0627\u0644\u0645\u0648\u0644\u062f \u0644\u0627\u062d\u0642\u0627\u064b\u060c \u0645\u0645\u0627 \u064a\u0632\u064a\u062f \u0645\u0646 \u0637\u0628\u064a\u0639\u064a\u0629 \u0627\u0644\u0645\u062d\u062a\u0648\u0649.</p>"},{"location":"ar/#_11","title":"\u0627\u0633\u062a\u0646\u0633\u0627\u062e \u0635\u0648\u062a \u0633\u0631\u064a\u0639","text":"<p>\u064a\u062f\u0639\u0645 Fish Audio S2 \u0627\u0633\u062a\u0646\u0633\u0627\u062e \u0627\u0644\u0635\u0648\u062a \u0628\u062f\u0642\u0629 
\u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0639\u064a\u0646\u0629 \u0645\u0631\u062c\u0639\u064a\u0629 \u0642\u0635\u064a\u0631\u0629 (\u0639\u0627\u062f\u0629\u064b 10-30 \u062b\u0627\u0646\u064a\u0629). \u064a\u0644\u062a\u0642\u0637 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0646\u0628\u0631\u0629 \u0627\u0644\u0635\u0648\u062a\u060c \u0648\u0623\u0633\u0644\u0648\u0628 \u0627\u0644\u062a\u062d\u062f\u062b\u060c \u0648\u0627\u0644\u0645\u064a\u0648\u0644 \u0627\u0644\u0639\u0627\u0637\u0641\u064a\u0629\u060c \u0645\u0645\u0627 \u064a\u0646\u062a\u062c \u0623\u0635\u0648\u0627\u062a\u0627\u064b \u0645\u0633\u062a\u0646\u0633\u062e\u0629 \u0648\u0627\u0642\u0639\u064a\u0629 \u0648\u0645\u062a\u0633\u0642\u0629 \u062f\u0648\u0646 \u0627\u0644\u062d\u0627\u062c\u0629 \u0625\u0644\u0649 \u0636\u0628\u0637 \u062f\u0642\u064a\u0642 \u0625\u0636\u0627\u0641\u064a. \u0644\u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u062e\u0627\u062f\u0645 SGLang\u060c \u0631\u0627\u062c\u0639 SGLang-Omni README .</p>"},{"location":"ar/#_12","title":"\u0634\u0643\u0631 \u0648\u062a\u0642\u062f\u064a\u0631","text":"<ul> <li>VITS2 (daniilrobnikov)</li> <li>Bert-VITS2</li> <li>GPT VITS</li> <li>MQTTS</li> <li>GPT Fast</li> <li>GPT-SoVITS</li> <li>Qwen3</li> </ul>"},{"location":"ar/#_13","title":"\u0627\u0644\u062a\u0642\u0631\u064a\u0631 \u0627\u0644\u062a\u0642\u0646\u064a","text":"<pre><code>@misc{fish-speech-v1.4,\n title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n year={2024},\n eprint={2411.01156},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n title={Fish Audio S2 Technical Report}, \n author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng 
and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n year={2026},\n eprint={2603.08823},\n archivePrefix={arXiv},\n primaryClass={cs.SD},\n url={https://arxiv.org/abs/2603.08823}, \n}\n</code></pre>"},{"location":"ar/finetune/","title":"\u0627\u0644\u0636\u0628\u0637 \u0627\u0644\u062f\u0642\u064a\u0642 (Fine-tuning)","text":"<p>\u0645\u0646 \u0627\u0644\u0648\u0627\u0636\u062d \u0623\u0646\u0643 \u0639\u0646\u062f\u0645\u0627 \u0641\u062a\u062d\u062a \u0647\u0630\u0647 \u0627\u0644\u0635\u0641\u062d\u0629\u060c \u0644\u0645 \u062a\u0643\u0646 \u0631\u0627\u0636\u064a\u064b\u0627 \u0639\u0646 \u0623\u062f\u0627\u0621 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0627\u0644\u0645\u062f\u0631\u0628 \u0645\u0633\u0628\u0642\u064b\u0627 \u0641\u064a \u0648\u0636\u0639 zero-shot. \u0623\u0646\u062a \u062a\u0631\u063a\u0628 \u0641\u064a \u0625\u062c\u0631\u0627\u0621 \u0636\u0628\u0637 \u062f\u0642\u064a\u0642 \u0644\u0646\u0645\u0648\u0630\u062c \u0644\u062a\u062d\u0633\u064a\u0646 \u0623\u062f\u0627\u0626\u0647 \u0639\u0644\u0649 \u0645\u062c\u0645\u0648\u0639\u0629 \u0627\u0644\u0628\u064a\u0627\u0646\u0627\u062a \u0627\u0644\u062e\u0627\u0635\u0629 \u0628\u0643.</p> <p>\u0641\u064a \u0627\u0644\u0625\u0635\u062f\u0627\u0631 \u0627\u0644\u062d\u0627\u0644\u064a\u060c \u0645\u0627 \u0639\u0644\u064a\u0643 \u0633\u0648\u0649 \u0625\u062c\u0631\u0627\u0621 \u0627\u0644\u0636\u0628\u0637 \u0627\u0644\u062f\u0642\u064a\u0642 \u0644\u062c\u0632\u0621 'LLAMA'.</p>"},{"location":"ar/finetune/#llama","title":"\u0627\u0644\u0636\u0628\u0637 \u0627\u0644\u062f\u0642\u064a\u0642 \u0644\u0640 LLAMA","text":""},{"location":"ar/finetune/#1","title":"1. 
\u0625\u0639\u062f\u0627\u062f \u0645\u062c\u0645\u0648\u0639\u0629 \u0627\u0644\u0628\u064a\u0627\u0646\u0627\u062a","text":"<pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u2514\u2500\u2500 30.1-32.71.mp3\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u2514\u2500\u2500 38.79-40.85.mp3\n</code></pre> <p>\u062a\u062d\u062a\u0627\u062c \u0625\u0644\u0649 \u062a\u062d\u0648\u064a\u0644 \u0645\u062c\u0645\u0648\u0639\u0629 \u0627\u0644\u0628\u064a\u0627\u0646\u0627\u062a \u0627\u0644\u062e\u0627\u0635\u0629 \u0628\u0643 \u0625\u0644\u0649 \u0627\u0644\u062a\u0646\u0633\u064a\u0642 \u0623\u0639\u0644\u0627\u0647 \u0648\u0648\u0636\u0639\u0647\u0627 \u062a\u062d\u062a \u0645\u062c\u0644\u062f <code>data</code>. \u064a\u0645\u0643\u0646 \u0623\u0646 \u064a\u0643\u0648\u0646 \u0644\u0644\u0645\u0644\u0641 \u0627\u0644\u0635\u0648\u062a\u064a \u0627\u0644\u0627\u0645\u062a\u062f\u0627\u062f\u0627\u062a <code>.mp3</code>\u060c <code>.wav</code>\u060c \u0623\u0648 <code>.flac</code>\u060c \u0648\u064a\u062c\u0628 \u0623\u0646 \u064a\u0643\u0648\u0646 \u0644\u0645\u0644\u0641 \u0627\u0644\u062a\u0639\u0644\u064a\u0642\u0627\u062a \u0627\u0644\u062a\u0648\u0636\u064a\u062d\u064a\u0629 \u0627\u0644\u0627\u0645\u062a\u062f\u0627\u062f <code>.lab</code>.</p> <p>\u062a\u0646\u0633\u064a\u0642 \u0645\u062c\u0645\u0648\u0639\u0629 \u0627\u0644\u0628\u064a\u0627\u0646\u0627\u062a</p> <p>\u064a\u062d\u062a\u0627\u062c \u0645\u0644\u0641 \u0627\u0644\u062a\u0639\u0644\u064a\u0642\u0627\u062a \u0627\u0644\u062a\u0648\u0636\u064a\u062d\u064a\u0629 <code>.lab</code> \u0641\u0642\u0637 \u0625\u0644\u0649 \u0627\u062d\u062a\u0648\u0627\u0621 \u0627\u0644\u0646\u0635 \u0627\u0644\u0645\u0643\u062a\u0648\u0628 \u0644\u0644\u0645\u0642\u0637\u0639 
\u0627\u0644\u0635\u0648\u062a\u064a\u060c \u062f\u0648\u0646 \u0627\u0644\u062d\u0627\u062c\u0629 \u0625\u0644\u0649 \u062a\u0646\u0633\u064a\u0642 \u062e\u0627\u0635. \u0639\u0644\u0649 \u0633\u0628\u064a\u0644 \u0627\u0644\u0645\u062b\u0627\u0644\u060c \u0625\u0630\u0627 \u0643\u0627\u0646 \u0645\u062d\u062a\u0648\u0649 <code>hi.mp3</code> \u0647\u0648 \"\u0645\u0631\u062d\u0628\u064b\u0627\u060c \u0648\u062f\u0627\u0639\u064b\u0627\"\u060c \u0641\u0633\u064a\u062d\u062a\u0648\u064a \u0645\u0644\u0641 <code>hi.lab</code> \u0639\u0644\u0649 \u0633\u0637\u0631 \u0648\u0627\u062d\u062f \u0645\u0646 \u0627\u0644\u0646\u0635: \"\u0645\u0631\u062d\u0628\u064b\u0627\u060c \u0648\u062f\u0627\u0639\u064b\u0627\".</p> <p>\u062a\u062d\u0630\u064a\u0631</p> <p>\u064a\u0648\u0635\u0649 \u0628\u062a\u0637\u0628\u064a\u0642 \u062a\u0633\u0648\u064a\u0629 \u062c\u0647\u0627\u0631\u0629 \u0627\u0644\u0635\u0648\u062a (loudness normalization) \u0639\u0644\u0649 \u0645\u062c\u0645\u0648\u0639\u0629 \u0627\u0644\u0628\u064a\u0627\u0646\u0627\u062a. \u064a\u0645\u0643\u0646\u0643 \u0627\u0633\u062a\u062e\u062f\u0627\u0645 fish-audio-preprocess \u0644\u0644\u0642\u064a\u0627\u0645 \u0628\u0630\u0644\u0643. <pre><code>fap loudness-norm data-raw data --clean\n</code></pre></p>"},{"location":"ar/finetune/#2-semantic-tokens","title":"2. \u0627\u0644\u0627\u0633\u062a\u062e\u0631\u0627\u062c \u0627\u0644\u062f\u0641\u0639\u064a \u0644\u0644\u0631\u0645\u0648\u0632 \u0627\u0644\u062f\u0644\u0627\u0644\u064a\u0629 (semantic tokens)","text":"<p>\u062a\u0623\u0643\u062f \u0645\u0646 \u0623\u0646\u0643 \u0642\u0645\u062a \u0628\u062a\u0646\u0632\u064a\u0644 \u0623\u0648\u0632\u0627\u0646 VQGAN. 
\u0625\u0630\u0627 \u0644\u0645 \u062a\u0643\u0646 \u0642\u062f \u0641\u0639\u0644\u062a\u060c \u0642\u0645 \u0628\u062a\u0634\u063a\u064a\u0644 \u0627\u0644\u0623\u0645\u0631 \u0627\u0644\u062a\u0627\u0644\u064a:</p> <pre><code>huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n</code></pre> <p>\u064a\u0645\u0643\u0646\u0643 \u0628\u0639\u062f \u0630\u0644\u0643 \u062a\u0634\u063a\u064a\u0644 \u0627\u0644\u0623\u0645\u0631 \u0627\u0644\u062a\u0627\u0644\u064a \u0644\u0627\u0633\u062a\u062e\u0631\u0627\u062c \u0627\u0644\u0631\u0645\u0648\u0632 \u0627\u0644\u062f\u0644\u0627\u0644\u064a\u0629:</p> <pre><code>python tools/vqgan/extract_vq.py data \\\n --num-workers 1 --batch-size 16 \\\n --config-name \"modded_dac_vq\" \\\n --checkpoint-path \"checkpoints/openaudio-s1-mini/codec.pth\"\n</code></pre> <p>\u0645\u0644\u0627\u062d\u0638\u0629</p> <p>\u064a\u0645\u0643\u0646\u0643 \u0636\u0628\u0637 <code>--num-workers</code> \u0648 <code>--batch-size</code> \u0644\u0632\u064a\u0627\u062f\u0629 \u0633\u0631\u0639\u0629 \u0627\u0644\u0627\u0633\u062a\u062e\u0631\u0627\u062c\u060c \u0648\u0644\u0643\u0646 \u064a\u0631\u062c\u0649 \u0627\u0644\u062a\u0623\u0643\u062f \u0645\u0646 \u0639\u062f\u0645 \u062a\u062c\u0627\u0648\u0632 \u062d\u062f \u0630\u0627\u0643\u0631\u0629 \u0648\u062d\u062f\u0629 \u0645\u0639\u0627\u0644\u062c\u0629 \u0627\u0644\u0631\u0633\u0648\u0645\u0627\u062a (GPU) \u0627\u0644\u062e\u0627\u0635\u0629 \u0628\u0643.</p> <p>\u0633\u064a\u0642\u0648\u0645 \u0647\u0630\u0627 \u0627\u0644\u0623\u0645\u0631 \u0628\u0625\u0646\u0634\u0627\u0621 \u0645\u0644\u0641\u0627\u062a <code>.npy</code> \u0641\u064a \u0645\u062c\u0644\u062f <code>data</code>\u060c \u0643\u0645\u0627 \u0647\u0648 \u0645\u0648\u0636\u062d \u0623\u062f\u0646\u0627\u0647:</p> <pre><code>.\n\u251c\u2500\u2500 SPK1\n\u2502 \u251c\u2500\u2500 21.15-26.44.lab\n\u2502 \u251c\u2500\u2500 21.15-26.44.mp3\n\u2502 \u251c\u2500\u2500 
21.15-26.44.npy\n\u2502 \u251c\u2500\u2500 27.51-29.98.lab\n\u2502 \u251c\u2500\u2500 27.51-29.98.mp3\n\u2502 \u251c\u2500\u2500 27.51-29.98.npy\n\u2502 \u251c\u2500\u2500 30.1-32.71.lab\n\u2502 \u251c\u2500\u2500 30.1-32.71.mp3\n\u2502 \u2514\u2500\u2500 30.1-32.71.npy\n\u2514\u2500\u2500 SPK2\n \u251c\u2500\u2500 38.79-40.85.lab\n \u251c\u2500\u2500 38.79-40.85.mp3\n \u2514\u2500\u2500 38.79-40.85.npy\n</code></pre>"},{"location":"ar/finetune/#3-protobuf","title":"3. \u062d\u0632\u0645 \u0645\u062c\u0645\u0648\u0639\u0629 \u0627\u0644\u0628\u064a\u0627\u0646\u0627\u062a \u0641\u064a protobuf","text":"<pre><code>python tools/llama/build_dataset.py \\\n --input \"data\" \\\n --output \"data/protos\" \\\n --text-extension .lab \\\n --num-workers 16\n</code></pre> <p>\u0628\u0639\u062f \u0627\u0646\u062a\u0647\u0627\u0621 \u062a\u0646\u0641\u064a\u0630 \u0627\u0644\u0623\u0645\u0631\u060c \u064a\u062c\u0628 \u0623\u0646 \u062a\u0631\u0649 \u0645\u0644\u0641 <code>protos</code> \u0641\u064a \u0645\u062c\u0644\u062f <code>data</code>.</p>"},{"location":"ar/finetune/#4-lora","title":"4. \u0623\u062e\u064a\u0631\u064b\u0627\u060c \u0627\u0644\u0636\u0628\u0637 \u0627\u0644\u062f\u0642\u064a\u0642 \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 LoRA","text":"<p>\u0628\u0627\u0644\u0645\u062b\u0644\u060c \u062a\u0623\u0643\u062f \u0645\u0646 \u0623\u0646\u0643 \u0642\u0645\u062a \u0628\u062a\u0646\u0632\u064a\u0644 \u0623\u0648\u0632\u0627\u0646 <code>LLAMA</code>. 
\u0625\u0630\u0627 \u0644\u0645 \u062a\u0643\u0646 \u0642\u062f \u0641\u0639\u0644\u062a\u060c \u0642\u0645 \u0628\u062a\u0634\u063a\u064a\u0644 \u0627\u0644\u0623\u0645\u0631 \u0627\u0644\u062a\u0627\u0644\u064a:</p> <pre><code>huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n</code></pre> <p>\u0623\u062e\u064a\u0631\u064b\u0627\u060c \u064a\u0645\u0643\u0646\u0643 \u0628\u062f\u0621 \u0627\u0644\u0636\u0628\u0637 \u0627\u0644\u062f\u0642\u064a\u0642 \u0639\u0646 \u0637\u0631\u064a\u0642 \u062a\u0634\u063a\u064a\u0644 \u0627\u0644\u0623\u0645\u0631 \u0627\u0644\u062a\u0627\u0644\u064a:</p> <pre><code>python fish_speech/train.py --config-name text2semantic_finetune \\\n project=$project \\\n +lora@model.model.lora_config=r_8_alpha_16\n</code></pre> <p>\u0645\u0644\u0627\u062d\u0638\u0629</p> <p>\u064a\u0645\u0643\u0646\u0643 \u062a\u0639\u062f\u064a\u0644 \u0645\u0639\u0644\u0645\u0627\u062a \u0627\u0644\u062a\u062f\u0631\u064a\u0628 \u0645\u062b\u0644 <code>batch_size</code>\u060c <code>gradient_accumulation_steps</code>\u060c \u0648\u0645\u0627 \u0625\u0644\u0649 \u0630\u0644\u0643 \u0644\u062a\u0646\u0627\u0633\u0628 \u0630\u0627\u0643\u0631\u0629 \u0648\u062d\u062f\u0629 \u0645\u0639\u0627\u0644\u062c\u0629 \u0627\u0644\u0631\u0633\u0648\u0645\u0627\u062a \u0627\u0644\u062e\u0627\u0635\u0629 \u0628\u0643 \u0639\u0646 \u0637\u0631\u064a\u0642 \u062a\u0639\u062f\u064a\u0644 <code>fish_speech/configs/text2semantic_finetune.yaml</code>.</p> <p>\u0645\u0644\u0627\u062d\u0638\u0629</p> <p>\u0644\u0645\u0633\u062a\u062e\u062f\u0645\u064a Windows\u060c \u064a\u0645\u0643\u0646\u0643 \u0627\u0633\u062a\u062e\u062f\u0627\u0645 <code>trainer.strategy.process_group_backend=gloo</code> \u0644\u062a\u062c\u0646\u0628 \u0645\u0634\u0643\u0644\u0627\u062a <code>nccl</code>.</p> <p>\u0628\u0639\u062f \u0627\u0643\u062a\u0645\u0627\u0644 \u0627\u0644\u062a\u062f\u0631\u064a\u0628\u060c \u064a\u0645\u0643\u0646\u0643 
\u0627\u0644\u0631\u062c\u0648\u0639 \u0625\u0644\u0649 \u0642\u0633\u0645 \u0627\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644 (inference) \u0644\u0627\u062e\u062a\u0628\u0627\u0631 \u0646\u0645\u0648\u0630\u062c\u0643.</p> <p>\u0645\u0639\u0644\u0648\u0645\u0627\u062a</p> <p>\u0628\u0634\u0643\u0644 \u0627\u0641\u062a\u0631\u0627\u0636\u064a\u060c \u0633\u064a\u062a\u0639\u0644\u0645 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0641\u0642\u0637 \u0623\u0646\u0645\u0627\u0637 \u0643\u0644\u0627\u0645 \u0627\u0644\u0645\u062a\u062d\u062f\u062b \u0648\u0644\u064a\u0633 \u062c\u0631\u0633 \u0627\u0644\u0635\u0648\u062a (timbre). \u0644\u0627 \u062a\u0632\u0627\u0644 \u0628\u062d\u0627\u062c\u0629 \u0625\u0644\u0649 \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0627\u0644\u062a\u0644\u0642\u064a\u0646\u0627\u062a (prompts) \u0644\u0636\u0645\u0627\u0646 \u0627\u0633\u062a\u0642\u0631\u0627\u0631 \u062c\u0631\u0633 \u0627\u0644\u0635\u0648\u062a. \u0625\u0630\u0627 \u0643\u0646\u062a \u062a\u0631\u063a\u0628 \u0641\u064a \u062a\u0639\u0644\u0645 \u062c\u0631\u0633 \u0627\u0644\u0635\u0648\u062a\u060c \u064a\u0645\u0643\u0646\u0643 \u0632\u064a\u0627\u062f\u0629 \u0639\u062f\u062f \u062e\u0637\u0648\u0627\u062a \u0627\u0644\u062a\u062f\u0631\u064a\u0628\u060c \u0648\u0644\u0643\u0646 \u0647\u0630\u0627 \u0642\u062f \u064a\u0624\u062f\u064a \u0625\u0644\u0649 \u0627\u0644\u0625\u0641\u0631\u0627\u0637 \u0641\u064a \u0627\u0644\u062a\u062e\u0635\u064a\u0635 (overfitting).</p> <p>\u0628\u0639\u062f \u0627\u0644\u062a\u062f\u0631\u064a\u0628\u060c \u062a\u062d\u062a\u0627\u062c \u0625\u0644\u0649 \u062a\u062d\u0648\u064a\u0644 \u0623\u0648\u0632\u0627\u0646 LoRA \u0625\u0644\u0649 \u0623\u0648\u0632\u0627\u0646 \u0639\u0627\u062f\u064a\u0629 \u0642\u0628\u0644 \u0625\u062c\u0631\u0627\u0621 \u0627\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644.</p> <pre><code>python tools/llama/merge_lora.py \\\n --lora-config r_8_alpha_16 \\\n --base-weight checkpoints/openaudio-s1-mini 
\\\n --lora-weight results/$project/checkpoints/step_000000010.ckpt \\\n --output checkpoints/openaudio-s1-mini-yth-lora/\n</code></pre> <p>\u0645\u0644\u0627\u062d\u0638\u0629</p> <p>\u064a\u0645\u0643\u0646\u0643 \u0623\u064a\u0636\u064b\u0627 \u062a\u062c\u0631\u0628\u0629 \u0646\u0642\u0627\u0637 \u062a\u062d\u0642\u0642 (checkpoints) \u0623\u062e\u0631\u0649. \u0646\u0642\u062a\u0631\u062d \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0623\u0642\u062f\u0645 \u0646\u0642\u0637\u0629 \u062a\u062d\u0642\u0642 \u062a\u0644\u0628\u064a \u0645\u062a\u0637\u0644\u0628\u0627\u062a\u0643\u060c \u062d\u064a\u062b \u0625\u0646\u0647\u0627 \u063a\u0627\u0644\u0628\u064b\u0627 \u0645\u0627 \u062a\u0624\u062f\u064a \u0623\u062f\u0627\u0621\u064b \u0623\u0641\u0636\u0644 \u0639\u0644\u0649 \u0627\u0644\u0628\u064a\u0627\u0646\u0627\u062a \u062e\u0627\u0631\u062c \u0627\u0644\u062a\u0648\u0632\u064a\u0639 (OOD).</p>"},{"location":"ar/inference/","title":"\u0627\u0644\u0627\u0633\u062a\u0646\u062a\u0627\u062c","text":"<p>\u064a\u062a\u0637\u0644\u0628 \u0646\u0645\u0648\u0630\u062c Fish Audio S2 \u0630\u0627\u0643\u0631\u0629 \u0641\u064a\u062f\u064a\u0648 (VRAM) \u0643\u0628\u064a\u0631\u0629. 
\u0646\u0648\u0635\u064a \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0648\u062d\u062f\u0629 \u0645\u0639\u0627\u0644\u062c\u0629 \u0631\u0633\u0648\u0645\u0627\u062a (GPU) \u0628\u0633\u0639\u0629 24 \u062c\u064a\u062c\u0627\u0628\u0627\u064a\u062a \u0639\u0644\u0649 \u0627\u0644\u0623\u0642\u0644 \u0644\u0644\u0627\u0633\u062a\u0646\u062a\u0627\u062c.</p>"},{"location":"ar/inference/#_2","title":"\u062a\u062d\u0645\u064a\u0644 \u0627\u0644\u0623\u0648\u0632\u0627\u0646","text":"<p>\u0623\u0648\u0644\u0627\u064b \u060c \u062a\u062d\u062a\u0627\u062c \u0625\u0644\u0649 \u062a\u062d\u0645\u064a\u0644 \u0623\u0648\u0632\u0627\u0646 \u0627\u0644\u0646\u0645\u0648\u0630\u062c:</p> <pre><code>hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n</code></pre>"},{"location":"ar/inference/#_3","title":"\u0627\u0644\u0627\u0633\u062a\u0646\u062a\u0627\u062c \u0639\u0628\u0631 \u062e\u0637 \u0627\u0644\u0623\u0648\u0627\u0645\u0631","text":"<p>Note</p> <p>\u0625\u0630\u0627 \u0643\u0646\u062a \u062a\u062e\u0637\u0637 \u0644\u062a\u0631\u0643 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u064a\u062e\u062a\u0627\u0631 \u0646\u063a\u0645\u0629 \u0627\u0644\u0635\u0648\u062a \u0639\u0634\u0648\u0627\u0626\u064a\u064b\u0627 \u060c \u0641\u064a\u0645\u0643\u0646\u0643 \u062a\u062e\u0637\u064a \u0647\u0630\u0647 \u0627\u0644\u062e\u0637\u0648\u0629.</p>"},{"location":"ar/inference/#1-vq","title":"1. \u0627\u0644\u062d\u0635\u0648\u0644 \u0639\u0644\u0649 \u0631\u0645\u0648\u0632 VQ \u0645\u0646 \u0627\u0644\u0635\u0648\u062a \u0627\u0644\u0645\u0631\u062c\u0639\u064a","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"test.wav\" \\\n --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n</code></pre> <p>\u064a\u062c\u0628 \u0623\u0646 \u062a\u062d\u0635\u0644 \u0639\u0644\u0649 <code>fake.npy</code> \u0648 <code>fake.wav</code>.</p>"},{"location":"ar/inference/#2-semantic-tokens","title":"2. 
\u062a\u0648\u0644\u064a\u062f \u0627\u0644\u0631\u0645\u0648\u0632 \u0627\u0644\u062f\u0644\u0627\u0644\u064a\u0629 (Semantic tokens) \u0645\u0646 \u0627\u0644\u0646\u0635:","text":"<pre><code>python fish_speech/models/text2semantic/inference.py \\\n --text \"\u0627\u0644\u0646\u0635 \u0627\u0644\u0630\u064a \u062a\u0631\u064a\u062f \u062a\u062d\u0648\u064a\u0644\u0647\" \\\n --prompt-text \"\u0627\u0644\u0646\u0635 \u0627\u0644\u0645\u0631\u062c\u0639\u064a \u0627\u0644\u062e\u0627\u0635 \u0628\u0643\" \\\n --prompt-tokens \"fake.npy\" \\\n # --compile\n</code></pre> <p>\u0633\u064a\u0642\u0648\u0645 \u0647\u0630\u0627 \u0627\u0644\u0623\u0645\u0631 \u0628\u0625\u0646\u0634\u0627\u0621 \u0645\u0644\u0641 <code>codes_N</code> \u0641\u064a \u062f\u0644\u064a\u0644 \u0627\u0644\u0639\u0645\u0644 \u060c \u062d\u064a\u062b N \u0647\u0648 \u0639\u062f\u062f \u0635\u062d\u064a\u062d \u064a\u0628\u062f\u0623 \u0645\u0646 0.</p> <p>Note</p> <p>\u0642\u062f \u062a\u0631\u063a\u0628 \u0641\u064a \u0627\u0633\u062a\u062e\u062f\u0627\u0645 <code>--compile</code> \u0644\u062f\u0645\u062c \u0646\u0648\u0649 CUDA \u0644\u0627\u0633\u062a\u0646\u062a\u0627\u062c \u0623\u0633\u0631\u0639. \u0648\u0645\u0639 \u0630\u0644\u0643 \u060c \u0646\u0648\u0635\u064a \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u062a\u062d\u0633\u064a\u0646 \u062a\u0633\u0631\u064a\u0639 \u0627\u0644\u0627\u0633\u062a\u0646\u062a\u0627\u062c sglang \u0627\u0644\u062e\u0627\u0635 \u0628\u0646\u0627. 
\u0628\u0627\u0644\u0645\u0642\u0627\u0628\u0644 \u060c \u0625\u0630\u0627 \u0643\u0646\u062a \u0644\u0627 \u062a\u062e\u0637\u0637 \u0644\u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0627\u0644\u062a\u0633\u0631\u064a\u0639 \u060c \u064a\u0645\u0643\u0646\u0643 \u0627\u0644\u062a\u0639\u0644\u064a\u0642 \u0639\u0644\u0649 \u0645\u0639\u0644\u0645\u0629 <code>--compile</code>.</p> <p>Info</p> <p>\u0628\u0627\u0644\u0646\u0633\u0628\u0629 \u0644\u0648\u062d\u062f\u0627\u062a \u0645\u0639\u0627\u0644\u062c\u0629 \u0627\u0644\u0631\u0633\u0648\u0645\u0627\u062a \u0627\u0644\u062a\u064a \u0644\u0627 \u062a\u062f\u0639\u0645 bf16 \u060c \u0642\u062f \u062a\u062d\u062a\u0627\u062c \u0625\u0644\u0649 \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0645\u0639\u0644\u0645\u0629 <code>--half</code>.</p>"},{"location":"ar/inference/#3","title":"3. \u062a\u0648\u0644\u064a\u062f \u0627\u0644\u0635\u0648\u062a \u0645\u0646 \u0627\u0644\u0631\u0645\u0648\u0632 \u0627\u0644\u062f\u0644\u0627\u0644\u064a\u0629:","text":"<pre><code>python fish_speech/models/dac/inference.py \\\n -i \"codes_0.npy\" \\\n</code></pre> <p>\u0628\u0639\u062f \u0630\u0644\u0643 \u0633\u062a\u062d\u0635\u0644 \u0639\u0644\u0649 \u0645\u0644\u0641 <code>fake.wav</code>.</p>"},{"location":"ar/inference/#webui","title":"\u0627\u0633\u062a\u0646\u062a\u0627\u062c WebUI","text":""},{"location":"ar/inference/#1-gradio-webui","title":"1. Gradio WebUI","text":"<p>\u0644\u0644\u062d\u0641\u0627\u0638 \u0639\u0644\u0649 \u0627\u0644\u062a\u0648\u0627\u0641\u0642\u060c \u0645\u0627 \u0632\u0644\u0646\u0627 \u0646\u062d\u062a\u0641\u0638 \u0628\u0648\u0627\u062c\u0647\u0629 Gradio WebUI \u0627\u0644\u0633\u0627\u0628\u0642\u0629.</p> <pre><code>python tools/run_webui.py # --compile \u0625\u0630\u0627 \u0643\u0646\u062a \u0628\u062d\u0627\u062c\u0629 \u0625\u0644\u0649 \u062a\u0633\u0631\u064a\u0639\n</code></pre>"},{"location":"ar/inference/#2-awesome-webui","title":"2. 
Awesome WebUI","text":"<p>\u062a\u0639\u062f Awesome WebUI \u0648\u0627\u062c\u0647\u0629 \u0648\u064a\u0628 \u062d\u062f\u064a\u062b\u0629 \u062a\u0639\u062a\u0645\u062f \u0639\u0644\u0649 TypeScript\u060c \u0648\u062a\u0648\u0641\u0631 \u0645\u064a\u0632\u0627\u062a \u0623\u063a\u0646\u0649 \u0648\u062a\u062c\u0631\u0628\u0629 \u0645\u0633\u062a\u062e\u062f\u0645 \u0623\u0641\u0636\u0644.</p> <p>\u0628\u0646\u0627\u0621 WebUI:</p> <p>\u064a\u062c\u0628 \u0623\u0646 \u064a\u0643\u0648\u0646 \u0644\u062f\u064a\u0643 Node.js \u0648 npm \u0645\u062b\u0628\u062a\u064a\u0646 \u0639\u0644\u0649 \u062c\u0647\u0627\u0632\u0643 \u0627\u0644\u0645\u062d\u0644\u064a \u0623\u0648 \u0627\u0644\u062e\u0627\u062f\u0645.</p> <ol> <li>\u0627\u062f\u062e\u0644 \u0625\u0644\u0649 \u062f\u0644\u064a\u0644 <code>awesome_webui</code>: <pre><code>cd awesome_webui\n</code></pre></li> <li>\u062a\u062b\u0628\u064a\u062a \u0627\u0644\u062a\u0628\u0639\u064a\u0627\u062a: <pre><code>npm install\n</code></pre></li> <li>\u0628\u0646\u0627\u0621 WebUI: <pre><code>npm run build\n</code></pre></li> </ol> <p>\u0628\u062f\u0621 \u062a\u0634\u063a\u064a\u0644 \u062e\u0627\u062f\u0645 \u0627\u0644\u062e\u0644\u0641\u064a\u0629:</p> <p>\u0628\u0639\u062f \u0628\u0646\u0627\u0621 WebUI\u060c \u0639\u062f \u0625\u0644\u0649 \u062f\u0644\u064a\u0644 \u062c\u0630\u0631 \u0627\u0644\u0645\u0634\u0631\u0648\u0639 \u0648\u0642\u0645 \u0628\u062a\u0634\u063a\u064a\u0644 \u062e\u0627\u062f\u0645 API:</p> <pre><code>python tools/api_server.py --listen 0.0.0.0:8888 --compile\n</code></pre> <p>\u0627\u0644\u0648\u0635\u0648\u0644:</p> <p>\u0628\u0645\u062c\u0631\u062f \u062a\u0634\u063a\u064a\u0644 \u0627\u0644\u062e\u0627\u062f\u0645\u060c \u064a\u0645\u0643\u0646\u0643 \u0627\u0644\u0648\u0635\u0648\u0644 \u0625\u0644\u064a\u0647 \u0639\u0628\u0631 \u0627\u0644\u0645\u062a\u0635\u0641\u062d \u0639\u0644\u0649 \u0627\u0644\u0639\u0646\u0648\u0627\u0646 \u0627\u0644\u062a\u0627\u0644\u064a: 
<code>http://localhost:8888/ui</code></p>"},{"location":"ar/install/","title":"\u0627\u0644\u062a\u062b\u0628\u064a\u062a","text":""},{"location":"ar/install/#_1","title":"\u0627\u0644\u0645\u062a\u0637\u0644\u0628\u0627\u062a","text":"<ul> <li>\u0630\u0627\u0643\u0631\u0629 \u0648\u062d\u062f\u0629 \u0645\u0639\u0627\u0644\u062c\u0629 \u0627\u0644\u0631\u0633\u0648\u0645\u0627\u062a (GPU): 24 \u062c\u064a\u062c\u0627\u0628\u0627\u064a\u062a (\u0644\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644)</li> <li>\u0627\u0644\u0646\u0638\u0627\u0645: Linux, WSL</li> </ul>"},{"location":"ar/install/#_2","title":"\u0625\u0639\u062f\u0627\u062f \u0627\u0644\u0646\u0638\u0627\u0645","text":"<p>\u064a\u062f\u0639\u0645 Fish Audio S2 \u0637\u0631\u0642 \u062a\u062b\u0628\u064a\u062a \u0645\u062a\u0639\u062f\u062f\u0629. \u0627\u062e\u062a\u0631 \u0627\u0644\u0637\u0631\u064a\u0642\u0629 \u0627\u0644\u062a\u064a \u062a\u0646\u0627\u0633\u0628 \u0628\u064a\u0626\u0629 \u0627\u0644\u062a\u0637\u0648\u064a\u0631 \u0627\u0644\u062e\u0627\u0635\u0629 \u0628\u0643.</p> <p>\u0627\u0644\u0645\u062a\u0637\u0644\u0628\u0627\u062a \u0627\u0644\u0623\u0633\u0627\u0633\u064a\u0629: \u0642\u0645 \u0628\u062a\u062b\u0628\u064a\u062a \u062a\u0628\u0639\u064a\u0627\u062a \u0627\u0644\u0646\u0638\u0627\u0645 \u0644\u0645\u0639\u0627\u0644\u062c\u0629 \u0627\u0644\u0635\u0648\u062a: <pre><code>apt install portaudio19-dev libsox-dev ffmpeg\n</code></pre></p>"},{"location":"ar/install/#conda","title":"Conda","text":"<pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# \u062a\u062b\u0628\u064a\u062a \u0646\u0633\u062e\u0629 GPU (\u0627\u062e\u062a\u0631 \u0625\u0635\u062f\u0627\u0631 CUDA \u0627\u0644\u062e\u0627\u0635 \u0628\u0643: cu126, cu128, cu129)\npip install -e .[cu129]\n\n# \u062a\u062b\u0628\u064a\u062a \u0646\u0633\u062e\u0629 CPU \u0641\u0642\u0637\npip install -e .[cpu]\n\n# \u0627\u0644\u062a\u062b\u0628\u064a\u062a 
\u0627\u0644\u0627\u0641\u062a\u0631\u0627\u0636\u064a (\u064a\u0633\u062a\u062e\u062f\u0645 \u0641\u0647\u0631\u0633 PyTorch \u0627\u0644\u0627\u0641\u062a\u0631\u0627\u0636\u064a)\npip install -e .\n\n# \u0625\u0630\u0627 \u0648\u0627\u062c\u0647\u062a \u062e\u0637\u0623 \u0623\u062b\u0646\u0627\u0621 \u0627\u0644\u062a\u062b\u0628\u064a\u062a \u0628\u0633\u0628\u0628 pyaudio\u060c \u0641\u0641\u0643\u0631 \u0641\u064a \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0627\u0644\u0623\u0645\u0631 \u0627\u0644\u062a\u0627\u0644\u064a:\n# conda install pyaudio\n# \u062b\u0645 \u0642\u0645 \u0628\u062a\u0634\u063a\u064a\u0644 pip install -e . \u0645\u0631\u0629 \u0623\u062e\u0631\u0649\n</code></pre>"},{"location":"ar/install/#uv","title":"UV","text":"<p>\u064a\u0648\u0641\u0631 UV \u062d\u0644\u0627\u064b \u0623\u0633\u0631\u0639 \u0644\u062a\u062b\u0628\u064a\u062a \u0627\u0644\u062a\u0628\u0639\u064a\u0627\u062a:</p> <pre><code># \u062a\u062b\u0628\u064a\u062a \u0646\u0633\u062e\u0629 GPU (\u0627\u062e\u062a\u0631 \u0625\u0635\u062f\u0627\u0631 CUDA \u0627\u0644\u062e\u0627\u0635 \u0628\u0643: cu126, cu128, cu129)\nuv sync --python 3.12 --extra cu129\n\n# \u062a\u062b\u0628\u064a\u062a \u0646\u0633\u062e\u0629 CPU \u0641\u0642\u0637\nuv sync --python 3.12 --extra cpu\n</code></pre>"},{"location":"ar/install/#intel-arc-xpu","title":"\u062f\u0639\u0645 Intel Arc XPU","text":"<p>\u0644\u0645\u0633\u062a\u062e\u062f\u0645\u064a \u0648\u062d\u062f\u0627\u062a \u0645\u0639\u0627\u0644\u062c\u0629 \u0627\u0644\u0631\u0633\u0648\u0645\u0627\u062a Intel Arc\u060c \u0642\u0645 \u0628\u0627\u0644\u062a\u062b\u0628\u064a\u062a \u0645\u0639 \u062f\u0639\u0645 XPU \u0639\u0644\u0649 \u0627\u0644\u0646\u062d\u0648 \u0627\u0644\u062a\u0627\u0644\u064a:</p> <pre><code>conda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# \u062a\u062b\u0628\u064a\u062a \u0645\u0643\u062a\u0628\u0629 C++ \u0627\u0644\u0642\u064a\u0627\u0633\u064a\u0629 
\u0627\u0644\u0645\u0637\u0644\u0648\u0628\u0629\nconda install libstdcxx -c conda-forge\n\n# \u062a\u062b\u0628\u064a\u062a PyTorch \u0645\u0639 \u062f\u0639\u0645 Intel XPU\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# \u062a\u062b\u0628\u064a\u062a Fish Speech\npip install -e .\n</code></pre> <p>Warning</p> <p>\u062e\u064a\u0627\u0631 <code>compile</code> \u063a\u064a\u0631 \u0645\u062f\u0639\u0648\u0645 \u0639\u0644\u0649 \u0623\u0646\u0638\u0645\u0629 Windows \u0648 macOS. \u0625\u0630\u0627 \u0643\u0646\u062a \u062a\u0631\u063a\u0628 \u0641\u064a \u0627\u0644\u062a\u0634\u063a\u064a\u0644 \u0645\u0639 \u0627\u0644\u062a\u062c\u0645\u064a\u0639\u060c \u0633\u062a\u062d\u062a\u0627\u062c \u0625\u0644\u0649 \u062a\u062b\u0628\u064a\u062a Triton \u0628\u0646\u0641\u0633\u0643.</p>"},{"location":"ar/install/#docker","title":"\u0625\u0639\u062f\u0627\u062f Docker","text":"<p>\u064a\u0648\u0641\u0631 \u0646\u0645\u0648\u0630\u062c \u0633\u0644\u0633\u0644\u0629 Fish Audio S2 \u062e\u064a\u0627\u0631\u0627\u062a \u0646\u0634\u0631 \u0645\u062a\u0639\u062f\u062f\u0629 \u0645\u0639 Docker \u0644\u062a\u0644\u0628\u064a\u0629 \u0627\u0644\u0627\u062d\u062a\u064a\u0627\u062c\u0627\u062a \u0627\u0644\u0645\u062e\u062a\u0644\u0641\u0629. 
\u064a\u0645\u0643\u0646\u0643 \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0627\u0644\u0635\u0648\u0631 \u0627\u0644\u0645\u0639\u062f\u0629 \u0645\u0633\u0628\u0642\u064b\u0627 \u0645\u0646 Docker Hub\u060c \u0623\u0648 \u0627\u0644\u0628\u0646\u0627\u0621 \u0645\u062d\u0644\u064a\u064b\u0627 \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 Docker Compose\u060c \u0623\u0648 \u0628\u0646\u0627\u0621 \u0635\u0648\u0631 \u0645\u062e\u0635\u0635\u0629 \u064a\u062f\u0648\u064a\u064b\u0627.</p> <p>\u0644\u0642\u062f \u0642\u062f\u0645\u0646\u0627 \u0635\u0648\u0631 Docker \u0644\u0643\u0644 \u0645\u0646 \u0648\u0627\u062c\u0647\u0629 \u0627\u0644\u0645\u0633\u062a\u062e\u062f\u0645 \u0627\u0644\u0631\u0633\u0648\u0645\u064a\u0629 (WebUI) \u0648\u062e\u0627\u062f\u0645 API\u060c \u0644\u0643\u0644 \u0645\u0646 \u0648\u062d\u062f\u0627\u062a \u0645\u0639\u0627\u0644\u062c\u0629 \u0627\u0644\u0631\u0633\u0648\u0645\u0627\u062a (GPU) (CUDA 12.6 \u0627\u0641\u062a\u0631\u0627\u0636\u064a\u064b\u0627) \u0648\u0648\u062d\u062f\u0627\u062a \u0627\u0644\u0645\u0639\u0627\u0644\u062c\u0629 \u0627\u0644\u0645\u0631\u0643\u0632\u064a\u0629 (CPU). \u064a\u0645\u0643\u0646\u0643 \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0627\u0644\u0635\u0648\u0631 \u0627\u0644\u0645\u0639\u062f\u0629 \u0645\u0633\u0628\u0642\u064b\u0627 \u0645\u0646 Docker Hub\u060c \u0623\u0648 \u0627\u0644\u0628\u0646\u0627\u0621 \u0645\u062d\u0644\u064a\u064b\u0627 \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 Docker Compose\u060c \u0623\u0648 \u0628\u0646\u0627\u0621 \u0635\u0648\u0631 \u0645\u062e\u0635\u0635\u0629 \u064a\u062f\u0648\u064a\u064b\u0627. \u0625\u0630\u0627 \u0643\u0646\u062a \u062a\u0631\u063a\u0628 \u0641\u064a \u0627\u0644\u0628\u0646\u0627\u0621 \u0645\u062d\u0644\u064a\u064b\u0627\u060c \u0641\u0627\u062a\u0628\u0639 \u0627\u0644\u0625\u0631\u0634\u0627\u062f\u0627\u062a \u0623\u062f\u0646\u0627\u0647. 
\u0625\u0630\u0627 \u0643\u0646\u062a \u062a\u0631\u063a\u0628 \u0641\u0642\u0637 \u0641\u064a \u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0627\u0644\u0635\u0648\u0631 \u0627\u0644\u0645\u0639\u062f\u0629 \u0645\u0633\u0628\u0642\u064b\u0627\u060c \u0641\u0627\u062a\u0628\u0639 \u0645\u0628\u0627\u0634\u0631\u0629\u064b \u062f\u0644\u064a\u0644 \u0627\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644.</p>"},{"location":"ar/install/#_3","title":"\u0627\u0644\u0645\u062a\u0637\u0644\u0628\u0627\u062a \u0627\u0644\u0623\u0633\u0627\u0633\u064a\u0629","text":"<ul> <li>\u062a\u062b\u0628\u064a\u062a Docker \u0648 Docker Compose</li> <li>\u062a\u062b\u0628\u064a\u062a NVIDIA Docker runtime (\u0644\u062f\u0639\u0645 GPU)</li> <li>\u0630\u0627\u0643\u0631\u0629 GPU \u0644\u0627 \u062a\u0642\u0644 \u0639\u0646 24 \u062c\u064a\u062c\u0627\u0628\u0627\u064a\u062a \u0644\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644 \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 CUDA</li> </ul>"},{"location":"ar/install/#docker-compose","title":"\u0627\u0633\u062a\u062e\u062f\u0627\u0645 Docker Compose","text":"<p>\u0644\u0644\u062a\u0637\u0648\u064a\u0631 \u0623\u0648 \u0627\u0644\u062a\u062e\u0635\u064a\u0635\u060c \u064a\u0645\u0643\u0646\u0643 \u0627\u0633\u062a\u062e\u062f\u0627\u0645 Docker Compose \u0644\u0644\u0628\u0646\u0627\u0621 \u0648\u0627\u0644\u062a\u0634\u063a\u064a\u0644 \u0645\u062d\u0644\u064a\u064b\u0627:</p> <pre><code># \u0623\u0648\u0644\u0627\u064b\u060c \u0627\u0633\u062a\u0646\u0633\u062e \u0627\u0644\u0645\u0633\u062a\u0648\u062f\u0639\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# \u0628\u062f\u0621 \u0648\u0627\u062c\u0647\u0629 \u0627\u0644\u0645\u0633\u062a\u062e\u062f\u0645 \u0627\u0644\u0631\u0633\u0648\u0645\u064a\u0629 (WebUI) \u0645\u0639 CUDA\ndocker compose --profile webui up\n\n# \u0628\u062f\u0621 \u0648\u0627\u062c\u0647\u0629 \u0627\u0644\u0645\u0633\u062a\u062e\u062f\u0645 
\u0627\u0644\u0631\u0633\u0648\u0645\u064a\u0629 (WebUI) \u0645\u0639 \u062a\u062d\u0633\u064a\u0646 \u0627\u0644\u062a\u062c\u0645\u064a\u0639\nCOMPILE=1 docker compose --profile webui up\n\n# \u0628\u062f\u0621 \u062e\u0627\u062f\u0645 API\ndocker compose --profile server up\n\n# \u0628\u062f\u0621 \u062e\u0627\u062f\u0645 API \u0645\u0639 \u062a\u062d\u0633\u064a\u0646 \u0627\u0644\u062a\u062c\u0645\u064a\u0639\nCOMPILE=1 docker compose --profile server up\n\n# \u0627\u0644\u0646\u0634\u0631 \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 CPU \u0641\u0642\u0637\nBACKEND=cpu docker compose --profile webui up\n</code></pre>"},{"location":"ar/install/#docker-compose_1","title":"\u0645\u062a\u063a\u064a\u0631\u0627\u062a \u0627\u0644\u0628\u064a\u0626\u0629 \u0644\u0640 Docker Compose","text":"<p>\u064a\u0645\u0643\u0646\u0643 \u062a\u062e\u0635\u064a\u0635 \u0627\u0644\u0646\u0634\u0631 \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 \u0645\u062a\u063a\u064a\u0631\u0627\u062a \u0627\u0644\u0628\u064a\u0626\u0629:</p> <pre><code># \u0645\u062b\u0627\u0644 \u0639\u0644\u0649 \u0645\u0644\u0641 .env\nBACKEND=cuda # \u0623\u0648 cpu\nCOMPILE=1 # \u062a\u0645\u0643\u064a\u0646 \u062a\u062d\u0633\u064a\u0646 \u0627\u0644\u062a\u062c\u0645\u064a\u0639\nGRADIO_PORT=7860 # \u0645\u0646\u0641\u0630 \u0648\u0627\u062c\u0647\u0629 \u0627\u0644\u0645\u0633\u062a\u062e\u062f\u0645 \u0627\u0644\u0631\u0633\u0648\u0645\u064a\u0629 (WebUI)\nAPI_PORT=8080 # \u0645\u0646\u0641\u0630 \u062e\u0627\u062f\u0645 API\nUV_VERSION=0.8.15 # \u0625\u0635\u062f\u0627\u0631 \u0645\u062f\u064a\u0631 \u0627\u0644\u062d\u0632\u0645 UV\n</code></pre> <p>\u0633\u064a\u0642\u0648\u0645 \u0627\u0644\u0623\u0645\u0631 \u0628\u0628\u0646\u0627\u0621 \u0627\u0644\u0635\u0648\u0631\u0629 \u0648\u062a\u0634\u063a\u064a\u0644 \u0627\u0644\u062d\u0627\u0648\u064a\u0629. 
\u064a\u0645\u0643\u0646\u0643 \u0627\u0644\u0648\u0635\u0648\u0644 \u0625\u0644\u0649 \u0648\u0627\u062c\u0647\u0629 \u0627\u0644\u0645\u0633\u062a\u062e\u062f\u0645 \u0627\u0644\u0631\u0633\u0648\u0645\u064a\u0629 (WebUI) \u0639\u0644\u0649 <code>http://localhost:7860</code> \u0648\u062e\u0627\u062f\u0645 API \u0639\u0644\u0649 <code>http://localhost:8080</code>.</p>"},{"location":"ar/install/#docker_1","title":"\u0627\u0644\u0628\u0646\u0627\u0621 \u0627\u0644\u064a\u062f\u0648\u064a \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 Docker","text":"<p>\u0644\u0644\u0645\u0633\u062a\u062e\u062f\u0645\u064a\u0646 \u0627\u0644\u0645\u062a\u0642\u062f\u0645\u064a\u0646 \u0627\u0644\u0630\u064a\u0646 \u064a\u0631\u063a\u0628\u0648\u0646 \u0641\u064a \u062a\u062e\u0635\u064a\u0635 \u0639\u0645\u0644\u064a\u0629 \u0627\u0644\u0628\u0646\u0627\u0621:</p> <pre><code># \u0628\u0646\u0627\u0621 \u0635\u0648\u0631\u0629 \u0648\u0627\u062c\u0647\u0629 \u0627\u0644\u0645\u0633\u062a\u062e\u062f\u0645 \u0627\u0644\u0631\u0633\u0648\u0645\u064a\u0629 (WebUI) \u0645\u0639 \u062f\u0639\u0645 CUDA\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.6.0 \\\n --build-arg UV_EXTRA=cu126 \\\n --target webui \\\n -t fish-speech-webui:cuda .\n\n# \u0628\u0646\u0627\u0621 \u0635\u0648\u0631\u0629 \u062e\u0627\u062f\u0645 API \u0645\u0639 \u062f\u0639\u0645 CUDA\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --build-arg CUDA_VER=12.6.0 \\\n --build-arg UV_EXTRA=cu126 \\\n --target server \\\n -t fish-speech-server:cuda .\n\n# \u0628\u0646\u0627\u0621 \u0635\u0648\u0631\u0629 CPU \u0641\u0642\u0637 (\u062a\u062f\u0639\u0645 \u0645\u0646\u0635\u0627\u062a \u0645\u062a\u0639\u062f\u062f\u0629)\ndocker build \\\n --platform linux/amd64,linux/arm64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cpu \\\n --target webui \\\n -t fish-speech-webui:cpu .\n\n# 
\u0628\u0646\u0627\u0621 \u0635\u0648\u0631\u0629 \u0627\u0644\u062a\u0637\u0648\u064a\u0631\ndocker build \\\n --platform linux/amd64 \\\n -f docker/Dockerfile \\\n --build-arg BACKEND=cuda \\\n --target dev \\\n -t fish-speech-dev:cuda .\n</code></pre>"},{"location":"ar/install/#_4","title":"\u0648\u0633\u064a\u0637\u0627\u062a \u0627\u0644\u0628\u0646\u0627\u0621","text":"<ul> <li><code>BACKEND</code>: <code>cuda</code> \u0623\u0648 <code>cpu</code> (\u0627\u0644\u0627\u0641\u062a\u0631\u0627\u0636\u064a: <code>cuda</code>)</li> <li><code>CUDA_VER</code>: \u0625\u0635\u062f\u0627\u0631 CUDA (\u0627\u0644\u0627\u0641\u062a\u0631\u0627\u0636\u064a: <code>12.6.0</code>)</li> <li><code>UV_EXTRA</code>: \u062d\u0632\u0645\u0629 UV \u0625\u0636\u0627\u0641\u064a\u0629 \u0644\u0640 CUDA (\u0627\u0644\u0627\u0641\u062a\u0631\u0627\u0636\u064a: <code>cu126</code>)</li> <li><code>UBUNTU_VER</code>: \u0625\u0635\u062f\u0627\u0631 Ubuntu (\u0627\u0644\u0627\u0641\u062a\u0631\u0627\u0636\u064a: <code>24.04</code>)</li> <li><code>PY_VER</code>: \u0625\u0635\u062f\u0627\u0631 Python (\u0627\u0644\u0627\u0641\u062a\u0631\u0627\u0636\u064a: <code>3.12</code>)</li> </ul>"},{"location":"ar/install/#_5","title":"\u062a\u062d\u0645\u064a\u0644 \u0627\u0644\u0645\u062c\u0644\u062f\u0627\u062a","text":"<p>\u062a\u062a\u0637\u0644\u0628 \u0643\u0644\u062a\u0627 \u0627\u0644\u0637\u0631\u064a\u0642\u062a\u064a\u0646 \u062a\u062d\u0645\u064a\u0644 \u0627\u0644\u0645\u062c\u0644\u062f\u0627\u062a \u0627\u0644\u062a\u0627\u0644\u064a\u0629:</p> <ul> <li><code>./checkpoints:/app/checkpoints</code> - \u0645\u062c\u0644\u062f \u0623\u0648\u0632\u0627\u0646 \u0627\u0644\u0646\u0645\u0648\u0630\u062c</li> <li><code>./references:/app/references</code> - \u0645\u062c\u0644\u062f \u0645\u0644\u0641\u0627\u062a \u0627\u0644\u0635\u0648\u062a \u0627\u0644\u0645\u0631\u062c\u0639\u064a\u0629</li> </ul>"},{"location":"ar/install/#_6","title":"\u0645\u062a\u063a\u064a\u0631\u0627\u062a 
\u0627\u0644\u0628\u064a\u0626\u0629","text":"<ul> <li><code>COMPILE=1</code> - \u062a\u0645\u0643\u064a\u0646 <code>torch.compile</code> \u0644\u062a\u0633\u0631\u064a\u0639 \u0627\u0644\u0627\u0633\u062a\u062f\u0644\u0627\u0644 (\u062d\u0648\u0627\u0644\u064a 10 \u0623\u0636\u0639\u0627\u0641)</li> <li><code>GRADIO_SERVER_NAME=0.0.0.0</code> - \u0645\u0636\u064a\u0641 \u062e\u0627\u062f\u0645 \u0648\u0627\u062c\u0647\u0629 \u0627\u0644\u0645\u0633\u062a\u062e\u062f\u0645 \u0627\u0644\u0631\u0633\u0648\u0645\u064a\u0629 (WebUI)</li> <li><code>GRADIO_SERVER_PORT=7860</code> - \u0645\u0646\u0641\u0630 \u062e\u0627\u062f\u0645 \u0648\u0627\u062c\u0647\u0629 \u0627\u0644\u0645\u0633\u062a\u062e\u062f\u0645 \u0627\u0644\u0631\u0633\u0648\u0645\u064a\u0629 (WebUI)</li> <li><code>API_SERVER_NAME=0.0.0.0</code> - \u0645\u0636\u064a\u0641 \u062e\u0627\u062f\u0645 API</li> <li><code>API_SERVER_PORT=8080</code> - \u0645\u0646\u0641\u0630 \u062e\u0627\u062f\u0645 API</li> </ul> <p>Note</p> <p>\u062a\u062a\u0648\u0642\u0639 \u062d\u0627\u0648\u064a\u0627\u062a Docker \u0623\u0646 \u064a\u062a\u0645 \u062a\u062d\u0645\u064a\u0644 \u0623\u0648\u0632\u0627\u0646 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0641\u064a <code>/app/checkpoints</code>. \u062a\u0623\u0643\u062f \u0645\u0646 \u062a\u0646\u0632\u064a\u0644 \u0623\u0648\u0632\u0627\u0646 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0627\u0644\u0645\u0637\u0644\u0648\u0628\u0629 \u0642\u0628\u0644 \u0628\u062f\u0621 \u0627\u0644\u062d\u0627\u0648\u064a\u0627\u062a.</p> <p>Warning</p> <p>\u064a\u062a\u0637\u0644\u0628 \u062f\u0639\u0645 GPU \u0648\u062c\u0648\u062f NVIDIA Docker runtime. \u0644\u0644\u0646\u0634\u0631 \u0628\u0627\u0633\u062a\u062e\u062f\u0627\u0645 CPU \u0641\u0642\u0637\u060c \u0642\u0645 \u0628\u0625\u0632\u0627\u0644\u0629 \u0639\u0644\u0627\u0645\u0629 <code>--gpus all</code> \u0648\u0627\u0633\u062a\u062e\u062f\u0645 \u0635\u0648\u0631 CPU.</p>"}]}
|