Explorar o código

S2 beta (#1164)

* Update to support S2 model.

* fix gradio webui bug.

* update docs and license for S2 Model.

* Fix torch.compile and DAC bugs.

* Fix LICENSE.

* fix pyproject.toml bug.

* [fix]:fix hf style ckpy load problem

* [fix]:fix docker and docs

---------

Co-authored-by: PoTaTo-Mika <1228427403@qq.com>
Co-authored-by: Whale-Dolphin <whaledolphin@fish.audio>
Leng Yue hai 1 mes
pai
achega
daa9b4f31c
Modificáronse 63 ficheiros con 2115 adicións e 3753 borrados
  1. 0 1
      .dockerignore
  2. 94 201
      LICENSE
  3. 30 121
      README.md
  4. 4 4
      docker/Dockerfile
  5. 35 126
      docs/README.ar.md
  6. 31 122
      docs/README.ja.md
  7. 30 121
      docs/README.ko.md
  8. 31 122
      docs/README.pt-BR.md
  9. 33 122
      docs/README.zh.md
  10. 90 131
      docs/ar/index.md
  11. 17 132
      docs/ar/inference.md
  12. 8 4
      docs/ar/install.md
  13. 0 80
      docs/ar/samples.md
  14. BIN=BIN
      docs/assets/Elo.jpg
  15. BIN=BIN
      docs/assets/Thumbnail.jpg
  16. BIN=BIN
      docs/assets/chattemplate.png
  17. BIN=BIN
      docs/assets/openaudio.jpg
  18. BIN=BIN
      docs/assets/openaudio.png
  19. BIN=BIN
      docs/assets/totalability.png
  20. 89 117
      docs/en/index.md
  21. 12 142
      docs/en/inference.md
  22. 8 4
      docs/en/install.md
  23. 0 80
      docs/en/samples.md
  24. 89 122
      docs/ja/index.md
  25. 19 132
      docs/ja/inference.md
  26. 8 4
      docs/ja/install.md
  27. 0 80
      docs/ja/samples.md
  28. 89 119
      docs/ko/index.md
  29. 16 129
      docs/ko/inference.md
  30. 8 4
      docs/ko/install.md
  31. 0 80
      docs/ko/samples.md
  32. 89 120
      docs/pt/index.md
  33. 14 127
      docs/pt/inference.md
  34. 8 4
      docs/pt/install.md
  35. 0 80
      docs/pt/samples.md
  36. 4 16
      docs/zh/finetune.md
  37. 90 121
      docs/zh/index.md
  38. 10 123
      docs/zh/inference.md
  39. 7 150
      docs/zh/install.md
  40. 0 80
      docs/zh/samples.md
  41. 2 2
      fish_speech/configs/modded_dac_vq.yaml
  42. 58 31
      fish_speech/content_sequence.py
  43. 174 0
      fish_speech/conversation.py
  44. 3 3
      fish_speech/i18n/locale/en_US.json
  45. 3 3
      fish_speech/i18n/locale/es_ES.json
  46. 3 3
      fish_speech/i18n/locale/ja_JP.json
  47. 3 3
      fish_speech/i18n/locale/ko_KR.json
  48. 3 4
      fish_speech/i18n/locale/pt_BR.json
  49. 3 3
      fish_speech/i18n/locale/zh_CN.json
  50. 1 7
      fish_speech/inference_engine/vq_manager.py
  51. 6 3
      fish_speech/models/dac/inference.py
  52. 89 26
      fish_speech/models/dac/modded_dac.py
  53. 4 8
      fish_speech/models/dac/rvq.py
  54. 447 170
      fish_speech/models/text2semantic/inference.py
  55. 167 81
      fish_speech/models/text2semantic/llama.py
  56. 69 138
      fish_speech/tokenizer.py
  57. 12 9
      pyproject.toml
  58. 0 55
      tools/download_models.py
  59. 2 2
      tools/run_webui.py
  60. 2 2
      tools/server/api_utils.py
  61. 1 1
      tools/vqgan/extract_vq.py
  62. 1 1
      tools/webui/variables.py
  63. 99 77
      uv.lock

+ 0 - 1
.dockerignore

@@ -139,7 +139,6 @@ azure-pipelines.yml
 
 # Large data files
 *.csv
-*.json
 *.jsonl
 *.parquet
 *.h5

+ 94 - 201
LICENSE

@@ -1,201 +1,94 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright 2024 Fish Audio
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
+# FISH AUDIO RESEARCH LICENSE AGREEMENT
+
+**Last Updated: March 7, 2026**
+
+## I. INTRODUCTION
+
+This Agreement applies to any individual person or entity ("You", "Your" or "Licensee") that uses or distributes any portion or element of the Fish Audio Materials or Derivative Works thereof for any Research, Non-Commercial, or Commercial purpose. Capitalized terms not otherwise defined herein are defined in Section V below.
+
+This Agreement is intended to allow research and non-commercial uses of the Materials free of charge. Any Commercial use of the Materials requires a separate license from Fish Audio.
+
+By clicking "I Accept" or by using, distributing, or accessing any portion or element of the Fish Audio Materials or Derivative Works, You agree that You have read, understood and are bound by the terms of this Agreement. If You are acting on behalf of a company, organization or other entity, then "You" includes you and that entity, and You agree that You: (i) are an authorized representative of such entity with the authority to bind such entity to this Agreement, and (ii) You agree to the terms of this Agreement on that entity's behalf.
+
+## II. RESEARCH & NON-COMMERCIAL USE LICENSE
+
+Subject to the terms of this Agreement, Fish Audio grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Fish Audio's intellectual property or other rights owned by Fish Audio embodied in the Fish Audio Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Fish Audio Materials for any Research or Non-Commercial Purpose.
+
+"Research Purpose" means academic or scientific advancement, and in each case, is not primarily intended for commercial advantage or monetary compensation to You or others.
+
+"Non-Commercial Purpose" means any purpose other than a Research Purpose that is not primarily intended for commercial advantage or monetary compensation to You or others, such as personal use (i.e., hobbyist) or evaluation and testing.
+
+## III. COMMERCIAL USE
+
+**Any use of the Fish Audio Materials or Derivative Works for a Commercial Purpose requires a separate written license agreement from Fish Audio.** No commercial rights are granted under this Agreement.
+
+"Commercial Purpose" means any purpose other than a Research Purpose or Non-Commercial Purpose that is primarily intended for or directed toward commercial advantage or monetary compensation to You or others, including but not limited to: (i) creating, modifying, or distributing Your product or service, including via a hosted service or application programming interface, (ii) Your business's or organization's internal operations, and (iii) any use in connection with a product or service for which You charge a fee or generate revenue, whether directly or indirectly.
+
+To obtain a commercial license, please contact Fish Audio at:
+
+- **Website:** [https://fish.audio](https://fish.audio)
+- **Email:** business@fish.audio
+
+## IV. GENERAL TERMS
+
+Your Research and Non-Commercial License under this Agreement is subject to the following terms.
+
+### a. Distribution & Attribution
+
+If You distribute or make available the Fish Audio Materials or a Derivative Work to a third party, or a product or service that uses any portion of them, You shall: (i) provide a copy of this Agreement to that third party, (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This model is licensed under the Fish Audio Research License, Copyright © 39 AI, INC. All Rights Reserved.", and (iii) prominently display "Built with Fish Audio" on a related website, user interface, blogpost, about page, or product documentation.
+
+If You create a Derivative Work, You may add your own attribution notice(s) to the "Notice" text file included with that Derivative Work, provided that You clearly indicate which attributions apply to the Fish Audio Materials and state in the "Notice" text file that You changed the Fish Audio Materials and how it was modified.
+
+### b. Use Restrictions
+
+Your use of the Fish Audio Materials and Derivative Works, including any output or results of the Fish Audio Materials or Derivative Works, must comply with applicable laws and regulations (including Trade Control Laws and equivalent regulations) and adhere to Fish Audio's Acceptable Use Policy, which is hereby incorporated by reference.
+
+Furthermore, You will not use the Fish Audio Materials or Derivative Works, or any output or results of the Fish Audio Materials or Derivative Works, to create or improve any foundational generative AI model (excluding the Models or Derivative Works).
+
+### c. Intellectual Property
+
+**(i) Trademark License.** No trademark licenses are granted under this Agreement, and in connection with the Fish Audio Materials or Derivative Works, You may not use any name or mark owned by or associated with Fish Audio or any of its Affiliates, except as required under Section IV(a) herein.
+
+**(ii) Ownership of Derivative Works.** As between You and Fish Audio, You are the owner of Derivative Works You create, subject to Fish Audio's ownership of the Fish Audio Materials and any Derivative Works made by or for Fish Audio.
+
+**(iii) Ownership of Outputs.** As between You and Fish Audio, You own any outputs generated from the Models or Derivative Works to the extent permitted by applicable law.
+
+**(iv) Disputes.** If You or Your Affiliate(s) institute litigation or other proceedings against Fish Audio (including a cross-claim or counterclaim in a lawsuit) alleging that the Fish Audio Materials, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by You, then any licenses granted to You under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Fish Audio from and against any claim by any third party arising out of or related to Your use or distribution of the Fish Audio Materials or Derivative Works in violation of this Agreement.
+
+**(v) Feedback.** From time to time, You may provide Fish Audio with verbal and/or written suggestions, comments or other feedback related to Fish Audio's existing or prospective technology, products or services (collectively, "Feedback"). You are not obligated to provide Fish Audio with Feedback, but to the extent that You do, You hereby grant Fish Audio a perpetual, irrevocable, royalty-free, fully-paid, sub-licensable, transferable, non-exclusive, worldwide right and license to exploit the Feedback in any manner without restriction. Your Feedback is provided "AS IS" and You make no warranties whatsoever about any Feedback.
+
+### d. Disclaimer of Warranty
+
+UNLESS REQUIRED BY APPLICABLE LAW, THE FISH AUDIO MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OR LAWFULNESS OF USING OR REDISTRIBUTING THE FISH AUDIO MATERIALS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE FISH AUDIO MATERIALS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS.
+
+### e. Limitation of Liability
+
+IN NO EVENT WILL FISH AUDIO OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF FISH AUDIO OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+
+### f. Term and Termination
+
+The term of this Agreement will commence upon Your acceptance of this Agreement or access to the Fish Audio Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Fish Audio may terminate this Agreement if You are in breach of any term or condition of this Agreement. Upon termination of this Agreement, You shall delete and cease use of any Fish Audio Materials or Derivative Works. Sections IV(d), (e), and (g) shall survive the termination of this Agreement.
+
+### g. Governing Law
+
+This Agreement will be governed by and construed in accordance with the laws of the United States and the State of California without regard to choice of law principles, and the UN Convention on Contracts for International Sale of Goods does not apply to this Agreement.
+
+## V. DEFINITIONS
+
+**"Affiliate(s)"** means any entity that directly or indirectly controls, is controlled by, or is under common control with the subject entity; for purposes of this definition, "control" means direct or indirect ownership or control of more than 50% of the voting interests of the subject entity.
+
+**"Agreement"** means this Fish Audio Research License Agreement.
+
+**"Derivative Work(s)"** means (a) any derivative work of the Fish Audio Materials as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model's output, including "fine tune" and "low-rank adaptation" models derived from a Model or a Model's output, but do not include the output of any Model.
+
+**"Documentation"** means any specifications, manuals, documentation, and other written information provided by Fish Audio related to the Software or Models.
+
+**"Fish Audio"** or **"we"** means 39 AI, INC. and its Affiliates.
+
+**"Model(s)"** means, collectively, Fish Audio's proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing.
+
+**"Software"** means Fish Audio's proprietary software made available under this Agreement now or in the future.
+
+**"Fish Audio Materials"** means, collectively, Fish Audio's proprietary Models, Software and Documentation (and any portion or combination thereof) made available under this Agreement.
+
+**"Trade Control Laws"** means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations.

+ 30 - 121
README.md

@@ -34,168 +34,77 @@
     <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
       <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
     </a>
-    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
-        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
-    </a>
-    <a target="_blank" href="https://huggingface.co/fishaudio/openaudio-s1-mini">
+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
         <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
     </a>
 </div>
 
 > [!IMPORTANT]
 > **License Notice**  
-> This codebase is released under **Apache License** and all model weights are released under **CC-BY-NC-SA-4.0 License**. Please refer to [LICENSE](LICENSE) for more details.
+> This codebase and its associated model weights are released under **[FISH AUDIO RESEARCH LICENSE](LICENSE)**. Please refer to [LICENSE](LICENSE) for more details.
 
 > [!WARNING]
 > **Legal Disclaimer**  
 > We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws.
 
-## FishAudio-S1  
-**True human-like Text-to-Speech and Voice Cloning**
+## Start Here
 
-FishAudio-S1 is an expressive text-to-speech (TTS) and voice cloning model developed by [Fish Audio](https://fish.audio/), designed to generate speech that sounds natural, realistic, and emotionally rich — not robotic, not flat, and not constrained to studio-style narration.
+Here are the official documents for Fish Speech, follow the instructions to get started easily.
 
-FishAudio-S1 focuses on how humans actually speak: with emotion, variation, pauses, and intent.
+- [Installation](https://speech.fish.audio/install/)
+- [Inference](https://speech.fish.audio/inference/)
 
-### Announcement 🎉
+## Fish Audio S2  
+**Best Text-to-speech system among both open source and closed source**
 
-We are excited to announce that we have rebranded to **Fish Audio** — introducing a revolutionary new series of advanced Text-to-Speech models that builds upon the foundation of Fish-Speech.
+Fish Audio S2 is the latest model developed by [Fish Audio](https://fish.audio/), designed to generate speech that sounds natural, realistic, and emotionally rich — not robotic, not flat, and not constrained to studio-style narration.
 
-We are proud to release **FishAudio-S1** (also known as OpenAudio S1) as the first model in this series, delivering significant improvements in quality, performance, and capabilities.
+Fish Audio S2 focuses on daily conversation and dialogue, which enables native multi-speaker and multi-turn generation. It also supports instruction control.
 
-FishAudio-S1 comes in two versions: **FishAudio-S1** and **FishAudio-S1-mini**. Both models are now available on [Fish Audio Playground](https://fish.audio) (for **FishAudio-S1**) and [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini) (for **FishAudio-S1-mini**).
+The S2 series contains several models; the open-sourced model is S2-Pro, which is the best model in the collection.
 
-Visit the [Fish Audio website](https://fish.audio/) for live playground tech report.
+Visit the [Fish Audio website](https://fish.audio/) for live playground.
 
 ### Model Variants
 
 | Model | Size | Availability | Description |
 |------|------|-------------|-------------|
-| FishAudio-S1 | 4B parameters | [fish.audio](https://fish.audio/) | Full-featured flagship model with maximum quality and stability |
-| FishAudio-S1-mini | 0.5B parameters | [huggingface](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) | Open-source distilled model with core capabilities |
+| S2-Pro | 4B parameters | [huggingface](https://huggingface.co/fishaudio/s2-pro) | Full-featured flagship model with maximum quality and stability |
+| S2-Flash | - - - - | [fish.audio](https://fish.audio/) | Our closed source model with faster speed and lower latency |
 
-Both S1 and S1-mini incorporate online Reinforcement Learning from Human Feedback (RLHF).
-
-### Start Here
-
-Here are the official documents for Fish Speech, follow the instructions to get started easily.
-
-- [Installation](https://speech.fish.audio/install/)
-- [Finetune](https://speech.fish.audio/finetune/)
-- [Inference](https://speech.fish.audio/inference/)
-- [Samples](https://speech.fish.audio/samples/)
+More details of the model can be found in the technical report.
 
 ## Highlights
 
-### **Excellent TTS quality**
-
-We use Seed TTS Eval Metrics to evaluate the model performance, and the results show that FishAudio S1 achieves **0.008 WER** and **0.004 CER** on English text, which is significantly better than previous models. (English, auto eval, based on OpenAI gpt-4o-transcribe, speaker distance using Revai/pyannote-wespeaker-voxceleb-resnet34-LM)
-
-| Model | Word Error Rate (WER) | Character Error Rate (CER) | Speaker Distance |
-|-------|----------------------|---------------------------|------------------|
-| **S1** | **0.008**  | **0.004**  | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
-
-
-### **Best Model in TTS-Arena2** 🏆
-
-FishAudio S1 has achieved the **#1 ranking** on [TTS-Arena2](https://arena.speechcolab.org/), the benchmark for text-to-speech evaluation:
-
-<div align="center">
-    <img src="docs/assets/Elo.jpg" alt="TTS-Arena2 Ranking" style="width: 75%;" />
-</div>
-
-### True Human-Like Speech
-
-FishAudio-S1 generates speech that sounds natural and conversational rather than robotic or overly polished. The model captures subtle variations in timing, emphasis, and prosody, avoiding the “studio recording” effect common in traditional TTS systems.
-
-### **Emotion Control and Expressiveness**
-
-FishAudio S1 is the first TTS model to support **open-domain fine-grained emotion control** through explicit emotion and tone markers. We can now precisely steer how a voice sounds:
-
-- **Basic emotions**:
-```
-(angry) (sad) (excited) (surprised) (satisfied) (delighted) 
-(scared) (worried) (upset) (nervous) (frustrated) (depressed)
-(empathetic) (embarrassed) (disgusted) (moved) (proud) (relaxed)
-(grateful) (confident) (interested) (curious) (confused) (joyful)
-```
-
-- **Advanced emotions**:
-```
-(disdainful) (unhappy) (anxious) (hysterical) (indifferent) 
-(impatient) (guilty) (scornful) (panicked) (furious) (reluctant)
-(keen) (disapproving) (negative) (denying) (astonished) (serious)
-(sarcastic) (conciliative) (comforting) (sincere) (sneering)
-(hesitating) (yielding) (painful) (awkward) (amused)
-```
+<img src="./docs/assets/totalability.png" width=200%>
 
-- **Tone markers**:
-```
-(in a hurry tone) (shouting) (screaming) (whispering) (soft tone)
-```
-
-- **Special audio effects**:
-```
-(laughing) (chuckling) (sobbing) (crying loudly) (sighing) (panting)
-(groaning) (crowd laughing) (background laughter) (audience laughing)
-```
+### Fine-Grained Inline Control via Natural Language
 
-You can also use Ha,ha,ha to control, there's many other cases waiting to be explored by yourself.
+Fish Audio S2 enables localized control over speech generation by embedding natural-language instructions directly at specific word or phrase positions within the text. Rather than relying on a fixed set of predefined tags, S2 accepts free-form textual descriptions — such as [whisper in small voice], [professional broadcast tone], or [pitch up] — allowing open-ended expression control at the word level.
 
 ### Multilingual Support
 
-FishAudio-S1 supports high-quality multilingual text-to-speech without requiring phonemes or language-specific preprocessing.
-
-**Languages supporting emotion markers include:**
-English, Chinese, Japanese, German, French, Spanish, Korean, Arabic, Russian, Dutch, Italian, Polish, and Portuguese.
-
-The list is constantly expanding, check [Fish Audio](https://fish.audio/) for the latest releases.
-
-### Rapid Voice Cloning
-
-FishAudio-S1 supports accurate voice cloning using a short reference sample (typically 10–30 seconds). The model captures timbre, speaking style, and emotional tendencies, producing realistic and consistent cloned voices without additional fine-tuning.
+Fish Audio S2 supports high-quality multilingual text-to-speech without requiring phonemes or language-specific preprocessing, including:
 
-## **Features**
+**English, Chinese, Japanese, Korean, Arabic, German, French...**
 
-1. **Zero-shot & Few-shot TTS:** Input a 10 to 30-second vocal sample to generate high-quality TTS output. **For detailed guidelines, see [Voice Cloning Best Practices](https://docs.fish.audio/resources/best-practices/voice-cloning).**
+**AND MORE!**
 
-2. **Multilingual & Cross-lingual Support:** Simply copy and paste multilingual text into the input box—no need to worry about the language. Currently supports English, Japanese, Korean, Chinese, French, German, Arabic, and Spanish.
-
-3. **No Phoneme Dependency:** The model has strong generalization capabilities and does not rely on phonemes for TTS. It can handle text in any language script.
-
-4. **Highly Accurate:** Achieves a low CER (Character Error Rate) of around 0.4% and WER (Word Error Rate) of around 0.8% for Seed-TTS Eval.
+The list is constantly expanding, check [Fish Audio](https://fish.audio/) for the latest releases.
 
-5. **Fast:** Accelerated by torch compile, the real-time factor is approximately 1:7 on an Nvidia RTX 4090 GPU.
+### Native multi-speaker generation
 
-6. **WebUI Inference:** Features an easy-to-use, Gradio-based web UI compatible with Chrome, Firefox, Edge, and other browsers.
+<img src="./docs/assets/chattemplate.png" width=200%>
 
-7. **Deploy-Friendly:** Easily set up an inference server with native support for Linux and Windows (macOS support coming soon), minimizing performance loss.
+Fish Audio S2 allows users to upload reference audio containing multiple speakers; the model handles each speaker's characteristics via the `<|speaker:i|>` token. You can then steer the output with the speaker ID tokens, allowing a single generation to include multiple speakers. You no longer need to upload reference audio separately for each speaker.
 
-## **Media & Demos**
+### Multi-turn generation
 
-<div align="center">
+Thanks to the model's expanded context, it can now use information from earlier turns to improve the expressiveness of subsequently generated speech, making the output sound more natural.
 
-### **Social Media**
-<a href="https://x.com/hehe6z/status/1980303682932744439" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-Latest_Demo-black?style=for-the-badge&logo=x&logoColor=white" alt="Latest Demo on X" />
-</a>
-
-### **Interactive Demos**
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish.Audio-Try_FishAudio_S1-blue?style=for-the-badge" alt="Try FishAudio S1" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-Use_S1_Mini-yellow?style=for-the-badge" alt="Try S1 Mini" />
-</a>
-
-### **Video Showcases**
-
-<a href="https://www.youtube.com/watch?v=WR1FY32Lhps" target="_blank">
-    <img src="docs/assets/Thumbnail.jpg" alt="FishAudio S1 Video" style="width: 50%;" />
-</a>
+### Rapid Voice Cloning
 
-</div>
+Fish Audio S2 supports accurate voice cloning using a short reference sample (typically 10–30 seconds). The model captures timbre, speaking style, and emotional tendencies, producing realistic and consistent cloned voices without additional fine-tuning.
 
 ---
 
@@ -209,7 +118,7 @@ FishAudio-S1 supports accurate voice cloning using a short reference sample (typ
 - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
 - [Qwen3](https://github.com/QwenLM/Qwen3)
 
-## Tech Report (V1.4)
+## Tech Report
 ```bibtex
 @misc{fish-speech-v1.4,
       title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},

+ 4 - 4
docker/Dockerfile

@@ -281,8 +281,8 @@ ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1
 
 ARG GRADIO_SERVER_NAME="0.0.0.0"
 ARG GRADIO_SERVER_PORT=7860
-ARG LLAMA_CHECKPOINT_PATH="checkpoints/openaudio-s1-mini"
-ARG DECODER_CHECKPOINT_PATH="checkpoints/openaudio-s1-mini/codec.pth"
+ARG LLAMA_CHECKPOINT_PATH="checkpoints/s2-pro"
+ARG DECODER_CHECKPOINT_PATH="checkpoints/s2-pro/codec.pth"
 ARG DECODER_CONFIG_NAME="modded_dac_vq"
 
 
@@ -330,8 +330,8 @@ ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1
 
 ARG API_SERVER_NAME="0.0.0.0"
 ARG API_SERVER_PORT=8080
-ARG LLAMA_CHECKPOINT_PATH="checkpoints/openaudio-s1-mini"
-ARG DECODER_CHECKPOINT_PATH="checkpoints/openaudio-s1-mini/codec.pth"
+ARG LLAMA_CHECKPOINT_PATH="checkpoints/s2-pro"
+ARG DECODER_CHECKPOINT_PATH="checkpoints/s2-pro/codec.pth"
 ARG DECODER_CONFIG_NAME="modded_dac_vq"
 
 # Expose port

+ 35 - 126
docs/README.ar.md

@@ -34,172 +34,81 @@
     <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
       <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
     </a>
-    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
-        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
-    </a>
-    <a target="_blank" href="https://huggingface.co/fishaudio/openaudio-s1-mini">
+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
         <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
     </a>
 </div>
 
 > [!IMPORTANT]
 > **إشعار الترخيص**
-> تم إصدار قاعدة الكود هذه بموجب **ترخيص Apache** وتم إصدار جميع أوزان النموذج بموجب **ترخيص CC-BY-NC-SA-4.0**. يرجى الرجوع إلى [LICENSE](../LICENSE) لمزيد من التفاصيل.
+> يتم إصدار هذا المشروع وأوزان النماذج المرتبطة به بموجب رخصة **[FISH AUDIO RESEARCH LICENSE](../LICENSE)**. يرجى الرجوع إلى [LICENSE](../LICENSE) لمزيد من التفاصيل.
 
 > [!WARNING]
-> **إخلاء المسؤولية القانوني**
-> نحن لا نتحمل أي مسؤولية عن أي استخدام غير قانوني لقاعدة الكود. يرجى الرجوع إلى القوانين المحلية الخاصة بك فيما يتعلق بقانون الألفية الجديدة لحقوق طبع ونشر المواد الرقمية والقوانين الأخرى ذات الصلة.
+> **إخلاء المسؤولية القانونية**
+> نحن لا نتحمل أي مسؤولية عن أي استخدام غير قانوني لهذا المشروع. يرجى الرجوع إلى القوانين المحلية المتعلقة بحقوق الطبع والنشر الرقمية (DMCA) والقوانين الأخرى ذات الصلة.
 
-## FishAudio-S1
-**تحويل النص إلى كلام واستنساخ الصوت بجودة تحاكي الإنسان**
+## ابدأ من هنا
 
-FishAudio-S1 هو نموذج معبر لتحويل النص إلى كلام (TTS) واستنساخ الصوت طورته [Fish Audio](https://fish.audio/)، مصمم لتوليد كلام يبدو طبيعيًا وواقعيًا وغنيًا عاطفيًا — ليس آليًا، وليس مسطحًا، وغير مقيد بأسلوب السرد الاستوديو.
+هذه الوثائق الرسمية لـ Fish Speech، اتبع التعليمات للبدء بسهولة.
 
-يركز FishAudio-S1 على كيفية تحدث البشر فعليًا: بعاطفة وتنوع وتوقفات ونية.
+- [التثبيت](https://speech.fish.audio/ar/install/)
+- [الاستنتاج](https://speech.fish.audio/ar/inference/)
 
-### إعلان 🎉
+## Fish Audio S2
+**أفضل نظام لتحويل النص إلى كلام بين الأنظمة مفتوحة المصدر ومغلقة المصدر**
 
-يسعدنا أن نعلن أننا قمنا بإعادة تسمية العلامة التجارية إلى **Fish Audio** — تقديم سلسلة جديدة ثورية من نماذج تحويل النص إلى كلام المتقدمة التي تبني على أساس Fish-Speech.
+Fish Audio S2 هو أحدث نموذج طورته [Fish Audio](https://fish.audio/)، صُمم لإنتاج كلام يبدو طبيعياً وواقعياً وغنياً بالعواطف — ليس آلياً، ولا مسطحاً، ولا يقتصر على أسلوب السرد في الاستوديوهات.
 
-نحن فخورون بإصدار **FishAudio-S1** (المعروف أيضًا باسم OpenAudio S1) كنموذج أول في هذه السلسلة، حيث يوفر تحسينات كبيرة في الجودة والأداء والقدرات.
+يركز Fish Audio S2 على المحادثات والحوارات اليومية، مما يتيح توليد أصوات لمتحدثين متعددين وجلسات حوارية متعددة الأدوار بشكل أصلي. كما يدعم التحكم عبر التعليمات.
 
-يأتي FishAudio-S1 في نسختين: **FishAudio-S1** و **FishAudio-S1-mini**. كلا النموذجين متاحان الآن على [Fish Audio Playground](https://fish.audio) (لـ **FishAudio-S1**) و [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini) (لـ **FishAudio-S1-mini**).
+تحتوي سلسلة S2 على نماذج متعددة، النموذج مفتوح المصدر هو S2-Pro، وهو الأفضل في المجموعة.
 
-قم بزيارة [موقع Fish Audio](https://fish.audio/) للـ playground المباشر والتقرير التقني.
+تفضل بزيارة [موقع Fish Audio](https://fish.audio/) لتجربة مباشرة.
 
-### متغيرات النموذج
+### إصدارات النموذج
 
 | النموذج | الحجم | التوفر | الوصف |
 |------|------|-------------|-------------|
-| FishAudio-S1 | 4B معامل | [fish.audio](https://fish.audio/) | النموذج الرئيسي كامل الميزات مع أقصى جودة واستقرار |
-| FishAudio-S1-mini | 0.5B معامل | [huggingface](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) | نموذج مقطر مفتوح المصدر بالقدرات الأساسية |
-
-كلا النموذجين S1 و S1-mini يتضمنان التعلم المعزز من التغذية الراجعة البشرية (RLHF) عبر الإنترنت.
-
-### ابدأ هنا
+| S2-Pro | 4B معايير | [huggingface](https://huggingface.co/fishaudio/s2-pro) | نموذج رائد كامل الميزات بأعلى جودة واستقرار |
+| S2-Flash | - - - - | [fish.audio](https://fish.audio/) | نموذجنا مغلق المصدر بسرعة أكبر وتأخير أقل |
 
-فيما يلي المستندات الرسمية لـ Fish Speech، اتبع التعليمات للبدء بسهولة.
-
-- [التثبيت](https://speech.fish.audio/install/)
-- [الضبط الدقيق](https://speech.fish.audio/finetune/)
-- [الاستدلال](https://speech.fish.audio/inference/)
-- [العينات](https://speech.fish.audio/samples/)
+يمكن العثور على مزيد من التفاصيل حول النموذج في التقرير التقني.
 
 ## أبرز المميزات
 
-### **جودة TTS ممتازة**
-
-نستخدم مقاييس تقييم Seed TTS لتقييم أداء النموذج، وتظهر النتائج أن FishAudio S1 يحقق **0.008 WER** و **0.004 CER** على النص الإنجليزي، وهو أفضل بشكل ملحوظ من النماذج السابقة. (الإنجليزية، التقييم التلقائي، بناءً على OpenAI gpt-4o-transcribe، مسافة المتحدث باستخدام Revai/pyannote-wespeaker-voxceleb-resnet34-LM)
-
-| النموذج | معدل الخطأ في الكلمات (WER) | معدل الخطأ في الأحرف (CER) | مسافة المتحدث |
-|-------|----------------------|---------------------------|------------------|
-| **S1** | **0.008**  | **0.004**  | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
-
-
-### **أفضل نموذج في TTS-Arena2** 🏆
-
-حقق FishAudio S1 **المركز الأول** على [TTS-Arena2](https://arena.speechcolab.org/)، المعيار لتقييم تحويل النص إلى كلام:
-
-<div align="center">
-    <img src="../docs/assets/Elo.jpg" alt="TTS-Arena2 Ranking" style="width: 75%;" />
-</div>
-
-### كلام يشبه الإنسان حقًا
-
-يولد FishAudio-S1 كلامًا يبدو طبيعيًا ومحادثيًا بدلاً من الآلي أو المصقول بشكل مفرط. يلتقط النموذج التنوعات الدقيقة في التوقيت والتأكيد والنبرة، متجنبًا تأثير "التسجيل الاستوديو" الشائع في أنظمة TTS التقليدية.
-
-### **التحكم في العواطف والتعبيرية**
-
-FishAudio S1 هو أول نموذج TTS يدعم **التحكم الدقيق في العواطف في المجال المفتوح** من خلال علامات العاطفة والنبرة الصريحة. يمكننا الآن توجيه كيفية صوت الصوت بدقة:
-
-- **العواطف الأساسية**:
-```
-(غاضب) (حزين) (متحمس) (مندهش) (راضي) (مسرور)
-(خائف) (قلق) (منزعج) (متوتر) (محبط) (مكتئب)
-(متعاطف) (محرج) (مشمئز) (متحرك) (فخور) (مرتاح)
-(ممتن) (واثق) (مهتم) (فضولي) (مرتبك) (مبتهج)
-```
-
-- **العواطف المتقدمة**:
-```
-(محتقر) (غير سعيد) (قلق) (هستيري) (غير مبال)
-(غير صبور) (مذنب) (ساخر) (ذعر) (غاضب) (متردد)
-(متحمس) (غير موافق) (سلبي) (نافي) (مندهش) (جاد)
-(ساخر) (مصالح) (مريح) (صادق) (ساخر)
-(متردد) (مستسلم) (مؤلم) (محرج) (مسلي)
-```
-
-- **علامات النبرة**:
-```
-(بنبرة مستعجلة) (يصرخ) (يصرخ) (يهمهم) (بنبرة ناعمة)
-```
+<img src="./assets/totalability.png" width=200%>
 
-- **تأثيرات صوتية خاصة**:
-```
-(يضحك) (يقهقه) (ينتحب) (يبكي بصوت عال) (يتنهد) (يلهث)
-(يئن) (ضحك الجمهور) (ضحك في الخلفية) (ضحك الجمهور)
-```
+### تحكم مضمّن دقيق عبر اللغة الطبيعية
 
 يمكنك أيضًا استخدام Ha,ha,ha للتحكم، وهناك العديد من الحالات الأخرى التي تنتظر استكشافها بنفسك.
+يتيح Fish Audio S2 تحكمًا موضعيًا في توليد الكلام من خلال تضمين تعليمات باللغة الطبيعية مباشرة عند مواقع كلمات أو عبارات محددة داخل النص. وبدلًا من الاعتماد على مجموعة ثابتة من الوسوم المُعرّفة مسبقًا، يقبل S2 أوصافًا نصية حرة مثل [whisper in small voice] أو [professional broadcast tone] أو [pitch up]، مما يتيح تحكمًا مفتوحًا في التعبير على مستوى الكلمة.
 
-### دعم متعدد اللغات
+### دعم لغات متعددة
 
-يدعم FishAudio-S1 تحويل النص إلى كلام متعدد اللغات عالي الجودة دون الحاجة إلى الفونيمات أو المعالجة المسبقة الخاصة باللغة.
+يدعم Fish Audio S2 تحويل النص إلى كلام بجودة عالية ولغات متعددة دون الحاجة إلى رموز صوتية أو معالجة مسبقة خاصة بكل لغة، بما في ذلك:
 
-**اللغات التي تدعم علامات العاطفة تشمل:**
-الإنجليزية، الصينية، اليابانية، الألمانية، الفرنسية، الإسبانية، الكورية، العربية، الروسية، الهولندية، الإيطالية، البولندية، والبرتغالية.
+**الإنجليزية، الصينية، اليابانية، الكورية، العربية، الألمانية، الفرنسية...**
 
-القائمة في توسع مستمر، تحقق من [Fish Audio](https://fish.audio/) لأحدث الإصدارات.
+**وأكثر من ذلك بكثير!**
 
-### استنساخ الصوت السريع
+القائمة في توسع مستمر، تحقق من [Fish Audio](https://fish.audio/) لمعرفة أحدث الإصدارات.
 
-يدعم FishAudio-S1 استنساخ الصوت الدقيق باستخدام عينة مرجعية قصيرة (عادة 10-30 ثانية). يلتقط النموذج الجرس وأسلوب الكلام والميول العاطفية، مما ينتج أصواتًا مستنسخة واقعية ومتسقة دون ضبط دقيق إضافي.
+### توليد أصلي لمتحدثين متعددين
 
-## **الميزات**
+<img src="./assets/chattemplate.png" width=200%>
 
-1. **TTS بدون عينات وقليل العينات:** أدخل عينة صوتية مدتها 10 إلى 30 ثانية لتوليد مخرجات TTS عالية الجودة. **للحصول على إرشادات مفصلة، راجع [أفضل ممارسات استنساخ الصوت](https://docs.fish.audio/resources/best-practices/voice-cloning).**
+يسمح Fish Audio S2 للمستخدمين برفع صوت مرجعي يحتوي على متحدثين متعددين، وسيتعامل النموذج مع ميزات كل متحدث عبر رمز `<|speaker:i|>`. يمكنك بعد ذلك التحكم في أداء النموذج باستخدام رمز معرف المتحدث، مما يسمح بتوليد واحد يتضمن متحدثين متعددين. لم تعد بحاجة لرفع ملفات مرجعية منفصلة لكل متحدث.
 
-2. **الدعم متعدد اللغات وعبر اللغات:** ما عليك سوى نسخ ولصق النص متعدد اللغات في مربع الإدخال — لا داعي للقلق بشأن اللغة. يدعم حاليًا الإنجليزية واليابانية والكورية والصينية والفرنسية والألمانية والعربية والإسبانية.
+### توليد حوارات متعددة الأدوار
 
-3. **لا يعتمد على الفونيمات:** يتمتع النموذج بقدرات تعميم قوية ولا يعتمد على الفونيمات لـ TTS. يمكنه التعامل مع النص بأي لغة نصية.
+بفضل توسيع سياق النموذج، يمكن لنموذجنا الآن استخدام المعلومات السابقة لتحسين التعبير في المحتوى المولد لاحقاً، مما يزيد من طبيعية المحتوى.
 
-4. **دقيق للغاية:** يحقق معدل خطأ في الأحرف (CER) حوالي 0.4٪ ومعدل خطأ في الكلمات (WER) حوالي 0.8٪ لـ Seed-TTS Eval.
+### استنساخ صوت سريع
 
-5. **سريع:** مع التسريع بواسطة torch compile، فإن عامل الوقت الحقيقي هو حوالي 1:7 على بطاقة Nvidia RTX 4090 GPU.
-
-6. **استدلال WebUI:** يتميز بواجهة ويب سهلة الاستخدام تعتمد على Gradio متوافقة مع Chrome و Firefox و Edge والمتصفحات الأخرى.
-
-7. **سهولة النشر:** يمكنك إعداد خادم استدلال بسهولة مع دعم أصلي لأنظمة Linux و Windows (دعم macOS قريبًا)، مما يقلل من فقدان الأداء.
-
-## **وسائل الإعلام والعروض التوضيحية**
-
-<div align="center">
-
-### **وسائل التواصل الاجتماعي**
-<a href="https://x.com/hehe6z/status/1980303682932744439" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-Latest_Demo-black?style=for-the-badge&logo=x&logoColor=white" alt="أحدث عرض توضيحي على X" />
-</a>
-
-### **العروض التوضيحية التفاعلية**
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish.Audio-Try_FishAudio_S1-blue?style=for-the-badge" alt="جرب FishAudio S1" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-Use_S1_Mini-yellow?style=for-the-badge" alt="استخدم S1 Mini" />
-</a>
-
-### **عروض الفيديو**
-
-<a href="https://www.youtube.com/watch?v=WR1FY32Lhps" target="_blank">
-    <img src="../docs/assets/Thumbnail.jpg" alt="فيديو FishAudio S1" style="width: 50%;" />
-</a>
-
-</div>
+يدعم Fish Audio S2 استنساخ الصوت بدقة باستخدام عينة مرجعية قصيرة (عادةً 10-30 ثانية). يلتقط النموذج نبرة الصوت، وأسلوب التحدث، والميول العاطفية، مما ينتج أصواتاً مستنسخة واقعية ومتسقة دون الحاجة إلى ضبط دقيق إضافي.
 
 ---
 
-## الاعتمادات
+## شكر وتقدير
 
 - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
 - [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
@@ -209,7 +118,7 @@ FishAudio S1 هو أول نموذج TTS يدعم **التحكم الدقيق ف
 - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
 - [Qwen3](https://github.com/QwenLM/Qwen3)
 
-## التقرير التقني (V1.4)
+## التقرير التقني
 ```bibtex
 @misc{fish-speech-v1.4,
       title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},

+ 31 - 122
docs/README.ja.md

@@ -34,168 +34,77 @@
     <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
       <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
     </a>
-    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
-        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
-    </a>
-    <a target="_blank" href="https://huggingface.co/fishaudio/openaudio-s1-mini">
+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
         <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
     </a>
 </div>
 
 > [!IMPORTANT]
 > **ライセンス注意事項**
-> このコードベースは**Apache License**の下でリリースされ、すべてのモデルウェイトは**CC-BY-NC-SA-4.0 License**の下でリリースされています。詳細については[LICENSE](../LICENSE)をご参照ください。
+> このコードベースおよび関連するモデルウェイトは **[FISH AUDIO RESEARCH LICENSE](../LICENSE)** の下でリリースされています。詳細については [LICENSE](../LICENSE) をご参照ください。
 
 > [!WARNING]
 > **法的免責事項**
-> 私たちはコードベースの不法な使用について一切の責任を負いません。DMCA及びその他の関連法律について、現地の法律をご参照ください。
+> 私たちはコードベースの不法な使用について一切の責任を負いません。DMCA 及びその他の関連法律について、現地の法律をご参照ください。
 
-## FishAudio-S1
-**人間のように自然な音声合成と音声クローニング**
+## ここから始める
 
-FishAudio-S1は、[Fish Audio](https://fish.audio/)が開発した表現力豊かなtext-to-speech (TTS) と音声クローニングモデルです。自然で、リアルで、感情豊かな音声を生成するように設計されています——ロボット的でなく、平坦でなく、スタジオ風のナレーションに制限されません
+こちらは Fish Speech の公式ドキュメントです。手順に従って簡単に始めることができます
 
-FishAudio-S1は、人間が実際に話す方法に焦点を当てています:感情、変化、間、意図を持って。
+- [インストール](https://speech.fish.audio/ja/install/)
+- [推論](https://speech.fish.audio/ja/inference/)
 
-### 発表 🎉
+## Fish Audio S2
+**オープンソースおよびクローズドソースの中で最も優れたテキスト読み上げシステム**
 
-**Fish Audio**へのリブランドを発表できることを嬉しく思います。Fish-Speechの基盤を元に構築された、革新的な新しい高度Text-to-Speechモデルシリーズを紹介します
+Fish Audio S2 は、[Fish Audio](https://fish.audio/) によって開発された最新のモデルで、自然でリアル、かつ感情豊かな音声を生成するように設計されています。ロボット的ではなく、平坦でもなく、スタジオスタイルのナレーションに限定されません
 
-このシリーズの最初のモデルとして**FishAudio-S1**(OpenAudio S1としても知られる)をリリースできることを誇りに思います。品質、性能、機能において大幅な改善を実現しました
+Fish Audio S2 は日常の会話に焦点を当てており、ネイティブなマルチスピーカーおよびマルチターンの生成をサポートしています。また、命令制御もサポートしています
 
-FishAudio-S1には2つのバージョンがあります:**FishAudio-S1**と**FishAudio-S1-mini**。両モデルとも[Fish Audio Playground](https://fish.audio)(**FishAudio-S1**用)と[Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini)(**FishAudio-S1-mini**用)で利用可能です。
+S2 シリーズには複数のモデルが含まれており、オープンソースモデルは S2-Pro で、このシリーズの中で最もパフォーマンスの高いモデルです。
 
-ライブplaygroundと技術レポートについては[Fish Audioウェブサイト](https://fish.audio/)をご覧ください。
+リアルタイムのエクスペリエンスについては、[Fish Audio Webサイト](https://fish.audio/) にアクセスしてください。
 
 ### モデルバリアント
 
 | モデル | サイズ | 利用可能性 | 説明 |
 |------|------|-------------|-------------|
-| FishAudio-S1 | 4Bパラメータ | [fish.audio](https://fish.audio/) | 最高品質と安定性を備えたフル機能のフラッグシップモデル |
-| FishAudio-S1-mini | 0.5Bパラメータ | [huggingface](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) | コア機能を持つオープンソース蒸留モデル |
+| S2-Pro | 4B パラメータ | [huggingface](https://huggingface.co/fishaudio/s2-pro) | 最高の品質と安定性を備えた、フル機能のフラッグシップモデル |
+| S2-Flash | - - - - | [fish.audio](https://fish.audio/) | より高速で低遅延なクローズドソースモデル |
 
-S1とS1-miniの両方がオンライン人間フィードバック強化学習(RLHF)を組み込んでいます。
-
-### はじめに
-
-こちらは Fish Speech の公式ドキュメントです。手順に従って簡単に始めることができます。
-
-- [インストール](https://speech.fish.audio/ja/install/)
-- [ファインチューニング](https://speech.fish.audio/ja/finetune/)
-- [推論](https://speech.fish.audio/ja/inference/)
-- [サンプル](https://speech.fish.audio/samples/)
+モデルの詳細については、技術レポートを参照してください。
 
 ## ハイライト
 
-### **優秀なTTS品質**
-
-Seed TTS Eval Metricsを使用してモデル性能を評価した結果、FishAudio S1は英語テキストで**0.008 WER**と**0.004 CER**を達成し、これは従来のモデルより大幅に優れています。(英語、自動評価、OpenAI gpt-4o-transcribeベース、Revai/pyannote-wespeaker-voxceleb-resnet34-LMを使用した話者距離)
-
-| モデル | 単語誤り率 (WER) | 文字誤り率 (CER) | 話者距離 |
-|-------|------------------|------------------|----------|
-| **S1** | **0.008** | **0.004** | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
-
-
-### **TTS-Arena2でのベストモデル** 🏆
-
-FishAudio S1は、テキスト音声変換評価のベンチマークである[TTS-Arena2](https://arena.speechcolab.org/)で**1位**を獲得しました:
-
-<div align="center">
-    <img src="assets/Elo.jpg" alt="TTS-Arena2 Ranking" style="width: 75%;" />
-</div>
-
-### 真に人間らしい音声
-
-FishAudio-S1は、ロボット的または過度に洗練されたものではなく、自然で会話的な音声を生成します。モデルはタイミング、強調、韻律の微妙な変化を捉え、従来のTTSシステムに共通する「スタジオ録音」効果を回避します。
-
-### **感情制御と表現力**
-
-FishAudio S1は、明示的な感情とトーンマーカーを通じて**オープンドメインの細粒度感情制御**をサポートする最初のTTSモデルです。音声の響き方を正確に制御できるようになりました:
-
-- **基本感情**:
-```
-(怒った) (悲しい) (興奮した) (驚いた) (満足した) (喜んだ)
-(恐れた) (心配した) (動揺した) (緊張した) (イライラした) (憂鬱な)
-(共感的な) (恥ずかしい) (嫌悪した) (感動した) (誇らしい) (リラックスした)
-(感謝する) (自信のある) (興味のある) (好奇心のある) (混乱した) (喜びに満ちた)
-```
-
-- **高度な感情**:
-```
-(軽蔑的な) (不幸な) (不安な) (ヒステリックな) (無関心な)
-(せっかちな) (罪悪感のある) (軽蔑した) (パニックした) (激怒した) (しぶしぶの)
-(熱心な) (不賛成の) (否定的な) (否認する) (驚愕した) (真剣な)
-(皮肉な) (宥める) (慰める) (誠実な) (冷笑する)
-(躊躇する) (屈服する) (苦痛な) (気まずい) (面白がる)
-```
+<img src="./assets/totalability.png" width=200%>
 
-- **トーンマーカー**:
-```
-(急いだトーン) (叫ぶ) (悲鳴) (囁く) (柔らかいトーン)
-```
+### 自然言語による細粒度インライン制御
 
-- **特別な音響効果**:
-```
-(笑う) (くすくす笑う) (すすり泣く) (大声で泣く) (ため息) (息切れ)
-(うめく) (群衆の笑い声) (背景の笑い声) (聴衆の笑い声)
-```
-
-また、「ハ、ハ、ハ」を使って制御することもでき、あなた自身が探索できる多くの他のケースがあります。
+Fish Audio S2 では、テキスト内の特定の単語やフレーズ位置に自然言語の指示を直接埋め込むことで、音声生成を局所的に制御できます。固定の事前定義タグに依存するのではなく、S2 は [whisper in small voice]、[professional broadcast tone]、[pitch up] のような自由形式のテキスト記述を受け付け、単語レベルで表現をオープンエンドに制御できます。
 
 ### 多言語サポート
 
-FishAudio-S1は、音素や言語固有の前処理を必要とせずに、高品質な多言語text-to-speechをサポートしています。
-
-**感情マーカーをサポートする言語:**
-英語、中国語、日本語、ドイツ語、フランス語、スペイン語、韓国語、アラビア語、ロシア語、オランダ語、イタリア語、ポーランド語、ポルトガル語。
-
-リストは常に拡大しています。最新リリースについては[Fish Audio](https://fish.audio/)をご確認ください。
-
-### 高速音声クローニング
-
-FishAudio-S1は、短い参照サンプル(通常10〜30秒)を使用した正確な音声クローニングをサポートしています。モデルは音色、話し方、感情傾向を捉え、追加のファインチューニングなしでリアルで一貫したクローン音声を生成します。
+Fish Audio S2 は、音素や言語固有の前処理を必要とせずに、高品質な多言語テキスト読み上げをサポートします。以下を含みます:
 
-## **機能**
+**英語、中国語、日本語、韓国語、アラビア語、ドイツ語、フランス語...**
 
-1. **ゼロショット・少数ショットTTS:** 10〜30秒の音声サンプルを入力して高品質のTTS出力を生成します。**詳細なガイドラインについては、[Voice Cloning Best Practices](https://docs.fish.audio/resources/best-practices/voice-cloning)をご覧ください。**
+**さらに多く!**
 
-2. **多言語・言語横断サポート:** 多言語テキストを入力ボックスにコピー&ペーストするだけで、言語を気にする必要はありません。現在、英語、日本語、韓国語、中国語、フランス語、ドイツ語、アラビア語、スペイン語をサポートしています
+リストは常に拡大しています。最新のリリースについては [Fish Audio](https://fish.audio/) を確認してください。
 
-3. **音素依存なし:** モデルは強い汎化能力を持ち、TTSに音素に依存しません。どの言語の文字体系のテキストも処理できます。
+### ネイティブなマルチスピーカー生成
 
-4. **高精度:** Seed-TTS Evalで約0.4%の低いCER(文字誤り率)と約0.8%のWER(単語誤り率)を達成します。
+<img src="./assets/chattemplate.png" width=200%>
 
-5. **高速:** torch compileによる加速により、Nvidia RTX 4090 GPUで約1:7のリアルタイム係数を実現します
+Fish Audio S2 では、ユーザーが複数のスピーカーを含む参照オーディオをアップロードでき、モデルは `<|speaker:i|>` トークンを介して各スピーカーの特徴を処理します。その後、スピーカーIDトークンを使用してモデルのパフォーマンスを制御し、1回の生成で複数のスピーカーを含めることができます。以前のように各スピーカーに対して個別に参照オーディオをアップロードして音声を生成する必要はもうありません。
 
-6. **WebUI推論:** 使いやすいGradioベースのWeb UIを搭載し、Chrome、Firefox、Edgeなどのブラウザと互換性があります。
+### マルチターン対話生成
 
-7. **デプロイフレンドリー:** Linux と Windows をネイティブサポートし(macOS サポートも近日対応予定)、パフォーマンスの低下を最小限に抑えながら、推論サーバーを簡単にセットアップできます
+モデルのコンテキストの拡張により、以前の情報を使用して後続の生成されたコンテンツの表現力を向上させ、コンテンツの自然さを高めることができるようになりました。
 
-## **メディア・デモ**
-
-<div align="center">
-
-### **ソーシャルメディア**
-<a href="https://x.com/hehe6z/status/1980303682932744439" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-Latest_Demo-black?style=for-the-badge&logo=x&logoColor=white" alt="Latest Demo on X" />
-</a>
-
-### **インタラクティブデモ**
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish.Audio-Try_FishAudio_S1-blue?style=for-the-badge" alt="Try FishAudio S1" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-Use_S1_Mini-yellow?style=for-the-badge" alt="Use S1 Mini" />
-</a>
-
-### **ビデオショーケース**
-
-<a href="https://www.youtube.com/watch?v=WR1FY32Lhps" target="_blank">
-    <img src="assets/Thumbnail.jpg" alt="FishAudio S1 Video" style="width: 50%;" />
-</a>
+### 高速音声クローニング
 
-</div>
+Fish Audio S2 は、短い参照サンプル(通常10〜30秒)を使用した正確な音声クローニングをサポートしています。モデルは音色、話し方、感情的な傾向を捉え、追加の微調整なしでリアルで一貫したクローン音声を生成します。
 
 ---
 
@@ -209,7 +118,7 @@ FishAudio-S1は、短い参照サンプル(通常10〜30秒)を使用した
 - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
 - [Qwen3](https://github.com/QwenLM/Qwen3)
 
-## 技術レポート (V1.4)
+## 技術レポート
 ```bibtex
 @misc{fish-speech-v1.4,
       title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},

+ 30 - 121
docs/README.ko.md

@@ -34,168 +34,77 @@
     <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
       <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
     </a>
-    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
-        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
-    </a>
-    <a target="_blank" href="https://huggingface.co/fishaudio/openaudio-s1-mini">
+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
         <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
     </a>
 </div>
 
 > [!IMPORTANT]
 > **라이선스 고지사항**
-> 이 코드베이스는 **Apache License** 하에 릴리스되며, 모든 모델 가중치는 **CC-BY-NC-SA-4.0 License** 하에 릴리스됩니다. 자세한 내용은 [LICENSE](../LICENSE)를 참조하세요.
+> 이 코드베이스 및 관련 모델 가중치는 **[FISH AUDIO RESEARCH LICENSE](../LICENSE)** 하에 릴리스됩니다. 자세한 내용은 [LICENSE](../LICENSE)를 참조하세요.
 
 > [!WARNING]
 > **법적 면책조항**
 > 저희는 코드베이스의 불법적인 사용에 대해 어떠한 책임도 지지 않습니다. DMCA 및 기타 관련 법률에 대한 현지 법률을 참조하세요.
 
-## FishAudio-S1
-**사람처럼 자연스러운 음성 합성과 음성 복제**
+## 여기서 시작하세요
 
-FishAudio-S1은 [Fish Audio](https://fish.audio/)가 개발한 표현력 있는 text-to-speech (TTS) 및 음성 복제 모델입니다. 자연스럽고, 사실적이며, 감정이 풍부한 음성을 생성하도록 설계되었습니다 — 로봇같지 않고, 평평하지 않으며, 스튜디오 스타일 나레이션에 제한되지 않습니다.
+여기는 Fish Speech의 공식 문서입니다. 지침을 따라 쉽게 시작하세요.
 
-FishAudio-S1은 인간이 실제로 말하는 방식에 초점을 맞춥니다: 감정, 변화, 휴지, 의도를 가지고.
+- [설치](https://speech.fish.audio/ko/install/)
+- [추론](https://speech.fish.audio/ko/inference/)
 
-### 발표 🎉
+## Fish Audio S2
+**오픈 소스와 클로즈드 소스 모두에서 가장 뛰어난 텍스트 음성 변환 시스템**
 
-**Fish Audio**로의 리브랜딩을 발표하게 되어 기쁩니다. Fish-Speech의 기반 위에 구축된 혁신적인 새로운 고급 Text-to-Speech 모델 시리즈를 소개합니다.
+Fish Audio S2는 [Fish Audio](https://fish.audio/)가 개발한 최신 모델로, 자연스럽고 사실적이며 감정적으로 풍부한 음성을 생성하도록 설계되었습니다. 로봇 같지 않고, 평평하지 않으며, 스튜디오 스타일의 내레이션에 제한되지 않습니다.
 
-이 시리즈의 첫 번째 모델인 **FishAudio-S1** (OpenAudio S1으로도 알려짐)을 출시하게 되어 자랑스럽습니다. 품질, 성능, 기능에서 상당한 개선을 제공합니다.
+Fish Audio S2는 일상적인 대화에 집중하여 네이티브 멀티 화자 및 멀티 턴 생성을 가능하게 합니다. 또한 명령 제어도 지원합니다.
 
-FishAudio-S1은 두 가지 버전으로 제공됩니다: **FishAudio-S1**과 **FishAudio-S1-mini**. 두 모델 모두 [Fish Audio Playground](https://fish.audio)(**FishAudio-S1**용)와 [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini)(**FishAudio-S1-mini**용)에서 사용할 수 있습니다.
+S2 시리즈에는 여러 모델이 포함되어 있으며, 오픈 소스 모델은 S2-Pro로 컬렉션 중 최고의 모델입니다.
 
-라이브 playground와 기술 보고서는 [Fish Audio 웹사이트](https://fish.audio/)를 방문하세요.
+실시간 체험을 위해 [Fish Audio 웹사이트](https://fish.audio/)를 방문하세요.
 
 ### 모델 변형
 
 | 모델 | 크기 | 가용성 | 설명 |
 |------|------|-------------|-------------|
-| FishAudio-S1 | 4B 매개변수 | [fish.audio](https://fish.audio/) | 최고 품질과 안정성을 갖춘 전체 기능 플래그십 모델 |
-| FishAudio-S1-mini | 0.5B 매개변수 | [huggingface](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) | 핵심 기능을 갖춘 오픈소스 증류 모델 |
+| S2-Pro | 4B 매개변수 | [huggingface](https://huggingface.co/fishaudio/s2-pro) | 최고의 품질과 안정성을 갖춘 전체 기능 플래그십 모델 |
+| S2-Flash | - - - - | [fish.audio](https://fish.audio/) | 더 빠른 속도와 더 낮은 지연 시간을 가진 클로즈드 소스 모델 |
 
-S1과 S1-mini 모두 온라인 인간 피드백 강화학습(RLHF)을 통합하고 있습니다.
-
-### 시작하기
-
-여기는 Fish Speech의 공식 문서입니다. 지침을 따라 쉽게 시작하세요.
-
-- [설치](https://speech.fish.audio/ko/install/)
-- [파인튜닝](https://speech.fish.audio/ko/finetune/)
-- [추론](https://speech.fish.audio/ko/inference/)
-- [샘플](https://speech.fish.audio/samples/)
+모델에 대한 자세한 내용은 기술 보고서를 참조하십시오.
 
 ## 주요 특징
 
-### **뛰어난 TTS 품질**
-
-우리는 Seed TTS Eval Metrics를 사용하여 모델 성능을 평가했으며, 결과에 따르면 FishAudio S1은 영어 텍스트에서 **0.008 WER**과 **0.004 CER**을 달성하여 이전 모델들보다 상당히 우수한 성능을 보입니다. (영어, 자동 평가, OpenAI gpt-4o-transcribe 기반, Revai/pyannote-wespeaker-voxceleb-resnet34-LM을 사용한 화자 거리)
-
-| 모델 | 단어 오류율 (WER) | 문자 오류율 (CER) | 화자 거리 |
-|-------|----------------------|---------------------------|------------------|
-| **S1** | **0.008**  | **0.004**  | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
-
-
-### **TTS-Arena2 최고 모델** 🏆
-
-FishAudio S1은 텍스트 음성 변환 평가의 벤치마크인 [TTS-Arena2](https://arena.speechcolab.org/)에서 **1위**를 달성했습니다:
-
-<div align="center">
-    <img src="../docs/assets/Elo.jpg" alt="TTS-Arena2 순위" style="width: 75%;" />
-</div>
-
-### 진정한 인간다운 음성
-
-FishAudio-S1은 로봇같거나 과도하게 다듬어진 것이 아닌, 자연스럽고 대화적인 음성을 생성합니다. 모델은 타이밍, 강조, 운율의 미묘한 변화를 포착하여 전통적인 TTS 시스템에서 흔한 "스튜디오 녹음" 효과를 피합니다.
-
-### **감정 제어 및 표현력**
-
-FishAudio S1은 명시적 감정 및 톤 마커를 통해 **오픈 도메인 세밀한 감정 제어**를 지원하는 최초의 TTS 모델입니다. 이제 음성이 어떻게 들릴지 정확하게 제어할 수 있습니다:
-
-- **기본 감정**:
-```
-(화난) (슬픈) (흥분한) (놀란) (만족한) (기쁜)
-(무서워하는) (걱정하는) (속상한) (긴장한) (좌절한) (우울한)
-(공감하는) (당황한) (역겨워하는) (감동한) (자랑스러운) (편안한)
-(감사하는) (자신있는) (관심있는) (호기심있는) (혼란스러운) (즐거운)
-```
-
-- **고급 감정**:
-```
-(경멸하는) (불행한) (불안한) (히스테리한) (무관심한)
-(조급한) (죄책감있는) (냉소적인) (공황상태인) (분노한) (마지못한)
-(열성적인) (반대하는) (부정적인) (부인하는) (놀란) (진지한)
-(비꼬는) (달래는) (위로하는) (진심인) (비웃는)
-(망설이는) (굴복하는) (고통스러운) (어색한) (재미있어하는)
-```
+<img src="./assets/totalability.png" width=200%>
 
-- **톤 마커**:
-```
-(급한 톤) (외치기) (비명지르기) (속삭이기) (부드러운 톤)
-```
-
-- **특별한 오디오 효과**:
-```
-(웃음) (킥킥거림) (흐느낌) (큰 소리로 우는 것) (한숨) (헐떡거림)
-(신음) (군중 웃음) (배경 웃음) (관객 웃음)
-```
+### 자연어 기반 세밀한 인라인 제어
 
-또한 **하, 하, 하**를 사용하여 제어할 수도 있으며, 여러분이 직접 탐험할 수 있는 많은 다른 경우들이 있습니다.
+Fish Audio S2는 텍스트의 특정 단어 또는 구문 위치에 자연어 지시를 직접 삽입해 음성 생성을 국소적으로 제어할 수 있습니다. 고정된 사전 정의 태그에 의존하는 대신, S2는 [whisper in small voice], [professional broadcast tone], [pitch up] 같은 자유 형식 텍스트 설명을 받아 단어 수준의 개방형 표현 제어를 지원합니다.
 
 ### 다국어 지원
 
-FishAudio-S1은 음소나 언어별 전처리 없이 고품질 다국어 text-to-speech를 지원합니다.
-
-**감정 마커를 지원하는 언어:**
-영어, 중국어, 일본어, 독일어, 프랑스어, 스페인어, 한국어, 아랍어, 러시아어, 네덜란드어, 이탈리아어, 폴란드어, 포르투갈어.
-
-목록은 계속 확장되고 있습니다. 최신 릴리스는 [Fish Audio](https://fish.audio/)를 확인하세요.
-
-### 빠른 음성 복제
-
-FishAudio-S1은 짧은 참조 샘플(일반적으로 10-30초)을 사용한 정확한 음성 복제를 지원합니다. 모델은 음색, 말하기 스타일, 감정 성향을 포착하여 추가 파인튜닝 없이 사실적이고 일관된 복제 음성을 생성합니다.
+Fish Audio S2는 음소나 언어별 전처리 없이 고품질 다국어 텍스트 음성 변환을 지원합니다. 포함 사항:
 
-## **기능**
+**영어, 중국어, 일본어, 한국어, 아랍어, 독일어, 프랑스어...**
 
-1. **제로샷 및 퓨샷 TTS:** 10~30초의 음성 샘플을 입력하여 고품질 TTS 출력을 생성합니다. **자세한 가이드라인은 [음성 복제 모범 사례](https://docs.fish.audio/resources/best-practices/voice-cloning)를 참조하세요.**
+**그리고 더 많이!**
 
-2. **다국어 및 교차 언어 지원:** 다국어 텍스트를 입력 상자에 복사하여 붙여넣기만 하면 됩니다. 언어를 걱정할 필요가 없습니다. 현재 영어, 일본어, 한국어, 중국어, 프랑스어, 독일어, 아랍어, 스페인어를 지원합니다.
-
-3. **음소 의존성 없음:** 모델은 강력한 일반화 능력을 가지고 있으며 TTS를 위해 음소에 의존하지 않습니다. 모든 언어 스크립트의 텍스트를 처리할 수 있습니다.
-
-4. **높은 정확도:** Seed-TTS Eval에서 약 0.4%의 낮은 CER(문자 오류율)과 약 0.8%의 WER(단어 오류율)을 달성합니다.
+목록은 계속 확장되고 있습니다. 최신 릴리스는 [Fish Audio](https://fish.audio/)를 확인하세요.
 
-5. **빠른 속도:** torch compile로 가속화되어 Nvidia RTX 4090 GPU에서 실시간 팩터가 약 1:7입니다.
+### 네이티브 멀티 화자 생성
 
-6. **WebUI 추론:** 사용하기 쉬운 Gradio 기반 웹 UI를 제공하며 Chrome, Firefox, Edge 등 다른 브라우저와 호환됩니다.
+<img src="./assets/chattemplate.png" width=200%>
 
-7. **배포 용이성:** Linux 및 Windows를 기본 지원하며(macOS 지원 예정), 성능 손실을 최소화하면서 추론 서버를 쉽게 설정할 수 있습니다.
+Fish Audio S2는 사용자가 여러 화자가 포함된 참조 오디오를 업로드할 수 있도록 하며, 모델은 `<|speaker:i|>` 토큰을 통해 각 화자의 특징을 처리합니다. 그런 다음 화자 ID 토큰으로 모델의 성능을 제어하여 한 번의 생성으로 여러 화자를 포함할 수 있습니다. 이전처럼 각 화자마다 별도로 참조 오디오를 업로드하고 음성을 생성할 필요가 없습니다.
 
-## **미디어 및 데모**
+### 멀티 턴 대화 생성
 
-<div align="center">
+모델 컨텍스트의 확장 덕분에 이제 이전 정보를 활용하여 후속 생성 콘텐츠의 표현력을 높이고 콘텐츠의 자연스러움을 향상시킬 수 있습니다.
 
-### **소셜 미디어**
-<a href="https://x.com/hehe6z/status/1980303682932744439" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-Latest_Demo-black?style=for-the-badge&logo=x&logoColor=white" alt="X에서 최신 데모" />
-</a>
-
-### **인터랙티브 데모**
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish.Audio-Try_FishAudio_S1-blue?style=for-the-badge" alt="FishAudio S1 체험하기" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-Use_S1_Mini-yellow?style=for-the-badge" alt="S1 Mini 사용하기" />
-</a>
-
-### **비디오 쇼케이스**
-
-<a href="https://www.youtube.com/watch?v=WR1FY32Lhps" target="_blank">
-    <img src="../docs/assets/Thumbnail.jpg" alt="FishAudio S1 Video" style="width: 50%;" />
-</a>
+### 빠른 음성 복제
 
-</div>
+Fish Audio S2는 짧은 참조 샘플(일반적으로 10-30초)을 사용하여 정확한 음성 복제를 지원합니다. 모델은 음색, 말하기 스타일 및 감정적 경향을 캡처하여 추가 미세 조정 없이 사실적이고 일관된 복제 음성을 생성합니다.
 
 ---
 
@@ -209,7 +118,7 @@ FishAudio-S1은 짧은 참조 샘플(일반적으로 10-30초)을 사용한 정
 - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
 - [Qwen3](https://github.com/QwenLM/Qwen3)
 
-## 기술 보고서 (V1.4)
+## 기술 보고서
 ```bibtex
 @misc{fish-speech-v1.4,
       title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},

+ 31 - 122
docs/README.pt-BR.md

@@ -34,168 +34,77 @@
     <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
       <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
     </a>
-    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
-        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
-    </a>
-    <a target="_blank" href="https://huggingface.co/fishaudio/openaudio-s1-mini">
+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
         <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
     </a>
 </div>
 
 > [!IMPORTANT]
 > **Aviso de Licença**
-> Esta base de código é lançada sob a **Licença Apache** e todos os pesos dos modelos são lançados sob a **Licença CC-BY-NC-SA-4.0**. Consulte [LICENSE](../LICENSE) para mais detalhes.
+> Este repositório e os pesos de modelo associados são lançados sob a **[FISH AUDIO RESEARCH LICENSE](../LICENSE)**. Consulte [LICENSE](../LICENSE) para obter mais detalhes.
 
 > [!WARNING]
 > **Isenção de Responsabilidade Legal**
-> Não assumimos qualquer responsabilidade pelo uso ilegal da base de código. Consulte as leis locais sobre DMCA e outras leis relacionadas.
+> Não nos responsabilizamos por qualquer uso ilegal do repositório. Consulte as leis locais sobre DMCA e outras leis relacionadas.
 
-## FishAudio-S1
-**Síntese de Voz e Clonagem de Voz com Qualidade Humana**
+## Comece Aqui
 
-FishAudio-S1 é um modelo expressivo de text-to-speech (TTS) e clonagem de voz desenvolvido pela [Fish Audio](https://fish.audio/), projetado para gerar fala que soa natural, realista e emocionalmente rica — não robótica, não plana e não restrita à narração estilo estúdio.
+Aqui estão os documentos oficiais do Fish Speech, siga as instruções para começar facilmente.
 
-FishAudio-S1 foca em como os humanos realmente falam: com emoção, variação, pausas e intenção.
+- [Instalação](https://speech.fish.audio/pt/install/)
+- [Inferência](https://speech.fish.audio/pt/inference/)
 
-### Anúncio 🎉
+## Fish Audio S2
+**O melhor sistema de conversão de texto em fala entre código aberto e código fechado**
 
-Estamos animados em anunciar que mudamos nossa marca para **Fish Audio** — introduzindo uma nova série revolucionária de modelos avançados de Text-to-Speech que se baseia na fundação do Fish-Speech.
+O Fish Audio S2 é o modelo mais recente desenvolvido pela [Fish Audio](https://fish.audio/), projetado para gerar falas que soam naturais, realistas e emocionalmente ricas — não robóticas, não monótonas e não limitadas à narração em estilo de estúdio.
 
-Temos o orgulho de lançar o **FishAudio-S1** (também conhecido como OpenAudio S1) como o primeiro modelo desta série, oferecendo melhorias significativas em qualidade, desempenho e capacidades.
+O Fish Audio S2 foca em conversas diárias e diálogos, o que permite a geração nativa de múltiplos falantes e turnos. Também suporta controle por instrução.
 
-O FishAudio-S1 vem em duas versões: **FishAudio-S1** e **FishAudio-S1-mini**. Ambos os modelos estão agora disponíveis no [Fish Audio Playground](https://fish.audio) (para **FishAudio-S1**) e [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini) (para **FishAudio-S1-mini**).
+A série S2 contém vários modelos, o modelo de código aberto é o S2-Pro, que é o melhor modelo da coleção.
 
-Visite o [site Fish Audio](https://fish.audio/) para playground ao vivo e relatório técnico.
+Visite o [site da Fish Audio](https://fish.audio/) para um playground ao vivo.
 
 ### Variantes do Modelo
 
 | Modelo | Tamanho | Disponibilidade | Descrição |
 |------|------|-------------|-------------|
-| FishAudio-S1 | 4B parâmetros | [fish.audio](https://fish.audio/) | Modelo flagship com recursos completos, máxima qualidade e estabilidade |
-| FishAudio-S1-mini | 0.5B parâmetros | [huggingface](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) | Modelo destilado open-source com capacidades principais |
-
-Tanto S1 quanto S1-mini incorporam Aprendizado por Reforço online com Feedback Humano (RLHF).
-
-### Comece Aqui
-
-Aqui estão os documentos oficiais do Fish Speech, siga as instruções para começar facilmente.
+| S2-Pro | 4B parâmetros | [huggingface](https://huggingface.co/fishaudio/s2-pro) | Modelo carro-chefe completo com máxima qualidade e estabilidade |
+| S2-Flash | - - - - | [fish.audio](https://fish.audio/) | Nosso modelo de código fechado com maior velocidade e menor latência |
 
-- [Instalação](https://speech.fish.audio/install/)
-- [Fine-tune](https://speech.fish.audio/finetune/)
-- [Inferência](https://speech.fish.audio/inference/)
-- [Amostras](https://speech.fish.audio/samples/)
+Mais detalhes do modelo podem ser encontrados no relatório técnico.
 
 ## Destaques
 
-### **Excelente qualidade TTS**
-
-Usamos as métricas de avaliação Seed TTS para avaliar o desempenho do modelo, e os resultados mostram que o FishAudio S1 alcança **0.008 WER** e **0.004 CER** em texto em inglês, que é significativamente melhor que modelos anteriores. (Inglês, avaliação automática, baseada no OpenAI gpt-4o-transcribe, distância do locutor usando Revai/pyannote-wespeaker-voxceleb-resnet34-LM)
-
-| Modelo | Taxa de Erro de Palavra (WER) | Taxa de Erro de Caractere (CER) | Distância do Locutor |
-|-------|----------------------|---------------------------|------------------|
-| **S1** | **0.008**  | **0.004**  | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
-
-
-### **Melhor Modelo no TTS-Arena2** 🏆
-
-O FishAudio S1 alcançou a **classificação #1** no [TTS-Arena2](https://arena.speechcolab.org/), o benchmark para avaliação de text-to-speech:
-
-<div align="center">
-    <img src="assets/Elo.jpg" alt="Classificação TTS-Arena2" style="width: 75%;" />
-</div>
-
-### Fala Verdadeiramente Humana
-
-FishAudio-S1 gera fala que soa natural e conversacional, em vez de robótica ou excessivamente polida. O modelo captura variações sutis em tempo, ênfase e prosódia, evitando o efeito "gravação de estúdio" comum em sistemas TTS tradicionais.
-
-### **Controle de Emoção e Expressividade**
-
-FishAudio S1 é o primeiro modelo TTS a suportar **controle de emoção refinado em domínio aberto** através de marcadores explícitos de emoção e tom. Agora podemos direcionar precisamente como uma voz soa:
-
-- **Emoções básicas**:
-```
-(raivoso) (triste) (animado) (surpreso) (satisfeito) (encantado)
-(assustado) (preocupado) (chateado) (nervoso) (frustrado) (deprimido)
-(empático) (envergonhado) (enojado) (emocionado) (orgulhoso) (relaxado)
-(grato) (confiante) (interessado) (curioso) (confuso) (alegre)
-```
-
-- **Emoções avançadas**:
-```
-(desdenhoso) (infeliz) (ansioso) (histérico) (indiferente)
-(impaciente) (culpado) (desprezível) (em pânico) (furioso) (relutante)
-(entusiasmado) (desaprovador) (negativo) (negando) (espantado) (sério)
-(sarcástico) (conciliador) (consolador) (sincero) (escarnecedor)
-(hesitante) (cedendo) (doloroso) (constrangido) (divertido)
-```
-
-- **Marcadores de tom**:
-```
-(tom apressado) (gritando) (gritando alto) (sussurrando) (tom suave)
-```
+<img src="./assets/totalability.png" width=200%>
 
-- **Efeitos de áudio especiais**:
-```
-(rindo) (dando risinhos) (soluçando) (chorando alto) (suspirando) (ofegando)
-(gemendo) (risos da multidão) (risos de fundo) (risos da audiência)
-```
+### Controle Inline Refinado via Linguagem Natural
 
-Você também pode usar Ha,ha,ha para controlar, há muitos outros casos esperando para serem explorados por você mesmo.
+O Fish Audio S2 permite controle localizado da geração de fala ao incorporar instruções em linguagem natural diretamente em posições específicas de palavras ou frases no texto. Em vez de depender de um conjunto fixo de tags predefinidas, o S2 aceita descrições textuais livres, como [whisper in small voice], [professional broadcast tone] ou [pitch up], permitindo controle de expressão aberto no nível da palavra.
 
 ### Suporte Multilíngue
 
-FishAudio-S1 suporta text-to-speech multilíngue de alta qualidade sem exigir fonemas ou pré-processamento específico de idioma.
-
-**Idiomas que suportam marcadores de emoção incluem:**
-Inglês, Chinês, Japonês, Alemão, Francês, Espanhol, Coreano, Árabe, Russo, Holandês, Italiano, Polonês e Português.
-
-A lista está em constante expansão, verifique [Fish Audio](https://fish.audio/) para os últimos lançamentos.
-
-### Clonagem de Voz Rápida
-
-FishAudio-S1 suporta clonagem de voz precisa usando uma amostra de referência curta (tipicamente 10-30 segundos). O modelo captura timbre, estilo de fala e tendências emocionais, produzindo vozes clonadas realistas e consistentes sem ajuste fino adicional.
-
-## **Recursos**
+O Fish Audio S2 oferece suporte a conversão de texto em fala multilíngue de alta qualidade sem a necessidade de fonemas ou processamento específico de idioma. Incluindo:
 
-1. **TTS Zero-shot e Few-shot:** Insira uma amostra vocal de 10 a 30 segundos para gerar saída TTS de alta qualidade. **Para diretrizes detalhadas, veja [Melhores Práticas de Clonagem de Voz](https://docs.fish.audio/resources/best-practices/voice-cloning).**
+**Inglês, Chinês, Japonês, Coreano, Árabe, Alemão, Francês...**
 
-2. **Suporte Multilíngue e Cross-lingual:** Simplesmente copie e cole texto multilíngue na caixa de entrada—não precisa se preocupar com o idioma. Atualmente suporta inglês, japonês, coreano, chinês, francês, alemão, árabe e espanhol.
+**E MUITO MAIS!**
 
-3. **Sem Dependência de Fonema:** O modelo tem fortes capacidades de generalização e não depende de fonemas para TTS. Pode lidar com texto em qualquer script de idioma.
+A lista está em constante expansão, verifique o [Fish Audio](https://fish.audio/) para os lançamentos mais recentes.
 
-4. **Altamente Preciso:** Alcança um baixo CER (Taxa de Erro de Caractere) de cerca de 0.4% e WER (Taxa de Erro de Palavra) de cerca de 0.8% para Seed-TTS Eval.
+### Geração Nativa de Múltiplos Falantes
 
-5. **Rápido:** Com aceleração por torch compile, o fator de tempo real é aproximadamente 1:7 em uma GPU Nvidia RTX 4090.
+<img src="./assets/chattemplate.png" width=200%>
 
-6. **Inferência via WebUI:** Apresenta uma interface de usuário baseada em Gradio, fácil de usar e compatível com Chrome, Firefox, Edge e outros navegadores.
+O Fish Audio S2 permite que os usuários carreguem áudio de referência com vários falantes; o modelo lidará com as características de cada falante por meio do token `<|speaker:i|>`. Então, você pode controlar o desempenho do modelo com o token de ID do falante, permitindo que uma única geração inclua vários falantes. Você não precisa mais carregar áudios de referência separadamente para cada falante.
 
-7. **Amigável para Implantação:** Configure facilmente um servidor de inferência com suporte nativo para Linux e Windows (suporte para macOS em breve), minimizando a perda de desempenho.
+### Geração de Múltiplos Turnos
 
-## **Mídia e Demos**
+Graças à extensão do contexto do modelo, nosso modelo agora pode usar informações anteriores para melhorar a expressividade e a naturalidade dos conteúdos gerados subsequentemente.
 
-<div align="center">
-
-### **Mídia Social**
-<a href="https://x.com/hehe6z/status/1980303682932744439" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-Latest_Demo-black?style=for-the-badge&logo=x&logoColor=white" alt="Demo Mais Recente no X" />
-</a>
-
-### **Demos Interativos**
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish.Audio-Try_FishAudio_S1-blue?style=for-the-badge" alt="Experimente FishAudio S1" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-Use_S1_Mini-yellow?style=for-the-badge" alt="Use S1 Mini" />
-</a>
-
-### **Vitrines de Vídeo**
-
-<a href="https://www.youtube.com/watch?v=WR1FY32Lhps" target="_blank">
-    <img src="assets/Thumbnail.jpg" alt="FishAudio S1 Video" style="width: 50%;" />
-</a>
+### Clonagem de Voz Rápida
 
-</div>
+O Fish Audio S2 suporta clonagem de voz precisa usando uma pequena amostra de referência (tipicamente de 10 a 30 segundos). O modelo captura o timbre, o estilo de fala e as tendências emocionais, produzindo vozes clonadas realistas e consistentes sem ajuste fino adicional.
 
 ---
 
@@ -209,7 +118,7 @@ FishAudio-S1 suporta clonagem de voz precisa usando uma amostra de referência c
 - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
 - [Qwen3](https://github.com/QwenLM/Qwen3)
 
-## Relatório Técnico (V1.4)
+## Relatório Técnico
 ```bibtex
 @misc{fish-speech-v1.4,
       title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},

+ 33 - 122
docs/README.zh.md

@@ -34,168 +34,78 @@
     <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
       <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
     </a>
-    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
-        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
-    </a>
-    <a target="_blank" href="https://huggingface.co/fishaudio/openaudio-s1-mini">
+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
         <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
     </a>
 </div>
 
 > [!IMPORTANT]
 > **许可证声明**
-> 此代码库在 **Apache License** 下发布,所有模型权重在 **CC-BY-NC-SA-4.0 License** 下发布。更多详情请参考 [LICENSE](../LICENSE)。
+> 此代码库及其相关的模型权重均在 **[FISH AUDIO RESEARCH LICENSE](../LICENSE)** 下发布。更多详情请参考 [LICENSE](../LICENSE)。
+
 
 > [!WARNING]
 > **法律免责声明**
 > 我们不对代码库的任何非法使用承担责任。请参考您当地关于 DMCA 和其他相关法律的法规。
 
-## FishAudio-S1
-**真人级语音生成 & 声音克隆**
+## 从这里开始
 
-FishAudio-S1 是由 [Fish Audio](https://fish.audio/) 开发的富有表现力的文本转语音 (TTS) 和语音克隆模型,旨在生成听起来自然、真实且情感丰富的语音——不机械、不平淡,也不局限于录音室风格的朗读
+这里是 Fish Speech 的官方文档,请按照说明轻松入门
 
-FishAudio-S1 专注于人类真实的说话方式:带有情感、变化、停顿和意图。
+- [安装](https://speech.fish.audio/zh/install/)
+- [推理](https://speech.fish.audio/zh/inference/)
 
-### 公告 🎉
+## Fish Audio S2
+**开源和闭源中最出色的文本转语音系统**
 
-我们很高兴地宣布,我们已将品牌重塑为 **Fish Audio** —— 推出基于 Fish-Speech 基础构建的革命性新一代高级文本转语音模型系列
+Fish Audio S2 是由 [Fish Audio](https://fish.audio/) 开发的最新模型,旨在生成听起来自然、真实且情感丰富的语音——不机械、不平淡,也不局限于录音室风格的朗读
 
-我们自豪地发布 **FishAudio-S1**(也称为 OpenAudio S1)作为该系列的第一个模型,在质量、性能和功能方面都有显著改进
+Fish Audio S2 专注于日常对话,支持原生多说话人和多轮生成。同时支持指令控制
 
-FishAudio-S1 提供两个版本:**FishAudio-S1** 和 **FishAudio-S1-mini**。两个模型现在都可以在 [Fish Audio Playground](https://fish.audio)(**FishAudio-S1**)和 [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini)(**FishAudio-S1-mini**)上使用
+S2 系列包含多个模型,开源模型为 S2-Pro,是该系列中性能最强的模型
 
-请访问 [Fish Audio 网站](https://fish.audio/) 获取实时 playground 和技术报告
+请访问 [Fish Audio 网站](https://fish.audio/) 以获取实时体验
 
-### 模型版本
+### 模型变体
 
 | 模型 | 大小 | 可用性 | 描述 |
 |------|------|-------------|-------------|
-| FishAudio-S1 | 4B 参数 | [fish.audio](https://fish.audio/) | 功能齐全的旗舰模型,具有最高质量和稳定性 |
-| FishAudio-S1-mini | 0.5B 参数 | [huggingface](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) | 开源精简模型,具有核心功能 |
+| S2-Pro | 4B 参数 | [huggingface](https://huggingface.co/fishaudio/s2-pro) | 功能齐全的旗舰模型,具有最高质量和稳定性 |
+| S2-Flash | - - - - | [fish.audio](https://fish.audio/) | 我们的闭源模型,具有更快的速度和更低的延迟 |
 
-S1 和 S1-mini 都集成了在线人类反馈强化学习(RLHF)。
-
-### 开始使用
-
-这里是 Fish Speech 的官方文档,按照说明轻松开始使用。
-
-- [安装](https://speech.fish.audio/zh/install/)
-- [微调](https://speech.fish.audio/zh/finetune/)
-- [推理](https://speech.fish.audio/zh/inference/)
-- [示例](https://speech.fish.audio/samples/)
+有关模型的更多详情,请参见技术报告。
 
 ## 亮点
 
-### **出色的 TTS 质量**
-
-我们使用 Seed TTS 评估指标来评估模型性能,结果显示 FishAudio S1 在英语文本上达到了 **0.008 WER** 和 **0.004 CER**,这比以前的模型显著更好。(英语,自动评估,基于 OpenAI gpt-4o-transcribe,使用 Revai/pyannote-wespeaker-voxceleb-resnet34-LM 进行说话人距离计算)
-
-| 模型 | 词错误率 (WER) | 字符错误率 (CER) | 说话人距离 |
-|-------|----------------------|---------------------------|------------------|
-| **S1** | **0.008**  | **0.004**  | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
-
-
-### **TTS-Arena2 最佳模型** 🏆
-
-FishAudio S1 在 [TTS-Arena2](https://arena.speechcolab.org/) 上取得了 **第一名**,这是文本转语音评估的基准:
-
-<div align="center">
-    <img src="../docs/assets/Elo.jpg" alt="TTS-Arena2 排名" style="width: 75%;" />
-</div>
-
-### 真正类人的语音
-
-FishAudio-S1 生成的语音听起来自然且具有对话感,而不是机械或过度修饰。模型捕捉了时间、重音和韵律的细微变化,避免了传统 TTS 系统常见的"录音室录音"效果。
-
-### **情感控制与表现力**
-
-FishAudio S1 是首个支持通过显式情感和语调标记进行**开放领域细粒度情感控制**的 TTS 模型。我们现在可以精确控制语音的情感表达:
-
-- **基础情感**:
-```
-(生气) (伤心) (兴奋) (惊讶) (满意) (高兴)
-(害怕) (担心) (沮丧) (紧张) (挫败) (郁闷)
-(同情) (尴尬) (厌恶) (感动) (自豪) (放松)
-(感激) (自信) (感兴趣) (好奇) (困惑) (快乐)
-```
-
-- **高级情感**:
-```
-(鄙视) (不开心) (焦虑) (歇斯底里) (冷漠)
-(不耐烦) (内疚) (轻蔑) (恐慌) (愤怒) (不情愿)
-(热衷) (不赞成) (消极) (否认) (震惊) (严肃)
-(讽刺) (安抚) (安慰) (真诚) (冷笑)
-(犹豫) (屈服) (痛苦) (尴尬) (觉得有趣)
-```
+<img src="./assets/totalability.png" width=200%>
 
-- **语调标记**:
-```
-(急促的语调) (喊叫) (尖叫) (耳语) (柔和的语调)
-```
-
-- **特殊音频效果**:
-```
-(笑声) (轻笑) (抽泣) (大声哭泣) (叹息) (喘息)
-(呻吟) (人群笑声) (背景笑声) (观众笑声)
-```
+### 通过自然语言进行细粒度行内控制
 
-您也可以使用 哈,哈,哈 来控制,还有许多其他情况等待您自己探索
+Fish Audio S2 支持在文本中的特定词或短语位置直接嵌入自然语言指令,从而对语音生成进行局部控制。与依赖固定预设标签不同,S2 接受自由形式的文本描述,例如 [whisper in small voice]、[professional broadcast tone] 或 [pitch up],实现词级别的开放式表达控制。
 
 ### 多语言支持
 
-FishAudio-S1 支持高质量的多语言文本转语音,无需音素或语言特定的预处理。
-
-**支持情感标记的语言包括:**
-英语、中文、日语、德语、法语、西班牙语、韩语、阿拉伯语、俄语、荷兰语、意大利语、波兰语和葡萄牙语。
-
-语言列表持续扩展中,请访问 [Fish Audio](https://fish.audio/) 获取最新版本。
-
-### 快速语音克隆
-
-FishAudio-S1 支持使用短参考样本(通常 10-30 秒)进行准确的语音克隆。模型可以捕捉音色、说话风格和情感倾向,无需额外微调即可生成逼真且一致的克隆语音。
-
-## **功能**
+Fish Audio S2 支持高质量的多语言文本转语音,无需音素或特定语言的预处理。包括:
 
-1. **零样本和少样本 TTS:** 输入 10 到 30 秒的语音样本以生成高质量的 TTS 输出。**详细指南请参见 [语音克隆最佳实践](https://docs.fish.audio/resources/best-practices/voice-cloning)。**
+**英语、中文、日语、韩语、阿拉伯语、德语、法语...**
 
-2. **多语言和跨语言支持:** 只需将多语言文本复制并粘贴到输入框中——无需担心语言问题。目前支持英语、日语、韩语、中文、法语、德语、阿拉伯语和西班牙语。
+**以及更多!**
 
-3. **无音素依赖:** 模型具有强大的泛化能力,不依赖音素进行 TTS。它可以处理任何语言脚本的文本
+列表正在不断扩大,请查看 [Fish Audio](https://fish.audio/) 获取最新发布。
 
-4. **高准确性:** 在 Seed-TTS Eval 上实现约 0.4% 的低 CER(字符错误率)和约 0.8% 的 WER(词错误率)。
+### 原生多说话人生成
 
-5. **快速:** 通过 torch compile 加速,在 Nvidia RTX 4090 GPU 上的实时因子约为 1:7。
+<img src="./assets/chattemplate.png" width=200%>
 
-6. **WebUI 推理:** 提供简单易用的、基于 Gradio 的 Web UI,兼容 Chrome、Firefox、Edge 等浏览器
+Fish Audio S2 允许用户上传包含多个说话人的参考音频,模型将通过 `<|speaker:i|>` 令牌处理每个说话人的特征。之后您可以通过说话人 ID 令牌控制模型的表现,从而实现一次生成中包含多个说话人。再也不需要像以前那样针对每个说话人都单独上传参考音频与生成语音了。
 
-7. **易于部署:** 轻松设置推理服务器,原生支持 Linux 和 Windows(即将支持 macOS),最大限度地减少性能损失。
+### 多轮对话生成
 
-## **媒体和演示**
+得益于模型上下文的扩展,我们的模型现在可以借助上文的信息提高后续生成内容的表现力,从而提升内容的自然度。
 
-<div align="center">
-
-### **社交媒体**
-<a href="https://x.com/hehe6z/status/1980303682932744439" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-最新演示-black?style=for-the-badge&logo=x&logoColor=white" alt="X 上的最新演示" />
-</a>
-
-### **交互式演示**
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish.Audio-试用_FishAudio_S1-blue?style=for-the-badge" alt="试用 FishAudio S1" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-使用_S1_Mini-yellow?style=for-the-badge" alt="使用 S1 Mini" />
-</a>
-
-### **视频展示**
-
-<a href="https://www.youtube.com/watch?v=WR1FY32Lhps" target="_blank">
-    <img src="../docs/assets/Thumbnail.jpg" alt="FishAudio S1 Video" style="width: 50%;" />
-</a>
+### 快速语音克隆
 
-</div>
+Fish Audio S2 支持使用短参考样本(通常为 10-30 秒)进行准确的语音克隆。模型可以捕捉音色、说话风格和情感倾向,无需额外微调即可生成逼真且一致的克隆语音。
 
 ---
 
@@ -209,7 +119,8 @@ FishAudio-S1 支持使用短参考样本(通常 10-30 秒)进行准确的语
 - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
 - [Qwen3](https://github.com/QwenLM/Qwen3)
 
-## 技术报告 (V1.4)
+## 技术报告
+
 ```bibtex
 @misc{fish-speech-v1.4,
       title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},

+ 90 - 131
docs/ar/index.md

@@ -1,174 +1,133 @@
-# OpenAudio (سابقاً Fish-Speech)
-
-<div align="center">
-
 <div align="center">
+<h1>Fish Speech</h1>
 
-<img src="../assets/openaudio.jpg" alt="OpenAudio" style="display: block; margin: 0 auto; width: 35%;"/>
-
-</div>
-
-<strong>سلسلة نماذج تحويل النص إلى كلام المتقدمة</strong>
+[English](../en/) | [简体中文](../zh/) | [Portuguese](../pt/) | [日本語](../ja/) | [한국어](../ko/) | **العربية** <br>
 
-<div>
-<a target="_blank" href="https://discord.gg/Es5qTB9BcN">
-<img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
-</a>
-<a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
-<img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
-</a>
-<a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
-<img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+<a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
+<a href="https://trendshift.io/repositories/7014" target="_blank">
+    <img src="https://trendshift.io/api/badge/repositories/7014" alt="fishaudio%2Ffish-speech | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/>
 </a>
 </div>
 
-<strong>جربه الآن:</strong> <a href="https://fish.audio">Fish Audio Playground</a> | <strong>تعلم المزيد:</strong> <a href="https://openaudio.com">موقع OpenAudio</a>
-
-</div>·
-
----
-
-!!! note "إشعار الترخيص"
-    يتم إصدار قاعدة الكود هذه تحت **رخصة Apache** ويتم إصدار جميع أوزان النماذج تحت **رخصة CC-BY-NC-SA-4.0**. يرجى الرجوع إلى [رخصة الكود](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) و [رخصة النموذج](https://spdx.org/licenses/CC-BY-NC-SA-4.0) لمزيد من التفاصيل.
-
-!!! warning "إخلاء المسؤولية القانونية"
-    نحن لا نتحمل أي مسؤولية عن أي استخدام غير قانوني لقاعدة الكود. يرجى الرجوع إلى القوانين المحلية حول DMCA والقوانين الأخرى ذات الصلة.
-
-## **المقدمة**
-
-نحن متحمسون للإعلان عن إعادة تسمية علامتنا التجارية إلى **OpenAudio** - تقديم سلسلة جديدة من نماذج تحويل النص إلى كلام المتقدمة التي تبني على أساس Fish-Speech مع تحسينات كبيرة وقدرات جديدة.
-
-**Openaudio-S1-mini**: [المدونة](https://openaudio.com/blogs/s1); [الفيديو](https://www.youtube.com/watch?v=SYuPvd7m06A); [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini);
-
-**Fish-Speech v1.5**: [الفيديو](https://www.bilibili.com/video/BV1EKiDYBE4o/); [Hugging Face](https://huggingface.co/fishaudio/fish-speech-1.5);
-
-## **النقاط البارزة**
-
-### **جودة TTS ممتازة**
-
-نستخدم مقاييس تقييم Seed TTS لتقييم أداء النموذج، وتظهر النتائج أن OpenAudio S1 يحقق **0.008 WER** و **0.004 CER** على النص الإنجليزي، وهو أفضل بكثير من النماذج السابقة. (الإنجليزية، التقييم التلقائي، بناءً على OpenAI gpt-4o-transcribe، مسافة المتحدث باستخدام Revai/pyannote-wespeaker-voxceleb-resnet34-LM)
-
-| النموذج | معدل خطأ الكلمات (WER) | معدل خطأ الأحرف (CER) | مسافة المتحدث |
-|:-----:|:--------------------:|:-------------------------:|:----------------:|
-| **S1** | **0.008** | **0.004** | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
-
-### **أفضل نموذج في TTS-Arena2**
-
-حقق OpenAudio S1 **المرتبة الأولى** في [TTS-Arena2](https://arena.speechcolab.org/)، المعيار لتقييم تحويل النص إلى كلام:
+<br>
 
 <div align="center">
-    <img src="../assets/Elo.jpg" alt="TTS-Arena2 Ranking" style="width: 75%;" />
+    <img src="https://count.getloli.com/get/@fish-speech?theme=asoul" /><br>
 </div>
 
-### **التحكم في الكلام**
-يدعم OpenAudio S1 **مجموعة متنوعة من العلامات العاطفية والنبرة والخاصة** لتعزيز تركيب الكلام:
-
-- **المشاعر الأساسية**:
-```
-(غاضب) (حزين) (متحمس) (مندهش) (راضي) (مسرور) 
-(خائف) (قلق) (منزعج) (عصبي) (محبط) (مكتئب)
-(متعاطف) (محرج) (مشمئز) (متأثر) (فخور) (مسترخي)
-(ممتن) (واثق) (مهتم) (فضولي) (مرتبك) (مبتهج)
-```
+<br>
 
-- **المشاعر المتقدمة**:
-```
-(محتقر) (غير سعيد) (قلق) (هستيري) (غير مبال) 
-(نافد الصبر) (مذنب) (ازدرائي) (مذعور) (غاضب) (مترد)
-(متحمس) (غير موافق) (سلبي) (منكر) (مندهش) (جدي)
-(ساخر) (مصالح) (مواسي) (صادق) (ساخر)
-(متردد) (مستسلم) (مؤلم) (محرج) (مسلي)
-```
+<div align="center">
+    <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
+        <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
+    </a>
+    <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
+        <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+    </a>
+    <a target="_blank" href="https://pd.qq.com/s/bwxia254o">
+      <img alt="QQ Channel" src="https://img.shields.io/badge/QQ-blue?logo=tencentqq">
+    </a>
+</div>
 
-(الدعم للإنجليزية والصينية واليابانية الآن، والمزيد من اللغات قادم قريبًا!)
+<div align="center">
+    <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
+      <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
+    </a>
+    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
+        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
+    </a>
+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
+        <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
+    </a>
+</div>
 
-- **علامات النبرة**:
-```
-(بنبرة مستعجلة) (صراخ) (صراخ) (همس) (نبرة ناعمة)
-```
+!!! info "تنبيه الترخيص"
+    يتم إصدار قاعدة الأكواد هذه وأوزان النماذج المرتبطة بها بموجب رخصة **FISH AUDIO RESEARCH LICENSE**. يرجى الرجوع إلى [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) لمزيد من التفاصيل.
 
-- **تأثيرات صوتية خاصة**:
-```
-(ضحك) (قهقهة) (نشيج) (بكاء بصوت عالٍ) (تنهد) (لهاث)
-(أنين) (ضحك الجمهور) (ضحك الخلفية) (ضحك الجمهور)
-```
+!!! warning "إخلاء المسؤولية القانونية"
+    نحن لا نتحمل أي مسؤولية عن أي استخدام غير قانوني لقاعدة الأكواد. يرجى مراجعة القوانين المحلية المتعلقة بـ DMCA والقوانين الأخرى ذات الصلة.
 
-يمكنك أيضًا استخدام ها،ها،ها للتحكم، هناك العديد من الحالات الأخرى في انتظار استكشافك بنفسك.
+## ابدأ من هنا
 
-### **نوعان من النماذج**
+هذه هي الوثائق الرسمية لـ Fish Speech. يرجى اتباع التعليمات للبدء بسهولة.
 
-نقدم متغيرين من النماذج لتناسب الاحتياجات المختلفة:
+- [التثبيت](install.md)
+- [الاستنتاج](inference.md)
 
-- **OpenAudio S1 (4 مليار معامل)**: نموذجنا الرئيسي كامل الميزات المتاح على [fish.audio](https://fish.audio)، يقدم أعلى جودة لتركيب الكلام مع جميع الميزات المتقدمة.
+## Fish Audio S2
+**أفضل نظام لتحويل النص إلى كلام في كل من المصادر المفتوحة والمغلقة**
 
-- **OpenAudio S1-mini (0.5 مليار معامل)**: إصدار مقطر بالقدرات الأساسية، متاح على [Hugging Face Space](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini)، محسن للاستنتاج الأسرع مع الحفاظ على الجودة الممتازة.
+Fish Audio S2 هو أحدث نموذج تم تطويره بواسطة [Fish Audio](https://fish.audio/)، وهو مصمم لتوليد كلام يبدو طبيعيًا وأصليًا وغنيًا بالعاطفة — غير ميكانيكي أو مسطح أو مقتصر على القراءة بأسلوب الاستوديو.
 
-كل من S1 و S1-mini يدمجان التعلم المعزز عبر الإنترنت من ردود الفعل البشرية (RLHF).
+يركز Fish Audio S2 على المحادثات اليومية، ويدعم توليد المتحدثين المتعددين الأصليين وتوليد الحوارات متعددة الأدوار. كما يدعم التحكم التعليمي.
 
-## **الميزات**
+تتضمن سلسلة S2 نماذج متعددة. النموذج المفتوح المصدر هو S2-Pro، وهو أقوى نموذج في السلسلة.
 
-1. **TTS بدون عينات وبعينات قليلة:** أدخل عينة صوتية من 10 إلى 30 ثانية لإنتاج مخرجات TTS عالية الجودة. **للإرشادات التفصيلية، راجع [أفضل ممارسات استنساخ الصوت](https://docs.fish.audio/text-to-speech/voice-clone-best-practices).**
+يرجى زيارة [موقع Fish Audio](https://fish.audio/) لتجربة فورية.
 
-2. **الدعم متعدد اللغات وعبر اللغات:** ببساطة انسخ والصق النص متعدد اللغات في مربع الإدخال—لا حاجة للقلق بشأن اللغة. يدعم حاليًا الإنجليزية واليابانية والكورية والصينية والفرنسية والألمانية والعربية والإسبانية.
+### متغيرات النموذج
 
-3. **لا يعتمد على الصوتيات:** النموذج لديه قدرات تعميم قوية ولا يعتمد على الصوتيات لـ TTS. يمكنه التعامل مع النص في أي نص لغوي.
+| النموذج | الحجم | التوفر | الوصف |
+|------|------|-------------|-------------|
+| S2-Pro | 4B معاملات | [huggingface](https://huggingface.co/fishaudio/s2-pro) | نموذج رائد بكامل الميزات مع أعلى جودة واستقرار |
+| S2-Flash | - - - - | [fish.audio](https://fish.audio/) | نموذجنا المغلق المصدر بسرعات أعلى وزمن وصول أقل |
 
-4. **دقيق للغاية:** يحقق معدل خطأ أحرف منخفض (CER) حوالي 0.4% ومعدل خطأ كلمات (WER) حوالي 0.8% لـ Seed-TTS Eval.
+لمزيد من التفاصيل حول النماذج ، يرجى مراجعة التقرير الفني.
 
-5. **سريع:** مع تسريع torch compile، عامل الوقت الحقيقي حوالي 1:7 على GPU Nvidia RTX 4090.
+## أبرز المميزات
 
-6. **استنتاج WebUI:** يتميز بواجهة ويب سهلة الاستخدام قائمة على Gradio متوافقة مع Chrome وFirefox وEdge والمتصفحات الأخرى.
+<img src="../assets/totalability.png" width=200%>
 
-7. **استنتاج GUI:** يوفر واجهة رسومية PyQt6 تعمل بسلاسة مع خادم API. يدعم Linux وWindows وmacOS. [راجع GUI](https://github.com/AnyaCoder/fish-speech-gui).
+### التحكم باللغة الطبيعية
 
-8. **صديق للنشر:** قم بإعداد خادم استنتاج بسهولة مع دعم أصلي لـ Linux وWindows (MacOS قادم قريبًا)، مما يقلل من فقدان السرعة.
+يسمح Fish Audio S2 للمستخدمين باستخدام اللغة الطبيعية للتحكم في أداء كل جملة ، والمعلومات غير اللفظية ، والعواطف ، والمزيد من خصائص الصوت ، بدلاً من مجرد استخدام علامات قصيرة للتحكم بشكل غامض في أداء النموذج. يؤدي ذلك إلى تحسين الجودة الإجمالية للمحتوى المولّد بشكل كبير.
 
-## **الوسائط والعروض التوضيحية**
+### دعم لغات متعددة
 
-<!-- <div align="center"> -->
+يدعم Fish Audio S2 تحويل النص إلى كلام متعدد اللغات بجودة عالية دون الحاجة إلى وحدات صوتية أو معالجة مسبقة خاصة باللغة. يشمل ذلك:
 
-<h3><strong>وسائل التواصل الاجتماعي</strong></h3>
-<a href="https://x.com/FishAudio/status/1929915992299450398" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-أحدث_عرض_توضيحي-black?style=for-the-badge&logo=x&logoColor=white" alt="أحدث عرض توضيحي على X" />
-</a>
+**الإنجليزية ، الصينية ، اليابانية ، الكورية ، العربية ، الألمانية ، الفرنسية ...**
 
-<h3><strong>العروض التوضيحية التفاعلية</strong></h3>
+**والمزيد في المستقبل!**
 
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish_Audio-جرب_OpenAudio_S1-blue?style=for-the-badge" alt="جرب OpenAudio S1" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-جرب_S1_Mini-yellow?style=for-the-badge" alt="جرب S1 Mini" />
-</a>
+القائمة تتوسع باستمرار ، يرجى التحقق من [Fish Audio](https://fish.audio/) للحصول على أحدث الإصدارات.
 
-<h3><strong>عروض الفيديو</strong></h3>
-<div align="center">
-<iframe width="560" height="315" src="https://www.youtube.com/embed/SYuPvd7m06A" title="OpenAudio S1 Video" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
-</div>
+### توليد المتحدثين المتعددين الأصليين
 
-## **الوثائق**
+<img src="../assets/chattemplate.png" width=200%>
 
-### البداية السريعة
-- [بناء البيئة](install.md) - إعداد بيئة التطوير الخاصة بك
-- [دليل الاستنتاج](inference.md) - تشغيل النموذج وإنتاج الكلام
+يسمح Fish Audio S2 للمستخدمين بتحميل عينات صوتية مرجعية تحتوي على متحدثين متعددين ، وسيقوم النموذج بمعالجة خصائص كل متحدث من خلال رمز `<|speaker:i|>`. بعد ذلك ، يمكنك التحكم في أداء النموذج عبر رموز معرف المتحدث ، مما يحقق تعدد المتحدثين في عملية توليد واحدة. لا داعي بعد الآن لتحميل أصوات مرجعية وتوليد كلام لكل متحدث على حدة.
 
-## **المجتمع والدعم**
+### توليد الحوارات متعددة الأدوار
 
-- **Discord:** انضم إلى [مجتمع Discord](https://discord.gg/Es5qTB9BcN) الخاص بنا
-- **الموقع:** قم بزيارة [OpenAudio.com](https://openaudio.com) للحصول على آخر التحديثات
-- **جرب عبر الإنترنت:** [Fish Audio Playground](https://fish.audio)
+بفضل توسيع سياق النموذج ، يمكن لنموذجنا الآن استخدام معلومات السياق السابق لتحسين التعبير عن المحتوى المولّد لاحقًا ، وبالتالي زيادة طبيعية المحتوى.
 
-- تحويل النص إلى كلام (TTS)
-- توليف صوت الغناء (SVS)
-- تحويل الصوت من أي إلى أي (Any-to-any voice conversion)
-- استنساخ الصوت بدون أو بالقليل من العينات (Zero or few-shot voice cloning)
-- استنساخ الصوت عبر اللغات (Cross-lingual voice cloning)
-- إنشاء المحتوى (Content creation)
+### استنساخ الصوت السريع
 
-!!! note "ملاحظة الترخيص"
-    يتم إصدار هذا الكود المصدري بموجب **رخصة أباتشي** ويتم إصدار جميع أوزان النماذج بموجب **رخصة CC-BY-NC-SA-4.0**. يرجى الرجوع إلى [رخصة الكود](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) و [رخصة النموذج](https://spdx.org/licenses/CC-BY-NC-SA-4.0) لمزيد من التفاصيل.
+يدعم Fish Audio S2 استنساخ الصوت الدقيق باستخدام عينات مرجعية قصيرة (عادة 10-30 ثانية). يمكن للنموذج التقاط نبرة الصوت وأسلوب التحدث والميل العاطفي ، وتوليد أصوات مستنسخة واقعية ومتسقة دون ضبط دقيق إضافي.
 
-## النماذج
+---
 
-OpenAudio S1 هو النموذج الأول في سلسلة OpenAudio. وهو مُرمِّز صوتي VQ-GAN مزدوج المُ解码 يمكنه إعادة بناء الصوت من أكواد VQ.
+## شكر وتقدير
+
+- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
+- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
+- [GPT VITS](https://github.com/innnky/gpt-vits)
+- [MQTTS](https://github.com/b04901014/MQTTS)
+- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
+- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+
+## التقرير الفني
+
+```bibtex
+@misc{fish-speech-v1.4,
+      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},
+      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},
+      year={2024},
+      eprint={2411.01156},
+      archivePrefix={arXiv},
+      primaryClass={cs.SD},
+      url={https://arxiv.org/abs/2411.01156},
+}
+```

+ 17 - 132
docs/ar/inference.md

@@ -1,173 +1,58 @@
 # الاستنتاج
 
-نظرًا لأن نموذج vocoder قد تغير، تحتاج إلى VRAM أكثر من ذي قبل، يُنصح بـ 12GB للاستنتاج السلس.
-
-ندعم سطر الأوامر و HTTP API و WebUI للاستنتاج، يمكنك اختيار أي طريقة تفضلها.
+يتطلب نموذج Fish Audio S2 ذاكرة فيديو (VRAM) كبيرة. نوصي باستخدام وحدة معالجة رسومات (GPU) بسعة 24 جيجابايت على الأقل للاستنتاج.
 
 ## تحميل الأوزان
 
-أولاً تحتاج إلى تحميل أوزان النموذج:
+أولاً ، تحتاج إلى تحميل أوزان النموذج:
 
 ```bash
-hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
+hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ```
 
-## استنتاج سطر الأوامر
+## الاستنتاج عبر خط الأوامر
 
 !!! note
-    إذا كنت تخطط لترك النموذج يختار نبرة صوت عشوائياً، يمكنك تخطي هذه الخطوة.
+    إذا كنت تخطط لترك النموذج يختار نغمة الصوت عشوائيًا ، فيمكنك تخطي هذه الخطوة.
 
 ### 1. الحصول على رموز VQ من الصوت المرجعي
 
 ```bash
 python fish_speech/models/dac/inference.py \
-    -i "ref_audio_name.wav" \
-    --checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth"
+    -i "test.wav" \
+    --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
 
 يجب أن تحصل على `fake.npy` و `fake.wav`.
 
-### 2. إنتاج الرموز الدلالية من النص:
+### 2. توليد الرموز الدلالية (Semantic tokens) من النص:
 
 ```bash
 python fish_speech/models/text2semantic/inference.py \
     --text "النص الذي تريد تحويله" \
     --prompt-text "النص المرجعي الخاص بك" \
     --prompt-tokens "fake.npy" \
-    --compile
+    # --compile
 ```
 
-هذا الأمر سينشئ ملف `codes_N` في دليل العمل، حيث N هو عدد صحيح يبدأ من 0.
+سيقوم هذا الأمر بإنشاء ملف `codes_N` في دليل العمل ، حيث N هو عدد صحيح يبدأ من 0.
 
 !!! note
-    قد ترغب في استخدام `--compile` لدمج نوى CUDA للاستنتاج الأسرع (~15 رمز/ثانية -> ~150 رمز/ثانية، على GPU RTX 4090).
-    وفقاً لذلك، إذا كنت لا تخطط لاستخدام التسريع، يمكنك التعليق على معامل `--compile`.
+    قد ترغب في استخدام `--compile` لدمج نوى CUDA لاستنتاج أسرع. ومع ذلك ، نوصي باستخدام تحسين تسريع الاستنتاج sglang الخاص بنا.
+    بالمقابل ، إذا كنت لا تخطط لاستخدام التسريع ، يمكنك التعليق على معلمة `--compile`.
 
 !!! info
-    بالنسبة لوحدات GPU التي لا تدعم bf16، قد تحتاج إلى استخدام معامل `--half`.
-
-### 3. إنتاج الأصوات من الرموز الدلالية:
+    بالنسبة لوحدات معالجة الرسومات التي لا تدعم bf16 ، قد تحتاج إلى استخدام معلمة `--half`.
 
-!!! warning "تحذير مستقبلي"
-    لقد احتفظنا بإمكانية الوصول إلى الواجهة من المسار الأصلي (tools/vqgan/inference.py)، لكن هذه الواجهة قد تُزال في الإصدارات اللاحقة، لذا يرجى تغيير الكود الخاص بك في أقرب وقت ممكن.
+### 3. توليد الصوت من الرموز الدلالية:
 
 ```bash
 python fish_speech/models/dac/inference.py \
-    -i "codes_0.npy"
+    -i "codes_0.npy"
 ```
 
-## استنتاج HTTP API
-
-نوفر HTTP API للاستنتاج. يمكنك استخدام الأمر التالي لبدء الخادم:
-
-```bash
-python -m tools.api_server \
-    --listen 0.0.0.0:8080 \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-> إذا كنت تريد تسريع الاستنتاج، يمكنك إضافة معامل `--compile`.
-
-بعد ذلك، يمكنك عرض واختبار API على http://127.0.0.1:8080/.
-
-## استنتاج GUI 
-[تحميل العميل](https://github.com/AnyaCoder/fish-speech-gui/releases)
+بعد ذلك ستحصل على ملف `fake.wav`.
 
 ## استنتاج WebUI
 
-يمكنك بدء WebUI باستخدام الأمر التالي:
-
-```bash
-python -m tools.run_webui \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-أو ببساطة
-
-```bash
-python -m tools.run_webui
-```
-> إذا كنت تريد تسريع الاستنتاج، يمكنك إضافة معامل `--compile`.
-
-!!! note
-    يمكنك حفظ ملف التسمية وملف الصوت المرجعي مسبقاً في مجلد `references` في الدليل الرئيسي (الذي تحتاج إلى إنشاؤه بنفسك)، بحيث يمكنك استدعاؤها مباشرة في WebUI.
-
-!!! note
-    يمكنك استخدام متغيرات بيئة Gradio، مثل `GRADIO_SHARE`، `GRADIO_SERVER_PORT`، `GRADIO_SERVER_NAME` لتكوين WebUI.
-
-استمتع!
-
-## الاستدلال باستخدام Docker
-
-يوفر OpenAudio حاويات Docker للاستدلال لكل من واجهة المستخدم الرسومية (WebUI) وخادم API. يمكنك استخدام أمر `docker run` مباشرة لبدء تشغيل الحاوية.
-
-تحتاج إلى تحضير ما يلي:
-- تثبيت Docker و NVIDIA Docker runtime (لدعم GPU)
-- تنزيل أوزان النموذج (راجع قسم [تحميل الأوزان](#تحميل-الأوزان))
-- ملفات الصوت المرجعية (اختياري، لاستنساخ الصوت)
-
-```bash
-# إنشاء مجلدات لأوزان النموذج والصوت المرجعي
-mkdir -p checkpoints references
-
-# تنزيل أوزان النموذج (إذا لم يتم ذلك بعد)
-# hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
-
-# بدء واجهة المستخدم الرسومية (WebUI) مع دعم CUDA (موصى به للحصول على أفضل أداء)
-docker run -d \
-    --name fish-speech-webui \
-    --gpus all \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-webui-cuda
-
-# الاستدلال باستخدام CPU فقط (أبطأ، ولكنه يعمل بدون GPU)
-docker run -d \
-    --name fish-speech-webui-cpu \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-webui-cpu
-```
-
-```bash
-# بدء خادم API مع دعم CUDA
-docker run -d \
-    --name fish-speech-server \
-    --gpus all \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-server-cuda
-
-# الاستدلال باستخدام CPU فقط
-docker run -d \
-    --name fish-speech-server-cpu \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-server-cpu
-```
-
-يمكنك تخصيص حاويات Docker باستخدام متغيرات البيئة هذه:
-
-- `COMPILE=1` - تمكين `torch.compile` لتسريع الاستدلال (حوالي 10 أضعاف، CUDA فقط)
-- `GRADIO_SERVER_NAME=0.0.0.0` - مضيف خادم واجهة المستخدم الرسومية (WebUI) (الافتراضي: 0.0.0.0)
-- `GRADIO_SERVER_PORT=7860` - منفذ خادم واجهة المستخدم الرسومية (WebUI) (الافتراضي: 7860)
-- `API_SERVER_NAME=0.0.0.0` - مضيف خادم API (الافتراضي: 0.0.0.0)
-- `API_SERVER_PORT=8080` - منفذ خادم API (الافتراضي: 8080)
-- `LLAMA_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini` - مسار أوزان النموذج
-- `DECODER_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini/codec.pth` - مسار أوزان وحدة فك التشفير
-- `DECODER_CONFIG_NAME=modded_dac_vq` - اسم تكوين وحدة فك التشفير
-```
-
-استخدام واجهة المستخدم الرسومية (WebUI) وخادم API هو نفسه الموضح في الدليل أعلاه.
-
-استمتع!
+قيد التطوير.

+ 8 - 4
docs/ar/install.md

@@ -1,11 +1,11 @@
 ## المتطلبات
 
-- ذاكرة وحدة معالجة الرسومات (GPU): 12 جيجابايت (للاستدلال)
+- ذاكرة وحدة معالجة الرسومات (GPU): 24 جيجابايت (للاستدلال)
 - النظام: Linux, WSL
 
 ## إعداد النظام
 
-يدعم OpenAudio طرق تثبيت متعددة. اختر الطريقة التي تناسب بيئة التطوير الخاصة بك.
+يدعم Fish Audio S2 طرق تثبيت متعددة. اختر الطريقة التي تناسب بيئة التطوير الخاصة بك.
 
 **المتطلبات الأساسية**: قم بتثبيت تبعيات النظام لمعالجة الصوت:
 ``` bash
@@ -26,6 +26,10 @@ pip install -e .[cpu]
 
 # التثبيت الافتراضي (يستخدم فهرس PyTorch الافتراضي)
 pip install -e .
+
+# إذا واجهت خطأ أثناء التثبيت بسبب pyaudio، ففكر في استخدام الأمر التالي:
+# conda install pyaudio
+# ثم قم بتشغيل pip install -e . مرة أخرى
 ```
 
 ### UV
@@ -63,7 +67,7 @@ pip install -e .
 
 ## إعداد Docker
 
-يوفر نموذج سلسلة OpenAudio S1 خيارات نشر متعددة مع Docker لتلبية الاحتياجات المختلفة. يمكنك استخدام الصور المعدة مسبقًا من Docker Hub، أو البناء محليًا باستخدام Docker Compose، أو بناء صور مخصصة يدويًا.
+يوفر نموذج سلسلة Fish Audio S2 خيارات نشر متعددة مع Docker لتلبية الاحتياجات المختلفة. يمكنك استخدام الصور المعدة مسبقًا من Docker Hub، أو البناء محليًا باستخدام Docker Compose، أو بناء صور مخصصة يدويًا.
 
 لقد قدمنا صور Docker لكل من واجهة المستخدم الرسومية (WebUI) وخادم API، لكل من وحدات معالجة الرسومات (GPU) (CUDA 12.6 افتراضيًا) ووحدات المعالجة المركزية (CPU). يمكنك استخدام الصور المعدة مسبقًا من Docker Hub، أو البناء محليًا باستخدام Docker Compose، أو بناء صور مخصصة يدويًا. إذا كنت ترغب في البناء محليًا، فاتبع الإرشادات أدناه. إذا كنت ترغب فقط في استخدام الصور المعدة مسبقًا، فاتبع مباشرةً [دليل الاستدلال](inference.md).
 
@@ -71,7 +75,7 @@ pip install -e .
 
 - تثبيت Docker و Docker Compose
 - تثبيت NVIDIA Docker runtime (لدعم GPU)
-- ذاكرة GPU لا تقل عن 12 جيجابايت للاستدلال باستخدام CUDA
+- ذاكرة GPU لا تقل عن 24 جيجابايت للاستدلال باستخدام CUDA
 
 ### استخدام Docker Compose
 

+ 0 - 80
docs/ar/samples.md

@@ -1,80 +0,0 @@
-# أمثلة
-
-## التحكم في المشاعر (*ميزة جديدة)
-
-### أمثلة المشاعر الأساسية
-
-| نوع المشاعر | اللغة | الصوت المدخل | الصوت المُولّد | الأمر |
-|-------------|--------|-------------|---------------|-------|
-| **السعادة** | الصينية | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy_refer.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy.wav" /> | (happy)嘿嘿...博士,悄悄告诉你一件事——我重新开始练小提琴了。 |
-| **الاشمئزاز** | اليابانية | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/disgusted.wav" /> | (digusted)あなたは、本当に気持ち悪い、嫌い…(disgusted)それでも、慰めを求めますの? |
-| **الغضب** | الإنجليزية | - | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/angry.wav" /> | (angry)I want you to go out immediately! I don't want to see you again, or I will try to kill you! |
-| **الغضب** | الصينية | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/作战中4.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/angry.wav" /> | (angry)我让你快滚,你是耳聋吗?!...(angry)信不信我揍你! |
-| **الدهشة** | الصينية | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/surprised.wav" /> | (surprised)今天你过生日?既然这样的话,我就勉为其难祝你生日快乐吧。(surprised)要不要看看你的桌子底下? |
-| **الحزن** | اليابانية | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref2.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/sad.wav" /> | (sad)他の小隊長と比べて、私はまだ多くのことを学ばなくてはなりません......(sad)皆さんのペースに追いつけるよう精一杯努力いたしますわ。 |
-
-## التأثيرات اللغوية المصاحبة (*ميزة جديدة)
-
-### تأثيرات الضحك
-
-| المثال | اللغة | الأمر | الصوت |
-|--------|--------|-------|--------|
-| **مثال 1** | الصينية | 大家好啊,(笑声)哈哈,我是从来不带节奏的血狼破军,今天来点大家想看的东西。 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh1.wav" /> |
-| **مثال 2** | الصينية | (笑声)哈哈(笑声),虽然说"三角洲行动"的策划说他们没有暗改(笑声)哈哈(笑声),但是我相信,大家心里都有数。对不起,实在是太搞笑了,忍不住笑了出来。(笑声)哈哈(笑声) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh2.wav" /> |
-| **مثال 3** | الإنجليزية | (laughing)haha(laughing), though many people say that homeless cats need our help, (laughing)haha(laughing), but seldom do they really do something that is useful to the cats, (laughing)haha(laughing) sorry, but this is very interesting. | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/laugh.wav" /> |
-
-### تأثيرات الصراخ القتالي
-
-| المثال | اللغة | الأمر | الصوت |
-|--------|--------|-------|--------|
-| **مثال الصراخ القتالي** | الإنجليزية | (shouting)oh my god !!!(shouting)(shouting)(shouting), baby(shouting)you (shouting)are (shouting)a piece of sweet, soft(shouting), delicious cake!!! | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/shout.wav" /> |
-
-## اختبار استقرار النص الطويل
-
-### اختبار النص الطويل الصيني
-
-**نص الاختبار الصيني:**
-```
-你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
-我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
-你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
-搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
-一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
-```
-
-| محتوى الاختبار | المتحدث/الشخصية | الصوت المدخل | الصوت المُولّد |
-|----------------|------------------|-------------|---------------|
-| **اختبار النص الطويل** | شي (أركنايتس) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio.wav" /> |
-| **متحدث عشوائي** | عشوائي (تحذير مستوى الصوت) | لا يوجد | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio2.wav" /> |
-
-### اختبار النص الطويل الإنجليزي
-
-**نص الاختبار الإنجليزي:**
-```
-In the realm of advanced technology, the evolution of artificial intelligence stands as a 
-monumental achievement. This dynamic field, constantly pushing the boundaries of what 
-machines can do, has seen rapid growth and innovation. From deciphering complex data 
-patterns to driving cars autonomously, AI's applications are vast and diverse.
-```
-
-| محتوى الاختبار | المتحدث | الصوت المدخل | الصوت المُولّد |
-|----------------|----------|-------------|---------------|
-| **متحدث عشوائي 1** | عشوائي | لا يوجد | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio.wav" /> |
-| **متحدث عشوائي 2** | عشوائي | لا يوجد | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio2.wav" /> |
-
-### اختبار النص الطويل الياباني
-
-**نص الاختبار الياباني:**
-```
-宇宙に始まりはあるが、終わりはない。無限。
-星にもまた始まりはあるが、自らの力をもって滅び逝く。有限。
-英知を持つ者こそ、最も愚かであること。歴史からも読み取れる。
-海に生ける魚は、陸の世界を知らない。彼らが英知を持てば、それもまた滅び逝く。
-人間が光の速さを超えるのは、魚たちが陸で生活を始めるよりも滑稽。
-これは抗える者たちに対する、神からの最後通告とも言えよう。
-```
-
-| محتوى الاختبار | المتحدث/الشخصية | الصوت المدخل | الصوت المُولّد |
-|----------------|------------------|-------------|---------------|
-| **اختبار النص الطويل** | ساكيكو تويوجاوا | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio.wav" /> |
-| **متحدث عشوائي** | عشوائي | لا يوجد | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio2.wav" /> |

BIN=BIN
docs/assets/Elo.jpg


BIN=BIN
docs/assets/Thumbnail.jpg


BIN=BIN
docs/assets/chattemplate.png


BIN=BIN
docs/assets/openaudio.jpg


BIN=BIN
docs/assets/openaudio.png


BIN=BIN
docs/assets/totalability.png


+ 89 - 117
docs/en/index.md

@@ -1,161 +1,133 @@
-# OpenAudio (formerly Fish-Speech)
-
-<div align="center">
-
 <div align="center">
+<h1>Fish Speech</h1>
 
-<img src="../assets/openaudio.jpg" alt="OpenAudio" style="display: block; margin: 0 auto; width: 35%;"/>
-
-</div>
-
-<strong>Advanced Text-to-Speech Model Series</strong>
+**English** | [简体中文](../zh/) | [Português](../pt/) | [日本語](../ja/) | [한국어](../ko/) | [العربية](../ar/) <br>
 
-<div>
-<a target="_blank" href="https://discord.gg/Es5qTB9BcN">
-<img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
-</a>
-<a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
-<img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
-</a>
-<a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
-<img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+<a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
+<a href="https://trendshift.io/repositories/7014" target="_blank">
+    <img src="https://trendshift.io/api/badge/repositories/7014" alt="fishaudio%2Ffish-speech | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/>
 </a>
 </div>
 
-<strong>Try it now:</strong> <a href="https://fish.audio">Fish Audio Playground</a> | <strong>Learn more:</strong> <a href="https://openaudio.com">OpenAudio Website</a>
+<br>
 
+<div align="center">
+    <img src="https://count.getloli.com/get/@fish-speech?theme=asoul" /><br>
 </div>
 
----
-
-!!! note "License Notice"
-    This codebase is released under **Apache License** and all model weights are released under **CC-BY-NC-SA-4.0 License**. Please refer to [CODE LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) and [MODEL LICENSE](https://spdx.org/licenses/CC-BY-NC-SA-4.0) for more details.
-
-!!! warning "Legal Disclaimer"
-    We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws.
-
-## **Introduction**
-
-We are excited to announce that we have rebranded to **OpenAudio** - introducing a brand new series of advanced Text-to-Speech models that builds upon the foundation of Fish-Speech with significant improvements and new capabilities.
-
-**Openaudio-S1-mini**: [Blog](https://openaudio.com/blogs/s1); [Video](https://www.youtube.com/watch?v=SYuPvd7m06A); [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini);
-
-**Fish-Speech v1.5**: [Video](https://www.bilibili.com/video/BV1EKiDYBE4o/); [Hugging Face](https://huggingface.co/fishaudio/fish-speech-1.5);
-
-## **Highlights**
-
-### **Excellent TTS quality**
-
-We use Seed TTS Eval Metrics to evaluate the model performance, and the results show that OpenAudio S1 achieves **0.008 WER** and **0.004 CER** on English text, which is significantly better than previous models. (English, auto eval, based on OpenAI gpt-4o-transcribe, speaker distance using Revai/pyannote-wespeaker-voxceleb-resnet34-LM)
-
-| Model | Word Error Rate (WER) | Character Error Rate (CER) | Speaker Distance |
-|:-----:|:--------------------:|:-------------------------:|:----------------:|
-| **S1** | **0.008** | **0.004** | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
+<br>
 
-### **Best Model in TTS-Arena2**
-
-OpenAudio S1 has achieved the **#1 ranking** on [TTS-Arena2](https://arena.speechcolab.org/), the benchmark for text-to-speech evaluation:
+<div align="center">
+    <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
+        <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
+    </a>
+    <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
+        <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+    </a>
+    <a target="_blank" href="https://pd.qq.com/s/bwxia254o">
+      <img alt="QQ Channel" src="https://img.shields.io/badge/QQ-blue?logo=tencentqq">
+    </a>
+</div>
 
 <div align="center">
-    <img src="assets/Elo.jpg" alt="TTS-Arena2 Ranking" style="width: 75%;" />
+    <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
+      <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
+    </a>
+    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
+        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
+    </a>
+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
+        <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
+    </a>
 </div>
 
-### **Speech Control**
-OpenAudio S1 **supports a variety of emotional, tone, and special markers** to enhance speech synthesis:
+!!! info "License Notice"
+    This codebase and its associated model weights are released under **FISH AUDIO RESEARCH LICENSE**. Please refer to [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) for more details.
 
-- **Basic emotions**:
-```
-(angry) (sad) (excited) (surprised) (satisfied) (delighted) 
-(scared) (worried) (upset) (nervous) (frustrated) (depressed)
-(empathetic) (embarrassed) (disgusted) (moved) (proud) (relaxed)
-(grateful) (confident) (interested) (curious) (confused) (joyful)
-```
+!!! warning "Legal Disclaimer"
+    We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws.
 
-- **Advanced emotions**:
-```
-(disdainful) (unhappy) (anxious) (hysterical) (indifferent) 
-(impatient) (guilty) (scornful) (panicked) (furious) (reluctant)
-(keen) (disapproving) (negative) (denying) (astonished) (serious)
-(sarcastic) (conciliative) (comforting) (sincere) (sneering)
-(hesitating) (yielding) (painful) (awkward) (amused)
-```
+## Get Started
 
-(Support for English, Chinese and Japanese now, and more languages is coming soon!)
+This is the official documentation for Fish Speech. Please follow the instructions to get started easily.
 
-- **Tone markers**:
-```
-(in a hurry tone) (shouting) (screaming) (whispering) (soft tone)
-```
+- [Installation](install.md)
+- [Inference](inference.md)
 
-- **Special audio effects**:
-```
-(laughing) (chuckling) (sobbing) (crying loudly) (sighing) (panting)
-(groaning) (crowd laughing) (background laughter) (audience laughing)
-```
+## Fish Audio S2
+**The best text-to-speech system in both open-source and closed-source**
 
-You can also use Ha,ha,ha to control, there's many other cases waiting to be explored by yourself.
+Fish Audio S2 is the latest model developed by [Fish Audio](https://fish.audio/), designed to generate speech that sounds natural, authentic, and emotionally rich—not mechanical, flat, or confined to studio-style reading.
 
-### **Two Type of Models**
+Fish Audio S2 focuses on everyday conversations, supporting native multi-speaker and multi-round generation. It also supports instruction control.
 
-We offer two model variants to suit different needs:
+The S2 series includes multiple models. The open-source model is S2-Pro, which is the most powerful model in the series.
 
-- **OpenAudio S1 (4B parameters)**: Our full-featured flagship model available on [fish.audio](https://fish.audio), delivering the highest quality speech synthesis with all advanced features.
+Please visit the [Fish Audio website](https://fish.audio/) for a real-time experience.
 
-- **OpenAudio S1-mini (0.5B parameters)**: A distilled version with core capabilities, available on [Hugging Face Space](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini), optimized for faster inference while maintaining excellent quality.
+### Model Variants
 
-Both S1 and S1-mini incorporate online Reinforcement Learning from Human Feedback (RLHF).
+| Model | Size | Availability | Description |
+|------|------|-------------|-------------|
+| S2-Pro | 4B Parameters | [huggingface](https://huggingface.co/fishaudio/s2-pro) | Full-featured flagship model with the highest quality and stability |
+| S2-Flash | - - - - | [fish.audio](https://fish.audio/) | Our closed-source model with faster speed and lower latency |
 
-## **Features**
+For more details on the models, please see the technical report.
 
-1. **Zero-shot & Few-shot TTS:** Input a 10 to 30-second vocal sample to generate high-quality TTS output. **For detailed guidelines, see [Voice Cloning Best Practices](https://docs.fish.audio/text-to-speech/voice-clone-best-practices).**
+## Highlights
 
-2. **Multilingual & Cross-lingual Support:** Simply copy and paste multilingual text into the input box—no need to worry about the language. Currently supports English, Japanese, Korean, Chinese, French, German, Arabic, and Spanish.
+<img src="../assets/totalability.png" width=200%>
 
-3. **No Phoneme Dependency:** The model has strong generalization capabilities and does not rely on phonemes for TTS. It can handle text in any language script.
+### Natural Language Control
 
-4. **Highly Accurate:** Achieves a low CER (Character Error Rate) of around 0.4% and WER (Word Error Rate) of around 0.8% for Seed-TTS Eval.
+Fish Audio S2 allows users to use natural language to control the performance, paralinguistic information, emotions, and more voice characteristics of each sentence, instead of just using short tags to vaguely control the model's performance. This greatly improves the overall quality of the generated content.
 
-5. **Fast:** Accelerated by torch compile, the real-time factor is approximately 1:7 on an Nvidia RTX 4090 GPU.
+### Multilingual Support
 
-6. **WebUI Inference:** Features an easy-to-use, Gradio-based web UI compatible with Chrome, Firefox, Edge, and other browsers.
+Fish Audio S2 supports high-quality multilingual text-to-speech without the need for phonemes or language-specific preprocessing. Including:
 
-7. **GUI Inference:** Offers a PyQt6 graphical interface that works seamlessly with the API server. Supports Linux, Windows, and macOS. [See GUI](https://github.com/AnyaCoder/fish-speech-gui).
+**English, Chinese, Japanese, Korean, Arabic, German, French...**
 
-8. **Deploy-Friendly:** Easily set up an inference server with native support for Linux, Windows (MacOS comming soon), minimizing speed loss.
+**And more!**
 
-## **Media & Demos**
+The list is constantly expanding, please check [Fish Audio](https://fish.audio/) for the latest releases.
 
-<!-- <div align="center"> -->
+### Native Multi-speaker Generation
 
-<h3><strong>Social Media</strong></h3>
-<a href="https://x.com/FishAudio/status/1929915992299450398" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-Latest_Demo-black?style=for-the-badge&logo=x&logoColor=white" alt="Latest Demo on X" />
-</a>
+<img src="../assets/chattemplate.png" width=200%>
 
-<h3><strong>Interactive Demos</strong></h3>
-
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish_Audio-Try_OpenAudio_S1-blue?style=for-the-badge" alt="Try OpenAudio S1" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-Try_S1_Mini-yellow?style=for-the-badge" alt="Try S1 Mini" />
-</a>
+Fish Audio S2 allows users to upload reference audio containing multiple speakers, and the model will process each speaker's characteristics through the `<|speaker:i|>` token. You can then control the model's performance via speaker ID tokens, achieving multiple speakers in a single generation. No more need to upload reference audio and generate speech for each speaker individually.
 
-<h3><strong>Video Showcases</strong></h3>
-<div align="center">
-<iframe width="560" height="315" src="https://www.youtube.com/embed/SYuPvd7m06A" title="OpenAudio S1 Video" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
-</div>
+### Multi-round Dialogue Generation
 
-## **Documentation**
+Thanks to the expansion of the model's context, our model can now use the information from the previous context to improve the expressiveness of the subsequent generated content, thereby enhancing the naturalness of the content.
 
-### Quick Start
-- [Build Environment](en/install.md) - Set up your development environment
-- [Inference Guide](en/inference.md) - Run the model and generate speech
+### Fast Voice Cloning
 
+Fish Audio S2 supports accurate voice cloning using short reference samples (typically 10-30 seconds). The model can capture timbre, speaking style, and emotional tendency, generating realistic and consistent cloned voices without additional fine-tuning.
 
-## **Community & Support**
+---
 
-- **Discord:** Join our [Discord community](https://discord.gg/Es5qTB9BcN)
-- **Website:** Visit [OpenAudio.com](https://openaudio.com) for latest updates
-- **Try Online:** [Fish Audio Playground](https://fish.audio)
+## Acknowledgements
+
+- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
+- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
+- [GPT VITS](https://github.com/innnky/gpt-vits)
+- [MQTTS](https://github.com/b04901014/MQTTS)
+- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
+- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+
+## Technical Report
+
+```bibtex
+@misc{fish-speech-v1.4,
+      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},
+      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},
+      year={2024},
+      eprint={2411.01156},
+      archivePrefix={arXiv},
+      primaryClass={cs.SD},
+      url={https://arxiv.org/abs/2411.01156},
+}
+```

+ 12 - 142
docs/en/inference.md

@@ -1,54 +1,44 @@
 # Inference
 
-As the vocoder model has been changed, you need more VRAM than before, 12GB is recommended for fluently inference.
-
-We support command line, HTTP API and WebUI for inference, you can choose any method you like.
+The Fish Audio S2 model requires a large amount of VRAM. We recommend using a GPU with at least 24GB for inference.
 
 ## Download Weights
 
-First you need to download the model weights:
+First, you need to download the model weights:
 
 ```bash
-
-# Requires "huggingface_hub[cli]" to be installed
-# pip install huggingface_hub[cli]
-# or 
-# uv tool install huggingface_hub[cli]
-
-hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
+hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ```
 
 ## Command Line Inference
 
-### 1. Get VQ tokens from reference audio
-
 !!! note
     If you plan to let the model randomly choose a voice timbre, you can skip this step.
 
+### 1. Get VQ tokens from reference audio
+
 ```bash
 python fish_speech/models/dac/inference.py \
-    -i "ref_audio_name.wav" \
-    --checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth"
+    -i "test.wav" \
+    --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
 
 You should get a `fake.npy` and a `fake.wav`.
 
-### 2. Generate semantic tokens from text:
+### 2. Generate semantic tokens from text:
 
 ```bash
 python fish_speech/models/text2semantic/inference.py \
     --text "The text you want to convert" \
     --prompt-text "Your reference text" \
     --prompt-tokens "fake.npy" \
-    --compile
+    # --compile
 ```
-with `--prompt-tokens "fake.npy"` and `--prompt-text "Your reference text"` from step 1.
-If you want to let the model randomly choose a voice timbre, skip the two parameters.
 
 This command will create a `codes_N` file in the working directory, where N is an integer starting from 0.
 
 !!! note
-    You may want to use `--compile` to fuse CUDA kernels for faster inference (~15 tokens/second -> ~150 tokens/second, on RTX 4090 GPU).
+    You may want to use `--compile` to fuse CUDA kernels for faster inference. However, we recommend using our sglang inference acceleration optimization.
     Correspondingly, if you do not plan to use acceleration, you can comment out the `--compile` parameter.
 
 !!! info
@@ -56,133 +46,13 @@ This command will create a `codes_N` file in the working directory, where N is a
 
 ### 3. Generate vocals from semantic tokens:
 
-!!! warning "Future Warning"
-    We have kept the interface accessible from the original path (tools/vqgan/inference.py), but this interface may be removed in subsequent releases, so please change your code as soon as possible.
-
 ```bash
 python fish_speech/models/dac/inference.py \
     -i "codes_0.npy" \
 ```
 
-## HTTP API Inference
-
-We provide a HTTP API for inference. You can use the following command to start the server:
-
-```bash
-python -m tools.api_server \
-    --listen 0.0.0.0:8080 \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-
-# or with uv
-uv run tools/api_server.py \
-    --listen 0.0.0.0:8080 \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-> If you want to speed up inference, you can add the `--compile` parameter.
-
-After that, you can view and test the API at http://127.0.0.1:8080/.
-
-## GUI Inference 
-[Download client](https://github.com/AnyaCoder/fish-speech-gui/releases)
+After that, you will get a `fake.wav` file.
 
 ## WebUI Inference
 
-You can start the WebUI using the following command:
-
-```bash
-python -m tools.run_webui \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-Or simply
-
-```bash
-python -m tools.run_webui
-```
-> If you want to speed up inference, you can add the `--compile` parameter.
-
-
-!!! note
-    You can save the label file and reference audio file in advance to the `references` folder in the main directory (which you need to create yourself), so that you can directly call them in the WebUI.
-    Inside the `references` folder, put subdirectories named `<voice_id>`, and put the label file (`sample.lab`, containing the reference text) and reference audio file (`sample.wav`) in the subdirectory.
-
-!!! note
-    You can use Gradio environment variables, such as `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, `GRADIO_SERVER_NAME` to configure WebUI.
-
-## Docker Inference
-
-OpenAudio provides Docker containers for both WebUI and API server inference. You can directly use `docker run` to start the container.
-
-You need to prepare the following:
-- Docker installed with NVIDIA Docker runtime (for GPU support)
-- Model weights downloaded (see [Download Weights](#download-weights) section)
-- Reference audio files (optional, for voice cloning)
-
-```bash
-# Create directories for model weights and reference audio
-mkdir -p checkpoints references
-
-# Download model weights (if not already done)
-# hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
-
-# Start WebUI with CUDA support (recommended for best performance)
-docker run -d \
-    --name fish-speech-webui \
-    --gpus all \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-webui-cuda
-
-# For CPU-only inference (slower, but works without GPU)
-docker run -d \
-    --name fish-speech-webui-cpu \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-webui-cpu
-```
-
-```bash
-# Start API server with CUDA support
-docker run -d \
-    --name fish-speech-server \
-    --gpus all \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-server-cuda
-
-# For CPU-only inference
-docker run -d \
-    --name fish-speech-server-cpu \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-server-cpu
-```
-
-You can customize the Docker containers using these environment variables:
-
-- `COMPILE=1` - Enable torch.compile for ~10x faster inference (CUDA only)
-- `GRADIO_SERVER_NAME=0.0.0.0` - WebUI server host (default: 0.0.0.0)
-- `GRADIO_SERVER_PORT=7860` - WebUI server port (default: 7860)
-- `API_SERVER_NAME=0.0.0.0` - API server host (default: 0.0.0.0)
-- `API_SERVER_PORT=8080` - API server port (default: 8080)
-- `LLAMA_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini` - Path to model weights
-- `DECODER_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini/codec.pth` - Path to decoder weights
-- `DECODER_CONFIG_NAME=modded_dac_vq` - Decoder configuration name
-```
-
-The usage of webui and api server is the same as the webui and api server guide above.
-
-Enjoy
+Coming soon.

+ 8 - 4
docs/en/install.md

@@ -1,11 +1,11 @@
 ## Requirements
 
-- GPU Memory: 12GB (Inference)
+- GPU Memory: 24GB (Inference)
 - System: Linux, WSL
 
 ## System Setup
 
-OpenAudio supports multiple installation methods. Choose the one that best fits your development environment.
+Fish Audio S2 supports multiple installation methods. Choose the one that best fits your development environment.
 
 **Prerequisites**: Install system dependencies for audio processing:
 ``` bash
@@ -26,6 +26,10 @@ pip install -e .[cpu]
 
 # Default installation (uses PyTorch default index)
 pip install -e .
+
+# If you encounter an error during installation due to pyaudio, consider using the following command:
+# conda install pyaudio
+# Then run pip install -e . again
 ```
 
 ### UV
@@ -63,7 +67,7 @@ pip install -e .
 
 ## Docker Setup
 
-OpenAudio S1 series model provides multiple Docker deployment options to suit different needs. You can use pre-built images from Docker Hub, build locally with Docker Compose, or manually build custom images.
+Fish Audio S2 series model provides multiple Docker deployment options to suit different needs. You can use pre-built images from Docker Hub, build locally with Docker Compose, or manually build custom images.
 
 We provided Docker images for both WebUI and API server on both GPU (CUDA 12.6 by default) and CPU. You can use the pre-built images from Docker Hub, or build locally with Docker Compose, or manually build custom images. If you want to build locally, follow the instructions below. If you just want to use the pre-built images, follow the [inference guide](inference.md) to use directly.
 
@@ -71,7 +75,7 @@ We provided Docker images for both WebUI and API server on both GPU(CUDA126 for
 
 - Docker and Docker Compose installed
 - NVIDIA Docker runtime (for GPU support)
-- At least 12GB GPU memory for CUDA inference
+- At least 24GB GPU memory for CUDA inference
 
 # Use docker compose
 

+ 0 - 80
docs/en/samples.md

@@ -1,80 +0,0 @@
-# Examples
-
-## Emotional Control (*New Feature)
-
-### Basic Emotional Examples
-
-| Emotion Type | Language | Input Audio | Synthesized Audio | Prompt |
-|-------------|----------|-------------|-------------------|--------|
-| **Happy** | Chinese | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy_refer.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy.wav" /> | (happy)嘿嘿...博士,悄悄告诉你一件事——我重新开始练小提琴了。 |
-| **Disgusted** | Japanese | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/disgusted.wav" /> | (digusted)あなたは、本当に気持ち悪い、嫌い…(disgusted)それでも、慰めを求めますの? |
-| **Angry** | English | - | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/angry.wav" /> | (angry)I want you to go out immediately! I don't want to see you again, or I will try to kill you! |
-| **Angry** | Chinese | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/作战中4.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/angry.wav" /> | (angry)我让你快滚,你是耳聋吗?!...(angry)信不信我揍你! |
-| **Surprised** | Chinese | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/surprised.wav" /> | (surprised)今天你过生日?既然这样的话,我就勉为其难祝你生日快乐吧。(surprised)要不要看看你的桌子底下? |
-| **Sad** | Japanese | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref2.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/sad.wav" /> | (sad)他の小隊長と比べて、私はまだ多くのことを学ばなくてはなりません......(sad)皆さんのペースに追いつけるよう精一杯努力いたしますわ。 |
-
-## Paralinguistic Effects (*New Feature)
-
-### Laughter Effects
-
-| Example | Language | Prompt | Audio |
-|---------|----------|--------|-------|
-| **Example 1** | Chinese | 大家好啊,(笑声)哈哈,我是从来不带节奏的血狼破军,今天来点大家想看的东西。 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh1.wav" /> |
-| **Example 2** | Chinese | (笑声)哈哈(笑声),虽然说"三角洲行动"的策划说他们没有暗改(笑声)哈哈(笑声),但是我相信,大家心里都有数。对不起,实在是太搞笑了,忍不住笑了出来。(笑声)哈哈(笑声) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh2.wav" /> |
-| **Example 3** | English | (laughing)haha(laughing), though many people say that homeless cats need our help, (laughing)haha(laughing), but seldom do they really do something that is useful to the cats, (laughing)haha(laughing) sorry, but this is very interesting. | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/laugh.wav" /> |
-
-### Battle Cry Effects
-
-| Example | Language | Prompt | Audio |
-|---------|----------|--------|-------|
-| **Battle Cry Example** | English | (shouting)oh my god !!!(shouting)(shouting)(shouting), baby(shouting)you (shouting)are (shouting)a piece of sweet, soft(shouting), delicious cake!!! | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/shout.wav" /> |
-
-## Long Text Stability Test
-
-### Chinese Long Text Test
-
-**Chinese Test Text:**
-```
-你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
-我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
-你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
-搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
-一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
-```
-
-| Test Content | Speaker/Character | Input Audio | Synthesized Audio |
-|-------------|------------------|-------------|-------------------|
-| **Long Text Test** | Xi (Arknights) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio.wav" /> |
-| **Random Speaker** | Random (Volume Warning) | None | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio2.wav" /> |
-
-### English Long Text Test
-
-**English Test Text:**
-```
-In the realm of advanced technology, the evolution of artificial intelligence stands as a 
-monumental achievement. This dynamic field, constantly pushing the boundaries of what 
-machines can do, has seen rapid growth and innovation. From deciphering complex data 
-patterns to driving cars autonomously, AI's applications are vast and diverse.
-```
-
-| Test Content | Speaker | Input Audio | Synthesized Audio |
-|-------------|---------|-------------|-------------------|
-| **Random Speaker 1** | Random | None | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio.wav" /> |
-| **Random Speaker 2** | Random | None | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio2.wav" /> |
-
-### Japanese Long Text Test
-
-**Japanese Test Text:**
-```
-宇宙に始まりはあるが、終わりはない。無限。
-星にもまた始まりはあるが、自らの力をもって滅び逝く。有限。
-英知を持つ者こそ、最も愚かであること。歴史からも読み取れる。
-海に生ける魚は、陸の世界を知らない。彼らが英知を持てば、それもまた滅び逝く。
-人間が光の速さを超えるのは、魚たちが陸で生活を始めるよりも滑稽。
-これは抗える者たちに対する、神からの最後通告とも言えよう。
-```
-
-| Test Content | Speaker/Character | Input Audio | Synthesized Audio |
-|-------------|------------------|-------------|-------------------|
-| **Long Text Test** | Sakiko Toyogawa | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio.wav" /> |
-| **Random Speaker** | Random | None | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio2.wav" /> |

+ 89 - 122
docs/ja/index.md

@@ -1,166 +1,133 @@
-# OpenAudio (旧 Fish-Speech)
-
-<div align="center">
-
 <div align="center">
+<h1>Fish Speech</h1>
 
-<img src="../assets/openaudio.jpg" alt="OpenAudio" style="display: block; margin: 0 auto; width: 35%;"/>
-
-</div>
-
-<strong>先進的なText-to-Speechモデルシリーズ</strong>
+[English](../en/) | [简体中文](../zh/) | [Portuguese](../pt/) | **日本語** | [한국어](../ko/) | [العربية](../ar/) <br>
 
-<div>
-<a target="_blank" href="https://discord.gg/Es5qTB9BcN">
-<img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
-</a>
-<a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
-<img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
-</a>
-<a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
-<img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+<a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
+<a href="https://trendshift.io/repositories/7014" target="_blank">
+    <img src="https://trendshift.io/api/badge/repositories/7014" alt="fishaudio%2Ffish-speech | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/>
 </a>
 </div>
 
-<strong>今すぐ試す:</strong> <a href="https://fish.audio">Fish Audio Playground</a> | <strong>詳細情報:</strong> <a href="https://openaudio.com">OpenAudio ウェブサイト</a>
+<br>
 
+<div align="center">
+    <img src="https://count.getloli.com/get/@fish-speech?theme=asoul" /><br>
 </div>
 
----
-
-!!! note "ライセンスに関するお知らせ"
-    このコードベースは **Apache ライセンス** の下でリリースされ、すべてのモデルウェイトは **CC-BY-NC-SA-4.0 ライセンス** の下でリリースされています。詳細については、[コードライセンス](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) と [モデルライセンス](https://spdx.org/licenses/CC-BY-NC-SA-4.0) を参照してください。
-
-!!! warning "法的免責事項"
-    コードベースの違法な使用について、当方は一切の責任を負いません。お住まいの地域のDMCAおよびその他の関連法規をご参照ください。
-
-## **紹介**
-
-私たちは **OpenAudio** への改名を発表できることを嬉しく思います。Fish-Speechを基盤とし、大幅な改善と新機能を加えた、新しい先進的なText-to-Speechモデルシリーズを紹介します。
-
-**Openaudio-S1-mini**: [ブログ](https://openaudio.com/blogs/s1); [動画](https://www.youtube.com/watch?v=SYuPvd7m06A); [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini);
-
-**Fish-Speech v1.5**: [動画](https://www.bilibili.com/video/BV1EKiDYBE4o/); [Hugging Face](https://huggingface.co/fishaudio/fish-speech-1.5);
-
-## **ハイライト**
-
-### **優秀なTTS品質**
-
-Seed TTS評価指標を使用してモデルのパフォーマンスを評価した結果、OpenAudio S1は英語テキストで**0.008 WER**と**0.004 CER**を達成し、以前のモデルより大幅に改善されました。(英語、自動評価、OpenAI gpt-4o-転写に基づく、話者距離はRevai/pyannote-wespeaker-voxceleb-resnet34-LM使用)
-
-| モデル | 単語誤り率 (WER) | 文字誤り率 (CER) | 話者距離 |
-|:-----:|:--------------------:|:-------------------------:|:----------------:|
-| **S1** | **0.008** | **0.004** | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
-
-### **TTS-Arena2最高モデル**
-
-OpenAudio S1は[TTS-Arena2](https://arena.speechcolab.org/)で**#1ランキング**を達成しました。これはtext-to-speech評価のベンチマークです:
+<br>
 
 <div align="center">
-    <img src="../assets/Elo.jpg" alt="TTS-Arena2 Ranking" style="width: 75%;" />
+    <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
+        <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
+    </a>
+    <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
+        <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+    </a>
+    <a target="_blank" href="https://pd.qq.com/s/bwxia254o">
+      <img alt="QQ Channel" src="https://img.shields.io/badge/QQ-blue?logo=tencentqq">
+    </a>
 </div>
 
-### **音声制御**
-OpenAudio S1は**多様な感情、トーン、特殊マーカーをサポート**して音声合成を強化します:
-
-- **基本感情**:
-```
-(怒った) (悲しい) (興奮した) (驚いた) (満足した) (喜んだ) 
-(怖がった) (心配した) (動揺した) (緊張した) (欲求不満な) (落ち込んだ)
-(共感した) (恥ずかしい) (嫌悪した) (感動した) (誇らしい) (リラックスした)
-(感謝した) (自信のある) (興味のある) (好奇心のある) (困惑した) (楽しい)
-```
-
-- **高度な感情**:
-```
-(軽蔑的な) (不幸な) (不安な) (ヒステリックな) (無関心な) 
-(いらいらした) (罪悪感のある) (軽蔑的な) (パニックした) (激怒した) (不本意な)
-(熱心な) (不賛成の) (否定的な) (否定する) (驚いた) (真剣な)
-(皮肉な) (和解的な) (慰める) (誠実な) (冷笑的な)
-(躊躇する) (譲歩する) (痛々しい) (気まずい) (面白がった)
-```
+<div align="center">
+    <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
+      <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
+    </a>
+    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
+        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
+    </a>
+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
+        <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
+    </a>
+</div>
 
-(現在英語、中国語、日本語をサポート、より多くの言語が近日公開予定!)
+!!! info "ライセンス通知"
+    このコードベースおよび関連するモデルの重みは **FISH AUDIO RESEARCH LICENSE** の下でリリースされています。詳細は [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) を参照してください。
 
-- **トーンマーカー**:
-```
-(急いだ調子で) (叫んで) (悲鳴をあげて) (ささやいて) (柔らかい調子で)
-```
+!!! warning "法的免責事項"
+    私たちは、コードベースのいかなる違法な使用に対しても責任を負いません。DMCA およびその他の関連法に関する現地の規制を参照してください。
 
-- **特殊音響効果**:
-```
-(笑って) (くすくす笑って) (すすり泣いて) (大声で泣いて) (ため息をついて) (息を切らして)
-(うめいて) (群衆の笑い声) (背景の笑い声) (観客の笑い声)
-```
+## ここから始める
 
-Ha,ha,haを使用してコントロールすることもでき、他にも多くの使用法があなた自身の探索を待っています。
+これは Fish Speech の公式ドキュメントです。説明に従って簡単に使い始めることができます。
 
-### **2つのモデルタイプ**
+- [インストール](install.md)
+- [推論](inference.md)
 
-異なるニーズに対応する2つのモデルバリエーションを提供しています:
+## Fish Audio S2
+**オープンソースおよびクローズドソースの中で最高峰のテキスト読み上げシステム**
 
-- **OpenAudio S1 (40億パラメータ)**:[fish.audio](https://fish.audio) で利用可能な全機能搭載のフラッグシップモデルで、すべての高度な機能を備えた最高品質の音声合成を提供します
+Fish Audio S2 は [Fish Audio](https://fish.audio/) によって開発された最新のモデルで、自然でリアル、かつ感情豊かな音声を生成するように設計されています。機械的でも平坦でもなく、スタジオスタイルの朗読に限定されません。
 
-- **OpenAudio S1-mini (5億パラメータ)**:コア機能を備えた蒸留版で、[Hugging Face Space](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) で利用可能です。優秀な品質を維持しながら、より高速な推論のために最適化されています。
+Fish Audio S2 は日常会話に焦点を当てており、ネイティブなマルチ話者およびマルチターン生成をサポートしています。また、指示制御もサポートしています。
 
-S1とS1-miniの両方にオンライン人間フィードバック強化学習(RLHF)が組み込まれています。
+S2 シリーズには複数のモデルが含まれており、オープンソースモデルは S2-Pro で、シリーズの中で最も強力なモデルです。
 
-## **機能**
+リアルタイム体験については、[Fish Audio Webサイト](https://fish.audio/) をご覧ください。
 
-1. **ゼロショット・フューショットTTS:** 10〜30秒の音声サンプルを入力するだけで高品質なTTS出力を生成します。**詳細なガイドラインについては、[音声クローニングのベストプラクティス](https://docs.fish.audio/text-to-speech/voice-clone-best-practices)をご覧ください。**
+### モデルバリアント
 
-2. **多言語・言語横断サポート:** 多言語テキストを入力ボックスにコピー&ペーストするだけで、言語を気にする必要はありません。現在、英語、日本語、韓国語、中国語、フランス語、ドイツ語、アラビア語、スペイン語をサポートしています。
+| モデル | サイズ | 利用可能性 | 説明 |
+|------|------|-------------|-------------|
+| S2-Pro | 4B パラメータ | [huggingface](https://huggingface.co/fishaudio/s2-pro) | 最高品質と安定性を備えたフル機能のフラッグシップモデル |
+| S2-Flash | - - - - | [fish.audio](https://fish.audio/) | より高速で低遅延のクローズドソースモデル |
 
-3. **音素依存なし:** このモデルは強力な汎化能力を持ち、TTSに音素に依存しません。あらゆる言語スクリプトのテキストを処理できます。
+モデルの詳細については、技術レポートを参照してください。
 
-4. **高精度:** Seed-TTS Evalで低い文字誤り率(CER)約0.4%と単語誤り率(WER)約0.8%を達成します。
+## ハイライト
 
-5. **高速:** torch compile加速により、Nvidia RTX 4090でリアルタイム係数約1:7。
+<img src="../assets/totalability.png" width=200%>
 
-6. **WebUI推論:** Chrome、Firefox、Edge、その他のブラウザと互換性のあるGradioベースの使いやすいWebUIを備えています。
+### 自然言語制御
 
-7. **GUI推論:** APIサーバーとシームレスに連携するPyQt6グラフィカルインターフェースを提供します。Linux、Windows、macOSをサポートします。[GUIを見る](https://github.com/AnyaCoder/fish-speech-gui)
+Fish Audio S2 では、ユーザーが自然言語を使用して各文のパフォーマンス、副言語情報、感情、その他の音声特性を制御できます。短いタグを使用してモデルのパフォーマンスを大まかに制御できるだけでなく、生成されるコンテンツ全体の品質を大幅に向上させます。
 
-8. **デプロイフレンドリー:** Linux、Windows(MacOS近日公開)のネイティブサポートで推論サーバーを簡単にセットアップし、速度低下を最小化します。
+### 多言語サポート
 
-## **メディア・デモ**
+Fish Audio S2 は、音素や特定の言語のプリプロセスを必要とせず、高品質な多言語テキスト読み上げをサポートしています。以下を含みます:
 
-<!-- <div align="center"> -->
+**英語、中国語、日本語、韓国語、アラビア語、ドイツ語、フランス語...**
 
-<h3><strong>ソーシャルメディア</strong></h3>
-<a href="https://x.com/FishAudio/status/1929915992299450398" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-最新デモ-black?style=for-the-badge&logo=x&logoColor=white" alt="Latest Demo on X" />
-</a>
+**さらに追加予定!**
 
-<h3><strong>インタラクティブデモ</strong></h3>
+リストは常に拡大しています。最新のリリースについては [Fish Audio](https://fish.audio/) を確認してください。
 
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish_Audio-OpenAudio_S1を試す-blue?style=for-the-badge" alt="Try OpenAudio S1" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-S1_Miniを試す-yellow?style=for-the-badge" alt="Try S1 Mini" />
-</a>
+### ネイティブマルチ話者生成
 
-<h3><strong>動画ショーケース</strong></h3>
-<div align="center">
-<iframe width="560" height="315" src="https://www.youtube.com/embed/SYuPvd7m06A" title="OpenAudio S1 Video" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
-</div>
+<img src="../assets/chattemplate.png" width=200%>
 
-## **ドキュメント**
+Fish Audio S2 では、ユーザーが複数の話者を含むリファレンスオーディオをアップロードでき、モデルは `<|speaker:i|>` トークンを通じて各話者の特徴を処理します。その後、話者 ID トークンを介してモデルのパフォーマンスを制御し、1 回の生成で複数の話者を実現できます。話者ごとに個別にリファレンスオーディオをアップロードして音声を生成する必要はもうありません。
 
-### クイックスタート
-- [環境構築](install.md) - 開発環境をセットアップ
-- [推論ガイド](inference.md) - モデルを実行して音声を生成
+### マルチターン対話生成
 
-## **コミュニティ・サポート**
+モデルのコンテキストの拡張により、以前のコンテキストの情報を使用して、その後に生成されるコンテンツの表現力を向上させ、コンテンツの自然度を高めることができるようになりました。
 
-- **Discord:** [Discordコミュニティ](https://discord.gg/Es5qTB9BcN)に参加
-- **ウェブサイト:** 最新アップデートは[OpenAudio.com](https://openaudio.com)をご覧ください
-- **オンライン試用:** [Fish Audio Playground](https://fish.audio)
+### 高速音声クローン
 
-このコードベースは **Apache ライセンス** の下でリリースされ、すべてのモデルウェイトは **CC-BY-NC-SA-4.0 ライセンス** の下でリリースされています。詳細については、[コードライセンス](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) と [モデルライセンス](https://spdx.org/licenses/CC-BY-NC-SA-4.0) を参照してください
+Fish Audio S2 は、短いリファレンスサンプル(通常 10〜30 秒)を使用した正確な音声クローンをサポートしています。モデルは音色、話し方、感情的な傾向を捉えることができ、追加の微調整なしでリアルで一貫したクローン音声を生成できます。
 
-## モデル
+---
 
-OpenAudio S1 は OpenAudio シリーズの最初のモデルです。これは、VQ コードからオーディオを再構築できるデュアルデコーダ VQ-GAN ボコーダです。
+## 謝辞
+
+- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
+- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
+- [GPT VITS](https://github.com/innnky/gpt-vits)
+- [MQTTS](https://github.com/b04901014/MQTTS)
+- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
+- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+
+## 技術報告
+
+```bibtex
+@misc{fish-speech-v1.4,
+      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},
+      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},
+      year={2024},
+      eprint={2411.01156},
+      archivePrefix={arXiv},
+      primaryClass={cs.SD},
+      url={https://arxiv.org/abs/2411.01156},
+}
+```

+ 19 - 132
docs/ja/inference.md

@@ -1,171 +1,58 @@
 # 推論
 
-ボコーダーモデルが変更されたため、以前よりも多くのVRAMが必要です。スムーズな推論には12GBを推奨します。
-
-推論には、コマンドライン、HTTP API、WebUIをサポートしており、お好きな方法を選択できます。
+Fish Audio S2 モデルは大きなビデオメモリを必要とします。推論には少なくとも 24GB の GPU を使用することをお勧めします。
 
 ## 重みのダウンロード
 
 まず、モデルの重みをダウンロードする必要があります:
 
 ```bash
-hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
+hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ```
 
 ## コマンドライン推論
 
 !!! note
-    モデルにランダムに音色を選択させる場合は、この手順をスキップできます。
+    モデルに音声をランダムに選択させる場合は、このステップをスキップできます。
 
-### 1. 参照音声からVQトークンを取得
+### 1. リファレンスオーディオから VQ トークンを取得する
 
 ```bash
 python fish_speech/models/dac/inference.py \
-    -i "ref_audio_name.wav" \
-    --checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth"
+    -i "test.wav" \
+    --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
 
-`fake.npy` と `fake.wav` が得られるはずです。
+`fake.npy` と `fake.wav` が生成されるはずです。
 
-### 2. テキストからセマンティックトークンを生成
+### 2. テキストからセマンティックトークンを生成する
 
 ```bash
 python fish_speech/models/text2semantic/inference.py \
     --text "変換したいテキスト" \
-    --prompt-text "参照テキスト" \
+    --prompt-text "リファレンステキスト" \
     --prompt-tokens "fake.npy" \
-    --compile
+    # --compile
 ```
 
-このコマンドは、作業ディレクトリに `codes_N` ファイルを作成します(Nは0から始まる整数)
+このコマンドは、作業ディレクトリに `codes_N` ファイルを作成します。ここで N は 0 から始まる整数です。
 
 !!! note
-    より高速な推論のために `--compile` を使用してCUDAカーネルを融合することができます(約15トークン/秒 -> 約150トークン/秒, RTX 4090 GPU)
-    対応して、加速を使用しない場合は、`--compile` パラメータをコメントアウトできます
+    より高速な推論のために CUDA カーネルを融合する `--compile` を使用したい場合がありますが、私たちの sglang 推論加速最適化を使用することをお勧めします。
+    同様に、加速を使用する予定がない場合は、`--compile` パラメータをコメントアウトしてください。
 
 !!! info
-    bf16をサポートしないGPUの場合、`--half` パラメータの使用が必要かもしれません。
-
-### 3. セマンティックトークンから音声を生成:
+    bf16 をサポートしていない GPU の場合、`--half` パラメータを使用する必要があるかもしれません。
 
-!!! warning "将来の警告"
-    元のパス(tools/vqgan/inference.py)からアクセス可能なインターフェースを維持していますが、このインターフェースは後続のリリースで削除される可能性があるため、できるだけ早くコードを変更してください。
+### 3. セマンティックトークンから音声を生成する:
 
 ```bash
 python fish_speech/models/dac/inference.py \
-    -i "codes_0.npy"
-```
-
-## HTTP API推論
-
-推論用のHTTP APIを提供しています。以下のコマンドでサーバーを開始できます:
-
-```bash
-python -m tools.api_server \
-    --listen 0.0.0.0:8080 \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-> 推論を高速化したい場合は、`--compile` パラメータを追加できます。
-
-その後、http://127.0.0.1:8080/ でAPIを表示・テストできます。
-
-## GUI推論 
-[クライアントをダウンロード](https://github.com/AnyaCoder/fish-speech-gui/releases)
-
-## WebUI推論
-
-以下のコマンドでWebUIを開始できます:
-
-```bash
-python -m tools.run_webui \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-または単純に
-
-```bash
-python -m tools.run_webui
+    -i "codes_0.npy" \
 ```
-> 推論を高速化したい場合は、`--compile` パラメータを追加できます。
-
-!!! note
-    ラベルファイルと参照音声ファイルをメインディレクトリの `references` フォルダに事前に保存することができます(自分で作成する必要があります)。これにより、WebUIで直接呼び出すことができます。
-
-!!! note
-    `GRADIO_SHARE`、`GRADIO_SERVER_PORT`、`GRADIO_SERVER_NAME` などのGradio環境変数を使用してWebUIを設定できます。
-
-## Dockerでの推論
-
-OpenAudioは、WebUIとAPIサーバーの両方でDockerコンテナを提供しています。`docker run`コマンドを直接使用してコンテナを起動できます。
-
-以下の準備が必要です:
-- DockerとNVIDIA Dockerランタイムがインストール済みであること(GPUサポート用)
-- モデルの重みがダウンロード済みであること([重みのダウンロード](#重みのダウンロード)セクションを参照)
-- 参照音声ファイル(オプション、声のクローニング用)
 
-```bash
-# モデルの重みと参照音声用のディレクトリを作成
-mkdir -p checkpoints references
-
-# モデルの重みをダウンロード(まだの場合)
-# hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
-
-# CUDAサポート付きでWebUIを起動(推奨、最高のパフォーマンス)
-docker run -d \
-    --name fish-speech-webui \
-    --gpus all \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-webui-cuda
-
-# CPUのみでの推論(低速ですが、GPUなしで動作します)
-docker run -d \
-    --name fish-speech-webui-cpu \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-webui-cpu
-```
-
-```bash
-# CUDAサポート付きでAPIサーバーを起動
-docker run -d \
-    --name fish-speech-server \
-    --gpus all \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-server-cuda
-
-# CPUのみでの推論
-docker run -d \
-    --name fish-speech-server-cpu \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-server-cpu
-```
-
-以下の環境変数を使用してDockerコンテナをカスタマイズできます:
-
-- `COMPILE=1` - `torch.compile`を有効にして推論を高速化(約10倍、CUDAのみ)
-- `GRADIO_SERVER_NAME=0.0.0.0` - WebUIサーバーのホスト(デフォルト: 0.0.0.0)
-- `GRADIO_SERVER_PORT=7860` - WebUIサーバーのポート(デフォルト: 7860)
-- `API_SERVER_NAME=0.0.0.0` - APIサーバーのホスト(デフォルト: 0.0.0.0)
-- `API_SERVER_PORT=8080` - APIサーバーのポート(デフォルト: 8080)
-- `LLAMA_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini` - モデルの重みへのパス
-- `DECODER_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini/codec.pth` - デコーダーの重みへのパス
-- `DECODER_CONFIG_NAME=modded_dac_vq` - デコーダーの設定名
-```
+その後、`fake.wav` ファイルが取得できます。
 
-WebUIとAPIサーバーの使い方は、上記のガイドと同じです。
+## WebUI 推論
 
-お楽しみください!
+まもなく公開予定です。

+ 8 - 4
docs/ja/install.md

@@ -1,11 +1,11 @@
 ## 必要条件
 
-- GPUメモリ: 12GB (推論時)
+- GPUメモリ: 24GB (推論時)
 - システム: Linux, WSL
 
 ## システムセットアップ
 
-OpenAudioは複数のインストール方法をサポートしています。ご自身の開発環境に最も適した方法をお選びください。
+Fish Audio S2は複数のインストール方法をサポートしています。ご自身の開発環境に最も適した方法をお選びください。
 
 **前提条件**: 音声処理のためのシステム依存関係をインストールします:
 ``` bash
@@ -26,6 +26,10 @@ pip install -e .[cpu]
 
 # デフォルトインストール (PyTorchのデフォルトインデックスを使用)
 pip install -e .
+
+# pyaudioのインストールでエラーが発生する場合は、以下のコマンドを試してください:
+# conda install pyaudio
+# その後、再度 pip install -e . を実行してください
 ```
 
 ### UV
@@ -63,7 +67,7 @@ pip install -e .
 
 ## Dockerセットアップ
 
-OpenAudio S1シリーズモデルは、さまざまなニーズに応えるため複数のDockerデプロイメントオプションを提供しています。Docker Hubのビルド済みイメージを使用するか、Docker Composeでローカルビルドするか、手動でカスタムイメージをビルドすることができます。
+Fish Audio S2シリーズモデルは、さまざまなニーズに応えるため複数のDockerデプロイメントオプションを提供しています。Docker Hubのビルド済みイメージを使用するか、Docker Composeでローカルビルドするか、手動でカスタムイメージをビルドすることができます。
 
 WebUIとAPIサーバーの両方について、GPU(デフォルトはCUDA 12.6)版とCPU版のDockerイメージを提供しています。Docker Hubのビルド済みイメージを使用するか、Docker Composeでローカルビルドするか、手動でカスタムイメージをビルドするかを選択できます。ローカルでビルドする場合は、以下の手順に従ってください。ビルド済みイメージを使用するだけの場合は、[推論ガイド](inference.md)を直接参照してください。
 
@@ -71,7 +75,7 @@ WebUIとAPIサーバーの両方について、GPU(デフォルトはCUDA 12.6
 
 - DockerとDocker Composeがインストール済みであること
 - NVIDIA Dockerランタイムがインストール済みであること(GPUサポート用)
-- CUDAによる推論のために、少なくとも12GBのGPUメモリがあること
+- CUDAによる推論のために、少なくとも24GBのGPUメモリがあること
 
 ### Docker Composeの使用
 

+ 0 - 80
docs/ja/samples.md

@@ -1,80 +0,0 @@
-# サンプル
-
-## 感情制御(*新機能)
-
-### 基本感情サンプル
-
-| 感情タイプ | 言語 | 入力音声 | 合成音声 | プロンプト |
-|-----------|------|----------|----------|-----------|
-| **嬉しい** | 中国語 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy_refer.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy.wav" /> | (happy)嘿嘿...博士,悄悄告诉你一件事——我重新开始练小提琴了。 |
-| **嫌悪** | 日本語 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/disgusted.wav" /> | (digusted)あなたは、本当に気持ち悪い、嫌い…(disgusted)それでも、慰めを求めますの? |
-| **怒り** | 英語 | - | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/angry.wav" /> | (angry)I want you to go out immediately! I don't want to see you again, or I will try to kill you! |
-| **怒り** | 中国語 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/作战中4.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/angry.wav" /> | (angry)我让你快滚,你是耳聋吗?!...(angry)信不信我揍你! |
-| **驚き** | 中国語 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/surprised.wav" /> | (surprised)今天你过生日?既然这样的话,我就勉为其难祝你生日快乐吧。(surprised)要不要看看你的桌子底下? |
-| **悲しみ** | 日本語 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref2.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/sad.wav" /> | (sad)他の小隊長と比べて、私はまだ多くのことを学ばなくてはなりません......(sad)皆さんのペースに追いつけるよう精一杯努力いたしますわ。 |
-
-## パラ言語効果(*新機能)
-
-### 笑い声効果
-
-| サンプル | 言語 | プロンプト | 音声 |
-|---------|------|-----------|------|
-| **サンプル 1** | 中国語 | 大家好啊,(笑声)哈哈,我是从来不带节奏的血狼破军,今天来点大家想看的东西。 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh1.wav" /> |
-| **サンプル 2** | 中国語 | (笑声)哈哈(笑声),虽然说"三角洲行动"的策划说他们没有暗改(笑声)哈哈(笑声),但是我相信,大家心里都有数。对不起,实在是太搞笑了,忍不住笑了出来。(笑声)哈哈(笑声) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh2.wav" /> |
-| **サンプル 3** | 英語 | (laughing)haha(laughing), though many people say that homeless cats need our help, (laughing)haha(laughing), but seldom do they really do something that is useful to the cats, (laughing)haha(laughing) sorry, but this is very interesting. | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/laugh.wav" /> |
-
-### 戦吼効果
-
-| サンプル | 言語 | プロンプト | 音声 |
-|---------|------|-----------|------|
-| **戦吼サンプル** | 英語 | (shouting)oh my god !!!(shouting)(shouting)(shouting), baby(shouting)you (shouting)are (shouting)a piece of sweet, soft(shouting), delicious cake!!! | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/shout.wav" /> |
-
-## 長文安定性テスト
-
-### 中国語長文テスト
-
-**中国語テストテキスト:**
-```
-你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
-我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
-你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
-搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
-一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
-```
-
-| テスト内容 | 話者/キャラクター | 入力音声 | 合成音声 |
-|-----------|------------------|----------|----------|
-| **長文テスト** | 夕(アークナイツ) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio.wav" /> |
-| **ランダム話者** | ランダム(音量注意) | なし | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio2.wav" /> |
-
-### 英語長文テスト
-
-**英語テストテキスト:**
-```
-In the realm of advanced technology, the evolution of artificial intelligence stands as a 
-monumental achievement. This dynamic field, constantly pushing the boundaries of what 
-machines can do, has seen rapid growth and innovation. From deciphering complex data 
-patterns to driving cars autonomously, AI's applications are vast and diverse.
-```
-
-| テスト内容 | 話者 | 入力音声 | 合成音声 |
-|-----------|------|----------|----------|
-| **ランダム話者 1** | ランダム | なし | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio.wav" /> |
-| **ランダム話者 2** | ランダム | なし | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio2.wav" /> |
-
-### 日本語長文テスト
-
-**日本語テストテキスト:**
-```
-宇宙に始まりはあるが、終わりはない。無限。
-星にもまた始まりはあるが、自らの力をもって滅び逝く。有限。
-英知を持つ者こそ、最も愚かであること。歴史からも読み取れる。
-海に生ける魚は、陸の世界を知らない。彼らが英知を持てば、それもまた滅び逝く。
-人間が光の速さを超えるのは、魚たちが陸で生活を始めるよりも滑稽。
-これは抗える者たちに対する、神からの最後通告とも言えよう。
-```
-
-| テスト内容 | 話者/キャラクター | 入力音声 | 合成音声 |
-|-----------|------------------|----------|----------|
-| **長文テスト** | 豊川祥子 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio.wav" /> |
-| **ランダム話者** | ランダム | なし | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio2.wav" /> |

+ 89 - 119
docs/ko/index.md

@@ -1,163 +1,133 @@
-# OpenAudio (구 Fish-Speech)
-
-<div align="center">
-
 <div align="center">
+<h1>Fish Speech</h1>
 
-<img src="../assets/openaudio.jpg" alt="OpenAudio" style="display: block; margin: 0 auto; width: 35%;"/>
-
-</div>
-
-<strong>고급 텍스트-음성 변환 모델 시리즈</strong>
+[English](../en/) | [简体中文](../zh/) | [Portuguese](../pt/) | [日本語](../ja/) | **한국어** | [العربية](../ar/) <br>
 
-<div>
-<a target="_blank" href="https://discord.gg/Es5qTB9BcN">
-<img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
-</a>
-<a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
-<img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
-</a>
-<a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
-<img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+<a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
+<a href="https://trendshift.io/repositories/7014" target="_blank">
+    <img src="https://trendshift.io/api/badge/repositories/7014" alt="fishaudio%2Ffish-speech | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/>
 </a>
 </div>
 
-<strong>지금 체험:</strong> <a href="https://fish.audio">Fish Audio Playground</a> | <strong>자세히 알아보기:</strong> <a href="https://openaudio.com">OpenAudio 웹사이트</a>
+<br>
 
+<div align="center">
+    <img src="https://count.getloli.com/get/@fish-speech?theme=asoul" /><br>
 </div>
 
----
-
-!!! note "라이선스 공지"
-    이 코드베이스는 **Apache 라이선스**에 따라 배포되며, 모든 모델 가중치는 **CC-BY-NC-SA-4.0 라이선스**에 따라 배포됩니다. 자세한 내용은 [코드 라이선스](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) 및 [모델 라이선스](https://spdx.org/licenses/CC-BY-NC-SA-4.0)를 참조하십시오.
-
-!!! warning "법적 면책조항"
-    코드베이스의 불법적인 사용에 대해서는 일체 책임을 지지 않습니다. 귀하의 지역의 DMCA 및 기타 관련 법률을 참고하시기 바랍니다.
-
-## **소개**
-
-저희는 **OpenAudio**로의 브랜드 변경을 발표하게 되어 기쁩니다. Fish-Speech를 기반으로 하여 상당한 개선과 새로운 기능을 추가한 새로운 고급 텍스트-음성 변환 모델 시리즈를 소개합니다.
-
-**Openaudio-S1-mini**: [블로그](https://openaudio.com/blogs/s1); [동영상](https://www.youtube.com/watch?v=SYuPvd7m06A); [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini);
-
-**Fish-Speech v1.5**: [동영상](https://www.bilibili.com/video/BV1EKiDYBE4o/); [Hugging Face](https://huggingface.co/fishaudio/fish-speech-1.5);
-
-## **주요 특징**
-
-### **뛰어난 TTS 품질**
-
-Seed TTS 평가 지표를 사용하여 모델 성능을 평가한 결과, OpenAudio S1은 영어 텍스트에서 **0.008 WER**과 **0.004 CER**을 달성하여 이전 모델보다 현저히 향상되었습니다. (영어, 자동 평가, OpenAI gpt-4o-전사 기반, 화자 거리는 Revai/pyannote-wespeaker-voxceleb-resnet34-LM 사용)
-
-| 모델 | 단어 오류율 (WER) | 문자 오류율 (CER) | 화자 거리 |
-|:-----:|:--------------------:|:-------------------------:|:----------------:|
-| **S1** | **0.008** | **0.004** | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
+<br>
 
-### **TTS-Arena2 최고 모델**
-
-OpenAudio S1은 [TTS-Arena2](https://arena.speechcolab.org/)에서 **#1 순위**를 달성했습니다. 이는 텍스트 음성 변환 평가의 기준입니다:
+<div align="center">
+    <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
+        <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
+    </a>
+    <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
+        <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+    </a>
+    <a target="_blank" href="https://pd.qq.com/s/bwxia254o">
+      <img alt="QQ Channel" src="https://img.shields.io/badge/QQ-blue?logo=tencentqq">
+    </a>
+</div>
 
 <div align="center">
-    <img src="assets/Elo.jpg" alt="TTS-Arena2 Ranking" style="width: 75%;" />
+    <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
+      <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
+    </a>
+    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
+        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
+    </a>
+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
+        <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
+    </a>
 </div>
 
-### **음성 제어**
-OpenAudio S1은 **다양한 감정, 톤, 특수 마커를 지원**하여 음성 합성을 향상시킵니다:
+!!! info "라이선스 공지"
+    이 코드베이스 및 관련 모델 가중치는 **FISH AUDIO RESEARCH LICENSE** 하에 릴리스되었습니다. 자세한 내용은 [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE)를 참조하십시오.
 
-- **기본 감정**:
-```
-(화난) (슬픈) (흥미진진한) (놀란) (만족한) (기쁜) 
-(무서워하는) (걱정하는) (속상한) (긴장한) (좌절한) (우울한)
-(공감하는) (당황한) (역겨워하는) (감동한) (자랑스러운) (편안한)
-(감사한) (자신감있는) (관심있는) (호기심있는) (혼란스러운) (즐거운)
-```
+!!! warning "법적 면책 조항"
+    코드베이스의 불법적인 사용에 대해 당사는 어떠한 책임도 지지 않습니다. DMCA 및 기타 관련 법률에 관한 현지 규정을 참조하십시오.
 
-- **고급 감정**:
-```
-(경멸하는) (불행한) (불안한) (히스테리컬한) (무관심한) 
-(참을성없는) (죄책감있는) (멸시하는) (공황상태의) (격분한) (마지못한)
-(열망하는) (불찬성하는) (부정적인) (부인하는) (놀란) (진지한)
-(비꼬는) (화해하는) (위로하는) (진실한) (비웃는)
-(주저하는) (굴복하는) (고통스러운) (어색한) (재미있어하는)
-```
+## 시작하기
 
-(현재 영어, 중국어, 일본어를 지원하며, 더 많은 언어가 곧 출시될 예정입니다!)
+Fish Speech의 공식 문서입니다. 지침에 따라 쉽게 시작할 수 있습니다.
 
-- **톤 마커**:
-```
-(서두르는 톤으로) (소리치며) (비명지르며) (속삭이며) (부드러운 톤으로)
-```
+- [설치](install.md)
+- [추론](inference.md)
 
-- **특수 음향 효과**:
-```
-(웃으며) (킥킥거리며) (흐느끼며) (크게 울며) (한숨쉬며) (헐떡이며)
-(신음하며) (군중 웃음소리) (배경 웃음소리) (관객 웃음소리)
-```
+## Fish Audio S2
+**오픈 소스 및 클로즈드 소스 중 최고봉의 텍스트 음성 변환 시스템**
 
-Ha,ha,ha를 사용하여 제어할 수도 있으며, 여러분 스스로 탐구할 수 있는 다른 많은 사용법이 있습니다.
+Fish Audio S2는 [Fish Audio](https://fish.audio/)에서 개발한 최신 모델로, 자연스럽고 사실적이며 감정이 풍부한 음성을 생성하도록 설계되었습니다. 기계적이거나 평면적이지 않으며, 스튜디오 스타일의 낭독에 국한되지 않습니다.
 
-### **두 가지 모델 유형**
+Fish Audio S2는 일상 대화에 중점을 두고 있으며, 네이티브 다중 화자 및 다중 턴 생성을 지원합니다. 또한 명령 제어를 지원합니다.
 
-<div align="center">
+S2 시리즈에는 여러 모델이 포함되어 있으며, 오픈 소스 모델은 S2-Pro로, 시리즈 중에서 가장 강력한 모델입니다.
 
-| 모델 | 크기 | 가용성 | 특징 |
-|-------|------|--------------|----------|
-| **S1** | 40억 매개변수 | [fish.audio](https://fish.audio)에서 이용 가능 | 모든 기능을 갖춘 플래그십 모델 |
-| **S1-mini** | 5억 매개변수 | huggingface [hf space](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini)에서 이용 가능 | 핵심 기능을 갖춘 경량화 버전 |
+실시간 체험은 [Fish Audio 웹사이트](https://fish.audio/)를 방문해 주세요.
 
-</div>
+### 모델 변형
 
-S1과 S1-mini 모두 온라인 인간 피드백 강화 학습(RLHF)이 통합되어 있습니다.
+| 모델 | 크기 | 가용성 | 설명 |
+|------|------|-------------|-------------|
+| S2-Pro | 4B 매개변수 | [huggingface](https://huggingface.co/fishaudio/s2-pro) | 최고의 품질과 안정성을 갖춘 풀 기능 플래그십 모델 |
+| S2-Flash | - - - - | [fish.audio](https://fish.audio/) | 더 빠른 속도와 짧은 지연 시간을 갖춘 클로즈드 소스 모델 |
 
-## **기능**
+모델에 대한 자세한 내용은 기술 보고서를 참조하십시오.
 
-1. **제로샷 및 퓨샷 TTS:** 10~30초의 음성 샘플을 입력하여 고품질 TTS 출력을 생성합니다. **자세한 가이드라인은 [음성 복제 모범 사례](https://docs.fish.audio/text-to-speech/voice-clone-best-practices)를 참조하세요.**
+## 하이라이트
 
-2. **다국어 및 교차 언어 지원:** 다국어 텍스트를 입력 상자에 복사하여 붙여넣기만 하면 됩니다. 언어에 대해 걱정할 필요가 없습니다. 현재 영어, 일본어, 한국어, 중국어, 프랑스어, 독일어, 아랍어, 스페인어를 지원합니다.
+<img src="../assets/totalability.png" width=200%>
 
-3. **음소 의존성 없음:** 이 모델은 강력한 일반화 능력을 가지고 있으며 TTS에 음소에 의존하지 않습니다. 어떤 언어 스크립트의 텍스트도 처리할 수 있습니다.
+### 자연어 제어
 
-4. **높은 정확도:** Seed-TTS Eval에서 약 0.4%의 낮은 문자 오류율(CER)과 약 0.8%의 단어 오류율(WER)을 달성합니다.
+Fish Audio S2를 사용하면 사용자가 자연어를 사용하여 각 문장의 퍼포먼스, 부언어 정보, 감정 및 기타 음성 특성을 제어할 수 있습니다. 짧은 태그를 사용하여 모델의 퍼포먼스를 모호하게 제어하는 것뿐만 아니라 생성된 콘텐츠 전체의 품질을 크게 향상시킵니다.
 
-5. **빠른 속도:** torch compile 가속을 통해 Nvidia RTX 4090 실시간 계수 약 1:7.
+### 다국어 지원
 
-6. **WebUI 추론:** Chrome, Firefox, Edge 및 기타 브라우저와 호환되는 사용하기 쉬운 Gradio 기반 웹 UI를 제공합니다.
+Fish Audio S2는 음소나 특정 언어의 전처리 없이도 고품질의 다국어 텍스트 음성 변환을 지원합니다. 다음을 포함합니다:
 
-7. **GUI 추론:** API 서버와 원활하게 작동하는 PyQt6 그래픽 인터페이스를 제공합니다. Linux, Windows, macOS를 지원합니다. [GUI 보기](https://github.com/AnyaCoder/fish-speech-gui).
+**영어, 중국어, 일본어, 한국어, 아랍어, 독일어, 프랑스어...**
 
-8. **배포 친화적:** Linux, Windows (MacOS 곧 출시 예정)의 네이티브 지원으로 추론 서버를 쉽게 설정하여 속도 손실을 최소화합니다.
+**그리고 더욱 추가될 예정입니다!**
 
-## **미디어 및 데모**
+목록은 지속적으로 확대되고 있으며, 최신 릴리스는 [Fish Audio](https://fish.audio/)를 확인하십시오.
 
-<!-- <div align="center"> -->
+### 네이티브 다중 화자 생성
 
-<h3><strong>소셜 미디어</strong></h3>
-<a href="https://x.com/FishAudio/status/1929915992299450398" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-최신_데모-black?style=for-the-badge&logo=x&logoColor=white" alt="Latest Demo on X" />
-</a>
+<img src="../assets/chattemplate.png" width=200%>
 
-<h3><strong>인터랙티브 데모</strong></h3>
+Fish Audio S2를 사용하면 사용자가 여러 화자가 포함된 참조 오디오를 업로드할 수 있으며, 모델은 `<|speaker:i|>` 토큰을 통해 각 화자의 특성을 처리합니다. 이후 화자 ID 토큰을 통해 모델의 퍼포먼스를 제어하여 한 번의 생성으로 여러 화자를 구현할 수 있습니다. 화자마다 개별적으로 참조 오디오를 업로드하고 음성을 생성할 필요가 더 이상 없습니다.
 
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish_Audio-OpenAudio_S1_체험-blue?style=for-the-badge" alt="Try OpenAudio S1" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-S1_Mini_체험-yellow?style=for-the-badge" alt="Try S1 Mini" />
-</a>
+### 다중 턴 대화 생성
 
-<h3><strong>동영상 쇼케이스</strong></h3>
-<div align="center">
-<iframe width="560" height="315" src="https://www.youtube.com/embed/SYuPvd7m06A" title="OpenAudio S1 Video" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
-</div>
+모델 컨텍스트의 확장 덕분에, 이전 컨텍스트의 정보를 사용하여 이후에 생성되는 콘텐츠의 표현력을 개선하고 콘텐츠의 자연스러움을 높일 수 있게 되었습니다.
 
-## **문서**
+### 빠른 음성 클로닝
 
-### 빠른 시작
-- [환경 구축](install.md) - 개발 환경 설정
-- [추론 가이드](inference.md) - 모델 실행 및 음성 생성
+Fish Audio S2는 짧은 참조 샘플(보통 10~30초)을 사용한 정확한 음성 클로닝을 지원합니다. 모델은 음색, 말하기 스타일 및 감정적 경향을 포착할 수 있으며, 추가 미세 조정 없이도 사실적이고 일관된 클로닝 음성을 생성할 수 있습니다.
 
-## **커뮤니티 및 지원**
+---
 
-- **Discord:** [Discord 커뮤니티](https://discord.gg/Es5qTB9BcN)에 참여하세요
-- **웹사이트:** 최신 업데이트는 [OpenAudio.com](https://openaudio.com)을 방문하세요
-- **온라인 체험:** [Fish Audio Playground](https://fish.audio)
+## 감사의 인사
+
+- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
+- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
+- [GPT VITS](https://github.com/innnky/gpt-vits)
+- [MQTTS](https://github.com/b04901014/MQTTS)
+- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
+- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+
+## 기술 보고서
+
+```bibtex
+@misc{fish-speech-v1.4,
+      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},
+      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},
+      year={2024},
+      eprint={2411.01156},
+      archivePrefix={arXiv},
+      primaryClass={cs.SD},
+      url={https://arxiv.org/abs/2411.01156},
+}
+```

+ 16 - 129
docs/ko/inference.md

@@ -1,171 +1,58 @@
 # 추론
 
-보코더 모델이 변경되어 이전보다 더 많은 VRAM이 필요하며, 원활한 추론을 위해 12GB를 권장합니다.
-
-추론을 위해 명령줄, HTTP API, WebUI를 지원하며, 원하는 방법을 선택할 수 있습니다.
+Fish Audio S2 모델은 큰 비디오 메모리(VRAM)가 필요합니다. 추론을 위해 최소 24GB 이상의 GPU를 사용하는 것을 권장합니다.
 
 ## 가중치 다운로드
 
 먼저 모델 가중치를 다운로드해야 합니다:
 
 ```bash
-hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
+hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ```
 
 ## 명령줄 추론
 
 !!! note
-    모델이 임의로 음색을 선택하도록 하려면 이 단계를 건너뛸 수 있습니다.
+    모델이 음색을 무작위로 선택하게 하려면 이 단계를 건너뛸 수 있습니다.
 
-### 1. 참조 오디오에서 VQ 토큰 
+### 1. 참조 오디오에서 VQ 토큰 가져오기
 
 ```bash
 python fish_speech/models/dac/inference.py \
-    -i "ref_audio_name.wav" \
-    --checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth"
+    -i "test.wav" \
+    --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
 
-`fake.npy`와 `fake.wav`를 얻을 수 있습니다.
+`fake.npy`와 `fake.wav` 파일이 생성됩니다.
 
-### 2. 텍스트에서 의미 토큰 생성:
+### 2. 텍스트에서 시맨틱 토큰 생성:
 
 ```bash
 python fish_speech/models/text2semantic/inference.py \
-    --text "변환하고 싶은 텍스트" \
+    --text "변환하려는 텍스트" \
     --prompt-text "참조 텍스트" \
     --prompt-tokens "fake.npy" \
-    --compile
+    # --compile
 ```
 
 이 명령은 작업 디렉토리에 `codes_N` 파일을 생성합니다. 여기서 N은 0부터 시작하는 정수입니다.
 
 !!! note
-    더 빠른 추론을 위해 `--compile`을 사용하여 CUDA 커널을 융합할 수 있습니다(약 15 토큰/초 -> 약 150 토큰/초, RTX 4090 GPU).
-    이에 따라 가속을 사용하지 않으려면 `--compile` 매개변수를 주석 처리할 수 있습니다.
+    더 빠른 추론을 위해 CUDA 커널을 병합하는 `--compile`을 사용하고 싶을 수 있지만, 당사의 sglang 추론 가속 최적화를 사용하는 것을 더 권장합니다.
+    마찬가지로 가속을 사용할 계획이 없다면 `--compile` 매개변수를 주석 처리할 수 있습니다.
 
 !!! info
     bf16을 지원하지 않는 GPU의 경우 `--half` 매개변수를 사용해야 할 수 있습니다.
 
-### 3. 의미 토큰에서 음성 생성:
-
-!!! warning "향후 경고"
-    원래 경로(tools/vqgan/inference.py)에서 액세스 가능한 인터페이스를 유지하고 있지만, 이 인터페이스는 향후 릴리스에서 제거될 수 있으므로 가능한 한 빨리 코드를 변경해 주세요.
+### 3. 시맨틱 토큰에서 음성 생성:
 
 ```bash
 python fish_speech/models/dac/inference.py \
-    -i "codes_0.npy"
+    -i "codes_0.npy"
 ```
 
-## HTTP API 추론
-
-추론을 위한 HTTP API를 제공합니다. 다음 명령으로 서버를 시작할 수 있습니다:
-
-```bash
-python -m tools.api_server \
-    --listen 0.0.0.0:8080 \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-> 추론을 가속화하려면 `--compile` 매개변수를 추가할 수 있습니다.
-
-그 후 http://127.0.0.1:8080/ 에서 API를 보고 테스트할 수 있습니다.
-
-## GUI 추론 
-[클라이언트 다운로드](https://github.com/AnyaCoder/fish-speech-gui/releases)
+이후 `fake.wav` 파일을 얻게 됩니다.
 
 ## WebUI 추론
 
-다음 명령으로 WebUI를 시작할 수 있습니다:
-
-```bash
-python -m tools.run_webui \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-또는 간단히
-
-```bash
-python -m tools.run_webui
-```
-> 추론을 가속화하려면 `--compile` 매개변수를 추가할 수 있습니다.
-
-!!! note
-    라벨 파일과 참조 오디오 파일을 메인 디렉토리의 `references` 폴더에 미리 저장할 수 있습니다(직접 생성해야 함). 이렇게 하면 WebUI에서 직접 호출할 수 있습니다.
-
-!!! note
-    `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, `GRADIO_SERVER_NAME`과 같은 Gradio 환경 변수를 사용하여 WebUI를 구성할 수 있습니다.
-
-## Docker 추론
-
-OpenAudio는 WebUI 및 API 서버 추론을 위한 Docker 컨테이너를 제공합니다. `docker run` 명령을 직접 사용하여 컨테이너를 시작할 수 있습니다.
-
-다음 사항을 준비해야 합니다:
-- Docker 및 NVIDIA Docker 런타임 설치 (GPU 지원용)
-- 모델 가중치 다운로드 ([가중치 다운로드](#가중치-다운로드) 섹션 참조)
-- 참조 오디오 파일 (선택 사항, 음성 복제용)
-
-```bash
-# 모델 가중치 및 참조 오디오용 디렉토리 생성
-mkdir -p checkpoints references
-
-# 모델 가중치 다운로드 (아직 다운로드하지 않은 경우)
-# hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
-
-# CUDA 지원으로 WebUI 시작 (권장, 최상의 성능)
-docker run -d \
-    --name fish-speech-webui \
-    --gpus all \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-webui-cuda
-
-# CPU 전용 추론 (느리지만 GPU 없이 작동)
-docker run -d \
-    --name fish-speech-webui-cpu \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-webui-cpu
-```
-
-```bash
-# CUDA 지원으로 API 서버 시작
-docker run -d \
-    --name fish-speech-server \
-    --gpus all \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-server-cuda
-
-# CPU 전용 추론
-docker run -d \
-    --name fish-speech-server-cpu \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-server-cpu
-```
-
-다음 환경 변수를 사용하여 Docker 컨테이너를 사용자 정의할 수 있습니다:
-
-- `COMPILE=1` - `torch.compile`을 활성화하여 추론 속도 향상 (약 10배, CUDA 전용)
-- `GRADIO_SERVER_NAME=0.0.0.0` - WebUI 서버 호스트 (기본값: 0.0.0.0)
-- `GRADIO_SERVER_PORT=7860` - WebUI 서버 포트 (기본값: 7860)
-- `API_SERVER_NAME=0.0.0.0` - API 서버 호스트 (기본값: 0.0.0.0)
-- `API_SERVER_PORT=8080` - API 서버 포트 (기본값: 8080)
-- `LLAMA_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini` - 모델 가중치 경로
-- `DECODER_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini/codec.pth` - 디코더 가중치 경로
-- `DECODER_CONFIG_NAME=modded_dac_vq` - 디코더 구성 이름
-```
-
-WebUI 및 API 서버의 사용법은 위 가이드와 동일합니다.
-
-즐기세요!
+준비 중입니다.

+ 8 - 4
docs/ko/install.md

@@ -1,11 +1,11 @@
 ## 요구 사양
 
-- GPU 메모리: 12GB (추론 시)
+- GPU 메모리: 24GB (추론 시)
 - 시스템: Linux, WSL
 
 ## 시스템 설정
 
-OpenAudio는 다양한 설치 방법을 지원합니다. 자신의 개발 환경에 가장 적합한 방법을 선택하세요.
+Fish Audio S2는 다양한 설치 방법을 지원합니다. 자신의 개발 환경에 가장 적합한 방법을 선택하세요.
 
 **사전 요구사항**: 오디오 처리를 위한 시스템 의존성을 설치합니다:
 ``` bash
@@ -26,6 +26,10 @@ pip install -e .[cpu]
 
 # 기본 설치 (PyTorch 기본 인덱스 사용)
 pip install -e .
+
+# pyaudio 설치 중 오류가 발생하면 다음 명령을 사용해 보세요:
+# conda install pyaudio
+# 그런 다음 pip install -e . 를 다시 실행하세요
 ```
 
 ### UV
@@ -63,7 +67,7 @@ pip install -e .
 
 ## Docker 설정
 
-OpenAudio S1 시리즈 모델은 다양한 요구에 부응하기 위해 여러 Docker 배포 옵션을 제공합니다. Docker Hub의 사전 빌드된 이미지를 사용하거나, Docker Compose로 로컬에서 빌드하거나, 수동으로 사용자 정의 이미지를 빌드할 수 있습니다.
+Fish Audio S2 시리즈 모델은 다양한 요구에 부응하기 위해 여러 Docker 배포 옵션을 제공합니다. Docker Hub의 사전 빌드된 이미지를 사용하거나, Docker Compose로 로컬에서 빌드하거나, 수동으로 사용자 정의 이미지를 빌드할 수 있습니다.
 
 WebUI와 API 서버 모두에 대해 GPU(기본값 CUDA 12.6) 및 CPU 버전의 Docker 이미지를 제공합니다. Docker Hub의 사전 빌드된 이미지를 사용하거나, Docker Compose로 로컬에서 빌드하거나, 수동으로 사용자 정의 이미지를 빌드할 수 있습니다. 로컬에서 빌드하려면 아래 지침을 따르세요. 사전 빌드된 이미지를 사용하려면 [추론 가이드](inference.md)를 직접 참조하세요.
 
@@ -71,7 +75,7 @@ WebUI와 API 서버 모두에 대해 GPU(기본값 CUDA 12.6) 및 CPU 버전의
 
 - Docker 및 Docker Compose 설치
 - NVIDIA Docker 런타임 설치 (GPU 지원용)
-- CUDA 추론을 위한 최소 12GB의 GPU 메모리
+- CUDA 추론을 위한 최소 24GB의 GPU 메모리
 
 ### Docker Compose 사용
 

+ 0 - 80
docs/ko/samples.md

@@ -1,80 +0,0 @@
-# 예제
-
-## 감정 제어 (*새 기능)
-
-### 기본 감정 예제
-
-| 감정 유형 | 언어 | 입력 오디오 | 합성 오디오 | 프롬프트 |
-|-----------|------|-------------|-------------|----------|
-| **기쁨** | 중국어 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy_refer.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy.wav" /> | (happy)嘿嘿...博士,悄悄告诉你一件事——我重新开始练小提琴了。 |
-| **혐오** | 일본어 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/disgusted.wav" /> | (digusted)あなたは、本当に気持ち悪い、嫌い…(disgusted)それでも、慰めを求めますの? |
-| **분노** | 영어 | - | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/angry.wav" /> | (angry)I want you to go out immediately! I don't want to see you again, or I will try to kill you! |
-| **분노** | 중국어 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/作战中4.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/angry.wav" /> | (angry)我让你快滚,你是耳聋吗?!...(angry)信不信我揍你! |
-| **놀람** | 중국어 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/surprised.wav" /> | (surprised)今天你过生日?既然这样的话,我就勉为其难祝你生日快乐吧。(surprised)要不要看看你的桌子底下? |
-| **슬픔** | 일본어 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref2.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/sad.wav" /> | (sad)他の小隊長と比べて、私はまだ多くのことを学ばなくてはなりません......(sad)皆さんのペースに追いつけるよう精一杯努力いたしますわ。 |
-
-## 부언어 효과 (*새 기능)
-
-### 웃음소리 효과
-
-| 예제 | 언어 | 프롬프트 | 오디오 |
-|------|------|---------|--------|
-| **예제 1** | 중국어 | 大家好啊,(笑声)哈哈,我是从来不带节奏的血狼破军,今天来点大家想看的东西。 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh1.wav" /> |
-| **예제 2** | 중국어 | (笑声)哈哈(笑声),虽然说"三角洲行动"的策划说他们没有暗改(笑声)哈哈(笑声),但是我相信,大家心里都有数。对不起,实在是太搞笑了,忍不住笑了出来。(笑声)哈哈(笑声) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh2.wav" /> |
-| **예제 3** | 영어 | (laughing)haha(laughing), though many people say that homeless cats need our help, (laughing)haha(laughing), but seldom do they really do something that is useful to the cats, (laughing)haha(laughing) sorry, but this is very interesting. | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/laugh.wav" /> |
-
-### 전투 함성 효과
-
-| 예제 | 언어 | 프롬프트 | 오디오 |
-|------|------|---------|--------|
-| **전투 함성 예제** | 영어 | (shouting)oh my god !!!(shouting)(shouting)(shouting), baby(shouting)you (shouting)are (shouting)a piece of sweet, soft(shouting), delicious cake!!! | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/shout.wav" /> |
-
-## 장문 안정성 테스트
-
-### 중국어 장문 테스트
-
-**중국어 테스트 텍스트:**
-```
-你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
-我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
-你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
-搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
-一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
-```
-
-| 테스트 내용 | 화자/캐릭터 | 입력 오디오 | 합성 오디오 |
-|-------------|-------------|-------------|-------------|
-| **장문 테스트** | 시 (아크나이츠) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio.wav" /> |
-| **랜덤 화자** | 랜덤 (볼륨 주의) | 없음 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio2.wav" /> |
-
-### 영어 장문 테스트
-
-**영어 테스트 텍스트:**
-```
-In the realm of advanced technology, the evolution of artificial intelligence stands as a 
-monumental achievement. This dynamic field, constantly pushing the boundaries of what 
-machines can do, has seen rapid growth and innovation. From deciphering complex data 
-patterns to driving cars autonomously, AI's applications are vast and diverse.
-```
-
-| 테스트 내용 | 화자 | 입력 오디오 | 합성 오디오 |
-|-------------|------|-------------|-------------|
-| **랜덤 화자 1** | 랜덤 | 없음 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio.wav" /> |
-| **랜덤 화자 2** | 랜덤 | 없음 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio2.wav" /> |
-
-### 일본어 장문 테스트
-
-**일본어 테스트 텍스트:**
-```
-宇宙に始まりはあるが、終わりはない。無限。
-星にもまた始まりはあるが、自らの力をもって滅び逝く。有限。
-英知を持つ者こそ、最も愚かであること。歴史からも読み取れる。
-海に生ける魚は、陸の世界を知らない。彼らが英知を持てば、それもまた滅び逝く。
-人間が光の速さを超えるのは、魚たちが陸で生活を始めるよりも滑稽。
-これは抗える者たちに対する、神からの最後通告とも言えよう。
-```
-
-| 테스트 내용 | 화자/캐릭터 | 입력 오디오 | 합성 오디오 |
-|-------------|-------------|-------------|-------------|
-| **장문 테스트** | 도요가와 사키코 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio.wav" /> |
-| **랜덤 화자** | 랜덤 | 없음 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio2.wav" /> |

+ 89 - 120
docs/pt/index.md

@@ -1,164 +1,133 @@
-# OpenAudio (anteriormente Fish-Speech)
-
-<div align="center">
-
 <div align="center">
+<h1>Fish Speech</h1>
 
-<img src="../assets/openaudio.jpg" alt="OpenAudio" style="display: block; margin: 0 auto; width: 35%;"/>
-
-</div>
-
-<strong>Série Avançada de Modelos Text-to-Speech</strong>
+[English](../en/) | [简体中文](../zh/) | **Portuguese** | [日本語](../ja/) | [한국어](../ko/) | [العربية](../ar/) <br>
 
-<div>
-<a target="_blank" href="https://discord.gg/Es5qTB9BcN">
-<img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
-</a>
-<a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
-<img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
-</a>
-<a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
-<img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+<a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
+<a href="https://trendshift.io/repositories/7014" target="_blank">
+    <img src="https://trendshift.io/api/badge/repositories/7014" alt="fishaudio%2Ffish-speech | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/>
 </a>
 </div>
 
-<strong>Experimente agora:</strong> <a href="https://fish.audio">Fish Audio Playground</a> | <strong>Saiba mais:</strong> <a href="https://openaudio.com">Site OpenAudio</a>
+<br>
 
+<div align="center">
+    <img src="https://count.getloli.com/get/@fish-speech?theme=asoul" /><br>
 </div>
 
----
-
-!!! note "Aviso de Licença"
-    Esta base de código é lançada sob a **Licença Apache** e todos os pesos dos modelos são lançados sob a **Licença CC-BY-NC-SA-4.0**. Consulte a [LICENÇA DO CÓDIGO](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) e a [LICENÇA DO MODELO](https://spdx.org/licenses/CC-BY-NC-SA-4.0) para mais detalhes.
-
-!!! warning "Aviso Legal"
-    Não assumimos nenhuma responsabilidade pelo uso ilegal da base de código. Consulte as leis locais sobre DMCA e outras leis relevantes.
-
-## **Introdução**
-
-Estamos empolgados em anunciar que mudamos nossa marca para **OpenAudio** - introduzindo uma nova série de modelos avançados de Text-to-Speech que se baseia na fundação do Fish-Speech com melhorias significativas e novas capacidades.
-
-**OpenAudio-S1-mini**: [Blog](https://openaudio.com/blogs/s1); [Vídeo](https://www.youtube.com/watch?v=SYuPvd7m06A); [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini);
-
-**Fish-Speech v1.5**: [Vídeo](https://www.bilibili.com/video/BV1EKiDYBE4o/); [Hugging Face](https://huggingface.co/fishaudio/fish-speech-1.5);
-
-## **Destaques**
-
-### **Qualidade TTS Excelente**
-
-Utilizamos as métricas Seed TTS Eval para avaliar o desempenho do modelo, e os resultados mostram que o OpenAudio S1 alcança **0.008 WER** e **0.004 CER** em texto inglês, que é significativamente melhor que modelos anteriores. (Inglês, avaliação automática, baseada na transcrição OpenAI gpt-4o, distância do falante usando Revai/pyannote-wespeaker-voxceleb-resnet34-LM)
-
-| Modelo | Taxa de Erro de Palavras (WER) | Taxa de Erro de Caracteres (CER) | Distância do Falante |
-|:-----:|:--------------------:|:-------------------------:|:----------------:|
-| **S1** | **0.008** | **0.004** | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
-
-### **Melhor Modelo no TTS-Arena2**
-
-OpenAudio S1 alcançou a **classificação #1** no [TTS-Arena2](https://arena.speechcolab.org/), o benchmark para avaliação de text-to-speech:
+<br>
 
 <div align="center">
-    <img src="assets/Elo.jpg" alt="TTS-Arena2 Ranking" style="width: 75%;" />
+    <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
+        <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
+    </a>
+    <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
+        <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+    </a>
+    <a target="_blank" href="https://pd.qq.com/s/bwxia254o">
+      <img alt="QQ Channel" src="https://img.shields.io/badge/QQ-blue?logo=tencentqq">
+    </a>
 </div>
 
-### **Controle de Fala**
-OpenAudio S1 **suporta uma variedade de marcadores emocionais, de tom e especiais** para aprimorar a síntese de fala:
-
-- **Emoções básicas**:
-```
-(raivoso) (triste) (animado) (surpreso) (satisfeito) (encantado) 
-(com medo) (preocupado) (chateado) (nervoso) (frustrado) (deprimido)
-(empático) (envergonhado) (nojento) (comovido) (orgulhoso) (relaxado)
-(grato) (confiante) (interessado) (curioso) (confuso) (alegre)
-```
+<div align="center">
+    <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
+      <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
+    </a>
+    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
+        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
+    </a>
+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
+        <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
+    </a>
+</div>
 
-- **Emoções avançadas**:
-```
-(desdenhoso) (infeliz) (ansioso) (histérico) (indiferente) 
-(impaciente) (culpado) (desprezível) (em pânico) (furioso) (relutante)
-(entusiasmado) (desaprovador) (negativo) (negando) (espantado) (sério)
-(sarcástico) (conciliador) (consolador) (sincero) (zombeteiro)
-(hesitante) (cedendo) (doloroso) (constrangido) (divertido)
-```
+!!! info "Aviso de Licença"
+    Este repositório e todos os pesos de modelo associados são lançados sob a **FISH AUDIO RESEARCH LICENSE**. Consulte [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) para mais detalhes.
 
-(Suporte para inglês, chinês e japonês agora, e mais idiomas em breve!)
+!!! warning "Isenção de Responsabilidade Legal"
+    Não nos responsabilizamos por qualquer uso ilegal da base de código. Consulte as regulamentações locais sobre DMCA e outras leis relacionadas.
 
-- **Marcadores de tom**:
-```
-(em tom de pressa) (gritando) (berrando) (sussurrando) (tom suave)
-```
+## Começar
 
-- **Efeitos sonoros especiais**:
-```
-(rindo) (gargalhando) (soluçando) (chorando alto) (suspirando) (ofegante)
-(gemendo) (risada da multidão) (risada de fundo) (risada da plateia)
-```
+Esta é a documentação oficial do Fish Speech. Siga as instruções para começar facilmente.
 
-Você também pode usar Ha,ha,ha para controlar, há muitos outros casos esperando para serem explorados por você mesmo.
+- [Instalação](install.md)
+- [Inferência](inference.md)
 
-### **Dois Tipos de Modelos**
+## Fish Audio S2
+**O melhor sistema de texto para fala em código aberto e código fechado**
 
-Oferecemos duas variantes de modelo para atender diferentes necessidades:
+O Fish Audio S2 é o modelo mais recente desenvolvido pela [Fish Audio](https://fish.audio/), projetado para gerar fala que soe natural, autêntica e emocionalmente rica — não mecânica, monótona ou confinada à leitura em estúdio.
 
-- **OpenAudio S1 (4B parâmetros)**: Nosso modelo principal com todas as funcionalidades disponível em [fish.audio](https://fish.audio), oferecendo a mais alta qualidade de síntese de fala com todas as características avançadas.
+O Fish Audio S2 foca em conversas cotidianas, suportando geração nativa de múltiplos locutores e múltiplos turnos. Também suporta controle por instruções.
 
-- **OpenAudio S1-mini (0.5B parâmetros)**: Uma versão destilada com capacidades principais, disponível no [Hugging Face Space](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini), otimizada para inferência mais rápida mantendo excelente qualidade.
+A série S2 inclui vários modelos. O modelo de código aberto é o S2-Pro, que é o modelo mais poderoso da série.
 
-Tanto o S1 quanto o S1-mini incorporam Aprendizado por Reforço Online com Feedback Humano (RLHF).
+Visite o [site da Fish Audio](https://fish.audio/) para uma experiência em tempo real.
 
-## **Características**
+### Variantes do Modelo
 
-1. **TTS Zero-shot e Few-shot:** Insira uma amostra vocal de 10 a 30 segundos para gerar saída TTS de alta qualidade. **Para diretrizes detalhadas, veja [Melhores Práticas de Clonagem de Voz](https://docs.fish.audio/text-to-speech/voice-clone-best-practices).**
+| Modelo | Tamanho | Disponibilidade | Descrição |
+|------|------|-------------|-------------|
+| S2-Pro | 4B Parâmetros | [huggingface](https://huggingface.co/fishaudio/s2-pro) | Modelo emblemático completo com a mais alta qualidade e estabilidade |
+| S2-Flash | - - - - | [fish.audio](https://fish.audio/) | Nosso modelo de código fechado com maior velocidade e menor latência |
 
-2. **Suporte Multilíngue e Cross-lingual:** Simplesmente copie e cole texto multilíngue na caixa de entrada—não precisa se preocupar com o idioma. Atualmente suporta inglês, japonês, coreano, chinês, francês, alemão, árabe e espanhol.
+Para mais detalhes sobre os modelos, consulte o relatório técnico.
 
-3. **Sem Dependência de Fonemas:** O modelo tem fortes capacidades de generalização e não depende de fonemas para TTS. Pode lidar com texto em qualquer script de idioma.
+## Destaques
 
-4. **Altamente Preciso:** Alcança uma baixa Taxa de Erro de Caracteres (CER) de cerca de 0,4% e Taxa de Erro de Palavras (WER) de cerca de 0,8% para Seed-TTS Eval.
+<img src="../assets/totalability.png" width=200%>
 
-5. **Rápido:** Com aceleração fish-tech, o fator de tempo real é aproximadamente 1:5 em um laptop Nvidia RTX 4060 e 1:15 em um Nvidia RTX 4090.
+### Controle por Linguagem Natural
 
-6. **Inferência WebUI:** Apresenta uma interface web fácil de usar baseada em Gradio, compatível com Chrome, Firefox, Edge e outros navegadores.
+O Fish Audio S2 permite que os usuários usem linguagem natural para controlar o desempenho, informações paralinguísticas, emoções e outras características de voz de cada frase, em vez de usar apenas tags curtas para controlar vagamente o desempenho do modelo. Isso aumenta muito a qualidade geral do conteúdo gerado.
 
-7. **Inferência GUI:** Oferece uma interface gráfica PyQt6 que funciona perfeitamente com o servidor API. Suporta Linux, Windows e macOS. [Ver GUI](https://github.com/AnyaCoder/fish-speech-gui).
+### Suporte Multilíngue
 
-8. **Amigável para Deploy:** Configure facilmente um servidor de inferência com suporte nativo para Linux, Windows (MacOS em breve), minimizando a perda de velocidade.
+O Fish Audio S2 suporta conversão de texto em fala multilíngue de alta qualidade sem a necessidade de fonemas ou pré-processamento específico por idioma. Incluindo:
 
-## **Mídia e Demos**
+**Inglês, Chinês, Japonês, Coreano, Árabe, Alemão, Francês...**
 
-<!-- <div align="center"> -->
+**E muito mais!**
 
-<h3><strong>Mídia Social</strong></h3>
-<a href="https://x.com/FishAudio/status/1929915992299450398" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-Demo_Mais_Recente-black?style=for-the-badge&logo=x&logoColor=white" alt="Latest Demo on X" />
-</a>
+A lista está em constante expansão. Verifique a [Fish Audio](https://fish.audio/) para os lançamentos mais recentes.
 
-<h3><strong>Demos Interativos</strong></h3>
+### Geração Nativa de Múltiplos Locutores
 
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish_Audio-Experimente_OpenAudio_S1-blue?style=for-the-badge" alt="Try OpenAudio S1" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-Experimente_S1_Mini-yellow?style=for-the-badge" alt="Try S1 Mini" />
-</a>
+<img src="../assets/chattemplate.png" width=200%>
 
-<h3><strong>Showcases em Vídeo</strong></h3>
-<div align="center">
-<iframe width="560" height="315" src="https://www.youtube.com/embed/SYuPvd7m06A" title="OpenAudio S1 Video" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
-</div>
+O Fish Audio S2 permite que os usuários carreguem áudio de referência contendo múltiplos locutores, e o modelo processará as características de cada locutor por meio do token `<|speaker:i|>`. Você pode então controlar o desempenho do modelo por meio de tokens de ID de locutor, alcançando múltiplos locutores em uma única geração. Não há mais necessidade de carregar áudio de referência e gerar fala para cada locutor individualmente.
 
-## **Documentação**
+### Geração de Diálogos em Múltiplos Turnos
 
-### Início Rápido
-- [Configurar Ambiente](install.md) - Configure seu ambiente de desenvolvimento
-- [Guia de Inferência](inference.md) - Execute o modelo e gere fala
+Graças à expansão do contexto do modelo, nosso modelo agora pode usar as informações das partes anteriores do diálogo para melhorar a expressividade do conteúdo gerado subsequentemente, aumentando assim a naturalidade do conteúdo.
 
-## **Comunidade e Suporte**
+### Clonagem de Voz Rápida
 
-- **Discord:** Junte-se à nossa [comunidade Discord](https://discord.gg/Es5qTB9BcN)
-- **Site:** Visite [OpenAudio.com](https://openaudio.com) para as últimas atualizações
-- **Experimente Online:** [Fish Audio Playground](https://fish.audio)
+O Fish Audio S2 suporta clonagem de voz precisa usando amostras de referência curtas (geralmente de 10 a 30 segundos). O modelo pode capturar timbre, estilo de fala e tendência emocional, gerando vozes clonadas realistas e consistentes sem ajuste fino adicional.
 
-## Modelos
+---
 
-O OpenAudio S1 é o primeiro modelo da série OpenAudio. É um vocoder VQ-GAN de descodificador duplo que pode reconstruir áudio a partir de códigos VQ.
+## Agradecimentos
+
+- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
+- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
+- [GPT VITS](https://github.com/innnky/gpt-vits)
+- [MQTTS](https://github.com/b04901014/MQTTS)
+- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
+- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+
+## Relatório Técnico
+
+```bibtex
+@misc{fish-speech-v1.4,
+      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},
+      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},
+      year={2024},
+      eprint={2411.01156},
+      archivePrefix={arXiv},
+      primaryClass={cs.SD},
+      url={https://arxiv.org/abs/2411.01156},
+}
+```

+ 14 - 127
docs/pt/inference.md

@@ -1,15 +1,13 @@
 # Inferência
 
-Como o modelo vocoder foi alterado, você precisa de mais VRAM do que antes, sendo recomendado 12GB para inferência fluente.
-
-Suportamos linha de comando, API HTTP e WebUI para inferência, você pode escolher qualquer método que preferir.
+O modelo Fish Audio S2 requer uma grande quantidade de VRAM. Recomendamos o uso de uma GPU com pelo menos 24GB para inferência.
 
 ## Baixar Pesos
 
-Primeiro você precisa baixar os pesos do modelo:
+Primeiro, você precisa baixar os pesos do modelo:
 
 ```bash
-hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
+hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ```
 
 ## Inferência por Linha de Comando
@@ -21,151 +19,40 @@ hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-min
 
 ```bash
 python fish_speech/models/dac/inference.py \
-    -i "ref_audio_name.wav" \
-    --checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth"
+    -i "test.wav" \
+    --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
 
 Você deve obter um `fake.npy` e um `fake.wav`.
 
-### 2. Gerar tokens semânticos do texto:
+### 2. Gerar tokens semânticos a partir do texto:
 
 ```bash
 python fish_speech/models/text2semantic/inference.py \
-    --text "O texto que você quer converter" \
+    --text "O texto que você deseja converter" \
     --prompt-text "Seu texto de referência" \
     --prompt-tokens "fake.npy" \
-    --compile
+    # --compile
 ```
 
-Este comando criará um arquivo `codes_N` no diretório de trabalho, onde N é um inteiro começando de 0.
+Este comando criará um arquivo `codes_N` no diretório de trabalho, onde N é um número inteiro começando em 0.
 
 !!! note
-    Você pode querer usar `--compile` para fundir kernels CUDA para inferência mais rápida (~30 tokens/segundo -> ~500 tokens/segundo).
-    Correspondentemente, se você não planeja usar aceleração, pode comentar o parâmetro `--compile`.
+    Você pode querer usar `--compile` para fundir kernels CUDA para uma inferência mais rápida. No entanto, recomendamos usar nossa otimização de aceleração de inferência sglang.
+    Da mesma forma, se você não planeja usar aceleração, pode comentar o parâmetro `--compile`.
 
 !!! info
     Para GPUs que não suportam bf16, você pode precisar usar o parâmetro `--half`.
 
 ### 3. Gerar vocais a partir de tokens semânticos:
 
-!!! warning "Aviso Futuro"
-    Mantivemos a interface acessível do caminho original (tools/vqgan/inference.py), mas esta interface pode ser removida em versões subsequentes, então por favor altere seu código o mais breve possível.
-
 ```bash
 python fish_speech/models/dac/inference.py \
-    -i "codes_0.npy"
+    -i "codes_0.npy"
 ```
 
-## Inferência com API HTTP
-
-Fornecemos uma API HTTP para inferência. Você pode usar o seguinte comando para iniciar o servidor:
-
-```bash
-python -m tools.api_server \
-    --listen 0.0.0.0:8080 \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-> Se você quiser acelerar a inferência, pode adicionar o parâmetro `--compile`.
-
-Depois disso, você pode visualizar e testar a API em http://127.0.0.1:8080/.
-
-## Inferência GUI 
-[Baixar cliente](https://github.com/AnyaCoder/fish-speech-gui/releases)
+Depois disso, você obterá um arquivo `fake.wav`.
 
 ## Inferência WebUI
 
-Você pode iniciar o WebUI usando o seguinte comando:
-
-```bash
-python -m tools.run_webui \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-Ou simplesmente
-
-```bash
-python -m tools.run_webui
-```
-> Se você quiser acelerar a inferência, pode adicionar o parâmetro `--compile`.
-
-!!! note
-    Você pode salvar o arquivo de rótulo e o arquivo de áudio de referência antecipadamente na pasta `references` no diretório principal (que você precisa criar), para que possa chamá-los diretamente no WebUI.
-
-!!! note
-    Você pode usar variáveis de ambiente do Gradio, como `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, `GRADIO_SERVER_NAME` para configurar o WebUI.
-
-## Inferência com Docker
-
-O OpenAudio fornece contentores Docker para inferência tanto na WebUI como no servidor API. Pode usar diretamente o comando `docker run` para iniciar o contentor.
-
-É necessário preparar o seguinte:
-- Docker e NVIDIA Docker runtime instalados (para suporte de GPU)
-- Pesos do modelo descarregados (consulte a secção [Baixar Pesos](#baixar-pesos))
-- Ficheiros de áudio de referência (opcional, para clonagem de voz)
-
-```bash
-# Criar diretórios para os pesos do modelo e áudios de referência
-mkdir -p checkpoints references
-
-# Descarregar os pesos do modelo (se ainda não o fez)
-# hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
-
-# Iniciar a WebUI com suporte CUDA (recomendado para melhor desempenho)
-docker run -d \
-    --name fish-speech-webui \
-    --gpus all \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-webui-cuda
-
-# Inferência apenas com CPU (mais lento, mas funciona sem GPU)
-docker run -d \
-    --name fish-speech-webui-cpu \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-webui-cpu
-```
-
-```bash
-# Iniciar o servidor API com suporte CUDA
-docker run -d \
-    --name fish-speech-server \
-    --gpus all \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-server-cuda
-
-# Inferência apenas com CPU
-docker run -d \
-    --name fish-speech-server-cpu \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-server-cpu
-```
-
-Pode personalizar os contentores Docker usando estas variáveis de ambiente:
-
-- `COMPILE=1` - Ativa o `torch.compile` para uma inferência mais rápida (cerca de 10x, apenas com CUDA)
-- `GRADIO_SERVER_NAME=0.0.0.0` - Anfitrião do servidor WebUI (padrão: 0.0.0.0)
-- `GRADIO_SERVER_PORT=7860` - Porta do servidor WebUI (padrão: 7860)
-- `API_SERVER_NAME=0.0.0.0` - Anfitrião do servidor API (padrão: 0.0.0.0)
-- `API_SERVER_PORT=8080` - Porta do servidor API (padrão: 8080)
-- `LLAMA_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini` - Caminho para os pesos do modelo
-- `DECODER_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini/codec.pth` - Caminho para os pesos do descodificador
-- `DECODER_CONFIG_NAME=modded_dac_vq` - Nome da configuração do descodificador
-```
-
-O uso da WebUI e do servidor API é o mesmo que o descrito no guia acima.
-
-Divirta-se!
+Em breve.

+ 8 - 4
docs/pt/install.md

@@ -1,11 +1,11 @@
 ## Requisitos
 
-- Memória da GPU: 12GB (Inferência)
+- Memória da GPU: 24GB (Inferência)
 - Sistema: Linux, WSL
 
 ## Configuração do Sistema
 
-O OpenAudio suporta múltiplos métodos de instalação. Escolha o que melhor se adapta ao seu ambiente de desenvolvimento.
+O Fish Audio S2 suporta múltiplos métodos de instalação. Escolha o que melhor se adapta ao seu ambiente de desenvolvimento.
 
 **Pré-requisitos**: Instale as dependências de sistema para processamento de áudio:
 ``` bash
@@ -26,6 +26,10 @@ pip install -e .[cpu]
 
 # Instalação padrão (usa o índice padrão do PyTorch)
 pip install -e .
+
+# Se encontrar um erro durante a instalação devido ao pyaudio, considere usar o seguinte comando:
+# conda install pyaudio
+# De seguida, execute pip install -e . novamente
 ```
 
 ### UV
@@ -63,7 +67,7 @@ pip install -e .
 
 ## Configuração do Docker
 
-O modelo da série OpenAudio S1 oferece múltiplas opções de implementação com Docker para satisfazer diferentes necessidades. Pode usar imagens pré-construídas do Docker Hub, construir localmente com o Docker Compose, ou construir manualmente imagens personalizadas.
+O modelo da série Fish Audio S2 oferece múltiplas opções de implementação com Docker para satisfazer diferentes necessidades. Pode usar imagens pré-construídas do Docker Hub, construir localmente com o Docker Compose, ou construir manualmente imagens personalizadas.
 
 Fornecemos imagens Docker para a WebUI e o servidor API, tanto para GPU (CUDA 12.6 por defeito) como para CPU. Pode usar as imagens pré-construídas do Docker Hub, construir localmente com o Docker Compose, ou construir manualmente imagens personalizadas. Se quiser construir localmente, siga as instruções abaixo. Se apenas quiser usar as imagens pré-construídas, siga diretamente o [guia de inferência](inference.md).
 
@@ -71,7 +75,7 @@ Fornecemos imagens Docker para a WebUI e o servidor API, tanto para GPU (CUDA 12
 
 - Docker e Docker Compose instalados
 - NVIDIA Docker runtime instalado (para suporte de GPU)
-- Pelo menos 12GB de memória de GPU para inferência com CUDA
+- Pelo menos 24GB de memória de GPU para inferência com CUDA
 
 ### Usar o Docker Compose
 

+ 0 - 80
docs/pt/samples.md

@@ -1,80 +0,0 @@
-# Exemplos
-
-## Controle Emocional (*Nova Funcionalidade)
-
-### Exemplos Emocionais Básicos
-
-| Tipo de Emoção | Idioma | Áudio de Entrada | Áudio Sintetizado | Prompt |
-|----------------|--------|------------------|-------------------|--------|
-| **Feliz** | Chinês | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy_refer.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy.wav" /> | (happy)嘿嘿...博士,悄悄告诉你一件事——我重新开始练小提琴了。 |
-| **Nojo** | Japonês | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/disgusted.wav" /> | (digusted)あなたは、本当に気持ち悪い、嫌い…(disgusted)それでも、慰めを求めますの? |
-| **Raiva** | Inglês | - | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/angry.wav" /> | (angry)I want you to go out immediately! I don't want to see you again, or I will try to kill you! |
-| **Raiva** | Chinês | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/作战中4.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/angry.wav" /> | (angry)我让你快滚,你是耳聋吗?!...(angry)信不信我揍你! |
-| **Surpresa** | Chinês | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/surprised.wav" /> | (surprised)今天你过生日?既然这样的话,我就勉为其难祝你生日快乐吧。(surprised)要不要看看你的桌子底下? |
-| **Tristeza** | Japonês | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref2.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/sad.wav" /> | (sad)他の小隊長と比べて、私はまだ多くのことを学ばなくてはなりません......(sad)皆さんのペースに追いつけるよう精一杯努力いたしますわ。 |
-
-## Efeitos Paralinguísticos (*Nova Funcionalidade)
-
-### Efeitos de Risos
-
-| Exemplo | Idioma | Prompt | Áudio |
-|---------|--------|--------|-------|
-| **Exemplo 1** | Chinês | 大家好啊,(笑声)哈哈,我是从来不带节奏的血狼破军,今天来点大家想看的东西。 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh1.wav" /> |
-| **Exemplo 2** | Chinês | (笑声)哈哈(笑声),虽然说"三角洲行动"的策划说他们没有暗改(笑声)哈哈(笑声),但是我相信,大家心里都有数。对不起,实在是太搞笑了,忍不住笑了出来。(笑声)哈哈(笑声) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh2.wav" /> |
-| **Exemplo 3** | Inglês | (laughing)haha(laughing), though many people say that homeless cats need our help, (laughing)haha(laughing), but seldom do they really do something that is useful to the cats, (laughing)haha(laughing) sorry, but this is very interesting. | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/laugh.wav" /> |
-
-### Efeitos de Grito de Guerra
-
-| Exemplo | Idioma | Prompt | Áudio |
-|---------|--------|--------|-------|
-| **Exemplo de Grito de Guerra** | Inglês | (shouting)oh my god !!!(shouting)(shouting)(shouting), baby(shouting)you (shouting)are (shouting)a piece of sweet, soft(shouting), delicious cake!!! | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/shout.wav" /> |
-
-## Teste de Estabilidade de Texto Longo
-
-### Teste de Texto Longo Chinês
-
-**Texto de Teste Chinês:**
-```
-你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
-我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
-你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
-搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
-一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
-```
-
-| Conteúdo do Teste | Falante/Personagem | Áudio de Entrada | Áudio Sintetizado |
-|-------------------|-------------------|------------------|-------------------|
-| **Teste de Texto Longo** | Xi (Arknights) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio.wav" /> |
-| **Falante Aleatório** | Aleatório (Aviso de Volume) | Nenhum | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio2.wav" /> |
-
-### Teste de Texto Longo Inglês
-
-**Texto de Teste Inglês:**
-```
-In the realm of advanced technology, the evolution of artificial intelligence stands as a 
-monumental achievement. This dynamic field, constantly pushing the boundaries of what 
-machines can do, has seen rapid growth and innovation. From deciphering complex data 
-patterns to driving cars autonomously, AI's applications are vast and diverse.
-```
-
-| Conteúdo do Teste | Falante | Áudio de Entrada | Áudio Sintetizado |
-|-------------------|---------|------------------|-------------------|
-| **Falante Aleatório 1** | Aleatório | Nenhum | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio.wav" /> |
-| **Falante Aleatório 2** | Aleatório | Nenhum | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio2.wav" /> |
-
-### Teste de Texto Longo Japonês
-
-**Texto de Teste Japonês:**
-```
-宇宙に始まりはあるが、終わりはない。無限。
-星にもまた始まりはあるが、自らの力をもって滅び逝く。有限。
-英知を持つ者こそ、最も愚かであること。歴史からも読み取れる。
-海に生ける魚は、陸の世界を知らない。彼らが英知を持てば、それもまた滅び逝く。
-人間が光の速さを超えるのは、魚たちが陸で生活を始めるよりも滑稽。
-これは抗える者たちに対する、神からの最後通告とも言えよう。
-```
-
-| Conteúdo do Teste | Falante/Personagem | Áudio de Entrada | Áudio Sintetizado |
-|-------------------|-------------------|------------------|-------------------|
-| **Teste de Texto Longo** | Sakiko Toyogawa | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio.wav" /> |
-| **Falante Aleatório** | Aleatório | Nenhum | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio2.wav" /> |

+ 4 - 16
docs/zh/finetune.md

@@ -40,19 +40,13 @@
huggingface-cli download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ```
 
-对于中国大陆用户, 可使用 mirror 下载.
-
-```bash
-HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
-```
-
 随后可运行以下命令来提取语义 token:
 
 ```bash
 python tools/vqgan/extract_vq.py data \
     --num-workers 1 --batch-size 16 \
     --config-name "modded_dac_vq" \
-    --checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth"
+    --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
 
 !!! note
@@ -96,13 +90,7 @@ python tools/llama/build_dataset.py \
 同样的, 请确保你已经下载了 `LLAMA` 权重, 如果没有, 请运行以下命令:
 
 ```bash
-huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
-```
-
-对于中国大陆用户, 可使用 mirror 下载.
-
-```bash
-HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
+huggingface-cli download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ```
 
 最后, 你可以运行以下命令来启动微调:
@@ -130,9 +118,9 @@ python fish_speech/train.py --config-name text2semantic_finetune \
 ```bash
 python tools/llama/merge_lora.py \
 	--lora-config r_8_alpha_16 \
-	--base-weight checkpoints/openaudio-s1-mini \
+	--base-weight checkpoints/s2-pro \
 	--lora-weight results/$project/checkpoints/step_000000010.ckpt \
-	--output checkpoints/openaudio-s1-mini-yth-lora/
+	--output checkpoints/s2-pro-yth-lora/
 ```
 
 !!! note

+ 90 - 121
docs/zh/index.md

@@ -1,164 +1,133 @@
-# OpenAudio (原 Fish-Speech)
-
-<div align="center">
-
 <div align="center">
+<h1>Fish Speech</h1>
 
-<img src="../assets/openaudio.jpg" alt="OpenAudio" style="display: block; margin: 0 auto; width: 35%;"/>
-
-</div>
-
-<strong>先进的文字转语音模型系列</strong>
+[English](../README.md) | **简体中文** | [Portuguese](../README.pt-BR.md) | [日本語](../README.ja.md) | [한국어](../README.ko.md) | [العربية](../README.ar.md) <br>
 
-<div>
-<a target="_blank" href="https://discord.gg/Es5qTB9BcN">
-<img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
+<a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
+<a href="https://trendshift.io/repositories/7014" target="_blank">
+    <img src="https://trendshift.io/api/badge/repositories/7014" alt="fishaudio%2Ffish-speech | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/>
 </a>
-<a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
-<img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
-</a>
-<a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
-<img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
-</a>
-</div>·
-
-<strong>立即试用:</strong> <a href="https://fish.audio">Fish Audio Playground</a> | <strong>了解更多:</strong> <a href="https://openaudio.com">OpenAudio 网站</a>
-
 </div>
 
----
-
-!!! note "许可声明"
-    本代码库在 **Apache 许可证**下发布,所有模型权重在 **CC-BY-NC-SA-4.0 许可证**下发布。更多详情请参阅 [代码许可证](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) 和 [模型许可证](https://spdx.org/licenses/CC-BY-NC-SA-4.0)。
-
-!!! warning "法律免责声明"
-    我们不对代码库的任何非法使用承担责任。请参考您所在地区有关 DMCA 和其他相关法律的规定。
-
-## **介绍**
-
-我们很高兴地宣布,我们已经更名为 **OpenAudio** - 推出全新的先进文字转语音模型系列,在 Fish-Speech 的基础上进行了重大改进并增加了新功能。
-
-**Openaudio-S1-mini**: [博客](https://openaudio.com/blogs/s1); [视频](https://www.youtube.com/watch?v=SYuPvd7m06A); [Hugging Face](https://huggingface.co/fishaudio/openaudio-s1-mini);
-
-**Fish-Speech v1.5**: [视频](https://www.bilibili.com/video/BV1EKiDYBE4o/); [Hugging Face](https://huggingface.co/fishaudio/fish-speech-1.5);
-
-## **亮点**
-
-### **优秀的 TTS 质量**
-
-我们使用 Seed TTS 评估指标来评估模型性能,结果显示 OpenAudio S1 在英文文本上达到了 **0.008 WER** 和 **0.004 CER**,明显优于以前的模型。(英语,自动评估,基于 OpenAI gpt-4o-转录,说话人距离使用 Revai/pyannote-wespeaker-voxceleb-resnet34-LM)
-
-| 模型 | 词错误率 (WER) | 字符错误率 (CER) | 说话人距离 |
-|:-----:|:--------------------:|:-------------------------:|:----------------:|
-| **S1** | **0.008** | **0.004** | **0.332** |
-| **S1-mini** | **0.011** | **0.005** | **0.380** |
+<br>
 
-### **TTS-Arena2 最佳模型**
+<div align="center">
+    <img src="https://count.getloli.com/get/@fish-speech?theme=asoul" /><br>
+</div>
 
-OpenAudio S1 在 [TTS-Arena2](https://arena.speechcolab.org/) 上获得了 **#1 排名**,这是文字转语音评估的基准:
+<br>
 
 <div align="center">
-    <img src="../assets/Elo.jpg" alt="TTS-Arena2 Ranking" style="width: 75%;" />
+    <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
+        <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
+    </a>
+    <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
+        <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+    </a>
+    <a target="_blank" href="https://pd.qq.com/s/bwxia254o">
+      <img alt="QQ Channel" src="https://img.shields.io/badge/QQ-blue?logo=tencentqq">
+    </a>
 </div>
 
-### **语音控制**
-OpenAudio S1 **支持多种情感、语调和特殊标记**来增强语音合成效果:
+<div align="center">
+    <a target="_blank" href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2">
+      <img alt="TTS-Arena2 Score" src="https://img.shields.io/badge/TTS_Arena2-Rank_%231-gold?style=flat-square&logo=trophy&logoColor=white">
+    </a>
+    <a target="_blank" href="https://huggingface.co/spaces/fishaudio/fish-speech-1">
+        <img alt="Huggingface" src="https://img.shields.io/badge/🤗%20-space%20demo-yellow"/>
+    </a>
+    <a target="_blank" href="https://huggingface.co/fishaudio/openaudio-s1-mini">
+        <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
+    </a>
+</div>
 
-- **基础情感**:
-```
-(生气) (伤心) (兴奋) (惊讶) (满意) (高兴) 
-(害怕) (担心) (沮丧) (紧张) (失望) (沮丧)
-(共情) (尴尬) (厌恶) (感动) (自豪) (放松)
-(感激) (自信) (感兴趣) (好奇) (困惑) (快乐)
-```
+!!! info "许可声明"
+    此代码库及其相关的模型权重均在 **FISH AUDIO RESEARCH LICENSE** 下发布。更多详情请参考 [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE)。
 
-- **高级情感**:
-```
-(鄙视) (不高兴) (焦虑) (歇斯底里) (漠不关心) 
-(不耐烦) (内疚) (轻蔑) (恐慌) (愤怒) (不情愿)
-(渴望) (不赞成) (否定) (否认) (惊讶) (严肃)
-(讽刺) (和解) (安慰) (真诚) (冷笑)
-(犹豫) (让步) (痛苦) (尴尬) (开心)
-```
+!!! warning "法律免责声明"
+    我们不对代码库的任何非法使用承担责任。请参考您当地关于 DMCA 和其他相关法律的法规。
 
-(现在支持英语、中文和日语,更多语言即将推出!)
+## 从这里开始
 
-- **语调标记**:
-```
-(匆忙的语调) (大喊) (尖叫) (耳语) (轻声)
-```
+这里是 Fish Speech 的官方文档,请按照说明轻松入门。
 
-- **特殊音效**:
-```
-(笑) (轻笑) (抽泣) (大哭) (叹气) (喘气)
-(呻吟) (群体笑声) (背景笑声) (观众笑声)
-```
+- [安装](install.md)
+- [推理](inference.md)
 
-您还可以使用 Ha,ha,ha 来控制,还有许多其他用法等待您自己探索。
+## Fish Audio S2
+**开源和闭源中最出色的文本转语音系统**
 
-### **两种模型类型**
+Fish Audio S2 是由 [Fish Audio](https://fish.audio/) 开发的最新模型,旨在生成听起来自然、真实且情感丰富的语音——不机械、不平淡,也不局限于录音室风格的朗读。
 
-我们提供两种模型变体以满足不同需求:
+Fish Audio S2 专注于日常对话,支持原生多说话人和多轮生成。同时支持指令控制。
 
-- **OpenAudio S1 (40亿参数)**:我们功能齐全的旗舰模型,可在 [fish.audio](https://fish.audio) 上使用,提供最高质量的语音合成和所有高级功能
+S2 系列包含多个模型,开源模型为 S2-Pro,是该系列中性能最强的模型。
 
-- **OpenAudio S1-mini (5亿参数)**:具有核心功能的蒸馏版本,可在 [Hugging Face Space](https://huggingface.co/spaces/fishaudio/openaudio-s1-mini) 上使用,针对更快推理进行优化,同时保持出色的质量
+请访问 [Fish Audio 网站](https://fish.audio/) 以获取实时体验
 
-S1 和 S1-mini 都集成了在线人类反馈强化学习 (RLHF)。
+### 模型变体
 
-## **功能特性**
+| 模型 | 大小 | 可用性 | 描述 |
+|------|------|-------------|-------------|
+| S2-Pro | 4B 参数 | [huggingface](https://huggingface.co/fishaudio/s2-pro) | 功能齐全的旗舰模型,具有最高质量和稳定性 |
+| S2-Flash | 未公开 | [fish.audio](https://fish.audio/) | 我们的闭源模型,具有更快的速度和更低的延迟 |
 
-1. **零样本和少样本 TTS:** 输入 10 到 30 秒的语音样本即可生成高质量的 TTS 输出。**详细指南请参见 [语音克隆最佳实践](https://docs.fish.audio/text-to-speech/voice-clone-best-practices)。**
+有关模型的更多详情,请参见技术报告。
 
-2. **多语言和跨语言支持:** 只需复制粘贴多语言文本到输入框即可——无需担心语言问题。目前支持英语、日语、韩语、中文、法语、德语、阿拉伯语和西班牙语。
+## 亮点
 
-3. **无音素依赖:** 该模型具有强大的泛化能力,不依赖音素进行 TTS。它可以处理任何语言文字的文本。
+<img src="../assets/totalability.png" width=200%>
 
-4. **高度准确:** 在 Seed-TTS Eval 中实现低字符错误率 (CER) 约 0.4% 和词错误率 (WER) 约 0.8%。
+### 自然语言控制
 
-5. **快速:** 通过 torch compile 加速,在 Nvidia RTX 4090 GPU 上实时因子 (RTF) 约为 1:7
+Fish Audio S2 允许用户使用自然语言去控制每一句内容的表现、副语言信息、情绪以及更多语音特征,而不单单局限于使用简短的标签去模糊地控制模型的表现,这极大地提高了生成内容整体的质量。
 
-6. **WebUI 推理:** 具有易于使用的基于 Gradio 的网络界面,兼容 Chrome、Firefox、Edge 和其他浏览器。
+### 多语言支持
 
-7. **GUI 推理:** 提供与 API 服务器无缝配合的 PyQt6 图形界面。支持 Linux、Windows 和 macOS。[查看 GUI](https://github.com/AnyaCoder/fish-speech-gui)。
+Fish Audio S2 支持高质量的多语言文本转语音,无需音素或特定语言的预处理。包括:
 
-8. **部署友好:** 轻松设置推理服务器,原生支持 Linux、Windows(MacOS 即将推出),最小化速度损失。
+**英语、中文、日语、韩语、阿拉伯语、德语、法语...**
 
-## **媒体和演示**
+**以及更多!**
 
-<!-- <div align="center"> -->
+列表正在不断扩大,请查看 [Fish Audio](https://fish.audio/) 获取最新发布。
 
-<h3><strong>社交媒体</strong></h3>
-<a href="https://x.com/FishAudio/status/1929915992299450398" target="_blank">
-    <img src="https://img.shields.io/badge/𝕏-最新演示-black?style=for-the-badge&logo=x&logoColor=white" alt="Latest Demo on X" />
-</a>
+### 原生多说话人生成
 
-<h3><strong>互动演示</strong></h3>
-
-<a href="https://fish.audio" target="_blank">
-    <img src="https://img.shields.io/badge/Fish_Audio-试用_OpenAudio_S1-blue?style=for-the-badge" alt="Try OpenAudio S1" />
-</a>
-<a href="https://huggingface.co/spaces/fishaudio/openaudio-s1-mini" target="_blank">
-    <img src="https://img.shields.io/badge/Hugging_Face-试用_S1_Mini-yellow?style=for-the-badge" alt="Try S1 Mini" />
-</a>
+<img src="../assets/chattemplate.png" width=200%>
 
-<h3><strong>视频展示</strong></h3>
-<div align="center">
-<iframe width="560" height="315" src="https://www.youtube.com/embed/SYuPvd7m06A" title="OpenAudio S1 Video" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
-</div>
+Fish Audio S2 允许用户上传包含多个说话人的参考音频,模型将通过 `<|speaker:i|>` 令牌处理每个说话人的特征。之后您可以通过说话人 ID 令牌控制模型的表现,从而实现一次生成中包含多个说话人。再也不需要像以前那样针对每个说话人都单独上传参考音频与生成语音了。
 
-## **文档**
+### 多轮对话生成
 
-### 快速开始
-- [构建环境](install.md) - 设置您的开发环境
-- [推理指南](inference.md) - 运行模型并生成语音
+得益于模型上下文的扩展,我们的模型现在可以借助上文的信息提高后续生成内容的表现力,从而提升内容的自然度。
 
-## **社区和支持**
+### 快速语音克隆
 
-- **Discord:** 加入我们的 [Discord 社区](https://discord.gg/Es5qTB9BcN)
-- **网站:** 访问 [OpenAudio.com](https://openaudio.com) 获取最新更新
-- **在线试用:** [Fish Audio Playground](https://fish.audio)
+Fish Audio S2 支持使用短参考样本(通常为 10-30 秒)进行准确的语音克隆。模型可以捕捉音色、说话风格和情感倾向,无需额外微调即可生成逼真且一致的克隆语音。
 
-## 模型
+---
 
-OpenAudio S1 是 OpenAudio 系列的第一个模型。它是一个双解码器 VQ-GAN 声码器,可以从 VQ 码元重建音频。
+## 致谢
+
+- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
+- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
+- [GPT VITS](https://github.com/innnky/gpt-vits)
+- [MQTTS](https://github.com/b04901014/MQTTS)
+- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
+- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+
+## 技术报告
+
+```bibtex
+@misc{fish-speech-v1.4,
+      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},
+      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},
+      year={2024},
+      eprint={2411.01156},
+      archivePrefix={arXiv},
+      primaryClass={cs.SD},
+      url={https://arxiv.org/abs/2411.01156},
+}
+```

+ 10 - 123
docs/zh/inference.md

@@ -1,15 +1,13 @@
 # 推理
 
-由于声码器模型已更改,您需要比以前更多的 VRAM,建议使用 12GB 进行流畅推理。
-
-我们支持命令行、HTTP API 和 WebUI 进行推理,您可以选择任何您喜欢的方法。
+Fish Audio S2 模型需要较大的显存,我们推荐您使用显存至少为 24GB 的 GPU 进行推理。
 
 ## 下载权重
 
 首先您需要下载模型权重:
 
 ```bash
-hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
+hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ```
 
 ## 命令行推理
@@ -17,30 +15,30 @@ hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-min
 !!! note
     如果您计划让模型随机选择音色,可以跳过此步骤。
 
-### 1. 从参考音频获取 VQ 令牌
+### 1. 从参考音频获取 VQ tokens
 
 ```bash
 python fish_speech/models/dac/inference.py \
-    -i "ref_audio_name.wav" \
-    --checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth"
+    -i "test.wav" \
+    --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
 
 您应该会得到一个 `fake.npy` 和一个 `fake.wav`。
 
-### 2. 从文本生成语义令牌
+### 2. 从文本生成 Semantic tokens
 
 ```bash
 python fish_speech/models/text2semantic/inference.py \
     --text "您想要转换的文本" \
     --prompt-text "您的参考文本" \
     --prompt-tokens "fake.npy" \
-    --compile
+    # --compile
 ```
 
 此命令将在工作目录中创建一个 `codes_N` 文件,其中 N 是从 0 开始的整数。
 
 !!! note
-    您可能希望使用 `--compile` 来融合 CUDA 内核以实现更快的推理(~15 token/秒 -> ~150 token/秒,在RTX 4090 GPU上)
+    您可能希望使用 `--compile` 来融合 CUDA 内核以实现更快的推理,但我们更推荐使用我们基于 sglang 的推理加速优化。
     相应地,如果您不计划使用加速,可以注释掉 `--compile` 参数。
 
 !!! info
@@ -48,124 +46,13 @@ python fish_speech/models/text2semantic/inference.py \
 
 ### 3. 从语义令牌生成声音:
 
-!!! warning "未来警告"
-    我们保留了从原始路径(tools/vqgan/inference.py)访问接口的能力,但此接口可能在后续版本中被删除,因此请尽快更改您的代码。
-
 ```bash
 python fish_speech/models/dac/inference.py \
     -i "codes_0.npy" \
 ```
 
-## HTTP API 推理
-
-我们提供HTTP API进行推理。您可以使用以下命令启动服务器:
-
-```bash
-python -m tools.api_server \
-    --listen 0.0.0.0:8080 \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-> 如果您想要加速推理,可以添加 `--compile` 参数。
-
-之后,您可以在 http://127.0.0.1:8080/ 查看和测试API。
-
-## GUI 推理 
-[下载客户端](https://github.com/AnyaCoder/fish-speech-gui/releases)
+之后您会得到一个 `fake.wav` 文件。
 
 ## WebUI 推理
 
-您可以使用以下命令启动WebUI:
-
-```bash
-python -m tools.run_webui \
-    --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \
-    --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \
-    --decoder-config-name modded_dac_vq
-```
-
-或者简单地
-
-```bash
-python -m tools.run_webui
-```
-> 如果您想要加速推理,可以添加 `--compile` 参数。
-
-!!! note
-    您可以提前将标签文件和参考音频文件保存到主目录的 `references` 文件夹中(需要自己创建),这样就可以在WebUI中直接调用它们。
-
-!!! note
-    您可以使用Gradio环境变量,如 `GRADIO_SHARE`、`GRADIO_SERVER_PORT`、`GRADIO_SERVER_NAME` 来配置WebUI。
-
-## Docker 推理
-
-OpenAudio 为 WebUI 和 API 服务器推理提供了 Docker 容器。您可以直接使用 `docker run` 命令来启动容器。
-
-您需要准备以下内容:
-- 已安装 Docker 和 NVIDIA Docker 运行时 (用于 GPU 支持)
-- 已下载模型权重 (参见 [下载权重](#下载权重) 部分)
-- 参考音频文件 (可选, 用于声音克隆)
-
-```bash
-# 为模型权重和参考音频创建目录
-mkdir -p checkpoints references
-
-# 下载模型权重 (如果尚未下载)
-# hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
-
-# 启动支持 CUDA 的 WebUI (推荐, 性能最佳)
-docker run -d \
-    --name fish-speech-webui \
-    --gpus all \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-webui-cuda
-
-# 仅 CPU 推理 (较慢, 但无需 GPU)
-docker run -d \
-    --name fish-speech-webui-cpu \
-    -p 7860:7860 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-webui-cpu
-```
-
-```bash
-# 启动支持 CUDA 的 API 服务器
-docker run -d \
-    --name fish-speech-server \
-    --gpus all \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    -e COMPILE=1 \
-    fishaudio/fish-speech:latest-server-cuda
-
-# 仅 CPU 推理
-docker run -d \
-    --name fish-speech-server-cpu \
-    -p 8080:8080 \
-    -v ./checkpoints:/app/checkpoints \
-    -v ./references:/app/references \
-    fishaudio/fish-speech:latest-server-cpu
-```
-
-您可以使用以下环境变量自定义 Docker 容器:
-
-- `COMPILE=1` - 启用 `torch.compile` 以加速推理 (约提速10倍, 仅限 CUDA)
-- `GRADIO_SERVER_NAME=0.0.0.0` - WebUI 服务器主机 (默认: 0.0.0.0)
-- `GRADIO_SERVER_PORT=7860` - WebUI 服务器端口 (默认: 7860)
-- `API_SERVER_NAME=0.0.0.0` - API 服务器主机 (默认: 0.0.0.0)
-- `API_SERVER_PORT=8080` - API 服务器端口 (默认: 8080)
-- `LLAMA_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini` - 模型权重路径
-- `DECODER_CHECKPOINT_PATH=checkpoints/openaudio-s1-mini/codec.pth` - 解码器权重路径
-- `DECODER_CONFIG_NAME=modded_dac_vq` - 解码器配置名称
-```
-
-WebUI 和 API 服务器的用法与上文指南中的说明相同。
-
-尽情享受吧!
+未完待续。

+ 7 - 150
docs/zh/install.md

@@ -1,11 +1,11 @@
 ## 系统要求
 
-- GPU 内存:12GB(推理)
+- GPU 内存:24GB(推理)
 - 系统:Linux、WSL
 
 ## 系统设置
 
-OpenAudio 支持多种安装方式,请选择最适合您开发环境的方法。
+Fish Audio S2 支持多种安装方式,请选择最适合您开发环境的方法。
 
 **先决条件**:安装用于音频处理的系统依赖项:
 ``` bash
@@ -17,15 +17,10 @@ apt install portaudio19-dev libsox-dev ffmpeg
 ```bash
 conda create -n fish-speech python=3.12
 conda activate fish-speech
-
-# GPU 安装 (选择您的 CUDA 版本: cu126, cu128, cu129)
-pip install -e .[cu129]
-
-# 仅 CPU 安装
-pip install -e .[cpu]
-
-# 默认安装 (使用 PyTorch 官方源)
 pip install -e .
+# 如果你没有安装上文的前两个依赖,这里会因为pyaudio无法安装而报错,可以考虑使用下面这一行指令。
+# conda install pyaudio 
+# 随后再次运行pip install -e .即可
 ```
 
 ### UV
@@ -39,147 +34,9 @@ uv sync --python 3.12 --extra cu129
 # 仅 CPU 安装
 uv sync --python 3.12 --extra cpu
 ```
-### Intel Arc XPU 支持
-
-对于 Intel Arc GPU 用户,请按以下方式安装以获得 XPU 支持:
-
-```bash
-conda create -n fish-speech python=3.12
-conda activate fish-speech
-
-# 安装所需的 C++ 标准库
-conda install libstdcxx -c conda-forge
-
-# 安装支持 Intel XPU 的 PyTorch
-pip install --pre torch torchvision toraudio --index-url https://download.pytorch.org/whl/nightly/xpu
-
-# 安装 Fish Speech
-pip install -e .
-```
-
-!!! warning
-    `compile` 选项在 Windows 和 macOS 上不受支持。如果希望通过编译运行,您需要自行安装 Triton。
-
 
 ## Docker 设置
 
-OpenAudio S1 系列模型提供了多种 Docker 部署选项以满足不同需求。您可以使用 Docker Hub 上的预构建镜像,通过 Docker Compose 在本地构建,或手动构建自定义镜像。
-
-我们为 WebUI 和 API 服务器提供了 GPU (默认为 CUDA 12.6) 和 CPU 两种版本的 Docker 镜像。您可以直接使用 Docker Hub 上的预构建镜像,或通过 Docker Compose 在本地构建,也可以手动构建自定义镜像。如果希望在本地构建,请遵循以下说明。如果只想使用预构建镜像,请直接查阅 [推理指南](inference.md) 中的说明。
-
-### 先决条件
-
-- 已安装 Docker 和 Docker Compose
-- 已安装 NVIDIA Docker 运行时 (用于 GPU 支持)
-- 至少 12GB 的 GPU 显存用于 CUDA 推理
-
-### 使用 Docker Compose
-
-对于开发或自定义需求,您可以使用 Docker Compose 在本地构建和运行:
-
-```bash
-# 首先克隆本仓库
-git clone https://github.com/fishaudio/fish-speech.git
-cd fish-speech
-
-# 使用 CUDA 启动 WebUI
-docker compose --profile webui up
-
-# 启动带编译优化的 WebUI
-COMPILE=1 docker compose --profile webui up
-
-# 启动 API 服务器
-docker compose --profile server up
-
-# 启动带编译优化的 API 服务器
-COMPILE=1 docker compose --profile server up
-
-# 仅 CPU 部署
-BACKEND=cpu docker compose --profile webui up
-```
-
-#### Docker Compose 环境变量
-
-您可以使用环境变量自定义部署:
-
-```bash
-# .env 文件示例
-BACKEND=cuda              # 或 cpu
-COMPILE=1                 # 启用编译优化
-GRADIO_PORT=7860         # WebUI 端口
-API_PORT=8080            # API 服务器端口
-UV_VERSION=0.8.15        # UV 包管理器版本
-```
-
-该命令将构建镜像并运行容器。您可以在 `http://localhost:7860` 访问 WebUI,在 `http://localhost:8080` 访问 API 服务器。
-
-### 手动 Docker 构建
-
-对于需要自定义构建流程的高级用户:
-
-```bash
-# 构建支持 CUDA 的 WebUI 镜像
-docker build \
-    --platform linux/amd64 \
-    -f docker/Dockerfile \
-    --build-arg BACKEND=cuda \
-    --build-arg CUDA_VER=12.6.0 \
-    --build-arg UV_EXTRA=cu126 \
-    --target webui \
-    -t fish-speech-webui:cuda .
-
-# 构建支持 CUDA 的 API 服务器镜像
-docker build \
-    --platform linux/amd64 \
-    -f docker/Dockerfile \
-    --build-arg BACKEND=cuda \
-    --build-arg CUDA_VER=12.6.0 \
-    --build-arg UV_EXTRA=cu126 \
-    --target server \
-    -t fish-speech-server:cuda .
-
-# 构建仅 CPU 镜像 (支持多平台)
-docker build \
-    --platform linux/amd64,linux/arm64 \
-    -f docker/Dockerfile \
-    --build-arg BACKEND=cpu \
-    --target webui \
-    -t fish-speech-webui:cpu .
-
-# 构建开发镜像
-docker build \
-    --platform linux/amd64 \
-    -f docker/Dockerfile \
-    --build-arg BACKEND=cuda \
-    --target dev \
-    -t fish-speech-dev:cuda .
-```
-
-#### 构建参数
-
-- `BACKEND`: `cuda` 或 `cpu` (默认: `cuda`)
-- `CUDA_VER`: CUDA 版本 (默认: `12.6.0`)
-- `UV_EXTRA`: 用于 CUDA 的 UV 附加包 (默认: `cu126`)
-- `UBUNTU_VER`: Ubuntu 版本 (默认: `24.04`)
-- `PY_VER`: Python 版本 (默认: `3.12`)
-
-### 卷挂载
-
-两种方法都需要挂载以下目录:
-
-- `./checkpoints:/app/checkpoints` - 模型权重目录
-- `./references:/app/references` - 参考音频文件目录
-
-### 环境变量
-
-- `COMPILE=1` - 启用 `torch.compile` 以加速推理 (约提速10倍)
-- `GRADIO_SERVER_NAME=0.0.0.0` - WebUI 服务器主机
-- `GRADIO_SERVER_PORT=7860` - WebUI 服务器端口
-- `API_SERVER_NAME=0.0.0.0` - API 服务器主机
-- `API_SERVER_PORT=8080` - API 服务器端口
-
-!!! note
-    Docker 容器期望模型权重挂载在 `/app/checkpoints` 路径。在启动容器前,请确保已下载所需的模型权重。
+Fish Audio系列模型提供了多种 Docker 部署选项以满足不同需求。您可以使用 Docker Hub 上的预构建镜像,通过 Docker Compose 在本地构建,或手动构建自定义镜像。
 
-!!! warning
-    GPU 支持需要 NVIDIA Docker 运行时。对于仅 CPU 部署,请移除 `--gpus all` 标志并使用 CPU 镜像。
+未完待续。

+ 0 - 80
docs/zh/samples.md

@@ -1,80 +0,0 @@
-# 例子
-
-## 情感控制(*新特性)
-
-### 基础情感示例
-
-| 情感类型 | 语言 | 输入音频 | 合成音频 | Prompt |
-|---------|------|----------|----------|---------|
-| **高兴** | 中文 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy_refer.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/happy.wav" /> | (happy)嘿嘿...博士,悄悄告诉你一件事——我重新开始练小提琴了。 |
-| **厌恶** | 日文 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/disgusted.wav" /> | (digusted)あなたは、本当に気持ち悪い、嫌い…(disgusted)それでも、慰めを求めますの? |
-| **愤怒** | 英文 | - | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/angry.wav" /> | (angry)I want you to go out immediately! I don't want to see you again, or I will try to kill you! |
-| **愤怒** | 中文 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/作战中4.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/angry.wav" /> | (angry)我让你快滚,你是耳聋吗?!...(angry)信不信我揍你! |
-| **惊讶** | 中文 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/surprised.wav" /> | (surprised)今天你过生日?既然这样的话,我就勉为其难祝你生日快乐吧。(surprised)要不要看看你的桌子底下? |
-| **悲伤** | 日文 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref2.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/sad.wav" /> | (sad)他の小隊長と比べて、私はまだ多くのことを学ばなくてはなりません......(sad)皆さんのペースに追いつけるよう精一杯努力いたしますわ。 |
-
-## 副语言效果(*新特性)
-
-### 笑声效果
-
-| 样例 | 语言 | 提示词 | 音频 |
-|------|------|--------|------|
-| **样例 1** | 中文 | 大家好啊,(笑声)哈哈,我是从来不带节奏的血狼破军,今天来点大家想看的东西。 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh1.wav" /> |
-| **样例 2** | 中文 | (笑声)哈哈(笑声),虽然说"三角洲行动"的策划说他们没有暗改(笑声)哈哈(笑声),但是我相信,大家心里都有数。对不起,实在是太搞笑了,忍不住笑了出来。(笑声)哈哈(笑声) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/laugh2.wav" /> |
-| **样例 3** | 英文 | (laughing)haha(laughing), though many people say that homeless cats need our help, (laughing)haha(laughing), but seldom do they really do something that is useful to the cats, (laughing)haha(laughing) sorry, but this is very interesting. | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/laugh.wav" /> |
-
-### 战吼效果
-
-| 样例 | 语言 | 提示词 | 音频 |
-|------|------|--------|------|
-| **战吼示例** | 英文 | (shouting)oh my god !!!(shouting)(shouting)(shouting), baby(shouting)you (shouting)are (shouting)a piece of sweet, soft(shouting), delicious cake!!! | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/shout.wav" /> |
-
-## 长文本稳定性测试
-
-### 中文长文本测试
-
-**中文测试文本:**
-```
-你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
-我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
-你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
-搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
-一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
-```
-
-| 测试内容 | 说话人/角色 | 输入音频 | 合成音频 |
-|----------|-------------|----------|----------|
-| **长文本测试** | 夕(明日方舟) | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/ref1.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio.wav" /> |
-| **随机说话人** | 随机(音量警告) | 无 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/zh/audio2.wav" /> |
-
-### 英文长文本测试
-
-**英文测试文本:**
-```
-In the realm of advanced technology, the evolution of artificial intelligence stands as a 
-monumental achievement. This dynamic field, constantly pushing the boundaries of what 
-machines can do, has seen rapid growth and innovation. From deciphering complex data 
-patterns to driving cars autonomously, AI's applications are vast and diverse.
-```
-
-| 测试内容 | 说话人 | 输入音频 | 合成音频 |
-|----------|--------|----------|----------|
-| **随机说话人 1** | 随机 | 无 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio.wav" /> |
-| **随机说话人 2** | 随机 | 无 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/en/audio2.wav" /> |
-
-### 日文长文本测试
-
-**日文测试文本:**
-```
-宇宙に始まりはあるが、終わりはない。無限。
-星にもまた始まりはあるが、自らの力をもって滅び逝く。有限。
-英知を持つ者こそ、最も愚かであること。歴史からも読み取れる。
-海に生ける魚は、陸の世界を知らない。彼らが英知を持てば、それもまた滅び逝く。
-人間が光の速さを超えるのは、魚たちが陸で生活を始めるよりも滑稽。
-これは抗える者たちに対する、神からの最後通告とも言えよう。
-```
-
-| 测试内容 | 说话人/角色 | 输入音频 | 合成音频 |
-|----------|-------------|----------|----------|
-| **长文本测试** | 丰川祥子 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/ref.wav" /> | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio.wav" /> |
-| **随机说话人** | 随机 | 无 | <audio controls preload="auto" src="https://demo-r2.speech.fish.audio/s1-20250920/ja/audio2.wav" /> |

+ 2 - 2
fish_speech/configs/modded_dac_vq.yaml

@@ -10,7 +10,7 @@ decoder_transformer_layers: [4, 0, 0, 0]
 transformer_general_config:
   _target_: fish_speech.models.dac.modded_dac.ModelArgs
   _partial_: true
-  block_size: 16384
+  block_size: 8192
   n_local_heads: -1
   head_dim: 64
   rope_base: 10000
@@ -34,7 +34,7 @@ quantizer:
     input_dim: 1024
     config: &transformer_config
       _target_: fish_speech.models.dac.modded_dac.ModelArgs
-      block_size: 4096
+      block_size: 2048
       n_layer: 8
       n_head: 16
       dim: 1024

+ 58 - 31
fish_speech/content_sequence.py

@@ -179,26 +179,33 @@ class ContentSequence:
         audio_parts = []
         audio_masks = []
 
-        ignore_loss_token_ids = [tokenizer.get_token_id(i) for i in ignore_loss_tokens]
+        # Optimization: Batch conversion for ignore tokens
+        ignore_loss_token_ids = []
+        if ignore_loss_tokens:
+            # Use the wrapper method which uses convert_tokens_to_ids
+            ignore_loss_token_ids = [tokenizer.get_token_id(i) for i in ignore_loss_tokens]
 
         for part in self.parts:
             if isinstance(part, TextPart):
                 if part.tokens is None:
                     assert part.text is not None
-                    tokens = tokenizer.encode(part.text)
+                    # Optimization: Explicitly disable special tokens (BOS/EOS) 
+                    # because we are constructing the sequence manually
+                    tokens = tokenizer.encode(part.text, add_special_tokens=False)
                 else:
                     tokens = part.tokens
 
-                tokens = torch.tensor(tokens, dtype=torch.int)
+                tokens = torch.tensor(tokens, dtype=torch.long)
             elif isinstance(part, VQPart):
+                # Critical Optimization: Vectorized mapping
+                # Instead of loop lookup: [tokenizer.semantic_id_to_token_id[i] for i in codes]
+                # We use arithmetic offset: code + semantic_begin_id
+                # This assumes semantic tokens are contiguous in the vocab (DualAR requirement)
                 curr_codes = part.codes.clone().to(torch.int)
-                tokens = torch.tensor(
-                    [
-                        tokenizer.semantic_id_to_token_id[int(i.item())]
-                        for i in curr_codes[0].int()
-                    ],
-                    dtype=torch.int,
-                )
+                
+                # Use int64 (long) for token IDs to avoid overflow or type mismatch in embedding
+                tokens = (curr_codes[0] + tokenizer.semantic_begin_id).to(torch.long)
+                
                 vq_parts.append(curr_codes)
                 vq_require_losses.append(part.cal_loss)
             else:
@@ -227,17 +234,25 @@ class ContentSequence:
                 all_labels.append(torch.full_like(tokens, -100))
 
         # Concatenate all tensors
-        tokens = torch.cat(all_tokens, dim=0)
-        labels = torch.cat(all_labels, dim=0)
-        vq_masks = torch.cat(vq_masks, dim=0)
-        audio_masks = torch.cat(audio_masks, dim=0)
+        if not all_tokens:
+            # Handle empty case safely
+            tokens = torch.empty(0, dtype=torch.long)
+            labels = torch.empty(0, dtype=torch.long)
+            vq_masks = torch.empty(0, dtype=torch.bool)
+            audio_masks = torch.empty(0, dtype=torch.bool)
+        else:
+            tokens = torch.cat(all_tokens, dim=0)
+            labels = torch.cat(all_labels, dim=0)
+            vq_masks = torch.cat(vq_masks, dim=0)
+            audio_masks = torch.cat(audio_masks, dim=0)
+        
         vq_require_losses = torch.tensor(vq_require_losses, dtype=torch.bool)
 
         # Apply shift if needed for next-token prediction
         vq_mask_tokens = vq_masks
         vq_mask_labels = vq_masks
 
-        if add_shift:
+        if add_shift and len(tokens) > 0:
             tokens = tokens[:-1]
             labels = labels[1:]
             vq_masks = vq_masks[:-1]
@@ -247,13 +262,8 @@ class ContentSequence:
 
         # Ignore specified tokens
         for i in ignore_loss_token_ids:
-            assert i != -100 and i is not None
-            labels[labels == i] = -100
-
-        assert tokens.dtype in [
-            torch.int,
-            torch.long,
-        ], f"Invalid dtype: {tokens.dtype}"
+            if i is not None:
+                labels[labels == i] = -100
 
         return EncodedMessage(
             tokens=tokens,
@@ -274,7 +284,9 @@ class ContentSequence:
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         encoded = self.encode(tokenizer, add_shift=False)
         tokens = encoded.tokens
-        values = torch.zeros((num_codebooks + 1, len(tokens)), dtype=torch.int)
+        # Use int64 (long) for the prompt cache so token IDs fit directly
+        # without a later dtype conversion for the embedding lookup
+        values = torch.zeros((num_codebooks + 1, len(tokens)), dtype=torch.long)
         values[0] = tokens
 
         if (encoded.vq_parts is None or len(encoded.vq_parts) == 0) and (
@@ -282,14 +294,24 @@ class ContentSequence:
         ):
             return values, None, None
 
-        audio_parts = audio_masks = None
+        audio_parts = None 
+        audio_masks = None
+        
         if encoded.vq_parts is not None and len(encoded.vq_parts) > 0:
             vq_parts = encoded.vq_parts
-            vq_parts = torch.cat(vq_parts, dim=1)
-            values[0, encoded.vq_mask_tokens] = (
-                vq_parts[0] + tokenizer.semantic_begin_id
-            )
-            values[1:, encoded.vq_mask_tokens] = vq_parts
+            # List[Tensor(1, T)] -> Tensor(1, Total_T) -> Tensor(1, Total_T)
+            # Ensure we are handling the list concatenation correctly
+            if len(vq_parts) > 1:
+                # We need to be careful here: vq_parts is a list of tensors from different VQPart segments
+                # They correspond to encoded.vq_mask_tokens
+                # Since we just want to fill the 'values' tensor at the right positions:
+                all_vq_codes = torch.cat(vq_parts, dim=1) # Shape: (C, Total_Semantic_Tokens)
+            else:
+                all_vq_codes = vq_parts[0]
+                
+            # Values[0] is already the Main Token ID (Semantic Begin + Code)
+            # Values[1:] should be the codes themselves
+            values[1:, encoded.vq_mask_tokens] = all_vq_codes.to(dtype=torch.long)
 
         if encoded.audio_parts is not None and len(encoded.audio_parts) > 0:
             audio_parts = torch.cat(encoded.audio_parts, dim=0)
@@ -359,7 +381,12 @@ class ContentSequence:
                     count_semantic_tokens = 0
                     semantic_label = None
 
-            val = tokenizer.decode([int(tok.item())])
+            # Use HF decode
+            val = tokenizer.decode([token_id])
+            
+            # Simple fallback for visualization if decode returns empty or weird stuff for special tokens
+            if not val:
+                val = f"<{token_id}>"
 
             if lab == -100:
                 print_in_green(val)
@@ -369,4 +396,4 @@ class ContentSequence:
         if merge_semantic_tokens and count_semantic_tokens > 0:
             print_semantic_token(semantic_label, count_semantic_tokens)
 
-        print()
+        print()

+ 174 - 0
fish_speech/conversation.py

@@ -0,0 +1,174 @@
+from copy import deepcopy
+from dataclasses import dataclass, field
+from typing import Literal
+
+import torch
+from transformers import PreTrainedTokenizerFast
+
+from fish_speech.content_sequence import (
+    AudioPart,
+    BasePart,
+    ContentSequence,
+    EncodedMessage,
+    TextPart,
+    VQPart,
+)
+from fish_speech.tokenizer import IM_END_TOKEN, IM_START_TOKEN, MODALITY_TOKENS
+
+
+@dataclass(kw_only=True)
+class Message:
+    # One chat turn: a role plus an ordered list of content parts and flags
+    # controlling how the turn is serialized into the token stream.
+    role: Literal["system", "user", "assistant"]
+    # Ordered content parts (text / VQ codes / audio) of this message.
+    parts: list[BasePart] = field(default_factory=list)
+    # Emit the IM_START_TOKEN + role header before the parts.
+    add_im_start: bool = True
+    # Emit the IM_END_TOKEN terminator after the parts.
+    add_im_end: bool = True
+    # When True, this message's tokens contribute to the training loss;
+    # inherited by parts whose own cal_loss is unset/False (see Conversation).
+    cal_loss: bool = False
+    # Optional modality token inserted right after the im_start header.
+    modality: Literal["text", "voice", "interleave"] | None = None
+
+    # By default, ignore the loss of the auto-generated im_start token
+    ignore_im_start_loss: bool = True
+
+
+@dataclass
+class Conversation:
+    """An ordered list of Message turns, with helpers to flatten them into a
+    ContentSequence for training or inference.
+    """
+
+    # All chat turns, in order.
+    messages: list[Message]
+
+    # NOTE: @dataclass does not overwrite an explicitly defined __init__,
+    # so this one (defaulting to an empty list) is the one that runs.
+    def __init__(self: "Conversation", messages: list[Message] | None = None):
+        self.messages = messages or []
+
+    def _build_content_sequence(
+        self: "Conversation",
+        metadata: dict | None = None,
+    ) -> ContentSequence:
+        """
+        Build a ContentSequence from all messages.
+        Handles cal_loss inheritance from message to part level.
+
+        For each message this emits, in order: an IM_START_TOKEN header with
+        the role and optional modality token (when add_im_start), the
+        message's own parts, and an IM_END_TOKEN terminator (when add_im_end).
+
+        Args:
+            metadata: Optional metadata forwarded to the ContentSequence.
+
+        Returns:
+            ContentSequence covering the whole conversation.
+        """
+        all_parts = []
+        for message in self.messages:
+            # Add im_start
+            if message.add_im_start:
+                modality_token = (
+                    MODALITY_TOKENS[message.modality] if message.modality else ""
+                )
+                all_parts.append(
+                    TextPart(
+                        text=f"{IM_START_TOKEN}{message.role}\n{modality_token}",
+                        cal_loss=not message.ignore_im_start_loss,
+                    )
+                )
+
+            # Add message parts
+            for part in message.parts:
+                # Inherit cal_loss from message if not set at part level
+                # NOTE(review): a part whose cal_loss was explicitly set to
+                # False is indistinguishable from an unset one here, so it is
+                # always overwritten by the message-level flag — confirm that
+                # is intended.
+                if not hasattr(part, "cal_loss") or part.cal_loss is False:
+                    # Deep-copy so the caller's part object is not mutated.
+                    new_part = deepcopy(part)
+                    new_part.cal_loss = message.cal_loss
+                    all_parts.append(new_part)
+                else:
+                    all_parts.append(part)
+
+            # Add im_end
+            if message.add_im_end:
+                all_parts.append(
+                    TextPart(text=IM_END_TOKEN + "\n", cal_loss=message.cal_loss)
+                )
+
+        return ContentSequence(parts=all_parts, modality=None, metadata=metadata)
+
+    def encode(
+        self: "Conversation",
+        tokenizer: PreTrainedTokenizerFast,
+        add_shift: bool = True,
+        ignore_loss_tokens: list[str] = [],  # NOTE(review): mutable default — not mutated here, but a None sentinel would be safer
+        metadata: dict | None = None,
+        max_length: int | None = None,
+    ) -> EncodedMessage:
+        """Flatten all messages and encode them with the given tokenizer,
+        delegating to ContentSequence.encode.
+        """
+        # Build ContentSequence from messages
+        content_seq = self._build_content_sequence(metadata=metadata)
+        return content_seq.encode(
+            tokenizer,
+            add_shift=add_shift,
+            ignore_loss_tokens=ignore_loss_tokens,
+            max_length=max_length,
+        )
+
+    def encode_for_inference(
+        self: "Conversation",
+        tokenizer: PreTrainedTokenizerFast,
+        num_codebooks: int,
+        metadata: dict | None = None,
+    ):
+        # Inference-time encoding path; delegates to
+        # ContentSequence.encode_for_inference with the codebook count.
+        content_seq = self._build_content_sequence(metadata=metadata)
+        return content_seq.encode_for_inference(tokenizer, num_codebooks=num_codebooks)
+
+    def visualize(
+        self: "Conversation",
+        tokenizer: PreTrainedTokenizerFast,
+        ignore_loss_tokens: list[str] = [],  # NOTE(review): mutable default
+        merge_semantic_tokens: bool = False,
+        merge_audio_tokens: bool = False,
+        use_color: bool = True,
+    ):
+        """
+        Visualize the encoded sequence with color-coded tokens.
+        Blue/cyan tokens contribute to loss, green tokens do not.
+
+        NOTE(review): merge_audio_tokens and use_color are accepted but never
+        forwarded to ContentSequence.visualize below — confirm whether they
+        should be passed through or removed from this signature.
+        """
+        # Build ContentSequence from messages and use its visualize method
+        content_seq = self._build_content_sequence()
+        content_seq.visualize(
+            tokenizer,
+            ignore_loss_tokens=ignore_loss_tokens,
+            merge_semantic_tokens=merge_semantic_tokens,
+        )
+
+    def append(self: "Conversation", message: Message):
+        # Append one turn to the conversation in place.
+        self.messages.append(message)
+
+    def to_content_sequence(
+        self: "Conversation",
+        metadata: dict | None = None,
+    ) -> ContentSequence:
+        """
+        Convert the Conversation to a ContentSequence.
+
+        This method builds a ContentSequence from all messages,
+        handling cal_loss inheritance from message to part level.
+
+        Args:
+            metadata: Optional metadata to include in the ContentSequence
+
+        Returns:
+            ContentSequence with all messages converted to parts
+        """
+        return self._build_content_sequence(metadata=metadata)
+
+
+if __name__ == "__main__":
+    # Smoke test: build a two-turn conversation (user text + dummy VQ codes,
+    # assistant text with loss enabled) and visualize its encoding.
+    # Requires a local tokenizer checkpoint at checkpoints/agent-0.6b-debug.
+    # Test the new implementation with the same API
+    message0 = Message(
+        role="user",
+        parts=[
+            TextPart(text="Hello, how are you?"),
+            VQPart(codes=torch.zeros((4, 10))),  # 4 codebooks x 10 frames of dummy codes
+        ],
+        cal_loss=False,
+    )
+
+    message1 = Message(
+        role="assistant",
+        parts=[TextPart(text="I'm fine, thank you.")],
+        cal_loss=True,
+    )
+    conversation = Conversation([message0, message1])
+    tokenizer = PreTrainedTokenizerFast.from_pretrained("checkpoints/agent-0.6b-debug")
+
+    # Test with enhanced visualization from ContentSequence
+    print("Basic visualization:")
+    conversation.visualize(tokenizer)
+
+    print("\nWith merged semantic tokens:")
+    conversation.visualize(tokenizer, merge_semantic_tokens=True)
+
+    # NOTE(review): use_color is currently ignored by Conversation.visualize,
+    # so this call still prints with colors — confirm intended.
+    print("\nWithout colors:")
+    conversation.visualize(tokenizer, use_color=False)

+ 3 - 3
fish_speech/i18n/locale/en_US.json

@@ -72,7 +72,7 @@
   "Put your text here.": "Put your text here.",
   "Reference Audio": "Reference Audio",
   "Reference Text": "Reference Text",
-  "Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.",
+  "Related code and weights are released under FISH AUDIO RESEARCH LICENSE.": "Related code and weights are released under FISH AUDIO RESEARCH LICENSE.",
   "Remove Selected Data": "Remove Selected Data",
   "Removed path successfully!": "Removed path successfully!",
   "Repetition Penalty": "Repetition Penalty",
@@ -112,7 +112,7 @@
   "WebUI Host": "WebUI Host",
   "WebUI Port": "WebUI Port",
   "Whisper Model": "Whisper Model",
-  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).",
+  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).",
   "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU",
   "latest": "latest",
   "new": "new",
@@ -120,4 +120,4 @@
   "Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
   "Text Normalization": "Text Normalization",
   "Select Example Audio": "Select Example Audio"
-}
+}

+ 3 - 3
fish_speech/i18n/locale/es_ES.json

@@ -72,7 +72,7 @@
   "Put your text here.": "Ponga su texto aquí.",
   "Reference Audio": "Audio de Referencia",
   "Reference Text": "Texto de Referencia",
-  "Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
+  "Related code and weights are released under FISH AUDIO RESEARCH LICENSE.": "El código relacionado y los pesos se publican bajo la FISH AUDIO RESEARCH LICENSE.",
   "Remove Selected Data": "Eliminar Datos Seleccionados",
   "Removed path successfully!": "¡Ruta eliminada exitosamente!",
   "Repetition Penalty": "Penalización por Repetición",
@@ -112,7 +112,7 @@
   "WebUI Host": "Host de WebUI",
   "WebUI Port": "Puerto de WebUI",
   "Whisper Model": "Modelo Whisper",
-  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1).",
+  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1.5).",
   "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "Se recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+",
   "latest": "más reciente",
   "new": "nuevo",
@@ -120,4 +120,4 @@
   "Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
   "Text Normalization": "Normalización de Texto",
  "Select Example Audio": "Seleccionar audio de ejemplo"
-}
+}

+ 3 - 3
fish_speech/i18n/locale/ja_JP.json

@@ -72,7 +72,7 @@
   "Put your text here.": "ここにテキストを入力してください。",
   "Reference Audio": "リファレンスオーディオ",
   "Reference Text": "リファレンステキスト",
-  "Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
+  "Related code and weights are released under FISH AUDIO RESEARCH LICENSE.": "関連コードと重みはFISH AUDIO RESEARCH LICENSEの下でリリースされます。",
   "Remove Selected Data": "選択したデータを削除",
   "Removed path successfully!": "パスの削除に成功しました!",
   "Repetition Penalty": "反復ペナルティ",
@@ -112,7 +112,7 @@
   "WebUI Host": "WebUIホスト",
   "WebUI Port": "WebUIポート",
   "Whisper Model": "Whisperモデル",
-  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1)にあります。",
+  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).": "ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1.5)にあります。",
   "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30シリーズ以降のGPUにはbf16-trueを、10シリーズ以降のGPUには16-mixedをお勧めします",
   "latest": "最新",
   "new": "新規",
@@ -120,4 +120,4 @@
   "Normalization Result Preview (Currently Only Chinese)": "正規化結果プレビュー(現在は中国語のみ)",
   "Text Normalization": "テキスト正規化",
   "Select Example Audio": "サンプル音声を選択"
-}
+}

+ 3 - 3
fish_speech/i18n/locale/ko_KR.json

@@ -72,7 +72,7 @@
   "Put your text here.": "여기에 텍스트를 입력하세요.",
   "Reference Audio": "참고 오디오",
   "Reference Text": "참고 텍스트",
-  "Related code and weights are released under CC BY-NC-SA 4.0 License.": "관련 코드 및 가중치는 CC BY-NC-SA 4.0 라이선스 하에 배포됩니다.",
+  "Related code and weights are released under FISH AUDIO RESEARCH LICENSE.": "관련 코드 및 가중치는 FISH AUDIO RESEARCH LICENSE 하에 배포됩니다.",
   "Remove Selected Data": "선택한 데이터 제거",
   "Removed path successfully!": "경로가 성공적으로 제거되었습니다!",
   "Repetition Penalty": "반복 패널티",
@@ -112,7 +112,7 @@
   "WebUI Host": "WebUI 호스트",
   "WebUI Port": "WebUI 포트",
   "Whisper Model": "Whisper 모델",
-  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "소스 코드는 [이곳](https://github.com/fishaudio/fish-speech)에서, 모델은 [이곳](https://huggingface.co/fishaudio/fish-speech-1)에서 확인하실 수 있습니다.",
+  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).": "소스 코드는 [이곳](https://github.com/fishaudio/fish-speech)에서, 모델은 [이곳](https://huggingface.co/fishaudio/fish-speech-1.5)에서 확인하실 수 있습니다.",
   "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 시리즈 GPU에는 bf16-true를, 10+ 시리즈 GPU에는 16-mixed를 권장합니다",
   "latest": "최신",
   "new": "새로운",
@@ -120,4 +120,4 @@
   "Normalization Result Preview (Currently Only Chinese)": "정규화 결과 미리보기(현재 중국어만 지원)",
   "Text Normalization": "텍스트 정규화",
   "Select Example Audio": "예시 오디오 선택"
-}
+}

+ 3 - 4
fish_speech/i18n/locale/pt_BR.json

@@ -9,7 +9,6 @@
   "Batch Inference": "Inferência em Lote",
   "Batch Size": "Tamanho do Lote",
   "Changing with the Model Path": "Alterando com o Caminho do Modelo",
-
   "Compile Model": "Compilar Modelo",
   "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar o modelo pode reduzir significativamente o tempo de inferência, mas aumentará a latência inicial",
   "Copy": "Copiar",
@@ -84,7 +83,7 @@
   "Reference Text": "Texto de Referência",
   "warning": "Aviso",
   "Pre-processing begins...": "O pré-processamento começou!",
-  "Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.",
+  "Related code and weights are released under FISH AUDIO RESEARCH LICENSE.": "O código relacionado e os pesos são licenciados sob a FISH AUDIO RESEARCH LICENSE.",
   "Remove Selected Data": "Remover Dados Selecionados",
   "Removed path successfully!": "Caminho removido com sucesso!",
   "Repetition Penalty": "Penalidade de Repetição",
@@ -119,7 +118,7 @@
   "WebUI Host": "Host da WebUI",
   "WebUI Port": "Porta da WebUI",
   "Whisper Model": "Modelo Whisper",
-  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1).",
+  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).": "Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1.5).",
   "auto": "automático",
   "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true é recomendado para GPUs da série 30+, 16-mixed é recomendado para GPUs da série 10+",
   "latest": "mais recente",
@@ -130,4 +129,4 @@
   "No": "Não",
   "version:": "versão:",
   "author:": "autor:"
-}
+}

+ 3 - 3
fish_speech/i18n/locale/zh_CN.json

@@ -72,7 +72,7 @@
   "Put your text here.": "在此处输入文本.",
   "Reference Audio": "参考音频",
   "Reference Text": "参考文本",
-  "Related code and weights are released under CC BY-NC-SA 4.0 License.": "相关代码和权重使用 CC BY-NC-SA 4.0 许可证发布.",
+  "Related code and weights are released under FISH AUDIO RESEARCH LICENSE.": "相关代码和权重使用 FISH AUDIO RESEARCH LICENSE 许可证发布.",
   "Remove Selected Data": "移除选中数据",
   "Removed path successfully!": "移除路径成功!",
   "Repetition Penalty": "重复惩罚",
@@ -112,7 +112,7 @@
   "WebUI Host": "WebUI 监听地址",
   "WebUI Port": "WebUI 端口",
   "Whisper Model": "Whisper 模型",
-  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.",
+  "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1.5) 找到模型.",
   "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 系列 GPU 建议使用 bf16-true, 10+ 系列 GPU 建议使用 16-mixed",
   "latest": "最近的检查点",
   "new": "创建新的检查点",
@@ -120,4 +120,4 @@
   "Normalization Result Preview (Currently Only Chinese)": "规范化结果预览",
   "Text Normalization": "文本规范化",
   "Select Example Audio": "选择参考音频"
-}
+}

+ 1 - 7
fish_speech/inference_engine/vq_manager.py

@@ -14,16 +14,10 @@ class VQManager:
         self.load_audio: Callable
 
     def decode_vq_tokens(self, codes):
-        feature_lengths = torch.tensor(
-            [codes.shape[1]], device=self.decoder_model.device
-        )
         logger.info(f"VQ features: {codes.shape}")
 
         if isinstance(self.decoder_model, DAC):
-            return self.decoder_model.decode(
-                indices=codes[None],
-                feature_lengths=feature_lengths,
-            )[0].squeeze()
+            return self.decoder_model.from_indices(codes[None])[0].squeeze()
 
         raise ValueError(f"Unknown model type: {type(self.decoder_model)}")
 

+ 6 - 3
fish_speech/models/dac/inference.py

@@ -87,7 +87,7 @@ def main(input_path, output_path, config_name, checkpoint_path, device):
 
         # VQ Encoder
         audio_lengths = torch.tensor([audios.shape[2]], device=device, dtype=torch.long)
-        indices, indices_lens = model.encode(audios, audio_lengths)
+        indices, _ = model.encode(audios, audio_lengths)
 
         if indices.ndim == 3:
             indices = indices[0]
@@ -101,12 +101,15 @@ def main(input_path, output_path, config_name, checkpoint_path, device):
         indices = np.load(input_path)
         indices = torch.from_numpy(indices).to(device).long()
         assert indices.ndim == 2, f"Expected 2D indices, got {indices.ndim}"
-        indices_lens = torch.tensor([indices.shape[1]], device=device, dtype=torch.long)
+        # indices_lens = torch.tensor([indices.shape[1]], device=device, dtype=torch.long)
     else:
         raise ValueError(f"Unknown input type: {input_path}")
 
     # Restore
-    fake_audios, audio_lengths = model.decode(indices, indices_lens)
+    if indices.ndim == 2:
+        indices = indices.unsqueeze(0)
+
+    fake_audios = model.from_indices(indices)
     audio_time = fake_audios.shape[-1] / model.sample_rate
 
     logger.info(

+ 89 - 26
fish_speech/models/dac/modded_dac.py

@@ -3,16 +3,12 @@ import typing as tp
 from dataclasses import dataclass
 from typing import List, Optional, Union
 
-import hydra
-import librosa
 import numpy as np
-import soundfile as sf
 import torch
 from audiotools import AudioSignal
 from audiotools.ml import BaseModel
 from dac.model.base import CodecMixin
 from dac.nn.layers import Snake1d, WNConv1d, WNConvTranspose1d
-from omegaconf import OmegaConf
 from torch import Tensor, nn
 from torch.nn import functional as F
 from torch.nn.utils.parametrizations import weight_norm
@@ -51,6 +47,7 @@ class ModelArgs:
     channels_first: bool = True  # to be compatible with conv1d input/output
     pos_embed_type: str = "rope"  # can be "rope" or "conformer"
     max_relative_position: int = 128  # for conformer-style relative position embedding
+    window_size: int = 512  # for window limited attention
 
     def __post_init__(self):
         if self.n_local_heads == -1:
@@ -106,16 +103,14 @@ class Transformer(nn.Module):
         # Only compute RoPE frequencies if using RoPE
         if config.pos_embed_type == "rope":
             freqs_cis = precompute_freqs_cis(
-                self.config.block_size, self.config.head_dim, self.config.rope_base
+                327680, self.config.head_dim, self.config.rope_base
             )
-            self.register_buffer("freqs_cis", freqs_cis)
+            self.register_buffer("freqs_cis", freqs_cis, persistent=False)
         else:
             self.register_buffer("freqs_cis", None)
 
-        causal_mask = torch.tril(
-            torch.ones(self.config.block_size, self.config.block_size, dtype=torch.bool)
-        )
-        self.register_buffer("causal_mask", causal_mask)
+        causal_mask = torch.tril(torch.ones(32768, 32768, dtype=torch.bool))
+        self.register_buffer("causal_mask", causal_mask, persistent=False)
 
         self.max_batch_size = -1
         self.max_seq_length = -1
@@ -153,6 +148,7 @@ class Transformer(nn.Module):
             assert (
                 self.freqs_cis is not None
             ), "RoPE frequencies must be initialized for RoPE positional embedding"
+            # print("MAX", input_pos.max())
             freqs_cis = self.freqs_cis[input_pos]
         else:
             freqs_cis = None
@@ -638,7 +634,7 @@ class EncoderBlock(nn.Module):
                 WindowLimitedTransformer(
                     causal=causal,
                     input_dim=dim,
-                    window_size=512,
+                    window_size=getattr(transformer_general_config, "window_size", 512),
                     config=transformer_general_config(
                         n_layer=n_t_layer,
                         n_head=dim // 64,
@@ -814,6 +810,7 @@ class DAC(BaseModel, CodecMixin):
         causal: bool = True,
         encoder_transformer_layers: List[int] = [0, 0, 0, 0],
         decoder_transformer_layers: List[int] = [0, 0, 0, 0],
+        overwrite_decoder: torch.nn.Module = None,
         transformer_general_config=None,
     ):
         super().__init__()
@@ -841,14 +838,17 @@ class DAC(BaseModel, CodecMixin):
 
         self.quantizer = quantizer
 
-        self.decoder = Decoder(
-            latent_dim,
-            decoder_dim,
-            decoder_rates,
-            causal=causal,
-            n_transformer_layers=decoder_transformer_layers,
-            transformer_general_config=transformer_general_config,
-        )
+        if overwrite_decoder is not None:
+            self.decoder = overwrite_decoder
+        else:
+            self.decoder = Decoder(
+                latent_dim,
+                decoder_dim,
+                decoder_rates,
+                causal=causal,
+                n_transformer_layers=decoder_transformer_layers,
+                transformer_general_config=transformer_general_config,
+            )
         self.sample_rate = sample_rate
         self.apply(init_weights)
 
@@ -906,7 +906,6 @@ class DAC(BaseModel, CodecMixin):
         # pad to multiple of self.frame_length
         if audio_data.ndim == 2:
             audio_data = audio_data.unsqueeze(1)
-        # print(audio_data.shape)
         length = audio_data.shape[-1]
         right_pad = math.ceil(length / self.frame_length) * self.frame_length - length
         audio_data = nn.functional.pad(audio_data, (0, right_pad))
@@ -919,13 +918,28 @@ class DAC(BaseModel, CodecMixin):
         indices_lens = torch.ceil(audio_lengths / self.frame_length).long()
         return indices, indices_lens
 
-    def decode(self, indices: torch.Tensor, feature_lengths):
-        if indices.ndim == 2:
-            indices = indices[None]
-
+    def from_indices(self, indices: torch.Tensor):
         z = self.quantizer.decode(indices)
-        audio_lengths = feature_lengths * self.frame_length
-        return self.decoder(z), audio_lengths
+        return self.decoder(z)
+
+    def decode(self, z: torch.Tensor):
+        """Decode quantized latent representations back to audio.
+
+        Parameters
+        ----------
+        z : Tensor[B x D x T]
+            Quantized continuous representation of input.
+
+        Returns
+        -------
+        Tensor
+            Decoded audio data produced by the decoder network
+            (presumably shaped [B x 1 x samples] — TODO confirm against
+            the Decoder implementation).
+        """
+        return self.decoder(z)
 
     def forward(
         self,
@@ -976,3 +990,52 @@ class DAC(BaseModel, CodecMixin):
         z = vq_results[0] if isinstance(vq_results, tuple) else vq_results.z
         x = self.decode(z)
         return x[..., :length], vq_results
+
+
+if __name__ == "__main__":
+    import hydra
+    import torch
+    import numpy as np
+    import soundfile as sf
+    from omegaconf import OmegaConf
+
+    # Standalone reconstruction smoke test: paths and settings.
+    config_path = "fish_speech/configs/modded_dac_vq.yaml"
+    checkpoint_path = "checkpoints/s2-pro/codec.pth"
+    codes_path = "./output/codes_0.npy"  # path to your codes file
+    output_path = "reconstructed_from_codes.wav"
+    sample_rate = 44100 # make sure this matches the model's training sample rate
+
+    with torch.inference_mode():
+        # 1. Instantiate the model from config and load the checkpoint.
+        model = hydra.utils.instantiate(OmegaConf.load(config_path))
+        new_sd = torch.load(checkpoint_path, map_location="cpu")
+        model.load_state_dict(new_sd, strict=False)
+        model.cuda()
+        model.eval()
+
+        # 2. Load external codes (.npy).
+        # Expected shape is typically [num_codebooks, seq_len] or [1, num_codebooks, seq_len].
+        codes_np = np.load(codes_path)
+        codes_tensor = torch.from_numpy(codes_np).to(torch.long).cuda()
+
+        # If the codes have no batch dimension, add one: [1, num_codebooks, seq_len].
+        if len(codes_tensor.shape) == 2:
+            codes_tensor = codes_tensor.unsqueeze(0)
+
+        print(f"Loaded codes shape: {codes_tensor.shape}")
+
+        # 3. Reconstruct audio directly from the codes (decoding).
+        # Note: fish_speech's model.from_indices expects a LongTensor input.
+        fake_audio = model.from_indices(codes_tensor)
+        
+        # 4. Post-process and save.
+        # fake_audio is typically shaped [B, C, T].
+        audio_np = fake_audio.squeeze().cpu().numpy()
+        
+        # If multi-channel, transpose to (samples, channels) as soundfile requires.
+        if len(audio_np.shape) == 2:
+            audio_np = audio_np.T
+
+        sf.write(output_path, audio_np, sample_rate)
+        print(f"重建完成。音频已保存至: {output_path}")

+ 4 - 8
fish_speech/models/dac/rvq.py

@@ -351,19 +351,15 @@ class DownsampleResidualVectorQuantize(nn.Module):
     #
     def decode(self, indices: torch.Tensor):
         # indices = rearrange(indices, "b (g r) l -> g b l r", g=self.residual_fsq.groups)
-
-        # print(f"indices: {indices.shape}, semantic_quantizer.codebook_size: {self.semantic_quantizer.codebook_size}, quantizer.codebook_size: {self.quantizer.codebook_size}, semantic min: {indices[:, 0].min()}, max: {indices[:, 0].max()}, quantizer min: {indices[:, 1:].min()}, max: {indices[:, 1:].max()}")
-
-        new_indices = torch.zeros_like(indices)
-        new_indices[:, 0] = torch.clamp(
+        indices[:, 0] = torch.clamp(
             indices[:, 0], max=self.semantic_quantizer.codebook_size - 1
         )
-        new_indices[:, 1:] = torch.clamp(
+        indices[:, 1:] = torch.clamp(
             indices[:, 1:], max=self.quantizer.codebook_size - 1
         )
 
-        z_q_semantic = self.semantic_quantizer.from_codes(new_indices[:, :1])[0]
-        z_q_residual = self.quantizer.from_codes(new_indices[:, 1:])[0]
+        z_q_semantic = self.semantic_quantizer.from_codes(indices[:, :1])[0]
+        z_q_residual = self.quantizer.from_codes(indices[:, 1:])[0]
         z_q = z_q_semantic + z_q_residual
         z_q = self.post_module(z_q)
         z_q = self.upsample(z_q)

+ 447 - 170
fish_speech/models/text2semantic/inference.py

@@ -1,8 +1,10 @@
 import os
 import queue
+import re
 import threading
 import time
 import traceback
+from copy import deepcopy
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Literal, Optional, Tuple, Union
@@ -13,13 +15,12 @@ import torch
 import torch._inductor.config
 from loguru import logger
 from tqdm import tqdm
-from transformers import AutoTokenizer
 
 from fish_speech.content_sequence import (
-    ContentSequence,
     TextPart,
     VQPart,
 )
+from fish_speech.conversation import Conversation, Message
 from fish_speech.tokenizer import IM_END_TOKEN
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -27,7 +28,6 @@ torch._inductor.config.coordinate_descent_tuning = True
 torch._inductor.config.triton.unique_kernel_names = True
 
 if hasattr(torch._inductor.config, "fx_graph_cache"):
-    # Experimental feature to reduce compilation times, will be on by default in future
     torch._inductor.config.fx_graph_cache = True
 
 
@@ -47,26 +47,23 @@ def multinomial_sample_one_no_sync(
     return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
 
 
+RAS_WIN_SIZE = 10   # window for Repetition Aware Sampling
+RAS_HIGH_TEMP = 1.0
+RAS_HIGH_TOP_P = 0.9
+
+
 def logits_to_probs(
     logits,
     temperature: torch.Tensor,
     top_p: torch.Tensor,
-    repetition_penalty: torch.Tensor,
-    previous_tokens: Optional[torch.Tensor] = None,
+    top_k: torch.Tensor,
 ) -> torch.Tensor:
-    # Apply repetition penalty
-    if previous_tokens is not None:
-        previous_tokens = previous_tokens.long()
-        score = torch.gather(logits, dim=-1, index=previous_tokens)
-        score = torch.where(
-            score < 0, score * repetition_penalty, score / repetition_penalty
-        )
-        logits.scatter_(dim=-1, index=previous_tokens, src=score)
-
-    # Apply top-p sampling
+    # Sort and compute top-p mask
     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
     cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
     sorted_indices_to_remove = cum_probs > top_p
+    # top-k mask
+    sorted_indices_to_remove[top_k:] = True
     sorted_indices_to_remove[0] = False  # keep at least one option
     indices_to_remove = sorted_indices_to_remove.scatter(
         dim=-1, index=sorted_indices, src=sorted_indices_to_remove
@@ -82,15 +79,13 @@ def sample(
     logits,
     temperature: torch.Tensor,
     top_p: torch.Tensor,
-    repetition_penalty: torch.Tensor,
-    previous_tokens: Optional[torch.Tensor] = None,
+    top_k: int,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     probs = logits_to_probs(
         logits=logits[0, -1],
         temperature=temperature,
         top_p=top_p,
-        repetition_penalty=repetition_penalty,
-        previous_tokens=previous_tokens,
+        top_k=top_k,
     )
     idx_next = multinomial_sample_one_no_sync(probs)
     return idx_next, probs
@@ -102,32 +97,44 @@ def decode_one_token_ar(
     input_pos: torch.Tensor,
     temperature: torch.Tensor,
     top_p: torch.Tensor,
-    repetition_penalty: torch.Tensor,
+    top_k: int,
+    semantic_logit_bias: torch.Tensor,
     audio_masks: torch.Tensor,
     audio_parts: torch.Tensor,
     previous_tokens: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
-    # print(x, torch.count_nonzero(vq_masks))
     forward_result = model.forward_generate(
         x,
         input_pos,
         audio_masks=audio_masks,
         audio_parts=audio_parts,
     )
-    logits = forward_result.logits  # [:, -1:]
-    hidden_states = forward_result.hidden_states  # [:, -1:]
+    logits = forward_result.logits  # (1, 1, vocab_size)
+    hidden_states = forward_result.hidden_states
 
-    codebooks = [
-        sample(
-            logits,
-            temperature=temperature,
-            top_p=top_p,
-            repetition_penalty=repetition_penalty,
-            previous_tokens=(
-                previous_tokens[:, 0] if previous_tokens is not None else None
-            ),
-        )[0]
-    ]
+    # Apply constrained decoding: only allow semantic tokens + im_end
+    biased_logits = logits + semantic_logit_bias
+
+    # Normal sample
+    main_token_normal = sample(biased_logits, temperature=temperature, top_p=top_p, top_k=top_k)[0]
+
+    # RAS: also sample with high temp to use as fallback if token repeats
+    high_temp = torch.tensor(RAS_HIGH_TEMP, device=temperature.device, dtype=temperature.dtype)
+    high_top_p = torch.tensor(RAS_HIGH_TOP_P, device=top_p.device, dtype=top_p.dtype)
+    main_token_high = sample(biased_logits, temperature=high_temp, top_p=high_top_p, top_k=top_k)[0]
+
+    # Use high-temp sample if: token is semantic AND token is in previous window
+    if previous_tokens is not None:
+        in_window = (previous_tokens[0] == main_token_normal).any()
+        # Use tensor ops (&, torch.where) instead of Python (and, if) — torch.compile requires no data-dependent branching
+        is_semantic = (
+            (main_token_normal >= model.config.semantic_begin_id)
+            & (main_token_normal <= model.config.semantic_end_id)
+        )
+        should_use_high = in_window & is_semantic
+        main_token_normal = torch.where(should_use_high, main_token_high, main_token_normal)
+
+    codebooks = [main_token_normal]
 
     # Only clear cache for fast_layers, avoid clearing main model cache
     for layer in model.fast_layers:
@@ -137,8 +144,11 @@ def decode_one_token_ar(
 
     input_pos = torch.tensor([0], device=hidden_states.device, dtype=torch.long)
     model.forward_generate_fast(hidden_states, input_pos)
-    a = codebooks[0] - model.tokenizer.semantic_begin_id
+    
+    # [MODIFIED] Access config instead of tokenizer
+    a = codebooks[0] - model.config.semantic_begin_id
     a[a < 0] = 0
+    a[a >= model.config.codebook_size] = 0
     hidden_states = model.fast_embeddings(a)
     codebooks.append(a)
 
@@ -148,19 +158,14 @@ def decode_one_token_ar(
         )
         logits = model.forward_generate_fast(hidden_states, input_pos)
 
-        short_logits = logits[:, :, :1024]
+        short_logits = logits # DualAR predicts config.codebook_size number of tokens
 
-        # Convert logits to probs
+        # Convert logits to probs (no constrain for fast codebooks)
         a = sample(
             short_logits,
             temperature=temperature,
             top_p=top_p,
-            repetition_penalty=repetition_penalty,
-            previous_tokens=(
-                previous_tokens[codebook_idx + 1]
-                if previous_tokens is not None
-                else None
-            ),
+            top_k=top_k,
         )[0]
 
         hidden_states = model.fast_embeddings(a)
@@ -181,53 +186,52 @@ def decode_n_tokens(
     num_new_tokens: int,
     temperature: torch.Tensor,
     top_p: torch.Tensor,
-    repetition_penalty: torch.Tensor,
+    top_k: int,
+    semantic_logit_bias: torch.Tensor,
     audio_masks: torch.Tensor,
     audio_parts: torch.Tensor,
     decode_one_token=decode_one_token_ar,
 ):
+    # Rolling window for RAS (Repetition Aware Sampling)
     previous_tokens = torch.zeros(
-        (model.config.num_codebooks + 1, model.config.max_seq_len),
+        (model.config.num_codebooks + 1, RAS_WIN_SIZE),
         dtype=torch.int,
         device=cur_token.device,
     )
+    # Accumulate all generated tokens (the actual output)
+    new_tokens = []
+    
+    # [MODIFIED] Pre-fetch ID for efficiency loop
+    im_end_id = model.tokenizer.get_token_id(IM_END_TOKEN)
 
     for i in tqdm(range(num_new_tokens)):
-        # We need to get windowed repeat penalty
-        win_size = 16
-        if i < win_size:
-            window = previous_tokens[:, :win_size]
-        else:
-            window = previous_tokens[:, i - win_size : i]
-
-        with sdpa_kernel(
-            SDPBackend.MATH
-        ):  # Actually better for Inductor to codegen attention here
+        with sdpa_kernel(SDPBackend.MATH):
             next_token = decode_one_token(
                 model=model,
                 x=cur_token,
                 input_pos=input_pos,
-                previous_tokens=window,
+                previous_tokens=previous_tokens,
                 temperature=temperature,
                 top_p=top_p,
-                repetition_penalty=repetition_penalty,
+                top_k=top_k,
+                semantic_logit_bias=semantic_logit_bias,
                 audio_masks=audio_masks,
                 audio_parts=audio_parts,
             ).clone()
 
         input_pos += 1
         cur_token = next_token.view(1, model.config.num_codebooks + 1, -1)
-        previous_tokens[:, i : i + 1] = next_token.view(
-            model.config.num_codebooks + 1, -1
-        )
+        # Roll RAS window left and insert new token at end
+        previous_tokens = previous_tokens.roll(-1, dims=1)
+        previous_tokens[:, -1] = next_token.view(model.config.num_codebooks + 1, -1)[:, 0]
+        new_tokens.append(next_token)
 
-        if cur_token[0, 0, -1] == model.tokenizer.get_token_id(IM_END_TOKEN):
+        if cur_token[0, 0, -1] == im_end_id:
             break
 
-    # Only clean up the large tensor
     del cur_token
 
-    return previous_tokens[:, : i + 1]
+    return torch.cat(new_tokens, dim=1)
 
 
 @torch.no_grad()
@@ -265,7 +269,8 @@ def generate(
         T_new = model.config.max_seq_len
         max_new_tokens = T_new - T
 
-    device, dtype = prompt.device, prompt.dtype
+    device = prompt.device
+    dtype = next(model.parameters()).dtype  # model weight dtype (bfloat16), NOT prompt dtype (int32)
 
     # Critical fix: Only set up cache on first run or when necessary
     if not hasattr(model, "_cache_setup_done") or not model._cache_setup_done:
@@ -282,35 +287,31 @@ def generate(
     # Create new tensor each time, but try to reuse memory
     input_pos = torch.arange(0, T, device=device, dtype=torch.long)
     empty = torch.empty(
-        (codebook_dim, model.config.max_seq_len), dtype=dtype, device=device
+        (codebook_dim, model.config.max_seq_len), dtype=prompt.dtype, device=device
     )
     empty[:, :T] = prompt
     seq = empty
 
-    # Use pre-created fixed parameter tensors
-    temperature = getattr(
-        model, "fixed_temperature", torch.tensor(0.8, device=device, dtype=torch.float)
-    )
-    top_p = getattr(
-        model, "fixed_top_p", torch.tensor(0.8, device=device, dtype=torch.float)
-    )
-    repetition_penalty = getattr(
-        model,
-        "fixed_repetition_penalty",
-        torch.tensor(1.1, device=device, dtype=torch.float),
-    )
+    temp_val = sampling_kwargs.get("temperature", 1.0)
+    top_p_val = sampling_kwargs.get("top_p", 0.9)
+    top_k_val = sampling_kwargs.get("top_k", 30)
 
-    # If different parameter values are needed, directly modify existing tensors
-    temp_val = sampling_kwargs.get("temperature", 0.7)
-    top_p_val = sampling_kwargs.get("top_p", 0.7)
-    rep_val = sampling_kwargs.get("repetition_penalty", 1.5)
+    temperature = torch.tensor(temp_val, device=device, dtype=dtype)
+    top_p = torch.tensor(top_p_val, device=device, dtype=dtype)
 
-    if abs(temperature.item() - temp_val) > 1e-6:
-        temperature.fill_(temp_val)
-    if abs(top_p.item() - top_p_val) > 1e-6:
-        top_p.fill_(top_p_val)
-    if abs(repetition_penalty.item() - rep_val) > 1e-6:
-        repetition_penalty.fill_(rep_val)
+    # Build semantic logit bias: 0 for semantic tokens + im_end, -inf for all others
+    vocab_size = model.config.vocab_size
+    semantic_logit_bias = torch.full(
+        (1, 1, vocab_size), float("-inf"), device=device, dtype=dtype
+    )
+    
+    # [MODIFIED] Use config for semantic range
+    semantic_logit_bias[
+        0, 0, model.config.semantic_begin_id : model.config.semantic_end_id + 1
+    ] = 0.0
+    
+    # [MODIFIED] Use tokenizer.get_token_id (Wrapper method)
+    semantic_logit_bias[0, 0, model.tokenizer.get_token_id(IM_END_TOKEN)] = 0.0
 
     prefill_decode = decode_one_token_ar
 
@@ -320,7 +321,8 @@ def generate(
         input_pos,
         temperature,
         top_p,
-        repetition_penalty,
+        top_k_val,
+        semantic_logit_bias,
         audio_masks,
         audio_parts,
     )
@@ -336,7 +338,8 @@ def generate(
         max_new_tokens - 1,
         temperature=temperature,
         top_p=top_p,
-        repetition_penalty=repetition_penalty,
+        top_k=top_k_val,
+        semantic_logit_bias=semantic_logit_bias,
         audio_masks=audio_masks,
         audio_parts=audio_parts,
         decode_one_token=decode_one_token,
@@ -358,7 +361,7 @@ def init_model(checkpoint_path, device, precision, compile=False):
 
     if isinstance(model, DualARTransformer):
         decode_one_token = decode_one_token_ar
-        prefill_n_tokens = decode_one_token_ar
+        # prefill_n_tokens = decode_one_token_ar
         logger.info("Using DualARTransformer")
     else:
         raise ValueError("Unsupported model type")
@@ -383,6 +386,60 @@ def init_model(checkpoint_path, device, precision, compile=False):
     return model.eval(), decode_one_token
 
 
@torch.inference_mode()
def load_codec_model(codec_checkpoint_path, device, precision=torch.bfloat16):
    """Instantiate the DAC codec from its YAML config and load checkpoint weights.

    Args:
        codec_checkpoint_path: Path to the codec ``.pth`` checkpoint.
        device: Target device for the codec.
        precision: Weight dtype to cast to (default ``torch.bfloat16``).

    Returns:
        The codec module in eval mode, moved to ``device``/``precision``.
    """
    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    # Config lives three levels up from this module, under configs/.
    cfg_file = Path(__file__).parent.parent.parent / "configs" / "modded_dac_vq.yaml"
    codec = instantiate(OmegaConf.load(str(cfg_file)))

    weights = torch.load(codec_checkpoint_path, map_location="cpu")
    # Lightning-style checkpoints nest weights under "state_dict".
    if "state_dict" in weights:
        weights = weights["state_dict"]
    # GAN-style checkpoints prefix generator weights; keep only those, unprefixed.
    if any("generator" in key for key in weights):
        weights = {
            key.replace("generator.", ""): value
            for key, value in weights.items()
            if "generator." in key
        }
    # NOTE(review): strict=False silently ignores missing/unexpected keys — confirm intended.
    codec.load_state_dict(weights, strict=False)
    codec.eval()
    codec.to(device=device, dtype=precision)
    return codec
+
+
@torch.inference_mode()
def encode_audio(audio_path, codec, device):
    """Encode an audio file into VQ codes with the codec model.

    Args:
        audio_path: Path to the input audio file.
        codec: Loaded codec model exposing ``sample_rate`` and ``encode``.
        device: Device to run encoding on.

    Returns:
        Tensor of codes with shape ``(num_codebooks, T)``.
    """
    import torchaudio

    waveform, orig_sr = torchaudio.load(str(audio_path))
    # Mix down to mono when the file has more than one channel.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    waveform = torchaudio.functional.resample(
        waveform.to(device), orig_sr, codec.sample_rate
    )[0]

    # Cast input to the codec's weight dtype (e.g. bfloat16).
    codec_dtype = next(codec.parameters()).dtype
    batch = waveform[None, None].to(dtype=codec_dtype)  # (1, 1, T)
    lengths = torch.tensor([len(waveform)], device=device, dtype=torch.long)

    indices, feature_lengths = codec.encode(batch, lengths)
    # Trim padding: keep only the valid frames reported by the codec.
    return indices[0, :, : feature_lengths[0]]  # (num_codebooks, T)
+
+
@torch.inference_mode()
def decode_to_audio(codes, codec):
    """Decode VQ codes back into a mono waveform.

    Args:
        codes: Tensor of shape ``(num_codebooks, T)``.
        codec: Codec model exposing ``from_indices``.

    Returns:
        1-D waveform tensor of shape ``(T,)``.
    """
    batched = codes[None]  # add batch dim -> (1, num_codebooks, T)
    waveform = codec.from_indices(batched)
    return waveform[0, 0]  # first batch, first channel
+
+
 @dataclass
 class GenerateResponse:
     action: Literal["sample", "next"]
@@ -390,6 +447,75 @@ class GenerateResponse:
     text: Optional[str] = None
 
 
def split_text_by_speaker(text: str) -> list[str]:
    """Split text into speaker turns at ``<|speaker:X|>`` tags.

    Each returned turn starts with its speaker tag and contains the text up
    to the next tag. Text appearing before the first tag is discarded, and
    input with no tags yields an empty list.

    Args:
        text: Full text containing zero or more speaker tags.

    Returns:
        List of stripped turns, each beginning with ``<|speaker:X|>``.
    """
    tag_pattern = r"(<\|speaker:\d+\|>)"
    # Capturing group makes re.split keep the tags as their own list items.
    pieces = re.split(tag_pattern, text)

    turns: list[str] = []
    idx = 0
    total = len(pieces)
    while idx < total:
        piece = pieces[idx].strip()
        if re.match(tag_pattern, piece):
            if idx + 1 < total:
                # Tag followed by its spoken text: merge them into one turn.
                turns.append((piece + pieces[idx + 1]).strip())
                idx += 2
            else:
                # Trailing tag with no text after it.
                turns.append(piece)
                idx += 1
        else:
            # Non-tag piece outside a tag pairing (e.g. leading text): skip.
            idx += 1

    return turns
+
+
def group_turns_into_batches(
    turns: list[str], max_speakers: int = 3, max_bytes: int = 300
) -> list[str]:
    """Pack speaker turns into newline-joined batches.

    A batch is closed once it already holds ``max_speakers`` turns, or once
    adding the next turn would push its UTF-8 byte total past ``max_bytes``.
    The byte check only closes a non-empty batch, so a single oversized turn
    still forms its own batch. The joining newlines are not counted toward
    the byte budget.

    Args:
        turns: Speaker turns to group.
        max_speakers: Maximum turns per batch (default 3).
        max_bytes: Soft UTF-8 byte budget per batch (default 300).

    Returns:
        List of batch strings, each the newline-join of its turns.
    """
    batches: list[str] = []
    pending: list[str] = []
    pending_bytes = 0

    for turn in turns:
        size = len(turn.encode("utf-8"))

        full_by_count = len(pending) >= max_speakers
        full_by_bytes = pending and pending_bytes + size > max_bytes

        if full_by_count or full_by_bytes:
            # Flush the current batch and start a fresh one with this turn.
            batches.append("\n".join(pending))
            pending = [turn]
            pending_bytes = size
        else:
            pending.append(turn)
            pending_bytes += size

    if pending:
        batches.append("\n".join(pending))

    return batches
+
+
 def generate_long(
     *,
     model,
@@ -398,9 +524,10 @@ def generate_long(
     text: str,
     num_samples: int = 1,
     max_new_tokens: int = 0,
-    top_p: float = 0.8,
+    top_p: float = 0.9,
+    top_k: int = 30,
     repetition_penalty: float = 1.1,
-    temperature: float = 0.8,
+    temperature: float = 1.0,
     compile: bool = False,
     iterative_prompt: bool = True,
     chunk_length: int = 512,
@@ -408,10 +535,9 @@ def generate_long(
     prompt_tokens: Optional[Union[torch.Tensor, list[torch.Tensor]]] = None,
 ):
     assert 0 < top_p <= 1, "top_p must be in (0, 1]"
-    assert 0 < repetition_penalty < 2, "repetition_penalty must be in (0, 2)"
     assert 0 < temperature < 2, "temperature must be in (0, 2)"
 
-    use_prompt = prompt_text is not None and prompt_tokens is not None
+    use_prompt = bool(prompt_text) and bool(prompt_tokens)
     if use_prompt and isinstance(prompt_text, str):
         prompt_text = [prompt_text]
         prompt_tokens = [prompt_tokens]
@@ -426,91 +552,188 @@ def generate_long(
 
     model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
     tokenizer = model.tokenizer
-    base_content_sequence = ContentSequence(modality="interleave")
-
     max_length = model.config.max_seq_len
+
+    # Build base conversation with system message
+    base_conversation = Conversation()
+
+
     if use_prompt:
-        for t, c in zip(prompt_text, prompt_tokens):
-            base_content_sequence.append(
-                [
-                    TextPart(text=t),
-                    VQPart(codes=c),
-                ],
-                add_end=True,
-                speaker=0,
-            )
-    base_content_sequence.append(
-        [
-            TextPart(text=text),
-        ],
-        add_end=False,
-        speaker=0,
+        # Auto-add speaker tags to prompt texts that don't have them
+        tagged_prompt_text = []
+        for i, t in enumerate(prompt_text):
+            if not re.search(r"<\|speaker:\d+\|>", t):
+                tagged_prompt_text.append(f"<|speaker:{i}|>{t}")
+            else:
+                tagged_prompt_text.append(t)
+
+        system_parts = [
+            TextPart(
+                text="convert the provided text to speech reference to the following:\n\nText:\n",
+                cal_loss=False,
+            ),
+        ]
+        reference_text = "\n".join(tagged_prompt_text)
+        system_parts.append(TextPart(text=reference_text, cal_loss=False))
+        system_parts.append(TextPart(text="\n\nSpeech:\n", cal_loss=False))
+        all_codes = torch.cat([c for c in prompt_tokens], dim=1)
+        system_parts.append(VQPart(codes=all_codes, cal_loss=False))
+        # torch.save(all_codes, "debug_vq_codes.pt")
+    else:
+        system_parts = [
+            TextPart(text="convert the provided text to speech", cal_loss=False)
+        ]
+
+    base_conversation.append(
+        Message(
+            role="system",
+            parts=system_parts,
+            cal_loss=False,
+            add_im_start=True,
+            add_im_end=True,
+        )
     )
 
-    encoded, audio_masks, audio_parts = base_content_sequence.encode_for_inference(
-        tokenizer, num_codebooks=model.config.num_codebooks
-    )
-    if encoded.size(1) > max_length - 2048:
-        raise ValueError(f"Prompt is too long: {encoded.size(1)} > {max_length - 2048}")
+    # Split text by speaker and group into batches
+    turns = split_text_by_speaker(text)
+    if turns:
+        batches = group_turns_into_batches(
+            turns, max_speakers=5, max_bytes=chunk_length
+        )
+    else:
+        batches = [text]
 
-    encoded = encoded.to(device=device)
-    logger.info(f"Encoded text: {text}")
+    logger.info(
+        f"Split into {len(turns)} turns, grouped into {len(batches)} batches"
+    )
 
     for sample_idx in range(num_samples):
         if torch.cuda.is_available():
             torch.cuda.synchronize()
 
-        global_encoded = []
-        seg_idx = 0
-        prompt_length = encoded.size(1)
-
         t0 = time.perf_counter()
 
-        y = generate(
-            model=model,
-            prompt=encoded,
-            max_new_tokens=max_new_tokens,
-            audio_masks=audio_masks,
-            audio_parts=audio_parts,
-            decode_one_token=decode_one_token,
-            temperature=temperature,
-            top_p=top_p,
-            repetition_penalty=repetition_penalty,
-        )
+        # Deep copy base conversation for this sample
+        conversation = deepcopy(base_conversation)
 
-        if sample_idx == 0 and seg_idx == 0 and compile:
-            logger.info(f"Compilation time: {time.perf_counter() - t0:.2f} seconds")
+        for batch_idx, batch_text in enumerate(batches):
+            logger.info(
+                f"--- Sample {sample_idx}, Batch {batch_idx} "
+                f"({len(batch_text.encode('utf-8'))} bytes) ---"
+            )
+            logger.info(f"Batch text: {batch_text}")
+
+            # Add user message
+            conversation.append(
+                Message(
+                    role="user",
+                    parts=[TextPart(text=batch_text, cal_loss=False)],
+                    cal_loss=False,
+                    add_im_start=True,
+                    add_im_end=True,
+                )
+            )
 
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
+            # Deep copy for generation (don't pollute original conversation)
+            conversation_gen = deepcopy(conversation)
+            conversation_gen.append(
+                Message(
+                    role="assistant",
+                    parts=[],
+                    cal_loss=False,
+                    modality="voice",
+                    add_im_start=True,
+                    add_im_end=False,
+                )
+            )
 
-        t = time.perf_counter() - t0
+            logger.info("Visualizing prompt structure:")
+            conversation_gen.visualize(
+                tokenizer,
+                merge_audio_tokens=True,
+                merge_semantic_tokens=True,
+            )
 
-        tokens_generated = y.size(1) - prompt_length
-        tokens_sec = tokens_generated / t
-        logger.info(
-            f"Generated {tokens_generated} tokens in {t:.02f} seconds, {tokens_sec:.02f} tokens/sec"
-        )
-        logger.info(f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s")
+            encoded, audio_masks, audio_parts = (
+                conversation_gen.encode_for_inference(
+                    tokenizer, num_codebooks=model.config.num_codebooks
+                )
+            )
 
-        if torch.cuda.is_available():
+            logger.info(f"Encoded prompt shape: {encoded.shape}")
+            if audio_parts is not None:
+                logger.info(f"Audio parts shape: {audio_parts.shape}")
+            if audio_masks is not None:
+                logger.info(
+                    f"Audio masks non-zero count: {torch.count_nonzero(audio_masks)}"
+                )
+
+            if encoded.size(1) > max_length - 2048:
+                raise ValueError(
+                    f"Prompt is too long: {encoded.size(1)} > {max_length - 2048}"
+                )
+
+            encoded = encoded.to(device=device)
+            prompt_length = encoded.size(1)
+
+            y = generate(
+                model=model,
+                prompt=encoded,
+                max_new_tokens=max_new_tokens,
+                audio_masks=audio_masks,
+                audio_parts=audio_parts,
+                decode_one_token=decode_one_token,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+            )
+
+            if sample_idx == 0 and batch_idx == 0 and compile:
+                logger.info(
+                    f"Compilation time: {time.perf_counter() - t0:.2f} seconds"
+                )
+
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+
+            t_batch = time.perf_counter() - t0
+            tokens_generated = y.size(1) - prompt_length
+            tokens_sec = tokens_generated / t_batch if t_batch > 0 else 0
             logger.info(
-                f"GPU Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB"
+                f"Batch {batch_idx}: Generated {tokens_generated} tokens in "
+                f"{t_batch:.02f} seconds, {tokens_sec:.02f} tokens/sec"
+            )
+            logger.info(
+                f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s"
             )
 
-        # Put the generated tokens
-        codes = y[1:, prompt_length:-1].clone()
-        assert (codes >= 0).all(), f"Negative code found"
+            # Extract generated codes
+            codes = y[1:, prompt_length:-1].clone()
+            assert (codes >= 0).all(), f"Negative code found: {codes}"
+
+            # Add assistant message with generated codes back to conversation
+            conversation.append(
+                Message(
+                    role="assistant",
+                    parts=[VQPart(codes=codes.cpu(), cal_loss=False)],
+                    cal_loss=False,
+                    modality="voice",
+                    add_im_start=True,
+                    add_im_end=True,
+                )
+            )
 
-        decoded = y[:, prompt_length:].clone()
-        global_encoded.append(decoded.cpu())
-        assert (codes >= 0).all(), f"Negative code found: {codes}"
+            yield GenerateResponse(
+                action="sample", codes=codes, text=batch_text
+            )
 
-        yield GenerateResponse(action="sample", codes=codes, text=text)
-        seg_idx += 1
+            # Cleanup
+            del y, encoded
 
-        # Force GPU memory cleanup
-        del y, decoded, codes
+        if torch.cuda.is_available():
+            logger.info(
+                f"GPU Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB"
+            )
 
         yield GenerateResponse(action="next")
 
@@ -585,7 +808,7 @@ def launch_thread_safe_queue(
 @click.option(
     "--text",
     type=str,
-    default="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
+    default="<|speaker:0|>你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
 )
 @click.option("--prompt-text", type=str, default=None, multiple=True)
 @click.option(
@@ -594,15 +817,22 @@ def launch_thread_safe_queue(
     default=None,
     multiple=True,
 )
+@click.option(
+    "--prompt-audio",
+    type=click.Path(path_type=Path, exists=True),
+    default=None,
+    multiple=True,
+)
+@click.option("--output", type=click.Path(path_type=Path), default=None)
 @click.option("--num-samples", type=int, default=1)
 @click.option("--max-new-tokens", type=int, default=0)
-@click.option("--top-p", type=float, default=0.8)
-@click.option("--repetition-penalty", type=float, default=1.1)
-@click.option("--temperature", type=float, default=0.8)
+@click.option("--top-p", type=float, default=0.9)
+@click.option("--top-k", type=int, default=30)
+@click.option("--temperature", type=float, default=1.0)
 @click.option(
     "--checkpoint-path",
     type=click.Path(path_type=Path, exists=True),
-    default="checkpoints/openaudio-s1-mini",
+    default="checkpoints/s2-pro",
 )
 @click.option("--device", type=str, default="cuda")
 @click.option("--compile/--no-compile", default=False)
@@ -610,15 +840,17 @@ def launch_thread_safe_queue(
 @click.option("--half/--no-half", default=False)
 @click.option("--iterative-prompt/--no-iterative-prompt", default=True)
 @click.option("--chunk-length", type=int, default=300)
-@click.option("--output-dir", type=Path, default="temp")
+@click.option("--output-dir", type=Path, default="output")
 def main(
     text: str,
     prompt_text: Optional[tuple[str, ...]],
     prompt_tokens: Optional[tuple[Path, ...]],
+    prompt_audio: Optional[tuple[Path, ...]],
+    output: Optional[Path],
     num_samples: int,
     max_new_tokens: int,
-    top_p: int,
-    repetition_penalty: float,
+    top_p: float,
+    top_k: int,
     temperature: float,
     checkpoint_path: Path,
     device: str,
@@ -632,14 +864,26 @@ def main(
     os.makedirs(output_dir, exist_ok=True)
     precision = torch.half if half else torch.bfloat16
 
+    if prompt_text and not prompt_audio and not prompt_tokens:
+        raise ValueError(
+            "--prompt-text requires either --prompt-audio or --prompt-tokens"
+        )
     if (
-        prompt_text is not None
-        and prompt_tokens is not None
+        prompt_text
+        and prompt_tokens
         and len(prompt_text) != len(prompt_tokens)
     ):
         raise ValueError(
             f"Number of prompt text ({len(prompt_text)}) and prompt tokens ({len(prompt_tokens)}) should be the same"
         )
+    if (
+        prompt_text
+        and prompt_audio
+        and len(prompt_text) != len(prompt_audio)
+    ):
+        raise ValueError(
+            f"Number of prompt text ({len(prompt_text)}) and prompt audio ({len(prompt_audio)}) should be the same"
+        )
 
     logger.info("Loading model ...")
     t0 = time.time()
@@ -657,8 +901,21 @@ def main(
 
     logger.info(f"Time to load model: {time.time() - t0:.02f} seconds")
 
+    codec = None
+    codec_checkpoint = checkpoint_path / "codec.pth"
+
+    # Handle prompt: --prompt-audio takes priority over --prompt-tokens
     prompt_tokens_list = None
-    if prompt_tokens is not None:
+    if prompt_audio:
+        logger.info("Loading codec model for audio encoding...")
+        codec = load_codec_model(codec_checkpoint, device, precision)
+        prompt_tokens_list = [
+            encode_audio(p, codec, device).cpu() for p in prompt_audio
+        ]
+        logger.info(
+            f"Encoded {len(prompt_audio)} audio file(s) to VQ codes"
+        )
+    elif prompt_tokens is not None:
         prompt_tokens_list = [torch.from_numpy(np.load(p)) for p in prompt_tokens]
 
     torch.manual_seed(seed)
@@ -674,7 +931,7 @@ def main(
         num_samples=num_samples,
         max_new_tokens=max_new_tokens,
         top_p=top_p,
-        repetition_penalty=repetition_penalty,
+        top_k=top_k,
         temperature=temperature,
         compile=compile,
         iterative_prompt=iterative_prompt,
@@ -692,9 +949,29 @@ def main(
             logger.info(f"Sampled text: {response.text}")
         elif response.action == "next":
             if codes:
+                merged_codes = torch.cat(codes, dim=1)
                 codes_npy_path = os.path.join(output_dir, f"codes_{idx}.npy")
-                np.save(codes_npy_path, torch.cat(codes, dim=1).cpu().numpy())
+                np.save(codes_npy_path, merged_codes.cpu().numpy())
                 logger.info(f"Saved codes to {codes_npy_path}")
+
+                # Decode to wav if --output is specified
+                if output:
+                    if codec is None:
+                        logger.info("Loading codec model for audio decoding...")
+                        codec = load_codec_model(
+                            codec_checkpoint, device, precision
+                        )
+                    audio = decode_to_audio(merged_codes.to(device), codec)
+                    import soundfile as sf
+
+                    out_path = (
+                        str(output)
+                        if num_samples == 1
+                        else str(output.with_stem(f"{output.stem}_{idx}"))
+                    )
+                    sf.write(out_path, audio.cpu().float().numpy(), codec.sample_rate)
+                    logger.info(f"Saved audio to {out_path}")
+
             logger.info(f"Next sample")
             codes = []
             idx += 1
@@ -703,4 +980,4 @@ def main(
 
 
 if __name__ == "__main__":
-    main()
+    main()

+ 167 - 81
fish_speech/models/text2semantic/llama.py

@@ -14,10 +14,8 @@ from torch import Tensor
 from torch.nn import functional as F
 from torch.nn.attention import SDPBackend, sdpa_kernel
 from torch.utils.checkpoint import checkpoint
-from transformers import AutoTokenizer
 
 from fish_speech.models.text2semantic.lora import LoraConfig, setup_lora
-from fish_speech.tokenizer import SEMANTIC_TOKENS, FishTokenizer
 
 
 def find_multiple(n: int, k: int) -> int:
@@ -49,6 +47,9 @@ class BaseModelArgs:
     # Codebook configs
     codebook_size: int = 160
     num_codebooks: int = 4
+    
+    semantic_begin_id: int = 0
+    semantic_end_id: int = 0
 
     # Gradient checkpointing
     use_gradient_checkpointing: bool = True
@@ -59,6 +60,7 @@ class BaseModelArgs:
     # Dummy vars
     is_reward_model: bool = False
     scale_codebook_embeddings: bool = False
+    audio_embed_dim: Optional[int] = None
 
     def __post_init__(self):
         if self.n_local_heads == -1:
@@ -85,11 +87,61 @@ class BaseModelArgs:
                 cls = NaiveModelArgs
             case "dual_ar":
                 cls = DualARModelArgs
+            case "fish_qwen3_omni":
+                return BaseModelArgs._from_fish_qwen3_omni(data)
             case _:
                 raise ValueError(f"Unknown model type: {data['model_type']}")
 
+        # Filter out unexpected keyword arguments
+        valid_keys = {f.name for f in dataclasses.fields(cls)}
+        data = {k: v for k, v in data.items() if k in valid_keys}
+
         return cls(**data)
 
    @staticmethod
    def _from_fish_qwen3_omni(data: dict) -> "DualARModelArgs":
        """Flatten a nested ``fish_qwen3_omni`` config dict into DualARModelArgs.

        The source format keeps slow (text) transformer settings under
        ``text_config`` and fast (audio decoder) transformer settings under
        ``audio_decoder_config``; this maps them onto the flat DualAR field
        names, with audio-decoder fields taking the ``fast_`` prefix.

        Args:
            data: Parsed config dict whose ``model_type`` is ``fish_qwen3_omni``.

        Returns:
            A ``DualARModelArgs`` populated from both nested sections.
        """
        tc = data["text_config"]
        adc = data["audio_decoder_config"]
        flat = dict(
            model_type="dual_ar",
            vocab_size=tc["vocab_size"],
            n_layer=tc["n_layer"],
            n_head=tc["n_head"],
            n_local_heads=tc.get("n_local_heads", -1),
            head_dim=tc.get("head_dim"),
            dim=tc["dim"],
            intermediate_size=tc.get("intermediate_size"),
            rope_base=tc.get("rope_base", 10000),
            norm_eps=tc.get("norm_eps", 1e-5),
            max_seq_len=tc.get("max_seq_len", 2048),
            dropout=tc.get("dropout", 0.0),
            tie_word_embeddings=tc.get("tie_word_embeddings", True),
            attention_qkv_bias=tc.get("attention_qkv_bias", False),
            attention_o_bias=tc.get("attention_o_bias", False),
            attention_qk_norm=tc.get("attention_qk_norm", False),
            use_gradient_checkpointing=tc.get("use_gradient_checkpointing", True),
            initializer_range=tc.get("initializer_range", 0.02),
            # Token-id range delimiting semantic audio tokens in the text vocab.
            semantic_begin_id=data.get("semantic_start_token_id", 0),
            semantic_end_id=data.get("semantic_end_token_id", 0),
            scale_codebook_embeddings=True,
            norm_fastlayer_input=True,
            # Falls back to the text model width if the decoder omits text_dim.
            audio_embed_dim=adc.get("text_dim", tc["dim"]),
            codebook_size=adc["vocab_size"],
            num_codebooks=adc["num_codebooks"],
            n_fast_layer=adc["n_layer"],
            fast_dim=adc.get("dim"),
            fast_n_head=adc.get("n_head"),
            fast_n_local_heads=adc.get("n_local_heads"),
            fast_head_dim=adc.get("head_dim"),
            fast_intermediate_size=adc.get("intermediate_size"),
            fast_attention_qkv_bias=adc.get("attention_qkv_bias"),
            fast_attention_qk_norm=adc.get("attention_qk_norm"),
            fast_attention_o_bias=adc.get("attention_o_bias"),
        )
        # Drop keys the dataclass doesn't declare, and drop None values so
        # DualARModelArgs defaults apply instead.
        valid_keys = {f.name for f in dataclasses.fields(DualARModelArgs)}
        flat = {k: v for k, v in flat.items() if k in valid_keys and v is not None}
        return DualARModelArgs(**flat)
+
     def save(self, path: str):
         with open(path, "w") as f:
             json.dump(self.__dict__, f, indent=4, sort_keys=True, ensure_ascii=False)
@@ -112,6 +164,7 @@ class DualARModelArgs(BaseModelArgs):
     fast_attention_qkv_bias: bool | None = None
     fast_attention_qk_norm: bool | None = None
     fast_attention_o_bias: bool | None = None
+    norm_fastlayer_input: bool = False
 
     def __post_init__(self):
         super().__post_init__()
@@ -173,17 +226,30 @@ class BaseTransformerForwardResult:
     hidden_states: Tensor
 
 
+def _remap_fish_qwen3_omni_keys(weights: OrderedDict) -> OrderedDict:
+    if not any(k.startswith(("text_model.", "audio_decoder.")) for k in weights):
+        return weights
+    new_weights = OrderedDict()
+    for k, v in weights.items():
+        if k.startswith("text_model.model."):
+            new_key = k[len("text_model.model."):]
+        elif k.startswith("audio_decoder."):
+            suffix = k[len("audio_decoder."):]
+            new_key = suffix if suffix.startswith("codebook_embeddings.") else "fast_" + suffix
+        else:
+            new_key = k
+        new_weights[new_key] = v
+    return new_weights
+
+
 class BaseTransformer(nn.Module):
     def __init__(
         self,
         config: BaseModelArgs,
-        tokenizer: FishTokenizer,
         init_weights: bool = True,
     ) -> None:
         super().__init__()
         self.config = config
-        self.tokenizer = tokenizer
-        self.semantic_token_ids = list(tokenizer.semantic_id_to_token_id.values())
 
         # Slow transformer
         self.embeddings = nn.Embedding(
@@ -255,9 +321,6 @@ class BaseTransformer(nn.Module):
 
     def embed(self, inp: Tensor) -> Tensor:
         embeds = []
-        semantic_token_ids_tensor = torch.tensor(
-            self.semantic_token_ids, device=inp.device, dtype=inp.dtype
-        )
 
         for i in range(self.config.num_codebooks):
             emb = self.codebook_embeddings(
@@ -266,7 +329,12 @@ class BaseTransformer(nn.Module):
             embeds.append(emb)
 
         vq_embeds_sum = torch.stack(embeds, dim=1).sum(dim=1)
-        vq_embeds_sum[~torch.isin(inp[:, 0], semantic_token_ids_tensor)] = 0
+        
+        is_semantic = (inp[:, 0] >= self.config.semantic_begin_id) & \
+                      (inp[:, 0] <= self.config.semantic_end_id)
+        
+        vq_embeds_sum[~is_semantic] = 0
+        
         x = self.embeddings(inp[:, 0]) + vq_embeds_sum
 
         return x
@@ -283,9 +351,6 @@ class BaseTransformer(nn.Module):
 
         freqs_cis = self.freqs_cis[:seq_len]
 
-        # Not that the causal mask here follows the definition of scaled_dot_product_attention
-        # That is, FALSE means masked out
-        # To maintain consistency, key_padding_mask use TRUE to mask out
         mask = None
         if key_padding_mask is not None:
             causal = self.causal_mask[:seq_len, :seq_len]
@@ -295,15 +360,12 @@ class BaseTransformer(nn.Module):
             atten_mask = atten_mask.logical_not()
             mask = causal & atten_mask
 
-        # return freqs_cis, mask
-
         for layer in self.layers:
             if self.config.use_gradient_checkpointing and self.training:
                 x = checkpoint(layer, x, freqs_cis, mask, use_reentrant=True)
             else:
                 x = layer(x, freqs_cis, mask)
 
-        # We got slow_out here
         slow_out = self.norm(x)
 
         if self.config.tie_word_embeddings:
@@ -311,9 +373,15 @@ class BaseTransformer(nn.Module):
         else:
             token_logits = self.output(slow_out)
 
+        hidden_out = (
+            slow_out
+            if getattr(self.config, "norm_fastlayer_input", False)
+            else x
+        )
+
         return BaseTransformerForwardResult(
             logits=token_logits,
-            hidden_states=x,
+            hidden_states=hidden_out,
         )
 
     def forward_generate(
@@ -324,11 +392,8 @@ class BaseTransformer(nn.Module):
         audio_parts: Optional[Tensor] = None,
         return_all: bool = False,
     ) -> BaseTransformerForwardResult:
-        # This is used for generation, optimized for torch compile
-        # assert (
-        #     self.max_seq_len != -1 and self.max_batch_size != -1
-        # ), "Please call setup_caches before forward_generate"
-
+        
+        # Embedding logic replicated from embed() for compilation compatibility
         embeds = []
         for i in range(self.config.num_codebooks):
             emb = self.codebook_embeddings(
@@ -338,15 +403,14 @@ class BaseTransformer(nn.Module):
 
         vq_embeds_sum = torch.stack(embeds, dim=1).sum(dim=1)
 
-        vq_masks = (inp[:, 0] >= self.tokenizer.semantic_begin_id) & (
-            inp[:, 0] <= self.tokenizer.semantic_end_id
+        vq_masks = (inp[:, 0] >= self.config.semantic_begin_id) & (
+            inp[:, 0] <= self.config.semantic_end_id
         )
 
         vq_embeds_sum[~vq_masks] = 0
         x = self.embeddings(inp[:, 0]) + vq_embeds_sum
 
         if self.config.scale_codebook_embeddings:
-            # Expand vq_masks to match x's shape
             vq_masks_expanded = vq_masks.unsqueeze(-1).expand_as(x)
             x = torch.where(
                 vq_masks_expanded, x / math.sqrt(self.config.num_codebooks + 1), x
@@ -354,11 +418,16 @@ class BaseTransformer(nn.Module):
 
         # Audio embeddings
         if audio_parts is not None:
-            audio_embeds = self.audio_projector(audio_parts)
-            if self.config.scale_codebook_embeddings:
-                x[audio_masks] = audio_embeds / math.sqrt(2)
+            # Note: This assumes self.audio_projector exists if audio_parts is used
+            # It seems missing in init, but we keep existing logic
+            if hasattr(self, "audio_projector"):
+                audio_embeds = self.audio_projector(audio_parts)
+                if self.config.scale_codebook_embeddings:
+                    x[audio_masks] = audio_embeds / math.sqrt(2)
+                else:
+                    x[audio_masks] = audio_embeds
             else:
-                x[audio_masks] = audio_embeds
+                logger.warning("audio_parts provided but model has no audio_projector")
 
         if input_pos is None:
             input_pos = torch.arange(inp.shape[-1], device=x.device)
@@ -372,11 +441,9 @@ class BaseTransformer(nn.Module):
         for layer in self.layers:
             x = layer(x, freqs_cis, mask, input_pos=input_pos)
 
-        # If prefill, we only calculate the logits of last token
         if x.size(1) > 1 and not return_all:
             x = x[:, -1:]
 
-        # We got slow_out here
         slow_out = self.norm(x)
 
         if self.config.is_reward_model:
@@ -386,9 +453,15 @@ class BaseTransformer(nn.Module):
         else:
             token_logits = self.output(slow_out)
 
+        hidden_out = (
+            slow_out
+            if getattr(self.config, "norm_fastlayer_input", False)
+            else x
+        )
+
         return BaseTransformerForwardResult(
             logits=token_logits,
-            hidden_states=x,
+            hidden_states=hidden_out,
         )
 
     def _init_weights(self, module):
@@ -410,6 +483,9 @@ class BaseTransformer(nn.Module):
         lora_config: LoraConfig | None = None,
         rope_base: int | None = None,
     ) -> "BaseTransformer":
+        # Import wrapper locally to avoid circular dependency or global import issues
+        from fish_speech.tokenizer import FishTokenizer
+
         config = BaseModelArgs.from_pretrained(str(path))
         if max_length is not None:
             config.max_seq_len = max_length
@@ -419,6 +495,14 @@ class BaseTransformer(nn.Module):
             config.rope_base = rope_base
             logger.info(f"Override rope_base to {rope_base}")
 
+        try:
+            tokenizer = FishTokenizer.from_pretrained(path)
+            config.semantic_begin_id = tokenizer.semantic_begin_id
+            config.semantic_end_id = tokenizer.semantic_end_id
+            logger.info(f"Injected Semantic IDs into Config: {config.semantic_begin_id}-{config.semantic_end_id}")
+        except Exception as e:
+            logger.warning(f"Failed to load tokenizer for config injection: {e}. Semantic IDs might be 0.")
+
         match config.model_type:
             case "naive":
                 model_cls = NaiveTransformer
@@ -427,19 +511,18 @@ class BaseTransformer(nn.Module):
             case _:
                 raise ValueError(f"Unknown model type: {config.model_type}")
 
-        tokenizer = FishTokenizer.from_pretrained(path)
-
         logger.info(f"Loading model from {path}, config: {config}")
-        model = model_cls(config, tokenizer=tokenizer)
+        # Initialize model without passing tokenizer explicitly to __init__
+        model = model_cls(config)
+        # Attach tokenizer to model instance for inference convenience (optional, but good for user scripts)
+        model.tokenizer = tokenizer
 
         if load_weights is False:
             logger.info("Randomly initialized model")
         else:
-
             if "int8" in str(Path(path)):
                 logger.info("Using int8 weight-only quantization!")
                 from tools.llama.quantize import WeightOnlyInt8QuantHandler
-
                 simple_quantizer = WeightOnlyInt8QuantHandler(model)
                 model = simple_quantizer.convert_for_runtime()
 
@@ -449,46 +532,47 @@ class BaseTransformer(nn.Module):
                 assert path_comps[-2].startswith("g")
                 groupsize = int(path_comps[-2][1:])
                 from tools.llama.quantize import WeightOnlyInt4QuantHandler
-
                 simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
                 model = simple_quantizer.convert_for_runtime()
 
-            weights = torch.load(
-                Path(path) / "model.pth",
-                map_location="cpu",
-                mmap=True,
-                weights_only=True,
-            )
-
-            if "state_dict" in weights:
-                logger.warning(
-                    "Using a TextToSemantic LightningModule checkpoint, "
-                    "please make sure it is a full model, not a LoRA model."
-                )
-                weights = weights["state_dict"]
-
-            if next(iter(weights.keys())).startswith("model."):
-                logger.info(
-                    f"Remove prefix 'model.' created by TextToSemantic LightningModule from keys"
+            path_obj = Path(path)
+            index_json = path_obj / "model.safetensors.index.json"
+            single_st = path_obj / "model.safetensors"
+            pth_file = path_obj / "model.pth"
+
+            if index_json.exists():
+                logger.info("Loading sharded safetensors weights")
+                from safetensors.torch import load_file as st_load_file
+                with open(index_json) as f:
+                    st_index = json.load(f)
+                shard_files = sorted(set(st_index["weight_map"].values()))
+                weights = OrderedDict()
+                for shard in shard_files:
+                    weights.update(st_load_file(str(path_obj / shard), device="cpu"))
+                weights = _remap_fish_qwen3_omni_keys(weights)
+            elif single_st.exists():
+                logger.info("Loading single safetensors weights")
+                from safetensors.torch import load_file as st_load_file
+                weights = OrderedDict(st_load_file(str(single_st), device="cpu"))
+                weights = _remap_fish_qwen3_omni_keys(weights)
+            elif pth_file.exists():
+                weights = torch.load(
+                    pth_file,
+                    map_location="cpu",
+                    mmap=True,
+                    weights_only=True,
                 )
-                new_weights = OrderedDict()
-                for k, v in weights.items():
-                    new_weights[k.replace("model.", "")] = v
-                weights = new_weights
-
-            # Remove audio related weights
-            for k in list(weights.keys()):
-                if "audio_" in k:
-                    weights.pop(k)
-
-            # Verify the name and shape of parameters since strict=False in load_state_dict.
-            for k, v in model.named_parameters():
-                if k not in weights:
-                    logger.warning(f"No weight for {k}")
-                elif v.shape != weights[k].shape:
-                    logger.warning(
-                        f"Shape mismatch for {k}: {v.shape} vs {weights[k].shape}"
+                if "state_dict" in weights:
+                    weights = weights["state_dict"]
+                if weights and next(iter(weights.keys())).startswith("model."):
+                    weights = OrderedDict(
+                        (k.replace("model.", ""), v) for k, v in weights.items()
                     )
+                for k in list(weights.keys()):
+                    if "audio_" in k:
+                        weights.pop(k)
+            else:
+                raise FileNotFoundError(f"No model weights found in {path_obj}")
 
             err = model.load_state_dict(weights, strict=False, assign=True)
             logger.info(f"Model weights loaded - Status: {err}")
@@ -510,17 +594,16 @@ class BaseTransformer(nn.Module):
             for key in list(state_dict.keys()):
                 if "lora" not in key:
                     continue
-
                 state_dict.pop(key)
-                logger.info(f"Drop LoRA parameter: {key}")
 
         torch.save(state_dict, path / "model.pth")
-        self.tokenizer.save_pretrained(path)
+        if hasattr(self, "tokenizer"):
+            self.tokenizer.save_pretrained(path)
 
 
 class NaiveTransformer(BaseTransformer):
-    def __init__(self, config: NaiveModelArgs, tokenizer: FishTokenizer) -> None:
-        super().__init__(config, init_weights=False, tokenizer=tokenizer)
+    def __init__(self, config: NaiveModelArgs) -> None:
+        super().__init__(config, init_weights=False)
 
         self.codebook_norm = RMSNorm(config.dim, eps=config.norm_eps)
         self.codebook_output = nn.Linear(
@@ -565,8 +648,8 @@ class NaiveTransformer(BaseTransformer):
 
 
 class DualARTransformer(BaseTransformer):
-    def __init__(self, config: NaiveModelArgs, tokenizer: FishTokenizer) -> None:
-        super().__init__(config, init_weights=False, tokenizer=tokenizer)
+    def __init__(self, config: NaiveModelArgs) -> None:
+        super().__init__(config, init_weights=False)
 
         # Project to fast dim if needed
         if config.fast_dim is not None and config.fast_dim != config.dim:
@@ -655,9 +738,12 @@ class DualARTransformer(BaseTransformer):
 
         # Extract corresponding parts with labels
         token_labels = labels[:, 0]
-        codebook_mask = (token_labels >= self.tokenizer.semantic_begin_id) & (
-            token_labels <= self.tokenizer.semantic_end_id
+        
+        # [MODIFIED] Use config instead of tokenizer
+        codebook_mask = (token_labels >= self.config.semantic_begin_id) & (
+            token_labels <= self.config.semantic_end_id
         )
+        
         # This gives where input token is <|semantic|>
         x = x[codebook_mask]
 
@@ -937,4 +1023,4 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
     )
 
     x_out2 = x_out2.flatten(3)
-    return x_out2.type_as(x)
+    return x_out2.type_as(x)

+ 69 - 138
fish_speech/tokenizer.py

@@ -1,44 +1,31 @@
-import base64
 import json
 import logging
-import re
 from pathlib import Path
+from typing import TYPE_CHECKING, List, Union
 
-import tiktoken
+import torch
+from transformers import AutoTokenizer
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizerFast
 
 logger = logging.getLogger(__name__)
 
-# This is a modified version of the default pattern from GPT-4o, that better handles punctuations.
-FISH_TIKTOKEN_PATTERN = "|".join(
-    [
-        r"(?i:'s|'t|'re|'ve|'m|'ll|'d)",
-        r"\p{P}",
-        r"[^\r\n\p{L}\p{N}]?\p{L}+",
-        r"\p{N}",
-        r" ?[^\s\p{L}\p{N}]+[\r\n]*",
-        r"\s*[\r\n]+",
-        r"\s+(\?!\S)",
-        r"\s+",
-    ]
-)
-TIKTOKEN_MAX_ENCODE_CHARS = 400_000
-
-BOS_TOKEN = "<|begin_of_text|>"
-EOS_TOKEN = "<|end_of_text|>"
# Constants definitions
# Core control tokens used by the chat/TTS prompt format.
EOS_TOKEN = "<|endoftext|>"
PAD_TOKEN = "<|pad|>"
IM_START_TOKEN = "<|im_start|>"
IM_END_TOKEN = "<|im_end|>"
PHONEME_START_TOKEN = "<|phoneme_start|>"
PHONEME_END_TOKEN = "<|phoneme_end|>"

# Modality / audio-span markers.
MODALITY_TEXT_TOKEN = "<|text|>"
MODALITY_VOICE_TOKEN = "<|voice|>"
MODALITY_INTERLEAVE_TOKEN = "<|interleave|>"
AUDIO_START_TOKEN = "<|audio_start|>"
AUDIO_END_TOKEN = "<|audio_end|>"
AUDIO_EMBED_TOKEN = "<|audio_pad|>"

MODALITY_TOKENS = {
    "text": MODALITY_TEXT_TOKEN,
    "voice": MODALITY_VOICE_TOKEN,
    "interleave": MODALITY_INTERLEAVE_TOKEN,
}

# One placeholder token per semantic codebook entry (4096 codes).
SEMANTIC_TOKEN_TEMPLATE = "<|semantic:{i}|>"
SEMANTIC_TOKENS = [SEMANTIC_TOKEN_TEMPLATE.format(i=i) for i in range(4096)]

# Order matters: consumers may rely on stable positions of these tokens.
ALL_SPECIAL_TOKENS = [
    EOS_TOKEN, PAD_TOKEN, IM_START_TOKEN, IM_END_TOKEN,
    PHONEME_START_TOKEN, PHONEME_END_TOKEN, MODALITY_TEXT_TOKEN,
    MODALITY_VOICE_TOKEN, MODALITY_INTERLEAVE_TOKEN, AUDIO_START_TOKEN,
    AUDIO_END_TOKEN, AUDIO_EMBED_TOKEN, *SEMANTIC_TOKENS,
]
 
 class FishTokenizer:
-    def __init__(
-        self, model_path: str, special_tokens: list[str] = ALL_SPECIAL_TOKENS
-    ) -> None:
-        mergeable_ranks = self.load_tiktoken_bpe(model_path)
-        special_token_begin = len(mergeable_ranks)
-        self.all_special_tokens_with_ids = {
-            token: special_token_begin + i for i, token in enumerate(special_tokens)
-        }
-
+    def __init__(self, model_path: str):
+        self._tokenizer = AutoTokenizer.from_pretrained(model_path)
         self.semantic_id_to_token_id = {}
-        end_idx = 0
-        for token in special_tokens:
-            if token.startswith("<|semantic:"):
-                idx = int(re.match(r"<\|semantic:(\d+)\|>", token).group(1))
-                self.semantic_id_to_token_id[idx] = self.all_special_tokens_with_ids[
-                    token
-                ]
-
-                if idx > end_idx:
-                    end_idx = idx
-
-        self.semantic_begin_id = self.semantic_id_to_token_id[0]
-        self.semantic_end_id = self.semantic_id_to_token_id[end_idx]
-
-        self.tkt_model = tiktoken.core.Encoding(
-            name=Path(model_path).stem,
-            pat_str=FISH_TIKTOKEN_PATTERN,
-            mergeable_ranks=mergeable_ranks,
-            special_tokens=self.all_special_tokens_with_ids,
-        )
+        
+        vocab = self._tokenizer.get_vocab()
+        valid_ids =[]
+        
+        for code_idx in range(4096):
+            token = SEMANTIC_TOKEN_TEMPLATE.format(i=code_idx)
+            if token in vocab:
+                token_id = vocab[token]
+                self.semantic_id_to_token_id[code_idx] = token_id
+                valid_ids.append(token_id)
+        
+        if not valid_ids:
+            logger.error("CRITICAL ERROR: No semantic tokens found in vocab! Audio cannot be synthesized.")
+            self.semantic_begin_id = 0
+            self.semantic_end_id = 0
+            # Dummy tensor to prevent crash, though generation will fail
+            self.semantic_map_tensor = torch.zeros(4096, dtype=torch.long)
+        else:
+            self.semantic_begin_id = min(valid_ids)
+            self.semantic_end_id = max(valid_ids)
+            # Create a lookup tensor to handle potential gaps in token IDs safely
+            self.semantic_map_tensor = torch.zeros(4096, dtype=torch.long)
+            for k, v in self.semantic_id_to_token_id.items():
+                self.semantic_map_tensor[k] = v
+
+        logger.info(f"Loaded Tokenizer. Semantic Range: {self.semantic_begin_id} -> {self.semantic_end_id}")
 
     @property
     def vocab_size(self):
-        return len(self.tkt_model._mergeable_ranks)
+        return self._tokenizer.vocab_size
 
     @property
-    def num_special_tokens(self):
-        return len(self.all_special_tokens_with_ids)
-
-    @staticmethod
-    def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
-        data = {}
-        for line in open(tiktoken_bpe_file).read().splitlines():
-            if not line:
-                continue
-            token, rank = line.split()
-            if token == "=":
-                continue
-            data[base64.b64decode(token)] = int(rank)
-        return data
+    def pad_token_id(self):
+        return self._tokenizer.pad_token_id
 
-    def get_token_id(self, token: str) -> int:
-        return self.all_special_tokens_with_ids[token]
-
-    def encode(self, s: str, allowed_special: bool | set[str] = True) -> list[int]:
-        assert isinstance(s, str)
-
-        subs = []
-        for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS):
-            subs.append(s[i : i + TIKTOKEN_MAX_ENCODE_CHARS])
+    @property
+    def eos_token_id(self):
+        return self._tokenizer.eos_token_id
 
-        if allowed_special is True:
-            allowed_special = self.tkt_model.special_tokens_set
-        elif allowed_special is False:
-            allowed_special = set()
+    def get_token_id(self, token: str) -> int:
+        return self._tokenizer.convert_tokens_to_ids(token)
 
-        return sum(
-            self.tkt_model.encode_batch(
-                subs, allowed_special=allowed_special, disallowed_special=set()
-            ),
-            start=[],
-        )
+    def encode(self, text: str, add_special_tokens: bool = False, **kwargs) -> List[int]:
+        # [FIX] Force Qwen/Tiktoken backends to parse special tokens inline
+        import inspect
+        sig = inspect.signature(self._tokenizer.encode)
+        if "allowed_special" in sig.parameters and "allowed_special" not in kwargs:
+            kwargs["allowed_special"] = "all"
+        return self._tokenizer.encode(text, add_special_tokens=add_special_tokens, **kwargs)
 
-    def decode(self, tokens: list[int]) -> str:
-        return self.tkt_model.decode(tokens)
+    def decode(self, tokens: Union[List[int], int], **kwargs) -> str:
+        return self._tokenizer.decode(tokens, **kwargs)
 
     def save_pretrained(self, path: str):
-        path = Path(path)
-        path.mkdir(parents=True, exist_ok=True)
-
-        with open(path / "tokenizer.tiktoken", "w") as f:
-            for token, rank in self.tkt_model._mergeable_ranks.items():
-                a = base64.b64encode(token).decode()
-                if a == "":
-                    a = "="
-                f.write(f"{a} {rank}\n")
-
-        with open(path / "special_tokens.json", "w") as f:
-            json.dump(
-                self.all_special_tokens_with_ids,
-                f,
-                indent=2,
-                ensure_ascii=False,
-            )
-
-    @staticmethod
-    def from_pretrained(path: str):
-        special_tokens_path = Path(path) / "special_tokens.json"
-        if special_tokens_path.exists():
-            with open(special_tokens_path) as f:
-                all_special_tokens_with_ids = json.load(f)
-        else:
-            all_special_tokens_with_ids = ALL_SPECIAL_TOKENS
+        self._tokenizer.save_pretrained(path)
+
+    @classmethod
+    def from_pretrained(cls, path: str):
+        return cls(path)
 
-        return FishTokenizer(
-            Path(path) / "tokenizer.tiktoken", all_special_tokens_with_ids
-        )
+    def __getattr__(self, name):
+        return getattr(self._tokenizer, name)

+ 12 - 9
pyproject.toml

@@ -8,13 +8,15 @@ description = "Fish Speech"
 readme = "README.md"
 requires-python = ">=3.10"
 keywords = ["TTS", "Speech"]
-license = "Apache-2.0"
+license = {text = "Fish Audio Research License"}
 classifiers = [
     "Programming Language :: Python :: 3",
 ]
 dependencies = [
-    "numpy<=1.26.4",
-    "transformers>=4.45.2",
+    "numpy",
+    "torch==2.8.0",
+    "torchaudio==2.8.0",
+    "transformers<=4.57.3",
     "datasets==2.18.0",
     "lightning>=2.1.0",
     "hydra-core>=1.3.2",
@@ -44,28 +46,29 @@ dependencies = [
     "pydantic==2.9.2",
     "cachetools",
     "descript-audio-codec",
-    "descript-audiotools"
+    "descript-audiotools",
+    "safetensors"
 ]
 
 [project.optional-dependencies]
 stable = [
-    "torch<2.9.0",
+    "torch==2.8.0",
     "torchaudio",
 ]
 cpu = [
-  "torch>=2.5.1",
+  "torch==2.8.0",
   "torchaudio",
 ]
 cu126 = [
-  "torch>=2.5.1",
+  "torch==2.8.0",
   "torchaudio",
 ]
 cu128 = [
-  "torch>=2.5.1",
+  "torch==2.8.0",
   "torchaudio",
 ]
 cu129 = [
-  "torch>=2.5.1",
+  "torch==2.8.0",
   "torchaudio",
 ]
 

+ 0 - 55
tools/download_models.py

@@ -1,55 +0,0 @@
-import os
-
-from huggingface_hub import hf_hub_download
-
-
-# Download
-def check_and_download_files(repo_id, file_list, local_dir):
-    os.makedirs(local_dir, exist_ok=True)
-    for file in file_list:
-        file_path = os.path.join(local_dir, file)
-        if not os.path.exists(file_path):
-            print(f"{file} 不存在,从 Hugging Face 仓库下载...")
-            hf_hub_download(
-                repo_id=repo_id,
-                filename=file,
-                resume_download=True,
-                local_dir=local_dir,
-                local_dir_use_symlinks=False,
-            )
-        else:
-            print(f"{file} 已存在,跳过下载。")
-
-
-# 1st
-repo_id_1 = "fishaudio/openaudio-s1-mini"
-local_dir_1 = "./checkpoints/openaudio-s1-mini"
-files_1 = [
-    ".gitattributes",
-    "model.pth",
-    "README.md",
-    "special_tokens.json",
-    "tokenizer.tiktoken",
-    "config.json",
-    "codec.pth",
-]
-
-# 3rd
-repo_id_3 = "fishaudio/fish-speech-1"
-local_dir_3 = "./"
-files_3 = [
-    "ffmpeg.exe",
-    "ffprobe.exe",
-]
-
-# 4th
-repo_id_4 = "SpicyqSama007/fish-speech-packed"
-local_dir_4 = "./"
-files_4 = [
-    "asr-label-win-x64.exe",
-]
-
-check_and_download_files(repo_id_1, files_1, local_dir_1)
-
-check_and_download_files(repo_id_3, files_3, local_dir_3)
-check_and_download_files(repo_id_4, files_4, local_dir_4)

+ 2 - 2
tools/run_webui.py

@@ -24,12 +24,12 @@ def parse_args():
     parser.add_argument(
         "--llama-checkpoint-path",
         type=Path,
-        default="checkpoints/openaudio-s1-mini",
+        default="checkpoints/s2-pro",
     )
     parser.add_argument(
         "--decoder-checkpoint-path",
         type=Path,
-        default="checkpoints/openaudio-s1-mini/codec.pth",
+        default="checkpoints/s2-pro/codec.pth",
     )
     parser.add_argument("--decoder-config-name", type=str, default="modded_dac_vq")
     parser.add_argument("--device", type=str, default="cuda")

+ 2 - 2
tools/server/api_utils.py

@@ -24,12 +24,12 @@ def parse_args():
     parser.add_argument(
         "--llama-checkpoint-path",
         type=str,
-        default="checkpoints/openaudio-s1-mini",
+        default="checkpoints/s2-pro",
     )
     parser.add_argument(
         "--decoder-checkpoint-path",
         type=str,
-        default="checkpoints/openaudio-s1-mini/codec.pth",
+        default="checkpoints/s2-pro/codec.pth",
     )
     parser.add_argument("--decoder-config-name", type=str, default="modded_dac_vq")
     parser.add_argument("--device", type=str, default="cuda")

+ 1 - 1
tools/vqgan/extract_vq.py

@@ -136,7 +136,7 @@ def process_batch(files: list[Path], model) -> float:
 @click.option("--config-name", default="modded_dac_vq")
 @click.option(
     "--checkpoint-path",
-    default="checkpoints/openaudio-s1-mini/codec.pth",
+    default="checkpoints/s2-pro/codec.pth",
 )
 @click.option("--batch-size", default=64)
 @click.option("--filelist", default=None, type=Path)

+ 1 - 1
tools/webui/variables.py

@@ -6,7 +6,7 @@ HEADER_MD = f"""# Fish Speech
 
 {i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).")}  
 
-{i18n("Related code and weights are released under CC BY-NC-SA 4.0 License.")}  
+{i18n("Related code and weights are released under FISH AUDIO RESEARCH LICENSE.")}  
 
 {i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")}  
 """

+ 99 - 77
uv.lock

@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.10"
 resolution-markers = [
     "python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux' and extra != 'extra-11-fish-speech-cpu' and extra != 'extra-11-fish-speech-cu126' and extra != 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129'",
@@ -1064,6 +1064,7 @@ dependencies = [
     { name = "hydra-core" },
     { name = "kui" },
     { name = "librosa" },
+    { name = "liger-kernel" },
     { name = "lightning" },
     { name = "loguru" },
     { name = "loralib" },
@@ -1078,6 +1079,7 @@ dependencies = [
     { name = "pyrootutils" },
     { name = "resampy" },
     { name = "rich" },
+    { name = "safetensors" },
     { name = "silero-vad" },
     { name = "tensorboard" },
     { name = "tiktoken" },
@@ -1136,12 +1138,13 @@ requires-dist = [
     { name = "hydra-core", specifier = ">=1.3.2" },
     { name = "kui", specifier = ">=1.6.0" },
     { name = "librosa", specifier = ">=0.10.1" },
+    { name = "liger-kernel" },
     { name = "lightning", specifier = ">=2.1.0" },
     { name = "loguru", specifier = ">=0.6.0" },
     { name = "loralib", specifier = ">=0.1.2" },
     { name = "modelscope", specifier = "==1.17.1" },
     { name = "natsort", specifier = ">=8.4.0" },
-    { name = "numpy", specifier = "<=1.26.4" },
+    { name = "numpy" },
     { name = "opencc-python-reimplemented", specifier = "==0.1.7" },
     { name = "ormsgpack" },
     { name = "pyaudio" },
@@ -1150,20 +1153,21 @@ requires-dist = [
     { name = "pyrootutils", specifier = ">=1.0.4" },
     { name = "resampy", specifier = ">=0.4.3" },
     { name = "rich", specifier = ">=13.5.3" },
+    { name = "safetensors" },
     { name = "silero-vad" },
     { name = "tensorboard", specifier = ">=2.14.1" },
     { name = "tiktoken", specifier = ">=0.8.0" },
-    { name = "torch", marker = "extra == 'cpu'", specifier = ">=2.5.1", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "fish-speech", extra = "cpu" } },
-    { name = "torch", marker = "extra == 'cu126'", specifier = ">=2.5.1", index = "https://download.pytorch.org/whl/cu126", conflict = { package = "fish-speech", extra = "cu126" } },
-    { name = "torch", marker = "extra == 'cu128'", specifier = ">=2.5.1", index = "https://download.pytorch.org/whl/cu128", conflict = { package = "fish-speech", extra = "cu128" } },
-    { name = "torch", marker = "extra == 'cu129'", specifier = ">=2.5.1", index = "https://download.pytorch.org/whl/cu129", conflict = { package = "fish-speech", extra = "cu129" } },
-    { name = "torch", marker = "extra == 'stable'", specifier = ">=2.5.1" },
+    { name = "torch", marker = "extra == 'cpu'", specifier = "==2.8.0", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "fish-speech", extra = "cpu" } },
+    { name = "torch", marker = "extra == 'cu126'", specifier = "==2.8.0", index = "https://download.pytorch.org/whl/cu126", conflict = { package = "fish-speech", extra = "cu126" } },
+    { name = "torch", marker = "extra == 'cu128'", specifier = "==2.8.0", index = "https://download.pytorch.org/whl/cu128", conflict = { package = "fish-speech", extra = "cu128" } },
+    { name = "torch", marker = "extra == 'cu129'", specifier = "==2.8.0", index = "https://download.pytorch.org/whl/cu129", conflict = { package = "fish-speech", extra = "cu129" } },
+    { name = "torch", marker = "extra == 'stable'", specifier = "==2.8.0" },
     { name = "torchaudio", marker = "extra == 'cpu'", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "fish-speech", extra = "cpu" } },
     { name = "torchaudio", marker = "extra == 'cu126'", index = "https://download.pytorch.org/whl/cu126", conflict = { package = "fish-speech", extra = "cu126" } },
     { name = "torchaudio", marker = "extra == 'cu128'", index = "https://download.pytorch.org/whl/cu128", conflict = { package = "fish-speech", extra = "cu128" } },
     { name = "torchaudio", marker = "extra == 'cu129'", index = "https://download.pytorch.org/whl/cu129", conflict = { package = "fish-speech", extra = "cu129" } },
     { name = "torchaudio", marker = "extra == 'stable'" },
-    { name = "transformers", specifier = ">=4.45.2" },
+    { name = "transformers", specifier = "<=4.57.3" },
     { name = "uvicorn", specifier = ">=0.30.0" },
     { name = "wandb", specifier = ">=0.15.11" },
     { name = "zstandard", specifier = ">=0.22.0" },
@@ -1958,6 +1962,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b5/ba/c63c5786dfee4c3417094c4b00966e61e4a63efecee22cb7b4c0387dda83/librosa-0.11.0-py3-none-any.whl", hash = "sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1", size = 260749, upload-time = "2025-03-11T15:09:52.982Z" },
 ]
 
+[[package]]
+name = "liger-kernel"
+version = "0.7.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(sys_platform == 'darwin' and extra == 'extra-11-fish-speech-cpu') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra != 'extra-11-fish-speech-cpu' and extra != 'extra-11-fish-speech-cu126' and extra != 'extra-11-fish-speech-cu128' and extra != 'extra-11-fish-speech-cu129')" },
+    { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(sys_platform != 'darwin' and extra == 'extra-11-fish-speech-cpu') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "torch", version = "2.8.0+cu126", source = { registry = "https://download.pytorch.org/whl/cu126" }, marker = "extra == 'extra-11-fish-speech-cu126' or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "extra == 'extra-11-fish-speech-cu128' or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra != 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra != 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra != 'extra-11-fish-speech-cpu' and extra != 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "triton" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/73/f8/a25ab6549dd80c5055e84f0ee9be6ecf4a7c85ade448110607f4807ab2ef/liger_kernel-0.7.0.tar.gz", hash = "sha256:48c25648974c7d07d47591117b94188d1059999b7fb652e89e8ba50b21183cb3", size = 3780796, upload-time = "2026-02-12T22:01:00.101Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f7/c6/66c6a3c59b53f4cfbfa0c9e8c573c8843fb1d83cf3c2eb79e39109b76c17/liger_kernel-0.7.0-py3-none-any.whl", hash = "sha256:d2185d94362c069f508eb8aff5e7016c6669613265d5702733d674af4c616a56", size = 276512, upload-time = "2026-02-12T22:00:58.565Z" },
+]
+
 [[package]]
 name = "lightning"
 version = "2.5.5"
@@ -2771,7 +2793,7 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129')" },
     { name = "nvidia-cublas-cu12", version = "12.8.4.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
-    { name = "nvidia-cublas-cu12", version = "12.9.1.4", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "nvidia-cublas-cu12", version = "12.9.1.4", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (platform_machine == 'aarch64' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (sys_platform != 'linux' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" },
@@ -2830,7 +2852,7 @@ resolution-markers = [
     "python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux'",
 ]
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", version = "12.9.86", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "nvidia-nvjitlink-cu12", version = "12.9.86", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (platform_machine == 'aarch64' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (sys_platform != 'linux' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/9b/2b/76445b0af890da61b501fde30650a1a4bd910607261b209cccb5235d3daa/nvidia_cufft_cu12-11.4.1.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1a28c9b12260a1aa7a8fd12f5ebd82d027963d635ba82ff39a1acfa7c4c0fbcf", size = 200822453, upload-time = "2025-06-05T20:05:27.889Z" },
@@ -2988,9 +3010,9 @@ resolution-markers = [
     "python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux'",
 ]
 dependencies = [
-    { name = "nvidia-cublas-cu12", version = "12.9.1.4", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
-    { name = "nvidia-cusparse-cu12", version = "12.5.10.65", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
-    { name = "nvidia-nvjitlink-cu12", version = "12.9.86", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "nvidia-cublas-cu12", version = "12.9.1.4", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (platform_machine == 'aarch64' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (sys_platform != 'linux' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "nvidia-cusparse-cu12", version = "12.5.10.65", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (platform_machine == 'aarch64' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (sys_platform != 'linux' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "nvidia-nvjitlink-cu12", version = "12.9.86", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (platform_machine == 'aarch64' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (sys_platform != 'linux' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/03/99/686ff9bf3a82a531c62b1a5c614476e8dfa24a9d89067aeedf3592ee4538/nvidia_cusolver_cu12-11.7.5.82-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:62efa83e4ace59a4c734d052bb72158e888aa7b770e1a5f601682f16fe5b4fd2", size = 337869834, upload-time = "2025-06-05T20:06:53.125Z" },
@@ -3049,7 +3071,7 @@ resolution-markers = [
     "python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux'",
 ]
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", version = "12.9.86", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "nvidia-nvjitlink-cu12", version = "12.9.86", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (platform_machine == 'aarch64' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (sys_platform != 'linux' and extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/5e/6f/8710fbd17cdd1d0fc3fea7d36d5b65ce1933611c31e1861da330206b253a/nvidia_cusparse_cu12-12.5.10.65-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:221c73e7482dd93eda44e65ce567c031c07e2f93f6fa0ecd3ba876a195023e83", size = 366359408, upload-time = "2025-06-05T20:07:42.501Z" },
@@ -4794,11 +4816,11 @@ dependencies = [
     { name = "typing-extensions", marker = "(sys_platform == 'darwin' and extra == 'extra-11-fish-speech-cpu') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129')" },
 ]
 wheels = [
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp310-none-macosx_11_0_arm64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp311-none-macosx_11_0_arm64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp312-none-macosx_11_0_arm64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp313-cp313t-macosx_14_0_arm64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:a467b49fe893a6a6cce89e3aee556edfdc64a722d7195fdfdd75cec9dea13779" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:3d05017d19bc99741288e458888283a44b0ee881d53f05f72f8b1cfea8998122" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:a47b7986bee3f61ad217d8a8ce24605809ab425baf349f97de758815edd2ef54" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:fbe2e149c5174ef90d29a5f84a554dfaf28e003cb4f61fa2c8c024c17ec7ca58" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:057efd30a6778d2ee5e2374cd63a63f63311aa6f33321e627c655df60abdd390" },
 ]
 
 [[package]]
@@ -4877,28 +4899,28 @@ dependencies = [
     { name = "typing-extensions", marker = "(sys_platform != 'darwin' and extra == 'extra-11-fish-speech-cpu') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129')" },
 ]
 wheels = [
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp310-cp310-linux_s390x.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp310-cp310-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-linux_s390x.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-win_arm64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-linux_s390x.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-win_arm64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-linux_s390x.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-win_arm64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313t-manylinux_2_28_aarch64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313t-win_amd64.whl" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp310-cp310-linux_s390x.whl", hash = "sha256:5d255d259fbc65439b671580e40fdb8faea4644761b64fed90d6904ffe71bbc1" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:b2149858b8340aeeb1f3056e0bff5b82b96e43b596fe49a9dba3184522261213" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:16d75fa4e96ea28a785dfd66083ca55eb1058b6d6c5413f01656ca965ee2077e" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp310-cp310-win_amd64.whl", hash = "sha256:7cc4af6ba954f36c2163eab98cf113c137fc25aa8bbf1b06ef155968627beed2" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-linux_s390x.whl", hash = "sha256:2bfc013dd6efdc8f8223a0241d3529af9f315dffefb53ffa3bf14d3f10127da6" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:680129efdeeec3db5da3f88ee5d28c1b1e103b774aef40f9d638e2cce8f8d8d8" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:cb06175284673a581dd91fb1965662ae4ecaba6e5c357aa0ea7bb8b84b6b7eeb" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:7631ef49fbd38d382909525b83696dc12a55d68492ade4ace3883c62b9fc140f" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-win_arm64.whl", hash = "sha256:41e6fc5ec0914fcdce44ccf338b1d19a441b55cafdd741fd0bf1af3f9e4cfd14" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-linux_s390x.whl", hash = "sha256:0e34e276722ab7dd0dffa9e12fe2135a9b34a0e300c456ed7ad6430229404eb5" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:610f600c102386e581327d5efc18c0d6edecb9820b4140d26163354a99cd800d" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:cb9a8ba8137ab24e36bf1742cb79a1294bd374db570f09fc15a5e1318160db4e" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:2be20b2c05a0cce10430cc25f32b689259640d273232b2de357c35729132256d" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-win_arm64.whl", hash = "sha256:99fc421a5d234580e45957a7b02effbf3e1c884a5dd077afc85352c77bf41434" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-linux_s390x.whl", hash = "sha256:8b5882276633cf91fe3d2d7246c743b94d44a7e660b27f1308007fdb1bb89f7d" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a5064b5e23772c8d164068cc7c12e01a75faf7b948ecd95a0d4007d7487e5f25" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:8f81dedb4c6076ec325acc3b47525f9c550e5284a18eae1d9061c543f7b6e7de" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:e1ee1b2346ade3ea90306dfbec7e8ff17bc220d344109d189ae09078333b0856" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-win_arm64.whl", hash = "sha256:64c187345509f2b1bb334feed4666e2c781ca381874bde589182f81247e61f88" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:af81283ac671f434b1b25c95ba295f270e72db1fad48831eb5e4748ff9840041" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:a9dbb6f64f63258bc811e2c0c99640a81e5af93c531ad96e95c5ec777ea46dab" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:6d93a7165419bc4b2b907e859ccab0dea5deeab261448ae9a5ec5431f14c0e64" },
 ]
 
 [[package]]
@@ -4941,16 +4963,16 @@ dependencies = [
     { name = "typing-extensions", marker = "extra == 'extra-11-fish-speech-cu126' or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129')" },
 ]
 wheels = [
-    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp310-cp310-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp310-cp310-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp311-cp311-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp311-cp311-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp312-cp312-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp312-cp312-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp313-cp313-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp313-cp313-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp313-cp313t-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp313-cp313t-win_amd64.whl" },
+    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6bf3c0085af4176137f216c39995dede9beda9af1307fd1dee2305f4f351eb42" },
+    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp310-cp310-win_amd64.whl", hash = "sha256:3b43acaa40ac87495d858bb1dbe574aaaff4e60e9da6221825ece694fd1d2b80" },
+    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:542cdbf042aaf5d6ddbed43cb8cd8c4df1e586bebb1338f5dcba14fa52830d3c" },
+    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp311-cp311-win_amd64.whl", hash = "sha256:cfb2c640a8955fbd8686c056802f53a512b610c098960d6dad5800cbc16c02b6" },
+    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ce6e6a1f4803ad62d1fe51cec3fe5ca14bcd8bc7cace7b09d5590f8147fa16ad" },
+    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp312-cp312-win_amd64.whl", hash = "sha256:f6c79eac0018f9d131479ee1b7a68edb030619a316bfbc69275043aa4f338e4c" },
+    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:d08144011e410b9d15914e7256e1b1708a90484cb2c03712199e0291856d4177" },
+    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp313-cp313-win_amd64.whl", hash = "sha256:3aa7da5cc6b7df0e8c0754dea339bd31cd21b5620e84f53b2ac4be47e8bb2179" },
+    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:637bdbd510920abce9370c4ff8b44bce6ace48dc1672e1ed42ff14e1e67497b9" },
+    { url = "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp313-cp313t-win_amd64.whl", hash = "sha256:6b6a2baf405f6c7069f38658bd6f70dced6a06ee044102d05e0bd7310611fce3" },
 ]
 
 [[package]]
@@ -4993,16 +5015,16 @@ dependencies = [
     { name = "typing-extensions", marker = "extra == 'extra-11-fish-speech-cu128' or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra != 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129')" },
 ]
 wheels = [
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp310-cp310-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp311-cp311-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp312-cp312-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp313-cp313-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp313-cp313t-win_amd64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:0c96999d15cf1f13dd7c913e0b21a9a355538e6cfc10861a17158320292f5954" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp310-cp310-win_amd64.whl", hash = "sha256:43938e9a174c90e5eb9e906532b2f1e21532bbfa5a61b65193b4f54714d34f9e" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:039b9dcdd6bdbaa10a8a5cd6be22c4cb3e3589a341e5f904cbb571ca28f55bed" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:34c55443aafd31046a7963b63d30bc3b628ee4a704f826796c865fdfd05bb596" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:4354fc05bb79b208d6995a04ca1ceef6a9547b1c4334435574353d381c55087c" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:0ad925202387f4e7314302a1b4f8860fa824357f9b1466d7992bf276370ebcff" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:3a852369a38dec343d45ecd0bc3660f79b88a23e0c878d18707f7c13bf49538f" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:9e20646802b7fc295c1f8b45fefcfc9fb2e4ec9cbe8593443cd2b9cc307c8405" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4295a22d69408e93d25f51e8d5d579345b6b802383e9414b0f3853ed433d53ae" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:970b4f4661fa7b44f6a7e6df65de7fc4a6fff2af610dc415c1d695ca5f1f37d2" },
 ]
 
 [[package]]
@@ -5049,21 +5071,21 @@ dependencies = [
     { name = "typing-extensions", marker = "(extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra != 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129') or (extra != 'extra-11-fish-speech-cpu' and extra != 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129')" },
 ]
 wheels = [
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp310-cp310-manylinux_2_28_aarch64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp310-cp310-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp310-cp310-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp311-cp311-manylinux_2_28_aarch64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp311-cp311-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp311-cp311-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-manylinux_2_28_aarch64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-manylinux_2_28_aarch64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-win_amd64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-manylinux_2_28_aarch64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl" },
-    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-win_amd64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:72156354c39c08f3451acb50a6ecd4178d745670ad8651b5c796eaace558ff0f" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:54d240b5d3b1f9075d4ee6179675a22c1974f7bef1885d134c582678d5180cd3" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp310-cp310-win_amd64.whl", hash = "sha256:e4adcc3d44089d4a696e6a2ca4233d1ddad7614adc1d48ec8a8cfb95ba235ea1" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d5a210e351ab0c3acbac18c7397bc66ffd897d5000351e8ce9a21badf7f56b85" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:da19696fd75c4a2d5bc945242619143dfc4cbc3e3deead407f2946d1395c3608" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp311-cp311-win_amd64.whl", hash = "sha256:8a92b6ac49be932a8e4f70282d0d396a95a0fc877a9fbe0bd36be5f765707c84" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:692fe6e513b667f789a543fa9b1baba58e77a46d5c8629764ca0c00a56823e1f" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:02c7258e917f3043c978b53acf6f02b818db0d0d85db0e58ae578af333b9b4e2" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-win_amd64.whl", hash = "sha256:2bc729898e422b9f3da54349eed98f2f0b5dd415434508ee2ab2a13fb021815d" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ad2d64316635e7ab06f6c973a252526d59a92a2045825c102f876914a72304d0" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:563740167be2189b71530b503f0c8a8d7a8267dd49d4de6f9c5f1d23fbe237df" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-win_amd64.whl", hash = "sha256:2cef066f9759ff4d7868a8c3695aa60d9a878598acb3685bb1ef2fdac29dcd68" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2982bf34249cbb38f1090e71ad7097a214a21023ccdc0413961986ab7d0396e6" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6344260959ebcfa6dae458e1c4365195bcfdf00f4f1f1ad438cbaf50756829ed" },
+    { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-win_amd64.whl", hash = "sha256:9c0cd89e54ce44ce3208c5cf4163773b9cda0067e4b48cfcac56a4e04af52040" },
 ]
 
 [[package]]
@@ -5371,7 +5393,7 @@ name = "triton"
 version = "3.4.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "setuptools", marker = "(sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu126') or (sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu128') or (sys_platform == 'linux' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu126') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cpu' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu128') or (extra == 'extra-11-fish-speech-cu126' and extra == 'extra-11-fish-speech-cu129') or (extra == 'extra-11-fish-speech-cu128' and extra == 'extra-11-fish-speech-cu129')" },
+    { name = "setuptools" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/62/ee/0ee5f64a87eeda19bbad9bc54ae5ca5b98186ed00055281fd40fb4beb10e/triton-3.4.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ff2785de9bc02f500e085420273bb5cc9c9bb767584a4aa28d6e360cec70128", size = 155430069, upload-time = "2025-07-30T19:58:21.715Z" },