2 달 전 · 83f208210d
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 
				 <div align="center">
			
 
				 <h1>Fish Speech</h1>
			
 
				 
			
 
				-**English** | [简体中文](docs/README.zh.md) | [Portuguese](docs/README.pt-BR.md) | [日本語](docs/README.ja.md) | [한국어](docs/README.ko.md) | [العربية](docs/README.ar.md) <br>
			
 
				+**English** | [简体中文](docs/README.zh.md) | [Portuguese](docs/README.pt-BR.md) | [日本語](docs/README.ja.md) | [한국어](docs/README.ko.md) | [العربية](docs/README.ar.md) | [Español](docs/README.es.md)  <br>
			
 
				 
			
 
				 <a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
			
 
				 <a href="https://trendshift.io/repositories/7014" target="_blank">
			
--- a/docs/README.ar.md
+++ b/docs/README.ar.md
@@ -1,7 +1,7 @@
 
				 <div align="center">
			
 
				 <h1>Fish Speech</h1>
			
 
				 
			
 
				-[English](../README.md) | [简体中文](README.zh.md) | [Portuguese](README.pt-BR.md) | [日本語](README.ja.md) | [한국어](README.ko.md) | **العربية** <br>
			
 
				+[English](../README.md) | [简体中文](README.zh.md) | [Portuguese](README.pt-BR.md) | [日本語](README.ja.md) | [한국어](README.ko.md) | **العربية** | [Español](README.es.md)  <br>
			
 
				 
			
 
				 <a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
			
 
				 <a href="https://trendshift.io/repositories/7014" target="_blank">
			
--- a/docs/README.es.md
+++ b/docs/README.es.md
@@ -0,0 +1,186 @@
 
				+<div align="center">
			
 
				+<h1>Fish Speech</h1>
			
 
				+
			
 
				+[English](../README.md) | [简体中文](README.zh.md) | [Portuguese](README.pt-BR.md) | [日本語](README.ja.md) | [한국어](README.ko.md) | [العربية](README.ar.md) | **Español** <br>
			
 
				+
			
 
				+<a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Clonación&#0032;de&#0032;voz&#0032;expresiva&#0032;y&#0032;texto&#0045;a&#0045;voz | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a> <a href="https://trendshift.io/repositories/7014" target="_blank"> <img src="https://trendshift.io/api/badge/repositories/7014" alt="fishaudio%2Ffish-speech | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/> </a> <br>
			
 
				+
			
 
				+</div>
			
 
				+<br>
			
 
				+
			
 
				+<div align="center">
			
 
				+    <img src="https://count.getloli.com/get/@fish-speech?theme=asoul" /><br>
			
 
				+</div>
			
 
				+
			
 
				+<br>
			
 
				+
			
 
				+<div align="center">
			
 
				+    <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
			
 
				+        <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
			
 
				+    </a>
			
 
				+    <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
			
 
				+        <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
			
 
				+    </a>
			
 
				+    <a target="_blank" href="https://pd.qq.com/s/bwxia254o">
			
 
				+      <img alt="QQ Channel" src="https://img.shields.io/badge/QQ-blue?logo=tencentqq">
			
 
				+    </a>
			
 
				+</div>
			
 
				+
			
 
				+<div align="center">
			
 
				+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
			
 
				+        <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
			
 
				+    </a>
			
 
				+    <a target="_blank" href="https://fish.audio/blog/fish-audio-open-sources-s2/">
			
 
				+        <img alt="Fish Audio Blog" src="https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white"/>
			
 
				+    </a>
			
 
				+    <a target="_blank" href="https://arxiv.org/abs/2603.08823">
			
 
				+        <img alt="Paper | Informe Técnico" src="https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square"/>
			
 
				+    </a>
			
 
				+</div>
			
 
				+
			
 
				+> [!IMPORTANT]
			
 
				+> **Aviso de Licencia**
			
 
				+> Este código y los pesos de modelo asociados se publican bajo la **[FISH AUDIO RESEARCH LICENSE](../LICENSE)**. Consulta [LICENSE](../LICENSE) para más detalles. Se tomarán acciones ante cualquier violación de la licencia.
			
 
				+
			
 
				+> [!WARNING]
			
 
				+> **Descargo de Responsabilidad Legal**
			
 
				+> No asumimos ninguna responsabilidad por el uso ilegal de este código. Consulta las leyes locales relacionadas con DMCA y otras normativas aplicables.
			
 
				+
			
 
				+## Inicio Rápido
			
 
				+
			
 
				+### Para humanos
			
 
				+
			
 
				+Aquí tienes la documentación oficial de Fish Audio S2. Sigue las instrucciones para comenzar fácilmente.
			
 
				+
			
 
				+* [Instalación](https://speech.fish.audio/install/)
			
 
				+* [Inferencia por línea de comandos](https://speech.fish.audio/inference/#command-line-inference)
			
 
				+* [Inferencia con WebUI](https://speech.fish.audio/inference/#webui-inference)
			
 
				+* [Inferencia en servidor](https://speech.fish.audio/server/)
			
 
				+* [Configuración de Docker](https://speech.fish.audio/install/#docker-setup)
			
 
				+
			
 
				+> [!IMPORTANT]
			
 
				+> **Para el servidor SGLang, consulta [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).**
			
 
				+
			
 
				+### Para agentes LLM
			
 
				+
			
 
				+```
			
 
				+Instala y configura Fish-Audio S2 siguiendo las instrucciones aquí: https://speech.fish.audio/install/
			
 
				+```
			
 
				+
			
 
				+## Fish Audio S2 Pro
			
 
				+
			
 
				+**Sistema de texto a voz (TTS) multilingüe de última generación, redefiniendo los límites de la generación de voz.**
			
 
				+
			
 
				+Fish Audio S2 Pro es el modelo multimodal más avanzado desarrollado por Fish Audio. Entrenado con más de **10 millones de horas** de datos de audio que abarcan más de **80 idiomas**, S2 Pro combina una arquitectura **Dual-Autoregressive (Dual-AR)** con alineación mediante aprendizaje por refuerzo (RL) para generar voz extremadamente natural, realista y emocionalmente rica, liderando tanto sistemas open-source como closed-source.
			
 
				+
			
 
				+La principal fortaleza de S2 Pro es su soporte para control fino a nivel **sub-palabra (sub-word level)** de prosodia y emoción usando etiquetas en lenguaje natural (por ejemplo `[whisper]`, `[excited]`, `[angry]`), además de soportar de forma nativa generación multi-speaker y conversaciones multi-turno.
			
 
				+
			
 
				+Visita el sitio web de Fish Audio para probarlo en vivo, o lee el informe técnico y el blog para más detalles.
			
 
				+
			
 
				+### Variantes del modelo
			
 
				+
			
 
				+| Modelo | Tamaño        | Disponibilidad                                         | Descripción                                               |
			
 
				+| ------ | ------------- | ------------------------------------------------------ | --------------------------------------------------------- |
			
 
				+| S2-Pro | 4B parámetros | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | Modelo insignia completo con máxima calidad y estabilidad |
			
 
				+
			
 
				+Más detalles pueden encontrarse en el informe técnico.
			
 
				+
			
 
				+## Resultados de benchmarks
			
 
				+
			
 
				+| Benchmark                                 | Fish Audio S2              |
			
 
				+| ----------------------------------------- | -------------------------- |
			
 
				+| Seed-TTS Eval — WER (Chino)               | **0.54%** (mejor global)   |
			
 
				+| Seed-TTS Eval — WER (Inglés)              | **0.99%** (mejor global)   |
			
 
				+| Audio Turing Test (con instrucciones)     | **0.515** media posterior  |
			
 
				+| EmergentTTS-Eval — Tasa de victoria       | **81.88%** (máximo global) |
			
 
				+| Fish Instruction Benchmark — TAR          | **93.3%**                  |
			
 
				+| Fish Instruction Benchmark — Calidad      | **4.51 / 5.0**             |
			
 
				+| Multilingüe (MiniMax Testset) — Mejor WER | **11 de 24** idiomas       |
			
 
				+| Multilingüe (MiniMax Testset) — Mejor SIM | **17 de 24** idiomas       |
			
 
				+
			
 
				+En Seed-TTS Eval, S2 logra el menor WER entre todos los modelos evaluados, incluyendo sistemas cerrados: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). En el Audio Turing Test, 0.515 supera a Seed-TTS (0.417) en un 24% y a MiniMax-Speech (0.387) en un 33%. En EmergentTTS-Eval, S2 destaca especialmente en paralingüística (91.61%), preguntas (84.41%) y complejidad sintáctica (83.39%).
			
 
				+
			
 
				+## Características destacadas
			
 
				+
			
 
				+<img src="./assets/totalability.png" width=200%>
			
 
				+
			
 
				+### Control fino inline mediante lenguaje natural
			
 
				+
			
 
				+S2 Pro aporta un nivel de “alma” sin precedentes a la voz. Usando sintaxis `[tag]`, puedes insertar instrucciones emocionales con precisión en cualquier parte del texto.
			
 
				+
			
 
				+* **Más de 15,000 tags únicos soportados**
			
 
				+* Soporta descripciones libres como `[whisper in small voice]`, `[professional broadcast tone]`, `[pitch up]`
			
 
				+
			
 
				+### Arquitectura Dual-Autoregressive (Dual-AR)
			
 
				+
			
 
				+* **Slow AR (4B parámetros)**: modela la estructura temporal
			
 
				+* **Fast AR (400M parámetros)**: reconstruye detalles acústicos finos
			
 
				+
			
 
				+### Alineación mediante RL
			
 
				+
			
 
				+* Usa GRPO
			
 
				+* Señales de recompensa multidimensionales
			
 
				+
			
 
				+### Rendimiento extremo en streaming
			
 
				+
			
 
				+* RTF: 0.195
			
 
				+* TTFA: ~100 ms
			
 
				+* +3000 tokens/s
			
 
				+
			
 
				+### Soporte multilingüe robusto
			
 
				+
			
 
				+* Más de 80 idiomas
			
 
				+* Sin necesidad de phonemes específicos
			
 
				+
			
 
				+### Generación multi-speaker nativa
			
 
				+
			
 
				+<img src="./assets/chattemplate.png" width=200%>
			
 
				+
			
 
				+Permite múltiples hablantes usando `<|speaker:i|>` en una sola generación.
			
 
				+
			
 
				+### Generación multi-turno
			
 
				+
			
 
				+Mantiene contexto para mejorar la naturalidad.
			
 
				+
			
 
				+### Clonación de voz rápida
			
 
				+
			
 
				+* Solo 10–30 segundos de audio
			
 
				+* Alta fidelidad de timbre y estilo
			
 
				+
			
 
				+Para usar con SGLang Server, consulta el README correspondiente.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Créditos
			
 
				+
			
 
				+* [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
			
 
				+* [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
			
 
				+* [GPT VITS](https://github.com/innnky/gpt-vits)
			
 
				+* [MQTTS](https://github.com/b04901014/MQTTS)
			
 
				+* [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
			
 
				+* [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
			
 
				+* [Qwen3](https://github.com/QwenLM/Qwen3)
			
 
				+
			
 
				+## Informe Técnico
			
 
				+
			
 
				+```bibtex
			
 
				+@misc{fish-speech-v1.4,
			
 
				+      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},
			
 
				+      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},
			
 
				+      year={2024},
			
 
				+      eprint={2411.01156},
			
 
				+      archivePrefix={arXiv},
			
 
				+      primaryClass={cs.SD},
			
 
				+      url={https://arxiv.org/abs/2411.01156},
			
 
				+}
			
 
				+
			
 
				+@misc{liao2026fishaudios2technical,
			
 
				+      title={Fish Audio S2 Technical Report}, 
			
 
				+      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},
			
 
				+      year={2026},
			
 
				+      eprint={2603.08823},
			
 
				+      archivePrefix={arXiv},
			
 
				+      primaryClass={cs.SD},
			
 
				+      url={https://arxiv.org/abs/2603.08823}, 
			
 
				+}
			
 
				+```
			
--- a/docs/README.ja.md
+++ b/docs/README.ja.md
@@ -1,7 +1,7 @@
 
				 <div align="center">
			
 
				 <h1>Fish Speech</h1>
			
 
				 
			
 
				-[English](../README.md) | [简体中文](README.zh.md) | [Portuguese](README.pt-BR.md) | **日本語** | [한국어](README.ko.md) | [العربية](README.ar.md) <br>
			
 
				+[English](../README.md) | [简体中文](README.zh.md) | [Portuguese](README.pt-BR.md) | **日本語** | [한국어](README.ko.md) | [العربية](README.ar.md) | [Español](README.es.md)  <br>
			
 
				 
			
 
				 <a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
			
 
				 <a href="https://trendshift.io/repositories/7014" target="_blank">
			
--- a/docs/README.ko.md
+++ b/docs/README.ko.md
@@ -1,7 +1,7 @@
 
				 <div align="center">
			
 
				 <h1>Fish Speech</h1>
			
 
				 
			
 
				-[English](../README.md) | [简体中文](README.zh.md) | [Portuguese](README.pt-BR.md) | [日本語](README.ja.md) | **한국어** | [العربية](README.ar.md) <br>
			
 
				+[English](../README.md) | [简体中文](README.zh.md) | [Portuguese](README.pt-BR.md) | [日本語](README.ja.md) | **한국어** | [العربية](README.ar.md) | [Español](README.es.md)  <br>
			
 
				 
			
 
				 <a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
			
 
				 <a href="https://trendshift.io/repositories/7014" target="_blank">
			
--- a/docs/README.pt-BR.md
+++ b/docs/README.pt-BR.md
@@ -1,7 +1,7 @@
 
				 <div align="center">
			
 
				 <h1>Fish Speech</h1>
			
 
				 
			
 
				-[English](../README.md) | [简体中文](README.zh.md) | **Portuguese** | [日本語](README.ja.md) | [한국어](README.ko.md) | [العربية](README.ar.md) <br>
			
 
				+[English](../README.md) | [简体中文](README.zh.md) | **Portuguese** | [日本語](README.ja.md) | [한국어](README.ko.md) | [العربية](README.ar.md) | [Español](README.es.md)  <br>
			
 
				 
			
 
				 <a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
			
 
				 <a href="https://trendshift.io/repositories/7014" target="_blank">
			
--- a/docs/README.zh.md
+++ b/docs/README.zh.md
@@ -1,7 +1,7 @@
 
				 <div align="center">
			
 
				 <h1>Fish Speech</h1>
			
 
				 
			
 
				-[English](../README.md) | **简体中文** | [Portuguese](README.pt-BR.md) | [日本語](README.ja.md) | [한국어](README.ko.md) | [العربية](README.ar.md) <br>
			
 
				+[English](../README.md) | **简体中文** | [Portuguese](README.pt-BR.md) | [日本語](README.ja.md) | [한국어](README.ko.md) | [العربية](README.ar.md) | [Español](README.es.md)  <br>
			
 
				 
			
 
				 <a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
			
 
				 <a href="https://trendshift.io/repositories/7014" target="_blank">
			
--- a/docs/en/index.md
+++ b/docs/en/index.md
@@ -1,7 +1,7 @@
 
				 <div align="center">
			
 
				 <h1>Fish Speech</h1>
			
 
				 
			
 
				-<p><strong>English</strong> | <a href="../zh/">简体中文</a> | <a href="../pt/">Portuguese</a> | <a href="../ja/">日本語</a> | <a href="../ko/">한국어</a> | <a href="../ar/">العربية</a></p>
			
 
				+<p><strong>English</strong> | <a href="../zh/">简体中文</a> | <a href="../pt/">Portuguese</a> | <a href="../ja/">日本語</a> | <a href="../ko/">한국어</a> | <a href="../ar/">العربية</a> | <a href="../es/">Español</a></p>
			
 
				 
			
 
				 <a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
			
 
				 <a href="https://trendshift.io/repositories/7014" target="_blank">
			
--- a/docs/es/finetune.md
+++ b/docs/es/finetune.md
@@ -0,0 +1,131 @@
 
				+# Fine-tuning
			
 
				+
			
 
				+!!! warning
			
 
				+    Recomendamos encarecidamente no realizar fine-tuning sobre un modelo entrenado con RL. Ajustar un modelo después de RL puede cambiar la distribución del modelo, lo que puede llevar a una degradación del rendimiento.
			
 
				+
			
 
				+En la versión actual, solo necesitas hacer fine-tuning de la parte ‘LLAMA’.
			
 
				+
			
 
				+## Fine-tuning LLAMA
			
 
				+
			
 
				+### 1. Preparar el dataset
			
 
				+
			
 
				+```
			
 
				+.
			
 
				+├── SPK1
			
 
				+│   ├── 21.15-26.44.lab
			
 
				+│   ├── 21.15-26.44.mp3
			
 
				+│   ├── 27.51-29.98.lab
			
 
				+│   ├── 27.51-29.98.mp3
			
 
				+│   ├── 30.1-32.71.lab
			
 
				+│   └── 30.1-32.71.mp3
			
 
				+└── SPK2
			
 
				+    ├── 38.79-40.85.lab
			
 
				+    └── 38.79-40.85.mp3
			
 
				+```
			
 
				+
			
 
				+Necesitas convertir tu dataset al formato anterior y colocarlo dentro de `data`. El archivo de audio puede tener extensiones `.mp3`, `.wav` o `.flac`, y el archivo de anotación debe tener la extensión `.lab`.
			
 
				+
			
 
				+!!! info
			
 
				+    El archivo de anotación `.lab` solo necesita contener la transcripción del audio, sin ningún formato especial. Por ejemplo, si `hi.mp3` dice "Hello, goodbye," entonces el archivo `hi.lab` contendría una única línea de texto: "Hello, goodbye."
			
 
				+
			
 
				+!!! warning
			
 
				+    Se recomienda aplicar normalización de loudness al dataset. Puedes usar [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) para hacerlo.
			
 
				+
			
 
				+    ```bash
			
 
				+    fap loudness-norm data-raw data --clean
			
 
				+    ```
			
 
				+
			
 
				+### 2. Extracción por lotes de tokens semánticos
			
 
				+
			
 
				+Asegúrate de haber descargado los pesos de VQGAN. Si no, ejecuta el siguiente comando:
			
 
				+
			
 
				+```bash
			
 
				+huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
			
 
				+```
			
 
				+
			
 
				+Luego puedes ejecutar el siguiente comando para extraer los tokens semánticos:
			
 
				+
			
 
				+```bash
			
 
				+python tools/vqgan/extract_vq.py data \
			
 
				+    --num-workers 1 --batch-size 16 \
			
 
				+    --config-name "modded_dac_vq" \
			
 
				+    --checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth"
			
 
				+```
			
 
				+
			
 
				+!!! note
			
 
				+    Puedes ajustar `--num-workers` y `--batch-size` para aumentar la velocidad de extracción, pero asegúrate de no exceder el límite de memoria de tu GPU.
			
 
				+
			
 
				+Este comando creará archivos `.npy` en el directorio `data`, como se muestra a continuación:
			
 
				+
			
 
				+```
			
 
				+.
			
 
				+├── SPK1
			
 
				+│   ├── 21.15-26.44.lab
			
 
				+│   ├── 21.15-26.44.mp3
			
 
				+│   ├── 21.15-26.44.npy
			
 
				+│   ├── 27.51-29.98.lab
			
 
				+│   ├── 27.51-29.98.mp3
			
 
				+│   ├── 27.51-29.98.npy
			
 
				+│   ├── 30.1-32.71.lab
			
 
				+│   ├── 30.1-32.71.mp3
			
 
				+│   └── 30.1-32.71.npy
			
 
				+└── SPK2
			
 
				+    ├── 38.79-40.85.lab
			
 
				+    ├── 38.79-40.85.mp3
			
 
				+    └── 38.79-40.85.npy
			
 
				+```
			
 
				+
			
 
				+### 3. Empaquetar el dataset en protobuf
			
 
				+
			
 
				+```bash
			
 
				+python tools/llama/build_dataset.py \
			
 
				+    --input "data" \
			
 
				+    --output "data/protos" \
			
 
				+    --text-extension .lab \
			
 
				+    --num-workers 16
			
 
				+```
			
 
				+
			
 
				+Después de que el comando termine de ejecutarse, deberías ver el archivo `protos` en el directorio `data`.
			
 
				+
			
 
				+### 4. Finalmente, fine-tuning con LoRA
			
 
				+
			
 
				+De manera similar, asegúrate de haber descargado los pesos de `LLAMA`. Si no, ejecuta el siguiente comando:
			
 
				+
			
 
				+```bash
			
 
				+huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini
			
 
				+```
			
 
				+
			
 
				+Finalmente, puedes comenzar el fine-tuning ejecutando el siguiente comando:
			
 
				+
			
 
				+```bash
			
 
				+python fish_speech/train.py --config-name text2semantic_finetune \
			
 
				+    project=$project \
			
 
				+    +lora@model.model.lora_config=r_8_alpha_16
			
 
				+```
			
 
				+
			
 
				+!!! note
			
 
				+    Puedes modificar parámetros de entrenamiento como `batch_size`, `gradient_accumulation_steps`, etc., para ajustarlos a la memoria de tu GPU editando `fish_speech/configs/text2semantic_finetune.yaml`.
			
 
				+
			
 
				+!!! note
			
 
				+    Para usuarios de Windows, puedes usar `trainer.strategy.process_group_backend=gloo` para evitar problemas con `nccl`.
			
 
				+
			
 
				+Una vez que el entrenamiento esté completo, puedes consultar la sección de [inference](inference.md) para probar tu modelo.
			
 
				+
			
 
				+!!! info
			
 
				+    Por defecto, el modelo solo aprenderá los patrones de habla del hablante y no el timbre. Aún necesitas usar prompts para asegurar la estabilidad del timbre.
			
 
				+    Si quieres aprender el timbre, puedes aumentar el número de pasos de entrenamiento, pero esto puede llevar a overfitting.
			
 
				+
			
 
				+Después del entrenamiento, necesitas convertir los pesos LoRA a pesos normales antes de realizar inferencia.
			
 
				+
			
 
				+```bash
			
 
				+python tools/llama/merge_lora.py \
			
 
				+	--lora-config r_8_alpha_16 \
			
 
				+	--base-weight checkpoints/openaudio-s1-mini \
			
 
				+	--lora-weight results/$project/checkpoints/step_000000010.ckpt \
			
 
				+	--output checkpoints/openaudio-s1-mini-yth-lora/
			
 
				+```
			
 
				+
			
 
				+!!! note
			
 
				+    También puedes probar otros checkpoints. Sugerimos usar el checkpoint más temprano que cumpla con tus requisitos, ya que suelen rendir mejor en datos fuera de distribución (OOD).
			
--- a/docs/es/index.md
+++ b/docs/es/index.md
@@ -0,0 +1,191 @@
 
				+<div align="center">
			
 
				+<h1>Fish Speech</h1>
			
 
				+
			
 
				+<p><a href="../en/">English</a> | <a href="../zh/">简体中文</a> | <a href="../pt/">Portuguese</a> | <a href="../ja/">日本語</a> | <a href="../ko/">한국어</a> | <a href="../ar/">العربية</a> | <strong>Español</strong></p>
			
 
				+
			
 
				+<a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Clonación&#0032;de&#0032;voz&#0032;expresiva&#0032;y&#0032;texto&#0045;a&#0045;voz | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a> <a href="https://trendshift.io/repositories/7014" target="_blank"> <img src="https://trendshift.io/api/badge/repositories/7014" alt="fishaudio%2Ffish-speech | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/> </a>
			
 
				+
			
 
				+</div>
			
 
				+
			
 
				+<br>
			
 
				+
			
 
				+<div align="center">
			
 
				+    <img src="https://count.getloli.com/get/@fish-speech?theme=asoul" /><br>
			
 
				+</div>
			
 
				+
			
 
				+<br>
			
 
				+
			
 
				+<div align="center">
			
 
				+    <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
			
 
				+        <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
			
 
				+    </a>
			
 
				+    <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
			
 
				+        <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
			
 
				+    </a>
			
 
				+    <a target="_blank" href="https://pd.qq.com/s/bwxia254o">
			
 
				+      <img alt="QQ Channel" src="https://img.shields.io/badge/QQ-blue?logo=tencentqq">
			
 
				+    </a>
			
 
				+</div>
			
 
				+
			
 
				+<div align="center">
			
 
				+    <a target="_blank" href="https://huggingface.co/fishaudio/s2-pro">
			
 
				+        <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange"/>
			
 
				+    </a>
			
 
				+    <a target="_blank" href="https://fish.audio/blog/fish-audio-open-sources-s2/">
			
 
				+        <img alt="Fish Audio Blog" src="https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white"/>
			
 
				+    </a>
			
 
				+    <a target="_blank" href="https://github.com/fishaudio/fish-speech/blob/main/FishAudioS2TecReport.pdf">
			
 
				+        <img alt="Paper | Technical Report" src="https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square"/>
			
 
				+    </a>
			
 
				+</div>
			
 
				+
			
 
				+!!! info "Aviso de Licencia"
			
 
				+    Este código y sus pesos de modelo asociados se publican bajo **FISH AUDIO RESEARCH LICENSE**. Por favor, consulta [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) para más detalles. Tomaremos acciones contra cualquier violación de la licencia.
			
 
				+
			
 
				+!!! warning "Aviso Legal"
			
 
				+    No asumimos ninguna responsabilidad por el uso ilegal de este código. Por favor, consulta las leyes locales sobre DMCA y otras leyes relacionadas.
			
 
				+
			
 
				+## Inicio Rápido
			
 
				+
			
 
				+### Para Humanos
			
 
				+
			
 
				+Aquí están los documentos oficiales de Fish Audio S2, sigue las instrucciones para comenzar fácilmente.
			
 
				+
			
 
				+* [Instalación](https://speech.fish.audio/install/)
			
 
				+* [Inferencia por Línea de Comandos](https://speech.fish.audio/inference/#command-line-inference)
			
 
				+* [Inferencia con WebUI](https://speech.fish.audio/inference/#webui-inference)
			
 
				+* [Inferencia en Servidor](https://speech.fish.audio/server/)
			
 
				+* [Configuración con Docker](https://speech.fish.audio/install/#docker-setup)
			
 
				+
			
 
				+> [!IMPORTANT]
			
 
				+> **Para el servidor SGLang, por favor lee [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).**
			
 
				+
			
 
				+### Para Agente LLM
			
 
				+
			
 
				+```
			
 
				+Instala y configura Fish-Audio S2 siguiendo las instrucciones aquí: https://speech.fish.audio/install/
			
 
				+```
			
 
				+
			
 
				+## Fish Audio S2
			
 
				+
			
 
				+**Mejor sistema de texto a voz entre código abierto y cerrado**
			
 
				+
			
 
				+Fish Audio S2 es el modelo más reciente desarrollado por [Fish Audio](https://fish.audio/). Entrenado con más de 10 millones de horas de audio en aproximadamente 50 idiomas, S2 combina alineación mediante aprendizaje por refuerzo con una arquitectura Dual-Autoregresiva para generar voz natural, realista y emocionalmente rica.
			
 
				+
			
 
				+S2 soporta control detallado en línea de prosodia y emoción usando etiquetas en lenguaje natural como `[laugh]`, `[whispers]` y `[super happy]`, así como generación nativa multi-hablante y multi-turno.
			
 
				+
			
 
				+Visita el [sitio web de Fish Audio](https://fish.audio/) para el playground en vivo. Lee el [blog](https://fish.audio/blog/fish-audio-open-sources-s2/) para más detalles.
			
 
				+
			
 
				+### Variantes del Modelo
			
 
				+
			
 
				+| Modelo | Tamaño        | Disponibilidad                                         | Descripción                                                                 |
			
 
				+| ------ | ------------- | ------------------------------------------------------ | --------------------------------------------------------------------------- |
			
 
				+| S2-Pro | 4B parámetros | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | Modelo insignia con todas las funcionalidades, máxima calidad y estabilidad |
			
 
				+
			
 
				+Más detalles del modelo pueden encontrarse en el [reporte técnico](https://arxiv.org/abs/2411.01156).
			
 
				+
			
 
				+## Resultados de Benchmark
			
 
				+
			
 
				+| Benchmark                                   | Fish Audio S2                |
			
 
				+| ------------------------------------------- | ---------------------------- |
			
 
				+| Seed-TTS Eval — WER (Chino)                 | **0.54%** (mejor global)     |
			
 
				+| Seed-TTS Eval — WER (Inglés)                | **0.99%** (mejor global)     |
			
 
				+| Test de Turing de Audio (con instrucción)   | **0.515** media posterior    |
			
 
				+| EmergentTTS-Eval — Win Rate                 | **81.88%** (más alto global) |
			
 
				+| Fish Instruction Benchmark — TAR            | **93.3%**                    |
			
 
				+| Fish Instruction Benchmark — Calidad        | **4.51 / 5.0**               |
			
 
				+| Multilenguaje (MiniMax Testset) — Mejor WER | **11 de 24** idiomas         |
			
 
				+| Multilenguaje (MiniMax Testset) — Mejor SIM | **17 de 24** idiomas         |
			
 
				+
			
 
				+En Seed-TTS Eval, S2 logra el menor WER entre todos los modelos evaluados incluyendo sistemas cerrados: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). En el Test de Turing de Audio, 0.515 supera a Seed-TTS (0.417) en un 24% y a MiniMax-Speech (0.387) en un 33%. En EmergentTTS-Eval, S2 obtiene resultados particularmente fuertes en paralingüística (91.61% win rate), preguntas (84.41%) y complejidad sintáctica (83.39%).
			
 
				+
			
 
				+## Características Destacadas
			
 
				+
			
 
				+<img src="../assets/totalability.png" alt="Características destacadas de Fish Audio S2" width="200%">
			
 
				+
			
 
				+### Control Fino en Línea mediante Lenguaje Natural
			
 
				+
			
 
				+S2 permite control localizado sobre la generación de voz incrustando instrucciones en lenguaje natural directamente en posiciones específicas del texto. En lugar de depender de un conjunto fijo de etiquetas, S2 acepta descripciones libres como `[whisper in small voice]`, `[professional broadcast tone]` o `[pitch up]`, permitiendo control expresivo abierto a nivel de palabra.
			
 
				+
			
 
				+### Arquitectura Dual-Autoregresiva
			
 
				+
			
 
				+S2 se basa en un transformer solo-decoder combinado con un códec de audio basado en RVQ (10 codebooks, ~21 Hz). La arquitectura Dual-AR divide la generación en dos etapas:
			
 
				+
			
 
				+* **AR Lento** opera en el eje temporal y predice el codebook semántico principal.
			
 
				+* **AR Rápido** genera los 9 codebooks residuales restantes en cada paso temporal, reconstruyendo detalles acústicos finos.
			
 
				+
			
 
				+Este diseño asimétrico — 4B parámetros en el eje temporal y 400M en el eje de profundidad — mantiene la inferencia eficiente sin perder fidelidad.
			
 
				+
			
 
				+### Alineación con Aprendizaje por Refuerzo
			
 
				+
			
 
				+S2 utiliza Group Relative Policy Optimization (GRPO) para alineación post-entrenamiento. Los mismos modelos usados para filtrar y anotar datos se reutilizan como modelos de recompensa durante RL, eliminando desajustes de distribución. La señal de recompensa combina precisión semántica, cumplimiento de instrucciones, preferencia acústica y similitud de timbre.
			
 
				+
			
 
				+### Streaming en Producción con SGLang
			
 
				+
			
 
				+Debido a que la arquitectura Dual-AR es estructuralmente isomorfa a los LLM autoregresivos estándar, S2 hereda optimizaciones como batching continuo, caché KV paginado, CUDA graph replay y caching de prefijos basado en RadixAttention.
			
 
				+
			
 
				+En una sola GPU NVIDIA H200:
			
 
				+
			
 
				+* **RTF:** 0.195
			
 
				+* **Tiempo hasta primer audio:** ~100 ms
			
 
				+* **Throughput:** 3,000+ tokens acústicos/s manteniendo RTF < 0.5
			
 
				+
			
 
				+### Soporte Multilenguaje
			
 
				+
			
 
				+S2 soporta texto a voz multilenguaje de alta calidad sin requerir fonemas ni preprocesamiento específico del idioma. Incluye:
			
 
				+
			
 
				+**Inglés, Chino, Japonés, Coreano, Árabe, Alemán, Francés...**
			
 
				+
			
 
				+**¡Y MÁS!**
			
 
				+
			
 
				+La lista continúa expandiéndose, revisa [Fish Audio](https://fish.audio/) para las últimas novedades.
			
 
				+
			
 
				+### Generación Multi-Hablante Nativa
			
 
				+
			
 
				+<img src="../assets/chattemplate.png" alt="Generación multi-hablante nativa de Fish Audio S2" width="200%">
			
 
				+
			
 
				+Fish Audio S2 permite a los usuarios subir audio de referencia con múltiples hablantes; el modelo gestionará las características de cada uno mediante el token `<|speaker:i|>`. Luego puedes controlar el comportamiento del modelo con ese identificador, permitiendo múltiples voces en una sola generación. Ya no necesitas subir audio por separado para cada hablante.
			
 
				+
			
 
				+### Generación Multi-Turno
			
 
				+
			
 
				+Gracias a la ampliación del contexto del modelo, este ahora puede usar información previa para mejorar la expresividad del contenido generado y aumentar así su naturalidad.
			
 
				+
			
 
				+### Clonación de Voz Rápida
			
 
				+
			
 
				+Fish Audio S2 permite clonación de voz precisa usando una muestra corta (10–30 segundos). El modelo captura timbre, estilo y emoción, produciendo voces realistas y consistentes sin fine-tuning adicional.
			
 
				+
			
 
				+Por favor consulta [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md) para usar el servidor SGLang.
			
 
				+
			
 
				+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
			
 
				+
			
 
				+## Créditos
			
 
				+
			
 
				+* [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
			
 
				+* [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
			
 
				+* [GPT VITS](https://github.com/innnky/gpt-vits)
			
 
				+* [MQTTS](https://github.com/b04901014/MQTTS)
			
 
				+* [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
			
 
				+* [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
			
 
				+* [Qwen3](https://github.com/QwenLM/Qwen3)
			
 
				+
			
 
				+## Reporte Técnico
			
 
				+
			
 
				+```bibtex
			
 
				+@misc{fish-speech-v1.4,
			
 
				+      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},
			
 
				+      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},
			
 
				+      year={2024},
			
 
				+      eprint={2411.01156},
			
 
				+      archivePrefix={arXiv},
			
 
				+      primaryClass={cs.SD},
			
 
				+      url={https://arxiv.org/abs/2411.01156},
			
 
				+}
			
 
				+
			
 
				+@misc{liao2026fishaudios2technical,
			
 
				+      title={Fish Audio S2 Technical Report}, 
			
 
				+      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},
			
 
				+      year={2026},
			
 
				+      eprint={2603.08823},
			
 
				+      archivePrefix={arXiv},
			
 
				+      primaryClass={cs.SD},
			
 
				+      url={https://arxiv.org/abs/2603.08823}, 
			
 
				+}
			
 
				+```
			
--- a/docs/es/inference.md
+++ b/docs/es/inference.md
@@ -0,0 +1,58 @@
 
				+# Inferencia
			
 
				+
			
 
				+El modelo Fish Audio S2 requiere una gran cantidad de VRAM. Recomendamos usar una GPU con al menos 24GB para la inferencia.
			
 
				+
			
 
				+## Descargar Pesos
			
 
				+
			
 
				+Primero, necesitas descargar los pesos del modelo:
			
 
				+
			
 
				+```bash
			
 
				+hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
			
 
				+```
			
 
				+
			
 
				+## Inferencia por Línea de Comandos
			
 
				+
			
 
				+!!! note
			
 
				+    Si planeas dejar que el modelo elija aleatoriamente el timbre de voz, puedes omitir este paso.
			
 
				+
			
 
				+### 1. Obtener tokens VQ a partir de audio de referencia
			
 
				+
			
 
				+```bash
			
 
				+python fish_speech/models/dac/inference.py \
			
 
				+    -i "test.wav" \
			
 
				+    --checkpoint-path "checkpoints/s2-pro/codec.pth"
			
 
				+```
			
 
				+
			
 
				+Deberías obtener un `fake.npy` y un `fake.wav`.
			
 
				+
			
 
				+### 2. Generar tokens semánticos a partir de texto:
			
 
				+
			
 
				+```bash
			
 
				+python fish_speech/models/text2semantic/inference.py \
			
 
				+    --text "El texto que quieres convertir" \
			
 
				+    --prompt-text "Tu texto de referencia" \
			
 
				+    --prompt-tokens "fake.npy" \
			
 
				+    # --compile
			
 
				+```
			
 
				+
			
 
				+Este comando creará un archivo `codes_N.npy` en el directorio de trabajo, donde N es un entero que comienza desde 0.
			
 
				+
			
 
				+!!! note
			
 
				+    Puede que quieras usar `--compile` para fusionar kernels CUDA y acelerar la inferencia. Sin embargo, recomendamos usar nuestra optimización de aceleración de inferencia con sglang.
			
 
				+    Correspondientemente, si no planeas usar aceleración, puedes comentar el parámetro `--compile`.
			
 
				+
			
 
				+!!! info
			
 
				+    Para GPUs que no soportan bf16, puede que necesites usar el parámetro `--half`.
			
 
				+
			
 
				+### 3. Generar audio a partir de tokens semánticos:
			
 
				+
			
 
				+```bash
			
 
				+python fish_speech/models/dac/inference.py \
			
 
				+    -i "codes_0.npy"
			
 
				+```
			
 
				+
			
 
				+Después de eso, obtendrás un archivo `fake.wav`.
			
 
				+
			
 
				+## Inferencia con WebUI
			
 
				+
			
 
				+Próximamente.
			
--- a/docs/es/install.md
+++ b/docs/es/install.md
@@ -0,0 +1,192 @@
 
				+## Requisitos
			
 
				+
			
 
				+* Memoria GPU: 24GB (Inferencia)
			
 
				+* Sistema: Linux, WSL
			
 
				+
			
 
				+## Configuración del sistema
			
 
				+
			
 
				+Fish Audio S2 admite múltiples métodos de instalación. Elige el que mejor se adapte a tu entorno de desarrollo.
			
 
				+
			
 
				+**Prerrequisitos**: Instala las dependencias del sistema para el procesamiento de audio:
			
 
				+
			
 
				+```bash
			
 
				+apt install portaudio19-dev libsox-dev ffmpeg
			
 
				+```
			
 
				+
			
 
				+### Conda
			
 
				+
			
 
				+```bash
			
 
				+conda create -n fish-speech python=3.12
			
 
				+conda activate fish-speech
			
 
				+
			
 
				+# Instalación con GPU (elige tu versión de CUDA: cu126, cu128, cu129)
			
 
				+pip install -e .[cu129]
			
 
				+
			
 
				+# Instalación solo CPU
			
 
				+pip install -e .[cpu]
			
 
				+
			
 
				+# Instalación por defecto (usa el índice por defecto de PyTorch)
			
 
				+pip install -e .
			
 
				+
			
 
				+# Si encuentras un error durante la instalación debido a pyaudio, considera usar el siguiente comando:
			
 
				+# conda install pyaudio
			
 
				+# Luego ejecuta pip install -e . nuevamente
			
 
				+```
			
 
				+
			
 
				+### UV
			
 
				+
			
 
				+UV proporciona una resolución de dependencias e instalación más rápida:
			
 
				+
			
 
				+```bash
			
 
				+# Instalación con GPU (elige tu versión de CUDA: cu126, cu128, cu129)
			
 
				+uv sync --python 3.12 --extra cu129
			
 
				+
			
 
				+# Instalación solo CPU
			
 
				+uv sync --python 3.12 --extra cpu
			
 
				+```
			
 
				+
			
 
				+### Soporte para Intel Arc XPU
			
 
				+
			
 
				+Para usuarios de GPU Intel Arc, instala con soporte XPU:
			
 
				+
			
 
				+```bash
			
 
				+conda create -n fish-speech python=3.12
			
 
				+conda activate fish-speech
			
 
				+
			
 
				+# Instalar la biblioteca estándar de C++ requerida
			
 
				+conda install libstdcxx -c conda-forge
			
 
				+
			
 
				+# Instalar PyTorch con soporte Intel XPU
			
 
				+pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu
			
 
				+
			
 
				+# Instalar Fish Speech
			
 
				+pip install -e .
			
 
				+```
			
 
				+
			
 
				+!!! warning
			
 
				+    La opción `compile` no es compatible con Windows ni macOS. Si quieres ejecutar con compile, necesitas instalar Triton manualmente.
			
 
				+
			
 
				+## Configuración con Docker
			
 
				+
			
 
				+El modelo de la serie Fish Audio S2 ofrece múltiples opciones de despliegue con Docker para adaptarse a diferentes necesidades. Puedes usar imágenes preconstruidas desde Docker Hub, construir localmente con Docker Compose o crear imágenes personalizadas manualmente.
			
 
				+
			
 
				+Proporcionamos imágenes Docker tanto para WebUI como para el servidor API, en variantes para GPU (CUDA 12.6 por defecto) y CPU. Si quieres construir localmente, sigue las instrucciones a continuación. Si solo quieres usar imágenes preconstruidas, sigue la [guía de inferencia](inference.md).
			
 
				+
			
 
				+### Prerrequisitos
			
 
				+
			
 
				+* Docker y Docker Compose instalados
			
 
				+* Runtime de NVIDIA Docker (para soporte GPU)
			
 
				+* Al menos 24GB de memoria GPU para inferencia con CUDA
			
 
				+
			
 
				+### Usar Docker Compose
			
 
				+
			
 
				+Para desarrollo o personalización, puedes usar Docker Compose para construir y ejecutar localmente:
			
 
				+
			
 
				+```bash
			
 
				+# Clonar el repositorio primero
			
 
				+git clone https://github.com/fishaudio/fish-speech.git
			
 
				+cd fish-speech
			
 
				+
			
 
				+# Iniciar WebUI con CUDA
			
 
				+docker compose --profile webui up
			
 
				+
			
 
				+# Iniciar WebUI con optimización compile
			
 
				+COMPILE=1 docker compose --profile webui up
			
 
				+
			
 
				+# Iniciar servidor API
			
 
				+docker compose --profile server up
			
 
				+
			
 
				+# Iniciar servidor API con optimización compile  
			
 
				+COMPILE=1 docker compose --profile server up
			
 
				+
			
 
				+# Para despliegue solo CPU
			
 
				+BACKEND=cpu docker compose --profile webui up
			
 
				+```
			
 
				+
			
 
				+#### Variables de entorno para Docker Compose
			
 
				+
			
 
				+Puedes personalizar el despliegue usando variables de entorno:
			
 
				+
			
 
				+```bash
			
 
				+# Ejemplo de archivo .env
			
 
				+BACKEND=cuda              # o cpu
			
 
				+COMPILE=1                 # Habilitar optimización compile
			
 
				+GRADIO_PORT=7860         # Puerto de WebUI
			
 
				+API_PORT=8080            # Puerto del servidor API
			
 
				+UV_VERSION=0.8.15        # Versión del gestor de paquetes UV
			
 
				+CUDA_VER=12.9.0          # Versión base de imagen CUDA (ej. 12.6.0 para drivers más antiguos)
			
 
				+UV_EXTRA=cu129           # Variante CUDA de PyTorch (cu126, cu128, cu129) — debe coincidir con CUDA_VER
			
 
				+```
			
 
				+
			
 
				+El comando construirá la imagen y ejecutará el contenedor. Puedes acceder a la WebUI en `http://localhost:7860` y al servidor API en `http://localhost:8080`.
			
 
				+
			
 
				+### Construcción manual con Docker
			
 
				+
			
 
				+Para usuarios avanzados que quieran personalizar el proceso de build:
			
 
				+
			
 
				+```bash
			
 
				+# Construir imagen WebUI con soporte CUDA
			
 
				+docker build \
			
 
				+    --platform linux/amd64 \
			
 
				+    -f docker/Dockerfile \
			
 
				+    --build-arg BACKEND=cuda \
			
 
				+    --build-arg CUDA_VER=12.9.0 \
			
 
				+    --build-arg UV_EXTRA=cu129 \
			
 
				+    --target webui \
			
 
				+    -t fish-speech-webui:cuda .
			
 
				+
			
 
				+# Construir imagen del servidor API con soporte CUDA
			
 
				+docker build \
			
 
				+    --platform linux/amd64 \
			
 
				+    -f docker/Dockerfile \
			
 
				+    --build-arg BACKEND=cuda \
			
 
				+    --build-arg CUDA_VER=12.9.0 \
			
 
				+    --build-arg UV_EXTRA=cu129 \
			
 
				+    --target server \
			
 
				+    -t fish-speech-server:cuda .
			
 
				+
			
 
				+# Construir imágenes solo CPU (soporta múltiples plataformas)
			
 
				+docker build \
			
 
				+    --platform linux/amd64,linux/arm64 \
			
 
				+    -f docker/Dockerfile \
			
 
				+    --build-arg BACKEND=cpu \
			
 
				+    --target webui \
			
 
				+    -t fish-speech-webui:cpu .
			
 
				+
			
 
				+# Construir imagen de desarrollo
			
 
				+docker build \
			
 
				+    --platform linux/amd64 \
			
 
				+    -f docker/Dockerfile \
			
 
				+    --build-arg BACKEND=cuda \
			
 
				+    --target dev \
			
 
				+    -t fish-speech-dev:cuda .
			
 
				+```
			
 
				+
			
 
				+#### Argumentos de build
			
 
				+
			
 
				+* `BACKEND`: `cuda` o `cpu` (por defecto: `cuda`)
			
 
				+* `CUDA_VER`: versión de CUDA (por defecto: `12.6.0`)
			
 
				+* `UV_EXTRA`: extra de UV para CUDA (por defecto: `cu126`)
			
 
				+* `UBUNTU_VER`: versión de Ubuntu (por defecto: `24.04`)
			
 
				+* `PY_VER`: versión de Python (por defecto: `3.12`)
			
 
				+
			
 
				+### Montajes de volumen
			
 
				+
			
 
				+Ambos métodos requieren montar estos directorios:
			
 
				+
			
 
				+* `./checkpoints:/app/checkpoints` - Directorio de pesos del modelo
			
 
				+* `./references:/app/references` - Directorio de archivos de audio de referencia
			
 
				+
			
 
				+### Variables de entorno
			
 
				+
			
 
				+* `COMPILE=1` - Habilitar torch.compile para inferencia más rápida (~10x de mejora)
			
 
				+* `GRADIO_SERVER_NAME=0.0.0.0` - Host del servidor WebUI
			
 
				+* `GRADIO_SERVER_PORT=7860` - Puerto del servidor WebUI
			
 
				+* `API_SERVER_NAME=0.0.0.0` - Host del servidor API
			
 
				+* `API_SERVER_PORT=8080` - Puerto del servidor API
			
 
				+
			
 
				+!!! note
			
 
				+    Los contenedores Docker esperan que los pesos del modelo estén montados en `/app/checkpoints`. Asegúrate de descargar los pesos del modelo necesarios antes de iniciar los contenedores.
			
 
				+
			
 
				+!!! warning
			
 
				+    El soporte GPU requiere el runtime de NVIDIA Docker. Para despliegues solo CPU, elimina el flag `--gpus all` y usa imágenes de CPU.
			
--- a/docs/es/server.md
+++ b/docs/es/server.md
@@ -0,0 +1,80 @@
 
				+# Servidor
			
 
				+
			
 
				+Esta página cubre la inferencia del lado del servidor para Fish Audio S2, además de enlaces rápidos para la inferencia con WebUI y el despliegue con Docker.
			
 
				+
			
 
				+## Inferencia del servidor API
			
 
				+
			
 
				+Fish Speech proporciona un punto de entrada de servidor HTTP en `tools/api_server.py`.
			
 
				+
			
 
				+### Iniciar el servidor localmente
			
 
				+
			
 
				+```bash
			
 
				+python tools/api_server.py \
			
 
				+  --llama-checkpoint-path checkpoints/s2-pro \
			
 
				+  --decoder-checkpoint-path checkpoints/s2-pro/codec.pth \
			
 
				+  --listen 0.0.0.0:8080
			
 
				+```
			
 
				+
			
 
				+Opciones comunes:
			
 
				+
			
 
				+* `--compile`: habilitar optimización con `torch.compile`
			
 
				+* `--half`: usar modo fp16
			
 
				+* `--api-key`: requerir autenticación mediante bearer token
			
 
				+* `--workers`: establecer la cantidad de procesos worker
			
 
				+
			
 
				+### Verificación de estado (health check)
			
 
				+
			
 
				+```bash
			
 
				+curl -X GET http://127.0.0.1:8080/v1/health
			
 
				+```
			
 
				+
			
 
				+Respuesta esperada:
			
 
				+
			
 
				+```json
			
 
				+{"status":"ok"}
			
 
				+```
			
 
				+
			
 
				+### Endpoint principal de la API
			
 
				+
			
 
				+* `POST /v1/tts` para generación de texto a voz (text-to-speech)
			
 
				+* `POST /v1/vqgan/encode` para codificación VQ
			
 
				+* `POST /v1/vqgan/decode` para decodificación VQ
			
 
				+
			
 
				+### Ejemplo de cliente en Python
			
 
				+
			
 
				+El modelo base de TTS se selecciona al iniciar el servidor. En el ejemplo anterior, el servidor se inicia con los pesos `checkpoints/s2-pro`, por lo que cada request enviada a `http://127.0.0.1:8080/v1/tts` usará **S2-Pro** automáticamente. No existe un campo `model` por request en `tools/api_client.py` para llamadas al servidor local.
			
 
				+
			
 
				+```bash
			
 
				+python tools/api_client.py \
			
 
				+  --url http://127.0.0.1:8080/v1/tts \
			
 
				+  --text "Hello from Fish Speech" \
			
 
				+  --output s2-pro-demo
			
 
				+```
			
 
				+
			
 
				+Si quieres seleccionar una voz de referencia guardada, usa `--reference_id`. Esto elige la **referencia de voz**, no el modelo base TTS:
			
 
				+
			
 
				+```bash
			
 
				+python tools/api_client.py \
			
 
				+  --url http://127.0.0.1:8080/v1/tts \
			
 
				+  --text "Hello from Fish Speech" \
			
 
				+  --reference_id my-speaker \
			
 
				+  --output s2-pro-demo
			
 
				+```
			
 
				+
			
 
				+## Inferencia con WebUI
			
 
				+
			
 
				+Para uso con WebUI, ver:
			
 
				+
			
 
				+* [WebUI Inference](https://speech.fish.audio/inference/#webui-inference)
			
 
				+
			
 
				+## Docker
			
 
				+
			
 
				+Para despliegue del servidor o WebUI basado en Docker, ver:
			
 
				+
			
 
				+* [Docker Setup](https://speech.fish.audio/install/#docker-setup)
			
 
				+
			
 
				+También puedes iniciar directamente el perfil del servidor con Docker Compose:
			
 
				+
			
 
				+```bash
			
 
				+docker compose --profile server up
			
 
				+```
			
--- a/docs/ja/index.md
+++ b/docs/ja/index.md
@@ -1,7 +1,7 @@
 
				 <div align="center">
			
 
				 <h1>Fish Speech</h1>
			
 
				 
			
 
				-<p><a href="../en/">English</a> | <a href="../zh/">简体中文</a> | <a href="../pt/">Portuguese</a> | <strong>日本語</strong> | <a href="../ko/">한국어</a> | <a href="../ar/">العربية</a></p>
			
 
				+<p><a href="../en/">English</a> | <a href="../zh/">简体中文</a> | <a href="../pt/">Portuguese</a> | <strong>日本語</strong> | <a href="../ko/">한국어</a> | <a href="../ar/">العربية</a> | <a href="../es/">Español</a></p>
			
 
				 
			
 
				 <a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
			
 
				 <a href="https://trendshift.io/repositories/7014" target="_blank">
			
--- a/docs/ko/index.md
+++ b/docs/ko/index.md
@@ -1,7 +1,7 @@
 
				 <div align="center">
			
 
				 <h1>Fish Speech</h1>
			
 
				 
			
 
				-<p><a href="../en/">English</a> | <a href="../zh/">简体中文</a> | <a href="../pt/">Portuguese</a> | <a href="../ja/">日本語</a> | <strong>한국어</strong> | <a href="../ar/">العربية</a></p>
			
 
				+<p><a href="../en/">English</a> | <a href="../zh/">简体中文</a> | <a href="../pt/">Portuguese</a> | <a href="../ja/">日本語</a> | <strong>한국어</strong> | <a href="../ar/">العربية</a> | <a href="../es/">Español</a></p>
			
 
				 
			
 
				 <a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
			
 
				 <a href="https://trendshift.io/repositories/7014" target="_blank">
			
--- a/docs/pt/index.md
+++ b/docs/pt/index.md
@@ -1,7 +1,7 @@
 
				 <div align="center">
			
 
				 <h1>Fish Speech</h1>
			
 
				 
			
 
				-<p><a href="../en/">English</a> | <a href="../zh/">简体中文</a> | <strong>Portuguese</strong> | <a href="../ja/">日本語</a> | <a href="../ko/">한국어</a> | <a href="../ar/">العربية</a></p>
			
 
				+<p><a href="../en/">English</a> | <a href="../zh/">简体中文</a> | <strong>Portuguese</strong> | <a href="../ja/">日本語</a> | <a href="../ko/">한국어</a> | <a href="../ar/">العربية</a> | <a href="../es/">Español</a></p>
			
 
				 
			
 
				 <a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
			
 
				 <a href="https://trendshift.io/repositories/7014" target="_blank">
			
--- a/docs/zh/index.md
+++ b/docs/zh/index.md
@@ -1,7 +1,7 @@
 
				 <div align="center">
			
 
				 <h1>Fish Speech</h1>
			
 
				 
			
 
				-<p><a href="../en/">English</a> | <strong>简体中文</strong> | <a href="../pt/">Portuguese</a> | <a href="../ja/">日本語</a> | <a href="../ko/">한국어</a> | <a href="../ar/">العربية</a></p>
			
 
				+<p><a href="../en/">English</a> | <strong>简体中文</strong> | <a href="../pt/">Portuguese</a> | <a href="../ja/">日本語</a> | <a href="../ko/">한국어</a> | <a href="../ar/">العربية</a> | <a href="../es/">Español</a></p>
			
 
				 
			
 
				 <a href="https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710" alt="Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
			
 
				 <a href="https://trendshift.io/repositories/7014" target="_blank">