index.html 48 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328
  1. <!doctype html>
  2. <html lang="en" class="no-js">
  3. <head>
  4. <meta charset="utf-8">
  5. <meta name="viewport" content="width=device-width,initial-scale=1">
  6. <meta name="description" content="Targeting SOTA TTS solutions.">
  7. <link rel="canonical" href="https://speech.fish.audio/">
  8. <link rel="next" href="install/">
  9. <!-- Alternate-language URLs are absolute, on the same origin as the canonical link. --><link rel="alternate" href="https://speech.fish.audio/" hreflang="en">
  10. <link rel="alternate" href="https://speech.fish.audio/zh/" hreflang="zh">
  11. <link rel="alternate" href="https://speech.fish.audio/ja/" hreflang="ja">
  12. <link rel="alternate" href="https://speech.fish.audio/pt/" hreflang="pt">
  13. <link rel="alternate" href="https://speech.fish.audio/ko/" hreflang="ko">
  14. <link rel="alternate" href="https://speech.fish.audio/ar/" hreflang="ar">
  15. <link rel="icon" href="assets/logo.svg">
  16. <meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.1">
  17. <title>Fish Audio</title>
  18. <link rel="stylesheet" href="assets/stylesheets/main.484c7ddc.min.css">
  19. <link rel="stylesheet" href="assets/stylesheets/palette.ab4e12ef.min.css">
  20. <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  21. <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
  22. <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
  23. <link rel="stylesheet" href="stylesheets/extra.css">
  24. <script>__md_scope=new URL(".",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
  25. </head>
  26. <body dir="ltr" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo">
  27. <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
  28. <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
  29. <label class="md-overlay" for="__drawer"></label>
  30. <div data-md-component="skip">
  31. <a href="#quick-start" class="md-skip">
  32. Skip to content
  33. </a>
  34. </div>
  35. <div data-md-component="announce">
  36. </div>
  37. <header class="md-header md-header--shadow" data-md-component="header">
  38. <nav class="md-header__inner md-grid" aria-label="Header">
  39. <a href="https://speech.fish.audio" title="Fish Audio" class="md-header__button md-logo" aria-label="Fish Audio" data-md-component="logo">
  40. <img src="assets/logo.svg" alt="logo">
  41. </a>
  42. <label class="md-header__button md-icon" for="__drawer">
  43. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
  44. </label>
  45. <div class="md-header__title" data-md-component="header-title">
  46. <div class="md-header__ellipsis">
  47. <div class="md-header__topic">
  48. <span class="md-ellipsis">
  49. Fish Audio
  50. </span>
  51. </div>
  52. <div class="md-header__topic" data-md-component="header-topic">
  53. <span class="md-ellipsis">
  54. Introduction
  55. </span>
  56. </div>
  57. </div>
  58. </div>
  59. <form class="md-header__option" data-md-component="palette">
  60. <input class="md-option" data-md-color-media="(prefers-color-scheme)" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo" aria-label="Switch to light mode" type="radio" name="__palette" id="__palette_0">
  61. <label class="md-header__button md-icon" title="Switch to light mode" for="__palette_1" hidden>
  62. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m14.3 16-.7-2h-3.2l-.7 2H7.8L11 7h2l3.2 9zM20 8.69V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12zm-9.15 3.96h2.3L12 9z"/></svg>
  63. </label>
  64. <input class="md-option" data-md-color-media="(prefers-color-scheme: light)" data-md-color-scheme="default" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_1">
  65. <label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_2" hidden>
  66. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
  67. </label>
  68. <input class="md-option" data-md-color-media="(prefers-color-scheme: dark)" data-md-color-scheme="slate" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to light mode" type="radio" name="__palette" id="__palette_2">
  69. <label class="md-header__button md-icon" title="Switch to light mode" for="__palette_0" hidden>
  70. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
  71. </label>
  72. </form>
  73. <script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
  74. <div class="md-header__option">
  75. <div class="md-select">
  76. <button class="md-header__button md-icon" aria-label="Select language">
  77. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m12.87 15.07-2.54-2.51.03-.03A17.5 17.5 0 0 0 14.07 6H17V4h-7V2H8v2H1v2h11.17C11.5 7.92 10.44 9.75 9 11.35 8.07 10.32 7.3 9.19 6.69 8h-2c.73 1.63 1.73 3.17 2.98 4.56l-5.09 5.02L4 19l5-5 3.11 3.11zM18.5 10h-2L12 22h2l1.12-3h4.75L21 22h2zm-2.62 7 1.62-4.33L19.12 17z"/></svg>
  78. </button>
  79. <div class="md-select__inner">
  80. <ul class="md-select__list"><!-- Language picker: hreflang describes the target page, lang describes the link text itself. -->
  81. <li class="md-select__item">
  82. <a href="/" hreflang="en" lang="en" class="md-select__link">
  83. English
  84. </a>
  85. </li>
  86. <li class="md-select__item">
  87. <a href="/zh/" hreflang="zh" lang="zh" class="md-select__link">
  88. 简体中文
  89. </a>
  90. </li>
  91. <li class="md-select__item">
  92. <a href="/ja/" hreflang="ja" lang="ja" class="md-select__link">
  93. 日本語
  94. </a>
  95. </li>
  96. <li class="md-select__item">
  97. <a href="/pt/" hreflang="pt" lang="pt" class="md-select__link">
  98. Português (Brasil)
  99. </a>
  100. </li>
  101. <li class="md-select__item">
  102. <a href="/ko/" hreflang="ko" lang="ko" class="md-select__link">
  103. 한국어
  104. </a>
  105. </li>
  106. <li class="md-select__item">
  107. <a href="/ar/" hreflang="ar" lang="ar" class="md-select__link">
  108. العربية
  109. </a>
  110. </li>
  111. </ul>
  112. </div>
  113. </div>
  114. </div>
  115. <label class="md-header__button md-icon" for="__search">
  116. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
  117. </label>
  118. <div class="md-search" data-md-component="search" role="dialog">
  119. <label class="md-search__overlay" for="__search"></label>
  120. <div class="md-search__inner" role="search">
  121. <form class="md-search__form" name="search">
  122. <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
  123. <label class="md-search__icon md-icon" for="__search">
  124. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
  125. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
  126. </label>
  127. <nav class="md-search__options" aria-label="Search">
  128. <a href="javascript:void(0)" class="md-search__icon md-icon" title="Share" aria-label="Share" data-clipboard data-clipboard-text="" data-md-component="search-share" tabindex="-1">
  129. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M18 16.08c-.76 0-1.44.3-1.96.77L8.91 12.7c.05-.23.09-.46.09-.7s-.04-.47-.09-.7l7.05-4.11c.54.5 1.25.81 2.04.81a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3c0 .24.04.47.09.7L8.04 9.81C7.5 9.31 6.79 9 6 9a3 3 0 0 0-3 3 3 3 0 0 0 3 3c.79 0 1.5-.31 2.04-.81l7.12 4.15c-.05.21-.08.43-.08.66 0 1.61 1.31 2.91 2.92 2.91s2.92-1.3 2.92-2.91A2.92 2.92 0 0 0 18 16.08"/></svg>
  130. </a>
  131. <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
  132. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
  133. </button>
  134. </nav>
  135. <div class="md-search__suggest" data-md-component="search-suggest"></div>
  136. </form>
  137. <div class="md-search__output">
  138. <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
  139. <div class="md-search-result" data-md-component="search-result">
  140. <div class="md-search-result__meta">
  141. Initializing search
  142. </div>
  143. <ol class="md-search-result__list" role="presentation"></ol>
  144. </div>
  145. </div>
  146. </div>
  147. </div>
  148. </div>
  149. <div class="md-header__source">
  150. <a href="https://github.com/fishaudio/fish-speech" title="Go to repository" class="md-source" data-md-component="source">
  151. <div class="md-source__icon md-icon">
  152. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
  153. </div>
  154. <div class="md-source__repository">
  155. fishaudio/fish-speech
  156. </div>
  157. </a>
  158. </div>
  159. </nav>
  160. </header>
  161. <div class="md-container" data-md-component="container">
  162. <main class="md-main" data-md-component="main">
  163. <div class="md-main__inner md-grid">
  164. <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
  165. <div class="md-sidebar__scrollwrap">
  166. <div class="md-sidebar__inner">
  167. <nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
  168. <label class="md-nav__title" for="__drawer">
  169. <a href="https://speech.fish.audio" title="Fish Audio" class="md-nav__button md-logo" aria-label="Fish Audio" data-md-component="logo">
  170. <img src="assets/logo.svg" alt="logo">
  171. </a>
  172. Fish Audio
  173. </label>
  174. <div class="md-nav__source">
  175. <a href="https://github.com/fishaudio/fish-speech" title="Go to repository" class="md-source" data-md-component="source">
  176. <div class="md-source__icon md-icon">
  177. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
  178. </div>
  179. <div class="md-source__repository">
  180. fishaudio/fish-speech
  181. </div>
  182. </a>
  183. </div>
  184. <ul class="md-nav__list" data-md-scrollfix>
  185. <li class="md-nav__item md-nav__item--active">
  186. <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
  187. <label class="md-nav__link md-nav__link--active" for="__toc">
  188. <span class="md-ellipsis">
  189. Introduction
  190. </span>
  191. <span class="md-nav__icon md-icon"></span>
  192. </label>
  193. <a href="." class="md-nav__link md-nav__link--active">
  194. <span class="md-ellipsis">
  195. Introduction
  196. </span>
  197. </a>
  198. <nav class="md-nav md-nav--secondary" aria-label="Table of contents">
  199. <label class="md-nav__title" for="__toc">
  200. <span class="md-nav__icon md-icon"></span>
  201. Table of contents
  202. </label>
  203. <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
  204. <li class="md-nav__item">
  205. <a href="#quick-start" class="md-nav__link">
  206. <span class="md-ellipsis">
  207. Quick Start
  208. </span>
  209. </a>
  210. <nav class="md-nav" aria-label="Quick Start">
  211. <ul class="md-nav__list">
  212. <li class="md-nav__item">
  213. <a href="#for-human" class="md-nav__link">
  214. <span class="md-ellipsis">
  215. For Human
  216. </span>
  217. </a>
  218. </li>
  219. <li class="md-nav__item">
  220. <a href="#for-llm-agent" class="md-nav__link">
  221. <span class="md-ellipsis">
  222. For LLM Agent
  223. </span>
  224. </a>
  225. </li>
  226. </ul>
  227. </nav>
  228. </li>
  229. <li class="md-nav__item">
  230. <a href="#fish-audio-s2" class="md-nav__link">
  231. <span class="md-ellipsis">
  232. Fish Audio S2
  233. </span>
  234. </a>
  235. <nav class="md-nav" aria-label="Fish Audio S2">
  236. <ul class="md-nav__list">
  237. <li class="md-nav__item">
  238. <a href="#model-variants" class="md-nav__link">
  239. <span class="md-ellipsis">
  240. Model Variants
  241. </span>
  242. </a>
  243. </li>
  244. </ul>
  245. </nav>
  246. </li>
  247. <li class="md-nav__item">
  248. <a href="#benchmark-results" class="md-nav__link">
  249. <span class="md-ellipsis">
  250. Benchmark Results
  251. </span>
  252. </a>
  253. </li>
  254. <li class="md-nav__item">
  255. <a href="#highlights" class="md-nav__link">
  256. <span class="md-ellipsis">
  257. Highlights
  258. </span>
  259. </a>
  260. <nav class="md-nav" aria-label="Highlights">
  261. <ul class="md-nav__list">
  262. <li class="md-nav__item">
  263. <a href="#fine-grained-inline-control-via-natural-language" class="md-nav__link">
  264. <span class="md-ellipsis">
  265. Fine-Grained Inline Control via Natural Language
  266. </span>
  267. </a>
  268. </li>
  269. <li class="md-nav__item">
  270. <a href="#dual-autoregressive-architecture" class="md-nav__link">
  271. <span class="md-ellipsis">
  272. Dual-Autoregressive Architecture
  273. </span>
  274. </a>
  275. </li>
  276. <li class="md-nav__item">
  277. <a href="#reinforcement-learning-alignment" class="md-nav__link">
  278. <span class="md-ellipsis">
  279. Reinforcement Learning Alignment
  280. </span>
  281. </a>
  282. </li>
  283. <li class="md-nav__item">
  284. <a href="#production-streaming-via-sglang" class="md-nav__link">
  285. <span class="md-ellipsis">
  286. Production Streaming via SGLang
  287. </span>
  288. </a>
  289. </li>
  290. <li class="md-nav__item">
  291. <a href="#multilingual-support" class="md-nav__link">
  292. <span class="md-ellipsis">
  293. Multilingual Support
  294. </span>
  295. </a>
  296. </li>
  297. <li class="md-nav__item">
  298. <a href="#native-multi-speaker-generation" class="md-nav__link">
  299. <span class="md-ellipsis">
  300. Native Multi-Speaker Generation
  301. </span>
  302. </a>
  303. </li>
  304. <li class="md-nav__item">
  305. <a href="#multi-turn-generation" class="md-nav__link">
  306. <span class="md-ellipsis">
  307. Multi-Turn Generation
  308. </span>
  309. </a>
  310. </li>
  311. <li class="md-nav__item">
  312. <a href="#rapid-voice-cloning" class="md-nav__link">
  313. <span class="md-ellipsis">
  314. Rapid Voice Cloning
  315. </span>
  316. </a>
  317. </li>
  318. </ul>
  319. </nav>
  320. </li>
  321. <li class="md-nav__item">
  322. <a href="#credits" class="md-nav__link">
  323. <span class="md-ellipsis">
  324. Credits
  325. </span>
  326. </a>
  327. </li>
  328. <li class="md-nav__item">
  329. <a href="#tech-report" class="md-nav__link">
  330. <span class="md-ellipsis">
  331. Tech Report
  332. </span>
  333. </a>
  334. </li>
  335. </ul>
  336. </nav>
  337. </li>
  338. <li class="md-nav__item">
  339. <a href="install/" class="md-nav__link">
  340. <span class="md-ellipsis">
  341. Installation
  342. </span>
  343. </a>
  344. </li>
  345. <li class="md-nav__item">
  346. <a href="finetune/" class="md-nav__link">
  347. <span class="md-ellipsis">
  348. Finetune
  349. </span>
  350. </a>
  351. </li>
  352. <li class="md-nav__item">
  353. <a href="inference/" class="md-nav__link">
  354. <span class="md-ellipsis">
  355. Inference
  356. </span>
  357. </a>
  358. </li>
  359. <li class="md-nav__item">
  360. <a href="server/" class="md-nav__link">
  361. <span class="md-ellipsis">
  362. Server
  363. </span>
  364. </a>
  365. </li>
  366. <li class="md-nav__item"><!-- Links to the built page directory, like the sibling nav items, not the Markdown source. -->
  367. <a href="samples/" class="md-nav__link">
  368. <span class="md-ellipsis">
  369. Samples
  370. </span>
  371. </a>
  372. </li>
  373. </ul>
  374. </nav>
  375. </div>
  376. </div>
  377. </div>
  378. <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
  379. <div class="md-sidebar__scrollwrap">
  380. <div class="md-sidebar__inner">
  381. <nav class="md-nav md-nav--secondary" aria-label="Table of contents">
  382. <label class="md-nav__title" for="__toc">
  383. <span class="md-nav__icon md-icon"></span>
  384. Table of contents
  385. </label>
  386. <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
  387. <li class="md-nav__item">
  388. <a href="#quick-start" class="md-nav__link">
  389. <span class="md-ellipsis">
  390. Quick Start
  391. </span>
  392. </a>
  393. <nav class="md-nav" aria-label="Quick Start">
  394. <ul class="md-nav__list">
  395. <li class="md-nav__item">
  396. <a href="#for-human" class="md-nav__link">
  397. <span class="md-ellipsis">
  398. For Human
  399. </span>
  400. </a>
  401. </li>
  402. <li class="md-nav__item">
  403. <a href="#for-llm-agent" class="md-nav__link">
  404. <span class="md-ellipsis">
  405. For LLM Agent
  406. </span>
  407. </a>
  408. </li>
  409. </ul>
  410. </nav>
  411. </li>
  412. <li class="md-nav__item">
  413. <a href="#fish-audio-s2" class="md-nav__link">
  414. <span class="md-ellipsis">
  415. Fish Audio S2
  416. </span>
  417. </a>
  418. <nav class="md-nav" aria-label="Fish Audio S2">
  419. <ul class="md-nav__list">
  420. <li class="md-nav__item">
  421. <a href="#model-variants" class="md-nav__link">
  422. <span class="md-ellipsis">
  423. Model Variants
  424. </span>
  425. </a>
  426. </li>
  427. </ul>
  428. </nav>
  429. </li>
  430. <li class="md-nav__item">
  431. <a href="#benchmark-results" class="md-nav__link">
  432. <span class="md-ellipsis">
  433. Benchmark Results
  434. </span>
  435. </a>
  436. </li>
  437. <li class="md-nav__item">
  438. <a href="#highlights" class="md-nav__link">
  439. <span class="md-ellipsis">
  440. Highlights
  441. </span>
  442. </a>
  443. <nav class="md-nav" aria-label="Highlights">
  444. <ul class="md-nav__list">
  445. <li class="md-nav__item">
  446. <a href="#fine-grained-inline-control-via-natural-language" class="md-nav__link">
  447. <span class="md-ellipsis">
  448. Fine-Grained Inline Control via Natural Language
  449. </span>
  450. </a>
  451. </li>
  452. <li class="md-nav__item">
  453. <a href="#dual-autoregressive-architecture" class="md-nav__link">
  454. <span class="md-ellipsis">
  455. Dual-Autoregressive Architecture
  456. </span>
  457. </a>
  458. </li>
  459. <li class="md-nav__item">
  460. <a href="#reinforcement-learning-alignment" class="md-nav__link">
  461. <span class="md-ellipsis">
  462. Reinforcement Learning Alignment
  463. </span>
  464. </a>
  465. </li>
  466. <li class="md-nav__item">
  467. <a href="#production-streaming-via-sglang" class="md-nav__link">
  468. <span class="md-ellipsis">
  469. Production Streaming via SGLang
  470. </span>
  471. </a>
  472. </li>
  473. <li class="md-nav__item">
  474. <a href="#multilingual-support" class="md-nav__link">
  475. <span class="md-ellipsis">
  476. Multilingual Support
  477. </span>
  478. </a>
  479. </li>
  480. <li class="md-nav__item">
  481. <a href="#native-multi-speaker-generation" class="md-nav__link">
  482. <span class="md-ellipsis">
  483. Native Multi-Speaker Generation
  484. </span>
  485. </a>
  486. </li>
  487. <li class="md-nav__item">
  488. <a href="#multi-turn-generation" class="md-nav__link">
  489. <span class="md-ellipsis">
  490. Multi-Turn Generation
  491. </span>
  492. </a>
  493. </li>
  494. <li class="md-nav__item">
  495. <a href="#rapid-voice-cloning" class="md-nav__link">
  496. <span class="md-ellipsis">
  497. Rapid Voice Cloning
  498. </span>
  499. </a>
  500. </li>
  501. </ul>
  502. </nav>
  503. </li>
  504. <li class="md-nav__item">
  505. <a href="#credits" class="md-nav__link">
  506. <span class="md-ellipsis">
  507. Credits
  508. </span>
  509. </a>
  510. </li>
  511. <li class="md-nav__item">
  512. <a href="#tech-report" class="md-nav__link">
  513. <span class="md-ellipsis">
  514. Tech Report
  515. </span>
  516. </a>
  517. </li>
  518. </ul>
  519. </nav>
  520. </div>
  521. </div>
  522. </div>
  523. <div class="md-content" data-md-component="content">
  524. <article class="md-content__inner md-typeset">
  525. <a href="https://github.com/fishaudio/fish-speech/blob/main/docs/en/index.md" title="Edit this page" class="md-content__button md-icon" rel="edit">
  526. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M10 20H6V4h7v5h5v3.1l2-2V8l-6-6H6c-1.1 0-2 .9-2 2v16c0 1.1.9 2 2 2h4zm10.2-7c.1 0 .3.1.4.2l1.3 1.3c.2.2.2.6 0 .8l-1 1-2.1-2.1 1-1c.1-.1.2-.2.4-.2m0 3.9L14.1 23H12v-2.1l6.1-6.1z"/></svg>
  527. </a>
  528. <a href="https://github.com/fishaudio/fish-speech/raw/main/docs/en/index.md" title="View source of this page" class="md-content__button md-icon">
  529. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17 18c.56 0 1 .44 1 1s-.44 1-1 1-1-.44-1-1 .44-1 1-1m0-3c-2.73 0-5.06 1.66-6 4 .94 2.34 3.27 4 6 4s5.06-1.66 6-4c-.94-2.34-3.27-4-6-4m0 6.5a2.5 2.5 0 0 1-2.5-2.5 2.5 2.5 0 0 1 2.5-2.5 2.5 2.5 0 0 1 2.5 2.5 2.5 2.5 0 0 1-2.5 2.5M9.27 20H6V4h7v5h5v4.07c.7.08 1.36.25 2 .49V8l-6-6H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h4.5a8.2 8.2 0 0 1-1.23-2"/></svg>
  530. </a>
  531. <div align="center"><!-- Page title, in-content language switcher, and award badges (badges open in a new tab). -->
  532. <h1>Fish Speech</h1>
  533. <p><strong>English</strong> | <a href="../zh/" lang="zh">简体中文</a> | <a href="../pt/" lang="pt">Português (Brasil)</a> | <a href="../ja/" lang="ja">日本語</a> | <a href="../ko/" lang="ko">한국어</a> | <a href="../ar/" lang="ar">العربية</a> | <a href="../es/" lang="es">Español</a></p>
  534. <a href="https://www.producthunt.com/products/fish-speech?embed=true&amp;utm_source=badge-top-post-badge&amp;utm_medium=badge&amp;utm_source=badge-fish-audio-s1" target="_blank" rel="noopener"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&amp;theme=light&amp;period=daily&amp;t=1761164814710" alt="Fish Audio S1 - Expressive Voice Cloning and Text-to-Speech | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54"></a>
  535. <a href="https://trendshift.io/repositories/7014" target="_blank" rel="noopener">
  536. <img src="https://trendshift.io/api/badge/repositories/7014" alt="fishaudio/fish-speech | Trendshift" style="width: 250px; height: 55px;" width="250" height="55">
  537. </a>
  538. </div>
  539. <p><br></p>
  540. <div align="center"><!-- Hit-counter image served by count.getloli.com; alt text names it for screen readers. -->
  541. <img src="https://count.getloli.com/get/@fish-speech?theme=asoul" alt="Fish Speech visitor counter"><br>
  542. </div>
  543. <p><br></p>
  544. <div align="center"><!-- Community badges; each opens in a new tab, so rel="noopener" isolates the opener. -->
  545. <a target="_blank" rel="noopener" href="https://discord.gg/Es5qTB9BcN">
  546. <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&amp;label=Discord&amp;logo=discord&amp;logoColor=white&amp;style=flat-square">
  547. </a>
  548. <a target="_blank" rel="noopener" href="https://hub.docker.com/r/fishaudio/fish-speech">
  549. <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&amp;logo=docker">
  550. </a>
  551. <a target="_blank" rel="noopener" href="https://pd.qq.com/s/bwxia254o">
  552. <img alt="QQ Channel" src="https://img.shields.io/badge/QQ-blue?logo=tencentqq">
  553. </a>
  554. </div>
  555. <div align="center"><!-- Model, blog, and paper badges; each opens in a new tab, so rel="noopener" isolates the opener. -->
  556. <a target="_blank" rel="noopener" href="https://huggingface.co/fishaudio/s2">
  557. <img alt="HuggingFace Model" src="https://img.shields.io/badge/🤗%20-models-orange">
  558. </a>
  559. <a target="_blank" rel="noopener" href="https://fish.audio/blog/fish-audio-open-sources-s2/">
  560. <img alt="Fish Audio Blog" src="https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&amp;logo=readme&amp;logoColor=white">
  561. </a>
  562. <a target="_blank" rel="noopener" href="https://arxiv.org/abs/2603.08823">
  563. <img alt="Paper | Technical Report" src="https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square">
  564. </a>
  565. </div>
  566. <div class="admonition info">
  567. <p class="admonition-title">License Notice</p>
  568. <p>This codebase and its associated model weights are released under <strong>FISH AUDIO RESEARCH LICENSE</strong>. Please refer to <a href="https://github.com/fishaudio/fish-speech/blob/main/LICENSE">LICENSE</a> for more details. We will take action against any violation of the license.</p>
  569. </div>
  570. <div class="admonition warning">
  571. <p class="admonition-title">Legal Disclaimer</p>
  572. <p>We do not accept any responsibility for any illegal use of the codebase. Please refer to your local laws regarding the DMCA and other related legislation.</p>
  573. </div>
  574. <h2 id="quick-start">Quick Start</h2>
  575. <h3 id="for-human">For Human</h3>
  576. <p>Here are the official documents for Fish Audio S2. Follow the instructions to get started easily.</p>
  577. <ul>
  578. <li><a href="https://speech.fish.audio/install/">Installation</a></li>
  579. <li><a href="https://speech.fish.audio/inference/#command-line-inference">Command Line Inference</a></li>
  580. <li><a href="https://speech.fish.audio/inference/#webui-inference">WebUI Inference</a></li>
  581. <li><a href="https://speech.fish.audio/server/">Server Inference</a></li>
  582. <li><a href="https://speech.fish.audio/install/#docker-setup">Docker Setup</a></li>
  583. </ul>
  584. <blockquote>
  585. <p><strong>Important:</strong>
  586. <strong>For the SGLang server, please read the <a href="https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md">SGLang-Omni README</a>.</strong></p>
  587. </blockquote>
  588. <h3 id="for-llm-agent">For LLM Agent</h3>
  589. <div class="language-text highlight"><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a>Install and configure Fish-Audio S2 by following the instructions here: https://speech.fish.audio/install/
  590. </span></code></pre></div>
  591. <h2 id="fish-audio-s2">Fish Audio S2</h2>
  592. <p><strong>The best text-to-speech system among both open-source and closed-source models</strong></p>
  593. <p>Fish Audio S2 is the latest model developed by <a href="https://fish.audio/">Fish Audio</a>. Trained on over 10 million hours of audio across approximately 50 languages, S2 combines reinforcement learning alignment with a Dual-Autoregressive architecture to generate speech that sounds natural, realistic, and emotionally rich.</p>
  594. <p>S2 supports fine-grained inline control of prosody and emotion using natural-language tags like <code>[laugh]</code>, <code>[whispers]</code>, and <code>[super happy]</code>, as well as native multi-speaker and multi-turn generation.</p>
  595. <p>Visit the <a href="https://fish.audio/">Fish Audio website</a> for a live playground. Read the <a href="https://fish.audio/blog/fish-audio-open-sources-s2/">blog post</a> and <a href="https://arxiv.org/abs/2603.08823">technical report</a> for more details.</p>
  596. <h3 id="model-variants">Model Variants</h3>
  597. <table>
  598. <thead>
  599. <tr>
  600. <th>Model</th>
  601. <th>Size</th>
  602. <th>Availability</th>
  603. <th>Description</th>
  604. </tr>
  605. </thead>
  606. <tbody>
  607. <tr>
  608. <td>S2-Pro</td>
  609. <td>4B parameters</td>
  610. <td><a href="https://huggingface.co/fishaudio/s2-pro">HuggingFace</a></td>
  611. <td>Full-featured flagship model with maximum quality and stability</td>
  612. </tr>
  613. </tbody>
  614. </table>
  615. <p>More details of the model can be found in the <a href="https://arxiv.org/abs/2603.08823">technical report</a>.</p>
  616. <h2 id="benchmark-results">Benchmark Results</h2>
  617. <table>
  618. <thead>
  619. <tr>
  620. <th>Benchmark</th>
  621. <th>Fish Audio S2</th>
  622. </tr>
  623. </thead>
  624. <tbody>
  625. <tr>
  626. <td>Seed-TTS Eval — WER (Chinese)</td>
  627. <td><strong>0.54%</strong> (best overall)</td>
  628. </tr>
  629. <tr>
  630. <td>Seed-TTS Eval — WER (English)</td>
  631. <td><strong>0.99%</strong> (best overall)</td>
  632. </tr>
  633. <tr>
  634. <td>Audio Turing Test (with instruction)</td>
  635. <td><strong>0.515</strong> posterior mean</td>
  636. </tr>
  637. <tr>
  638. <td>EmergentTTS-Eval — Win Rate</td>
  639. <td><strong>81.88%</strong> (highest overall)</td>
  640. </tr>
  641. <tr>
  642. <td>Fish Instruction Benchmark — TAR</td>
  643. <td><strong>93.3%</strong></td>
  644. </tr>
  645. <tr>
  646. <td>Fish Instruction Benchmark — Quality</td>
  647. <td><strong>4.51 / 5.0</strong></td>
  648. </tr>
  649. <tr>
  650. <td>Multilingual (MiniMax Testset) — Best WER</td>
  651. <td><strong>11 of 24</strong> languages</td>
  652. </tr>
  653. <tr>
  654. <td>Multilingual (MiniMax Testset) — Best SIM</td>
  655. <td><strong>17 of 24</strong> languages</td>
  656. </tr>
  657. </tbody>
  658. </table>
  659. <p>On Seed-TTS Eval, S2 achieves the lowest WER among all evaluated models including closed-source systems: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). On the Audio Turing Test, 0.515 surpasses Seed-TTS (0.417) by 24% and MiniMax-Speech (0.387) by 33%. On EmergentTTS-Eval, S2 achieves particularly strong results in paralinguistics (91.61% win rate), questions (84.41%), and syntactic complexity (83.39%).</p>
  660. <h2 id="highlights">Highlights</h2>
  661. <p><img src="../assets/totalability.png" alt="Overview of Fish Audio S2 capabilities" width="200%"></p>
  662. <h3 id="fine-grained-inline-control-via-natural-language">Fine-Grained Inline Control via Natural Language</h3>
  663. <p>S2 enables localized control over speech generation by embedding natural-language instructions directly at specific word or phrase positions within the text. Rather than relying on a fixed set of predefined tags, S2 accepts free-form textual descriptions — such as <code>[whisper in small voice]</code>, <code>[professional broadcast tone]</code>, or <code>[pitch up]</code> — allowing open-ended expression control at the word level.</p>
  664. <h3 id="dual-autoregressive-architecture">Dual-Autoregressive Architecture</h3>
  665. <p>S2 builds on a decoder-only transformer combined with an RVQ-based audio codec (10 codebooks, ~21 Hz frame rate). The Dual-AR architecture splits generation into two stages:</p>
  666. <ul>
  667. <li><strong>Slow AR</strong> operates along the time axis and predicts the primary semantic codebook.</li>
  668. <li><strong>Fast AR</strong> generates the remaining 9 residual codebooks at each time step, reconstructing fine-grained acoustic detail.</li>
  669. </ul>
  670. <p>This asymmetric design — 4B parameters along the time axis, 400M parameters along the depth axis — keeps inference efficient while preserving audio fidelity.</p>
  671. <h3 id="reinforcement-learning-alignment">Reinforcement Learning Alignment</h3>
  672. <p>S2 uses Group Relative Policy Optimization (GRPO) for post-training alignment. The same models used to filter and annotate training data are directly reused as reward models during RL — eliminating distribution mismatch between pre-training data and post-training objectives. The reward signal combines semantic accuracy, instruction adherence, acoustic preference scoring, and timbre similarity.</p>
  673. <h3 id="production-streaming-via-sglang">Production Streaming via SGLang</h3>
  674. <p>Because the Dual-AR architecture is structurally isomorphic to standard autoregressive LLMs, S2 directly inherits all LLM-native serving optimizations from SGLang — including continuous batching, paged KV cache, CUDA graph replay, and RadixAttention-based prefix caching.</p>
  675. <p>On a single NVIDIA H200 GPU:</p>
  676. <ul>
  677. <li><strong>Real-Time Factor (RTF):</strong> 0.195</li>
  678. <li><strong>Time-to-first-audio:</strong> ~100 ms</li>
  679. <li><strong>Throughput:</strong> 3,000+ acoustic tokens/s while maintaining RTF below 0.5</li>
  680. </ul>
  681. <h3 id="multilingual-support">Multilingual Support</h3>
  682. <p>S2 supports high-quality multilingual text-to-speech without requiring phonemes or language-specific preprocessing. Supported languages include:</p>
  683. <p><strong>English, Chinese, Japanese, Korean, Arabic, German, French...</strong></p>
  684. <p><strong>AND MORE!</strong></p>
  685. <p>The list is constantly expanding; check <a href="https://fish.audio/">Fish Audio</a> for the latest releases.</p>
  686. <h3 id="native-multi-speaker-generation">Native Multi-Speaker Generation</h3>
  687. <p><img src="../assets/chattemplate.png" alt="Fish Audio S2 multi-speaker chat template" width="200%"></p>
  688. <p>Fish Audio S2 allows users to upload reference audio containing multiple speakers; the model handles each speaker's features via the <code>&lt;|speaker:i|&gt;</code> token. You can then control the model's output with the speaker ID token, allowing a single generation to include multiple speakers. You no longer need to upload reference audio separately for each speaker.</p>
  689. <h3 id="multi-turn-generation">Multi-Turn Generation</h3>
  690. <p>Thanks to the expanded model context, the model can now use previous information to improve the expressiveness of subsequently generated content, making the output sound more natural.</p>
  691. <h3 id="rapid-voice-cloning">Rapid Voice Cloning</h3>
  692. <p>Fish Audio S2 supports accurate voice cloning using a short reference sample (typically 10–30 seconds). The model captures timbre, speaking style, and emotional tendencies, producing realistic and consistent cloned voices without additional fine-tuning.
  693. Please refer to <a href="https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md">SGLang-Omni README</a> to use the SGLang server.</p>
  694. <hr />
  695. <h2 id="credits">Credits</h2>
  696. <ul>
  697. <li><a href="https://github.com/daniilrobnikov/vits2">VITS2 (daniilrobnikov)</a></li>
  698. <li><a href="https://github.com/fishaudio/Bert-VITS2">Bert-VITS2</a></li>
  699. <li><a href="https://github.com/innnky/gpt-vits">GPT VITS</a></li>
  700. <li><a href="https://github.com/b04901014/MQTTS">MQTTS</a></li>
  701. <li><a href="https://github.com/pytorch-labs/gpt-fast">GPT Fast</a></li>
  702. <li><a href="https://github.com/RVC-Boss/GPT-SoVITS">GPT-SoVITS</a></li>
  703. <li><a href="https://github.com/QwenLM/Qwen3">Qwen3</a></li>
  704. </ul>
  705. <h2 id="tech-report">Tech Report</h2>
  706. <div class="language-bibtex highlight"><pre><span></span><code><span id="__span-1-1"><a id="__codelineno-1-1" name="__codelineno-1-1" href="#__codelineno-1-1"></a><span class="nc">@misc</span><span class="p">{</span><span class="nl">fish-speech-v1.4</span><span class="p">,</span>
  707. </span><span id="__span-1-2"><a id="__codelineno-1-2" name="__codelineno-1-2" href="#__codelineno-1-2"></a><span class="w"> </span><span class="na">title</span><span class="p">=</span><span class="s">{Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis}</span><span class="p">,</span>
  708. </span><span id="__span-1-3"><a id="__codelineno-1-3" name="__codelineno-1-3" href="#__codelineno-1-3"></a><span class="w"> </span><span class="na">author</span><span class="p">=</span><span class="s">{Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing}</span><span class="p">,</span>
  709. </span><span id="__span-1-4"><a id="__codelineno-1-4" name="__codelineno-1-4" href="#__codelineno-1-4"></a><span class="w"> </span><span class="na">year</span><span class="p">=</span><span class="s">{2024}</span><span class="p">,</span>
  710. </span><span id="__span-1-5"><a id="__codelineno-1-5" name="__codelineno-1-5" href="#__codelineno-1-5"></a><span class="w"> </span><span class="na">eprint</span><span class="p">=</span><span class="s">{2411.01156}</span><span class="p">,</span>
  711. </span><span id="__span-1-6"><a id="__codelineno-1-6" name="__codelineno-1-6" href="#__codelineno-1-6"></a><span class="w"> </span><span class="na">archivePrefix</span><span class="p">=</span><span class="s">{arXiv}</span><span class="p">,</span>
  712. </span><span id="__span-1-7"><a id="__codelineno-1-7" name="__codelineno-1-7" href="#__codelineno-1-7"></a><span class="w"> </span><span class="na">primaryClass</span><span class="p">=</span><span class="s">{cs.SD}</span><span class="p">,</span>
  713. </span><span id="__span-1-8"><a id="__codelineno-1-8" name="__codelineno-1-8" href="#__codelineno-1-8"></a><span class="w"> </span><span class="na">url</span><span class="p">=</span><span class="s">{https://arxiv.org/abs/2411.01156}</span><span class="p">,</span>
  714. </span><span id="__span-1-9"><a id="__codelineno-1-9" name="__codelineno-1-9" href="#__codelineno-1-9"></a><span class="p">}</span>
  715. </span><span id="__span-1-10"><a id="__codelineno-1-10" name="__codelineno-1-10" href="#__codelineno-1-10"></a>
  716. </span><span id="__span-1-11"><a id="__codelineno-1-11" name="__codelineno-1-11" href="#__codelineno-1-11"></a><span class="nc">@misc</span><span class="p">{</span><span class="nl">liao2026fishaudios2technical</span><span class="p">,</span>
  717. </span><span id="__span-1-12"><a id="__codelineno-1-12" name="__codelineno-1-12" href="#__codelineno-1-12"></a><span class="w"> </span><span class="na">title</span><span class="p">=</span><span class="s">{Fish Audio S2 Technical Report}</span><span class="p">,</span><span class="w"> </span>
  718. </span><span id="__span-1-13"><a id="__codelineno-1-13" name="__codelineno-1-13" href="#__codelineno-1-13"></a><span class="w"> </span><span class="na">author</span><span class="p">=</span><span class="s">{Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han}</span><span class="p">,</span>
  719. </span><span id="__span-1-14"><a id="__codelineno-1-14" name="__codelineno-1-14" href="#__codelineno-1-14"></a><span class="w"> </span><span class="na">year</span><span class="p">=</span><span class="s">{2026}</span><span class="p">,</span>
  720. </span><span id="__span-1-15"><a id="__codelineno-1-15" name="__codelineno-1-15" href="#__codelineno-1-15"></a><span class="w"> </span><span class="na">eprint</span><span class="p">=</span><span class="s">{2603.08823}</span><span class="p">,</span>
  721. </span><span id="__span-1-16"><a id="__codelineno-1-16" name="__codelineno-1-16" href="#__codelineno-1-16"></a><span class="w"> </span><span class="na">archivePrefix</span><span class="p">=</span><span class="s">{arXiv}</span><span class="p">,</span>
  722. </span><span id="__span-1-17"><a id="__codelineno-1-17" name="__codelineno-1-17" href="#__codelineno-1-17"></a><span class="w"> </span><span class="na">primaryClass</span><span class="p">=</span><span class="s">{cs.SD}</span><span class="p">,</span>
  723. </span><span id="__span-1-18"><a id="__codelineno-1-18" name="__codelineno-1-18" href="#__codelineno-1-18"></a><span class="w"> </span><span class="na">url</span><span class="p">=</span><span class="s">{https://arxiv.org/abs/2603.08823}</span><span class="p">,</span><span class="w"> </span>
  724. </span><span id="__span-1-19"><a id="__codelineno-1-19" name="__codelineno-1-19" href="#__codelineno-1-19"></a><span class="p">}</span>
  725. </span></code></pre></div>
  726. </article>
  727. </div>
  728. <script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
  729. </div>
  730. </main>
  731. <footer class="md-footer">
  732. <nav class="md-footer__inner md-grid" aria-label="Footer" >
  733. <a href="install/" class="md-footer__link md-footer__link--next" aria-label="Next: Installation">
  734. <div class="md-footer__title">
  735. <span class="md-footer__direction">
  736. Next
  737. </span>
  738. <div class="md-ellipsis">
  739. Installation
  740. </div>
  741. </div>
  742. <div class="md-footer__button md-icon">
  743. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11z"/></svg>
  744. </div>
  745. </a>
  746. </nav>
  747. <div class="md-footer-meta md-typeset">
  748. <div class="md-footer-meta__inner md-grid">
  749. <div class="md-copyright">
  750. <div class="md-copyright__highlight">
  751. Copyright &copy; 2023-2025 by Fish Audio
  752. </div>
  753. Made with
  754. <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
  755. Material for MkDocs
  756. </a>
  757. </div>
  758. <div class="md-social">
  759. <a href="https://discord.gg/Es5qTB9BcN" target="_blank" rel="noopener" title="discord.gg" class="md-social__link">
  760. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 576 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M492.5 69.8c-.2-.3-.4-.6-.8-.7-38.1-17.5-78.4-30-119.7-37.1-.4-.1-.8 0-1.1.1s-.6.4-.8.8c-5.5 9.9-10.5 20.2-14.9 30.6-44.6-6.8-89.9-6.8-134.4 0-4.5-10.5-9.5-20.7-15.1-30.6-.2-.3-.5-.6-.8-.8s-.7-.2-1.1-.2C162.5 39 122.2 51.5 84.1 69c-.3.1-.6.4-.8.7C7.1 183.5-13.8 294.6-3.6 404.2c0 .3.1.5.2.8s.3.4.5.6c44.4 32.9 94 58 146.8 74.2.4.1.8.1 1.1 0s.7-.4.9-.7c11.3-15.4 21.4-31.8 30-48.8.1-.2.2-.5.2-.8s0-.5-.1-.8-.2-.5-.4-.6-.4-.3-.7-.4c-15.8-6.1-31.2-13.4-45.9-21.9-.3-.2-.5-.4-.7-.6s-.3-.6-.3-.9 0-.6.2-.9.3-.5.6-.7c3.1-2.3 6.2-4.7 9.1-7.1.3-.2.6-.4.9-.4s.7 0 1 .1c96.2 43.9 200.4 43.9 295.5 0 .3-.1.7-.2 1-.2s.7.2.9.4c2.9 2.4 6 4.9 9.1 7.2.2.2.4.4.6.7s.2.6.2.9-.1.6-.3.9-.4.5-.6.6c-14.7 8.6-30 15.9-45.9 21.8-.2.1-.5.2-.7.4s-.3.4-.4.7-.1.5-.1.8.1.5.2.8c8.8 17 18.8 33.3 30 48.8.2.3.6.6.9.7s.8.1 1.1 0c52.9-16.2 102.6-41.3 147.1-74.2.2-.2.4-.4.5-.6s.2-.5.2-.8c12.3-126.8-20.5-236.9-86.9-334.5zm-302 267.7c-29 0-52.8-26.6-52.8-59.2s23.4-59.2 52.8-59.2c29.7 0 53.3 26.8 52.8 59.2 0 32.7-23.4 59.2-52.8 59.2m195.4 0c-29 0-52.8-26.6-52.8-59.2s23.4-59.2 52.8-59.2c29.7 0 53.3 26.8 52.8 59.2 0 32.7-23.2 59.2-52.8 59.2"/></svg>
  761. </a>
  762. <a href="https://hub.docker.com/r/fishaudio/fish-speech" target="_blank" rel="noopener" title="hub.docker.com" class="md-social__link">
  763. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 640 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M349.9 236.3h-66.1v-59.4h66.1zm0-204.3h-66.1v60.7h66.1zm78.2 144.8H362v59.4h66.1zm-156.3-72.1h-66.1v60.1h66.1zm78.1 0h-66.1v60.1h66.1zm276.8 100c-14.4-9.7-47.6-13.2-73.1-8.4-3.3-24-16.7-44.9-41.1-63.7l-14-9.3-9.3 14c-18.4 27.8-23.4 73.6-3.7 103.8-8.7 4.7-25.8 11.1-48.4 10.7H2.4c-8.7 50.8 5.8 116.8 44 162.1 37.1 43.9 92.7 66.2 165.4 66.2 157.4 0 273.9-72.5 328.4-204.2 21.4.4 67.6.1 91.3-45.2 1.5-2.5 6.6-13.2 8.5-17.1zm-511.1-27.9h-66v59.4h66.1v-59.4zm78.1 0h-66.1v59.4h66.1zm78.1 0h-66.1v59.4h66.1zm-78.1-72.1h-66.1v60.1h66.1z"/></svg>
  764. </a>
  765. <a href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093" target="_blank" rel="noopener" title="qm.qq.com" class="md-social__link">
  766. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M434.1 420.4c-11.5 1.4-44.9-52.7-44.9-52.7 0 31.3-16.1 72.2-51 101.8 16.8 5.2 54.8 19.2 45.8 34.4-7.3 12.3-125.5 7.9-159.6 4-34.1 3.8-152.3 8.3-159.6-4-9-15.2 28.9-29.2 45.8-34.4-34.9-29.5-51.1-70.4-51.1-101.8 0 0-33.3 54.1-44.9 52.7-5.4-.6-12.4-29.6 9.3-99.7 10.3-33 22-60.5 40.1-105.8C60.9 98 109.2-.1 224.3-.1 338-.1 387.5 96 384.6 214.9c18.1 45.2 29.9 72.9 40.1 105.8 21.8 70.1 14.7 99.1 9.3 99.7z"/></svg>
  767. </a>
  768. </div>
  769. </div>
  770. </div>
  771. </footer>
  772. </div>
  773. <div class="md-dialog" data-md-component="dialog">
  774. <div class="md-dialog__inner md-typeset"></div>
  775. </div>
  776. <script id="__config" type="application/json">{"annotate": null, "base": ".", "features": ["content.action.edit", "content.action.view", "navigation.tracking", "navigation.footer", "search", "search.suggest", "search.highlight", "search.share", "content.code.copy"], "search": "assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
  777. <script src="assets/javascripts/bundle.79ae519e.min.js"></script>
  778. </body>
  779. </html>