Просмотр исходного кода

API extension for reference voices. (#1099)

* Update inference documentation to clarify steps for obtaining VQ tokens and organizing reference files in the WebUI.

* Implement reference management features in the inference engine
- Added methods to list valid reference IDs and add new references with audio and text.
- Enhanced error handling for adding references, including validation for ID format and audio file existence.
- Updated API endpoints to support adding and listing references, returning structured responses.
- Added possibility for API to return JSON

* Enhance API to support multipart/form-data.
Fixed add_reference endpoint.

* Refactor API response handling and enhance error management across endpoints.

* Add delete reference functionality to API and reference_loader.

* Pre-Commit fixes.

* Added reference update endpoint.

* Revert unintendet uv.lock edits
Valentin Schröter 6 месяцев назад
Родитель
Сommit
b25daedd60

+ 5 - 2
docs/en/inference.md

@@ -14,11 +14,11 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope
 
 
 ## Command Line Inference
 ## Command Line Inference
 
 
+### 1. Get VQ tokens from reference audio
+
 !!! note
 !!! note
     If you plan to let the model randomly choose a voice timbre, you can skip this step.
     If you plan to let the model randomly choose a voice timbre, you can skip this step.
 
 
-### 1. Get VQ tokens from reference audio
-
 ```bash
 ```bash
 python fish_speech/models/dac/inference.py \
 python fish_speech/models/dac/inference.py \
     -i "ref_audio_name.wav" \
     -i "ref_audio_name.wav" \
@@ -36,6 +36,8 @@ python fish_speech/models/text2semantic/inference.py \
     --prompt-tokens "fake.npy" \
     --prompt-tokens "fake.npy" \
     --compile
     --compile
 ```
 ```
+with `--prompt-tokens "fake.npy"` and `--prompt-text "Your reference text"` from step 1.
+If you want to let the model randomly choose a voice timbre, skip the two parameters.
 
 
 This command will create a `codes_N` file in the working directory, where N is an integer starting from 0.
 This command will create a `codes_N` file in the working directory, where N is an integer starting from 0.
 
 
@@ -96,6 +98,7 @@ python -m tools.run_webui
 
 
 !!! note
 !!! note
     You can save the label file and reference audio file in advance to the `references` folder in the main directory (which you need to create yourself), so that you can directly call them in the WebUI.
     You can save the label file and reference audio file in advance to the `references` folder in the main directory (which you need to create yourself), so that you can directly call them in the WebUI.
+    Inside the `references` folder, put subdirectories named `<voice_id>`, and put the label file (`sample.lab`, containing the reference text) and reference audio file (`sample.wav`) in the subdirectory.
 
 
 !!! note
 !!! note
     You can use Gradio environment variables, such as `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, `GRADIO_SERVER_NAME` to configure WebUI.
     You can use Gradio environment variables, such as `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, `GRADIO_SERVER_NAME` to configure WebUI.

+ 143 - 4
fish_speech/inference_engine/reference_loader.py

@@ -18,7 +18,6 @@ from fish_speech.utils.schema import ServeReferenceAudio
 
 
 
 
 class ReferenceLoader:
 class ReferenceLoader:
-
     def __init__(self) -> None:
     def __init__(self) -> None:
         """
         """
         Component of the TTSInferenceEngine class.
         Component of the TTSInferenceEngine class.
@@ -43,7 +42,6 @@ class ReferenceLoader:
         id: str,
         id: str,
         use_cache: Literal["on", "off"],
         use_cache: Literal["on", "off"],
     ) -> Tuple:
     ) -> Tuple:
-
         # Load the references audio and text by id
         # Load the references audio and text by id
         ref_folder = Path("references") / id
         ref_folder = Path("references") / id
         ref_folder.mkdir(parents=True, exist_ok=True)
         ref_folder.mkdir(parents=True, exist_ok=True)
@@ -79,7 +77,6 @@ class ReferenceLoader:
         references: list[ServeReferenceAudio],
         references: list[ServeReferenceAudio],
         use_cache: Literal["on", "off"],
         use_cache: Literal["on", "off"],
     ) -> Tuple:
     ) -> Tuple:
-
         # Load the references audio and text by hash
         # Load the references audio and text by hash
         audio_hashes = [sha256(ref.audio).hexdigest() for ref in references]
         audio_hashes = [sha256(ref.audio).hexdigest() for ref in references]
 
 
@@ -109,7 +106,7 @@ class ReferenceLoader:
 
 
         return prompt_tokens, prompt_texts
         return prompt_tokens, prompt_texts
 
 
-    def load_audio(self, reference_audio, sr):
+    def load_audio(self, reference_audio: bytes | str, sr: int):
         """
         """
         Load the audio data from a file or bytes.
         Load the audio data from a file or bytes.
         """
         """
@@ -130,3 +127,145 @@ class ReferenceLoader:
 
 
         audio = waveform.squeeze().numpy()
         audio = waveform.squeeze().numpy()
         return audio
         return audio
+
+    def list_reference_ids(self) -> list[str]:
+        """
+        List all valid reference IDs (subdirectory names containing valid audio and .lab files).
+
+        Returns:
+            list[str]: List of valid reference IDs
+        """
+        ref_base_path = Path("references")
+        if not ref_base_path.exists():
+            return []
+
+        valid_ids = []
+        for ref_dir in ref_base_path.iterdir():
+            if not ref_dir.is_dir():
+                continue
+
+            # Check if directory contains at least one audio file and corresponding .lab file
+            audio_files = list_files(
+                ref_dir, AUDIO_EXTENSIONS, recursive=False, sort=False
+            )
+            if not audio_files:
+                continue
+
+            # Check if corresponding .lab file exists for at least one audio file
+            has_valid_pair = False
+            for audio_file in audio_files:
+                lab_file = audio_file.with_suffix(".lab")
+                if lab_file.exists():
+                    has_valid_pair = True
+                    break
+
+            if has_valid_pair:
+                valid_ids.append(ref_dir.name)
+
+        return sorted(valid_ids)
+
+    def add_reference(self, id: str, wav_file_path: str, reference_text: str) -> None:
+        """
+        Add a new reference voice by creating a new directory and copying files.
+
+        Args:
+            id: Reference ID (directory name)
+            wav_file_path: Path to the audio file to copy
+            reference_text: Text content for the .lab file
+
+        Raises:
+            FileExistsError: If the reference ID already exists
+            FileNotFoundError: If the audio file doesn't exist
+            OSError: If file operations fail
+        """
+        # Validate ID format
+        import re
+
+        if not re.match(r"^[a-zA-Z0-9\-_ ]+$", id):
+            raise ValueError(
+                "Reference ID contains invalid characters. Only alphanumeric, hyphens, underscores, and spaces are allowed."
+            )
+
+        if len(id) > 255:
+            raise ValueError(
+                "Reference ID is too long. Maximum length is 255 characters."
+            )
+
+        # Check if reference already exists
+        ref_dir = Path("references") / id
+        if ref_dir.exists():
+            raise FileExistsError(f"Reference ID '{id}' already exists")
+
+        # Check if audio file exists
+        audio_path = Path(wav_file_path)
+        if not audio_path.exists():
+            raise FileNotFoundError(f"Audio file not found: {wav_file_path}")
+
+        # Validate audio file extension
+        if audio_path.suffix.lower() not in AUDIO_EXTENSIONS:
+            raise ValueError(
+                f"Unsupported audio format: {audio_path.suffix}. Supported formats: {', '.join(AUDIO_EXTENSIONS)}"
+            )
+
+        try:
+            # Create reference directory
+            ref_dir.mkdir(parents=True, exist_ok=False)
+
+            # Determine the target audio filename with original extension
+            target_audio_path = ref_dir / f"sample{audio_path.suffix}"
+
+            # Copy audio file
+            import shutil
+
+            shutil.copy2(audio_path, target_audio_path)
+
+            # Create .lab file
+            lab_path = ref_dir / "sample.lab"
+            with open(lab_path, "w", encoding="utf-8") as f:
+                f.write(reference_text)
+
+            # Clear cache for this ID if it exists
+            if id in self.ref_by_id:
+                del self.ref_by_id[id]
+
+            logger.info(f"Successfully added reference voice with ID: {id}")
+
+        except Exception as e:
+            # Clean up on failure
+            if ref_dir.exists():
+                import shutil
+
+                shutil.rmtree(ref_dir)
+            raise e
+
+    def delete_reference(self, id: str) -> None:
+        """
+        Delete a reference voice by removing its directory and files.
+
+        Args:
+            id: Reference ID (directory name) to delete
+
+        Raises:
+            FileNotFoundError: If the reference ID doesn't exist
+            OSError: If file operations fail
+        """
+        # Check if reference exists
+        ref_dir = Path("references") / id
+        if not ref_dir.exists():
+            raise FileNotFoundError(f"Reference ID '{id}' does not exist")
+
+        try:
+            # Remove the entire reference directory
+            import shutil
+
+            shutil.rmtree(ref_dir)
+
+            # Clear cache for this ID if it exists
+            if id in self.ref_by_id:
+                del self.ref_by_id[id]
+
+            logger.info(f"Successfully deleted reference voice with ID: {id}")
+
+        except Exception as e:
+            logger.error(f"Failed to delete reference '{id}': {e}")
+            raise OSError(f"Failed to delete reference '{id}': {e}")

+ 32 - 1
fish_speech/utils/schema.py

@@ -69,7 +69,7 @@ class ServeReferenceAudio(BaseModel):
         ):  # Check if audio is a string (Base64)
         ):  # Check if audio is a string (Base64)
             try:
             try:
                 values["audio"] = base64.b64decode(audio)
                 values["audio"] = base64.b64decode(audio)
-            except Exception as e:
+            except Exception:
                 # If the audio is not a valid base64 string, we will just ignore it and let the server handle it
                 # If the audio is not a valid base64 string, we will just ignore it and let the server handle it
                 pass
                 pass
         return values
         return values
@@ -103,3 +103,34 @@ class ServeTTSRequest(BaseModel):
     class Config:
     class Config:
         # Allow arbitrary types for pytorch related types
         # Allow arbitrary types for pytorch related types
         arbitrary_types_allowed = True
         arbitrary_types_allowed = True
+
+
+class AddReferenceRequest(BaseModel):
+    id: str = Field(..., min_length=1, max_length=255, pattern=r"^[a-zA-Z0-9\-_ ]+$")
+    audio: bytes
+    text: str = Field(..., min_length=1)
+
+
+class AddReferenceResponse(BaseModel):
+    success: bool
+    message: str
+    reference_id: str
+
+
+class ListReferencesResponse(BaseModel):
+    success: bool
+    reference_ids: list[str]
+    message: str = "Success"
+
+
+class DeleteReferenceResponse(BaseModel):
+    success: bool
+    message: str
+    reference_id: str
+
+
+class UpdateReferenceResponse(BaseModel):
+    success: bool
+    message: str
+    old_reference_id: str
+    new_reference_id: str

+ 9 - 5
tools/api_client.py

@@ -1,5 +1,6 @@
 import argparse
 import argparse
 import base64
 import base64
+import time
 import wave
 import wave
 
 
 import ormsgpack
 import ormsgpack
@@ -13,7 +14,6 @@ from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest
 
 
 
 
 def parse_args():
 def parse_args():
-
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         description="Send a WAV file and text to a server and receive synthesized audio.",
         description="Send a WAV file and text to a server and receive synthesized audio.",
         formatter_class=argparse.RawTextHelpFormatter,
         formatter_class=argparse.RawTextHelpFormatter,
@@ -97,8 +97,9 @@ def parse_args():
         "--temperature", type=float, default=0.8, help="Temperature for sampling"
         "--temperature", type=float, default=0.8, help="Temperature for sampling"
     )
     )
 
 
+    # parser.add_argument("--streaming", type=bool, default=False, help="Enable streaming response")
     parser.add_argument(
     parser.add_argument(
-        "--streaming", type=bool, default=False, help="Enable streaming response"
+        "--streaming", action="store_true", help="Enable streaming response"
     )
     )
     parser.add_argument(
     parser.add_argument(
         "--channels", type=int, default=1, help="Number of audio channels"
         "--channels", type=int, default=1, help="Number of audio channels"
@@ -115,8 +116,7 @@ def parse_args():
         "--seed",
         "--seed",
         type=int,
         type=int,
         default=None,
         default=None,
-        help="`None` means randomized inference, otherwise deterministic.\n"
-        "It can't be used for fixing a timbre.",
+        help="`None` means randomized inference, otherwise deterministic.\nIt can't be used for fixing a timbre.",
     )
     )
     parser.add_argument(
     parser.add_argument(
         "--api_key",
         "--api_key",
@@ -129,7 +129,6 @@ def parse_args():
 
 
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-
     args = parse_args()
     args = parse_args()
 
 
     idstr: str | None = args.reference_id
     idstr: str | None = args.reference_id
@@ -172,8 +171,11 @@ if __name__ == "__main__":
 
 
     pydantic_data = ServeTTSRequest(**data)
     pydantic_data = ServeTTSRequest(**data)
 
 
+    print("Sending request")
+    start_time = time.time()
     response = requests.post(
     response = requests.post(
         args.url,
         args.url,
+        params={"format": "msgpack"},
         data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
         data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
         stream=args.streaming,
         stream=args.streaming,
         headers={
         headers={
@@ -181,6 +183,8 @@ if __name__ == "__main__":
             "content-type": "application/msgpack",
             "content-type": "application/msgpack",
         },
         },
     )
     )
+    end_time = time.time()
+    print(f"Request took {end_time - start_time} seconds")
 
 
     if response.status_code == 200:
     if response.status_code == 200:
         if args.streaming:
         if args.streaming:

+ 77 - 3
tools/server/api_utils.py

@@ -4,7 +4,14 @@ from typing import Annotated, Any
 
 
 import ormsgpack
 import ormsgpack
 from baize.datastructures import ContentType
 from baize.datastructures import ContentType
-from kui.asgi import HTTPException, HttpRequest
+from kui.asgi import (
+    HTTPException,
+    HttpRequest,
+    JSONResponse,
+    request,
+)
+from loguru import logger
+from pydantic import BaseModel
 
 
 from fish_speech.inference_engine import TTSInferenceEngine
 from fish_speech.inference_engine import TTSInferenceEngine
 from fish_speech.utils.schema import ServeTTSRequest
 from fish_speech.utils.schema import ServeTTSRequest
@@ -40,7 +47,10 @@ class MsgPackRequest(HttpRequest):
     async def data(
     async def data(
         self,
         self,
     ) -> Annotated[
     ) -> Annotated[
-        Any, ContentType("application/msgpack"), ContentType("application/json")
+        Any,
+        ContentType("application/msgpack"),
+        ContentType("application/json"),
+        ContentType("multipart/form-data"),
     ]:
     ]:
         if self.content_type == "application/msgpack":
         if self.content_type == "application/msgpack":
             return ormsgpack.unpackb(await self.body)
             return ormsgpack.unpackb(await self.body)
@@ -48,14 +58,20 @@ class MsgPackRequest(HttpRequest):
         elif self.content_type == "application/json":
         elif self.content_type == "application/json":
             return await self.json
             return await self.json
 
 
+        elif self.content_type == "multipart/form-data":
+            return await self.form
+
         raise HTTPException(
         raise HTTPException(
             HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
             HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
-            headers={"Accept": "application/msgpack, application/json"},
+            headers={
+                "Accept": "application/msgpack, application/json, multipart/form-data"
+            },
         )
         )
 
 
 
 
 async def inference_async(req: ServeTTSRequest, engine: TTSInferenceEngine):
 async def inference_async(req: ServeTTSRequest, engine: TTSInferenceEngine):
     for chunk in inference(req, engine):
     for chunk in inference(req, engine):
+        print("Got chunk")
         if isinstance(chunk, bytes):
         if isinstance(chunk, bytes):
             yield chunk
             yield chunk
 
 
@@ -73,3 +89,61 @@ def get_content_type(audio_format):
         return "audio/mpeg"
         return "audio/mpeg"
     else:
     else:
         return "application/octet-stream"
         return "application/octet-stream"
+
+
+def wants_json(req):
+    """Helper method to determine if the client wants a JSON response
+
+    Parameters
+    ----------
+    req : Request
+        The request object
+
+    Returns
+    -------
+    bool
+        True if the client wants a JSON response, False otherwise
+    """
+    q = req.query_params.get("format", "").strip().lower()
+    if q in {"json", "application/json", "msgpack", "application/msgpack"}:
+        return q == "json"
+    accept = req.headers.get("Accept", "").strip().lower()
+    return "application/json" in accept and "application/msgpack" not in accept
+
+
+def format_response(response: BaseModel, status_code=200):
+    """
+    Helper function to format responses consistently based on client preference.
+
+    Parameters
+    ----------
+    response : BaseModel
+        The response object to format
+    status_code : int
+        HTTP status code (default: 200)
+
+    Returns
+    -------
+    Response
+        Formatted response in the client's preferred format
+    """
+    try:
+        if wants_json(request):
+            return JSONResponse(
+                response.model_dump(mode="json"), status_code=status_code
+            )
+
+        return (
+            ormsgpack.packb(
+                response,
+                option=ormsgpack.OPT_SERIALIZE_PYDANTIC,
+            ),
+            status_code,
+            {"Content-Type": "application/msgpack"},
+        )
+    except Exception as e:
+        logger.error(f"Error formatting response: {e}", exc_info=True)
+        # Fallback to JSON response if formatting fails
+        return JSONResponse(
+            {"error": "Response formatting failed", "details": str(e)}, status_code=500
+        )

+ 388 - 68
tools/server/views.py

@@ -1,7 +1,11 @@
 import io
 import io
 import os
 import os
+import re
+import shutil
+import tempfile
 import time
 import time
 from http import HTTPStatus
 from http import HTTPStatus
+from pathlib import Path
 
 
 import numpy as np
 import numpy as np
 import ormsgpack
 import ormsgpack
@@ -14,20 +18,27 @@ from kui.asgi import (
     JSONResponse,
     JSONResponse,
     Routes,
     Routes,
     StreamResponse,
     StreamResponse,
+    UploadFile,
     request,
     request,
 )
 )
 from loguru import logger
 from loguru import logger
 from typing_extensions import Annotated
 from typing_extensions import Annotated
 
 
 from fish_speech.utils.schema import (
 from fish_speech.utils.schema import (
+    AddReferenceRequest,
+    AddReferenceResponse,
+    DeleteReferenceResponse,
+    ListReferencesResponse,
     ServeTTSRequest,
     ServeTTSRequest,
     ServeVQGANDecodeRequest,
     ServeVQGANDecodeRequest,
     ServeVQGANDecodeResponse,
     ServeVQGANDecodeResponse,
     ServeVQGANEncodeRequest,
     ServeVQGANEncodeRequest,
     ServeVQGANEncodeResponse,
     ServeVQGANEncodeResponse,
+    UpdateReferenceResponse,
 )
 )
 from tools.server.api_utils import (
 from tools.server.api_utils import (
     buffer_to_async_generator,
     buffer_to_async_generator,
+    format_response,
     get_content_type,
     get_content_type,
     inference_async,
     inference_async,
 )
 )
@@ -56,87 +67,396 @@ class Health(HttpView):
 
 
 @routes.http.post("/v1/vqgan/encode")
 @routes.http.post("/v1/vqgan/encode")
 async def vqgan_encode(req: Annotated[ServeVQGANEncodeRequest, Body(exclusive=True)]):
 async def vqgan_encode(req: Annotated[ServeVQGANEncodeRequest, Body(exclusive=True)]):
-    # Get the model from the app
-    model_manager: ModelManager = request.app.state.model_manager
-    decoder_model = model_manager.decoder_model
+    """
+    Encode audio using VQGAN model.
+    """
+    try:
+        # Get the model from the app
+        model_manager: ModelManager = request.app.state.model_manager
+        decoder_model = model_manager.decoder_model
 
 
-    # Encode the audio
-    start_time = time.time()
-    tokens = cached_vqgan_batch_encode(decoder_model, req.audios)
-    logger.info(f"[EXEC] VQGAN encode time: {(time.time() - start_time) * 1000:.2f}ms")
+        # Encode the audio
+        start_time = time.time()
+        tokens = cached_vqgan_batch_encode(decoder_model, req.audios)
+        logger.info(
+            f"[EXEC] VQGAN encode time: {(time.time() - start_time) * 1000:.2f}ms"
+        )
 
 
-    # Return the response
-    return ormsgpack.packb(
-        ServeVQGANEncodeResponse(tokens=[i.tolist() for i in tokens]),
-        option=ormsgpack.OPT_SERIALIZE_PYDANTIC,
-    )
+        # Return the response
+        return ormsgpack.packb(
+            ServeVQGANEncodeResponse(tokens=[i.tolist() for i in tokens]),
+            option=ormsgpack.OPT_SERIALIZE_PYDANTIC,
+        )
+    except Exception as e:
+        logger.error(f"Error in VQGAN encode: {e}", exc_info=True)
+        raise HTTPException(
+            HTTPStatus.INTERNAL_SERVER_ERROR, content="Failed to encode audio"
+        )
 
 
 
 
 @routes.http.post("/v1/vqgan/decode")
 @routes.http.post("/v1/vqgan/decode")
 async def vqgan_decode(req: Annotated[ServeVQGANDecodeRequest, Body(exclusive=True)]):
 async def vqgan_decode(req: Annotated[ServeVQGANDecodeRequest, Body(exclusive=True)]):
-    # Get the model from the app
-    model_manager: ModelManager = request.app.state.model_manager
-    decoder_model = model_manager.decoder_model
+    """
+    Decode tokens to audio using VQGAN model.
+    """
+    try:
+        # Get the model from the app
+        model_manager: ModelManager = request.app.state.model_manager
+        decoder_model = model_manager.decoder_model
 
 
-    # Decode the audio
-    tokens = [torch.tensor(token, dtype=torch.int) for token in req.tokens]
-    start_time = time.time()
-    audios = batch_vqgan_decode(decoder_model, tokens)
-    logger.info(f"[EXEC] VQGAN decode time: {(time.time() - start_time) * 1000:.2f}ms")
-    audios = [audio.astype(np.float16).tobytes() for audio in audios]
+        # Decode the audio
+        tokens = [torch.tensor(token, dtype=torch.int) for token in req.tokens]
+        start_time = time.time()
+        audios = batch_vqgan_decode(decoder_model, tokens)
+        logger.info(
+            f"[EXEC] VQGAN decode time: {(time.time() - start_time) * 1000:.2f}ms"
+        )
+        audios = [audio.astype(np.float16).tobytes() for audio in audios]
 
 
-    # Return the response
-    return ormsgpack.packb(
-        ServeVQGANDecodeResponse(audios=audios),
-        option=ormsgpack.OPT_SERIALIZE_PYDANTIC,
-    )
+        # Return the response
+        return ormsgpack.packb(
+            ServeVQGANDecodeResponse(audios=audios),
+            option=ormsgpack.OPT_SERIALIZE_PYDANTIC,
+        )
+    except Exception as e:
+        logger.error(f"Error in VQGAN decode: {e}", exc_info=True)
+        raise HTTPException(
+            HTTPStatus.INTERNAL_SERVER_ERROR, content="Failed to decode tokens to audio"
+        )
 
 
 
 
 @routes.http.post("/v1/tts")
 @routes.http.post("/v1/tts")
 async def tts(req: Annotated[ServeTTSRequest, Body(exclusive=True)]):
 async def tts(req: Annotated[ServeTTSRequest, Body(exclusive=True)]):
-    # Get the model from the app
-    app_state = request.app.state
-    model_manager: ModelManager = app_state.model_manager
-    engine = model_manager.tts_inference_engine
-    sample_rate = engine.decoder_model.sample_rate
-
-    # Check if the text is too long
-    if app_state.max_text_length > 0 and len(req.text) > app_state.max_text_length:
+    """
+    Generate speech from text using TTS model.
+    """
+    try:
+        # Get the model from the app
+        app_state = request.app.state
+        model_manager: ModelManager = app_state.model_manager
+        engine = model_manager.tts_inference_engine
+        sample_rate = engine.decoder_model.sample_rate
+
+        # Check if the text is too long
+        if app_state.max_text_length > 0 and len(req.text) > app_state.max_text_length:
+            raise HTTPException(
+                HTTPStatus.BAD_REQUEST,
+                content=f"Text is too long, max length is {app_state.max_text_length}",
+            )
+
+        # Check if streaming is enabled
+        if req.streaming and req.format != "wav":
+            raise HTTPException(
+                HTTPStatus.BAD_REQUEST,
+                content="Streaming only supports WAV format",
+            )
+
+        # Perform TTS
+        if req.streaming:
+            return StreamResponse(
+                iterable=inference_async(req, engine),
+                headers={
+                    "Content-Disposition": f"attachment; filename=audio.{req.format}",
+                },
+                content_type=get_content_type(req.format),
+            )
+        else:
+            fake_audios = next(inference(req, engine))
+            buffer = io.BytesIO()
+            sf.write(
+                buffer,
+                fake_audios,
+                sample_rate,
+                format=req.format,
+            )
+
+            return StreamResponse(
+                iterable=buffer_to_async_generator(buffer.getvalue()),
+                headers={
+                    "Content-Disposition": f"attachment; filename=audio.{req.format}",
+                },
+                content_type=get_content_type(req.format),
+            )
+    except HTTPException:
+        # Re-raise HTTP exceptions as they are already properly formatted
+        raise
+    except Exception as e:
+        logger.error(f"Error in TTS generation: {e}", exc_info=True)
         raise HTTPException(
         raise HTTPException(
-            HTTPStatus.BAD_REQUEST,
-            content=f"Text is too long, max length is {app_state.max_text_length}",
+            HTTPStatus.INTERNAL_SERVER_ERROR, content="Failed to generate speech"
         )
         )
 
 
-    # Check if streaming is enabled
-    if req.streaming and req.format != "wav":
-        raise HTTPException(
-            HTTPStatus.BAD_REQUEST,
-            content="Streaming only supports WAV format",
-        )
-
-    # Perform TTS
-    if req.streaming:
-        return StreamResponse(
-            iterable=inference_async(req, engine),
-            headers={
-                "Content-Disposition": f"attachment; filename=audio.{req.format}",
-            },
-            content_type=get_content_type(req.format),
-        )
-    else:
-        fake_audios = next(inference(req, engine))
-        buffer = io.BytesIO()
-        sf.write(
-            buffer,
-            fake_audios,
-            sample_rate,
-            format=req.format,
-        )
-
-        return StreamResponse(
-            iterable=buffer_to_async_generator(buffer.getvalue()),
-            headers={
-                "Content-Disposition": f"attachment; filename=audio.{req.format}",
-            },
-            content_type=get_content_type(req.format),
+
+@routes.http.post("/v1/references/add")
+async def add_reference(
+    id: str = Body(...), audio: UploadFile = Body(...), text: str = Body(...)
+):
+    """
+    Add a new reference voice with audio file and text.
+    """
+    temp_file_path = None
+
+    try:
+        # Validate input parameters
+        if not id or not id.strip():
+            raise ValueError("Reference ID cannot be empty")
+
+        if not text or not text.strip():
+            raise ValueError("Reference text cannot be empty")
+
+        # Get the model manager to access the reference loader
+        app_state = request.app.state
+        model_manager: ModelManager = app_state.model_manager
+        engine = model_manager.tts_inference_engine
+
+        # Read the uploaded audio file
+        audio_content = audio.read()
+        if not audio_content:
+            raise ValueError("Audio file is empty or could not be read")
+
+        # Create a temporary file for the audio data
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
+            temp_file.write(audio_content)
+            temp_file_path = temp_file.name
+
+        # Add the reference using the engine's reference loader
+        engine.add_reference(id, temp_file_path, text)
+
+        response = AddReferenceResponse(
+            success=True,
+            message=f"Reference voice '{id}' added successfully",
+            reference_id=id,
+        )
+        return format_response(response)
+
+    except FileExistsError as e:
+        logger.warning(f"Reference ID '{id}' already exists: {e}")
+        response = AddReferenceResponse(
+            success=False,
+            message=f"Reference ID '{id}' already exists",
+            reference_id=id,
+        )
+        return format_response(response, status_code=409)  # Conflict
+
+    except ValueError as e:
+        logger.warning(f"Invalid input for reference '{id}': {e}")
+        response = AddReferenceResponse(success=False, message=str(e), reference_id=id)
+        return format_response(response, status_code=400)
+
+    except (FileNotFoundError, OSError) as e:
+        logger.error(f"File system error for reference '{id}': {e}")
+        response = AddReferenceResponse(
+            success=False, message="File system error occurred", reference_id=id
+        )
+        return format_response(response, status_code=500)
+
+    except Exception as e:
+        logger.error(f"Unexpected error adding reference '{id}': {e}", exc_info=True)
+        response = AddReferenceResponse(
+            success=False, message="Internal server error occurred", reference_id=id
+        )
+        return format_response(response, status_code=500)
+
+    finally:
+        # Clean up temporary file
+        if temp_file_path and os.path.exists(temp_file_path):
+            try:
+                os.unlink(temp_file_path)
+            except OSError as e:
+                logger.warning(
+                    f"Failed to clean up temporary file {temp_file_path}: {e}"
+                )
+
+
@routes.http.get("/v1/references/list")
async def list_references():
    """
    Return all currently available reference voice IDs.

    Responds with a ListReferencesResponse carrying the IDs; any
    unexpected failure yields a 500 with an empty ID list.
    """
    try:
        # Reach the TTS engine through the app-wide model manager.
        model_manager: ModelManager = request.app.state.model_manager
        engine = model_manager.tts_inference_engine

        # Ask the engine's reference loader for every valid reference ID.
        ids = engine.list_reference_ids()

        return format_response(
            ListReferencesResponse(
                success=True,
                reference_ids=ids,
                message=f"Found {len(ids)} reference voices",
            )
        )

    except Exception as e:
        logger.error(f"Unexpected error listing references: {e}", exc_info=True)
        return format_response(
            ListReferencesResponse(
                success=False,
                reference_ids=[],
                message="Internal server error occurred",
            ),
            status_code=500,
        )
+
+
@routes.http.delete("/v1/references/delete")
async def delete_reference(reference_id: str = Body(...)):
    """
    Delete a reference voice by ID.

    Returns 400 for an empty ID, 404 when the reference does not exist,
    409 is never used here, and 500 for file-system or unexpected errors.
    """
    try:
        # Reject empty / whitespace-only IDs up front.
        if not (reference_id and reference_id.strip()):
            raise ValueError("Reference ID cannot be empty")

        # Resolve the inference engine via the shared model manager.
        model_manager: ModelManager = request.app.state.model_manager
        engine = model_manager.tts_inference_engine

        # The engine's reference loader performs the actual removal.
        engine.delete_reference(reference_id)

        return format_response(
            DeleteReferenceResponse(
                success=True,
                message=f"Reference voice '{reference_id}' deleted successfully",
                reference_id=reference_id,
            )
        )

    except FileNotFoundError as e:
        logger.warning(f"Reference ID '{reference_id}' not found: {e}")
        return format_response(
            DeleteReferenceResponse(
                success=False,
                message=f"Reference ID '{reference_id}' not found",
                reference_id=reference_id,
            ),
            status_code=404,  # Not Found
        )

    except ValueError as e:
        logger.warning(f"Invalid input for reference '{reference_id}': {e}")
        return format_response(
            DeleteReferenceResponse(
                success=False, message=str(e), reference_id=reference_id
            ),
            status_code=400,
        )

    # Note: FileNotFoundError is handled above, so this catches the
    # remaining OSError family (permissions, busy directory, ...).
    except OSError as e:
        logger.error(f"File system error deleting reference '{reference_id}': {e}")
        return format_response(
            DeleteReferenceResponse(
                success=False,
                message="File system error occurred",
                reference_id=reference_id,
            ),
            status_code=500,
        )

    except Exception as e:
        logger.error(
            f"Unexpected error deleting reference '{reference_id}': {e}", exc_info=True
        )
        return format_response(
            DeleteReferenceResponse(
                success=False,
                message="Internal server error occurred",
                reference_id=reference_id,
            ),
            status_code=500,
        )
+
+
@routes.http.post("/v1/references/update")
async def update_reference(
    old_reference_id: str = Body(...), new_reference_id: str = Body(...)
):
    """
    Rename a reference voice directory from old_reference_id to new_reference_id.

    Returns 400 for invalid input, 404 when the source reference does not
    exist, 409 when the destination already exists, and 500 for file-system
    or unexpected errors.
    """
    try:
        # Validate input parameters
        if not old_reference_id or not old_reference_id.strip():
            raise ValueError("Old reference ID cannot be empty")
        if not new_reference_id or not new_reference_id.strip():
            raise ValueError("New reference ID cannot be empty")
        if old_reference_id == new_reference_id:
            raise ValueError("New reference ID must be different from old reference ID")

        # Validate BOTH IDs per ReferenceLoader rules. Checking the old ID
        # too is a security fix: an unvalidated value such as "../x" would
        # otherwise be joined below and could rename directories outside
        # the references base (path traversal).
        id_pattern = r"^[a-zA-Z0-9\-_ ]+$"
        if not re.match(id_pattern, old_reference_id) or len(old_reference_id) > 255:
            raise ValueError(
                "Old reference ID contains invalid characters or is too long"
            )
        if not re.match(id_pattern, new_reference_id) or len(new_reference_id) > 255:
            raise ValueError(
                "New reference ID contains invalid characters or is too long"
            )

        # Access engine to update caches after renaming
        app_state = request.app.state
        model_manager: ModelManager = app_state.model_manager
        engine = model_manager.tts_inference_engine

        refs_base = Path("references")
        old_dir = refs_base / old_reference_id
        new_dir = refs_base / new_reference_id

        # Existence checks (is_dir() is False for a missing path, covering
        # both "does not exist" and "exists but is not a directory").
        if not old_dir.is_dir():
            raise FileNotFoundError(f"Reference ID '{old_reference_id}' not found")
        if new_dir.exists():
            # Conflict: destination already exists
            response = UpdateReferenceResponse(
                success=False,
                message=f"Reference ID '{new_reference_id}' already exists",
                old_reference_id=old_reference_id,
                new_reference_id=new_reference_id,
            )
            return format_response(response, status_code=409)

        # Perform rename
        old_dir.rename(new_dir)

        # Update in-memory cache key if present, so subsequent requests
        # resolve the renamed reference without a reload from disk.
        if old_reference_id in engine.ref_by_id:
            engine.ref_by_id[new_reference_id] = engine.ref_by_id.pop(old_reference_id)

        response = UpdateReferenceResponse(
            success=True,
            message=(
                f"Reference voice renamed from '{old_reference_id}' to '{new_reference_id}' successfully"
            ),
            old_reference_id=old_reference_id,
            new_reference_id=new_reference_id,
        )
        return format_response(response)

    except FileNotFoundError as e:
        logger.warning(str(e))
        response = UpdateReferenceResponse(
            success=False,
            message=str(e),
            old_reference_id=old_reference_id,
            new_reference_id=new_reference_id,
        )
        return format_response(response, status_code=404)

    except ValueError as e:
        logger.warning(f"Invalid input for update reference: {e}")
        # Both IDs are function parameters and are always bound, so no
        # `locals()` guard is needed (the original checks were dead code).
        response = UpdateReferenceResponse(
            success=False,
            message=str(e),
            old_reference_id=old_reference_id,
            new_reference_id=new_reference_id,
        )
        return format_response(response, status_code=400)

    # FileNotFoundError (a subclass) is handled above; this catches the
    # rest of the OSError family raised by rename().
    except OSError as e:
        logger.error(f"File system error renaming reference: {e}")
        response = UpdateReferenceResponse(
            success=False,
            message="File system error occurred",
            old_reference_id=old_reference_id,
            new_reference_id=new_reference_id,
        )
        return format_response(response, status_code=500)

    except Exception as e:
        logger.error(f"Unexpected error updating reference: {e}", exc_info=True)
        response = UpdateReferenceResponse(
            success=False,
            message="Internal server error occurred",
            old_reference_id=old_reference_id,
            new_reference_id=new_reference_id,
        )
        return format_response(response, status_code=500)