2 سال پیش · a095c8f6c7
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
 
															     "samplerate>=0.2.1",
														
 
															     "resampy>=0.4.3",
														
 
															     "spaces>=0.26.1",
														
 
															-    "einx[torch]==0.2.0"
														
 
															+    "einx[torch]==0.2.2"
														
 
															 ]
														
 
															 [project.optional-dependencies]
														
--- a/tools/api.py
+++ b/tools/api.py
@@ -1,5 +1,6 @@
 
															 import base64
														
 
															 import io
														
 
															+import threading
														
 
															 import traceback
														
 
															 from argparse import ArgumentParser
														
 
															 from http import HTTPStatus
														
@@ -17,15 +18,13 @@ from kui.wsgi import (
 
															     Kui,
														
 
															     OpenAPI,
														
 
															     StreamResponse,
														
 
															-    allow_cors,
														
 
															 )
														
 
															 from kui.wsgi.routing import MultimethodRoutes
														
 
															 from loguru import logger
														
 
															 from pydantic import BaseModel
														
 
															 from transformers import AutoTokenizer
														
 
															-from tools.llama.generate import generate_long
														
 
															-from tools.llama.generate import load_model as load_llama_model
														
 
															+from tools.llama.generate import launch_thread_safe_queue
														
 
															 from tools.vqgan.inference import load_model as load_vqgan_model
														
 
															 from tools.webui import inference
														
@@ -95,11 +94,9 @@ def inference(req: InvokeRequest):
 
															         prompt_tokens = vqgan_model.encode(audios, audio_lengths)[0][0]
														
 
															     # LLAMA Inference
														
 
															-    result = generate_long(
														
 
															-        model=llama_model,
														
 
															+    request = dict(
														
 
															         tokenizer=llama_tokenizer,
														
 
															         device=vqgan_model.device,
														
 
															-        decode_one_token=decode_one_token,
														
 
															         max_new_tokens=req.max_new_tokens,
														
 
															         text=req.text,
														
 
															         top_k=int(req.top_k) if req.top_k > 0 else None,
														
@@ -115,7 +112,18 @@ def inference(req: InvokeRequest):
 
															         prompt_text=req.reference_text,
														
 
															     )
														
 
															-    codes = next(result)
														
 
															+    payload = dict(
														
 
															+        event=threading.Event(),
														
 
															+        request=request,
														
 
															+    )
														
 
															+    llama_queue.put(payload)
														
 
															+
														
 
															+    # Wait for the result
														
 
															+    payload["event"].wait()
														
 
															+    if payload["success"] is False:
														
 
															+        raise payload["response"]
														
 
															+
														
 
															+    codes = payload["response"][0]
														
 
															     # VQGAN Inference
														
 
															     feature_lengths = torch.tensor([codes.shape[1]], device=vqgan_model.device)
														
@@ -128,7 +136,7 @@ def inference(req: InvokeRequest):
 
															     return fake_audios
														
 
															-@routes.http.post("/invoke")
														
 
															+@routes.http.post("/v1/invoke")
														
 
															 def api_invoke_model(
														
 
															     req: Annotated[InvokeRequest, Body(exclusive=True)],
														
 
															 ):
														
@@ -139,7 +147,7 @@ def api_invoke_model(
 
															     if args.max_gradio_length > 0 and len(req.text) > args.max_gradio_length:
														
 
															         raise HTTPException(
														
 
															             HTTPStatus.BAD_REQUEST,
														
 
															-            f"Text is too long, max length is {args.max_gradio_length}",
														
 
															+            content=f"Text is too long, max length is {args.max_gradio_length}",
														
 
															         )
														
 
															     try:
														
@@ -147,7 +155,11 @@ def api_invoke_model(
 
															         lock.acquire()
														
 
															         fake_audios = inference(req)
														
 
															     except Exception as e:
														
 
															-        raise HTTPException(HTTPStatus.INTERNAL_SERVER_ERROR, str(e))
														
 
															+        import traceback
														
 
															+
														
 
															+        traceback.print_exc()
														
 
															+
														
 
															+        raise HTTPException(HTTPStatus.INTERNAL_SERVER_ERROR, content=str(e))
														
 
															     finally:
														
 
															         # Release lock
														
 
															         lock.release()
														
@@ -159,12 +171,14 @@ def api_invoke_model(
 
															         iterable=[buffer.getvalue()],
														
 
															         headers={
														
 
															             "Content-Disposition": f"attachment; filename=audio.{req.format}",
														
 
															-            "Content-Type": "application/octet-stream",
														
 
															         },
														
 
															+        # Make swagger-ui happy
														
 
															+        # content_type=f"audio/{req.format}",
														
 
															+        content_type="application/octet-stream",
														
 
															     )
														
 
															-@routes.http.post("/health")
														
 
															+@routes.http.post("/v1/health")
														
 
															 def api_health():
														
 
															     """
														
 
															     Health check
														
@@ -201,7 +215,14 @@ def parse_args():
 
															 # Define Kui app
														
 
															+openapi = OpenAPI(
														
 
															+    {
														
 
															+        "title": "Fish Speech API",
														
 
															+    },
														
 
															+).routes
														
 
															+
														
 
															 app = Kui(
														
 
															+    routes=routes + openapi[1:],  # Remove the default route
														
 
															     exception_handlers={
														
 
															         HTTPException: http_execption_handler,
														
 
															         Exception: other_exception_handler,
														
@@ -209,9 +230,6 @@ app = Kui(
 
															     cors_config={},
														
 
															 )
														
 
															-# Swagger UI & routes
														
 
															-app.router << ("/v1" // routes) << ("/docs" // OpenAPI().routes)
														
 
															-
														
 
															 if __name__ == "__main__":
														
 
															     import threading
														
@@ -222,7 +240,7 @@ if __name__ == "__main__":
 
															     args.precision = torch.half if args.half else torch.bfloat16
														
 
															     logger.info("Loading Llama model...")
														
 
															-    llama_model, decode_one_token = load_llama_model(
														
 
															+    llama_queue = launch_thread_safe_queue(
														
 
															         config_name=args.llama_config_name,
														
 
															         checkpoint_path=args.llama_checkpoint_path,
														
 
															         device=args.device,
														
--- a/tools/llama/generate.py
+++ b/tools/llama/generate.py
@@ -1,4 +1,6 @@
 
															 import os
														
 
															+import queue
														
 
															+import threading
														
 
															 import time
														
 
															 from pathlib import Path
														
 
															 from typing import Optional, Tuple, Union
														
@@ -567,10 +569,7 @@ def generate_long(
 
															             codes = y[1:, prompt_length:-2].clone()
														
 
															             codes = codes - 2
														
 
															-            if not (codes >= 0).all():
														
 
															-                global_encoded.pop()
														
 
															-                logger.warning(f"Negative code found: {codes}, retrying ...")
														
 
															-                continue
														
 
															+            assert (codes >= 0).all(), f"Negative code found"
														
 
															             decoded = y[:, prompt_length:-1].clone()
														
 
															             if decoded[0, -1] != im_end_id:  # <im_end>
														
@@ -599,6 +598,47 @@ def generate_long(
 
															             yield all_codes
														
 
															+def launch_thread_safe_queue(
														
 
															+    config_name,
														
 
															+    checkpoint_path,
														
 
															+    device,
														
 
															+    precision,
														
 
															+    max_length,
														
 
															+    compile=False,
														
 
															+):
														
 
															+    input_queue = queue.Queue()
														
 
															+
														
 
															+    def worker():
														
 
															+        model, decode_one_token = load_model(
														
 
															+            config_name, checkpoint_path, device, precision, max_length, compile=compile
														
 
															+        )
														
 
															+
														
 
															+        while True:
														
 
															+            item = input_queue.get()
														
 
															+            if item is None:
														
 
															+                break
														
 
															+
														
 
															+            kwargs = item["request"]
														
 
															+            event = item["event"]
														
 
															+
														
 
															+            try:
														
 
															+                item["success"] = True
														
 
															+                item["response"] = list(
														
 
															+                    generate_long(
														
 
															+                        model=model, decode_one_token=decode_one_token, **kwargs
														
 
															+                    )
														
 
															+                )
														
 
															+            except Exception as e:
														
 
															+                item["success"] = False
														
 
															+                item["response"] = e
														
 
															+
														
 
															+            event.set()
														
 
															+
														
 
															+    threading.Thread(target=worker, daemon=True).start()
														
 
															+
														
 
															+    return input_queue
														
 
															+
														
 
															+
														
 
															 @click.command()
														
 
															 @click.option(
														
 
															     "--text",
														
--- a/tools/webui.py
+++ b/tools/webui.py
@@ -1,5 +1,6 @@
 
															 import html
														
 
															 import os
														
 
															+import threading
														
 
															 from argparse import ArgumentParser
														
 
															 from io import BytesIO
														
 
															 from pathlib import Path
														
@@ -12,8 +13,7 @@ from loguru import logger
 
															 from torchaudio import functional as AF
														
 
															 from transformers import AutoTokenizer
														
 
															-from tools.llama.generate import generate_long
														
 
															-from tools.llama.generate import load_model as load_llama_model
														
 
															+from tools.llama.generate import launch_thread_safe_queue
														
 
															 from tools.vqgan.inference import load_model as load_vqgan_model
														
 
															 # Make einx happy
														
@@ -85,11 +85,9 @@ def inference(
 
															         prompt_tokens = vqgan_model.encode(audios, audio_lengths)[0][0]
														
 
															     # LLAMA Inference
														
 
															-    result = generate_long(
														
 
															-        model=llama_model,
														
 
															+    request = dict(
														
 
															         tokenizer=llama_tokenizer,
														
 
															         device=vqgan_model.device,
														
 
															-        decode_one_token=decode_one_token,
														
 
															         max_new_tokens=max_new_tokens,
														
 
															         text=text,
														
 
															         top_k=int(top_k) if top_k > 0 else None,
														
@@ -105,7 +103,18 @@ def inference(
 
															         prompt_text=reference_text if enable_reference_audio else None,
														
 
															     )
														
 
															-    codes = next(result)
														
 
															+    payload = dict(
														
 
															+        event=threading.Event(),
														
 
															+        request=request,
														
 
															+    )
														
 
															+    llama_queue.put(payload)
														
 
															+
														
 
															+    # Wait for the result
														
 
															+    payload["event"].wait()
														
 
															+    if payload["success"] is False:
														
 
															+        raise payload["response"]
														
 
															+
														
 
															+    codes = payload["response"][0]
														
 
															     # VQGAN Inference
														
 
															     feature_lengths = torch.tensor([codes.shape[1]], device=vqgan_model.device)
														
@@ -270,7 +279,7 @@ if __name__ == "__main__":
 
															     args.precision = torch.half if args.half else torch.bfloat16
														
 
															     logger.info("Loading Llama model...")
														
 
															-    llama_model, decode_one_token = load_llama_model(
														
 
															+    llama_queue = launch_thread_safe_queue(
														
 
															         config_name=args.llama_config_name,
														
 
															         checkpoint_path=args.llama_checkpoint_path,
														
 
															         device=args.device,