Browse Source

feat: 完成txt to speech 接口

huangzhichao 2 ngày trước cách đây
commit
60fe1daae0

+ 14 - 0
.dockerignore

@@ -0,0 +1,14 @@
+.venv
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+build
+dist
+.git
+.gitignore
+.env
+*.log
+*.sqlite3
+app/audio

+ 16 - 0
.env.example

@@ -0,0 +1,16 @@
+## Copy this to `.env` and fill values
+
+# LLMs
+LLM_PROVIDER=mock
+LLM_MODEL=mock-echo-001
+LLM_TEMPERATURE=1.0
+OPENAI_API_KEY=
+
+# DashScope
+# Preferred name
+DASHSCOPE_API_KEY=
+# Legacy/fallback name supported by code
+APIKEY=
+
+# OSS upload endpoint
+UPLOAD_PATH=https://api.piaoquantv.com/ad/file/upload

+ 19 - 0
.gitignore

@@ -0,0 +1,19 @@
+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+.pytest_cache/
+
+# Virtual envs
+.venv/
+venv/
+
+# OS
+.DS_Store
+
+# IDE
+.idea/
+.vscode/
+
+# Local environment
+.env

+ 31 - 0
Dockerfile

@@ -0,0 +1,31 @@
+# syntax=docker/dockerfile:1
+
+# Image for the FastAPI service, built on the slim Python 3.11 base.
+FROM python:3.11-slim AS base
+
+# Python: do not write .pyc files and keep stdout/stderr unbuffered.
+# pip: skip the version-check notice and do not keep a download cache.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1
+
+WORKDIR /app
+
+# System deps (build-essential and curl); the apt package lists are removed
+# in the same layer so they do not stay in the image.
+RUN apt-get update -y && apt-get install -y --no-install-recommends \
+    build-essential curl && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Python deps first (better layer caching)
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+# Copy source
+COPY app ./app
+COPY README.md .
+
+# Optionally copy .env at build time (usually mounted at runtime)
+# COPY .env .
+
+# The server below listens on port 8000.
+EXPOSE 8000
+
+# Serve the `app` object from app/main.py on all interfaces.
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
+

+ 102 - 0
README.md

@@ -0,0 +1,102 @@
+# FastAPI 最小项目模板
+
+一个最小可运行的 FastAPI 项目,包含健康检查与示例路由。
+
+## 运行步骤
+
+1) 创建虚拟环境并激活
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate  # Windows: .venv\Scripts\activate
+python -m pip install -U pip
+```
+
+2) 安装依赖
+
+```bash
+pip install -r requirements.txt
+```
+
+3) 启动开发服务(热重载)
+
+```bash
+uvicorn app.main:app --reload --port 8000
+```
+
+4) 打开接口文档
+
+- Swagger UI: http://127.0.0.1:8000/docs
+- ReDoc: http://127.0.0.1:8000/redoc
+
+## 项目结构
+
+```
+.
+├── app
+│   ├── __init__.py
+│   ├── core
+│   │   ├── __init__.py
+│   │   └── config.py       # 环境配置(LLM_PROVIDER 等)
+│   ├── api
+│   │   ├── __init__.py
+│   │   ├── deps.py         # 依赖注入(SpeechService)
+│   │   └── routes.py       # 路由(/api/ping, /api/llm/text-to-speech)
+│   ├── providers
+│   │   ├── __init__.py
+│   │   ├── base.py         # Provider 协议定义
+│   │   └── mock_provider.py  # 本地回显 Provider
+│   ├── schemas
+│   │   ├── __init__.py
+│   │   └── llm.py          # LLM 请求/响应模型
+│   ├── services
+│   │   ├── __init__.py
+│   │   └── llm_service.py  # LLM 业务服务
+│   └── main.py            # FastAPI 应用入口(/health)
+├── requirements.txt        # 依赖
+├── .gitignore
+└── README.md
+```
+
+## LLM 业务模块
+
+- 环境变量:
+  - `LLM_PROVIDER`:默认 `mock`
+  - `LLM_MODEL`:默认 `mock-echo-001`
+  - `LLM_TEMPERATURE`:默认 `1.0`
+  - `OPENAI_API_KEY`:如接入 OpenAI 时使用(目前示例未调用外部服务)
+
+- 示例接口:`POST /api/llm/chat`
+
+请求体示例:
+
+```json
+{
+  "messages": [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "你好,今天上海天气如何?"}
+  ],
+  "model": "mock-echo-001",
+  "temperature": 0.7
+}
+```
+
+响应示例(mock 提供器):
+
+```json
+{
+  "content": "[mock] You said: 你好,今天上海天气如何?",
+  "model": "mock-echo-001",
+  "usage": {"prompt_tokens": 8, "completion_tokens": 7, "total_tokens": 15}
+}
+```
+
+后续要接入真实大模型时,只需新增对应 Provider(如 `openai_provider.py`),在 `app/api/deps.py` 中根据 `LLM_PROVIDER` 切换实例即可。
+
+## 常见扩展(可选)
+
+- 配置管理:引入 `pydantic-settings` 并添加 `Settings` 类读取环境变量
+- 日志:使用 `logging` 或 `structlog`
+- 测试:新增 `pytest`,并添加 `tests/` 目录
+- 代码质量:`ruff`/`black`/`mypy`,及 `pre-commit`
+- 容器化:编写 `Dockerfile` 与 `docker-compose.yml`

+ 1 - 0
app/__init__.py

@@ -0,0 +1 @@
+# Makes `app` a package

+ 1 - 0
app/api/__init__.py

@@ -0,0 +1 @@
+# Makes `app.api` a package

+ 10 - 0
app/api/deps.py

@@ -0,0 +1,10 @@
+from fastapi import Depends
+
+from ..providers.speech_provider import SpeechProvider
+from ..services.speech_service import SpeechService
+
+
+def get_speech_service() -> SpeechService:
+    provider = SpeechProvider()
+    return SpeechService(provider)
+

+ 17 - 0
app/api/routes.py

@@ -0,0 +1,17 @@
+from fastapi import APIRouter, Depends
+from .deps import get_speech_service
+from ..schemas.llm import TextToSpeechResponse, TextToSpeechRequest
+from ..services.speech_service import SpeechService
+
+
+router = APIRouter()
+
+
+# Connectivity check for the API router: always answers with a fixed
+# {"message": "pong"} payload and touches no other component.
+@router.get("/ping", tags=["default"])
+def ping():
+    return {"message": "pong"}
+
+# Text-to-speech endpoint: the request body is parsed as TextToSpeechRequest,
+# handed to the SpeechService obtained from get_speech_service, and the
+# service's result is returned using the TextToSpeechResponse model.
+@router.post('/llm/text-to-speech', response_model=TextToSpeechResponse, tags=["llm"])
+def text_to_speech(req: TextToSpeechRequest, service: SpeechService = Depends(get_speech_service)):
+    return service.text_to_speech(req)
+

BIN
app/audio/betty.mp3


BIN
app/audio/zhichu.mp3


BIN
app/audio/zhida.mp3


BIN
app/audio/zhifei.mp3


BIN
app/audio/zhiming.mp3


BIN
app/audio/zhiru.mp3


+ 2 - 0
app/core/__init__.py

@@ -0,0 +1,2 @@
+"""Core utilities like configuration and app-wide helpers."""
+

+ 31 - 0
app/core/config.py

@@ -0,0 +1,31 @@
+import os
+from typing import Optional
+from dataclasses import dataclass
+
+try:
+    # Load variables from a local .env if present
+    from dotenv import load_dotenv  # type: ignore
+
+    load_dotenv()
+except Exception:
+    # If python-dotenv isn't installed, skip silently. Env vars still work.
+    # NOTE(review): this handler also swallows any error raised by
+    # load_dotenv() itself, not only a missing package - confirm intended.
+    pass
+
+
+@dataclass
+class Settings:
+    """Application settings sourced from environment variables.
+
+    Each default below is an expression evaluated once, when this class body
+    executes, so every ``Settings()`` instance carries the values captured at
+    that moment; later changes to the environment are not picked up.
+    """
+
+    # LLM_PROVIDER, lower-cased; "mock" when unset.
+    llm_provider: str = os.getenv("LLM_PROVIDER", "mock").lower()
+    # LLM_MODEL; "mock-echo-001" when unset.
+    default_model: str = os.getenv("LLM_MODEL", "mock-echo-001")
+    # LLM_TEMPERATURE parsed as float; float() raises ValueError if the
+    # variable is set to something that is not a number (including "").
+    temperature: float = float(os.getenv("LLM_TEMPERATURE", "1.0"))
+    # OPENAI_API_KEY; None when unset.
+    openai_api_key: Optional[str] = os.getenv("OPENAI_API_KEY")
+    # DASHSCOPE_API_KEY, falling back to APIKEY when the first is unset or
+    # empty; None/"" when neither provides a value.
+    dashscope_api_key: Optional[str] = (
+        os.getenv("DASHSCOPE_API_KEY") or os.getenv("APIKEY")
+    )
+    # UPLOAD_PATH; the URL below is used only when the variable is unset.
+    upload_path: Optional[str] = os.getenv(
+        "UPLOAD_PATH", "https://api.piaoquantv.com/ad/file/upload"
+    )
+
+
+def get_settings() -> Settings:
+    """Return a new Settings instance populated with the class-level defaults."""
+    return Settings()
+

+ 15 - 0
app/main.py

@@ -0,0 +1,15 @@
+from fastapi import FastAPI
+from .api.routes import router as api_router
+
+
+# Application object served by uvicorn as "app.main:app".
+app = FastAPI(title="AI Server", version="0.1.0")
+
+
+@app.get("/health", tags=["health"])  # simple health check
+def health():
+    return {"status": "ok"}
+
+
+# Business API routes, all mounted under the /api prefix
+app.include_router(api_router, prefix="/api")
+

+ 2 - 0
app/providers/__init__.py

@@ -0,0 +1,2 @@
+"""LLM provider implementations (OpenAI, Azure, Mock, etc.)."""
+

+ 25 - 0
app/providers/base.py

@@ -0,0 +1,25 @@
+from typing import List, Optional, Protocol
+
+from ..schemas.llm import ChatMessage, ChatResponse, TextToSpeechResponse
+
+
+class LLMProvider(Protocol):
+    """Structural interface for chat-completion backends."""
+
+    def chat(
+        self,
+        messages: List[ChatMessage],
+        *,
+        model: Optional[str] = None,
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+    ) -> ChatResponse:
+        """Produce a ChatResponse for `messages`.
+
+        `model`, `temperature` and `max_tokens` are keyword-only and optional;
+        how a None value is interpreted is left to the implementation.
+        """
+        ...
+
+class SpeechProvider(Protocol):
+    """Structural interface for text-to-speech backends.
+
+    NOTE(review): the concrete SpeechProvider in
+    app/providers/speech_provider.py takes (pitch, rate, filename, text,
+    *, model, format), and SpeechService calls it that way, so it does not
+    match the signature declared here - confirm which one is intended.
+    """
+
+    def text_to_speech(
+        self,
+        text: str,
+        *,
+        model: Optional[str] = None,
+        format: Optional[str] = None,
+    ) -> TextToSpeechResponse:
+        """Convert `text` to audio and return a TextToSpeechResponse."""
+        ...

+ 33 - 0
app/providers/mock_provider.py

@@ -0,0 +1,33 @@
+from typing import List, Optional
+
+from ..schemas.llm import ChatMessage, ChatResponse, Usage
+
+
+class MockProvider:
+    """A simple echo-like provider for local testing.
+
+    It takes the last user message and returns a prefixed reply.
+    """
+
+    def chat(
+        self,
+        messages: List[ChatMessage],
+        *,
+        model: Optional[str] = None,
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+    ) -> ChatResponse:
+        last_user = next((m.content for m in reversed(messages) if m.role == "user"), "")
+        content = f"[mock] You said: {last_user}"
+
+        # Very rough word-count "tokens" approximation for demo purposes only
+        prompt_tokens = sum(len(m.content.split()) for m in messages)
+        completion_tokens = len(content.split())
+        usage = Usage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+        )
+
+        return ChatResponse(content=content, model=model or "mock-echo-001", usage=usage)
+

+ 185 - 0
app/providers/speech_provider.py

@@ -0,0 +1,185 @@
+from wave import open as wave_open
+from pathlib import Path
+import re
+import os
+import json
+from typing import Optional
+
+import dashscope
+from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
+from dashscope.audio.tts import ResultCallback, SpeechSynthesizer, SpeechSynthesisResult
+
+import requests
+
+from ..schemas.llm import TextToSpeechResponse
+from ..core.config import get_settings
+
+# Module-level setup; executed once when this module is imported.
+settings = get_settings()
+# Configure DashScope API key from env/.env ("" when no key is configured)
+dashscope.api_key = settings.dashscope_api_key or ""
+# Upload endpoint; the literal URL is used when the setting is None or empty
+UPLOAD_PATH = settings.upload_path or 'https://api.piaoquantv.com/ad/file/upload'
+
+
+def _safe_filename(name: str) -> str:
+    # Keep alphanum, dash, underscore, Chinese; replace others with '_'
+    return re.sub(r"[^\w\-\u4e00-\u9fff]+", "_", name).strip("_") or "output"
+
+
+class SpeechProvider:
+    def text_to_speech(self, pitch: float, rate: float, filename: str, text: str, *, model: Optional[str] = None, format: Optional[str] = None) -> TextToSpeechResponse:
+        # Resolve output path under app/audio and ensure directory exists
+        app_dir = Path(__file__).resolve().parents[1]  # .../app
+        audio_dir = app_dir / "audio"
+        audio_dir.mkdir(parents=True, exist_ok=True)
+
+        # determine desired output format (default mp3 for smaller size)
+        audio_format = (format or 'mp3').lower()
+        if audio_format not in {"wav", "mp3"}:
+            audio_format = "mp3"
+
+        # choose extension and sample rate
+        ext = "wav" if audio_format == "wav" else "mp3"
+        sample_rate = 48000 if audio_format == "wav" else 24000
+
+        filename = f"{_safe_filename(filename)}.{ext}"
+        out_path = audio_dir / filename
+
+        # Prepare callback with audio params
+        callback = Callback(
+            out_path=str(out_path),
+            sample_rate=sample_rate,
+            channels=1,
+            sampwidth=2,
+            audio_format=audio_format,
+        )
+
+        SpeechSynthesizer.call(
+            model=(model or 'sambert-zhifei-v1'),
+            text=text,
+            pitch=pitch,
+            rate=rate,
+            format=audio_format,
+            sample_rate=sample_rate,
+            callback=callback,
+            word_timestamp_enabled=True,
+            phoneme_timestamp_enabled=True,
+        )
+
+        # After synthesis completes, upload the file to OSS
+        try:
+            url = _upload_file(UPLOAD_PATH, out_path)
+            return TextToSpeechResponse(audio_url=url)
+        except Exception as e:
+            # If upload fails, fall back to local path to avoid breaking
+            print(f"[warn] Upload failed: {e}")
+            return TextToSpeechResponse(audio_url=str(out_path))
+
+
+class Callback(ResultCallback):
+    def __init__(self, out_path: str, sample_rate: int = 16000, channels: int = 1, sampwidth: int = 2, audio_format: str = "mp3"):
+        self.out_path = out_path
+        self.sample_rate = sample_rate
+        self.channels = channels
+        self.sampwidth = sampwidth
+        self.wav_file = None
+        self._fh = None
+        self.audio_format = audio_format
+
+    def on_open(self):
+        print('Speech synthesizer is opened.')
+        # Ensure parent directory exists (in case not created earlier)
+        Path(self.out_path).parent.mkdir(parents=True, exist_ok=True)
+        if self.audio_format == "wav":
+            self.wav_file = wave_open(self.out_path, 'wb')
+            self.wav_file.setnchannels(self.channels)
+            self.wav_file.setsampwidth(self.sampwidth)
+            self.wav_file.setframerate(self.sample_rate)
+        else:
+            # For mp3 (and other compressed formats), write raw bytes
+            self._fh = open(self.out_path, 'wb')
+
+    def on_complete(self):
+        print('Speech synthesizer is completed.')
+        if self.wav_file:
+            self.wav_file.close()
+            self.wav_file = None
+        if self._fh:
+            self._fh.close()
+            self._fh = None
+
+    def on_error(self, response: SpeechSynthesisResponse):
+        print('Speech synthesizer failed, response is %s' % (str(response)))
+
+    def on_close(self):
+        print('Speech synthesizer is closed.')
+
+    def on_event(self, result: SpeechSynthesisResult):
+        frame = result.get_audio_frame()
+        if not frame:
+            return
+        if self.wav_file:
+            self.wav_file.writeframes(frame)
+        elif self._fh:
+            self._fh.write(frame)
+
+
+def _extract_url_from_response(resp_json: dict) -> Optional[str]:
+    # Try common shapes: {data: {url}}, {url}, {data: "http..."}
+    try_keys = [
+        ("data", "fileUrl"),
+        ("data",),
+        ("fileUrl",),
+        ("result", "fileUrl"),
+        ("payload", "fileUrl"),
+    ]
+    for path in try_keys:
+        cur = resp_json
+        ok = True
+        for k in path:
+            if isinstance(cur, dict) and k in cur:
+                cur = cur[k]
+                print(cur)
+            else:
+                ok = False
+                break
+        if ok and isinstance(cur, str) and cur.startswith("http"):
+            return cur
+    return None
+
+
+def _upload_file(upload_url: str, file_path: Path) -> str:
+    if not upload_url:
+        raise ValueError("upload_url is empty")
+    if not Path(file_path).exists():
+        raise FileNotFoundError(str(file_path))
+
+    filename = Path(file_path).name
+    # Guess content type
+    content_type = "audio/mpeg" if filename.lower().endswith(".mp3") else "audio/wav"
+
+    with open(file_path, "rb") as f:
+        files = {
+            "file": (filename, f, content_type),
+            "fileType": (None, "VOICE")
+        }
+        resp = requests.post(upload_url, files=files, timeout=30)
+    resp.raise_for_status()
+
+    # Try to parse JSON for a URL; fallback to raw text if JSON invalid
+    url: Optional[str] = None
+    try:
+        data = resp.json()
+        url = _extract_url_from_response(data)
+    except Exception:
+        pass
+
+    if not url:
+        # As a last resort, if the response text looks like a URL, use it
+        txt = (resp.text or "").strip()
+        if txt.startswith("http"):
+            url = txt
+
+    if not url:
+        raise RuntimeError("Upload succeeded but no URL found in response")
+
+    return url

+ 2 - 0
app/schemas/__init__.py

@@ -0,0 +1,2 @@
+"""Pydantic schemas for request/response models."""
+

+ 40 - 0
app/schemas/llm.py

@@ -0,0 +1,40 @@
+from typing import List, Optional, Literal
+from pydantic import BaseModel, Field
+
+
+Role = Literal["system", "user", "assistant"]
+
+
+# One chat message: `role` must be "system", "user" or "assistant" (see Role);
+# `content` is the message text.
+class ChatMessage(BaseModel):
+    role: Role
+    content: str
+
+
+# Request body for a chat call.
+class ChatRequest(BaseModel):
+    # At least one message is required.
+    messages: List[ChatMessage] = Field(min_length=1)
+    # Optional model name; None leaves the choice to the provider.
+    model: Optional[str] = None
+    # Optional, constrained to 0 <= temperature <= 2.
+    temperature: Optional[float] = Field(default=None, ge=0, le=2)
+    # Optional, must be greater than 0 when given.
+    max_tokens: Optional[int] = Field(default=None, gt=0)
+
+
+# Token accounting attached to a chat response.
+class Usage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+# Result of a chat call: the reply text plus optional model name and usage.
+class ChatResponse(BaseModel):
+    content: str
+    model: Optional[str] = None
+    usage: Optional[Usage] = None
+
+# Result of a text-to-speech call. `audio_url` is the uploaded file's URL, or
+# a local file path when the provider's upload step fails.
+class TextToSpeechResponse(BaseModel):
+    audio_url: str
+
+# Request body for the text-to-speech endpoint.
+class TextToSpeechRequest(BaseModel):
+    # Forwarded unchanged to the synthesizer; both default to 1.
+    pitch: float = 1
+    rate: float = 1
+    # Base name of the output file; the provider sanitizes it and adds the
+    # extension.
+    filename: str
+    # Text to synthesize.
+    text: str
+    # Optional synthesizer model; the provider picks its default when None.
+    model: Optional[str] = None
+    # Optional output format; the provider accepts "wav" or "mp3" and treats
+    # anything else (or None) as "mp3".
+    format: Optional[str] = None

+ 2 - 0
app/services/__init__.py

@@ -0,0 +1,2 @@
+"""Business services for orchestrating application logic."""
+

+ 16 - 0
app/services/llm_service.py

@@ -0,0 +1,16 @@
+from ..schemas.llm import ChatRequest, ChatResponse
+from ..providers.base import LLMProvider
+
+
+class LLMService:
+    def __init__(self, provider: LLMProvider) -> None:
+        self._provider = provider
+
+    def chat(self, req: ChatRequest) -> ChatResponse:
+        return self._provider.chat(
+            req.messages,
+            model=req.model,
+            temperature=req.temperature,
+            max_tokens=req.max_tokens,
+        )
+

+ 17 - 0
app/services/speech_service.py

@@ -0,0 +1,17 @@
+from ..schemas.llm import TextToSpeechRequest, TextToSpeechResponse
+from ..providers.speech_provider import SpeechProvider
+
+
+class SpeechService:
+    def __init__(self, provider: SpeechProvider) -> None:
+        self._provider = provider
+
+    def text_to_speech(self, req: TextToSpeechRequest) -> TextToSpeechResponse:
+        return self._provider.text_to_speech(
+            req.pitch,
+            req.rate,
+            req.filename,
+            req.text,
+            model=req.model,
+            format=req.format,
+        )

+ 6 - 0
requirements.txt

@@ -0,0 +1,6 @@
+fastapi[standard]==0.113.0
+pydantic==2.8.0
+uvicorn[standard]>=0.30.0
+dashscope>=0.1.0
+python-dotenv>=1.0.1
+requests>=2.31.0