소스 검색

Merge branch 'feature/text-to-speech' of Web/pq-web-ai into master

huangzhichao 5 시간 전
부모
커밋
032faee850
9개의 변경된 파일, 255개의 추가 및 87개의 삭제
  1. 0 1
      .dockerignore
  2. 1 0
      .env
  3. 4 3
      .gitignore
  4. 7 6
      Dockerfile
  5. 4 0
      README.md
  6. 57 0
      app/core/logger.py
  7. 0 33
      app/providers/mock_provider.py
  8. 175 41
      app/providers/speech_provider.py
  9. 7 3
      app/schemas/speech.py

+ 0 - 1
.dockerignore

@@ -8,7 +8,6 @@ build
 dist
 .git
 .gitignore
-.env
 *.log
 *.sqlite3
 app/audio

+ 1 - 0
.env

@@ -0,0 +1 @@
+APIKEY = 'sk-65745f458f654368b4d3a83362a607c1'

+ 4 - 3
.gitignore

@@ -15,8 +15,9 @@ venv/
 .idea/
 .vscode/
 
-# Local environment
-.env
-
 # Temp cache directory
 temp/
+
+# logs
+logs/
+*/log

+ 7 - 6
Dockerfile

@@ -1,6 +1,6 @@
 # syntax=docker/dockerfile:1
 
-FROM python:3.11-slim AS base
+FROM registry.cn-hangzhou.aliyuncs.com/stuuudy/python:3.11-slim AS base
 
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
@@ -10,20 +10,21 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 WORKDIR /app
 
 # System deps
-RUN apt-get update -y && apt-get install -y --no-install-recommends \
-    build-essential curl && \
-    rm -rf /var/lib/apt/lists/*
+# RUN apt-get update -y && apt-get install -y --no-install-recommends \
+#     build-essential curl && \
+#     rm -rf /var/lib/apt/lists/*
 
 # Install Python deps first (better layer caching)
 COPY requirements.txt .
-RUN pip install -r requirements.txt
+RUN pip install --no-cache-dir --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple/
+RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple/
 
 # Copy source
 COPY app ./app
 COPY README.md .
 
 # Optionally copy .env at build time (usually mounted at runtime)
-# COPY .env .
+COPY .env .
 
 EXPOSE 8000
 

+ 4 - 0
README.md

@@ -29,6 +29,10 @@ uvicorn app.main:app --reload --port 8000
 - Swagger UI: http://127.0.0.1:8000/docs
 - ReDoc: http://127.0.0.1:8000/redoc
 
+5) 内网IP
+
+192.168.244.164
+
 ## 项目结构
 
 ```

+ 57 - 0
app/core/logger.py

@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import os
+import logging
+from pathlib import Path
+from logging.handlers import RotatingFileHandler
+from typing import Optional
+
+_CONFIGURED = False
+
+
+def _project_root() -> Path:
+    # Resolve repo root from this file location: app/core/logger.py -> repo/app/core
+    return Path(__file__).resolve().parents[2]
+
+
+def configure_logging(level: Optional[str] = None, log_dir: Optional[str] = None) -> None:
+    global _CONFIGURED
+    if _CONFIGURED:
+        return
+
+    # Determine log level
+    level_name = (level or os.getenv("LOG_LEVEL") or "INFO").upper()
+    log_level = getattr(logging, level_name, logging.INFO)
+
+    # Determine logs directory
+    base_dir = Path(log_dir) if log_dir else _project_root() / "logs"
+    base_dir.mkdir(parents=True, exist_ok=True)
+
+    # Root logger configuration
+    root = logging.getLogger()
+    root.setLevel(log_level)
+
+    fmt = logging.Formatter(
+        fmt="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+    file_handler = RotatingFileHandler(base_dir / "app.log", maxBytes=1_000_000, backupCount=3)
+    file_handler.setFormatter(fmt)
+    file_handler.setLevel(log_level)
+    root.addHandler(file_handler)
+
+    # Stream warnings+ to stderr for container visibility
+    stream = logging.StreamHandler()
+    stream.setLevel(logging.WARNING)
+    stream.setFormatter(fmt)
+    root.addHandler(stream)
+
+    _CONFIGURED = True
+
+
+def get_logger(name: Optional[str] = None) -> logging.Logger:
+    if not _CONFIGURED:
+        configure_logging()
+    return logging.getLogger(name or "app")
+

+ 0 - 33
app/providers/mock_provider.py

@@ -1,33 +0,0 @@
-from typing import List, Optional
-
-from ..schemas.speech import ChatMessage, ChatResponse, Usage
-
-
-class MockProvider:
-    """A simple echo-like provider for local testing.
-
-    It takes the last user message and returns a prefixed reply.
-    """
-
-    def chat(
-        self,
-        messages: List[ChatMessage],
-        *,
-        model: Optional[str] = None,
-        temperature: Optional[float] = None,
-        max_tokens: Optional[int] = None,
-    ) -> ChatResponse:
-        last_user = next((m.content for m in reversed(messages) if m.role == "user"), "")
-        content = f"[mock] You said: {last_user}"
-
-        # Very rough word-count "tokens" approximation for demo purposes only
-        prompt_tokens = sum(len(m.content.split()) for m in messages)
-        completion_tokens = len(content.split())
-        usage = Usage(
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-        )
-
-        return ChatResponse(content=content, model=model or "mock-echo-001", usage=usage)
-

+ 175 - 41
app/providers/speech_provider.py

@@ -11,13 +11,17 @@ from dashscope.audio.tts import ResultCallback, SpeechSynthesizer, SpeechSynthes
 
 import requests
 
-from ..schemas.speech import TextToSpeechResponse
+from ..schemas.speech import TextToSpeechResponse, DataPayload
 from ..core.config import get_settings
+from ..core.logger import get_logger
 
 settings = get_settings()
 # Configure DashScope API key from env/.env
 dashscope.api_key = settings.dashscope_api_key or ""
-UPLOAD_PATH = settings.upload_path
+UPLOAD_PATH = settings.upload_path or ""
+
+# module logger
+logger = get_logger("speech_provider")
 
 
 def _safe_filename(name: str) -> str:
@@ -30,11 +34,30 @@ class SpeechProvider:
         # Resolve output path under project-root/temp and ensure directory exists
         project_root = Path(__file__).resolve().parents[2]  # repo root
         audio_dir = project_root / "temp"
-        audio_dir.mkdir(parents=True, exist_ok=True)
+        try:
+            audio_dir.mkdir(parents=True, exist_ok=True)
+        except Exception as e:
+            logger.error("Failed to create audio directory %s: %s", audio_dir, e, exc_info=True)
+            return TextToSpeechResponse(code=1, data=None, msg=f"create audio dir failed: {e}")
+
+        # Basic input validation
+        if not isinstance(text, str) or not text.strip():
+            msg = "text is required"
+            logger.error(msg)
+            return TextToSpeechResponse(code=1, data=None, msg=msg)
+        if not isinstance(filename, str) or not filename.strip():
+            msg = "filename is required"
+            logger.error(msg)
+            return TextToSpeechResponse(code=1, data=None, msg=msg)
+        if not dashscope.api_key:
+            msg = "DASHSCOPE_API_KEY is missing"
+            logger.error(msg)
+            return TextToSpeechResponse(code=1, data=None, msg=msg)
 
         # determine desired output format (default mp3 for smaller size)
         audio_format = (format or 'mp3').lower()
         if audio_format not in {"wav", "mp3"}:
+            logger.info("unsupported format '%s', fallback to mp3", audio_format)
             audio_format = "mp3"
 
         # choose extension and sample rate
@@ -53,18 +76,39 @@ class SpeechProvider:
             audio_format=audio_format,
         )
 
-        SpeechSynthesizer.call(
-            model=(model or 'sambert-zhifei-v1'),
-            volume=volume,
-            text=text,
-            pitch=pitch,
-            rate=rate,
-            format=audio_format,
-            sample_rate=sample_rate,
-            callback=callback,
-            word_timestamp_enabled=True,
-            phoneme_timestamp_enabled=True,
-        )
+        # Run TTS with robust error handling
+        try:
+            SpeechSynthesizer.call(
+                model=(model or 'sambert-zhifei-v1'),
+                volume=volume,
+                text=text,
+                pitch=pitch,
+                rate=rate,
+                format=audio_format,
+                sample_rate=sample_rate,
+                callback=callback,
+                word_timestamp_enabled=True,
+                phoneme_timestamp_enabled=True,
+            )
+        except Exception as e:
+            logger.error("TTS call failed", exc_info=True)
+            # Ensure any open file handles are closed
+            try:
+                callback.on_complete()
+            except Exception:
+                pass
+            return TextToSpeechResponse(code=1, data=None, msg=str(e))
+
+        if callback.had_error:
+            # TTS reported an error via callback
+            base_msg = callback.error_message or "speech synthesis failed"
+            # Enrich message with model error code/status when available
+            if callback.error_code or callback.status_code is not None:
+                msg = f"[{callback.error_code or 'Error'}] {base_msg} (status={callback.status_code})"
+            else:
+                msg = base_msg
+            logger.error("TTS callback error: %s", msg)
+            return TextToSpeechResponse(code=1, data=None, msg=msg)
 
         # After synthesis completes, upload the file to OSS
         try:
@@ -73,12 +117,16 @@ class SpeechProvider:
             try:
                 Path(out_path).unlink(missing_ok=True)
             except Exception as del_err:
-                print(f"[warn] Failed to delete local audio {out_path}: {del_err}")
-            return TextToSpeechResponse(audio_url=url)
+                logger.warning("Failed to delete local audio %s: %s", out_path, del_err)
+            return TextToSpeechResponse(
+                code=0,
+                data=DataPayload(audio_url=url),
+                msg='success'
+            )
         except Exception as e:
-            # If upload fails, fall back to local path to avoid breaking
-            print(f"[warn] Upload failed: {e}")
-            return TextToSpeechResponse(audio_url=str(out_path))
+            # Keep local file for inspection; report error message
+            logger.error("Upload failed", exc_info=True)
+            return TextToSpeechResponse(code=1, data=None, msg=str(e))
 
 
 class Callback(ResultCallback):
@@ -90,22 +138,31 @@ class Callback(ResultCallback):
         self.wav_file = None
         self._fh = None
         self.audio_format = audio_format
+        self.had_error = False
+        self.error_message: Optional[str] = None
+        self.error_code: Optional[str] = None
+        self.status_code: Optional[int] = None
 
     def on_open(self):
-        print('Speech synthesizer is opened.')
-        # Ensure parent directory exists (in case not created earlier)
-        Path(self.out_path).parent.mkdir(parents=True, exist_ok=True)
-        if self.audio_format == "wav":
-            self.wav_file = wave_open(self.out_path, 'wb')
-            self.wav_file.setnchannels(self.channels)
-            self.wav_file.setsampwidth(self.sampwidth)
-            self.wav_file.setframerate(self.sample_rate)
-        else:
-            # For mp3 (and other compressed formats), write raw bytes
-            self._fh = open(self.out_path, 'wb')
+        logger.info('Speech synthesizer opened')
+        try:
+            # Ensure parent directory exists (in case not created earlier)
+            Path(self.out_path).parent.mkdir(parents=True, exist_ok=True)
+            if self.audio_format == "wav":
+                self.wav_file = wave_open(self.out_path, 'wb')
+                self.wav_file.setnchannels(self.channels)
+                self.wav_file.setsampwidth(self.sampwidth)
+                self.wav_file.setframerate(self.sample_rate)
+            else:
+                # For mp3 (and other compressed formats), write raw bytes
+                self._fh = open(self.out_path, 'wb')
+        except Exception as e:
+            self.had_error = True
+            self.error_message = f"open output failed: {e}"
+            logger.error("Failed to open output file %s: %s", self.out_path, e, exc_info=True)
 
     def on_complete(self):
-        print('Speech synthesizer is completed.')
+        logger.info('Speech synthesizer completed')
         if self.wav_file:
             self.wav_file.close()
             self.wav_file = None
@@ -114,19 +171,44 @@ class Callback(ResultCallback):
             self._fh = None
 
     def on_error(self, response: SpeechSynthesisResponse):
-        print('Speech synthesizer failed, response is %s' % (str(response)))
+        # Capture error and mark state for upstream handling
+        code, detail, status = _extract_dashscope_error(response)
+        self.had_error = True
+        self.error_message = detail
+        self.error_code = code
+        self.status_code = status
+        # Log with structured context
+        if code or status is not None:
+            logger.error('Speech synthesizer failed: code=%s status=%s msg=%s', code, status, detail)
+        else:
+            logger.error('Speech synthesizer failed: %s', detail)
+        # Ensure file handles are closed even on error
+        try:
+            self.on_complete()
+        except Exception:
+            pass
 
     def on_close(self):
-        print('Speech synthesizer is closed.')
+        logger.info('Speech synthesizer closed')
 
     def on_event(self, result: SpeechSynthesisResult):
         frame = result.get_audio_frame()
         if not frame:
             return
-        if self.wav_file:
-            self.wav_file.writeframes(frame)
-        elif self._fh:
-            self._fh.write(frame)
+        try:
+            if self.wav_file:
+                self.wav_file.writeframes(frame)
+            elif self._fh:
+                self._fh.write(frame)
+            else:
+                # No open handle; mark error to surface upstream
+                self.had_error = True
+                self.error_message = "audio handle not initialized"
+                logger.error("Audio handle not initialized when receiving frame")
+        except Exception as e:
+            self.had_error = True
+            self.error_message = f"write frame failed: {e}"
+            logger.error("Failed writing audio frame: %s", e, exc_info=True)
 
 
 def _extract_url_from_response(resp_json: dict) -> Optional[str]:
@@ -144,7 +226,6 @@ def _extract_url_from_response(resp_json: dict) -> Optional[str]:
         for k in path:
             if isinstance(cur, dict) and k in cur:
                 cur = cur[k]
-                print(cur)
             else:
                 ok = False
                 break
@@ -153,10 +234,62 @@ def _extract_url_from_response(resp_json: dict) -> Optional[str]:
     return None
 
 
+def _extract_dashscope_error(resp: object) -> tuple[Optional[str], str, Optional[int]]:
+    """Best-effort extraction of (code, message, http_status) from DashScope response.
+    Compatible with SpeechSynthesisResponse or dict-like payloads.
+    """
+    code: Optional[str] = None
+    msg: str = "speech synthesis failed"
+    status: Optional[int] = None
+
+    # If it looks like a dict
+    if isinstance(resp, dict):
+        code = str(resp.get("code")) if resp.get("code") is not None else None
+        status = resp.get("status_code") if isinstance(resp.get("status_code"), int) else None
+        msg = str(resp.get("message") or msg)
+        return code, msg, status
+
+    # Try attribute-style access
+    try:
+        status_attr = getattr(resp, "status_code", None)
+        if isinstance(status_attr, int):
+            status = status_attr
+    except Exception:
+        pass
+    try:
+        code_attr = getattr(resp, "code", None)
+        if code_attr is not None:
+            code = str(code_attr)
+    except Exception:
+        pass
+    try:
+        msg_attr = getattr(resp, "message", None)
+        if msg_attr:
+            msg = str(msg_attr)
+    except Exception:
+        pass
+
+    # As a last resort, try to parse JSON from str(resp)
+    try:
+        s = str(resp)
+        if s and s.strip().startswith("{"):
+            data = json.loads(s)
+            if isinstance(data, dict):
+                code = str(data.get("code")) if data.get("code") is not None else code
+                status = data.get("status_code") if isinstance(data.get("status_code"), int) else status
+                msg = str(data.get("message") or msg)
+    except Exception:
+        pass
+
+    return code, msg, status
+
+
 def _upload_file(upload_url: str, file_path: Path) -> str:
     if not upload_url:
+        logger.error("upload_url is empty")
         raise ValueError("upload_url is empty")
     if not Path(file_path).exists():
+        logger.error("audio file not found: %s", file_path)
         raise FileNotFoundError(str(file_path))
 
     filename = Path(file_path).name
@@ -177,7 +310,7 @@ def _upload_file(upload_url: str, file_path: Path) -> str:
         data = resp.json()
         url = _extract_url_from_response(data)
     except Exception:
-        pass
+        logger.warning("Upload response is not valid JSON")
 
     if not url:
         # As a last resort, if the response text looks like a URL, use it
@@ -186,6 +319,7 @@ def _upload_file(upload_url: str, file_path: Path) -> str:
             url = txt
 
     if not url:
-        raise RuntimeError("Upload succeeded but no URL found in response")
+        logger.error("upload succeeded but no URL found in response")
+        raise RuntimeError("upload succeeded but no URL found in response")
 
     return url

+ 7 - 3
app/schemas/speech.py

@@ -28,14 +28,18 @@ class ChatResponse(BaseModel):
     model: Optional[str] = None
     usage: Optional[Usage] = None
 
-class TextToSpeechResponse(BaseModel):
+class DataPayload(BaseModel):
     audio_url: str
 
+class TextToSpeechResponse(BaseModel):
+    code: int
+    data: Optional[DataPayload] = None
+    msg: Optional[str] = None
+
 class TextToSpeechRequest(BaseModel):
     volume: int = 1
     pitch: float = 1
     rate: float = 1
     filename: str
     text: str
-    model: Optional[str] = None
-    format: Optional[str] = None
+    model: str