# speech_provider.py (12 KB)
  1. from wave import open as wave_open
  2. from pathlib import Path
  3. import re
  4. import os
  5. import json
  6. from typing import Optional
  7. import dashscope
  8. from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
  9. from dashscope.audio.tts import ResultCallback, SpeechSynthesizer, SpeechSynthesisResult
  10. import requests
  11. from ..schemas.speech import TextToSpeechResponse, DataPayload
  12. from ..core.config import get_settings
  13. from ..core.logger import get_logger
# Application settings, loaded once when this module is imported.
settings = get_settings()
# Configure the DashScope SDK's API key from settings; a missing value becomes ""
# so later code can simply test it for truthiness.
dashscope.api_key = settings.dashscope_api_key or ""
# Value handed to _upload_file() as the upload URL ("" when not configured).
UPLOAD_PATH = settings.upload_path or ""
# Module-level logger used by every function and class in this file.
logger = get_logger("speech_provider")
  20. def _safe_filename(name: str) -> str:
  21. # Keep alphanum, dash, underscore, Chinese; replace others with '_'
  22. return re.sub(r"[^\w\-\u4e00-\u9fff]+", "_", name).strip("_") or "output"
  23. class SpeechProvider:
  24. def text_to_speech(self, volume: int, pitch: float, rate: float, filename: str, text: str, *, model: Optional[str] = None, format: Optional[str] = None) -> TextToSpeechResponse:
  25. # Resolve output path under project-root/temp and ensure directory exists
  26. project_root = Path(__file__).resolve().parents[2] # repo root
  27. audio_dir = project_root / "temp"
  28. try:
  29. audio_dir.mkdir(parents=True, exist_ok=True)
  30. except Exception as e:
  31. logger.error("Failed to create audio directory %s: %s", audio_dir, e, exc_info=True)
  32. return TextToSpeechResponse(code=1, data=None, msg=f"create audio dir failed: {e}")
  33. # Basic input validation
  34. if not isinstance(text, str) or not text.strip():
  35. msg = "text is required"
  36. logger.error(msg)
  37. return TextToSpeechResponse(code=1, data=None, msg=msg)
  38. if not isinstance(filename, str) or not filename.strip():
  39. msg = "filename is required"
  40. logger.error(msg)
  41. return TextToSpeechResponse(code=1, data=None, msg=msg)
  42. if not dashscope.api_key:
  43. msg = "DASHSCOPE_API_KEY is missing"
  44. logger.error(msg)
  45. return TextToSpeechResponse(code=1, data=None, msg=msg)
  46. # determine desired output format (default mp3 for smaller size)
  47. audio_format = (format or 'mp3').lower()
  48. if audio_format not in {"wav", "mp3"}:
  49. logger.info("unsupported format '%s', fallback to mp3", audio_format)
  50. audio_format = "mp3"
  51. # choose extension and sample rate
  52. ext = "wav" if audio_format == "wav" else "mp3"
  53. sample_rate = 48000 if audio_format == "wav" else 24000
  54. filename = f"{_safe_filename(filename)}.{ext}"
  55. out_path = audio_dir / filename
  56. # Prepare callback with audio params
  57. callback = Callback(
  58. out_path=str(out_path),
  59. sample_rate=sample_rate,
  60. channels=1,
  61. sampwidth=2,
  62. audio_format=audio_format,
  63. )
  64. # Run TTS with robust error handling
  65. try:
  66. SpeechSynthesizer.call(
  67. model=(model or 'sambert-zhifei-v1'),
  68. volume=volume,
  69. text=text,
  70. pitch=pitch,
  71. rate=rate,
  72. format=audio_format,
  73. sample_rate=sample_rate,
  74. callback=callback,
  75. word_timestamp_enabled=True,
  76. phoneme_timestamp_enabled=True,
  77. )
  78. except Exception as e:
  79. logger.error("TTS call failed", exc_info=True)
  80. # Ensure any open file handles are closed
  81. try:
  82. callback.on_complete()
  83. except Exception:
  84. pass
  85. return TextToSpeechResponse(code=1, data=None, msg=str(e))
  86. if callback.had_error:
  87. # TTS reported an error via callback
  88. base_msg = callback.error_message or "speech synthesis failed"
  89. # Enrich message with model error code/status when available
  90. if callback.error_code or callback.status_code is not None:
  91. msg = f"[{callback.error_code or 'Error'}] {base_msg} (status={callback.status_code})"
  92. else:
  93. msg = base_msg
  94. logger.error("TTS callback error: %s", msg)
  95. return TextToSpeechResponse(code=1, data=None, msg=msg)
  96. # After synthesis completes, upload the file to OSS
  97. try:
  98. url = _upload_file(UPLOAD_PATH, out_path)
  99. # Upload succeeded; remove local audio file to save space
  100. try:
  101. Path(out_path).unlink(missing_ok=True)
  102. except Exception as del_err:
  103. logger.warning("Failed to delete local audio %s: %s", out_path, del_err)
  104. return TextToSpeechResponse(
  105. code=0,
  106. data=DataPayload(audio_url=url),
  107. msg='success'
  108. )
  109. except Exception as e:
  110. # Keep local file for inspection; report error message
  111. logger.error("Upload failed", exc_info=True)
  112. return TextToSpeechResponse(code=1, data=None, msg=str(e))
  113. class Callback(ResultCallback):
  114. def __init__(self, out_path: str, sample_rate: int = 16000, channels: int = 1, sampwidth: int = 2, audio_format: str = "mp3"):
  115. self.out_path = out_path
  116. self.sample_rate = sample_rate
  117. self.channels = channels
  118. self.sampwidth = sampwidth
  119. self.wav_file = None
  120. self._fh = None
  121. self.audio_format = audio_format
  122. self.had_error = False
  123. self.error_message: Optional[str] = None
  124. self.error_code: Optional[str] = None
  125. self.status_code: Optional[int] = None
  126. def on_open(self):
  127. logger.info('Speech synthesizer opened')
  128. try:
  129. # Ensure parent directory exists (in case not created earlier)
  130. Path(self.out_path).parent.mkdir(parents=True, exist_ok=True)
  131. if self.audio_format == "wav":
  132. self.wav_file = wave_open(self.out_path, 'wb')
  133. self.wav_file.setnchannels(self.channels)
  134. self.wav_file.setsampwidth(self.sampwidth)
  135. self.wav_file.setframerate(self.sample_rate)
  136. else:
  137. # For mp3 (and other compressed formats), write raw bytes
  138. self._fh = open(self.out_path, 'wb')
  139. except Exception as e:
  140. self.had_error = True
  141. self.error_message = f"open output failed: {e}"
  142. logger.error("Failed to open output file %s: %s", self.out_path, e, exc_info=True)
  143. def on_complete(self):
  144. logger.info('Speech synthesizer completed')
  145. if self.wav_file:
  146. self.wav_file.close()
  147. self.wav_file = None
  148. if self._fh:
  149. self._fh.close()
  150. self._fh = None
  151. def on_error(self, response: SpeechSynthesisResponse):
  152. # Capture error and mark state for upstream handling
  153. code, detail, status = _extract_dashscope_error(response)
  154. self.had_error = True
  155. self.error_message = detail
  156. self.error_code = code
  157. self.status_code = status
  158. # Log with structured context
  159. if code or status is not None:
  160. logger.error('Speech synthesizer failed: code=%s status=%s msg=%s', code, status, detail)
  161. else:
  162. logger.error('Speech synthesizer failed: %s', detail)
  163. # Ensure file handles are closed even on error
  164. try:
  165. self.on_complete()
  166. except Exception:
  167. pass
  168. def on_close(self):
  169. logger.info('Speech synthesizer closed')
  170. def on_event(self, result: SpeechSynthesisResult):
  171. frame = result.get_audio_frame()
  172. if not frame:
  173. return
  174. try:
  175. if self.wav_file:
  176. self.wav_file.writeframes(frame)
  177. elif self._fh:
  178. self._fh.write(frame)
  179. else:
  180. # No open handle; mark error to surface upstream
  181. self.had_error = True
  182. self.error_message = "audio handle not initialized"
  183. logger.error("Audio handle not initialized when receiving frame")
  184. except Exception as e:
  185. self.had_error = True
  186. self.error_message = f"write frame failed: {e}"
  187. logger.error("Failed writing audio frame: %s", e, exc_info=True)
  188. def _extract_url_from_response(resp_json: dict) -> Optional[str]:
  189. # Try common shapes: {data: {url}}, {url}, {data: "http..."}
  190. try_keys = [
  191. ("data", "fileUrl"),
  192. ("data",),
  193. ("fileUrl",),
  194. ("result", "fileUrl"),
  195. ("payload", "fileUrl"),
  196. ]
  197. for path in try_keys:
  198. cur = resp_json
  199. ok = True
  200. for k in path:
  201. if isinstance(cur, dict) and k in cur:
  202. cur = cur[k]
  203. else:
  204. ok = False
  205. break
  206. if ok and isinstance(cur, str) and cur.startswith("http"):
  207. return cur
  208. return None
  209. def _extract_dashscope_error(resp: object) -> tuple[Optional[str], str, Optional[int]]:
  210. """Best-effort extraction of (code, message, http_status) from DashScope response.
  211. Compatible with SpeechSynthesisResponse or dict-like payloads.
  212. """
  213. code: Optional[str] = None
  214. msg: str = "speech synthesis failed"
  215. status: Optional[int] = None
  216. # If it looks like a dict
  217. if isinstance(resp, dict):
  218. code = str(resp.get("code")) if resp.get("code") is not None else None
  219. status = resp.get("status_code") if isinstance(resp.get("status_code"), int) else None
  220. msg = str(resp.get("message") or msg)
  221. return code, msg, status
  222. # Try attribute-style access
  223. try:
  224. status_attr = getattr(resp, "status_code", None)
  225. if isinstance(status_attr, int):
  226. status = status_attr
  227. except Exception:
  228. pass
  229. try:
  230. code_attr = getattr(resp, "code", None)
  231. if code_attr is not None:
  232. code = str(code_attr)
  233. except Exception:
  234. pass
  235. try:
  236. msg_attr = getattr(resp, "message", None)
  237. if msg_attr:
  238. msg = str(msg_attr)
  239. except Exception:
  240. pass
  241. # As a last resort, try to parse JSON from str(resp)
  242. try:
  243. s = str(resp)
  244. if s and s.strip().startswith("{"):
  245. data = json.loads(s)
  246. if isinstance(data, dict):
  247. code = str(data.get("code")) if data.get("code") is not None else code
  248. status = data.get("status_code") if isinstance(data.get("status_code"), int) else status
  249. msg = str(data.get("message") or msg)
  250. except Exception:
  251. pass
  252. return code, msg, status
  253. def _upload_file(upload_url: str, file_path: Path) -> str:
  254. if not upload_url:
  255. logger.error("upload_url is empty")
  256. raise ValueError("upload_url is empty")
  257. if not Path(file_path).exists():
  258. logger.error("audio file not found: %s", file_path)
  259. raise FileNotFoundError(str(file_path))
  260. filename = Path(file_path).name
  261. # Guess content type
  262. content_type = "audio/mpeg" if filename.lower().endswith(".mp3") else "audio/wav"
  263. with open(file_path, "rb") as f:
  264. files = {
  265. "file": (filename, f, content_type),
  266. "fileType": (None, "VOICE")
  267. }
  268. resp = requests.post(upload_url, files=files, timeout=30)
  269. resp.raise_for_status()
  270. # Try to parse JSON for a URL; fallback to raw text if JSON invalid
  271. url: Optional[str] = None
  272. try:
  273. data = resp.json()
  274. url = _extract_url_from_response(data)
  275. except Exception:
  276. logger.warning("Upload response is not valid JSON")
  277. if not url:
  278. # As a last resort, if the response text looks like a URL, use it
  279. txt = (resp.text or "").strip()
  280. if txt.startswith("http"):
  281. url = txt
  282. if not url:
  283. logger.error("upload succeeded but no URL found in response")
  284. raise RuntimeError("upload succeeded but no URL found in response")
  285. return url