speech_provider.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. from wave import open as wave_open
  2. from pathlib import Path
  3. import re
  4. import os
  5. import json
  6. from typing import Optional
  7. import dashscope
  8. from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
  9. from dashscope.audio.tts import ResultCallback, SpeechSynthesizer, SpeechSynthesisResult
  10. import requests
  11. from ..schemas.speech import TextToSpeechResponse, DataPayload
  12. from ..core.config import get_settings
# Load the application settings once, when this module is imported.
settings = get_settings()
# Configure DashScope API key from env/.env
# Falls back to an empty string when the setting is unset or otherwise falsy.
dashscope.api_key = settings.dashscope_api_key or ""
# Upload endpoint that synthesized audio is POSTed to (see _upload_file).
UPLOAD_PATH = settings.upload_path
  17. def _safe_filename(name: str) -> str:
  18. # Keep alphanum, dash, underscore, Chinese; replace others with '_'
  19. return re.sub(r"[^\w\-\u4e00-\u9fff]+", "_", name).strip("_") or "output"
  20. class SpeechProvider:
  21. def text_to_speech(self, volume: int, pitch: float, rate: float, filename: str, text: str, *, model: Optional[str] = None, format: Optional[str] = None) -> TextToSpeechResponse:
  22. # Resolve output path under project-root/temp and ensure directory exists
  23. project_root = Path(__file__).resolve().parents[2] # repo root
  24. audio_dir = project_root / "temp"
  25. audio_dir.mkdir(parents=True, exist_ok=True)
  26. # determine desired output format (default mp3 for smaller size)
  27. audio_format = (format or 'mp3').lower()
  28. if audio_format not in {"wav", "mp3"}:
  29. audio_format = "mp3"
  30. # choose extension and sample rate
  31. ext = "wav" if audio_format == "wav" else "mp3"
  32. sample_rate = 48000 if audio_format == "wav" else 24000
  33. filename = f"{_safe_filename(filename)}.{ext}"
  34. out_path = audio_dir / filename
  35. # Prepare callback with audio params
  36. callback = Callback(
  37. out_path=str(out_path),
  38. sample_rate=sample_rate,
  39. channels=1,
  40. sampwidth=2,
  41. audio_format=audio_format,
  42. )
  43. SpeechSynthesizer.call(
  44. model=(model or 'sambert-zhifei-v1'),
  45. volume=volume,
  46. text=text,
  47. pitch=pitch,
  48. rate=rate,
  49. format=audio_format,
  50. sample_rate=sample_rate,
  51. callback=callback,
  52. word_timestamp_enabled=True,
  53. phoneme_timestamp_enabled=True,
  54. )
  55. # After synthesis completes, upload the file to OSS
  56. try:
  57. url = _upload_file(UPLOAD_PATH, out_path)
  58. # Upload succeeded; remove local audio file to save space
  59. try:
  60. Path(out_path).unlink(missing_ok=True)
  61. except Exception as del_err:
  62. print(f"[warn] Failed to delete local audio {out_path}: {del_err}")
  63. return TextToSpeechResponse(
  64. code=0,
  65. data=DataPayload(audio_url=url),
  66. msg='success'
  67. )
  68. except Exception as e:
  69. # If upload fails, fall back to local path to avoid breaking
  70. print(f"[warn] Upload failed: {e}")
  71. return TextToSpeechResponse(code=0, msg=e.message/'error')
  72. class Callback(ResultCallback):
  73. def __init__(self, out_path: str, sample_rate: int = 16000, channels: int = 1, sampwidth: int = 2, audio_format: str = "mp3"):
  74. self.out_path = out_path
  75. self.sample_rate = sample_rate
  76. self.channels = channels
  77. self.sampwidth = sampwidth
  78. self.wav_file = None
  79. self._fh = None
  80. self.audio_format = audio_format
  81. def on_open(self):
  82. print('Speech synthesizer is opened.')
  83. # Ensure parent directory exists (in case not created earlier)
  84. Path(self.out_path).parent.mkdir(parents=True, exist_ok=True)
  85. if self.audio_format == "wav":
  86. self.wav_file = wave_open(self.out_path, 'wb')
  87. self.wav_file.setnchannels(self.channels)
  88. self.wav_file.setsampwidth(self.sampwidth)
  89. self.wav_file.setframerate(self.sample_rate)
  90. else:
  91. # For mp3 (and other compressed formats), write raw bytes
  92. self._fh = open(self.out_path, 'wb')
  93. def on_complete(self):
  94. print('Speech synthesizer is completed.')
  95. if self.wav_file:
  96. self.wav_file.close()
  97. self.wav_file = None
  98. if self._fh:
  99. self._fh.close()
  100. self._fh = None
  101. def on_error(self, response: SpeechSynthesisResponse):
  102. print('Speech synthesizer failed, response is %s' % (str(response)))
  103. def on_close(self):
  104. print('Speech synthesizer is closed.')
  105. def on_event(self, result: SpeechSynthesisResult):
  106. frame = result.get_audio_frame()
  107. if not frame:
  108. return
  109. if self.wav_file:
  110. self.wav_file.writeframes(frame)
  111. elif self._fh:
  112. self._fh.write(frame)
  113. def _extract_url_from_response(resp_json: dict) -> Optional[str]:
  114. # Try common shapes: {data: {url}}, {url}, {data: "http..."}
  115. try_keys = [
  116. ("data", "fileUrl"),
  117. ("data",),
  118. ("fileUrl",),
  119. ("result", "fileUrl"),
  120. ("payload", "fileUrl"),
  121. ]
  122. for path in try_keys:
  123. cur = resp_json
  124. ok = True
  125. for k in path:
  126. if isinstance(cur, dict) and k in cur:
  127. cur = cur[k]
  128. print(cur)
  129. else:
  130. ok = False
  131. break
  132. if ok and isinstance(cur, str) and cur.startswith("http"):
  133. return cur
  134. return None
  135. def _upload_file(upload_url: str, file_path: Path) -> str:
  136. if not upload_url:
  137. raise ValueError("upload_url is empty")
  138. if not Path(file_path).exists():
  139. raise FileNotFoundError(str(file_path))
  140. filename = Path(file_path).name
  141. # Guess content type
  142. content_type = "audio/mpeg" if filename.lower().endswith(".mp3") else "audio/wav"
  143. with open(file_path, "rb") as f:
  144. files = {
  145. "file": (filename, f, content_type),
  146. "fileType": (None, "VOICE")
  147. }
  148. resp = requests.post(upload_url, files=files, timeout=30)
  149. resp.raise_for_status()
  150. # Try to parse JSON for a URL; fallback to raw text if JSON invalid
  151. url: Optional[str] = None
  152. try:
  153. data = resp.json()
  154. url = _extract_url_from_response(data)
  155. except Exception:
  156. pass
  157. if not url:
  158. # As a last resort, if the response text looks like a URL, use it
  159. txt = (resp.text or "").strip()
  160. if txt.startswith("http"):
  161. url = txt
  162. if not url:
  163. raise RuntimeError("Upload succeeded but no URL found in response")
  164. return url