|
@@ -0,0 +1,120 @@
|
|
|
|
|
+import argparse
|
|
|
|
|
+import base64
|
|
|
|
|
+import json
|
|
|
|
|
+
|
|
|
|
|
+import pyaudio
|
|
|
|
|
+import requests
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def wav_to_base64(file_path):
    """Read the file at ``file_path`` and return its bytes as base64 text.

    The file is opened in binary mode and read in full; no validation of the
    contents is performed, so any file (not only WAV) is encoded as-is.

    Args:
        file_path: Path of the file to encode.

    Returns:
        The base64 encoding of the file's bytes, as a ``str``.
    """
    with open(file_path, "rb") as source:
        raw_bytes = source.read()
    # b64encode yields ASCII-only bytes; decode them so the result can be
    # embedded directly in a JSON payload.
    return base64.b64encode(raw_bytes).decode("utf-8")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def play_audio(audio_content, format, channels, rate):
    """Write ``audio_content`` to a PyAudio output stream and block until done.

    Args:
        audio_content: Audio bytes handed to the stream unchanged.
        format: PyAudio sample format constant (for example
            ``pyaudio.paInt16``). The name shadows the ``format`` builtin but
            is kept because callers may pass it by keyword.
        channels: Number of audio channels.
        rate: Sample rate passed to ``PyAudio.open``.

    The stream and the PyAudio instance are always released, even when
    opening the device or writing to it raises.
    """
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=format, channels=channels, rate=rate, output=True)
        try:
            stream.write(audio_content)
            stream.stop_stream()
        finally:
            # Close regardless of whether the write succeeded so the audio
            # device is not left held by a half-used stream.
            stream.close()
    finally:
        p.terminate()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def parse_bool_flag(value):
    """Convert a command-line string such as ``"true"`` or ``"False"`` to a bool.

    ``argparse`` with ``type=bool`` simply calls ``bool()`` on the raw string,
    so every non-empty value (including ``"False"``) becomes ``True``. This
    converter interprets the text instead.

    Args:
        value: The raw option value (or an actual ``bool``).

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognized boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in {"true", "t", "yes", "y", "1", "on"}:
        return True
    # The empty string is accepted as false because bool("") was False under
    # the previous ``type=bool`` behavior.
    if normalized in {"false", "f", "no", "n", "0", "off", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def build_arg_parser():
    """Build the command-line parser for the synthesis client.

    Returns:
        An ``argparse.ArgumentParser`` with the server URL, text, reference
        audio/text, sampling parameters and playback parameters defined.
    """
    parser = argparse.ArgumentParser(
        description="Send a WAV file and text to a server and receive synthesized audio."
    )

    parser.add_argument(
        "--url", "-u", type=str, required=True, help="URL of the server"
    )
    parser.add_argument(
        "--text", "-t", type=str, required=True, help="Text to be synthesized"
    )
    parser.add_argument(
        "--reference_audio", "-ra", type=str, required=True, help="Path to the WAV file"
    )
    parser.add_argument(
        "--reference_text",
        "-rt",
        type=str,
        required=True,
        help="Reference text for voice synthesis",
    )
    parser.add_argument(
        "--max_new_tokens", type=int, default=0, help="Maximum new tokens to generate"
    )
    parser.add_argument(
        "--chunk_length", type=int, default=30, help="Chunk length for synthesis"
    )
    parser.add_argument(
        "--top_p", type=float, default=0.7, help="Top-p sampling for synthesis"
    )
    parser.add_argument(
        "--repetition_penalty",
        type=float,
        default=1.5,
        help="Repetition penalty for synthesis",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.7, help="Temperature for sampling"
    )
    parser.add_argument(
        "--speaker", type=str, default=None, help="Speaker ID for voice synthesis"
    )
    parser.add_argument("--format", type=str, default="wav", help="Audio format")
    # nargs="?" + const=True lets a bare ``--streaming`` enable streaming while
    # ``--streaming True`` / ``--streaming False`` keep working and now mean
    # what they say.
    parser.add_argument(
        "--streaming",
        type=parse_bool_flag,
        nargs="?",
        const=True,
        default=False,
        help="Enable streaming response",
    )
    parser.add_argument(
        "--channels", type=int, default=1, help="Number of audio channels"
    )
    parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio")
    return parser


def main(argv=None):
    """Send the synthesis request and play (and, if not streaming, save) the audio.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    On HTTP 200 the audio is either played chunk by chunk (streaming) or
    written to ``generated_audio.wav`` and then played. On any other status
    the status code and the response body are printed.
    """
    args = build_arg_parser().parse_args(argv)

    base64_audio = wav_to_base64(args.reference_audio)

    data = {
        "text": args.text,
        "reference_text": args.reference_text,
        "reference_audio": base64_audio,
        "max_new_tokens": args.max_new_tokens,
        "chunk_length": args.chunk_length,
        "top_p": args.top_p,
        "repetition_penalty": args.repetition_penalty,
        "temperature": args.temperature,
        "speaker": args.speaker,
        "format": args.format,
        "streaming": args.streaming,
    }

    response = requests.post(args.url, json=data, stream=args.streaming)

    audio_format = pyaudio.paInt16  # Assuming 16-bit PCM format

    try:
        if response.status_code == 200:
            if args.streaming:
                p = pyaudio.PyAudio()
                try:
                    stream = p.open(
                        format=audio_format,
                        channels=args.channels,
                        rate=args.rate,
                        output=True,
                    )
                    try:
                        for chunk in response.iter_content(chunk_size=1024):
                            if chunk:
                                stream.write(chunk)
                        stream.stop_stream()
                    finally:
                        # Release the device even if the connection drops
                        # mid-stream.
                        stream.close()
                finally:
                    p.terminate()
            else:
                audio_content = response.content

                with open("generated_audio.wav", "wb") as audio_file:
                    audio_file.write(audio_content)

                # NOTE(review): the response bytes are passed to the output
                # stream unchanged; if the server returns a WAV container its
                # header is played too -- confirm whether it should be stripped.
                play_audio(audio_content, audio_format, args.channels, args.rate)
                print("Audio has been saved to 'generated_audio.wav'.")
        else:
            print(f"Request failed with status code {response.status_code}")
            try:
                print(response.json())
            except ValueError:
                # Error bodies are not guaranteed to be JSON (e.g. an HTML
                # page from a proxy); show the raw text instead of crashing.
                print(response.text)
    finally:
        # With stream=True the connection stays open until explicitly closed.
        response.close()


if __name__ == "__main__":
    main()
|