|
@@ -1,11 +1,13 @@
|
|
|
package minimax
|
|
package minimax
|
|
|
|
|
|
|
|
import (
|
|
import (
|
|
|
- "encoding/base64"
|
|
|
|
|
|
|
+ "encoding/hex"
|
|
|
"encoding/json"
|
|
"encoding/json"
|
|
|
"errors"
|
|
"errors"
|
|
|
|
|
+ "fmt"
|
|
|
"io"
|
|
"io"
|
|
|
"net/http"
|
|
"net/http"
|
|
|
|
|
+ "strings"
|
|
|
|
|
|
|
|
"github.com/QuantumNous/new-api/dto"
|
|
"github.com/QuantumNous/new-api/dto"
|
|
|
relaycommon "github.com/QuantumNous/new-api/relay/common"
|
|
relaycommon "github.com/QuantumNous/new-api/relay/common"
|
|
@@ -14,96 +16,78 @@ import (
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
type MiniMaxTTSRequest struct {
|
|
type MiniMaxTTSRequest struct {
|
|
|
- Model string `json:"model"`
|
|
|
|
|
- Text string `json:"text"`
|
|
|
|
|
- VoiceID string `json:"voice_id"`
|
|
|
|
|
- Speed float64 `json:"speed,omitempty"`
|
|
|
|
|
- Vol float64 `json:"vol,omitempty"`
|
|
|
|
|
- Pitch int `json:"pitch,omitempty"`
|
|
|
|
|
- AudioSampleRate int `json:"audio_sample_rate,omitempty"`
|
|
|
|
|
- OutputFormat string `json:"output_format,omitempty"`
|
|
|
|
|
|
|
+ Model string `json:"model"`
|
|
|
|
|
+ Text string `json:"text"`
|
|
|
|
|
+ Stream bool `json:"stream,omitempty"`
|
|
|
|
|
+ StreamOptions *StreamOptions `json:"stream_options,omitempty"`
|
|
|
|
|
+ VoiceSetting VoiceSetting `json:"voice_setting"`
|
|
|
|
|
+ PronunciationDict *PronunciationDict `json:"pronunciation_dict,omitempty"`
|
|
|
|
|
+ AudioSetting *AudioSetting `json:"audio_setting,omitempty"`
|
|
|
|
|
+ TimbreWeights []TimbreWeight `json:"timbre_weights,omitempty"`
|
|
|
|
|
+ LanguageBoost string `json:"language_boost,omitempty"`
|
|
|
|
|
+ VoiceModify *VoiceModify `json:"voice_modify,omitempty"`
|
|
|
|
|
+ SubtitleEnable bool `json:"subtitle_enable,omitempty"`
|
|
|
|
|
+ OutputFormat string `json:"output_format,omitempty"`
|
|
|
|
|
+ AigcWatermark bool `json:"aigc_watermark,omitempty"`
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-type MiniMaxTTSResponse struct {
|
|
|
|
|
- Created int `json:"created"`
|
|
|
|
|
- Data []MiniMaxTTSData `json:"data"`
|
|
|
|
|
- ID string `json:"id"`
|
|
|
|
|
- Model string `json:"model"`
|
|
|
|
|
- Object string `json:"object"`
|
|
|
|
|
- Usage MiniMaxTTSUsage `json:"usage"`
|
|
|
|
|
|
|
+type StreamOptions struct {
|
|
|
|
|
+ ExcludeAggregatedAudio bool `json:"exclude_aggregated_audio,omitempty"`
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-type MiniMaxTTSData struct {
|
|
|
|
|
- Index int `json:"index"`
|
|
|
|
|
- Audio string `json:"audio"`
|
|
|
|
|
- Text string `json:"text"`
|
|
|
|
|
- FinishReason string `json:"finish_reason"`
|
|
|
|
|
|
|
+type VoiceSetting struct {
|
|
|
|
|
+ VoiceID string `json:"voice_id"`
|
|
|
|
|
+ Speed float64 `json:"speed,omitempty"`
|
|
|
|
|
+ Vol float64 `json:"vol,omitempty"`
|
|
|
|
|
+ Pitch int `json:"pitch,omitempty"`
|
|
|
|
|
+ Emotion string `json:"emotion,omitempty"`
|
|
|
|
|
+ TextNormalization bool `json:"text_normalization,omitempty"`
|
|
|
|
|
+ LatexRead bool `json:"latex_read,omitempty"`
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-type MiniMaxTTSUsage struct {
|
|
|
|
|
- TotalTokens int `json:"total_tokens"`
|
|
|
|
|
|
|
+type PronunciationDict struct {
|
|
|
|
|
+ Tone []string `json:"tone,omitempty"`
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-type MiniMaxTTSErrorResponse struct {
|
|
|
|
|
- Error MiniMaxTTSError `json:"error"`
|
|
|
|
|
|
|
+type AudioSetting struct {
|
|
|
|
|
+ SampleRate int `json:"sample_rate,omitempty"`
|
|
|
|
|
+ Bitrate int `json:"bitrate,omitempty"`
|
|
|
|
|
+ Format string `json:"format,omitempty"`
|
|
|
|
|
+ Channel int `json:"channel,omitempty"`
|
|
|
|
|
+ ForceCbr bool `json:"force_cbr,omitempty"`
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-type MiniMaxTTSError struct {
|
|
|
|
|
- Code string `json:"code"`
|
|
|
|
|
- Message string `json:"message"`
|
|
|
|
|
- Type string `json:"type"`
|
|
|
|
|
|
|
+type TimbreWeight struct {
|
|
|
|
|
+ VoiceID string `json:"voice_id"`
|
|
|
|
|
+ Weight int `json:"weight"`
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-// OpenAI voice to MiniMax voice_id mapping
|
|
|
|
|
-var openAIToMiniMaxVoiceMap = map[string]string{
|
|
|
|
|
- "alloy": "male-qn-qingse",
|
|
|
|
|
- "echo": "male-qn-jingying",
|
|
|
|
|
- "fable": "female-shaonv",
|
|
|
|
|
- "onyx": "male-qn-badao",
|
|
|
|
|
- "nova": "female-shaonv-jingpin",
|
|
|
|
|
- "shimmer": "female-yujie",
|
|
|
|
|
- // Add some standard MiniMax voice IDs
|
|
|
|
|
- "voice-1": "male-qn-qingse",
|
|
|
|
|
- "voice-2": "female-shaonv",
|
|
|
|
|
|
|
+type VoiceModify struct {
|
|
|
|
|
+ Pitch int `json:"pitch,omitempty"`
|
|
|
|
|
+ Intensity int `json:"intensity,omitempty"`
|
|
|
|
|
+ Timbre int `json:"timbre,omitempty"`
|
|
|
|
|
+ SoundEffects string `json:"sound_effects,omitempty"`
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-// OpenAI response format to MiniMax output format mapping
|
|
|
|
|
-var responseFormatToOutputFormatMap = map[string]string{
|
|
|
|
|
- "mp3": "mp3",
|
|
|
|
|
- "opus": "mp3",
|
|
|
|
|
- "aac": "aac",
|
|
|
|
|
- "flac": "flac",
|
|
|
|
|
- "wav": "wav",
|
|
|
|
|
- "pcm": "pcm",
|
|
|
|
|
|
|
+type MiniMaxTTSResponse struct {
|
|
|
|
|
+ Data MiniMaxTTSData `json:"data"`
|
|
|
|
|
+ ExtraInfo MiniMaxExtraInfo `json:"extra_info"`
|
|
|
|
|
+ TraceID string `json:"trace_id"`
|
|
|
|
|
+ BaseResp MiniMaxBaseResp `json:"base_resp"`
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-// TTS model mapping - MiniMax uses speech-01 or speech-01-turbo
|
|
|
|
|
-var modelToTTSModelMap = map[string]string{
|
|
|
|
|
- "speech-01": "speech-01",
|
|
|
|
|
- "speech-01-turbo": "speech-01-turbo",
|
|
|
|
|
- "tts-1": "speech-01-turbo",
|
|
|
|
|
- "tts-1-hd": "speech-01",
|
|
|
|
|
|
|
+type MiniMaxTTSData struct {
|
|
|
|
|
+ Audio string `json:"audio"`
|
|
|
|
|
+ Status int `json:"status"`
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-func mapVoiceType(openAIVoice string) string {
|
|
|
|
|
- if voice, ok := openAIToMiniMaxVoiceMap[openAIVoice]; ok {
|
|
|
|
|
- return voice
|
|
|
|
|
- }
|
|
|
|
|
- return "female-shaonv" // default voice
|
|
|
|
|
|
|
+type MiniMaxExtraInfo struct {
|
|
|
|
|
+ UsageCharacters int64 `json:"usage_characters"`
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-func mapOutputFormat(responseFormat string) string {
|
|
|
|
|
- if format, ok := responseFormatToOutputFormatMap[responseFormat]; ok {
|
|
|
|
|
- return format
|
|
|
|
|
- }
|
|
|
|
|
- return "mp3" // default format
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-func getTTSModel(modelName string) string {
|
|
|
|
|
- if ttsModel, ok := modelToTTSModelMap[modelName]; ok {
|
|
|
|
|
- return ttsModel
|
|
|
|
|
- }
|
|
|
|
|
- return "speech-01-turbo" // default model
|
|
|
|
|
|
|
+type MiniMaxBaseResp struct {
|
|
|
|
|
+ StatusCode int64 `json:"status_code"`
|
|
|
|
|
+ StatusMsg string `json:"status_msg"`
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
func getContentTypeByFormat(format string) string {
|
|
func getContentTypeByFormat(format string) string {
|
|
@@ -124,66 +108,64 @@ func handleTTSResponse(c *gin.Context, resp *http.Response, info *relaycommon.Re
|
|
|
body, readErr := io.ReadAll(resp.Body)
|
|
body, readErr := io.ReadAll(resp.Body)
|
|
|
if readErr != nil {
|
|
if readErr != nil {
|
|
|
return nil, types.NewErrorWithStatusCode(
|
|
return nil, types.NewErrorWithStatusCode(
|
|
|
- errors.New("failed to read minimax response"),
|
|
|
|
|
|
|
+ fmt.Errorf("failed to read minimax response: %w", readErr),
|
|
|
types.ErrorCodeReadResponseBodyFailed,
|
|
types.ErrorCodeReadResponseBodyFailed,
|
|
|
http.StatusInternalServerError,
|
|
http.StatusInternalServerError,
|
|
|
)
|
|
)
|
|
|
}
|
|
}
|
|
|
defer resp.Body.Close()
|
|
defer resp.Body.Close()
|
|
|
|
|
|
|
|
- // First try to parse as error response
|
|
|
|
|
- var errorResp MiniMaxTTSErrorResponse
|
|
|
|
|
- if unmarshalErr := json.Unmarshal(body, &errorResp); unmarshalErr == nil && errorResp.Error.Code != "" {
|
|
|
|
|
- return nil, types.NewErrorWithStatusCode(
|
|
|
|
|
- errors.New(errorResp.Error.Message),
|
|
|
|
|
- types.ErrorCodeBadResponse,
|
|
|
|
|
- http.StatusBadRequest,
|
|
|
|
|
- )
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // Parse as successful response
|
|
|
|
|
|
|
+ // Parse response
|
|
|
var minimaxResp MiniMaxTTSResponse
|
|
var minimaxResp MiniMaxTTSResponse
|
|
|
if unmarshalErr := json.Unmarshal(body, &minimaxResp); unmarshalErr != nil {
|
|
if unmarshalErr := json.Unmarshal(body, &minimaxResp); unmarshalErr != nil {
|
|
|
return nil, types.NewErrorWithStatusCode(
|
|
return nil, types.NewErrorWithStatusCode(
|
|
|
- errors.New("failed to parse minimax response"),
|
|
|
|
|
|
|
+ fmt.Errorf("failed to unmarshal minimax TTS response: %w", unmarshalErr),
|
|
|
types.ErrorCodeBadResponseBody,
|
|
types.ErrorCodeBadResponseBody,
|
|
|
http.StatusInternalServerError,
|
|
http.StatusInternalServerError,
|
|
|
)
|
|
)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // Check if we have audio data
|
|
|
|
|
- if len(minimaxResp.Data) == 0 || minimaxResp.Data[0].Audio == "" {
|
|
|
|
|
|
|
+ // Check base_resp status code
|
|
|
|
|
+ if minimaxResp.BaseResp.StatusCode != 0 {
|
|
|
return nil, types.NewErrorWithStatusCode(
|
|
return nil, types.NewErrorWithStatusCode(
|
|
|
- errors.New("no audio data in response"),
|
|
|
|
|
|
|
+ fmt.Errorf("minimax TTS error: %d - %s", minimaxResp.BaseResp.StatusCode, minimaxResp.BaseResp.StatusMsg),
|
|
|
types.ErrorCodeBadResponse,
|
|
types.ErrorCodeBadResponse,
|
|
|
http.StatusBadRequest,
|
|
http.StatusBadRequest,
|
|
|
)
|
|
)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // Decode base64 audio data
|
|
|
|
|
- audioData, decodeErr := base64.StdEncoding.DecodeString(minimaxResp.Data[0].Audio)
|
|
|
|
|
- if decodeErr != nil {
|
|
|
|
|
|
|
+ // Check if we have audio data
|
|
|
|
|
+ if minimaxResp.Data.Audio == "" {
|
|
|
return nil, types.NewErrorWithStatusCode(
|
|
return nil, types.NewErrorWithStatusCode(
|
|
|
- errors.New("failed to decode audio data"),
|
|
|
|
|
- types.ErrorCodeBadResponseBody,
|
|
|
|
|
- http.StatusInternalServerError,
|
|
|
|
|
|
|
+ fmt.Errorf("no audio data in minimax TTS response"),
|
|
|
|
|
+ types.ErrorCodeBadResponse,
|
|
|
|
|
+ http.StatusBadRequest,
|
|
|
)
|
|
)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // Get output format from context or default to mp3
|
|
|
|
|
- outputFormat := c.GetString("response_format")
|
|
|
|
|
- if outputFormat == "" {
|
|
|
|
|
- outputFormat = "mp3"
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ if strings.HasPrefix(minimaxResp.Data.Audio, "http") {
|
|
|
|
|
+ c.Redirect(http.StatusFound, minimaxResp.Data.Audio)
|
|
|
|
|
+ } else {
|
|
|
|
|
+ // Handle hex-encoded audio data
|
|
|
|
|
+ audioData, decodeErr := hex.DecodeString(minimaxResp.Data.Audio)
|
|
|
|
|
+ if decodeErr != nil {
|
|
|
|
|
+ return nil, types.NewErrorWithStatusCode(
|
|
|
|
|
+ fmt.Errorf("failed to decode hex audio data: %w", decodeErr),
|
|
|
|
|
+ types.ErrorCodeBadResponse,
|
|
|
|
|
+ http.StatusInternalServerError,
|
|
|
|
|
+ )
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // Determine content type - default to mp3
|
|
|
|
|
+ contentType := "audio/mpeg"
|
|
|
|
|
|
|
|
- contentType := getContentTypeByFormat(outputFormat)
|
|
|
|
|
- c.Header("Content-Type", contentType)
|
|
|
|
|
- c.Data(http.StatusOK, contentType, audioData)
|
|
|
|
|
|
|
+ c.Data(http.StatusOK, contentType, audioData)
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
usage = &dto.Usage{
|
|
usage = &dto.Usage{
|
|
|
PromptTokens: info.PromptTokens,
|
|
PromptTokens: info.PromptTokens,
|
|
|
CompletionTokens: 0,
|
|
CompletionTokens: 0,
|
|
|
- TotalTokens: minimaxResp.Usage.TotalTokens,
|
|
|
|
|
|
|
+ TotalTokens: int(minimaxResp.ExtraInfo.UsageCharacters),
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
return usage, nil
|
|
return usage, nil
|