7 месяцев назад · c7ab0f4f3d
--- a/relay/channel/minimax/adaptor.go
+++ b/relay/channel/minimax/adaptor.go
@@ -34,17 +34,20 @@ func (a *Adaptor) ConvertAudioRequest(c *gin.Context, info *relaycommon.RelayInf
 
				 		return nil, errors.New("unsupported audio relay mode")
			
 
				 	}
			
 
				 
			
 
				-	voiceID := mapVoiceType(request.Voice)
			
 
				+	voiceID := request.Voice
			
 
				 	speed := request.Speed
			
 
				-	outputFormat := mapOutputFormat(request.ResponseFormat)
			
 
				-
			
 
				-	c.Set("response_format", outputFormat)
			
 
				+	outputFormat := request.ResponseFormat
			
 
				 
			
 
				 	minimaxRequest := MiniMaxTTSRequest{
			
 
				-		Model:        getTTSModel(info.OriginModelName),
			
 
				-		Text:         request.Input,
			
 
				-		VoiceID:      voiceID,
			
 
				-		Speed:        speed,
			
 
				+		Model: info.OriginModelName,
			
 
				+		Text:  request.Input,
			
 
				+		VoiceSetting: VoiceSetting{
			
 
				+			VoiceID: voiceID,
			
 
				+			Speed:   speed,
			
 
				+		},
			
 
				+		AudioSetting: &AudioSetting{
			
 
				+			Format: outputFormat,
			
 
				+		},
			
 
				 		OutputFormat: outputFormat,
			
 
				 	}
			
 
				 
			
@@ -59,6 +62,11 @@ func (a *Adaptor) ConvertAudioRequest(c *gin.Context, info *relaycommon.RelayInf
 
				 	if err != nil {
			
 
				 		return nil, fmt.Errorf("error marshalling minimax request: %w", err)
			
 
				 	}
			
 
				+	if outputFormat != "hex" {
			
 
				+		outputFormat = "url"
			
 
				+	}
			
 
				+
			
 
				+	c.Set("response_format", outputFormat)
			
 
				 
			
 
				 	// Debug: log the request structure
			
 
				 	fmt.Printf("MiniMax TTS Request: %s\n", string(jsonData))
			
@@ -79,12 +87,6 @@ func (a *Adaptor) GetRequestURL(info *relaycommon.RelayInfo) (string, error) {
 
				 
			
 
				 func (a *Adaptor) SetupRequestHeader(c *gin.Context, req *http.Header, info *relaycommon.RelayInfo) error {
			
 
				 	channel.SetupApiRequestHeader(info, c, req)
			
 
				-
			
 
				-	if info.RelayMode == constant.RelayModeAudioSpeech {
			
 
				-		req.Set("Content-Type", "application/json")
			
 
				-		return nil
			
 
				-	}
			
 
				-
			
 
				 	req.Set("Authorization", "Bearer "+info.ApiKey)
			
 
				 	return nil
			
 
				 }
			
--- a/relay/channel/minimax/tts.go
+++ b/relay/channel/minimax/tts.go
@@ -1,11 +1,13 @@
 
				 package minimax
			
 
				 
			
 
				 import (
			
 
				-	"encoding/base64"
			
 
				+	"encoding/hex"
			
 
				 	"encoding/json"
			
 
				 	"errors"
			
 
				+	"fmt"
			
 
				 	"io"
			
 
				 	"net/http"
			
 
				+	"strings"
			
 
				 
			
 
				 	"github.com/QuantumNous/new-api/dto"
			
 
				 	relaycommon "github.com/QuantumNous/new-api/relay/common"
			
@@ -14,96 +16,78 @@ import (
 
				 )
			
 
				 
			
 
				 type MiniMaxTTSRequest struct {
			
 
				-	Model           string  `json:"model"`
			
 
				-	Text            string  `json:"text"`
			
 
				-	VoiceID         string  `json:"voice_id"`
			
 
				-	Speed           float64 `json:"speed,omitempty"`
			
 
				-	Vol             float64 `json:"vol,omitempty"`
			
 
				-	Pitch           int     `json:"pitch,omitempty"`
			
 
				-	AudioSampleRate int     `json:"audio_sample_rate,omitempty"`
			
 
				-	OutputFormat    string  `json:"output_format,omitempty"`
			
 
				+	Model             string             `json:"model"`
			
 
				+	Text              string             `json:"text"`
			
 
				+	Stream            bool               `json:"stream,omitempty"`
			
 
				+	StreamOptions     *StreamOptions     `json:"stream_options,omitempty"`
			
 
				+	VoiceSetting      VoiceSetting       `json:"voice_setting"`
			
 
				+	PronunciationDict *PronunciationDict `json:"pronunciation_dict,omitempty"`
			
 
				+	AudioSetting      *AudioSetting      `json:"audio_setting,omitempty"`
			
 
				+	TimbreWeights     []TimbreWeight     `json:"timbre_weights,omitempty"`
			
 
				+	LanguageBoost     string             `json:"language_boost,omitempty"`
			
 
				+	VoiceModify       *VoiceModify       `json:"voice_modify,omitempty"`
			
 
				+	SubtitleEnable    bool               `json:"subtitle_enable,omitempty"`
			
 
				+	OutputFormat      string             `json:"output_format,omitempty"`
			
 
				+	AigcWatermark     bool               `json:"aigc_watermark,omitempty"`
			
 
				 }
			
 
				 
			
 
				-type MiniMaxTTSResponse struct {
			
 
				-	Created int              `json:"created"`
			
 
				-	Data    []MiniMaxTTSData `json:"data"`
			
 
				-	ID      string           `json:"id"`
			
 
				-	Model   string           `json:"model"`
			
 
				-	Object  string           `json:"object"`
			
 
				-	Usage   MiniMaxTTSUsage  `json:"usage"`
			
 
				+type StreamOptions struct {
			
 
				+	ExcludeAggregatedAudio bool `json:"exclude_aggregated_audio,omitempty"`
			
 
				 }
			
 
				 
			
 
				-type MiniMaxTTSData struct {
			
 
				-	Index        int    `json:"index"`
			
 
				-	Audio        string `json:"audio"`
			
 
				-	Text         string `json:"text"`
			
 
				-	FinishReason string `json:"finish_reason"`
			
 
				+type VoiceSetting struct {
			
 
				+	VoiceID           string  `json:"voice_id"`
			
 
				+	Speed             float64 `json:"speed,omitempty"`
			
 
				+	Vol               float64 `json:"vol,omitempty"`
			
 
				+	Pitch             int     `json:"pitch,omitempty"`
			
 
				+	Emotion           string  `json:"emotion,omitempty"`
			
 
				+	TextNormalization bool    `json:"text_normalization,omitempty"`
			
 
				+	LatexRead         bool    `json:"latex_read,omitempty"`
			
 
				 }
			
 
				 
			
 
				-type MiniMaxTTSUsage struct {
			
 
				-	TotalTokens int `json:"total_tokens"`
			
 
				+type PronunciationDict struct {
			
 
				+	Tone []string `json:"tone,omitempty"`
			
 
				 }
			
 
				 
			
 
				-type MiniMaxTTSErrorResponse struct {
			
 
				-	Error MiniMaxTTSError `json:"error"`
			
 
				+type AudioSetting struct {
			
 
				+	SampleRate int    `json:"sample_rate,omitempty"`
			
 
				+	Bitrate    int    `json:"bitrate,omitempty"`
			
 
				+	Format     string `json:"format,omitempty"`
			
 
				+	Channel    int    `json:"channel,omitempty"`
			
 
				+	ForceCbr   bool   `json:"force_cbr,omitempty"`
			
 
				 }
			
 
				 
			
 
				-type MiniMaxTTSError struct {
			
 
				-	Code    string `json:"code"`
			
 
				-	Message string `json:"message"`
			
 
				-	Type    string `json:"type"`
			
 
				+type TimbreWeight struct {
			
 
				+	VoiceID string `json:"voice_id"`
			
 
				+	Weight  int    `json:"weight"`
			
 
				 }
			
 
				 
			
 
				-// OpenAI voice to MiniMax voice_id mapping
			
 
				-var openAIToMiniMaxVoiceMap = map[string]string{
			
 
				-	"alloy":   "male-qn-qingse",
			
 
				-	"echo":    "male-qn-jingying",
			
 
				-	"fable":   "female-shaonv",
			
 
				-	"onyx":    "male-qn-badao",
			
 
				-	"nova":    "female-shaonv-jingpin",
			
 
				-	"shimmer": "female-yujie",
			
 
				-	// Add some standard MiniMax voice IDs
			
 
				-	"voice-1": "male-qn-qingse",
			
 
				-	"voice-2": "female-shaonv",
			
 
				+type VoiceModify struct {
			
 
				+	Pitch        int    `json:"pitch,omitempty"`
			
 
				+	Intensity    int    `json:"intensity,omitempty"`
			
 
				+	Timbre       int    `json:"timbre,omitempty"`
			
 
				+	SoundEffects string `json:"sound_effects,omitempty"`
			
 
				 }
			
 
				 
			
 
				-// OpenAI response format to MiniMax output format mapping
			
 
				-var responseFormatToOutputFormatMap = map[string]string{
			
 
				-	"mp3":  "mp3",
			
 
				-	"opus": "mp3",
			
 
				-	"aac":  "aac",
			
 
				-	"flac": "flac",
			
 
				-	"wav":  "wav",
			
 
				-	"pcm":  "pcm",
			
 
				+type MiniMaxTTSResponse struct {
			
 
				+	Data      MiniMaxTTSData   `json:"data"`
			
 
				+	ExtraInfo MiniMaxExtraInfo `json:"extra_info"`
			
 
				+	TraceID   string           `json:"trace_id"`
			
 
				+	BaseResp  MiniMaxBaseResp  `json:"base_resp"`
			
 
				 }
			
 
				 
			
 
				-// TTS model mapping - MiniMax uses speech-01 or speech-01-turbo
			
 
				-var modelToTTSModelMap = map[string]string{
			
 
				-	"speech-01":       "speech-01",
			
 
				-	"speech-01-turbo": "speech-01-turbo",
			
 
				-	"tts-1":           "speech-01-turbo",
			
 
				-	"tts-1-hd":        "speech-01",
			
 
				+type MiniMaxTTSData struct {
			
 
				+	Audio  string `json:"audio"`
			
 
				+	Status int    `json:"status"`
			
 
				 }
			
 
				 
			
 
				-func mapVoiceType(openAIVoice string) string {
			
 
				-	if voice, ok := openAIToMiniMaxVoiceMap[openAIVoice]; ok {
			
 
				-		return voice
			
 
				-	}
			
 
				-	return "female-shaonv" // default voice
			
 
				+type MiniMaxExtraInfo struct {
			
 
				+	UsageCharacters int64 `json:"usage_characters"`
			
 
				 }
			
 
				 
			
 
				-func mapOutputFormat(responseFormat string) string {
			
 
				-	if format, ok := responseFormatToOutputFormatMap[responseFormat]; ok {
			
 
				-		return format
			
 
				-	}
			
 
				-	return "mp3" // default format
			
 
				-}
			
 
				-
			
 
				-func getTTSModel(modelName string) string {
			
 
				-	if ttsModel, ok := modelToTTSModelMap[modelName]; ok {
			
 
				-		return ttsModel
			
 
				-	}
			
 
				-	return "speech-01-turbo" // default model
			
 
				+type MiniMaxBaseResp struct {
			
 
				+	StatusCode int64  `json:"status_code"`
			
 
				+	StatusMsg  string `json:"status_msg"`
			
 
				 }
			
 
				 
			
 
				 func getContentTypeByFormat(format string) string {
			
@@ -124,66 +108,64 @@ func handleTTSResponse(c *gin.Context, resp *http.Response, info *relaycommon.Re
 
				 	body, readErr := io.ReadAll(resp.Body)
			
 
				 	if readErr != nil {
			
 
				 		return nil, types.NewErrorWithStatusCode(
			
 
				-			errors.New("failed to read minimax response"),
			
 
				+			fmt.Errorf("failed to read minimax response: %w", readErr),
			
 
				 			types.ErrorCodeReadResponseBodyFailed,
			
 
				 			http.StatusInternalServerError,
			
 
				 		)
			
 
				 	}
			
 
				 	defer resp.Body.Close()
			
 
				 
			
 
				-	// First try to parse as error response
			
 
				-	var errorResp MiniMaxTTSErrorResponse
			
 
				-	if unmarshalErr := json.Unmarshal(body, &errorResp); unmarshalErr == nil && errorResp.Error.Code != "" {
			
 
				-		return nil, types.NewErrorWithStatusCode(
			
 
				-			errors.New(errorResp.Error.Message),
			
 
				-			types.ErrorCodeBadResponse,
			
 
				-			http.StatusBadRequest,
			
 
				-		)
			
 
				-	}
			
 
				-
			
 
				-	// Parse as successful response
			
 
				+	// Parse response
			
 
				 	var minimaxResp MiniMaxTTSResponse
			
 
				 	if unmarshalErr := json.Unmarshal(body, &minimaxResp); unmarshalErr != nil {
			
 
				 		return nil, types.NewErrorWithStatusCode(
			
 
				-			errors.New("failed to parse minimax response"),
			
 
				+			fmt.Errorf("failed to unmarshal minimax TTS response: %w", unmarshalErr),
			
 
				 			types.ErrorCodeBadResponseBody,
			
 
				 			http.StatusInternalServerError,
			
 
				 		)
			
 
				 	}
			
 
				 
			
 
				-	// Check if we have audio data
			
 
				-	if len(minimaxResp.Data) == 0 || minimaxResp.Data[0].Audio == "" {
			
 
				+	// Check base_resp status code
			
 
				+	if minimaxResp.BaseResp.StatusCode != 0 {
			
 
				 		return nil, types.NewErrorWithStatusCode(
			
 
				-			errors.New("no audio data in response"),
			
 
				+			fmt.Errorf("minimax TTS error: %d - %s", minimaxResp.BaseResp.StatusCode, minimaxResp.BaseResp.StatusMsg),
			
 
				 			types.ErrorCodeBadResponse,
			
 
				 			http.StatusBadRequest,
			
 
				 		)
			
 
				 	}
			
 
				 
			
 
				-	// Decode base64 audio data
			
 
				-	audioData, decodeErr := base64.StdEncoding.DecodeString(minimaxResp.Data[0].Audio)
			
 
				-	if decodeErr != nil {
			
 
				+	// Check if we have audio data
			
 
				+	if minimaxResp.Data.Audio == "" {
			
 
				 		return nil, types.NewErrorWithStatusCode(
			
 
				-			errors.New("failed to decode audio data"),
			
 
				-			types.ErrorCodeBadResponseBody,
			
 
				-			http.StatusInternalServerError,
			
 
				+			fmt.Errorf("no audio data in minimax TTS response"),
			
 
				+			types.ErrorCodeBadResponse,
			
 
				+			http.StatusBadRequest,
			
 
				 		)
			
 
				 	}
			
 
				 
			
 
				-	// Get output format from context or default to mp3
			
 
				-	outputFormat := c.GetString("response_format")
			
 
				-	if outputFormat == "" {
			
 
				-		outputFormat = "mp3"
			
 
				-	}
			
 
				+	if strings.HasPrefix(minimaxResp.Data.Audio, "http") {
			
 
				+		c.Redirect(http.StatusFound, minimaxResp.Data.Audio)
			
 
				+	} else {
			
 
				+		// Handle hex-encoded audio data
			
 
				+		audioData, decodeErr := hex.DecodeString(minimaxResp.Data.Audio)
			
 
				+		if decodeErr != nil {
			
 
				+			return nil, types.NewErrorWithStatusCode(
			
 
				+				fmt.Errorf("failed to decode hex audio data: %w", decodeErr),
			
 
				+				types.ErrorCodeBadResponse,
			
 
				+				http.StatusInternalServerError,
			
 
				+			)
			
 
				+		}
			
 
				+
			
 
				+		// Determine content type - default to mp3
			
 
				+		contentType := "audio/mpeg"
			
 
				 
			
 
				-	contentType := getContentTypeByFormat(outputFormat)
			
 
				-	c.Header("Content-Type", contentType)
			
 
				-	c.Data(http.StatusOK, contentType, audioData)
			
 
				+		c.Data(http.StatusOK, contentType, audioData)
			
 
				+	}
			
 
				 
			
 
				 	usage = &dto.Usage{
			
 
				 		PromptTokens:     info.PromptTokens,
			
 
				 		CompletionTokens: 0,
			
 
				-		TotalTokens:      minimaxResp.Usage.TotalTokens,
			
 
				+		TotalTokens:      int(minimaxResp.ExtraInfo.UsageCharacters),
			
 
				 	}
			
 
				 
			
 
				 	return usage, nil