| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194 |
- package minimax
- import (
- "encoding/hex"
- "encoding/json"
- "errors"
- "fmt"
- "io"
- "net/http"
- "strings"
- "github.com/QuantumNous/new-api/dto"
- relaycommon "github.com/QuantumNous/new-api/relay/common"
- "github.com/QuantumNous/new-api/types"
- "github.com/gin-gonic/gin"
- )
- type MiniMaxTTSRequest struct {
- Model string `json:"model"`
- Text string `json:"text"`
- Stream bool `json:"stream,omitempty"`
- StreamOptions *StreamOptions `json:"stream_options,omitempty"`
- VoiceSetting VoiceSetting `json:"voice_setting"`
- PronunciationDict *PronunciationDict `json:"pronunciation_dict,omitempty"`
- AudioSetting *AudioSetting `json:"audio_setting,omitempty"`
- TimbreWeights []TimbreWeight `json:"timbre_weights,omitempty"`
- LanguageBoost string `json:"language_boost,omitempty"`
- VoiceModify *VoiceModify `json:"voice_modify,omitempty"`
- SubtitleEnable bool `json:"subtitle_enable,omitempty"`
- OutputFormat string `json:"output_format,omitempty"`
- AigcWatermark bool `json:"aigc_watermark,omitempty"`
- }
// StreamOptions is the stream_options object of a TTS request. Its single
// flag is omitted from the JSON output when false.
type StreamOptions struct {
	ExcludeAggregatedAudio bool `json:"exclude_aggregated_audio,omitempty"`
}
// VoiceSetting is the voice_setting object of a TTS request. voice_id is
// always serialized; the remaining fields are omitted when they hold their
// zero value.
type VoiceSetting struct {
	VoiceID           string  `json:"voice_id"`
	Speed             float64 `json:"speed,omitempty"`
	Vol               float64 `json:"vol,omitempty"`
	Pitch             int     `json:"pitch,omitempty"`
	Emotion           string  `json:"emotion,omitempty"`
	TextNormalization bool    `json:"text_normalization,omitempty"`
	LatexRead         bool    `json:"latex_read,omitempty"`
}
// PronunciationDict is the pronunciation_dict object of a TTS request. The
// tone list is omitted from the JSON output when empty.
type PronunciationDict struct {
	Tone []string `json:"tone,omitempty"`
}
// AudioSetting is the audio_setting object of a TTS request. Every field is
// optional and omitted from the JSON output when it holds its zero value.
type AudioSetting struct {
	SampleRate int    `json:"sample_rate,omitempty"`
	Bitrate    int    `json:"bitrate,omitempty"`
	Format     string `json:"format,omitempty"`
	Channel    int    `json:"channel,omitempty"`
	ForceCbr   bool   `json:"force_cbr,omitempty"`
}
// TimbreWeight is one entry of the timbre_weights list of a TTS request. Both
// keys are always serialized, even when zero.
type TimbreWeight struct {
	VoiceID string `json:"voice_id"`
	Weight  int    `json:"weight"`
}
// VoiceModify is the voice_modify object of a TTS request. Every field is
// optional and omitted from the JSON output when it holds its zero value.
type VoiceModify struct {
	Pitch        int    `json:"pitch,omitempty"`
	Intensity    int    `json:"intensity,omitempty"`
	Timbre       int    `json:"timbre,omitempty"`
	SoundEffects string `json:"sound_effects,omitempty"`
}
- type MiniMaxTTSResponse struct {
- Data MiniMaxTTSData `json:"data"`
- ExtraInfo MiniMaxExtraInfo `json:"extra_info"`
- TraceID string `json:"trace_id"`
- BaseResp MiniMaxBaseResp `json:"base_resp"`
- }
// MiniMaxTTSData is the data object of a TTS reply. Audio carries the
// synthesized result as a string; the handler in this file treats it either
// as an http(s) URL or as hex-encoded bytes.
type MiniMaxTTSData struct {
	Audio  string `json:"audio"`
	Status int    `json:"status"`
}
// MiniMaxExtraInfo is the extra_info object of a TTS reply. Only the
// usage_characters counter is decoded.
type MiniMaxExtraInfo struct {
	UsageCharacters int64 `json:"usage_characters"`
}
// MiniMaxBaseResp is the base_resp status envelope of a MiniMax reply. The
// TTS handler in this file treats any non-zero StatusCode as a failure and
// surfaces StatusMsg in the resulting error.
type MiniMaxBaseResp struct {
	StatusCode int64  `json:"status_code"`
	StatusMsg  string `json:"status_msg"`
}
// getContentTypeByFormat returns the HTTP Content-Type for an audio format
// name. Matching is exact and case-sensitive; any format outside the known
// set, including the empty string, yields "audio/mpeg".
func getContentTypeByFormat(format string) string {
	switch format {
	case "wav":
		return "audio/wav"
	case "flac":
		return "audio/flac"
	case "aac":
		return "audio/aac"
	case "pcm":
		return "audio/pcm"
	default:
		// "mp3" and every unrecognized format are served as MPEG audio.
		return "audio/mpeg"
	}
}
- func handleTTSResponse(c *gin.Context, resp *http.Response, info *relaycommon.RelayInfo) (usage any, err *types.NewAPIError) {
- body, readErr := io.ReadAll(resp.Body)
- if readErr != nil {
- return nil, types.NewErrorWithStatusCode(
- fmt.Errorf("failed to read minimax response: %w", readErr),
- types.ErrorCodeReadResponseBodyFailed,
- http.StatusInternalServerError,
- )
- }
- defer resp.Body.Close()
- // Parse response
- var minimaxResp MiniMaxTTSResponse
- if unmarshalErr := json.Unmarshal(body, &minimaxResp); unmarshalErr != nil {
- return nil, types.NewErrorWithStatusCode(
- fmt.Errorf("failed to unmarshal minimax TTS response: %w", unmarshalErr),
- types.ErrorCodeBadResponseBody,
- http.StatusInternalServerError,
- )
- }
- // Check base_resp status code
- if minimaxResp.BaseResp.StatusCode != 0 {
- return nil, types.NewErrorWithStatusCode(
- fmt.Errorf("minimax TTS error: %d - %s", minimaxResp.BaseResp.StatusCode, minimaxResp.BaseResp.StatusMsg),
- types.ErrorCodeBadResponse,
- http.StatusBadRequest,
- )
- }
- // Check if we have audio data
- if minimaxResp.Data.Audio == "" {
- return nil, types.NewErrorWithStatusCode(
- fmt.Errorf("no audio data in minimax TTS response"),
- types.ErrorCodeBadResponse,
- http.StatusBadRequest,
- )
- }
- if strings.HasPrefix(minimaxResp.Data.Audio, "http") {
- c.Redirect(http.StatusFound, minimaxResp.Data.Audio)
- } else {
- // Handle hex-encoded audio data
- audioData, decodeErr := hex.DecodeString(minimaxResp.Data.Audio)
- if decodeErr != nil {
- return nil, types.NewErrorWithStatusCode(
- fmt.Errorf("failed to decode hex audio data: %w", decodeErr),
- types.ErrorCodeBadResponse,
- http.StatusInternalServerError,
- )
- }
- // Determine content type - default to mp3
- contentType := "audio/mpeg"
- c.Data(http.StatusOK, contentType, audioData)
- }
- usage = &dto.Usage{
- PromptTokens: info.GetEstimatePromptTokens(),
- CompletionTokens: 0,
- TotalTokens: int(minimaxResp.ExtraInfo.UsageCharacters),
- }
- return usage, nil
- }
- func handleChatCompletionResponse(c *gin.Context, resp *http.Response, info *relaycommon.RelayInfo) (usage any, err *types.NewAPIError) {
- body, readErr := io.ReadAll(resp.Body)
- if readErr != nil {
- return nil, types.NewErrorWithStatusCode(
- errors.New("failed to read minimax response"),
- types.ErrorCodeReadResponseBodyFailed,
- http.StatusInternalServerError,
- )
- }
- defer resp.Body.Close()
- // Set response headers
- for key, values := range resp.Header {
- for _, value := range values {
- c.Header(key, value)
- }
- }
- c.Data(resp.StatusCode, "application/json", body)
- return nil, nil
- }
|