فهرست منبع

Merge pull request #826 from Calcium-Ion/cache

feat: Add prompt cache hit tokens support for DeepSeek channel #406
Calcium-Ion 1 سال پیش
والد
کامیت
d9390ff4c3
5 فایلهای تغییر یافته به همراه 26 افزوده شده و 17 حذف شده
  1. 1 0
      dto/openai_response.go
  2. 6 0
      relay/channel/openai/relay-openai.go
  3. 9 8
      relay/relay-text.go
  4. 7 6
      service/quota.go
  5. 3 3
      setting/operation_setting/cache_ratio.go

+ 1 - 0
dto/openai_response.go

@@ -166,6 +166,7 @@ type Usage struct {
 	PromptTokens           int                `json:"prompt_tokens"`
 	CompletionTokens       int                `json:"completion_tokens"`
 	TotalTokens            int                `json:"total_tokens"`
+	PromptCacheHitTokens   int                `json:"prompt_cache_hit_tokens,omitempty"`
 	PromptTokensDetails    InputTokenDetails  `json:"prompt_tokens_details"`
 	CompletionTokenDetails OutputTokenDetails `json:"completion_tokens_details"`
 }

+ 6 - 0
relay/channel/openai/relay-openai.go

@@ -254,6 +254,12 @@ func OaiStreamHandler(c *gin.Context, resp *http.Response, info *relaycommon.Rel
 	if !containStreamUsage {
 		usage, _ = service.ResponseText2Usage(responseTextBuilder.String(), info.UpstreamModelName, info.PromptTokens)
 		usage.CompletionTokens += toolCount * 7
+	} else {
+		if info.ChannelType == common.ChannelTypeDeepSeek {
+			if usage.PromptCacheHitTokens != 0 {
+				usage.PromptTokensDetails.CachedTokens = usage.PromptCacheHitTokens
+			}
+		}
 	}
 
 	if info.ShouldIncludeUsage && !containStreamUsage {

+ 9 - 8
relay/relay-text.go

@@ -320,19 +320,20 @@ func postConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo,
 	groupRatio := priceData.GroupRatio
 	modelPrice := priceData.ModelPrice
 
-	quota := 0
+	quotaCalculate := 0.0
 	if !priceData.UsePrice {
-		quota = (promptTokens - cacheTokens) + int(math.Round(float64(cacheTokens)*cacheRatio))
-		quota += int(math.Round(float64(completionTokens) * completionRatio))
-		quota = int(math.Round(float64(quota) * ratio))
-		if ratio != 0 && quota <= 0 {
-			quota = 1
+		quotaCalculate = float64(promptTokens-cacheTokens) + float64(cacheTokens)*cacheRatio
+		quotaCalculate += float64(completionTokens) * completionRatio
+		quotaCalculate = quotaCalculate * ratio
+		if ratio != 0 && quotaCalculate <= 0 {
+			quotaCalculate = 1
 		}
 	} else {
-		quota = int(modelPrice * common.QuotaPerUnit * groupRatio)
+		quotaCalculate = modelPrice * common.QuotaPerUnit * groupRatio
 	}
+	quota := int(quotaCalculate)
 	totalTokens := promptTokens + completionTokens
-	
+
 	var logContent string
 	if !priceData.UsePrice {
 		logContent = fmt.Sprintf("模型倍率 %.2f,补全倍率 %.2f,分组倍率 %.2f", modelRatio, completionRatio, groupRatio)

+ 7 - 6
service/quota.go

@@ -4,7 +4,6 @@ import (
 	"errors"
 	"fmt"
 	"github.com/bytedance/gopkg/util/gopool"
-	"math"
 	"one-api/common"
 	constant2 "one-api/constant"
 	"one-api/dto"
@@ -44,16 +43,18 @@ func calculateAudioQuota(info QuotaInfo) int {
 	audioCompletionRatio := operation_setting.GetAudioCompletionRatio(info.ModelName)
 	ratio := info.GroupRatio * info.ModelRatio
 
-	quota := info.InputDetails.TextTokens + int(math.Round(float64(info.OutputDetails.TextTokens)*completionRatio))
-	quota += int(math.Round(float64(info.InputDetails.AudioTokens)*audioRatio)) +
-		int(math.Round(float64(info.OutputDetails.AudioTokens)*audioRatio*audioCompletionRatio))
+	quota := 0.0
+	quota += float64(info.InputDetails.TextTokens)
+	quota += float64(info.OutputDetails.TextTokens) * completionRatio
+	quota += float64(info.InputDetails.AudioTokens) * audioRatio
+	quota += float64(info.OutputDetails.AudioTokens) * audioRatio * audioCompletionRatio
 
-	quota = int(math.Round(float64(quota) * ratio))
+	quota = quota * ratio
 	if ratio != 0 && quota <= 0 {
 		quota = 1
 	}
 
-	return quota
+	return int(quota)
 }
 
 func PreWssConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo, usage *dto.RealtimeUsage) error {

+ 3 - 3
setting/operation_setting/cache_ratio.go

@@ -16,9 +16,9 @@ var defaultCacheRatio = map[string]float64{
 	"gpt-4o-mini-2024-07-18":       0.5,
 	"gpt-4o-realtime-preview":      0.5,
 	"gpt-4o-mini-realtime-preview": 0.5,
-	"deepseek-chat":                0.5,
-	"deepseek-reasoner":            0.5,
-	"deepseek-coder":               0.5,
+	"deepseek-chat":                0.1,
+	"deepseek-reasoner":            0.1,
+	"deepseek-coder":               0.1,
 }
 
 var defaultCreateCacheRatio = map[string]float64{}