Sfoglia il codice sorgente

Merge pull request #3059 from QuantumNous/feat/veo

feat(gemini): implement video generation configuration
Calcium-Ion 1 settimana fa
parent
commit
4727fc5d60

+ 1 - 0
dto/openai_video.go

@@ -43,6 +43,7 @@ func (m *OpenAIVideo) SetMetadata(k string, v any) {
 func NewOpenAIVideo() *OpenAIVideo {
 	return &OpenAIVideo{
 		Object: "video",
+		Status: VideoStatusQueued,
 	}
 }
 

+ 56 - 78
relay/channel/task/gemini/adaptor.go

@@ -22,64 +22,6 @@ import (
 	"github.com/pkg/errors"
 )
 
-// ============================
-// Request / Response structures
-// ============================
-
-// GeminiVideoGenerationConfig represents the video generation configuration
-// Based on: https://ai.google.dev/gemini-api/docs/video
-type GeminiVideoGenerationConfig struct {
-	AspectRatio      string  `json:"aspectRatio,omitempty"`      // "16:9" or "9:16"
-	DurationSeconds  float64 `json:"durationSeconds,omitempty"`  // 4, 6, or 8 (as number)
-	NegativePrompt   string  `json:"negativePrompt,omitempty"`   // unwanted elements
-	PersonGeneration string  `json:"personGeneration,omitempty"` // "allow_all" for text-to-video, "allow_adult" for image-to-video
-	Resolution       string  `json:"resolution,omitempty"`       // video resolution
-}
-
-// GeminiVideoRequest represents a single video generation instance
-type GeminiVideoRequest struct {
-	Prompt string `json:"prompt"`
-}
-
-// GeminiVideoPayload represents the complete video generation request payload
-type GeminiVideoPayload struct {
-	Instances  []GeminiVideoRequest        `json:"instances"`
-	Parameters GeminiVideoGenerationConfig `json:"parameters,omitempty"`
-}
-
-type submitResponse struct {
-	Name string `json:"name"`
-}
-
-type operationVideo struct {
-	MimeType           string `json:"mimeType"`
-	BytesBase64Encoded string `json:"bytesBase64Encoded"`
-	Encoding           string `json:"encoding"`
-}
-
-type operationResponse struct {
-	Name     string `json:"name"`
-	Done     bool   `json:"done"`
-	Response struct {
-		Type                  string           `json:"@type"`
-		RaiMediaFilteredCount int              `json:"raiMediaFilteredCount"`
-		Videos                []operationVideo `json:"videos"`
-		BytesBase64Encoded    string           `json:"bytesBase64Encoded"`
-		Encoding              string           `json:"encoding"`
-		Video                 string           `json:"video"`
-		GenerateVideoResponse struct {
-			GeneratedSamples []struct {
-				Video struct {
-					URI string `json:"uri"`
-				} `json:"video"`
-			} `json:"generatedSamples"`
-		} `json:"generateVideoResponse"`
-	} `json:"response"`
-	Error struct {
-		Message string `json:"message"`
-	} `json:"error"`
-}
-
 // ============================
 // Adaptor implementation
 // ============================
@@ -99,17 +41,16 @@ func (a *TaskAdaptor) Init(info *relaycommon.RelayInfo) {
 
 // ValidateRequestAndSetAction parses body, validates fields and sets default action.
 func (a *TaskAdaptor) ValidateRequestAndSetAction(c *gin.Context, info *relaycommon.RelayInfo) (taskErr *dto.TaskError) {
-	// Use the standard validation method for TaskSubmitReq
 	return relaycommon.ValidateBasicTaskRequest(c, info, constant.TaskActionTextGenerate)
 }
 
-// BuildRequestURL constructs the upstream URL.
+// BuildRequestURL constructs the Gemini API generateVideos endpoint.
 func (a *TaskAdaptor) BuildRequestURL(info *relaycommon.RelayInfo) (string, error) {
 	modelName := info.UpstreamModelName
 	version := model_setting.GetGeminiVersionSetting(modelName)
 
 	return fmt.Sprintf(
-		"%s/%s/models/%s:predictLongRunning",
+		"%s/%s/models/%s:generateVideos",
 		a.baseURL,
 		version,
 		modelName,
@@ -124,7 +65,7 @@ func (a *TaskAdaptor) BuildRequestHeader(c *gin.Context, req *http.Request, info
 	return nil
 }
 
-// BuildRequestBody converts request into Gemini specific format.
+// BuildRequestBody converts request into the Gemini API generateVideos format.
 func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayInfo) (io.Reader, error) {
 	v, ok := c.Get("task_request")
 	if !ok {
@@ -135,18 +76,34 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn
 		return nil, fmt.Errorf("unexpected task_request type")
 	}
 
-	// Create structured video generation request
 	body := GeminiVideoPayload{
-		Instances: []GeminiVideoRequest{
-			{Prompt: req.Prompt},
-		},
-		Parameters: GeminiVideoGenerationConfig{},
+		Prompt: req.Prompt,
+		Config: &GeminiVideoGenerationConfig{},
 	}
 
-	metadata := req.Metadata
-	if err := taskcommon.UnmarshalMetadata(metadata, &body.Parameters); err != nil {
+	if img := ExtractMultipartImage(c, info); img != nil {
+		body.Image = img
+	} else if len(req.Images) > 0 {
+		if parsed := ParseImageInput(req.Images[0]); parsed != nil {
+			body.Image = parsed
+			info.Action = constant.TaskActionGenerate
+		}
+	}
+
+	if err := taskcommon.UnmarshalMetadata(req.Metadata, body.Config); err != nil {
 		return nil, errors.Wrap(err, "unmarshal metadata failed")
 	}
+	if body.Config.DurationSeconds == 0 && req.Duration > 0 {
+		body.Config.DurationSeconds = req.Duration
+	}
+	if body.Config.Resolution == "" && req.Size != "" {
+		body.Config.Resolution = SizeToVeoResolution(req.Size)
+	}
+	if body.Config.AspectRatio == "" && req.Size != "" {
+		body.Config.AspectRatio = SizeToVeoAspectRatio(req.Size)
+	}
+	body.Config.Resolution = strings.ToLower(body.Config.Resolution)
+	body.Config.NumberOfVideos = 1
 
 	data, err := common.Marshal(body)
 	if err != nil {
@@ -186,14 +143,40 @@ func (a *TaskAdaptor) DoResponse(c *gin.Context, resp *http.Response, info *rela
 }
 
 func (a *TaskAdaptor) GetModelList() []string {
-	return []string{"veo-3.0-generate-001", "veo-3.1-generate-preview", "veo-3.1-fast-generate-preview"}
+	return []string{
+		"veo-3.0-generate-001",
+		"veo-3.0-fast-generate-001",
+		"veo-3.1-generate-preview",
+		"veo-3.1-fast-generate-preview",
+	}
 }
 
 func (a *TaskAdaptor) GetChannelName() string {
 	return "gemini"
 }
 
-// FetchTask fetch task status
+// EstimateBilling returns OtherRatios based on durationSeconds and resolution.
+func (a *TaskAdaptor) EstimateBilling(c *gin.Context, info *relaycommon.RelayInfo) map[string]float64 {
+	v, ok := c.Get("task_request")
+	if !ok {
+		return nil
+	}
+	req, ok := v.(relaycommon.TaskSubmitReq)
+	if !ok {
+		return nil
+	}
+
+	seconds := ResolveVeoDuration(req.Metadata, req.Duration, req.Seconds)
+	resolution := ResolveVeoResolution(req.Metadata, req.Size)
+	resRatio := VeoResolutionRatio(info.UpstreamModelName, resolution)
+
+	return map[string]float64{
+		"seconds":    float64(seconds),
+		"resolution": resRatio,
+	}
+}
+
+// FetchTask polls task status via the Gemini operations GET endpoint.
 func (a *TaskAdaptor) FetchTask(baseUrl, key string, body map[string]any, proxy string) (*http.Response, error) {
 	taskID, ok := body["task_id"].(string)
 	if !ok {
@@ -205,7 +188,6 @@ func (a *TaskAdaptor) FetchTask(baseUrl, key string, body map[string]any, proxy
 		return nil, fmt.Errorf("decode task_id failed: %w", err)
 	}
 
-	// For Gemini API, we use GET request to the operations endpoint
 	version := model_setting.GetGeminiVersionSetting("default")
 	url := fmt.Sprintf("%s/%s/%s", baseUrl, version, upstreamName)
 
@@ -249,11 +231,9 @@ func (a *TaskAdaptor) ParseTaskResult(respBody []byte) (*relaycommon.TaskInfo, e
 	ti.Progress = "100%"
 
 	ti.TaskID = taskcommon.EncodeLocalTaskID(op.Name)
-	// Url intentionally left empty — the caller constructs the proxy URL using the public task ID
 
-	// Extract URL from generateVideoResponse if available
-	if len(op.Response.GenerateVideoResponse.GeneratedSamples) > 0 {
-		if uri := op.Response.GenerateVideoResponse.GeneratedSamples[0].Video.URI; uri != "" {
+	if len(op.Response.GenerateVideoResponse.GeneratedVideos) > 0 {
+		if uri := op.Response.GenerateVideoResponse.GeneratedVideos[0].Video.URI; uri != "" {
 			ti.RemoteUrl = uri
 		}
 	}
@@ -262,8 +242,6 @@ func (a *TaskAdaptor) ParseTaskResult(respBody []byte) (*relaycommon.TaskInfo, e
 }
 
 func (a *TaskAdaptor) ConvertToOpenAIVideo(task *model.Task) ([]byte, error) {
-	// Use GetUpstreamTaskID() to get the real upstream operation name for model extraction.
-	// task.TaskID is now a public task_xxxx ID, no longer a base64-encoded upstream name.
 	upstreamTaskID := task.GetUpstreamTaskID()
 	upstreamName, err := taskcommon.DecodeLocalTaskID(upstreamTaskID)
 	if err != nil {

+ 138 - 0
relay/channel/task/gemini/billing.go

@@ -0,0 +1,138 @@
+package gemini
+
+import (
+	"strconv"
+	"strings"
+)
+
// ParseVeoDurationSeconds extracts durationSeconds from metadata.
// Returns 8 (the Veo default) when the key is missing, non-numeric,
// or not a positive value.
func ParseVeoDurationSeconds(metadata map[string]any) int {
	const defaultSeconds = 8
	// A lookup on a nil map is safe in Go and simply misses.
	raw, ok := metadata["durationSeconds"]
	if !ok {
		return defaultSeconds
	}
	switch d := raw.(type) {
	case float64: // JSON numbers decode as float64
		if d >= 1 {
			return int(d)
		}
	case int:
		if d > 0 {
			return d
		}
	}
	return defaultSeconds
}
+
// ParseVeoResolution extracts the resolution string from metadata,
// normalized to lowercase. Returns "720p" when the key is missing,
// empty, or not a string.
func ParseVeoResolution(metadata map[string]any) string {
	const defaultResolution = "720p"
	raw, ok := metadata["resolution"]
	if !ok {
		return defaultResolution
	}
	s, isString := raw.(string)
	if !isString || s == "" {
		return defaultResolution
	}
	return strings.ToLower(s)
}
+
// ResolveVeoDuration returns the effective duration in seconds.
// Priority: a valid numeric metadata["durationSeconds"] > stdDuration >
// stdSeconds > the Veo default (8).
//
// Fix: previously an invalid-typed metadata value (e.g. a string) made the
// parser return its internal default 8, which then overrode an explicitly
// supplied stdDuration/stdSeconds. An invalid metadata value now falls
// through to the standard fields instead.
func ResolveVeoDuration(metadata map[string]any, stdDuration int, stdSeconds string) int {
	if raw, ok := metadata["durationSeconds"]; ok {
		switch n := raw.(type) {
		case float64: // JSON numbers decode as float64
			if int(n) > 0 {
				return int(n)
			}
		case int:
			if n > 0 {
				return n
			}
		}
		// Present but invalid or non-positive: fall through.
	}
	if stdDuration > 0 {
		return stdDuration
	}
	if s, err := strconv.Atoi(stdSeconds); err == nil && s > 0 {
		return s
	}
	return 8
}
+
+// ResolveVeoResolution returns the effective resolution string (lowercase).
+// Priority: metadata["resolution"] > SizeToVeoResolution(stdSize) > default ("720p").
+func ResolveVeoResolution(metadata map[string]any, stdSize string) string {
+	if metadata != nil {
+		if _, exists := metadata["resolution"]; exists {
+			if r := ParseVeoResolution(metadata); r != "" {
+				return r
+			}
+		}
+	}
+	if stdSize != "" {
+		return SizeToVeoResolution(stdSize)
+	}
+	return "720p"
+}
+
// SizeToVeoResolution maps a "WxH" size string onto the closest Veo
// resolution label: "4k" (longest side >= 3840), "1080p" (>= 1920),
// otherwise "720p". Malformed input falls back to "720p".
func SizeToVeoResolution(size string) string {
	widthStr, heightStr, found := strings.Cut(strings.ToLower(size), "x")
	if !found {
		return "720p"
	}
	// Atoi errors are deliberately ignored: unparsable dimensions become 0
	// and resolve to the 720p fallback below.
	width, _ := strconv.Atoi(widthStr)
	height, _ := strconv.Atoi(heightStr)
	longest := width
	if height > longest {
		longest = height
	}
	switch {
	case longest >= 3840:
		return "4k"
	case longest >= 1920:
		return "1080p"
	default:
		return "720p"
	}
}
+
// SizeToVeoAspectRatio maps a "WxH" size string onto a Veo aspect ratio:
// "9:16" for portrait (height > width), "16:9" for everything else,
// including square, malformed, or non-positive input.
func SizeToVeoAspectRatio(size string) string {
	const landscape, portrait = "16:9", "9:16"
	widthStr, heightStr, found := strings.Cut(strings.ToLower(size), "x")
	if !found {
		return landscape
	}
	width, _ := strconv.Atoi(widthStr)
	height, _ := strconv.Atoi(heightStr)
	if width <= 0 || height <= 0 {
		return landscape
	}
	if height > width {
		return portrait
	}
	return landscape
}
+
// VeoResolutionRatio returns the pricing multiplier for the given resolution.
// Standard resolutions (720p, 1080p) bill at 1.0; 4K carries a model-specific
// surcharge derived from Vertex AI official pricing (video+audio base):
//
//	veo-3.1-generate:      $0.60 / $0.40 = 1.5
//	veo-3.1-fast-generate: $0.35 / $0.15 = 7/3 ≈ 2.333
//
// Veo 3.0 models do not support 4K, so they fall back to 1.0.
//
// Fix: the previous condition `Contains("3.1-generate") || Contains("3.1")`
// was redundant — the first term is subsumed by the second — and the fast
// multiplier used a truncated literal instead of the exact ratio.
func VeoResolutionRatio(modelName, resolution string) float64 {
	if resolution != "4k" {
		return 1.0
	}
	switch {
	case strings.Contains(modelName, "3.1-fast-generate"):
		return 7.0 / 3.0
	case strings.Contains(modelName, "3.1"):
		return 1.5
	default:
		return 1.0
	}
}

+ 63 - 0
relay/channel/task/gemini/dto.go

@@ -0,0 +1,63 @@
+package gemini
+
+// GeminiVideoGenerationConfig represents the Gemini API GenerateVideosConfig.
+// Reference: https://ai.google.dev/gemini-api/docs/video
+type GeminiVideoGenerationConfig struct {
+	AspectRatio      string `json:"aspectRatio,omitempty"`
+	DurationSeconds  int    `json:"durationSeconds,omitempty"`
+	NegativePrompt   string `json:"negativePrompt,omitempty"`
+	PersonGeneration string `json:"personGeneration,omitempty"`
+	Resolution       string `json:"resolution,omitempty"`
+	NumberOfVideos   int    `json:"numberOfVideos,omitempty"`
+}
+
+// VeoImageInput represents an image input for Veo image-to-video.
+// Used by both Gemini and Vertex adaptors.
+type VeoImageInput struct {
+	BytesBase64Encoded string `json:"bytesBase64Encoded"`
+	MimeType           string `json:"mimeType"`
+}
+
+// GeminiVideoPayload is the top-level request body for the Gemini API
+// models/{model}:generateVideos endpoint.
+type GeminiVideoPayload struct {
+	Model  string                       `json:"model,omitempty"`
+	Prompt string                       `json:"prompt"`
+	Image  *VeoImageInput               `json:"image,omitempty"`
+	Config *GeminiVideoGenerationConfig `json:"config,omitempty"`
+	// TODO: support referenceImages (style/asset references, up to 3 images)
+	// TODO: support lastFrame (first+last frame interpolation, Veo 3.1)
+}
+
+type submitResponse struct {
+	Name string `json:"name"`
+}
+
+type operationVideo struct {
+	MimeType           string `json:"mimeType"`
+	BytesBase64Encoded string `json:"bytesBase64Encoded"`
+	Encoding           string `json:"encoding"`
+}
+
+type operationResponse struct {
+	Name     string `json:"name"`
+	Done     bool   `json:"done"`
+	Response struct {
+		Type                  string           `json:"@type"`
+		RaiMediaFilteredCount int              `json:"raiMediaFilteredCount"`
+		Videos                []operationVideo `json:"videos"`
+		BytesBase64Encoded    string           `json:"bytesBase64Encoded"`
+		Encoding              string           `json:"encoding"`
+		Video                 string           `json:"video"`
+		GenerateVideoResponse struct {
+			GeneratedVideos []struct {
+				Video struct {
+					URI string `json:"uri"`
+				} `json:"video"`
+			} `json:"generatedVideos"`
+		} `json:"generateVideoResponse"`
+	} `json:"response"`
+	Error struct {
+		Message string `json:"message"`
+	} `json:"error"`
+}

+ 100 - 0
relay/channel/task/gemini/image.go

@@ -0,0 +1,100 @@
+package gemini
+
+import (
+	"encoding/base64"
+	"io"
+	"net/http"
+	"strings"
+
+	"github.com/QuantumNous/new-api/constant"
+	relaycommon "github.com/QuantumNous/new-api/relay/common"
+	"github.com/gin-gonic/gin"
+)
+
+const maxVeoImageSize = 20 * 1024 * 1024 // 20 MB
+
+// ExtractMultipartImage reads the first `input_reference` file from a multipart
+// form upload and returns a VeoImageInput. Returns nil if no file is present.
+func ExtractMultipartImage(c *gin.Context, info *relaycommon.RelayInfo) *VeoImageInput {
+	mf, err := c.MultipartForm()
+	if err != nil {
+		return nil
+	}
+	files, exists := mf.File["input_reference"]
+	if !exists || len(files) == 0 {
+		return nil
+	}
+	fh := files[0]
+	if fh.Size > maxVeoImageSize {
+		return nil
+	}
+	file, err := fh.Open()
+	if err != nil {
+		return nil
+	}
+	defer file.Close()
+
+	fileBytes, err := io.ReadAll(file)
+	if err != nil {
+		return nil
+	}
+
+	mimeType := fh.Header.Get("Content-Type")
+	if mimeType == "" || mimeType == "application/octet-stream" {
+		mimeType = http.DetectContentType(fileBytes)
+	}
+
+	info.Action = constant.TaskActionGenerate
+	return &VeoImageInput{
+		BytesBase64Encoded: base64.StdEncoding.EncodeToString(fileBytes),
+		MimeType:           mimeType,
+	}
+}
+
+// ParseImageInput parses an image string (data URI or raw base64) into a
+// VeoImageInput. Returns nil if the input is empty or invalid.
+// TODO: support downloading HTTP URL images and converting to base64
+func ParseImageInput(imageStr string) *VeoImageInput {
+	imageStr = strings.TrimSpace(imageStr)
+	if imageStr == "" {
+		return nil
+	}
+
+	if strings.HasPrefix(imageStr, "data:") {
+		return parseDataURI(imageStr)
+	}
+
+	raw, err := base64.StdEncoding.DecodeString(imageStr)
+	if err != nil {
+		return nil
+	}
+	return &VeoImageInput{
+		BytesBase64Encoded: imageStr,
+		MimeType:           http.DetectContentType(raw),
+	}
+}
+
+func parseDataURI(uri string) *VeoImageInput {
+	// data:image/png;base64,iVBOR...
+	rest := uri[len("data:"):]
+	idx := strings.Index(rest, ",")
+	if idx < 0 {
+		return nil
+	}
+	meta := rest[:idx]
+	b64 := rest[idx+1:]
+	if b64 == "" {
+		return nil
+	}
+
+	mimeType := "application/octet-stream"
+	parts := strings.SplitN(meta, ";", 2)
+	if len(parts) >= 1 && parts[0] != "" {
+		mimeType = parts[0]
+	}
+
+	return &VeoImageInput{
+		BytesBase64Encoded: b64,
+		MimeType:           mimeType,
+	}
+}

+ 74 - 39
relay/channel/task/vertex/adaptor.go

@@ -16,6 +16,7 @@ import (
 	"github.com/QuantumNous/new-api/constant"
 	"github.com/QuantumNous/new-api/dto"
 	"github.com/QuantumNous/new-api/relay/channel"
+	geminitask "github.com/QuantumNous/new-api/relay/channel/task/gemini"
 	taskcommon "github.com/QuantumNous/new-api/relay/channel/task/taskcommon"
 	vertexcore "github.com/QuantumNous/new-api/relay/channel/vertex"
 	relaycommon "github.com/QuantumNous/new-api/relay/common"
@@ -26,9 +27,34 @@ import (
 // Request / Response structures
 // ============================
 
+type veoInstance struct {
+	Prompt string                    `json:"prompt"`
+	Image  *geminitask.VeoImageInput `json:"image,omitempty"`
+	// TODO: support referenceImages (style/asset references, up to 3 images)
+	// TODO: support lastFrame (first+last frame interpolation, Veo 3.1)
+}
+
+type veoParameters struct {
+	SampleCount        int    `json:"sampleCount"`
+	DurationSeconds    int    `json:"durationSeconds,omitempty"`
+	AspectRatio        string `json:"aspectRatio,omitempty"`
+	Resolution         string `json:"resolution,omitempty"`
+	NegativePrompt     string `json:"negativePrompt,omitempty"`
+	PersonGeneration   string `json:"personGeneration,omitempty"`
+	StorageUri         string `json:"storageUri,omitempty"`
+	CompressionQuality string `json:"compressionQuality,omitempty"`
+	ResizeMode         string `json:"resizeMode,omitempty"`
+	Seed               *int   `json:"seed,omitempty"`
+	GenerateAudio      *bool  `json:"generateAudio,omitempty"`
+}
+
 type requestPayload struct {
-	Instances  []map[string]any `json:"instances"`
-	Parameters map[string]any   `json:"parameters,omitempty"`
+	Instances  []veoInstance  `json:"instances"`
+	Parameters *veoParameters `json:"parameters,omitempty"`
+}
+
+type fetchOperationPayload struct {
+	OperationName string `json:"operationName"`
 }
 
 type submitResponse struct {
@@ -134,25 +160,21 @@ func (a *TaskAdaptor) BuildRequestHeader(c *gin.Context, req *http.Request, info
 	return nil
 }
 
-// EstimateBilling 根据用户请求中的 sampleCount 计算 OtherRatios。
-func (a *TaskAdaptor) EstimateBilling(c *gin.Context, _ *relaycommon.RelayInfo) map[string]float64 {
-	sampleCount := 1
+// EstimateBilling returns OtherRatios based on durationSeconds and resolution.
+func (a *TaskAdaptor) EstimateBilling(c *gin.Context, info *relaycommon.RelayInfo) map[string]float64 {
 	v, ok := c.Get("task_request")
-	if ok {
-		req := v.(relaycommon.TaskSubmitReq)
-		if req.Metadata != nil {
-			if sc, exists := req.Metadata["sampleCount"]; exists {
-				if i, ok := sc.(int); ok && i > 0 {
-					sampleCount = i
-				}
-				if f, ok := sc.(float64); ok && int(f) > 0 {
-					sampleCount = int(f)
-				}
-			}
-		}
+	if !ok {
+		return nil
 	}
+	req := v.(relaycommon.TaskSubmitReq)
+
+	seconds := geminitask.ResolveVeoDuration(req.Metadata, req.Duration, req.Seconds)
+	resolution := geminitask.ResolveVeoResolution(req.Metadata, req.Size)
+	resRatio := geminitask.VeoResolutionRatio(info.UpstreamModelName, resolution)
+
 	return map[string]float64{
-		"sampleCount": float64(sampleCount),
+		"seconds":    float64(seconds),
+		"resolution": resRatio,
 	}
 }
 
@@ -164,29 +186,35 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn
 	}
 	req := v.(relaycommon.TaskSubmitReq)
 
-	body := requestPayload{
-		Instances:  []map[string]any{{"prompt": req.Prompt}},
-		Parameters: map[string]any{},
-	}
-	if req.Metadata != nil {
-		if v, ok := req.Metadata["storageUri"]; ok {
-			body.Parameters["storageUri"] = v
-		}
-		if v, ok := req.Metadata["sampleCount"]; ok {
-			if i, ok := v.(int); ok {
-				body.Parameters["sampleCount"] = i
-			}
-			if f, ok := v.(float64); ok {
-				body.Parameters["sampleCount"] = int(f)
-			}
+	instance := veoInstance{Prompt: req.Prompt}
+	if img := geminitask.ExtractMultipartImage(c, info); img != nil {
+		instance.Image = img
+	} else if len(req.Images) > 0 {
+		if parsed := geminitask.ParseImageInput(req.Images[0]); parsed != nil {
+			instance.Image = parsed
+			info.Action = constant.TaskActionGenerate
 		}
 	}
-	if _, ok := body.Parameters["sampleCount"]; !ok {
-		body.Parameters["sampleCount"] = 1
+
+	params := &veoParameters{}
+	if err := taskcommon.UnmarshalMetadata(req.Metadata, params); err != nil {
+		return nil, fmt.Errorf("unmarshal metadata failed: %w", err)
 	}
+	if params.DurationSeconds == 0 && req.Duration > 0 {
+		params.DurationSeconds = req.Duration
+	}
+	if params.Resolution == "" && req.Size != "" {
+		params.Resolution = geminitask.SizeToVeoResolution(req.Size)
+	}
+	if params.AspectRatio == "" && req.Size != "" {
+		params.AspectRatio = geminitask.SizeToVeoAspectRatio(req.Size)
+	}
+	params.Resolution = strings.ToLower(params.Resolution)
+	params.SampleCount = 1
 
-	if body.Parameters["sampleCount"].(int) <= 0 {
-		return nil, fmt.Errorf("sampleCount must be greater than 0")
+	body := requestPayload{
+		Instances:  []veoInstance{instance},
+		Parameters: params,
 	}
 
 	data, err := common.Marshal(body)
@@ -226,7 +254,14 @@ func (a *TaskAdaptor) DoResponse(c *gin.Context, resp *http.Response, info *rela
 	return localID, responseBody, nil
 }
 
-func (a *TaskAdaptor) GetModelList() []string { return []string{"veo-3.0-generate-001"} }
+func (a *TaskAdaptor) GetModelList() []string {
+	return []string{
+		"veo-3.0-generate-001",
+		"veo-3.0-fast-generate-001",
+		"veo-3.1-generate-preview",
+		"veo-3.1-fast-generate-preview",
+	}
+}
 func (a *TaskAdaptor) GetChannelName() string { return "vertex" }
 
 // FetchTask fetch task status
@@ -254,7 +289,7 @@ func (a *TaskAdaptor) FetchTask(baseUrl, key string, body map[string]any, proxy
 	} else {
 		url = fmt.Sprintf("https://%s-aiplatform.googleapis.com/v1/projects/%s/locations/%s/publishers/google/models/%s:fetchPredictOperation", region, project, region, modelName)
 	}
-	payload := map[string]string{"operationName": upstreamName}
+	payload := fetchOperationPayload{OperationName: upstreamName}
 	data, err := common.Marshal(payload)
 	if err != nil {
 		return nil, err

+ 4 - 0
setting/ratio_setting/model_ratio.go

@@ -298,6 +298,10 @@ var defaultModelPrice = map[string]float64{
 	"sora-2":                         0.3,
 	"sora-2-pro":                     0.5,
 	"gpt-4o-mini-tts":                0.3,
+	"veo-3.0-generate-001":           0.4,
+	"veo-3.0-fast-generate-001":      0.15,
+	"veo-3.1-generate-preview":       0.4,
+	"veo-3.1-fast-generate-preview":  0.15,
 }
 
 var defaultAudioRatio = map[string]float64{