ソースを参照

feat(gemini): update request structures for Veo predictLongRunning

- Changed the Gemini adaptor's BuildRequestURL and BuildRequestBody to target the Veo predictLongRunning endpoint instead of generateVideos.
- Introduced new data structures for Veo instances and parameters (VeoInstance, VeoParameters, VeoRequestPayload), replacing the previous Gemini video generation types (GeminiVideoPayload, GeminiVideoGenerationConfig).
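- Updated the Vertex adaptor to use the shared Veo request types from the gemini package in place of its private duplicates (veoInstance, veoParameters, requestPayload); the fields and JSON tags are identical, so the Vertex request body is unchanged.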
CaIon 6 日 前
コミット
21cfc1ca38

+ 21 - 19
relay/channel/task/gemini/adaptor.go

@@ -44,13 +44,13 @@ func (a *TaskAdaptor) ValidateRequestAndSetAction(c *gin.Context, info *relaycom
 	return relaycommon.ValidateBasicTaskRequest(c, info, constant.TaskActionTextGenerate)
 }
 
-// BuildRequestURL constructs the Gemini API generateVideos endpoint.
+// BuildRequestURL constructs the Gemini API predictLongRunning endpoint for Veo.
 func (a *TaskAdaptor) BuildRequestURL(info *relaycommon.RelayInfo) (string, error) {
 	modelName := info.UpstreamModelName
 	version := model_setting.GetGeminiVersionSetting(modelName)
 
 	return fmt.Sprintf(
-		"%s/%s/models/%s:generateVideos",
+		"%s/%s/models/%s:predictLongRunning",
 		a.baseURL,
 		version,
 		modelName,
@@ -65,7 +65,7 @@ func (a *TaskAdaptor) BuildRequestHeader(c *gin.Context, req *http.Request, info
 	return nil
 }
 
-// BuildRequestBody converts request into the Gemini API generateVideos format.
+// BuildRequestBody converts request into the Veo predictLongRunning format.
 func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayInfo) (io.Reader, error) {
 	v, ok := c.Get("task_request")
 	if !ok {
@@ -76,34 +76,36 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn
 		return nil, fmt.Errorf("unexpected task_request type")
 	}
 
-	body := GeminiVideoPayload{
-		Prompt: req.Prompt,
-		Config: &GeminiVideoGenerationConfig{},
-	}
-
+	instance := VeoInstance{Prompt: req.Prompt}
 	if img := ExtractMultipartImage(c, info); img != nil {
-		body.Image = img
+		instance.Image = img
 	} else if len(req.Images) > 0 {
 		if parsed := ParseImageInput(req.Images[0]); parsed != nil {
-			body.Image = parsed
+			instance.Image = parsed
 			info.Action = constant.TaskActionGenerate
 		}
 	}
 
-	if err := taskcommon.UnmarshalMetadata(req.Metadata, body.Config); err != nil {
+	params := &VeoParameters{}
+	if err := taskcommon.UnmarshalMetadata(req.Metadata, params); err != nil {
 		return nil, errors.Wrap(err, "unmarshal metadata failed")
 	}
-	if body.Config.DurationSeconds == 0 && req.Duration > 0 {
-		body.Config.DurationSeconds = req.Duration
+	if params.DurationSeconds == 0 && req.Duration > 0 {
+		params.DurationSeconds = req.Duration
 	}
-	if body.Config.Resolution == "" && req.Size != "" {
-		body.Config.Resolution = SizeToVeoResolution(req.Size)
+	if params.Resolution == "" && req.Size != "" {
+		params.Resolution = SizeToVeoResolution(req.Size)
 	}
-	if body.Config.AspectRatio == "" && req.Size != "" {
-		body.Config.AspectRatio = SizeToVeoAspectRatio(req.Size)
+	if params.AspectRatio == "" && req.Size != "" {
+		params.AspectRatio = SizeToVeoAspectRatio(req.Size)
+	}
+	params.Resolution = strings.ToLower(params.Resolution)
+	params.SampleCount = 1
+
+	body := VeoRequestPayload{
+		Instances:  []VeoInstance{instance},
+		Parameters: params,
 	}
-	body.Config.Resolution = strings.ToLower(body.Config.Resolution)
-	body.Config.NumberOfVideos = 1
 
 	data, err := common.Marshal(body)
 	if err != nil {

+ 26 - 18
relay/channel/task/gemini/dto.go

@@ -1,16 +1,5 @@
 package gemini
 
-// GeminiVideoGenerationConfig represents the Gemini API GenerateVideosConfig.
-// Reference: https://ai.google.dev/gemini-api/docs/video
-type GeminiVideoGenerationConfig struct {
-	AspectRatio      string `json:"aspectRatio,omitempty"`
-	DurationSeconds  int    `json:"durationSeconds,omitempty"`
-	NegativePrompt   string `json:"negativePrompt,omitempty"`
-	PersonGeneration string `json:"personGeneration,omitempty"`
-	Resolution       string `json:"resolution,omitempty"`
-	NumberOfVideos   int    `json:"numberOfVideos,omitempty"`
-}
-
 // VeoImageInput represents an image input for Veo image-to-video.
 // Used by both Gemini and Vertex adaptors.
 type VeoImageInput struct {
@@ -18,17 +7,36 @@ type VeoImageInput struct {
 	MimeType           string `json:"mimeType"`
 }
 
-// GeminiVideoPayload is the top-level request body for the Gemini API
-// models/{model}:generateVideos endpoint.
-type GeminiVideoPayload struct {
-	Model  string                       `json:"model,omitempty"`
-	Prompt string                       `json:"prompt"`
-	Image  *VeoImageInput               `json:"image,omitempty"`
-	Config *GeminiVideoGenerationConfig `json:"config,omitempty"`
+// VeoInstance represents a single instance in the Veo predictLongRunning request.
+type VeoInstance struct {
+	Prompt string         `json:"prompt"`
+	Image  *VeoImageInput `json:"image,omitempty"`
 	// TODO: support referenceImages (style/asset references, up to 3 images)
 	// TODO: support lastFrame (first+last frame interpolation, Veo 3.1)
 }
 
+// VeoParameters represents the parameters block for Veo predictLongRunning.
+type VeoParameters struct {
+	SampleCount        int    `json:"sampleCount"`
+	DurationSeconds    int    `json:"durationSeconds,omitempty"`
+	AspectRatio        string `json:"aspectRatio,omitempty"`
+	Resolution         string `json:"resolution,omitempty"`
+	NegativePrompt     string `json:"negativePrompt,omitempty"`
+	PersonGeneration   string `json:"personGeneration,omitempty"`
+	StorageUri         string `json:"storageUri,omitempty"`
+	CompressionQuality string `json:"compressionQuality,omitempty"`
+	ResizeMode         string `json:"resizeMode,omitempty"`
+	Seed               *int   `json:"seed,omitempty"`
+	GenerateAudio      *bool  `json:"generateAudio,omitempty"`
+}
+
+// VeoRequestPayload is the top-level request body for the Veo
+// predictLongRunning endpoint (used by both Gemini and Vertex).
+type VeoRequestPayload struct {
+	Instances  []VeoInstance  `json:"instances"`
+	Parameters *VeoParameters `json:"parameters,omitempty"`
+}
+
 type submitResponse struct {
 	Name string `json:"name"`
 }

+ 4 - 30
relay/channel/task/vertex/adaptor.go

@@ -27,32 +27,6 @@ import (
 // Request / Response structures
 // ============================
 
-type veoInstance struct {
-	Prompt string                    `json:"prompt"`
-	Image  *geminitask.VeoImageInput `json:"image,omitempty"`
-	// TODO: support referenceImages (style/asset references, up to 3 images)
-	// TODO: support lastFrame (first+last frame interpolation, Veo 3.1)
-}
-
-type veoParameters struct {
-	SampleCount        int    `json:"sampleCount"`
-	DurationSeconds    int    `json:"durationSeconds,omitempty"`
-	AspectRatio        string `json:"aspectRatio,omitempty"`
-	Resolution         string `json:"resolution,omitempty"`
-	NegativePrompt     string `json:"negativePrompt,omitempty"`
-	PersonGeneration   string `json:"personGeneration,omitempty"`
-	StorageUri         string `json:"storageUri,omitempty"`
-	CompressionQuality string `json:"compressionQuality,omitempty"`
-	ResizeMode         string `json:"resizeMode,omitempty"`
-	Seed               *int   `json:"seed,omitempty"`
-	GenerateAudio      *bool  `json:"generateAudio,omitempty"`
-}
-
-type requestPayload struct {
-	Instances  []veoInstance  `json:"instances"`
-	Parameters *veoParameters `json:"parameters,omitempty"`
-}
-
 type fetchOperationPayload struct {
 	OperationName string `json:"operationName"`
 }
@@ -186,7 +160,7 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn
 	}
 	req := v.(relaycommon.TaskSubmitReq)
 
-	instance := veoInstance{Prompt: req.Prompt}
+	instance := geminitask.VeoInstance{Prompt: req.Prompt}
 	if img := geminitask.ExtractMultipartImage(c, info); img != nil {
 		instance.Image = img
 	} else if len(req.Images) > 0 {
@@ -196,7 +170,7 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn
 		}
 	}
 
-	params := &veoParameters{}
+	params := &geminitask.VeoParameters{}
 	if err := taskcommon.UnmarshalMetadata(req.Metadata, params); err != nil {
 		return nil, fmt.Errorf("unmarshal metadata failed: %w", err)
 	}
@@ -212,8 +186,8 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn
 	params.Resolution = strings.ToLower(params.Resolution)
 	params.SampleCount = 1
 
-	body := requestPayload{
-		Instances:  []veoInstance{instance},
+	body := geminitask.VeoRequestPayload{
+		Instances:  []geminitask.VeoInstance{instance},
 		Parameters: params,
 	}