|
|
@@ -4,12 +4,15 @@ AIGC接口调用
|
|
|
"""
|
|
|
import json
|
|
|
import logging
|
|
|
+import os
|
|
|
from datetime import datetime
|
|
|
+from pathlib import Path
|
|
|
from typing import List, Dict, Union, Tuple, Any
|
|
|
|
|
|
import requests
|
|
|
|
|
|
from agent import ToolResult, tool
|
|
|
+from db import update_content_plan_ids
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@@ -21,6 +24,42 @@ DEFAULT_TOKEN = "8bf14f27fc3a486788f3383452422d72"
|
|
|
DEFAULT_TIMEOUT = 60.0
|
|
|
|
|
|
|
|
|
def _load_output_json(trace_id: str, output_dir: str) -> Dict[str, Any]:
    """Load and parse ``{output_dir}/{trace_id}/output.json``.

    Args:
        trace_id: Name of the per-trace sub-directory under ``output_dir``.
            It must be a single path component; an empty value, ``.``,
            ``..`` or anything containing a path separator is rejected so
            the joined path stays lexically inside ``output_dir``.
        output_dir: Directory that contains one sub-directory per trace.

    Returns:
        The decoded top-level JSON object.

    Raises:
        ValueError: If ``trace_id`` is not a plain directory name, or the
            top-level JSON value is not an object.
        FileNotFoundError: If ``output.json`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # trace_id is caller-supplied; refuse values that would let the path
    # climb out of output_dir ("../x") or replace it entirely ("/etc").
    if (
        not trace_id
        or trace_id in (".", "..")
        or Path(trace_id).name != trace_id
    ):
        raise ValueError(f"invalid trace_id: {trace_id!r}")

    path = Path(output_dir) / trace_id / "output.json"
    if not path.exists():
        raise FileNotFoundError(f"output.json not found: {path}")
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    # The annotated return type is a dict; fail here with a clear message
    # instead of letting a list/str root break a later ``.get`` call.
    if not isinstance(data, dict):
        raise ValueError(
            f"output.json root must be a JSON object, got {type(data).__name__}: {path}"
        )
    return data
|
|
|
+
|
|
|
+
|
|
|
def _extract_content_ids(data: Dict[str, Any]) -> List[str]:
    """Collect the ``aweme_id`` values found under ``data["contents"]``.

    Each entry of ``contents`` that is a dict with a non-``None``
    ``aweme_id`` contributes ``str(aweme_id).strip()``; blank results are
    dropped. Repeated IDs are returned once, in first-seen order, so the
    caller does not submit or count the same ID twice.

    Args:
        data: Parsed ``output.json`` content. Anything that is not a dict,
            or whose ``contents`` value is not a list, yields ``[]``.

    Returns:
        The de-duplicated, non-empty ID strings in order of first appearance.
    """
    # Tolerate a non-object JSON root instead of raising AttributeError.
    if not isinstance(data, dict):
        return []

    contents = data.get("contents")
    if not isinstance(contents, list):
        return []

    seen = set()
    content_ids: List[str] = []
    for item in contents:
        if not isinstance(item, dict):
            continue
        aweme_id = item.get("aweme_id")
        if aweme_id is None:
            continue
        # IDs may arrive as int or str; normalise to a trimmed string.
        aweme_id_str = str(aweme_id).strip()
        if aweme_id_str and aweme_id_str not in seen:
            seen.add(aweme_id_str)
            content_ids.append(aweme_id_str)
    return content_ids
|
|
|
+
|
|
|
+
|
|
|
def _get_produce_plan_ids_from_env(
    env_name: str = "AIGC_DEMAND_DOUYIN_CONTENT_PRODUCE_PLAN_ID",
) -> List[str]:
    """Read a produce-plan ID from an environment variable.

    Args:
        env_name: Name of the environment variable to read. Defaults to
            ``AIGC_DEMAND_DOUYIN_CONTENT_PRODUCE_PLAN_ID``.

    Returns:
        A one-element list holding the stripped value, or ``[]`` when the
        variable is unset or contains only whitespace.
    """
    raw = os.getenv(env_name, "").strip()
    if not raw:
        return []
    # The downstream interface takes List[str], so the single string value
    # from the environment is wrapped in a list.
    return [raw]
|
|
|
+
|
|
|
+
|
|
|
@tool(description="根据抖音账号ID创建爬取计划")
|
|
|
async def create_crawler_plan_by_douyin_account_id(
|
|
|
account_id: str,
|
|
|
@@ -168,14 +207,12 @@ async def create_crawler_plan_by_douyin_account_id(
|
|
|
|
|
|
@tool(description="根据抖音视频ID创建爬取计划")
|
|
|
async def create_crawler_plan_by_douyin_content_id(
|
|
|
- content_ids: List[str],
|
|
|
- produce_plan_ids: List[str] = []
|
|
|
+ trace_id: str,
|
|
|
) -> ToolResult:
|
|
|
"""
|
|
|
根据抖音视频ID创建爬取计划
|
|
|
Args:
|
|
|
- content_ids: 抖音内容ID列表
|
|
|
- produce_plan_ids: 爬取计划要绑定的生成计划ID,默认为空列表
|
|
|
+ trace_id: 内容寻找任务 trace_id(用于读取 {output_dir}/{trace_id}/output.json)
|
|
|
Returns:
|
|
|
Returns:
|
|
|
ToolResult: 包含以下内容
|
|
|
@@ -193,20 +230,45 @@ async def create_crawler_plan_by_douyin_content_id(
|
|
|
Note:
|
|
|
- 建议从 metadata.result 获取结构化数据,而非解析 output 文本
|
|
|
"""
|
|
|
- if not content_ids or not isinstance(content_ids, list):
|
|
|
- logger.error(f"create_crawler_plan_by_douyin_content_id invalid content_ids. content_ids: {content_ids}")
|
|
|
+ if not trace_id or not isinstance(trace_id, str):
|
|
|
+ logger.error(f"create_crawler_plan_by_douyin_content_id invalid trace_id: {trace_id}")
|
|
|
return ToolResult(
|
|
|
- title="根据抖音内容ID创建爬取计划失败",
|
|
|
+ title="根据抖音内容创建爬取计划失败",
|
|
|
+ output="",
|
|
|
+ error="trace_id 参数无效: trace_id 必须是非空字符串",
|
|
|
+ )
|
|
|
+
|
|
|
+ output_dir = os.getenv("OUTPUT_DIR", ".cache/output")
|
|
|
+ try:
|
|
|
+ data = _load_output_json(trace_id=trace_id, output_dir=output_dir)
|
|
|
+ content_ids = _extract_content_ids(data)
|
|
|
+ except Exception as e:
|
|
|
+ msg = f"加载/解析 output.json 失败: {e}"
|
|
|
+ logger.error(msg, exc_info=True)
|
|
|
+ return ToolResult(
|
|
|
+ title="根据抖音内容创建爬取计划失败",
|
|
|
output="",
|
|
|
- error="content_ids 参数无效: content_ids必须是列表"
|
|
|
+ error=msg,
|
|
|
+ )
|
|
|
+
|
|
|
+ if not content_ids:
|
|
|
+ return ToolResult(
|
|
|
+ title="根据抖音内容创建爬取计划失败",
|
|
|
+ output="",
|
|
|
+ error="未在 output.json.contents 中找到有效 aweme_id",
|
|
|
)
|
|
|
if len(content_ids) > 100:
|
|
|
- logger.error(f"create_crawler_plan_by_douyin_content_id invalid content_ids length. content_ids.length: {len(content_ids)}")
|
|
|
+ logger.error(
|
|
|
+ "create_crawler_plan_by_douyin_content_id invalid content_ids length. "
|
|
|
+ f"content_ids.length: {len(content_ids)}"
|
|
|
+ )
|
|
|
return ToolResult(
|
|
|
- title="根据抖音内容ID创建爬取计划失败",
|
|
|
+ title="根据抖音内容创建爬取计划失败",
|
|
|
output="",
|
|
|
- error=f"content_ids 长度异常: 期望1~100, 实际{len(content_ids)}"
|
|
|
+ error=f"content_ids 长度异常: 期望1~100, 实际{len(content_ids)}",
|
|
|
)
|
|
|
+
|
|
|
+ produce_plan_ids = _get_produce_plan_ids_from_env()
|
|
|
dt = datetime.now().strftime("%Y%m%d%h%M%s")
|
|
|
crawler_plan_name = f"【内容寻找Agent自动创建】抖音视频直接抓取-{dt}-抖音"
|
|
|
params = {
|
|
|
@@ -242,6 +304,10 @@ async def create_crawler_plan_by_douyin_content_id(
|
|
|
summary_lines.append(f" 抖音视频IDs: {','.join(content_ids)}")
|
|
|
summary_lines.append(f" 爬取计划ID: {crawler_plan_id}")
|
|
|
produce_plan_infos: List[Dict[str, str]] = []
|
|
|
+ db_updated_rows = 0
|
|
|
+ # 环境里的生成计划 ID(字符串);与是否执行绑定接口无关,用于写库
|
|
|
+ env_produce_plan_id = (produce_plan_ids[0] if produce_plan_ids else "").strip()
|
|
|
+
|
|
|
if produce_plan_ids:
|
|
|
input_source_info = {
|
|
|
"contentType": 1,
|
|
|
@@ -260,6 +326,18 @@ async def create_crawler_plan_by_douyin_content_id(
|
|
|
summary_lines.append(f" 绑定结果: {'绑定成功' if not produce_plan_info.get('msg') else '绑定失败'}")
|
|
|
summary_lines.append(f" 信息: {produce_plan_info.get('msg', '成功')}")
|
|
|
|
|
|
+ # 爬取计划 id 与生成计划 id 任一存在则写库(不依赖是否已配置 produce_plan_ids 去走绑定)
|
|
|
+ if (crawler_plan_id or "").strip() or env_produce_plan_id:
|
|
|
+ try:
|
|
|
+ db_updated_rows = update_content_plan_ids(
|
|
|
+ trace_id=trace_id,
|
|
|
+ aweme_ids=content_ids,
|
|
|
+ crawler_plan_id=crawler_plan_id or "",
|
|
|
+ produce_plan_id=env_produce_plan_id,
|
|
|
+ )
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"update content plan ids failed: {e}", exc_info=True)
|
|
|
+
|
|
|
return ToolResult(
|
|
|
title="根据抖音内容ID创建爬取计划",
|
|
|
output="\n".join(summary_lines),
|
|
|
@@ -278,7 +356,8 @@ async def create_crawler_plan_by_douyin_content_id(
|
|
|
}
|
|
|
for produce_plan_info in produce_plan_infos
|
|
|
]
|
|
|
- }
|
|
|
+ },
|
|
|
+ "db": {"updated_rows": db_updated_rows},
|
|
|
},
|
|
|
long_term_memory="Create crawler plan by DouYin Content IDs",
|
|
|
)
|