|
|
@@ -9,9 +9,8 @@ from __future__ import annotations
|
|
|
import json
|
|
|
import logging
|
|
|
import os
|
|
|
-from datetime import datetime, timezone
|
|
|
from pathlib import Path
|
|
|
-from typing import Any, Dict, List, Optional, Sequence, Tuple
|
|
|
+from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
|
|
from agent.tools import tool, ToolResult
|
|
|
from utils.tool_logging import format_tool_result_for_log, log_tool_call
|
|
|
@@ -67,158 +66,227 @@ def _load_output_json(*, trace_id: str) -> Optional[Dict[str, Any]]:
|
|
|
return data if isinstance(data, dict) else None
|
|
|
|
|
|
|
|
|
-def _extract_get_video_topic_videos(*, trace_id: str) -> List[Dict[str, Any]]:
|
|
|
+def _extract_contents(*, trace_id: str) -> List[Dict[str, Any]]:
|
|
|
"""
|
|
|
- 从 log.txt 中提取 get_video_topic 的返回 metadata.videos(原始选题点)。
|
|
|
-
|
|
|
- 期望日志片段形态(render_log_html 同源格式):
|
|
|
- [FOLD:🔧 工具调用:get_video_topic ...]
|
|
|
- ...
|
|
|
- [FOLD:📤 返回内容]
|
|
|
- <json array>
|
|
|
- [/FOLD]
|
|
|
+ 从 output.json 读取最终入选 contents。
|
|
|
+
|
|
|
+ 约定:
|
|
|
+ - 只允许对 output.json.contents 内的 aweme_id 生成/写入 process_trace rows
|
|
|
"""
|
|
|
- log_path = _output_dir_path() / trace_id / "log.txt"
|
|
|
- try:
|
|
|
- text = log_path.read_text(encoding="utf-8")
|
|
|
- except FileNotFoundError:
|
|
|
- return []
|
|
|
- except Exception:
|
|
|
- logger.warning("读取 log.txt 失败: %s", str(log_path), exc_info=True)
|
|
|
+ output_json = _load_output_json(trace_id=trace_id) or {}
|
|
|
+ contents = output_json.get("contents")
|
|
|
+ if not isinstance(contents, list):
|
|
|
return []
|
|
|
+ out: List[Dict[str, Any]] = []
|
|
|
+ for item in contents:
|
|
|
+ if isinstance(item, dict):
|
|
|
+ out.append(item)
|
|
|
+ return out
|
|
|
|
|
|
- marker = "[FOLD:🔧 工具调用:get_video_topic"
|
|
|
- start = text.find(marker)
|
|
|
- if start < 0:
|
|
|
- return []
|
|
|
- snippet = text[start:]
|
|
|
|
|
|
- out_marker = "[FOLD:📤 返回内容]"
|
|
|
- out_start = snippet.find(out_marker)
|
|
|
- if out_start < 0:
|
|
|
- return []
|
|
|
- after = snippet[out_start + len(out_marker) :]
|
|
|
+def _map_strategy_type(value: Any) -> str:
|
|
|
+ v = str(value or "").strip()
|
|
|
+ if v in ("case_based", "case", "case出发"):
|
|
|
+ return "case出发"
|
|
|
+ if v in ("feature_based", "feature", "特征出发"):
|
|
|
+ return "特征出发"
|
|
|
+ return v
|
|
|
+
|
|
|
+
|
|
|
+def _map_channel(value: Any) -> str:
|
|
|
+ v = str(value or "").strip()
|
|
|
+ mapping = {
|
|
|
+ "search": "抖音搜索",
|
|
|
+ "author": "订阅账号",
|
|
|
+ "ranking": "榜单",
|
|
|
+ "other": "其他",
|
|
|
+ "抖音搜索": "抖音搜索",
|
|
|
+ "订阅账号": "订阅账号",
|
|
|
+ "榜单": "榜单",
|
|
|
+ "其他": "其他",
|
|
|
+ }
|
|
|
+ return mapping.get(v, v)
|
|
|
+
|
|
|
+
|
|
|
+def _map_decision_basis(value: Any) -> str:
|
|
|
+ v = str(value or "").strip()
|
|
|
+ mapping = {
|
|
|
+ "content_portrait": "内容画像匹配",
|
|
|
+ "author_portrait": "作者画像匹配",
|
|
|
+ "demand_filtering": "需求筛选",
|
|
|
+ "other": "其他",
|
|
|
+ "画像缺失": "画像缺失",
|
|
|
+ "内容画像匹配": "内容画像匹配",
|
|
|
+ "作者画像匹配": "作者画像匹配",
|
|
|
+ "需求筛选": "需求筛选",
|
|
|
+ "其他": "其他",
|
|
|
+ }
|
|
|
+ return mapping.get(v, v)
|
|
|
|
|
|
- json_start = after.find("[")
|
|
|
- if json_start < 0:
|
|
|
- return []
|
|
|
- json_end = after.find("[/FOLD]")
|
|
|
- if json_end < 0:
|
|
|
- return []
|
|
|
|
|
|
- raw = after[json_start:json_end].strip()
|
|
|
- try:
|
|
|
- parsed = json.loads(raw)
|
|
|
- except Exception:
|
|
|
- logger.warning("解析 get_video_topic 返回 JSON 失败", exc_info=True)
|
|
|
- return []
|
|
|
+def _infer_decision_basis_from_output_content(content: Dict[str, Any]) -> str:
|
|
|
+ portrait = content.get("portrait_data") or {}
|
|
|
+ source = str(portrait.get("source") or "").strip()
|
|
|
+ if source == "content_like":
|
|
|
+ return "内容画像匹配"
|
|
|
+ if source == "account_fans":
|
|
|
+ return "作者画像匹配"
|
|
|
+ if source == "none":
|
|
|
+ return "画像缺失"
|
|
|
+ return ""
|
|
|
|
|
|
- return parsed if isinstance(parsed, list) else []
|
|
|
|
|
|
+def _build_base_row(*, trace_id: str, content: Dict[str, Any], input_features: List[str], query: str) -> Dict[str, Any]:
|
|
|
+ return {
|
|
|
+ "trace_id": trace_id,
|
|
|
+ "aweme_id": str(content.get("aweme_id") or "").strip(),
|
|
|
+ "title": str(content.get("title") or "").strip(),
|
|
|
+ "author_nickname": str(content.get("author_nickname") or "").strip(),
|
|
|
+ "strategy_type": "",
|
|
|
+ "from_case_aweme_id": "",
|
|
|
+ "from_case_point": "",
|
|
|
+ "from_feature": "",
|
|
|
+ "search_keyword": str(query or "").strip(),
|
|
|
+ "channel": "抖音搜索",
|
|
|
+ "decision_basis": _infer_decision_basis_from_output_content(content),
|
|
|
+ "decision_notes": str(content.get("reason") or "").strip(),
|
|
|
+ "input_features": input_features,
|
|
|
+ }
|
|
|
|
|
|
-def _flatten_case_points_text(video: Dict[str, Any]) -> str:
|
|
|
- tp = video.get("选题点")
|
|
|
- if not isinstance(tp, dict):
|
|
|
- return ""
|
|
|
- tokens: List[str] = []
|
|
|
- for k in ("灵感点", "目的点", "关键点"):
|
|
|
- v = tp.get(k)
|
|
|
- if isinstance(v, list):
|
|
|
- for x in v:
|
|
|
- if isinstance(x, str) and x.strip():
|
|
|
- tokens.append(x.strip())
|
|
|
- return " ".join(tokens)
|
|
|
|
|
|
+_ROW_KEYS: Tuple[str, ...] = (
|
|
|
+ "trace_id",
|
|
|
+ "aweme_id",
|
|
|
+ "title",
|
|
|
+ "author_nickname",
|
|
|
+ "strategy_type",
|
|
|
+ "from_case_aweme_id",
|
|
|
+ "from_case_point",
|
|
|
+ "from_feature",
|
|
|
+ "search_keyword",
|
|
|
+ "channel",
|
|
|
+ "decision_basis",
|
|
|
+ "decision_notes",
|
|
|
+ "input_features",
|
|
|
+)
|
|
|
|
|
|
-def _score_match(*, row_text: str, candidate_text: str) -> int:
|
|
|
- """
|
|
|
- 简单可控的匹配评分:按“子串命中次数”计分,避免引入分词依赖。
|
|
|
- """
|
|
|
- rt = (row_text or "").strip()
|
|
|
- ct = (candidate_text or "").strip()
|
|
|
- if not rt or not ct:
|
|
|
- return 0
|
|
|
- score = 0
|
|
|
- for token in _split_input_features(rt):
|
|
|
- if token and token in ct:
|
|
|
- score += 2
|
|
|
- # 再做一次整体包含(更强信号)
|
|
|
- if rt and rt in ct:
|
|
|
- score += 3
|
|
|
- return score
|
|
|
-
|
|
|
-
|
|
|
-def _pick_best_case_video(
|
|
|
- *, row: Dict[str, Any], case_videos: Sequence[Dict[str, Any]]
|
|
|
-) -> Optional[Dict[str, Any]]:
|
|
|
- if not case_videos:
|
|
|
- return None
|
|
|
- row_text = " ".join(
|
|
|
- [
|
|
|
- str(row.get("from_case_point") or ""),
|
|
|
- str(row.get("search_keyword") or ""),
|
|
|
- str(row.get("title") or ""),
|
|
|
- ]
|
|
|
- ).strip()
|
|
|
- scored: List[Tuple[int, int]] = []
|
|
|
- for i, v in enumerate(case_videos):
|
|
|
- scored.append((_score_match(row_text=row_text, candidate_text=_flatten_case_points_text(v)), i))
|
|
|
- scored.sort(reverse=True)
|
|
|
- best_score, best_idx = scored[0]
|
|
|
- # 低于 1 视为“不确定”,但仍给出一个稳定的默认(第一个)
|
|
|
- if best_score <= 0:
|
|
|
- return case_videos[0]
|
|
|
- return case_videos[best_idx]
|
|
|
+
|
|
|
+def _sanitize_row(row: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
+ """只保留固定字段,并把枚举值规范成中文。"""
|
|
|
+ out: Dict[str, Any] = {k: row.get(k, "") for k in _ROW_KEYS}
|
|
|
+ out["strategy_type"] = _map_strategy_type(out.get("strategy_type"))
|
|
|
+ out["channel"] = _map_channel(out.get("channel"))
|
|
|
+ out["decision_basis"] = _map_decision_basis(out.get("decision_basis"))
|
|
|
+ # input_features 规范为 list[str]
|
|
|
+ feats = out.get("input_features")
|
|
|
+ if isinstance(feats, list):
|
|
|
+ out["input_features"] = [str(x).strip() for x in feats if str(x).strip()]
|
|
|
+ elif isinstance(feats, str):
|
|
|
+ out["input_features"] = _split_input_features(feats)
|
|
|
+ else:
|
|
|
+ out["input_features"] = []
|
|
|
+ return out
|
|
|
|
|
|
|
|
|
def _normalize_payload(*, trace_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
- rows = payload.get("rows")
|
|
|
- if not isinstance(rows, list):
|
|
|
- return payload
|
|
|
+ # tool 只做最小职责:过滤/补全/规范化;复杂推理由 skill 生成 summary_json 来完成
|
|
|
+ raw_rows = payload.get("rows")
|
|
|
+ rows_in_payload: List[Dict[str, Any]] = []
|
|
|
+ if isinstance(raw_rows, list):
|
|
|
+ for item in raw_rows:
|
|
|
+ if isinstance(item, dict):
|
|
|
+ rows_in_payload.append(item)
|
|
|
|
|
|
output_json = _load_output_json(trace_id=trace_id) or {}
|
|
|
- input_features = _split_input_features(str(output_json.get("query") or ""))
|
|
|
- case_videos = _extract_get_video_topic_videos(trace_id=trace_id)
|
|
|
+ query = str(output_json.get("query") or "").strip()
|
|
|
+ input_features = _split_input_features(query)
|
|
|
+ contents = _extract_contents(trace_id=trace_id)
|
|
|
+ contents_by_aweme_id: Dict[str, Dict[str, Any]] = {
|
|
|
+ str(c.get("aweme_id") or "").strip(): c for c in contents if str(c.get("aweme_id") or "").strip()
|
|
|
+ }
|
|
|
|
|
|
- normalized_rows: List[Any] = []
|
|
|
- for item in rows:
|
|
|
- if not isinstance(item, dict):
|
|
|
- normalized_rows.append(item)
|
|
|
+ # 先把 payload rows 归并到 aweme_id
|
|
|
+ payload_by_aweme_id: Dict[str, Dict[str, Any]] = {}
|
|
|
+ for r in rows_in_payload:
|
|
|
+ aweme_id = str(r.get("aweme_id") or r.get("awemeId") or "").strip()
|
|
|
+ if not aweme_id:
|
|
|
continue
|
|
|
- row = dict(item)
|
|
|
-
|
|
|
- # 1) 每条视频都体现原始输入特征词
|
|
|
- if "input_features" not in row:
|
|
|
- row["input_features"] = input_features
|
|
|
-
|
|
|
- # 2) from_case_point:尽量输出“原始选题点信息”,而不是联想词
|
|
|
- if "from_case_point" in row and case_videos:
|
|
|
- original = _pick_best_case_video(row=row, case_videos=case_videos)
|
|
|
- if isinstance(original, dict) and isinstance(original.get("选题点"), dict):
|
|
|
- # 保留模型原先写的联想/归类结果,便于排查,但不作为主字段
|
|
|
- if isinstance(row.get("from_case_point"), str) and row.get("from_case_point"):
|
|
|
- row["from_case_point_guess"] = row["from_case_point"]
|
|
|
- row["from_case_point"] = original.get("选题点")
|
|
|
- if "from_case_aweme_id" not in row:
|
|
|
- row["from_case_aweme_id"] = str(original.get("id") or "").strip() or None
|
|
|
-
|
|
|
- normalized_rows.append(row)
|
|
|
-
|
|
|
- out = dict(payload)
|
|
|
- out["rows"] = normalized_rows
|
|
|
- return out
|
|
|
+ payload_by_aweme_id[aweme_id] = dict(r)
|
|
|
+
|
|
|
+ # 只允许 payload 覆盖“策略/来源/解释”字段,避免覆盖 output.json.contents 的身份字段(title/author 等)
|
|
|
+ allowed_payload_keys: set[str] = {
|
|
|
+ "strategy_type",
|
|
|
+ "from_case_aweme_id",
|
|
|
+ "from_case_point",
|
|
|
+ "from_feature",
|
|
|
+ "search_keyword",
|
|
|
+ "channel",
|
|
|
+ "decision_basis",
|
|
|
+ "decision_notes",
|
|
|
+ "input_features",
|
|
|
+ }
|
|
|
+
|
|
|
+ # 兼容 payload 的常见别名/驼峰 key(模型输出不稳定时,尽量不丢信息)
|
|
|
+ alias_map: Dict[str, Tuple[str, ...]] = {
|
|
|
+ "strategy_type": ("strategy_type", "strategyType"),
|
|
|
+ "from_case_aweme_id": ("from_case_aweme_id", "fromCaseAwemeId", "case_aweme_id", "caseAwemeId"),
|
|
|
+ "from_case_point": ("from_case_point", "fromCasePoint", "case_point", "casePoint"),
|
|
|
+ "from_feature": ("from_feature", "fromFeature", "feature", "from_feature_name"),
|
|
|
+ "search_keyword": ("search_keyword", "searchKeyword", "keyword"),
|
|
|
+ "channel": ("channel", "source_channel", "sourceChannel", "source"),
|
|
|
+ "decision_basis": ("decision_basis", "decisionBasis"),
|
|
|
+ "decision_notes": ("decision_notes", "decisionNotes", "notes"),
|
|
|
+ "input_features": ("input_features", "inputFeatures"),
|
|
|
+ }
|
|
|
+
|
|
|
+ def _pick(provided: Dict[str, Any], key: str) -> Any:
|
|
|
+ for k in alias_map.get(key, (key,)):
|
|
|
+ if k in provided:
|
|
|
+ return provided.get(k)
|
|
|
+ return None
|
|
|
+
|
|
|
+ normalized: List[Dict[str, Any]] = []
|
|
|
+ for aweme_id, content in contents_by_aweme_id.items():
|
|
|
+ base = _build_base_row(trace_id=trace_id, content=content, input_features=input_features, query=query)
|
|
|
+ provided = payload_by_aweme_id.get(aweme_id) or {}
|
|
|
+
|
|
|
+ merged = dict(base)
|
|
|
+ # 只合并允许覆盖的字段
|
|
|
+ for k in allowed_payload_keys:
|
|
|
+ v = _pick(provided, k)
|
|
|
+ if v is not None:
|
|
|
+ merged[k] = v
|
|
|
+
|
|
|
+ # 身份字段强制以 output.json.contents 为准(即使 payload 传了也不采纳)
|
|
|
+ merged["aweme_id"] = str(content.get("aweme_id") or "").strip()
|
|
|
+ merged["title"] = str(content.get("title") or "").strip()
|
|
|
+ merged["author_nickname"] = str(content.get("author_nickname") or "").strip()
|
|
|
+
|
|
|
+ # 如果缺失 input_features,用 query 拆分补齐
|
|
|
+ if "input_features" not in merged or not merged.get("input_features"):
|
|
|
+ merged["input_features"] = input_features
|
|
|
+
|
|
|
+ normalized.append(_sanitize_row(merged))
|
|
|
+
|
|
|
+ # 保持稳定顺序:按 rank(若有)或 aweme_id
|
|
|
+ def _sort_key(r: Dict[str, Any]) -> Tuple[int, str]:
|
|
|
+ c = contents_by_aweme_id.get(str(r.get("aweme_id") or "").strip()) or {}
|
|
|
+ try:
|
|
|
+ rank = int(c.get("rank") or 0)
|
|
|
+ except Exception:
|
|
|
+ rank = 0
|
|
|
+ return (rank if rank > 0 else 10**9, str(r.get("aweme_id") or ""))
|
|
|
+
|
|
|
+ normalized.sort(key=_sort_key)
|
|
|
+ return {"rows": normalized}
|
|
|
|
|
|
|
|
|
def _write_process_trace(*, trace_id: str, payload: Dict[str, Any]) -> Path:
|
|
|
out_dir = _output_dir_path() / trace_id
|
|
|
out_dir.mkdir(parents=True, exist_ok=True)
|
|
|
path = out_dir / "process_trace.json"
|
|
|
- doc = {
|
|
|
- **payload,
|
|
|
- "schema_version": "1.0",
|
|
|
- "trace_id": trace_id,
|
|
|
- "generated_at": datetime.now(timezone.utc).isoformat(),
|
|
|
- }
|
|
|
+ # 输出格式收敛:只允许 {"rows": [...]}
|
|
|
+ doc = {"rows": payload.get("rows") or []}
|
|
|
with path.open("w", encoding="utf-8") as f:
|
|
|
json.dump(doc, f, ensure_ascii=False, indent=2)
|
|
|
return path
|
|
|
@@ -226,26 +294,24 @@ def _write_process_trace(*, trace_id: str, payload: Dict[str, Any]) -> Path:
|
|
|
|
|
|
@tool(
|
|
|
description=(
|
|
|
- "在**全部流程执行完毕之后**调用:把每条入选/候选内容的「选择策略」整理成表格 JSON,"
|
|
|
+ "在**全部流程执行完毕之后**调用:把每条最终入选内容的「选择策略」整理成表格 JSON,"
|
|
|
"写入当前任务的 output 目录下的 process_trace.json,便于后续复盘。"
|
|
|
- "参数 summary_json 为 JSON 字符串,可以是:"
|
|
|
- "1)数组:每一项是一行记录;会被包成 {\"rows\": [...]};"
|
|
|
- "2)对象:应包含 rows 字段,rows 为行列表。"
|
|
|
- "建议每行至少包含:strategy_type(\"case_based\" | \"feature_based\")、"
|
|
|
- "from_case_aweme_id / from_feature(来源 case 的选题点或特征)、"
|
|
|
- "search_keyword(使用的搜索词)、"
|
|
|
- "channel(\"search\" | \"author\" | \"ranking\" | \"other\" 等)、"
|
|
|
- "decision_basis(如 \"demand_filtering\" | \"content_portrait\" | \"author_portrait\" | \"other\")、"
|
|
|
- "decision_notes(自由文本补充原因)。"
|
|
|
+ "参数 summary_json 为 JSON 字符串,可以是数组或对象(对象需包含 rows)。"
|
|
|
+ "可选参数 log_path/log_text 用于传入本次运行日志(便于复盘留档/未来扩展)。"
|
|
|
),
|
|
|
)
|
|
|
-async def exec_summary(trace_id: str, summary_json: str) -> ToolResult:
|
|
|
- """
|
|
|
- Args:
|
|
|
- trace_id: 本次任务 trace_id(与 output.json 同目录)。
|
|
|
- summary_json: JSON 字符串。对象或数组均可;数组会包成 {\"rows\": [...] }。
|
|
|
- """
|
|
|
- call_params = {"trace_id": trace_id, "summary_json": "<json>"}
|
|
|
+async def exec_summary(
|
|
|
+ trace_id: str,
|
|
|
+ summary_json: str,
|
|
|
+ log_path: str = "",
|
|
|
+ log_text: str = "",
|
|
|
+) -> ToolResult:
|
|
|
+ call_params = {
|
|
|
+ "trace_id": trace_id,
|
|
|
+ "summary_json": "<json>",
|
|
|
+ "log_path": (log_path or "").strip(),
|
|
|
+ "log_text": "<text>",
|
|
|
+ }
|
|
|
tid = (trace_id or "").strip()
|
|
|
if not tid:
|
|
|
err = ToolResult(
|
|
|
@@ -289,7 +355,13 @@ async def exec_summary(trace_id: str, summary_json: str) -> ToolResult:
|
|
|
out = ToolResult(
|
|
|
title="过程摘要",
|
|
|
output=f"已写入 {path}",
|
|
|
- metadata={"ok": True, "trace_id": tid, "path": str(path)},
|
|
|
+ metadata={
|
|
|
+ "ok": True,
|
|
|
+ "trace_id": tid,
|
|
|
+ "path": str(path),
|
|
|
+ "log_path": (log_path or "").strip(),
|
|
|
+ "log_text_len": len((log_text or "").strip()),
|
|
|
+ },
|
|
|
)
|
|
|
log_tool_call(_LOG_LABEL, {"trace_id": tid}, format_tool_result_for_log(out))
|
|
|
return out
|