analyze_creation_pattern_v2.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Creation pattern analysis V2 (full pipeline).

Pipeline steps:
1. Data preparation: extract the data to analyze from the post graph + persona graph
2. Origin analysis: LLM scores the creative-origin candidates (new prompt)
3. Pattern derivation: iterative derivation based on co-occurrence relations
4. Next-step analysis: LLM infers which unknown points the creator thinks of next;
   steps 3 and 4 then repeat until every node is known or no progress is made

Input: post graph + persona graph
Output: full creation-pattern analysis result
"""
import asyncio
import json
import sys
from pathlib import Path
from typing import Dict, List, Optional, Set

# Add the project root to sys.path so that lib/ and script/ imports resolve
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from lib.llm_cached import analyze, LLMConfig, AnalyzeResult
from lib.my_trace import set_trace_smith as set_trace
from script.data_processing.path_config import PathConfig

# ===== Configuration =====
TASK_NAME = "creation_pattern_v2"  # LLM cache task name
OUTPUT_DIR_NAME = "creation_pattern_v2"  # output directory name
MATCH_SCORE_THRESHOLD = 0.8  # persona match score threshold
GLOBAL_RATIO_THRESHOLD = 0.8  # persona global-ratio threshold
ORIGIN_SCORE_THRESHOLD = 0.8  # origin score threshold
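# How the thresholds are used further down:
# - is_persona_constant(): a node is a persona constant when its best match score is
#   >= MATCH_SCORE_THRESHOLD and its category's persona-wide share is >= GLOBAL_RATIO_THRESHOLD;
# - analyze_origin(): origin candidates scoring >= ORIGIN_SCORE_THRESHOLD are marked as known;
# - derive_patterns(): only nodes whose match score clears MATCH_SCORE_THRESHOLD take part
#   in co-occurrence derivation.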

# ===== Data loading =====
def load_json(file_path: Path) -> Dict:
    """Load a JSON file."""
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def get_post_graph_files(config: PathConfig) -> List[Path]:
    """Return all post-graph files, sorted by name."""
    post_graph_dir = config.intermediate_dir / "post_graph"
    return sorted(post_graph_dir.glob("*_帖子图谱.json"))

# ===== Step 1: data preparation =====
def extract_post_detail(post_graph: Dict) -> Dict:
    """Extract post details from the post-graph meta."""
    meta = post_graph.get("meta", {})
    post_detail = meta.get("postDetail", {})
    return {
        "postId": meta.get("postId", ""),
        "postTitle": meta.get("postTitle", ""),
        "body_text": post_detail.get("body_text", ""),
        "images": post_detail.get("images", []),
        "video": post_detail.get("video"),
        "publish_time": post_detail.get("publish_time", ""),
        "like_count": post_detail.get("like_count", 0),
        "collect_count": post_detail.get("collect_count", 0),
    }

def extract_analysis_nodes(post_graph: Dict, persona_graph: Dict) -> tuple:
    """
    Extract the list of nodes to analyze.

    Nodes to analyze = inspiration points (灵感点) + purpose points (目的点) + key points (关键点).
    """
    nodes = post_graph.get("nodes", {})
    edges = post_graph.get("edges", {})
    persona_nodes = persona_graph.get("nodes", {})
    persona_index = persona_graph.get("index", {})

    # 1. Collect key-point info
    keypoints = {}
    for node_id, node in nodes.items():
        if node.get("type") == "标签" and node.get("dimension") == "关键点":
            keypoints[node_id] = {
                "名称": node.get("name", ""),
                "描述": node.get("detail", {}).get("description", ""),
            }

    # 2. Collect support (支撑) relations
    support_map = {}
    for edge_id, edge in edges.items():
        if edge.get("type") == "支撑":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            if source_id in keypoints:
                if target_id not in support_map:
                    support_map[target_id] = []
                support_map[target_id].append(keypoints[source_id])

    # 3. Collect association (关联) relations
    relation_map = {}
    for edge_id, edge in edges.items():
        if edge.get("type") == "关联":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            source_name = nodes.get(source_id, {}).get("name", "")
            target_name = nodes.get(target_id, {}).get("name", "")
            if source_id not in relation_map:
                relation_map[source_id] = []
            relation_map[source_id].append(target_name)
            if target_id not in relation_map:
                relation_map[target_id] = []
            relation_map[target_id].append(source_name)

    # 4. Collect persona matches
    match_map = {}
    persona_out_edges = persona_index.get("outEdges", {})

    def get_node_info(node_id: str) -> Optional[Dict]:
        """Return the standard info dict for a persona node."""
        node = persona_nodes.get(node_id, {})
        if not node:
            return None
        detail = node.get("detail", {})
        parent_path = detail.get("parentPath", [])
        return {
            "节点ID": node_id,
            "节点名称": node.get("name", ""),
            "节点分类": "/".join(parent_path) if parent_path else "",
            "节点维度": node.get("dimension", ""),
            "节点类型": node.get("type", ""),
            "人设全局占比": detail.get("probGlobal", 0),
            "父类下占比": detail.get("probToParent", 0),
        }

    def get_parent_category_id(node_id: str) -> Optional[str]:
        """Follow the 属于 (belongs-to) edge to find the parent category node ID."""
        belong_edges = persona_out_edges.get(node_id, {}).get("属于", [])
        for edge in belong_edges:
            target_id = edge.get("target", "")
            target_node = persona_nodes.get(target_id, {})
            if target_node.get("type") == "分类":
                return target_id
        return None

    for edge_id, edge in edges.items():
        if edge.get("type") == "匹配":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            if source_id.startswith("帖子:") and target_id.startswith("人设:"):
                match_score = edge.get("score", 0)
                persona_node = persona_nodes.get(target_id, {})
                if persona_node:
                    node_type = persona_node.get("type", "")
                    match_node_info = get_node_info(target_id)
                    if not match_node_info:
                        continue
                    if node_type == "标签":
                        category_id = get_parent_category_id(target_id)
                    else:
                        category_id = target_id
                    category_info = None
                    if category_id:
                        category_node = persona_nodes.get(category_id, {})
                        if category_node:
                            category_detail = category_node.get("detail", {})
                            category_path = category_detail.get("parentPath", [])
                            category_info = {
                                "节点ID": category_id,
                                "节点名称": category_node.get("name", ""),
                                "节点分类": "/".join(category_path) if category_path else "",
                                "节点维度": category_node.get("dimension", ""),
                                "节点类型": "分类",
                                "人设全局占比": category_detail.get("probGlobal", 0),
                                "父类下占比": category_detail.get("probToParent", 0),
                                "历史共现分类": [],
                            }
                            co_occur_edges = persona_out_edges.get(category_id, {}).get("分类共现", [])
                            co_occur_edges_sorted = sorted(co_occur_edges, key=lambda x: x.get("score", 0), reverse=True)
                            for co_edge in co_occur_edges_sorted[:5]:
                                co_target_id = co_edge.get("target", "")
                                co_score = co_edge.get("score", 0)
                                co_node = persona_nodes.get(co_target_id, {})
                                if co_node:
                                    co_detail = co_node.get("detail", {})
                                    co_path = co_detail.get("parentPath", [])
                                    category_info["历史共现分类"].append({
                                        "节点ID": co_target_id,
                                        "节点名称": co_node.get("name", ""),
                                        "节点分类": "/".join(co_path) if co_path else "",
                                        "节点维度": co_node.get("dimension", ""),
                                        "节点类型": "分类",
                                        "人设全局占比": co_detail.get("probGlobal", 0),
                                        "父类下占比": co_detail.get("probToParent", 0),
                                        "共现度": round(co_score, 4),
                                    })
                    if source_id not in match_map:
                        match_map[source_id] = []
                    match_map[source_id].append({
                        "匹配节点": match_node_info,
                        "匹配分数": round(match_score, 4),
                        "所属分类": category_info,
                    })

    # 5. Build the list of nodes to analyze
    analysis_nodes = []
    for node_id, node in nodes.items():
        if node.get("type") == "标签" and node.get("domain") == "帖子":
            dimension = node.get("dimension", "")
            if dimension in ["灵感点", "目的点", "关键点"]:
                match_info = match_map.get(node_id)
                analysis_nodes.append({
                    "节点ID": node_id,
                    "节点名称": node.get("name", ""),
                    "节点分类": node.get("category", ""),
                    "节点维度": dimension,
                    "节点类型": node.get("type", ""),
                    "节点描述": node.get("detail", {}).get("description", ""),
                    "人设匹配": match_info,
                })

    # 6. Build the relation list
    relation_list = []
    for edge_id, edge in edges.items():
        if edge.get("type") == "支撑":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            if source_id in keypoints:
                relation_list.append({
                    "来源节点": source_id,
                    "目标节点": target_id,
                    "关系类型": "支撑",
                })
    seen_relations = set()
    for edge_id, edge in edges.items():
        if edge.get("type") == "关联":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            key = tuple(sorted([source_id, target_id]))
            if key not in seen_relations:
                seen_relations.add(key)
                relation_list.append({
                    "来源节点": source_id,
                    "目标节点": target_id,
                    "关系类型": "关联",
                })

    return analysis_nodes, relation_list

def prepare_analysis_data(post_graph: Dict, persona_graph: Dict) -> Dict:
    """
    Prepare the full analysis payload.

    Output: a flat node list plus a separate persona co-occurrence map.
    """
    analysis_nodes, relation_list = extract_analysis_nodes(post_graph, persona_graph)

    # Flatten nodes and pull the persona co-occurrence data out into its own map
    flat_nodes = []
    persona_co_occur = {}  # {category ID: list of co-occurring categories}

    for node in analysis_nodes:
        # Base node fields
        flat_node = {
            "节点ID": node["节点ID"],
            "节点名称": node["节点名称"],
            "节点分类": node.get("节点分类", ""),
            "节点维度": node["节点维度"],
            "节点描述": node.get("节点描述", ""),
            "是否已知": False,
            "发现编号": None,
        }
        # Extract persona match info (a list, to support multiple matches)
        match_list = node.get("人设匹配") or []
        if match_list:
            flat_node["人设匹配"] = []
            for match_info in match_list:
                category_info = match_info.get("所属分类")
                category_id = category_info.get("节点ID") if category_info else None
                # Keep the full match info but drop 历史共现分类 (moved out below)
                clean_match = {
                    "匹配节点": match_info.get("匹配节点"),
                    "匹配分数": match_info.get("匹配分数", 0),
                }
                if category_info:
                    # Copy the category without its 历史共现分类 list
                    clean_category = {k: v for k, v in category_info.items() if k != "历史共现分类"}
                    clean_match["所属分类"] = clean_category
                flat_node["人设匹配"].append(clean_match)
                # Collect persona co-occurrence relations (deduplicated), split out from 历史共现分类
                if category_id and category_id not in persona_co_occur:
                    co_occur_list = category_info.get("历史共现分类", [])
                    if co_occur_list:
                        persona_co_occur[category_id] = [
                            {
                                "节点ID": c.get("节点ID"),
                                "节点名称": c.get("节点名称"),
                                "节点分类": c.get("节点分类", ""),
                                "节点维度": c.get("节点维度", ""),
                                "节点类型": c.get("节点类型", ""),
                                "人设全局占比": c.get("人设全局占比", 0),
                                "父类下占比": c.get("父类下占比", 0),
                                "共现度": c.get("共现度", 0),
                            }
                            for c in co_occur_list
                            if c.get("节点ID")
                        ]
        else:
            flat_node["人设匹配"] = []
        flat_nodes.append(flat_node)

    return {
        "帖子详情": extract_post_detail(post_graph),
        "节点列表": flat_nodes,
        "关系列表": relation_list,
        "人设共现关系": persona_co_occur,
    }
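# Illustrative shape of the dict returned by prepare_analysis_data (all values below are
# placeholders, not real data):
# {
#     "帖子详情": {"postId": "...", "postTitle": "...", "body_text": "...", ...},
#     "节点列表": [
#         {"节点ID": "帖子:...", "节点名称": "...", "节点维度": "灵感点",
#          "是否已知": False, "发现编号": None, "人设匹配": [...]},
#     ],
#     "关系列表": [{"来源节点": "帖子:...", "目标节点": "帖子:...", "关系类型": "支撑"}],
#     "人设共现关系": {"人设:...": [{"节点ID": "人设:...", "共现度": 0.42, ...}]},
# }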

# ===== Step 2: origin analysis (new prompt) =====
def get_best_match(node: Dict) -> Optional[Dict]:
    """Return the node's best persona match (highest score)."""
    match_list = node.get("人设匹配") or []
    if not match_list:
        return None
    return max(match_list, key=lambda m: m.get("匹配分数", 0))


def get_match_score(node: Dict) -> float:
    """Return the node's highest persona match score."""
    best_match = get_best_match(node)
    if best_match:
        return best_match.get("匹配分数", 0)
    return 0


def get_category_id(node: Dict) -> Optional[str]:
    """Return the category ID of the node's best match."""
    best_match = get_best_match(node)
    if best_match:
        category = best_match.get("所属分类")
        if category:
            return category.get("节点ID")
    return None


def get_all_category_ids(node: Dict) -> List[str]:
    """Return the category IDs of all of the node's matches."""
    match_list = node.get("人设匹配") or []
    result = []
    for m in match_list:
        category = m.get("所属分类")
        if category and category.get("节点ID"):
            result.append(category.get("节点ID"))
    return result


def get_category_global_ratio(node: Dict) -> float:
    """Return the persona-wide share of the node's best-match category."""
    best_match = get_best_match(node)
    if best_match:
        category = best_match.get("所属分类")
        if category:
            return category.get("人设全局占比", 0)
    return 0


def is_persona_constant(node: Dict) -> bool:
    """A node is a persona constant when match score >= 0.8 and the category's global ratio >= 0.8."""
    match_score = get_match_score(node)
    global_ratio = get_category_global_ratio(node)
    return match_score >= MATCH_SCORE_THRESHOLD and global_ratio >= GLOBAL_RATIO_THRESHOLD

def build_origin_context(nodes: List[Dict]) -> Dict:
    """Build the context for the LLM origin analysis (new format)."""
    # All creative tags
    all_tags = []
    for node in nodes:
        all_tags.append({
            "名称": node["节点名称"],
            "人设匹配度": round(get_match_score(node), 2),
            "所属分类全局占比": round(get_category_global_ratio(node), 2),
        })
    # Origin candidate set (inspiration points + purpose points)
    candidates = [
        node["节点名称"]
        for node in nodes
        if node["节点维度"] in ["灵感点", "目的点"]
    ]
    return {
        "all_tags": all_tags,
        "candidates": candidates,
    }

def format_origin_prompt(context: Dict) -> str:
    """Format the origin-analysis prompt (new version)."""
    all_tags = context["all_tags"]
    candidates = context["candidates"]
    # Creative tag list
    tags_text = ""
    for tag in all_tags:
        tags_text += f"- {tag['名称']}\n"
        tags_text += f"  人设匹配度: {tag['人设匹配度']} | 所属分类全局占比: {tag['所属分类全局占比']}\n\n"
    # Origin candidate set (single line)
    candidates_text = "、".join(candidates)
    prompt = f"""# Role
你是小红书爆款内容的"逆向工程"专家。你的核心能力是透过内容的表象,还原创作者最初的脑回路。
# Task
我提供一组笔记的【创意标签】和一个【起点候选集】。
请推理出哪些选项是真正的**创意起点**。
# Input Data
## 创意标签
{tags_text}
## 起点候选集
{candidates_text}
# 推理约束
- 无法被其他项或人设推理出的点,即为起点(推理关系局限在起点候选集中)
- 包含/被包含关系代表一种顺序:由大节点推导出被包含节点
- 目的推理手段
- 实质推理形式
- 和人设匹配度越低的帖子是起点概率越大,证明这个起点具备外部性
# Output Format
请输出一个标准的 JSON 格式。
- Key: 候选集中的词。
- Value: 一个对象,包含:
  - `score`: 0.0 到 1.0 的浮点数(代表是起点的可能性)。
  - `analysis`: 一句话推理"""
    return prompt
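# Expected shape of the LLM's JSON answer to the origin prompt, as consumed by
# analyze_origin() below (candidate names and wording here are placeholders):
# {
#     "某个候选词": {"score": 0.85, "analysis": "无法由其他候选或人设推出"},
#     "另一个候选词": {"score": 0.20, "analysis": "可由目的点推导得到"}
# }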

async def analyze_origin(nodes: List[Dict], force_llm: bool = False) -> Dict:
    """
    Run the origin analysis.

    Input: node list.
    Output: node list (with 起点分析 / 是否已知 / 发现编号 filled in) plus intermediate results.
    """
    context = build_origin_context(nodes)
    prompt = format_origin_prompt(context)
    print(f"\n 起点候选: {len(context['candidates'])} 个")
    result = await analyze(
        prompt=prompt,
        task_name=f"{TASK_NAME}/origin",
        force=force_llm,
        parse_json=True,
    )
    # Merge the LLM result back into the nodes
    llm_result = result.data or {}
    output_nodes = []
    current_order = 1  # discovery-order counter for known nodes
    for node in nodes:
        new_node = dict(node)  # copy the original node
        name = node["节点名称"]
        if name in llm_result:
            score = llm_result[name].get("score", 0)
            analysis = llm_result[name].get("analysis", "")
            # Attach the origin analysis
            new_node["起点分析"] = {
                "分数": score,
                "说明": analysis,
            }
            # High-scoring origins are marked as known
            if score >= ORIGIN_SCORE_THRESHOLD:
                new_node["是否已知"] = True
                new_node["发现编号"] = current_order
                current_order += 1
        else:
            new_node["起点分析"] = None
        output_nodes.append(new_node)
    return {
        "输入上下文": {
            "创意标签": context["all_tags"],
            "起点候选": context["candidates"],
        },
        "中间结果": llm_result,
        "输出节点": output_nodes,
        "cache_hit": result.cache_hit,
        "model": result.model_name,
        "log_url": result.log_url,
    }

# ===== Step 3: pattern derivation =====
def derive_patterns(
    nodes: List[Dict],
    persona_co_occur: Dict[str, Dict],
) -> Dict:
    """
    Iterative derivation based on co-occurrence relations.

    Input: node list with origin analysis + persona co-occurrence data.
    Output: node list (with 是否已知 / 发现编号 updated) + list of derived edges + number of rounds.
    """
    node_by_name: Dict[str, Dict] = {n["节点名称"]: n for n in nodes}
    # Build the co-occurrence lookup {category ID: {co-occurring category ID: co-occurrence score}}
    co_occur_lookup = {}
    for cat_id, co_occur_list in persona_co_occur.items():
        co_occur_lookup[cat_id] = {
            c["节点ID"]: c["共现度"]
            for c in co_occur_list
        }

    # 1. Initialize the known set (nodes already marked as known)
    known_names: Set[str] = set()
    node_round: Dict[str, int] = {}  # {node name: round in which it became known}
    for node in nodes:
        if node.get("是否已知"):
            known_names.add(node["节点名称"])
            node_round[node["节点名称"]] = 0
    unknown_names: Set[str] = set(node_by_name.keys()) - known_names
    edges: List[Dict] = []

    # 2. Iterative derivation
    round_num = 0
    new_known_this_round = known_names.copy()
    while new_known_this_round:
        round_num += 1
        new_known_next_round: Set[str] = set()
        for known_name in new_known_this_round:
            known_node = node_by_name.get(known_name)
            if not known_node:
                continue
            if get_match_score(known_node) < MATCH_SCORE_THRESHOLD:
                continue
            # Look up the co-occurrence list of the known node's category
            known_cat_id = get_category_id(known_node)
            if not known_cat_id or known_cat_id not in co_occur_lookup:
                continue
            co_occur_map = co_occur_lookup[known_cat_id]
            for unknown_name in list(unknown_names):
                unknown_node = node_by_name.get(unknown_name)
                if not unknown_node:
                    continue
                if get_match_score(unknown_node) < MATCH_SCORE_THRESHOLD:
                    continue
                # Check whether the unknown node's category is in the known node's co-occurrence list
                unknown_cat_id = get_category_id(unknown_node)
                if unknown_cat_id and unknown_cat_id in co_occur_map:
                    co_occur_score = co_occur_map[unknown_cat_id]
                    new_known_next_round.add(unknown_name)
                    node_round[unknown_name] = round_num
                    edges.append({
                        "来源": known_node["节点ID"],
                        "目标": unknown_node["节点ID"],
                        "关系类型": "共现推导",
                        "推导轮次": round_num,
                        "共现分类ID": unknown_cat_id,
                        "共现度": co_occur_score,
                    })
        known_names.update(new_known_next_round)
        unknown_names -= new_known_next_round
        new_known_this_round = new_known_next_round
        if not new_known_next_round:
            break

    # 3. Build output nodes (only 是否已知 and 发现编号 are updated)
    # First find the current maximum discovery number
    max_order = 0
    for node in nodes:
        if node.get("发现编号") and node["发现编号"] > max_order:
            max_order = node["发现编号"]
    # Group newly discovered nodes by derivation round, then assign discovery numbers
    new_known_by_round = {}
    for name, r in node_round.items():
        if r > 0:  # skip origins (round 0)
            if r not in new_known_by_round:
                new_known_by_round[r] = []
            new_known_by_round[r].append(name)
    # Assign discovery numbers
    order_map = {}
    current_order = max_order + 1
    for r in sorted(new_known_by_round.keys()):
        for name in new_known_by_round[r]:
            order_map[name] = current_order
            current_order += 1
    output_nodes = []
    for node in nodes:
        new_node = dict(node)
        name = node["节点名称"]
        # Newly derived nodes (non-origins) get their known flag and discovery number updated
        if name in node_round and node_round[name] > 0:
            new_node["是否已知"] = True
            new_node["发现编号"] = order_map.get(name)
        output_nodes.append(new_node)
    return {
        "输出节点": output_nodes,
        "推导边列表": edges,
        "推导轮次": round_num,
    }
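# Sketch of one derivation round (hypothetical data): if a known node's best-match category
# is "人设:A" and the persona co-occurrence map lists "人设:B" under it with 共现度 0.6, then a
# still-unknown node whose best-match category is "人设:B" (and whose match score clears
# MATCH_SCORE_THRESHOLD) becomes known this round, and an edge
# {"关系类型": "共现推导", "推导轮次": 1, "共现度": 0.6} is recorded.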

# ===== Step 4: next-step analysis =====
def build_next_step_context(known_nodes: List[Dict], unknown_nodes: List[Dict], all_nodes: List[Dict]) -> Dict:
    """Build the context for the next-step analysis."""
    # Known nodes (sorted by discovery order)
    known_sorted = sorted(known_nodes, key=lambda n: n.get("发现编号") or 999)
    known_info = []
    for n in known_sorted:
        info = {
            "名称": n["节点名称"],
            "维度": n["节点维度"],
            "分类": n.get("节点分类", ""),
            "描述": n.get("节点描述", ""),
            "人设匹配度": round(get_match_score(n), 2),
            "人设全局占比": round(get_category_global_ratio(n), 2),
            "发现编号": n.get("发现编号"),
        }
        # Attach the origin explanation if present
        if n.get("起点分析"):
            info["起点说明"] = n["起点分析"].get("说明", "")
        known_info.append(info)
    # Unknown nodes
    unknown_info = []
    for n in unknown_nodes:
        unknown_info.append({
            "名称": n["节点名称"],
            "维度": n["节点维度"],
            "分类": n.get("节点分类", ""),
            "描述": n.get("节点描述", ""),
            "人设匹配度": round(get_match_score(n), 2),
            "人设全局占比": round(get_category_global_ratio(n), 2),
        })
    # Persona constants (filtered from all nodes)
    constants = [
        n["节点名称"]
        for n in all_nodes
        if is_persona_constant(n)
    ]
    return {
        "known_nodes": known_info,
        "unknown_nodes": unknown_info,
        "constants": constants,
    }

def format_next_step_prompt(context: Dict) -> str:
    """Format the next-step analysis prompt."""
    known_text = ""
    for i, n in enumerate(context["known_nodes"], 1):
        known_text += f"{i}. {n['名称']} ({n['维度']})\n"
        known_text += f"  分类: {n['分类']}\n"
        known_text += f"  描述: {n['描述']}\n"
        known_text += f"  人设匹配度: {n['人设匹配度']} | 人设全局占比: {n['人设全局占比']}\n"
        if n.get("起点说明"):
            known_text += f"  起点说明: {n['起点说明']}\n"
        known_text += "\n"
    unknown_text = ""
    for n in context["unknown_nodes"]:
        unknown_text += f"- {n['名称']} ({n['维度']})\n"
        unknown_text += f"  分类: {n['分类']}\n"
        unknown_text += f"  描述: {n['描述']}\n"
        unknown_text += f"  人设匹配度: {n['人设匹配度']} | 人设全局占比: {n['人设全局占比']}\n\n"
    constants = context.get("constants", [])
    constants_text = "、".join(constants) if constants else "无"
    prompt = f"""# Role
你是小红书爆款内容的"逆向工程"专家。你的任务是还原创作者的思维路径。
# Task
基于已知的创意点,推理哪些未知点最可能是创作者**下一步直接想到**的点。
可以有多个点同时被想到(如果它们在逻辑上是并列的)。
## 已知点(按发现顺序)
{known_text}
## 未知点(待推理)
{unknown_text}
## 人设常量
{constants_text}
# 推理约束
1. 创作者的思维是有逻辑的:先有动机/目的,再想形式/手法
2. 关键点通常是为了支撑灵感点或目的点
3. 人设常量是创作者固有的风格,不需要推理
4. 只输出"下一步直接能想到"的点,不是所有未知点
# Output Format
输出 JSON,对每个未知点评分:
- Key: 未知点名称
- Value: 对象,包含:
  - `score`: 0.0-1.0(下一步被想到的可能性)
  - `from`: 从哪个已知点推导出来(已知点名称)
  - `reason`: 如何从该已知点推导出来(一句话)"""
    return prompt
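# Expected shape of the LLM's JSON answer to the next-step prompt, as consumed by
# analyze_next_step() below (names and wording are placeholders):
# {
#     "某个未知点": {"score": 0.9, "from": "某个已知点", "reason": "为支撑该目的而想到的形式"}
# }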

async def analyze_next_step(
    nodes: List[Dict],
    force_llm: bool = False
) -> Dict:
    """
    Run the next-step analysis.

    Input: node list (mix of known and unknown nodes).
    Output: the most likely next-step candidates.
    """
    # Split known and unknown nodes
    known_nodes = [n for n in nodes if n.get("是否已知")]
    unknown_nodes = [n for n in nodes if not n.get("是否已知")]
    if not unknown_nodes:
        # Nothing left to infer; keep the same keys the callers expect
        return {
            "输入上下文": {"已知点": [], "未知点": [], "人设常量": []},
            "中间结果": [],
            "下一步候选": [],
        }
    context = build_next_step_context(known_nodes, unknown_nodes, nodes)
    prompt = format_next_step_prompt(context)
    print(f"\n 已知点: {len(known_nodes)} 个")
    print(f" 未知点: {len(unknown_nodes)} 个")
    result = await analyze(
        prompt=prompt,
        task_name=f"{TASK_NAME}/next_step",
        force=force_llm,
        parse_json=True,
    )
    # Parse the result (now a {name: {score, from, reason}} mapping)
    llm_result = result.data or {}
    # Build the candidate list, sorted by score
    candidates = []
    for name, info in llm_result.items():
        candidates.append({
            "节点名称": name,
            "可能性分数": info.get("score", 0),
            "推导来源": info.get("from", ""),
            "推理说明": info.get("reason", ""),
        })
    candidates.sort(key=lambda x: x["可能性分数"], reverse=True)
    return {
        "输入上下文": {
            "已知点": context["known_nodes"],
            "未知点": context["unknown_nodes"],
            "人设常量": context["constants"],
        },
        "中间结果": llm_result,
        "下一步候选": candidates,
        "cache_hit": result.cache_hit,
        "model": result.model_name,
        "log_url": result.log_url,
    }

# ===== Full pipeline =====
def save_result(post_id: str, post_detail: Dict, steps: List, config: PathConfig) -> Path:
    """Save the result to a file."""
    output_dir = config.intermediate_dir / OUTPUT_DIR_NAME
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{post_id}_创作模式.json"
    result = {
        "帖子详情": post_detail,
        "步骤列表": steps,
    }
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f" [已保存] {output_file.name}")
    return output_file

async def process_single_post(
    post_file: Path,
    persona_graph: Dict,
    config: PathConfig,
    force_llm: bool = False,
    max_step: int = 3,
) -> Dict:
    """
    Process a single post.

    Args:
        force_llm: force a fresh LLM call (skip the LLM cache)
        max_step: run up to this step (1=data preparation, 2=origin analysis,
            3=pattern derivation, 4=next-step analysis, 5=full loop)
    """
    post_graph = load_json(post_file)
    post_id = post_graph.get("meta", {}).get("postId", "unknown")
    print(f"\n{'=' * 60}")
    print(f"处理帖子: {post_id}")
    print("-" * 60)
    steps = []

    # ===== Step 1: data preparation =====
    print("\n[步骤1] 数据准备...")
    data = prepare_analysis_data(post_graph, persona_graph)
    post_detail = data["帖子详情"]
    nodes_step1 = data["节点列表"]
    relations_step1 = data["关系列表"]
    persona_co_occur = data["人设共现关系"]
    # Nodes already marked as known after step 1 (step 1 initializes 是否已知 to False, so normally empty)
    new_known_step1 = [n["节点名称"] for n in nodes_step1 if n.get("是否已知")]
    step1 = {
        "步骤": "数据准备",
        "输入": {
            "帖子图谱": str(post_file.name),
            "人设图谱": "人设图谱.json",
        },
        "输出": {
            "新的已知节点": new_known_step1,
            "新的边": [],
            "节点列表": nodes_step1,
            "边列表": relations_step1,
        },
        "人设共现关系": persona_co_occur,
        "摘要": {
            "节点数": len(nodes_step1),
            "边数": len(relations_step1),
            "人设共现数": len(persona_co_occur),
        },
    }
    steps.append(step1)
    print(f" 节点数: {len(nodes_step1)}")
    print(f" 关系数: {len(relations_step1)}")
    print(f" 人设共现数: {len(persona_co_occur)}")
    # Step 1 done, save
    save_result(post_id, post_detail, steps, config)
    if max_step == 1:
        return {"帖子详情": post_detail, "步骤列表": steps}

    # ===== Step 2: origin analysis =====
    print("\n[步骤2] 起点分析...")
    origin_result = await analyze_origin(nodes_step1, force_llm=force_llm)
    nodes_step2 = origin_result["输出节点"]

    # Collect high-scoring origins
    def get_origin_score(node):
        analysis = node.get("起点分析")
        if analysis:
            return analysis.get("分数", 0)
        return 0

    high_score_origins = [
        (n["节点名称"], get_origin_score(n))
        for n in nodes_step2
        if get_origin_score(n) >= 0.7
    ]
    # Nodes newly marked as known (origins)
    new_known_nodes = [n["节点名称"] for n in nodes_step2 if n.get("是否已知")]
    step2 = {
        "步骤": "起点分析",
        "输入": {
            "节点列表": nodes_step1,
            "创意标签": origin_result["输入上下文"]["创意标签"],
            "起点候选": origin_result["输入上下文"]["起点候选"],
        },
        "中间结果": origin_result["中间结果"],
        "输出": {
            "新的已知节点": new_known_nodes,
            "新的边": [],
            "节点列表": nodes_step2,
            "边列表": relations_step1,  # edges unchanged in this step
        },
        "摘要": {
            "新已知数": len(new_known_nodes),
            "model": origin_result["model"],
            "cache_hit": origin_result["cache_hit"],
            "log_url": origin_result.get("log_url"),
        },
    }
    steps.append(step2)
    print(f" 高分起点 (>=0.7): {len(high_score_origins)} 个")
    for name, score in sorted(high_score_origins, key=lambda x: -x[1]):
        print(f"  ★ {name}: {score:.2f}")
    # Step 2 done, save
    save_result(post_id, post_detail, steps, config)
    if max_step == 2:
        return {"帖子详情": post_detail, "步骤列表": steps}

    # ===== Step 3: pattern derivation =====
    print("\n[步骤3] 模式推导...")
    derivation_result = derive_patterns(nodes_step2, persona_co_occur)
    nodes_step3 = derivation_result["输出节点"]
    edges = derivation_result["推导边列表"]
    # Stats
    known_count = sum(1 for n in nodes_step3 if n.get("是否已知"))
    unknown_count = len(nodes_step3) - known_count
    # Nodes newly derived in this step (excluding the earlier origins)
    prev_known = {n["节点名称"] for n in nodes_step2 if n.get("是否已知")}
    new_known_nodes = [n["节点名称"] for n in nodes_step3 if n.get("是否已知") and n["节点名称"] not in prev_known]
    # Merge edge lists (original edges + derived edges)
    all_edges = relations_step1 + edges
    step3 = {
        "步骤": "模式推导",
        "输入": {
            "节点列表": nodes_step2,
            "人设共现关系": persona_co_occur,
        },
        "输出": {
            "新的已知节点": new_known_nodes,
            "新的边": edges,
            "节点列表": nodes_step3,
            "边列表": all_edges,
        },
        "摘要": {
            "已知点数": known_count,
            "新已知数": len(new_known_nodes),
            "新边数": len(edges),
            "未知点数": unknown_count,
        },
    }
    steps.append(step3)
    print(f" 已知点: {known_count} 个")
    print(f" 推导边: {len(edges)} 条")
    print(f" 未知点: {unknown_count} 个")
    # Step 3 done, save
    save_result(post_id, post_detail, steps, config)
    if max_step == 3:
        return {"帖子详情": post_detail, "步骤列表": steps}

    # ===== Step 4: next-step analysis =====
    print("\n[步骤4] 下一步分析...")
    next_step_result = await analyze_next_step(nodes_step3, force_llm=force_llm)
    # Candidate list
    candidates = next_step_result["下一步候选"]
    # Keep high-scoring candidates (>= 0.8)
    NEXT_STEP_THRESHOLD = 0.8
    high_score_candidates = [c for c in candidates if c["可能性分数"] >= NEXT_STEP_THRESHOLD]
    # Map node names to nodes
    node_by_name = {n["节点名称"]: n for n in nodes_step3}
    # Current maximum discovery number
    max_order = max((n.get("发现编号") or 0) for n in nodes_step3)
    # Update nodes: mark high-scoring candidates as known
    nodes_step4 = []
    new_known_names = []
    current_order = max_order + 1
    for node in nodes_step3:
        new_node = dict(node)
        name = node["节点名称"]
        # Is this node among the high-scoring candidates?
        matching = [c for c in high_score_candidates if c["节点名称"] == name]
        if matching and not node.get("是否已知"):
            new_node["是否已知"] = True
            new_node["发现编号"] = current_order
            current_order += 1
            new_known_names.append(name)
        nodes_step4.append(new_node)
    # Create new (derived) edges
    new_edges = []
    for c in high_score_candidates:
        target_node = node_by_name.get(c["节点名称"])
        source_name = c["推导来源"]
        source_node = node_by_name.get(source_name)
        if target_node and source_node:
            new_edges.append({
                "来源": source_node["节点ID"],
                "目标": target_node["节点ID"],
                "关系类型": "AI推导",
                "可能性分数": c["可能性分数"],
                "推理说明": c["推理说明"],
            })
    # Merge edge lists
    all_edges_step4 = all_edges + new_edges
    step4 = {
        "步骤": "下一步分析",
        "输入": {
            "已知点": next_step_result["输入上下文"]["已知点"],
            "未知点": next_step_result["输入上下文"]["未知点"],
            "人设常量": next_step_result["输入上下文"]["人设常量"],
        },
        "中间结果": next_step_result["中间结果"],
        "输出": {
            "新的已知节点": new_known_names,
            "新的边": new_edges,
            "节点列表": nodes_step4,
            "边列表": all_edges_step4,
        },
        "摘要": {
            "已知点数": sum(1 for n in nodes_step4 if n.get("是否已知")),
            "新已知数": len(new_known_names),
            "新边数": len(new_edges),
            "未知点数": sum(1 for n in nodes_step4 if not n.get("是否已知")),
            "model": next_step_result.get("model"),
            "cache_hit": next_step_result.get("cache_hit"),
            "log_url": next_step_result.get("log_url"),
        },
    }
    steps.append(step4)
    # Print high-scoring candidates
    print(f" 候选数: {len(candidates)} 个")
    print(f" 高分候选 (>={NEXT_STEP_THRESHOLD}): {len(high_score_candidates)} 个")
    for c in high_score_candidates:
        print(f"  ★ {c['节点名称']} ({c['可能性分数']:.2f}) ← {c['推导来源']}")
        print(f"    {c['推理说明']}")
    # Step 4 done, save
    save_result(post_id, post_detail, steps, config)
    if max_step == 4:
        return {"帖子详情": post_detail, "步骤列表": steps}

    # ===== Loop: step 3 → step 4 until everything is known =====
    iteration = 1
    current_nodes = nodes_step4
    current_edges = all_edges_step4
    MAX_ITERATIONS = 10  # guard against an infinite loop
    while True:
        # Any unknown nodes left?
        unknown_count = sum(1 for n in current_nodes if not n.get("是否已知"))
        if unknown_count == 0:
            print("\n[完成] 所有节点已变为已知")
            break
        if iteration > MAX_ITERATIONS:
            print(f"\n[警告] 达到最大迭代次数 {MAX_ITERATIONS},停止循环")
            break

        # ===== Iteration step 3: co-occurrence derivation =====
        print(f"\n[迭代{iteration}-步骤3] 模式推导...")
        derivation_result = derive_patterns(current_nodes, persona_co_occur)
        nodes_iter3 = derivation_result["输出节点"]
        edges_iter3 = derivation_result["推导边列表"]
        # Count what was newly derived
        prev_known_names = {n["节点名称"] for n in current_nodes if n.get("是否已知")}
        new_known_step3 = [n["节点名称"] for n in nodes_iter3 if n.get("是否已知") and n["节点名称"] not in prev_known_names]
        new_edges_step3 = edges_iter3  # derive_patterns returns only this round's new edges
        all_edges_iter3 = current_edges + new_edges_step3
        step_iter3 = {
            "步骤": f"迭代{iteration}-模式推导",
            "输入": {
                "节点列表": current_nodes,
                "人设共现关系": persona_co_occur,
            },
            "输出": {
                "新的已知节点": new_known_step3,
                "新的边": new_edges_step3,
                "节点列表": nodes_iter3,
                "边列表": all_edges_iter3,
            },
            "摘要": {
                "已知点数": sum(1 for n in nodes_iter3 if n.get("是否已知")),
                "新已知数": len(new_known_step3),
                "新边数": len(new_edges_step3),
                "未知点数": sum(1 for n in nodes_iter3 if not n.get("是否已知")),
            },
        }
        steps.append(step_iter3)
        print(f" 新已知: {len(new_known_step3)} 个")
        print(f" 新边: {len(new_edges_step3)} 条")
        save_result(post_id, post_detail, steps, config)
        # Any unknown nodes left after this derivation?
        unknown_after_step3 = sum(1 for n in nodes_iter3 if not n.get("是否已知"))
        if unknown_after_step3 == 0:
            print("\n[完成] 所有节点已变为已知")
            break

        # ===== Iteration step 4: AI derivation =====
        print(f"\n[迭代{iteration}-步骤4] 下一步分析...")
        next_step_result = await analyze_next_step(nodes_iter3, force_llm=force_llm)
        candidates_iter4 = next_step_result["下一步候选"]
        high_score_iter4 = [c for c in candidates_iter4 if c["可能性分数"] >= NEXT_STEP_THRESHOLD]
        # Update nodes
        node_by_name_iter4 = {n["节点名称"]: n for n in nodes_iter3}
        max_order_iter4 = max((n.get("发现编号") or 0) for n in nodes_iter3)
        nodes_iter4 = []
        new_known_iter4 = []
        current_order_iter4 = max_order_iter4 + 1
        for node in nodes_iter3:
            new_node = dict(node)
            name = node["节点名称"]
            matching = [c for c in high_score_iter4 if c["节点名称"] == name]
            if matching and not node.get("是否已知"):
                new_node["是否已知"] = True
                new_node["发现编号"] = current_order_iter4
                current_order_iter4 += 1
                new_known_iter4.append(name)
            nodes_iter4.append(new_node)
        # Create new edges
        new_edges_iter4 = []
        for c in high_score_iter4:
            target_node = node_by_name_iter4.get(c["节点名称"])
            source_node = node_by_name_iter4.get(c["推导来源"])
            if target_node and source_node:
                new_edges_iter4.append({
                    "来源": source_node["节点ID"],
                    "目标": target_node["节点ID"],
                    "关系类型": "AI推导",
                    "可能性分数": c["可能性分数"],
                    "推理说明": c["推理说明"],
                })
        all_edges_iter4 = all_edges_iter3 + new_edges_iter4
        step_iter4 = {
            "步骤": f"迭代{iteration}-下一步分析",
            "输入": {
                "已知点": next_step_result["输入上下文"]["已知点"],
                "未知点": next_step_result["输入上下文"]["未知点"],
                "人设常量": next_step_result["输入上下文"]["人设常量"],
            },
            "中间结果": next_step_result["中间结果"],
            "输出": {
                "新的已知节点": new_known_iter4,
                "新的边": new_edges_iter4,
                "节点列表": nodes_iter4,
                "边列表": all_edges_iter4,
            },
            "摘要": {
                "已知点数": sum(1 for n in nodes_iter4 if n.get("是否已知")),
                "新已知数": len(new_known_iter4),
                "新边数": len(new_edges_iter4),
                "未知点数": sum(1 for n in nodes_iter4 if not n.get("是否已知")),
                "model": next_step_result.get("model"),
                "cache_hit": next_step_result.get("cache_hit"),
            },
        }
        steps.append(step_iter4)
        print(f" 新已知: {len(new_known_iter4)} 个")
        print(f" 新边: {len(new_edges_iter4)} 条")
        save_result(post_id, post_detail, steps, config)
        # Stop if this round made no progress
        if len(new_known_step3) == 0 and len(new_known_iter4) == 0:
            print("\n[停止] 本轮无新进展,停止循环")
            break
        # Carry state into the next round
        current_nodes = nodes_iter4
        current_edges = all_edges_iter4
        iteration += 1

    return {"帖子详情": post_detail, "步骤列表": steps}

# ===== Main entry point =====
async def main(
    post_id: Optional[str] = None,
    all_posts: bool = False,
    force_llm: bool = False,
    max_step: int = 3,
):
    """Main entry point."""
    _, log_url = set_trace()
    config = PathConfig()
    print(f"账号: {config.account_name}")
    print(f"Trace URL: {log_url}")
    print(f"输出目录: {OUTPUT_DIR_NAME}")
    # Load the persona graph
    persona_graph_file = config.intermediate_dir / "人设图谱.json"
    if not persona_graph_file.exists():
        print(f"错误: 人设图谱文件不存在: {persona_graph_file}")
        return
    persona_graph = load_json(persona_graph_file)
    print(f"人设图谱节点数: {len(persona_graph.get('nodes', {}))}")
    # Collect post-graph files
    post_graph_files = get_post_graph_files(config)
    if not post_graph_files:
        print("错误: 没有找到帖子图谱文件")
        return
    # Decide which posts to process
    if post_id:
        target_file = next(
            (f for f in post_graph_files if post_id in f.name),
            None
        )
        if not target_file:
            print(f"错误: 未找到帖子 {post_id}")
            return
        files_to_process = [target_file]
    elif all_posts:
        files_to_process = post_graph_files
    else:
        files_to_process = [post_graph_files[0]]
    print(f"待处理帖子数: {len(files_to_process)}")
    # Process
    results = []
    for i, post_file in enumerate(files_to_process, 1):
        print(f"\n{'#' * 60}")
        print(f"# 处理帖子 {i}/{len(files_to_process)}")
        print(f"{'#' * 60}")
        result = await process_single_post(
            post_file=post_file,
            persona_graph=persona_graph,
            config=config,
            force_llm=force_llm,
            max_step=max_step,
        )
        results.append(result)
    # Summary
    print(f"\n{'#' * 60}")
    print(f"# 完成! 共处理 {len(results)} 个帖子")
    print(f"{'#' * 60}")
    print(f"Trace: {log_url}")
    print("\n汇总:")
    for result in results:
        post_id = result["帖子详情"]["postId"]
        steps = result.get("步骤列表", [])
        num_steps = len(steps)
        if num_steps == 1:
            step1_summary = steps[0].get("摘要", {})
            print(f"  {post_id}: 节点数={step1_summary.get('节点数', 0)} (仅数据准备)")
        elif num_steps == 2:
            step2_summary = steps[1].get("摘要", {})
            print(f"  {post_id}: 起点={step2_summary.get('新已知数', 0)} (未推导)")
        elif num_steps == 3:
            step3_summary = steps[2].get("摘要", {})
            print(f"  {post_id}: 已知={step3_summary.get('已知点数', 0)}, "
                  f"未知={step3_summary.get('未知点数', 0)}")
        elif num_steps >= 4:
            step4_summary = steps[3].get("摘要", {})
            print(f"  {post_id}: 已知={step4_summary.get('已知点数', 0)}, "
                  f"新已知={step4_summary.get('新已知数', 0)}, "
                  f"新边={step4_summary.get('新边数', 0)}, "
                  f"未知={step4_summary.get('未知点数', 0)}")
        else:
            print(f"  {post_id}: 无步骤数据")

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="创作模式分析 V2")
    parser.add_argument("--post-id", type=str, help="帖子ID")
    parser.add_argument("--all-posts", action="store_true", help="处理所有帖子")
    parser.add_argument("--force-llm", action="store_true", help="强制重新调用LLM(跳过LLM缓存)")
    parser.add_argument("--step", type=int, default=5, choices=[1, 2, 3, 4, 5],
                        help="运行到第几步 (1=数据准备, 2=起点分析, 3=模式推导, 4=下一步分析, 5=完整循环)")
    args = parser.parse_args()
    asyncio.run(main(
        post_id=args.post_id,
        all_posts=args.all_posts,
        force_llm=args.force_llm,
        max_step=args.step,
    ))
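# Example invocations (the post ID below is a placeholder; actual IDs depend on the
# post-graph files produced by the local PathConfig setup):
#   python analyze_creation_pattern_v2.py                           # first post, full loop
#   python analyze_creation_pattern_v2.py --post-id 123abc --step 2
#   python analyze_creation_pattern_v2.py --all-posts --force-llm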