analyze_creation_pattern.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Creation pattern analysis (full pipeline).

Three steps, integrated:
1. Data preparation: extract the data to analyze from the post graph + persona graph.
2. Origin analysis: the LLM identifies the creative origins.
3. Pattern derivation: iterative derivation over co-occurrence relations.

Input: post graph + persona graph.
Output: the complete creation-pattern analysis result.
"""
import asyncio
import json
from pathlib import Path
from typing import Dict, List, Optional, Set
import sys

# Add the project root to the import path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from lib.llm_cached import analyze, LLMConfig, AnalyzeResult
from lib.my_trace import set_trace_smith as set_trace
from script.data_processing.path_config import PathConfig
# ===== Configuration =====
TASK_NAME = "creation_pattern"  # cache task name
MATCH_SCORE_THRESHOLD = 0.8  # minimum persona match score
GLOBAL_RATIO_THRESHOLD = 0.8  # minimum persona-global ratio of a category
ORIGIN_SCORE_THRESHOLD = 0.8  # minimum origin score to mark a node "known"
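# Illustrative example (made-up numbers, not from real data): a node whose best
# persona match scores 0.85 and whose category covers 0.9 of the persona
# globally clears both MATCH_SCORE_THRESHOLD and GLOBAL_RATIO_THRESHOLD, so
# is_persona_constant() below treats it as a persona constant; an LLM origin
# score of 0.85 likewise clears ORIGIN_SCORE_THRESHOLD and marks a node known.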
# ===== Data loading =====
def load_json(file_path: Path) -> Dict:
    """Load a JSON file."""
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def get_post_graph_files(config: PathConfig) -> List[Path]:
    """List all post-graph files."""
    post_graph_dir = config.intermediate_dir / "post_graph"
    return sorted(post_graph_dir.glob("*_帖子图谱.json"))
# ===== Step 1: data preparation =====
def extract_post_detail(post_graph: Dict) -> Dict:
    """Extract the post detail."""
    meta = post_graph.get("meta", {})
    post_detail = meta.get("postDetail", {})
    return {
        "postId": meta.get("postId", ""),
        "postTitle": meta.get("postTitle", ""),
        "body_text": post_detail.get("body_text", ""),
        "images": post_detail.get("images", []),
        "video": post_detail.get("video"),
        "publish_time": post_detail.get("publish_time", ""),
        "like_count": post_detail.get("like_count", 0),
        "collect_count": post_detail.get("collect_count", 0),
    }
def extract_analysis_nodes(post_graph: Dict, persona_graph: Dict) -> tuple:
    """
    Extract the list of nodes to analyze.

    Nodes to analyze = inspiration points + purpose points + key points
    (灵感点 + 目的点 + 关键点). Returns (analysis_nodes, relation_list).
    """
    nodes = post_graph.get("nodes", {})
    edges = post_graph.get("edges", {})
    persona_nodes = persona_graph.get("nodes", {})
    persona_index = persona_graph.get("index", {})

    # 1. Collect key-point (关键点) info
    keypoints = {}
    for node_id, node in nodes.items():
        if node.get("type") == "标签" and node.get("dimension") == "关键点":
            keypoints[node_id] = {
                "名称": node.get("name", ""),
                "描述": node.get("detail", {}).get("description", ""),
            }

    # 2. Index support (支撑) edges by target (note: currently not read downstream)
    support_map = {}
    for edge_id, edge in edges.items():
        if edge.get("type") == "支撑":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            if source_id in keypoints:
                if target_id not in support_map:
                    support_map[target_id] = []
                support_map[target_id].append(keypoints[source_id])

    # 3. Index relation (关联) edges by node (note: currently not read downstream)
    relation_map = {}
    for edge_id, edge in edges.items():
        if edge.get("type") == "关联":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            source_name = nodes.get(source_id, {}).get("name", "")
            target_name = nodes.get(target_id, {}).get("name", "")
            if source_id not in relation_map:
                relation_map[source_id] = []
            relation_map[source_id].append(target_name)
            if target_id not in relation_map:
                relation_map[target_id] = []
            relation_map[target_id].append(source_name)

    # 4. Collect persona matches
    match_map = {}
    persona_out_edges = persona_index.get("outEdges", {})

    def get_node_info(node_id: str) -> Optional[Dict]:
        """Return the standard info of a persona node."""
        node = persona_nodes.get(node_id, {})
        if not node:
            return None
        detail = node.get("detail", {})
        parent_path = detail.get("parentPath", [])
        return {
            "节点ID": node_id,
            "节点名称": node.get("name", ""),
            "节点分类": "/".join(parent_path) if parent_path else "",
            "节点维度": node.get("dimension", ""),
            "节点类型": node.get("type", ""),
            "人设全局占比": detail.get("probGlobal", 0),
            "父类下占比": detail.get("probToParent", 0),
        }

    def get_parent_category_id(node_id: str) -> Optional[str]:
        """Follow the 属于 (belongs-to) edge to the parent category node ID."""
        belong_edges = persona_out_edges.get(node_id, {}).get("属于", [])
        for edge in belong_edges:
            target_id = edge.get("target", "")
            target_node = persona_nodes.get(target_id, {})
            if target_node.get("type") == "分类":
                return target_id
        return None

    for edge_id, edge in edges.items():
        if edge.get("type") == "匹配":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            if source_id.startswith("帖子:") and target_id.startswith("人设:"):
                match_score = edge.get("score", 0)
                persona_node = persona_nodes.get(target_id, {})
                if persona_node:
                    node_type = persona_node.get("type", "")
                    match_node_info = get_node_info(target_id)
                    if not match_node_info:
                        continue
                    if node_type == "标签":
                        category_id = get_parent_category_id(target_id)
                    else:
                        category_id = target_id
                    category_info = None
                    if category_id:
                        category_node = persona_nodes.get(category_id, {})
                        if category_node:
                            category_detail = category_node.get("detail", {})
                            category_path = category_detail.get("parentPath", [])
                            category_info = {
                                "节点ID": category_id,
                                "节点名称": category_node.get("name", ""),
                                "节点分类": "/".join(category_path) if category_path else "",
                                "节点维度": category_node.get("dimension", ""),
                                "节点类型": "分类",
                                "人设全局占比": category_detail.get("probGlobal", 0),
                                "父类下占比": category_detail.get("probToParent", 0),
                                "历史共现分类": [],
                            }
                            # Top-5 co-occurring categories by score
                            co_occur_edges = persona_out_edges.get(category_id, {}).get("分类共现", [])
                            co_occur_edges_sorted = sorted(co_occur_edges, key=lambda x: x.get("score", 0), reverse=True)
                            for co_edge in co_occur_edges_sorted[:5]:
                                co_target_id = co_edge.get("target", "")
                                co_score = co_edge.get("score", 0)
                                co_node = persona_nodes.get(co_target_id, {})
                                if co_node:
                                    co_detail = co_node.get("detail", {})
                                    co_path = co_detail.get("parentPath", [])
                                    category_info["历史共现分类"].append({
                                        "节点ID": co_target_id,
                                        "节点名称": co_node.get("name", ""),
                                        "节点分类": "/".join(co_path) if co_path else "",
                                        "节点维度": co_node.get("dimension", ""),
                                        "节点类型": "分类",
                                        "人设全局占比": co_detail.get("probGlobal", 0),
                                        "父类下占比": co_detail.get("probToParent", 0),
                                        "共现度": round(co_score, 4),
                                    })
                    if source_id not in match_map:
                        match_map[source_id] = []
                    match_map[source_id].append({
                        "匹配节点": match_node_info,
                        "匹配分数": round(match_score, 4),
                        "所属分类": category_info,
                    })

    # 5. Build the list of nodes to analyze
    analysis_nodes = []
    for node_id, node in nodes.items():
        if node.get("type") == "标签" and node.get("domain") == "帖子":
            dimension = node.get("dimension", "")
            if dimension in ["灵感点", "目的点", "关键点"]:
                match_info = match_map.get(node_id)
                analysis_nodes.append({
                    "节点ID": node_id,
                    "节点名称": node.get("name", ""),
                    "节点分类": node.get("category", ""),
                    "节点维度": dimension,
                    "节点类型": node.get("type", ""),
                    "节点描述": node.get("detail", {}).get("description", ""),
                    "人设匹配": match_info,
                })

    # 6. Build the relation list
    relation_list = []
    for edge_id, edge in edges.items():
        if edge.get("type") == "支撑":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            if source_id in keypoints:
                relation_list.append({
                    "来源节点": source_id,
                    "目标节点": target_id,
                    "关系类型": "支撑",
                })
    # 关联 edges are undirected, so deduplicate by the sorted endpoint pair
    seen_relations = set()
    for edge_id, edge in edges.items():
        if edge.get("type") == "关联":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            key = tuple(sorted([source_id, target_id]))
            if key not in seen_relations:
                seen_relations.add(key)
                relation_list.append({
                    "来源节点": source_id,
                    "目标节点": target_id,
                    "关系类型": "关联",
                })
    return analysis_nodes, relation_list
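# Return-shape sketch (illustrative values, not real data):
#   analysis_nodes = [{
#       "节点ID": "帖子:…", "节点名称": "…", "节点分类": "…",
#       "节点维度": "灵感点" | "目的点" | "关键点", "节点类型": "标签",
#       "节点描述": "…",
#       "人设匹配": [{"匹配节点": {...}, "匹配分数": 0.93, "所属分类": {...}}] or None,
#   }, ...]
#   relation_list = [{"来源节点": "…", "目标节点": "…", "关系类型": "支撑" | "关联"}, ...]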
def prepare_analysis_data(post_graph: Dict, persona_graph: Dict) -> Dict:
    """
    Prepare the complete analysis payload.

    Output: a flat node list plus a standalone persona co-occurrence map.
    """
    analysis_nodes, relation_list = extract_analysis_nodes(post_graph, persona_graph)

    # Flatten the nodes; pull the persona co-occurrence data out separately
    flat_nodes = []
    persona_co_occur = {}  # {category ID: [top co-occurring categories]}
    for node in analysis_nodes:
        # Base node fields
        flat_node = {
            "节点ID": node["节点ID"],
            "节点名称": node["节点名称"],
            "节点分类": node.get("节点分类", ""),
            "节点维度": node["节点维度"],
            "节点描述": node.get("节点描述", ""),
            "是否已知": False,
            "发现编号": None,
        }
        # Persona matches (a list, so multiple matches are supported)
        match_list = node.get("人设匹配") or []
        if match_list:
            flat_node["人设匹配"] = []
            for match_info in match_list:
                category_info = match_info.get("所属分类")
                category_id = category_info.get("节点ID") if category_info else None
                # Keep the full match info, minus 历史共现分类 (split out below)
                clean_match = {
                    "匹配节点": match_info.get("匹配节点"),
                    "匹配分数": match_info.get("匹配分数", 0),
                }
                if category_info:
                    # Copy the category without its 历史共现分类 list
                    clean_category = {k: v for k, v in category_info.items() if k != "历史共现分类"}
                    clean_match["所属分类"] = clean_category
                flat_node["人设匹配"].append(clean_match)
                # Collect persona co-occurrence (deduplicated), split out of 历史共现分类
                if category_id and category_id not in persona_co_occur:
                    co_occur_list = category_info.get("历史共现分类", [])
                    if co_occur_list:
                        persona_co_occur[category_id] = [
                            {
                                "节点ID": c.get("节点ID"),
                                "节点名称": c.get("节点名称"),
                                "节点分类": c.get("节点分类", ""),
                                "节点维度": c.get("节点维度", ""),
                                "节点类型": c.get("节点类型", ""),
                                "人设全局占比": c.get("人设全局占比", 0),
                                "父类下占比": c.get("父类下占比", 0),
                                "共现度": c.get("共现度", 0),
                            }
                            for c in co_occur_list
                            if c.get("节点ID")
                        ]
        else:
            flat_node["人设匹配"] = []
        flat_nodes.append(flat_node)

    return {
        "帖子详情": extract_post_detail(post_graph),
        "节点列表": flat_nodes,
        "关系列表": relation_list,
        "人设共现关系": persona_co_occur,
    }
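# Payload sketch (illustrative IDs and scores): every flat node starts with
#   {"是否已知": False, "发现编号": None, "人设匹配": [...]}
# and "人设共现关系" maps each matched category ID to its top co-occurring
# categories, e.g. {"人设:分类A": [{"节点ID": "人设:分类B", "共现度": 0.4217, ...}]}.
# Steps 2 and 3 only ever flip 是否已知 and assign 发现编号.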
# ===== Step 2: origin analysis =====
def get_best_match(node: Dict) -> Optional[Dict]:
    """Return the node's best persona match (highest score)."""
    match_list = node.get("人设匹配") or []
    if not match_list:
        return None
    return max(match_list, key=lambda m: m.get("匹配分数", 0))


def get_match_score(node: Dict) -> float:
    """Return the node's highest persona match score."""
    best_match = get_best_match(node)
    if best_match:
        return best_match.get("匹配分数", 0)
    return 0


def get_category_id(node: Dict) -> Optional[str]:
    """Return the category ID of the node's best match."""
    best_match = get_best_match(node)
    if best_match:
        category = best_match.get("所属分类")
        if category:
            return category.get("节点ID")
    return None


def get_all_category_ids(node: Dict) -> List[str]:
    """Return the category IDs of all the node's matches."""
    match_list = node.get("人设匹配") or []
    result = []
    for m in match_list:
        category = m.get("所属分类")
        if category and category.get("节点ID"):
            result.append(category.get("节点ID"))
    return result


def get_category_global_ratio(node: Dict) -> float:
    """Return the persona-global ratio of the best match's category."""
    best_match = get_best_match(node)
    if best_match:
        category = best_match.get("所属分类")
        if category:
            return category.get("人设全局占比", 0)
    return 0


def is_persona_constant(node: Dict) -> bool:
    """A node is a persona constant when its match score >= MATCH_SCORE_THRESHOLD and its category's global ratio >= GLOBAL_RATIO_THRESHOLD."""
    match_score = get_match_score(node)
    global_ratio = get_category_global_ratio(node)
    return match_score >= MATCH_SCORE_THRESHOLD and global_ratio >= GLOBAL_RATIO_THRESHOLD
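# Worked example (hypothetical node, illustrative numbers only):
#   node = {"人设匹配": [
#       {"匹配分数": 0.91, "所属分类": {"节点ID": "人设:分类:a", "人设全局占比": 0.85}},
#       {"匹配分数": 0.60, "所属分类": {"节点ID": "人设:分类:b", "人设全局占比": 0.10}},
#   ]}
#   get_match_score(node)           -> 0.91  (the best match wins)
#   get_category_id(node)           -> "人设:分类:a"
#   get_category_global_ratio(node) -> 0.85
#   is_persona_constant(node)       -> True  (0.91 >= 0.8 and 0.85 >= 0.8)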
def build_origin_context(nodes: List[Dict]) -> Dict:
    """Build the context for the LLM analysis."""
    all_points = []
    for node in nodes:
        all_points.append({
            "名称": node["节点名称"],
            "分类": node.get("节点分类", ""),
            "维度": node.get("节点维度", ""),
            "描述": node.get("节点描述", ""),
            "人设匹配度": round(get_match_score(node), 2),
        })
    # Origin candidates (inspiration points + purpose points)
    candidates = [
        node["节点名称"]
        for node in nodes
        if node["节点维度"] in ["灵感点", "目的点"]
    ]
    # Persona constants (match score >= 0.8 and category global ratio >= 0.8)
    constants = [
        node["节点名称"]
        for node in nodes
        if is_persona_constant(node)
    ]
    return {
        "all_points": all_points,
        "candidates": candidates,
        "constants": constants,
    }
def format_origin_prompt(context: Dict) -> str:
    """Format the origin-analysis prompt."""
    all_points = context["all_points"]
    candidates = context["candidates"]
    constants = context["constants"]
    points_text = ""
    for p in all_points:
        points_text += f"- {p['名称']}\n"
        points_text += f" 维度: {p['维度']} | 分类: {p['分类']}\n"
        points_text += f" 描述: {p['描述']}\n"
        points_text += f" 人设匹配度: {p['人设匹配度']}\n"
        points_text += "\n"
    candidates_text = "、".join(candidates)
    constants_text = "、".join(constants) if constants else "无"
    prompt = f"""# Role
你是小红书爆款内容的"逆向工程"专家。你的核心能力是透过内容的表象(视觉/形式),还原创作者最初的脑回路(动机/实质)。
# Task
我提供一组笔记的【创意标签】和一个【起点候选集】。
请推理出哪些选项是真正的**创意起点**。
# Input Data
## 全部创意点
{points_text}
## 起点候选集
{candidates_text}
## 来自人设的常量
{constants_text}
# 推理约束
1. 实质推形式,而不是形式推实质,除非形式是一切创意的起点
2. 因推果而不是果推因
3. 无法被其他项或人设推理出的点,即为起点
# Output Format
请输出一个标准的 JSON 格式。
- Key: 候选集中的词。
- Value: 一个对象,包含:
  - `score`: 0.0 到 1.0 的浮点数(代表是起点的可能性)。
  - `analysis`: 一句话推理"""
    return prompt
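# Expected LLM reply shape (candidate names are invented for illustration;
# with parse_json=True the reply is parsed into a dict like this):
#   {
#     "手冲咖啡教程": {"score": 0.9, "analysis": "无法由其他点或人设推出,是实质动机"},
#     "ins风摆拍":   {"score": 0.2, "analysis": "形式层面,可由目的点推出"}
#   }
# analyze_origin() below matches the keys back to nodes by "节点名称".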
async def analyze_origin(nodes: List[Dict], force_llm: bool = False) -> Dict:
    """
    Run the origin analysis.

    Input: the node list.
    Output: the node list (with 起点分析 / 是否已知 / 发现编号 filled in) + intermediate results.
    """
    context = build_origin_context(nodes)
    prompt = format_origin_prompt(context)
    print(f"\n 起点候选: {len(context['candidates'])} 个")
    print(f" 人设常量: {len(context['constants'])} 个")
    result = await analyze(
        prompt=prompt,
        task_name=f"{TASK_NAME}/origin",
        force=force_llm,
        parse_json=True,
    )
    # Merge the analysis result back into the nodes
    llm_result = result.data or {}
    output_nodes = []
    current_order = 1  # discovery-order counter for known nodes
    for node in nodes:
        new_node = dict(node)  # shallow copy of the original node
        name = node["节点名称"]
        if name in llm_result:
            score = llm_result[name].get("score", 0)
            analysis = llm_result[name].get("analysis", "")
            # Attach the origin analysis
            new_node["起点分析"] = {
                "分数": score,
                "说明": analysis,
            }
            # High-scoring origins become "known"
            if score >= ORIGIN_SCORE_THRESHOLD:
                new_node["是否已知"] = True
                new_node["发现编号"] = current_order
                current_order += 1
        else:
            new_node["起点分析"] = None
        output_nodes.append(new_node)
    return {
        "输入上下文": {
            "起点候选": context["candidates"],
            "人设常量": context["constants"],
        },
        "中间结果": llm_result,
        "输出节点": output_nodes,
        "cache_hit": result.cache_hit,
        "model": result.model_name,
        "log_url": result.log_url,
    }
# ===== Step 3: pattern derivation =====
def derive_patterns(
    nodes: List[Dict],
    persona_co_occur: Dict[str, List[Dict]],
) -> Dict:
    """
    Iterative derivation over co-occurrence relations.

    Input: the node list with origin analysis attached + the persona co-occurrence data.
    Output: the node list (with 是否已知 / 发现编号 updated) + the derived edge list.
    """
    node_by_name: Dict[str, Dict] = {n["节点名称"]: n for n in nodes}

    # Build the co-occurrence lookup {category ID: {co-occurring category ID: score}}
    co_occur_lookup = {}
    for cat_id, co_occur_list in persona_co_occur.items():
        co_occur_lookup[cat_id] = {
            c["节点ID"]: c["共现度"]
            for c in co_occur_list
        }

    # 1. Initialize the known set (nodes already marked known)
    known_names: Set[str] = set()
    node_round: Dict[str, int] = {}  # {node name: round in which it became known}
    for node in nodes:
        if node.get("是否已知"):
            known_names.add(node["节点名称"])
            node_round[node["节点名称"]] = 0
    unknown_names: Set[str] = set(node_by_name.keys()) - known_names
    edges: List[Dict] = []

    # 2. Iterative derivation
    round_num = 0
    new_known_this_round = known_names.copy()
    while new_known_this_round:
        round_num += 1
        new_known_next_round: Set[str] = set()
        for known_name in new_known_this_round:
            known_node = node_by_name.get(known_name)
            if not known_node:
                continue
            if get_match_score(known_node) < MATCH_SCORE_THRESHOLD:
                continue
            # Get the co-occurrence list of this node's category
            known_cat_id = get_category_id(known_node)
            if not known_cat_id or known_cat_id not in co_occur_lookup:
                continue
            co_occur_map = co_occur_lookup[known_cat_id]
            for unknown_name in list(unknown_names):
                unknown_node = node_by_name.get(unknown_name)
                if not unknown_node:
                    continue
                if get_match_score(unknown_node) < MATCH_SCORE_THRESHOLD:
                    continue
                # Is the unknown node's category in the known node's co-occurrence list?
                unknown_cat_id = get_category_id(unknown_node)
                if unknown_cat_id and unknown_cat_id in co_occur_map:
                    co_occur_score = co_occur_map[unknown_cat_id]
                    new_known_next_round.add(unknown_name)
                    node_round[unknown_name] = round_num
                    edges.append({
                        "来源": known_node["节点ID"],
                        "目标": unknown_node["节点ID"],
                        "关系类型": "共现推导",
                        "推导轮次": round_num,
                        "共现分类ID": unknown_cat_id,
                        "共现度": co_occur_score,
                    })
        known_names.update(new_known_next_round)
        unknown_names -= new_known_next_round
        new_known_this_round = new_known_next_round
        if not new_known_next_round:
            break

    # 3. Build the output nodes (only 是否已知 / 发现编号 change)
    # Find the current maximum discovery number first
    max_order = 0
    for node in nodes:
        if node.get("发现编号") and node["发现编号"] > max_order:
            max_order = node["发现编号"]
    # Group newly derived nodes by round, then assign discovery numbers
    new_known_by_round = {}
    for name, r in node_round.items():
        if r > 0:  # skip the origins (round 0)
            if r not in new_known_by_round:
                new_known_by_round[r] = []
            new_known_by_round[r].append(name)
    # Assign discovery numbers in round order
    order_map = {}
    current_order = max_order + 1
    for r in sorted(new_known_by_round.keys()):
        for name in new_known_by_round[r]:
            order_map[name] = current_order
            current_order += 1
    output_nodes = []
    for node in nodes:
        new_node = dict(node)
        name = node["节点名称"]
        # Newly derived nodes (non-origins) get their known state and discovery number updated
        if name in node_round and node_round[name] > 0:
            new_node["是否已知"] = True
            new_node["发现编号"] = order_map.get(name)
        output_nodes.append(new_node)
    return {
        "输出节点": output_nodes,
        "推导边列表": edges,
        # The final loop iteration always discovers nothing, so don't count it
        "推导轮次": max(round_num - 1, 0),
    }
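# Derivation walk-through (hypothetical nodes, illustrative only):
#   round 0: origin A is known; A's category co-occurs with B's category
#   round 1: B becomes known via the edge
#            {"来源": A, "目标": B, "关系类型": "共现推导", "推导轮次": 1, ...}
#   round 2: C, whose category co-occurs with B's, becomes known through B,
#            and so on until a round discovers nothing. Nodes that never clear
#            MATCH_SCORE_THRESHOLD (on either side) stay unknown.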
# ===== Full pipeline =====
def save_result(post_id: str, post_detail: Dict, steps: List, config: PathConfig) -> Path:
    """Save the result to a file."""
    output_dir = config.intermediate_dir / "creation_pattern"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{post_id}_创作模式.json"
    result = {
        "帖子详情": post_detail,
        "步骤列表": steps,
    }
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f" [已保存] {output_file.name}")
    return output_file
async def process_single_post(
    post_file: Path,
    persona_graph: Dict,
    config: PathConfig,
    force_llm: bool = False,
    max_step: int = 3,
) -> Dict:
    """
    Process a single post.

    Args:
        force_llm: force a fresh LLM call (bypass the LLM cache)
        max_step: run up to this step (1 = data preparation, 2 = origin analysis, 3 = pattern derivation)
    """
    post_graph = load_json(post_file)
    post_id = post_graph.get("meta", {}).get("postId", "unknown")
    print(f"\n{'=' * 60}")
    print(f"处理帖子: {post_id}")
    print("-" * 60)
    steps = []

    # ===== Step 1: data preparation =====
    print("\n[步骤1] 数据准备...")
    data = prepare_analysis_data(post_graph, persona_graph)
    post_detail = data["帖子详情"]
    nodes_step1 = data["节点列表"]
    relations_step1 = data["关系列表"]
    persona_co_occur = data["人设共现关系"]
    # In step 1 every known node is new
    new_known_step1 = [n["节点名称"] for n in nodes_step1 if n.get("是否已知")]
    step1 = {
        "步骤": "数据准备",
        "输入": {
            "帖子图谱": str(post_file.name),
            "人设图谱": "人设图谱.json",
        },
        "输出": {
            "新的已知节点": new_known_step1,
            "新的边": [],
            "节点列表": nodes_step1,
            "边列表": relations_step1,
        },
        "人设共现关系": persona_co_occur,
        "摘要": {
            "节点数": len(nodes_step1),
            "边数": len(relations_step1),
            "人设共现数": len(persona_co_occur),
        },
    }
    steps.append(step1)
    print(f" 节点数: {len(nodes_step1)}")
    print(f" 关系数: {len(relations_step1)}")
    print(f" 人设共现数: {len(persona_co_occur)}")
    # Step 1 done, save
    save_result(post_id, post_detail, steps, config)
    if max_step == 1:
        return {"帖子详情": post_detail, "步骤列表": steps}

    # ===== Step 2: origin analysis =====
    print("\n[步骤2] 起点分析...")
    origin_result = await analyze_origin(nodes_step1, force_llm=force_llm)
    nodes_step2 = origin_result["输出节点"]

    # Count high-scoring origins
    def get_origin_score(node):
        analysis = node.get("起点分析")
        if analysis:
            return analysis.get("分数", 0)
        return 0

    high_score_origins = [
        (n["节点名称"], get_origin_score(n))
        for n in nodes_step2
        if get_origin_score(n) >= 0.7
    ]
    # Newly discovered known nodes (the origins)
    new_known_nodes = [n["节点名称"] for n in nodes_step2 if n.get("是否已知")]
    step2 = {
        "步骤": "起点分析",
        "输入": {
            "节点列表": nodes_step1,
            "起点候选": origin_result["输入上下文"]["起点候选"],
            "人设常量": origin_result["输入上下文"]["人设常量"],
        },
        "中间结果": origin_result["中间结果"],
        "输出": {
            "新的已知节点": new_known_nodes,
            "新的边": [],
            "节点列表": nodes_step2,
            "边列表": relations_step1,  # edges unchanged in this step
        },
        "摘要": {
            "新已知数": len(new_known_nodes),
            "model": origin_result["model"],
            "cache_hit": origin_result["cache_hit"],
            "log_url": origin_result.get("log_url"),
        },
    }
    steps.append(step2)
    print(f" 高分起点 (>=0.7): {len(high_score_origins)} 个")
    for name, score in sorted(high_score_origins, key=lambda x: -x[1]):
        print(f" ★ {name}: {score:.2f}")
    # Step 2 done, save
    save_result(post_id, post_detail, steps, config)
    if max_step == 2:
        return {"帖子详情": post_detail, "步骤列表": steps}

    # ===== Step 3: pattern derivation =====
    print("\n[步骤3] 模式推导...")
    derivation_result = derive_patterns(nodes_step2, persona_co_occur)
    nodes_step3 = derivation_result["输出节点"]
    edges = derivation_result["推导边列表"]
    # Stats
    known_count = sum(1 for n in nodes_step3 if n.get("是否已知"))
    unknown_count = len(nodes_step3) - known_count
    # Newly derived known nodes (excluding the earlier origins)
    prev_known = {n["节点名称"] for n in nodes_step2 if n.get("是否已知")}
    new_known_nodes = [n["节点名称"] for n in nodes_step3 if n.get("是否已知") and n["节点名称"] not in prev_known]
    # Merge the edge lists (original edges + derived edges)
    all_edges = relations_step1 + edges
    step3 = {
        "步骤": "模式推导",
        "输入": {
            "节点列表": nodes_step2,
            "人设共现关系": persona_co_occur,
        },
        "输出": {
            "新的已知节点": new_known_nodes,
            "新的边": edges,
            "节点列表": nodes_step3,
            "边列表": all_edges,
        },
        "摘要": {
            "已知点数": known_count,
            "新已知数": len(new_known_nodes),
            "新边数": len(edges),
            "未知点数": unknown_count,
        },
    }
    steps.append(step3)
    print(f" 已知点: {known_count} 个")
    print(f" 推导边: {len(edges)} 条")
    print(f" 未知点: {unknown_count} 个")
    # Step 3 done, save
    save_result(post_id, post_detail, steps, config)
    return {"帖子详情": post_detail, "步骤列表": steps}
# ===== Main =====
async def main(
    post_id: Optional[str] = None,
    all_posts: bool = False,
    force_llm: bool = False,
    max_step: int = 3,
):
    """Entry point."""
    _, log_url = set_trace()
    config = PathConfig()
    print(f"账号: {config.account_name}")
    print(f"Trace URL: {log_url}")

    # Load the persona graph
    persona_graph_file = config.intermediate_dir / "人设图谱.json"
    if not persona_graph_file.exists():
        print(f"错误: 人设图谱文件不存在: {persona_graph_file}")
        return
    persona_graph = load_json(persona_graph_file)
    print(f"人设图谱节点数: {len(persona_graph.get('nodes', {}))}")

    # Collect the post-graph files
    post_graph_files = get_post_graph_files(config)
    if not post_graph_files:
        print("错误: 没有找到帖子图谱文件")
        return

    # Decide which posts to process
    if post_id:
        target_file = next(
            (f for f in post_graph_files if post_id in f.name),
            None
        )
        if not target_file:
            print(f"错误: 未找到帖子 {post_id}")
            return
        files_to_process = [target_file]
    elif all_posts:
        files_to_process = post_graph_files
    else:
        files_to_process = [post_graph_files[0]]
    print(f"待处理帖子数: {len(files_to_process)}")

    # Process
    results = []
    for i, post_file in enumerate(files_to_process, 1):
        print(f"\n{'#' * 60}")
        print(f"# 处理帖子 {i}/{len(files_to_process)}")
        print(f"{'#' * 60}")
        result = await process_single_post(
            post_file=post_file,
            persona_graph=persona_graph,
            config=config,
            force_llm=force_llm,
            max_step=max_step,
        )
        results.append(result)

    # Summary
    print(f"\n{'#' * 60}")
    print(f"# 完成! 共处理 {len(results)} 个帖子")
    print(f"{'#' * 60}")
    print(f"Trace: {log_url}")
    print("\n汇总:")
    for result in results:
        post_id = result["帖子详情"]["postId"]
        steps = result.get("步骤列表", [])
        num_steps = len(steps)
        if num_steps == 1:
            step1_summary = steps[0].get("摘要", {})
            print(f" {post_id}: 节点数={step1_summary.get('节点数', 0)} (仅数据准备)")
        elif num_steps == 2:
            # Step 2's 摘要 stores the origin count under 新已知数
            step2_summary = steps[1].get("摘要", {})
            print(f" {post_id}: 起点={step2_summary.get('新已知数', 0)} (未推导)")
        elif num_steps >= 3:
            step2_summary = steps[1].get("摘要", {})
            step3_summary = steps[2].get("摘要", {})
            # Use the keys the summaries actually contain (新已知数 / 新边数)
            print(f" {post_id}: 起点={step2_summary.get('新已知数', 0)}, "
                  f"已知={step3_summary.get('已知点数', 0)}, "
                  f"推导边={step3_summary.get('新边数', 0)}")
        else:
            print(f" {post_id}: 无步骤数据")
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="创作模式分析")
    parser.add_argument("--post-id", type=str, help="帖子ID")
    parser.add_argument("--all-posts", action="store_true", help="处理所有帖子")
    parser.add_argument("--force-llm", action="store_true", help="强制重新调用LLM(跳过LLM缓存)")
    parser.add_argument("--step", type=int, default=3, choices=[1, 2, 3],
                        help="运行到第几步 (1=数据准备, 2=起点分析, 3=模式推导)")
    args = parser.parse_args()
    asyncio.run(main(
        post_id=args.post_id,
        all_posts=args.all_posts,
        force_llm=args.force_llm,
        max_step=args.step,
    ))
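# Usage sketch (flags as defined above; the post ID is illustrative):
#   python analyze_creation_pattern.py                       # first post, all 3 steps
#   python analyze_creation_pattern.py --post-id 65a1b2c3    # one specific post
#   python analyze_creation_pattern.py --all-posts --step 2  # all posts, stop after origin analysis
#   python analyze_creation_pattern.py --force-llm           # bypass the LLM cache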