#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Topic-point order analysis (full pipeline).

Integrates a six-step flow:
1. Data preparation: extract the data to analyze from the post graph + persona graph
2. Persona-constant detection: flag persona constants (match score >= 0.8 and global ratio >= 0.7)
3. Origin analysis: the LLM identifies the creative starting points (new prompt)
4. Pattern derivation: iterative derivation over co-occurrence relations
5. Next-step analysis: the LLM infers the most likely next points
6. Loop: repeat steps 4-5 until every point is known

Input: post graph + persona graph
Output: topic-point order analysis results
"""


import asyncio
import json
import sys
from pathlib import Path
from typing import Dict, List, Optional, Set

# Add the project root to the import path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from lib.llm_cached import analyze, LLMConfig, AnalyzeResult
from lib.my_trace import set_trace_smith as set_trace
from script.data_processing.path_config import PathConfig


# ===== Configuration =====
TASK_NAME = "creation_pattern_v5"  # cache task name (kept stable so existing cache entries still hit)
OUTPUT_DIR_NAME = "point_order_v5"  # output directory name
MATCH_SCORE_THRESHOLD = 0.8  # persona match-score threshold
GLOBAL_RATIO_THRESHOLD = 0.7  # global-ratio threshold (>= 0.7 counts as a constant)
ORIGIN_SCORE_THRESHOLD = 0.8  # origin-score threshold


# ===== Data loading =====
def load_json(file_path: Path) -> Dict:
    """Load a JSON file."""
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def get_post_graph_files(config: PathConfig) -> List[Path]:
    """Return all post-graph files, sorted by name."""
    post_graph_dir = config.intermediate_dir / "post_graph"
    return sorted(post_graph_dir.glob("*_帖子图谱.json"))


# ===== Step 1: data preparation =====
def extract_post_detail(post_graph: Dict) -> Dict:
    """Extract the post details from the graph metadata."""
    meta = post_graph.get("meta", {})
    post_detail = meta.get("postDetail", {})
    return {
        "postId": meta.get("postId", ""),
        "postTitle": meta.get("postTitle", ""),
        "body_text": post_detail.get("body_text", ""),
        "images": post_detail.get("images", []),
        "video": post_detail.get("video"),
        "publish_time": post_detail.get("publish_time", ""),
        "like_count": post_detail.get("like_count", 0),
        "collect_count": post_detail.get("collect_count", 0),
    }


def extract_analysis_nodes(post_graph: Dict, persona_graph: Dict) -> tuple:
    """
    Extract the list of nodes to analyze.

    Nodes to analyze = inspiration points + purpose points + key points.
    """
    nodes = post_graph.get("nodes", {})
    edges = post_graph.get("edges", {})
    persona_nodes = persona_graph.get("nodes", {})
    persona_index = persona_graph.get("index", {})

    # 1. Collect key-point info
    keypoints = {}
    for node_id, node in nodes.items():
        if node.get("type") == "标签" and node.get("dimension") == "关键点":
            keypoints[node_id] = {
                "名称": node.get("name", ""),
                "描述": node.get("detail", {}).get("description", ""),
            }

    # 2. Analyze support relations
    support_map = {}
    for edge_id, edge in edges.items():
        if edge.get("type") == "支撑":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            if source_id in keypoints:
                if target_id not in support_map:
                    support_map[target_id] = []
                support_map[target_id].append(keypoints[source_id])

    # 3. Analyze association relations
    relation_map = {}
    for edge_id, edge in edges.items():
        if edge.get("type") == "关联":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            source_name = nodes.get(source_id, {}).get("name", "")
            target_name = nodes.get(target_id, {}).get("name", "")
            if source_id not in relation_map:
                relation_map[source_id] = []
            relation_map[source_id].append(target_name)
            if target_id not in relation_map:
                relation_map[target_id] = []
            relation_map[target_id].append(source_name)

    # 4. Analyze persona matches
    match_map = {}
    persona_out_edges = persona_index.get("outEdges", {})

    def get_node_info(node_id: str) -> Optional[Dict]:
        """Return the standard info dict for a persona node."""
        node = persona_nodes.get(node_id, {})
        if not node:
            return None
        detail = node.get("detail", {})
        parent_path = detail.get("parentPath", [])
        return {
            "节点ID": node_id,
            "节点名称": node.get("name", ""),
            "节点分类": "/".join(parent_path) if parent_path else "",
            "节点维度": node.get("dimension", ""),
            "节点类型": node.get("type", ""),
            "人设全局占比": detail.get("probGlobal", 0),
            "父类下占比": detail.get("probToParent", 0),
        }

    def get_parent_category_id(node_id: str) -> Optional[str]:
        """Return the parent category node ID via the 属于 (belongs-to) edge."""
        belong_edges = persona_out_edges.get(node_id, {}).get("属于", [])
        for edge in belong_edges:
            target_id = edge.get("target", "")
            target_node = persona_nodes.get(target_id, {})
            if target_node.get("type") == "分类":
                return target_id
        return None

    for edge_id, edge in edges.items():
        if edge.get("type") == "匹配":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            if source_id.startswith("帖子:") and target_id.startswith("人设:"):
                match_score = edge.get("score", 0)
                persona_node = persona_nodes.get(target_id, {})
                if persona_node:
                    node_type = persona_node.get("type", "")
                    match_node_info = get_node_info(target_id)
                    if not match_node_info:
                        continue
                    if node_type == "标签":
                        category_id = get_parent_category_id(target_id)
                    else:
                        category_id = target_id
                    category_info = None
                    if category_id:
                        category_node = persona_nodes.get(category_id, {})
                        if category_node:
                            category_detail = category_node.get("detail", {})
                            category_path = category_detail.get("parentPath", [])
                            category_info = {
                                "节点ID": category_id,
                                "节点名称": category_node.get("name", ""),
                                "节点分类": "/".join(category_path) if category_path else "",
                                "节点维度": category_node.get("dimension", ""),
                                "节点类型": "分类",
                                "人设全局占比": category_detail.get("probGlobal", 0),
                                "父类下占比": category_detail.get("probToParent", 0),
                                "历史共现分类": [],
                            }
                            # Keep the top-5 co-occurring categories by score
                            co_occur_edges = persona_out_edges.get(category_id, {}).get("分类共现", [])
                            co_occur_edges_sorted = sorted(co_occur_edges, key=lambda x: x.get("score", 0), reverse=True)
                            for co_edge in co_occur_edges_sorted[:5]:
                                co_target_id = co_edge.get("target", "")
                                co_score = co_edge.get("score", 0)
                                co_node = persona_nodes.get(co_target_id, {})
                                if co_node:
                                    co_detail = co_node.get("detail", {})
                                    co_path = co_detail.get("parentPath", [])
                                    category_info["历史共现分类"].append({
                                        "节点ID": co_target_id,
                                        "节点名称": co_node.get("name", ""),
                                        "节点分类": "/".join(co_path) if co_path else "",
                                        "节点维度": co_node.get("dimension", ""),
                                        "节点类型": "分类",
                                        "人设全局占比": co_detail.get("probGlobal", 0),
                                        "父类下占比": co_detail.get("probToParent", 0),
                                        "共现度": round(co_score, 4),
                                    })
                    if source_id not in match_map:
                        match_map[source_id] = []
                    match_map[source_id].append({
                        "匹配节点": match_node_info,
                        "匹配分数": round(match_score, 4),
                        "所属分类": category_info,
                    })

    # 5. Build the list of nodes to analyze
    analysis_nodes = []
    for node_id, node in nodes.items():
        if node.get("type") == "标签" and node.get("domain") == "帖子":
            dimension = node.get("dimension", "")
            if dimension in ["灵感点", "目的点", "关键点"]:
                match_info = match_map.get(node_id)
                analysis_nodes.append({
                    "节点ID": node_id,
                    "节点名称": node.get("name", ""),
                    "节点分类": node.get("category", ""),
                    "节点维度": dimension,
                    "节点类型": node.get("type", ""),
                    "节点描述": node.get("detail", {}).get("description", ""),
                    "人设匹配": match_info,
                })

    # 6. Build the relation list
    relation_list = []
    for edge_id, edge in edges.items():
        if edge.get("type") == "支撑":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            if source_id in keypoints:
                relation_list.append({
                    "来源节点": source_id,
                    "目标节点": target_id,
                    "关系类型": "支撑",
                })
    seen_relations = set()
    for edge_id, edge in edges.items():
        if edge.get("type") == "关联":
            source_id = edge.get("source", "")
            target_id = edge.get("target", "")
            key = tuple(sorted([source_id, target_id]))
            if key not in seen_relations:
                seen_relations.add(key)
                relation_list.append({
                    "来源节点": source_id,
                    "目标节点": target_id,
                    "关系类型": "关联",
                })

    return analysis_nodes, relation_list


def prepare_analysis_data(post_graph: Dict, persona_graph: Dict) -> Dict:
    """
    Prepare the full analysis payload.

    Emits a flattened node list plus standalone persona co-occurrence data.
    Node defaults: 是人设常量=False, 是否已知=False, 发现编号=None.
    """
    analysis_nodes, relation_list = extract_analysis_nodes(post_graph, persona_graph)
    # Flatten the nodes and pull out the persona co-occurrence data
    flat_nodes = []
    persona_co_occur = {}  # {category ID: list of co-occurring categories}
    for node in analysis_nodes:
        # Base node fields (是人设常量 defaults to False)
        flat_node = {
            "节点ID": node["节点ID"],
            "节点名称": node["节点名称"],
            "节点分类": node.get("节点分类", ""),
            "节点维度": node["节点维度"],
            "节点描述": node.get("节点描述", ""),
            "是否已知": False,
            "发现编号": None,
            "是人设常量": False,  # default; decided in step 2
        }
        # Extract persona match info (a list, to support multiple matches)
        match_list = node.get("人设匹配") or []
        if match_list:
            flat_node["人设匹配"] = []
            for match_info in match_list:
                match_score = match_info.get("匹配分数", 0)
                category_info = match_info.get("所属分类")
                category_id = category_info.get("节点ID") if category_info else None
                # Keep the full match info, minus 历史共现分类 (split out below)
                clean_match = {
                    "匹配节点": match_info.get("匹配节点"),
                    "匹配分数": match_score,
                }
                if category_info:
                    # Copy the category without its 历史共现分类 list
                    clean_category = {k: v for k, v in category_info.items() if k != "历史共现分类"}
                    clean_match["所属分类"] = clean_category
                flat_node["人设匹配"].append(clean_match)
                # Collect persona co-occurrence relations (deduplicated), split out of 历史共现分类
                if category_id and category_id not in persona_co_occur:
                    co_occur_list = category_info.get("历史共现分类", [])
                    if co_occur_list:
                        persona_co_occur[category_id] = [
                            {
                                "节点ID": c.get("节点ID"),
                                "节点名称": c.get("节点名称"),
                                "节点分类": c.get("节点分类", ""),
                                "节点维度": c.get("节点维度", ""),
                                "节点类型": c.get("节点类型", ""),
                                "人设全局占比": c.get("人设全局占比", 0),
                                "父类下占比": c.get("父类下占比", 0),
                                "共现度": c.get("共现度", 0),
                            }
                            for c in co_occur_list
                            if c.get("节点ID")
                        ]
        else:
            flat_node["人设匹配"] = []
        flat_nodes.append(flat_node)
    return {
        "帖子详情": extract_post_detail(post_graph),
        "节点列表": flat_nodes,
        "关系列表": relation_list,
        "人设共现关系": persona_co_occur,
    }
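

# Illustrative shape of the dict returned by prepare_analysis_data (keys are the
# real schema keys used above; the values here are invented):
#
#   {
#       "帖子详情": {"postId": "...", "postTitle": "...", ...},
#       "节点列表": [{"节点ID": "帖子:a", "节点维度": "灵感点", "是否已知": False,
#                     "发现编号": None, "是人设常量": False, "人设匹配": [...]}, ...],
#       "关系列表": [{"来源节点": "帖子:a", "目标节点": "帖子:b", "关系类型": "关联"}, ...],
#       "人设共现关系": {"人设:分类1": [{"节点ID": "人设:分类2", "共现度": 0.61, ...}]},
#   }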


# ===== Step 2: persona-constant detection =====
def identify_persona_constants(nodes: List[Dict]) -> Dict:
    """
    Identify persona constants.

    Rule: match score >= 0.8 AND the matched category's global ratio >= 0.7.
    Input: node list.
    Output: node list (with 是人设常量 / 是否已知 / 发现编号 updated) + the list of constants.
    """
    output_nodes = []
    persona_constants = []
    for node in nodes:
        new_node = dict(node)
        # Find the best match score and its category's global ratio
        match_list = node.get("人设匹配") or []
        best_match_score = 0
        best_global_ratio = 0
        for match_info in match_list:
            match_score = match_info.get("匹配分数", 0)
            category_info = match_info.get("所属分类")
            global_ratio = category_info.get("人设全局占比", 0) if category_info else 0
            if match_score > best_match_score:
                best_match_score = match_score
                best_global_ratio = global_ratio
        # Decide whether this node is a persona constant
        is_constant = (best_match_score >= MATCH_SCORE_THRESHOLD and
                       best_global_ratio >= GLOBAL_RATIO_THRESHOLD)
        if is_constant:
            new_node["是人设常量"] = True
            new_node["是否已知"] = True
            new_node["发现编号"] = 1  # persona constants get discovery number 1
            persona_constants.append(new_node["节点名称"])
        output_nodes.append(new_node)
    return {
        "输出节点": output_nodes,
        "人设常量": persona_constants,
    }
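

# Worked example of the constant rule (values invented): both thresholds must
# hold, so a 0.85 match into a category with 人设全局占比 0.75 is a constant,
# while a 0.90 match into a category at 0.40 is not.
#
#   node = {"节点名称": "demo", "人设匹配": [
#       {"匹配分数": 0.85, "所属分类": {"人设全局占比": 0.75}}]}
#   identify_persona_constants([node])["人设常量"]  # -> ["demo"]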


# ===== Step 3: origin analysis (new prompt) =====
def get_best_match(node: Dict) -> Optional[Dict]:
    """Return the node's best persona match (highest score)."""
    match_list = node.get("人设匹配") or []
    if not match_list:
        return None
    return max(match_list, key=lambda m: m.get("匹配分数", 0))


def get_match_score(node: Dict) -> float:
    """Return the node's highest persona match score."""
    best_match = get_best_match(node)
    if best_match:
        return best_match.get("匹配分数", 0)
    return 0


def get_category_id(node: Dict) -> Optional[str]:
    """Return the category ID of the node's best match."""
    best_match = get_best_match(node)
    if best_match:
        category = best_match.get("所属分类")
        if category:
            return category.get("节点ID")
    return None


def get_all_category_ids(node: Dict) -> List[str]:
    """Return the category IDs of all of the node's matches."""
    match_list = node.get("人设匹配") or []
    result = []
    for m in match_list:
        category = m.get("所属分类")
        if category and category.get("节点ID"):
            result.append(category.get("节点ID"))
    return result


def get_category_global_ratio(node: Dict) -> float:
    """Return the global persona ratio of the best match's category."""
    best_match = get_best_match(node)
    if best_match:
        category = best_match.get("所属分类")
        if category:
            return category.get("人设全局占比", 0)
    return 0


def is_persona_constant(node: Dict) -> bool:
    """Whether the node is a persona constant (match score >= 0.8 and category global ratio >= 0.7)."""
    match_score = get_match_score(node)
    global_ratio = get_category_global_ratio(node)
    return match_score >= MATCH_SCORE_THRESHOLD and global_ratio >= GLOBAL_RATIO_THRESHOLD


def build_origin_context(nodes: List[Dict]) -> Dict:
    """Build the context for the origin-analysis prompt (new format)."""
    # All creative tags (persona constants excluded)
    all_tags = []
    for node in nodes:
        if node.get("是人设常量"):
            continue  # skip persona constants
        all_tags.append({
            "名称": node["节点名称"],
            "人设匹配度": round(get_match_score(node), 2),
            "所属分类全局占比": round(get_category_global_ratio(node), 2),
        })
    # Origin candidate set (inspiration + purpose points, persona constants excluded)
    candidates = [
        node["节点名称"]
        for node in nodes
        if node["节点维度"] in ["灵感点", "目的点"] and not node.get("是人设常量")
    ]
    return {
        "all_tags": all_tags,
        "candidates": candidates,
    }


def format_origin_prompt(context: Dict) -> str:
    """Format the origin-analysis prompt (new version)."""
    all_tags = context["all_tags"]
    candidates = context["candidates"]
    # Creative tag list
    tags_text = ""
    for tag in all_tags:
        tags_text += f"- {tag['名称']}\n"
        tags_text += f" 人设匹配度: {tag['人设匹配度']} | 所属分类全局占比: {tag['所属分类全局占比']}\n\n"
    # Origin candidate set (one line)
    candidates_text = "、".join(candidates)
    prompt = f"""# Role
你是小红书爆款内容的"逆向工程"专家。你的核心能力是透过内容的表象,还原创作者最初的脑回路。
# Task
我提供一组笔记的【创意标签】和一个【起点候选集】。
请推理出哪些选项是真正的**创意起点**。
# Input Data
## 创意标签
{tags_text}
## 起点候选集
{candidates_text}
# 推理约束
- 无法被其他项或人设推理出的点,即为起点(推理关系局限在起点候选集中)
- 包含/被包含关系代表一种顺序:由大节点推导出被包含节点
- 目的推理手段
- 实质推理形式
- 和人设匹配度越低的帖子是起点概率越大,证明这个起点具备外部性
# Output Format
请输出一个标准的 JSON 格式。
- Key: 候选集中的词。
- Value: 一个对象,包含:
  - `score`: 0.0 到 1.0 的浮点数(代表是起点的可能性)。
  - `analysis`: 一句话推理"""
    return prompt
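

# Illustrative LLM response to the prompt above (shape follows the Output Format
# section; the candidate names and wording are invented):
#
#   {
#       "手冲咖啡": {"score": 0.9, "analysis": "无法由其他候选或人设推出,外部性强"},
#       "周末分享": {"score": 0.2, "analysis": "可由目的点推导出的表达形式"}
#   }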


async def analyze_origin(nodes: List[Dict], force_llm: bool = False, log_url: Optional[str] = None) -> Dict:
    """
    Run the origin analysis.

    Input: node list.
    Output: node list (with 起点分析 / 是否已知 / 发现编号 added) + intermediate results.
    """
    context = build_origin_context(nodes)
    prompt = format_origin_prompt(context)
    print(f"\n 起点候选: {len(context['candidates'])} 个")
    # Nothing to analyze: return early without calling the LLM
    if not context['candidates']:
        print(f" (无起点候选,跳过LLM分析)")
        return {
            "输入上下文": {
                "创意标签": context["all_tags"],
                "起点候选": context["candidates"],
            },
            "中间结果": {},
            "输出节点": nodes,
            "cache_hit": None,
            "model": None,
            "log_url": None,
        }
    result = await analyze(
        prompt=prompt,
        task_name=f"{TASK_NAME}/origin",
        force=force_llm,
        parse_json=True,
        log_url=log_url,
    )
    # Merge the analysis results back into the nodes
    llm_result = result.data or {}
    output_nodes = []
    # Nodes discovered in the same step share a discovery number
    step_order = 1  # discovery number for the origin-analysis step
    for node in nodes:
        new_node = dict(node)  # copy the original node
        name = node["节点名称"]
        # Skip nodes that are already known (persona constants)
        if node.get("是否已知"):
            output_nodes.append(new_node)
            continue
        if name in llm_result:
            score = llm_result[name].get("score", 0)
            analysis = llm_result[name].get("analysis", "")
            # Attach the origin analysis
            new_node["起点分析"] = {
                "分数": score,
                "说明": analysis,
            }
            # High-scoring origins become known (same step, same number)
            if score >= ORIGIN_SCORE_THRESHOLD:
                new_node["是否已知"] = True
                new_node["发现编号"] = step_order
        else:
            new_node["起点分析"] = None
        output_nodes.append(new_node)
    return {
        "输入上下文": {
            "创意标签": context["all_tags"],
            "起点候选": context["candidates"],
        },
        "中间结果": llm_result,
        "输出节点": output_nodes,
        "cache_hit": result.cache_hit,
        "model": result.model_name,
        "log_url": result.log_url,
    }


# ===== Helpers =====
def get_node_domain(node_id: str) -> str:
    """Extract the domain (帖子/人设) from a node ID prefix."""
    if node_id.startswith("帖子:"):
        return "帖子"
    elif node_id.startswith("人设:"):
        return "人设"
    return ""


# ===== Step 4: pattern derivation =====
def derive_patterns(
    nodes: List[Dict],
    persona_co_occur: Dict[str, Dict],
) -> Dict:
    """
    Iterative derivation over co-occurrence relations.

    Input: node list with origin analysis + persona co-occurrence data.
    Output: node list (with 是否已知 / 发现编号 updated) + derived edge list.
    """
    node_by_name: Dict[str, Dict] = {n["节点名称"]: n for n in nodes}
    # Build the co-occurrence lookup {category ID: {co-occurring category ID: info}}
    co_occur_lookup = {}
    for cat_id, co_occur_list in persona_co_occur.items():
        co_occur_lookup[cat_id] = {
            c["节点ID"]: {
                "共现度": c["共现度"],
                "节点ID": c.get("节点ID", ""),
                "节点名称": c.get("节点名称", ""),
                "节点维度": c.get("节点维度", ""),
            }
            for c in co_occur_list
        }

    def build_path_to_category(node: Dict) -> List[Dict]:
        """
        Build the path from a post tag to its persona category (nodes and edges).
        Returned format: [node, edge, node, edge, node, ...]
        """
        node_id = node["节点ID"]
        path = [{
            "类型": "节点",
            "节点ID": node_id,
            "节点名称": node["节点名称"],
            "节点类型": "标签",
            "节点维度": node.get("节点维度", ""),
            "节点域": get_node_domain(node_id),
        }]
        best_match = get_best_match(node)
        if not best_match:
            return path
        match_score = best_match.get("匹配分数", 0)
        match_node = best_match.get("匹配节点", {})
        category = best_match.get("所属分类", {})
        # If the match is a persona tag
        if match_node:
            node_type = match_node.get("节点类型", "")
            if node_type == "标签":
                # Add the match edge
                path.append({
                    "类型": "边",
                    "边类型": "匹配",
                    "分数": match_score,
                })
                # Add the persona tag node
                match_node_id = match_node.get("节点ID", "")
                path.append({
                    "类型": "节点",
                    "节点ID": match_node_id,
                    "节点名称": match_node.get("节点名称", ""),
                    "节点类型": "标签",
                    "节点维度": match_node.get("节点维度", ""),
                    "节点域": get_node_domain(match_node_id),
                })
                # Add the belongs-to edge
                if category:
                    path.append({
                        "类型": "边",
                        "边类型": "属于",
                        "分数": 1,
                    })
        # Add the category node
        if category:
            # If the match was the category itself, add the match edge here
            if not match_node or match_node.get("节点类型") != "标签":
                path.append({
                    "类型": "边",
                    "边类型": "匹配",
                    "分数": match_score,
                })
            category_id = category.get("节点ID", "")
            path.append({
                "类型": "节点",
                "节点ID": category_id,
                "节点名称": category.get("节点名称", ""),
                "节点类型": "分类",
                "节点维度": category.get("节点维度", ""),
                "节点域": get_node_domain(category_id),
            })
        return path

    # 1. Initialize the known set (nodes already marked known)
    known_names: Set[str] = set()
    node_round: Dict[str, int] = {}  # {node name: round in which it became known}
    for node in nodes:
        if node.get("是否已知"):
            known_names.add(node["节点名称"])
            node_round[node["节点名称"]] = 0
    unknown_names: Set[str] = set(node_by_name.keys()) - known_names
    edges: List[Dict] = []

    # 2. Iterative derivation
    round_num = 0
    new_known_this_round = known_names.copy()
    while new_known_this_round:
        round_num += 1
        new_known_next_round: Set[str] = set()
        for known_name in new_known_this_round:
            known_node = node_by_name.get(known_name)
            if not known_node:
                continue
            if get_match_score(known_node) < MATCH_SCORE_THRESHOLD:
                continue
            # Look up the co-occurrence list of this node's category
            known_cat_id = get_category_id(known_node)
            if not known_cat_id or known_cat_id not in co_occur_lookup:
                continue
            co_occur_map = co_occur_lookup[known_cat_id]
            for unknown_name in list(unknown_names):
                unknown_node = node_by_name.get(unknown_name)
                if not unknown_node:
                    continue
                if get_match_score(unknown_node) < MATCH_SCORE_THRESHOLD:
                    continue
                # Is the unknown node's category in the known node's co-occurrence list?
                unknown_cat_id = get_category_id(unknown_node)
                if unknown_cat_id and unknown_cat_id in co_occur_map:
                    co_occur_info = co_occur_map[unknown_cat_id]
                    co_occur_score = co_occur_info["共现度"]
                    new_known_next_round.add(unknown_name)
                    node_round[unknown_name] = round_num
                    # Dynamically build the derivation path (nodes and edges)
                    # Source side: post tag -match-> [persona tag -belongs-to->] persona category
                    source_path = build_path_to_category(known_node)
                    # Add the co-occurrence edge
                    source_path.append({
                        "类型": "边",
                        "边类型": "共现",
                        "分数": co_occur_score,
                    })
                    # Add the co-occurring category node
                    co_occur_node_id = co_occur_info["节点ID"]
                    source_path.append({
                        "类型": "节点",
                        "节点ID": co_occur_node_id,
                        "节点名称": co_occur_info["节点名称"],
                        "节点类型": "分类",
                        "节点维度": co_occur_info.get("节点维度", ""),
                        "节点域": get_node_domain(co_occur_node_id),
                    })
                    # Target side: persona category -> [persona tag] -> post tag (reversed)
                    target_path = build_path_to_category(unknown_node)
                    target_path.reverse()
                    # Drop the target path's leading category node (the co-occurring
                    # category already stands in for it) but keep the edges
                    if len(target_path) > 0 and target_path[0].get("类型") == "节点":
                        target_path = target_path[1:]
                    # Concatenate into the full path
                    full_path = source_path + target_path
                    edges.append({
                        "来源": known_node["节点ID"],
                        "目标": unknown_node["节点ID"],
                        "关系类型": "共现推导",
                        "score": co_occur_score,
                        "推导轮次": round_num,
                        "推导路径": full_path,
                    })
        known_names.update(new_known_next_round)
        unknown_names -= new_known_next_round
        new_known_this_round = new_known_next_round
        if not new_known_next_round:
            break

    # 3. Build the output nodes (only 是否已知 / 发现编号 change)
    # Find the current maximum discovery number
    max_order = 0
    for node in nodes:
        if node.get("发现编号") and node["发现编号"] > max_order:
            max_order = node["发现编号"]
    # Group newly derived nodes by round so discovery numbers follow round order
    new_known_by_round = {}
    for name, r in node_round.items():
        if r > 0:  # exclude starting points (round 0)
            if r not in new_known_by_round:
                new_known_by_round[r] = []
            new_known_by_round[r].append(name)
    # Assign discovery numbers (nodes in the same round share one)
    order_map = {}
    for r in sorted(new_known_by_round.keys()):
        step_order = max_order + r  # same round, same number
        for name in new_known_by_round[r]:
            order_map[name] = step_order
    output_nodes = []
    for node in nodes:
        new_node = dict(node)
        name = node["节点名称"]
        # Newly derived nodes (round > 0) become known with their round's number
        if name in node_round and node_round[name] > 0:
            new_node["是否已知"] = True
            new_node["发现编号"] = order_map.get(name)
        output_nodes.append(new_node)
    return {
        "输出节点": output_nodes,
        "推导边列表": edges,
        "推导轮次": round_num,
    }
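

# Illustrative record appended to 推导边列表 above (values invented). The
# 推导路径 alternates 节点/边 entries as produced by build_path_to_category:
#
#   {
#       "来源": "帖子:a", "目标": "帖子:b", "关系类型": "共现推导",
#       "score": 0.61, "推导轮次": 1,
#       "推导路径": [
#           {"类型": "节点", "节点ID": "帖子:a", ...},
#           {"类型": "边", "边类型": "匹配", "分数": 0.9},
#           {"类型": "节点", "节点ID": "人设:分类1", ...},
#           {"类型": "边", "边类型": "共现", "分数": 0.61},
#           {"类型": "节点", "节点ID": "人设:分类2", ...},
#           {"类型": "边", "边类型": "匹配", "分数": 0.85},
#           {"类型": "节点", "节点ID": "帖子:b", ...},
#       ],
#   }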


# ===== Step 5: next-step analysis =====
def build_next_step_context(known_nodes: List[Dict], unknown_nodes: List[Dict], all_nodes: List[Dict]) -> Dict:
    """Build the context for the next-step prompt (simplified)."""
    # Known points (sorted by discovery order; name and dimension only)
    known_sorted = sorted(known_nodes, key=lambda n: n.get("发现编号") or 999)
    known_info = [
        {"名称": n["节点名称"], "维度": n["节点维度"]}
        for n in known_sorted
    ]
    # Unknown points (name and dimension only)
    unknown_info = [
        {"名称": n["节点名称"], "维度": n["节点维度"]}
        for n in unknown_nodes
    ]
    return {
        "known_nodes": known_info,
        "unknown_nodes": unknown_info,
    }


def format_next_step_prompt(context: Dict) -> str:
    """Format the next-step prompt (simplified)."""
    # Known points rendered as "- name (dimension)"
    known_text = "\n".join([
        f"- {n['名称']} ({n['维度']})"
        for n in context["known_nodes"]
    ])
    # Unknown points rendered as "- name (dimension)"
    unknown_text = "\n".join([
        f"- {n['名称']} ({n['维度']})"
        for n in context["unknown_nodes"]
    ])
    prompt = f"""# Role
你是小红书爆款内容的"逆向工程"专家。你的任务是还原创作者的思维路径。
# Task
基于已知的创意点,推理哪些未知点最可能是创作者**下一步直接想到**的点。
可以有多个点同时被想到(如果它们在逻辑上是并列的)。
## 已知点
{known_text}
## 未知点(待推理)
{unknown_text}
# 推理约束
- 创作者的思维是有逻辑的:先有实质,再想形式
- 包含/被包含关系代表一种顺序:由大节点推导出被包含节点
- 只输出"下一步直接能想到"的点,不是所有未知点
# Output Format
输出 JSON,对每个未知点评分:
- Key: 未知点名称
- Value: 对象,包含:
  - `score`: 0.0-1.0(下一步被想到的可能性)
  - `from`: 从哪个已知点推导出来(已知点名称),数组
  - `reason`: 如何从该已知点推导出来(一句话)"""
    return prompt
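

# Illustrative LLM response to the prompt above, matching the parsing in
# analyze_next_step below (names and wording invented):
#
#   {
#       "拍摄手法": {"score": 0.85, "from": ["周末出游"],
#                    "reason": "确定了实质内容后自然想到呈现形式"}
#   }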


async def analyze_next_step(
    nodes: List[Dict],
    force_llm: bool = False,
    log_url: Optional[str] = None,
) -> Dict:
    """
    Run the next-step analysis.

    Input: node list (mix of known and unknown).
    Output: the most likely next points.
    """
    # Split into known and unknown
    known_nodes = [n for n in nodes if n.get("是否已知")]
    unknown_nodes = [n for n in nodes if not n.get("是否已知")]
    if not unknown_nodes:
        # Keep the same keys as the normal return so callers can index safely
        return {
            "输入上下文": {"已知点": [], "未知点": []},
            "中间结果": {},
            "下一步候选": [],
            "cache_hit": None,
            "model": None,
            "log_url": None,
        }
    context = build_next_step_context(known_nodes, unknown_nodes, nodes)
    prompt = format_next_step_prompt(context)
    print(f"\n 已知点: {len(known_nodes)} 个")
    print(f" 未知点: {len(unknown_nodes)} 个")
    result = await analyze(
        prompt=prompt,
        task_name=f"{TASK_NAME}/next_step",
        force=force_llm,
        parse_json=True,
        log_url=log_url,
    )
    # Parse the result (now a {name: {score, from, reason}} mapping)
    llm_result = result.data or {}
    # Build the candidate list, sorted by score
    candidates = []
    for name, info in llm_result.items():
        # "from" is now an array
        from_list = info.get("from", [])
        if isinstance(from_list, str):
            from_list = [from_list]  # backward compatibility with the old string format
        candidates.append({
            "节点名称": name,
            "可能性分数": info.get("score", 0),
            "推导来源": from_list,
            "推理说明": info.get("reason", ""),
        })
    candidates.sort(key=lambda x: x["可能性分数"], reverse=True)
    return {
        "输入上下文": {
            "已知点": context["known_nodes"],
            "未知点": context["unknown_nodes"],
        },
        "中间结果": llm_result,
        "下一步候选": candidates,
        "cache_hit": result.cache_hit,
        "model": result.model_name,
        "log_url": result.log_url,
    }


# ===== Full pipeline =====
def save_result(post_id: str, post_detail: Dict, steps: List, config: PathConfig) -> Path:
    """Save the results to a file."""
    output_dir = config.intermediate_dir / OUTPUT_DIR_NAME
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{post_id}_点顺序.json"
    result = {
        "帖子详情": post_detail,
        "步骤列表": steps,
    }
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f" [已保存] {output_file.name}")
    return output_file
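

# Output location sketch (assuming PathConfig.intermediate_dir points at the
# account's intermediate folder):
#   <intermediate_dir>/point_order_v5/<postId>_点顺序.json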


async def process_single_post(
    post_file: Path,
    persona_graph: Dict,
    config: PathConfig,
    force_llm: bool = False,
    max_step: int = 6,
    log_url: Optional[str] = None,
) -> Dict:
    """
    Process a single post.

    Args:
        force_llm: force a fresh LLM call (bypass the LLM cache).
        max_step: run up to this step (1=data preparation, 2=persona constants,
            3=origin analysis, 4=pattern derivation, 5=next-step analysis, 6=full loop).
    """
    post_graph = load_json(post_file)
    post_id = post_graph.get("meta", {}).get("postId", "unknown")
    print(f"\n{'=' * 60}")
    print(f"处理帖子: {post_id}")
    print("-" * 60)
    steps = []

    # ===== Step 1: data preparation =====
    print("\n[步骤1] 数据准备...")
    data = prepare_analysis_data(post_graph, persona_graph)
    post_detail = data["帖子详情"]
    nodes_step1 = data["节点列表"]
    relations_step1 = data["关系列表"]
    persona_co_occur = data["人设共现关系"]
    step1 = {
        "步骤": "数据准备",
        "输入": {
            "帖子图谱": str(post_file.name),
            "人设图谱": "人设图谱.json",
        },
        "输出": {
            "新的已知节点": [],
            "新的边": [],
            "节点列表": nodes_step1,
            "边列表": relations_step1,
        },
        "人设共现关系": persona_co_occur,
        "摘要": {
            "节点数": len(nodes_step1),
            "边数": len(relations_step1),
            "人设共现数": len(persona_co_occur),
        },
    }
    steps.append(step1)
    print(f" 节点数: {len(nodes_step1)}")
    print(f" 关系数: {len(relations_step1)}")
    print(f" 人设共现数: {len(persona_co_occur)}")
    # Step 1 done; save
    save_result(post_id, post_detail, steps, config)
    if max_step == 1:
        return {"帖子详情": post_detail, "步骤列表": steps}

    # ===== Step 2: persona-constant detection =====
    print("\n[步骤2] 人设常量判断...")
    constant_result = identify_persona_constants(nodes_step1)
    nodes_step2 = constant_result["输出节点"]
    persona_constants = constant_result["人设常量"]
    step2 = {
        "步骤": "人设常量判断",
        "输入": {
            "节点列表": nodes_step1,
        },
        "输出": {
            "新的已知节点": persona_constants,
            "新的边": [],
            "节点列表": nodes_step2,
            "边列表": relations_step1,
        },
        "人设常量": persona_constants,
        "摘要": {
            "人设常量数": len(persona_constants),
        },
    }
    steps.append(step2)
    print(f" 人设常量: {len(persona_constants)} 个")
    if persona_constants:
        for name in persona_constants:
            print(f" ◆ {name}")
    # Step 2 done; save
    save_result(post_id, post_detail, steps, config)
    if max_step == 2:
        return {"帖子详情": post_detail, "步骤列表": steps}

    # ===== Step 3: origin analysis =====
    print("\n[步骤3] 起点分析...")
    origin_result = await analyze_origin(nodes_step2, force_llm=force_llm, log_url=log_url)
    nodes_step3 = origin_result["输出节点"]

    # Tally high-scoring origins (persona constants excluded)
    def get_origin_score(node):
        analysis = node.get("起点分析")
        if analysis:
            return analysis.get("分数", 0)
        return 0

    high_score_origins = [
        (n["节点名称"], get_origin_score(n))
        for n in nodes_step3
        if get_origin_score(n) >= 0.7 and not n.get("是人设常量")
    ]
    # Newly known nodes (origins; persona constants excluded)
    prev_known = {n["节点名称"] for n in nodes_step2 if n.get("是否已知")}
    new_known_nodes = [n["节点名称"] for n in nodes_step3 if n.get("是否已知") and n["节点名称"] not in prev_known]
    step3 = {
        "步骤": "起点分析",
        "输入": {
            "节点列表": nodes_step2,
            "创意标签": origin_result["输入上下文"]["创意标签"],
            "起点候选": origin_result["输入上下文"]["起点候选"],
        },
        "中间结果": origin_result["中间结果"],
        "输出": {
            "新的已知节点": new_known_nodes,
            "新的边": [],
            "节点列表": nodes_step3,
            "边列表": relations_step1,  # edges unchanged in this step
        },
        "摘要": {
            "新已知数": len(new_known_nodes),
            "model": origin_result["model"],
            "cache_hit": origin_result["cache_hit"],
            "log_url": origin_result.get("log_url"),
        },
    }
    steps.append(step3)
    print(f" 高分起点 (>=0.7): {len(high_score_origins)} 个")
    for name, score in sorted(high_score_origins, key=lambda x: -x[1]):
        print(f" ★ {name}: {score:.2f}")
    # Step 3 done; save
    save_result(post_id, post_detail, steps, config)
    if max_step == 3:
        return {"帖子详情": post_detail, "步骤列表": steps}

    # ===== Step 4: pattern derivation =====
    print("\n[步骤4] 模式推导...")
    derivation_result = derive_patterns(nodes_step3, persona_co_occur)
    nodes_step4 = derivation_result["输出节点"]
    edges = derivation_result["推导边列表"]
    # Tally
    known_count = sum(1 for n in nodes_step4 if n.get("是否已知"))
    unknown_count = len(nodes_step4) - known_count
    # Newly known nodes (derived in this step; earlier origins excluded)
    prev_known = {n["节点名称"] for n in nodes_step3 if n.get("是否已知")}
    new_known_nodes = [n["节点名称"] for n in nodes_step4 if n.get("是否已知") and n["节点名称"] not in prev_known]
    # Merge the edge lists (existing edges + derived edges)
    all_edges = relations_step1 + edges
    step4 = {
        "步骤": "模式推导",
        "输入": {
            "节点列表": nodes_step3,
            "人设共现关系": persona_co_occur,
        },
        "输出": {
            "新的已知节点": new_known_nodes,
            "新的边": edges,
            "节点列表": nodes_step4,
            "边列表": all_edges,
        },
        "摘要": {
            "已知点数": known_count,
            "新已知数": len(new_known_nodes),
            "新边数": len(edges),
            "未知点数": unknown_count,
        },
    }
    steps.append(step4)
    print(f" 已知点: {known_count} 个")
    print(f" 推导边: {len(edges)} 条")
    print(f" 未知点: {unknown_count} 个")
    # Step 4 done; save
    save_result(post_id, post_detail, steps, config)
    if max_step == 4:
        return {"帖子详情": post_detail, "步骤列表": steps}

    # ===== Step 5: next-step analysis =====
    print("\n[步骤5] 下一步分析...")
    next_step_result = await analyze_next_step(nodes_step4, force_llm=force_llm, log_url=log_url)
    # Get the candidate list
    candidates = next_step_result["下一步候选"]
    # Keep only high-scoring candidates (>= 0.8)
    NEXT_STEP_THRESHOLD = 0.8
    high_score_candidates = [c for c in candidates if c["可能性分数"] >= NEXT_STEP_THRESHOLD]
    # Map node names to nodes
    node_by_name = {n["节点名称"]: n for n in nodes_step4}
    # Find the current maximum discovery number
    max_order = max((n.get("发现编号") or 0) for n in nodes_step4)
    # Update nodes: mark high-scoring candidates as known (same step, same number)
    nodes_step5 = []
    new_known_names = []
    step_order = max_order + 1  # nodes from the same step share one number
    for node in nodes_step4:
        new_node = dict(node)
        name = node["节点名称"]
        # Is this node among the high-scoring candidates?
        matching = [c for c in high_score_candidates if c["节点名称"] == name]
        if matching and not node.get("是否已知"):
            new_node["是否已知"] = True
            new_node["发现编号"] = step_order
            new_known_names.append(name)
        nodes_step5.append(new_node)
    # Create new edges ("from" is an array: one edge per source)
    new_edges = []
    for c in high_score_candidates:
        target_node = node_by_name.get(c["节点名称"])
        if not target_node:
            continue
        for source_name in c["推导来源"]:
            source_node = node_by_name.get(source_name)
            if source_node:
                new_edges.append({
                    "来源": source_node["节点ID"],
                    "目标": target_node["节点ID"],
                    "关系类型": "AI推导",
                    "score": c["可能性分数"],
                    "推理说明": c["推理说明"],
                    "推导路径": [
                        {
                            "类型": "节点",
                            "节点ID": source_node["节点ID"],
                            "节点名称": source_node["节点名称"],
                            "节点类型": "标签",
                            "节点维度": source_node["节点维度"],
                            "节点域": get_node_domain(source_node["节点ID"]),
                        },
                        {
                            "类型": "边",
                            "边类型": "AI推导",
                            "分数": c["可能性分数"],
                        },
                        {
                            "类型": "节点",
                            "节点ID": target_node["节点ID"],
                            "节点名称": target_node["节点名称"],
                            "节点类型": "标签",
                            "节点维度": target_node["节点维度"],
                            "节点域": get_node_domain(target_node["节点ID"]),
                        },
                    ],
                })
    # Merge the edge lists
    all_edges_step5 = all_edges + new_edges
    step5 = {
        "步骤": "下一步分析",
        "输入": {
            "已知点": next_step_result["输入上下文"]["已知点"],
            "未知点": next_step_result["输入上下文"]["未知点"],
        },
        "中间结果": next_step_result["中间结果"],
        "输出": {
            "新的已知节点": new_known_names,
            "新的边": new_edges,
            "节点列表": nodes_step5,
            "边列表": all_edges_step5,
        },
        "摘要": {
            "已知点数": sum(1 for n in nodes_step5 if n.get("是否已知")),
            "新已知数": len(new_known_names),
            "新边数": len(new_edges),
            "未知点数": sum(1 for n in nodes_step5 if not n.get("是否已知")),
            "model": next_step_result.get("model"),
            "cache_hit": next_step_result.get("cache_hit"),
            "log_url": next_step_result.get("log_url"),
        },
    }
    steps.append(step5)
    # Print the high-scoring candidates
    print(f" 候选数: {len(candidates)} 个")
    print(f" 高分候选 (>={NEXT_STEP_THRESHOLD}): {len(high_score_candidates)} 个")
    for c in high_score_candidates:
        from_str = " & ".join(c["推导来源"])
        print(f" ★ {c['节点名称']} ({c['可能性分数']:.2f}) ← {from_str}")
        print(f" {c['推理说明']}")
    # Step 5 done; save
    save_result(post_id, post_detail, steps, config)
    if max_step == 5:
        return {"帖子详情": post_detail, "步骤列表": steps}

    # ===== Loop: steps 4 -> 5 until everything is known =====
    iteration = 1
    current_nodes = nodes_step5
    current_edges = all_edges_step5
    MAX_ITERATIONS = 10  # guard against an infinite loop
    while True:
        # Any unknown nodes left?
        unknown_count = sum(1 for n in current_nodes if not n.get("是否已知"))
        if unknown_count == 0:
            print(f"\n[完成] 所有节点已变为已知")
            break
        if iteration > MAX_ITERATIONS:
            print(f"\n[警告] 达到最大迭代次数 {MAX_ITERATIONS},停止循环")
            break

        # ===== Iterated step 4: co-occurrence derivation =====
        print(f"\n[迭代{iteration}-步骤4] 模式推导...")
        derivation_result = derive_patterns(current_nodes, persona_co_occur)
        nodes_iter3 = derivation_result["输出节点"]
        edges_iter3 = derivation_result["推导边列表"]
        # Tally the new derivations
        prev_known_names = {n["节点名称"] for n in current_nodes if n.get("是否已知")}
        new_known_step3 = [n["节点名称"] for n in nodes_iter3 if n.get("是否已知") and n["节点名称"] not in prev_known_names]
        new_edges_step3 = edges_iter3  # derive_patterns returns only this round's new edges
        all_edges_iter3 = current_edges + new_edges_step3
        step_iter3 = {
            "步骤": f"迭代{iteration}-模式推导",
            "输入": {
                "节点列表": current_nodes,
                "人设共现关系": persona_co_occur,
            },
            "输出": {
                "新的已知节点": new_known_step3,
                "新的边": new_edges_step3,
                "节点列表": nodes_iter3,
                "边列表": all_edges_iter3,
            },
            "摘要": {
                "已知点数": sum(1 for n in nodes_iter3 if n.get("是否已知")),
                "新已知数": len(new_known_step3),
                "新边数": len(new_edges_step3),
                "未知点数": sum(1 for n in nodes_iter3 if not n.get("是否已知")),
            },
        }
        steps.append(step_iter3)
        print(f" 新已知: {len(new_known_step3)} 个")
        print(f" 新边: {len(new_edges_step3)} 条")
        save_result(post_id, post_detail, steps, config)
        # Any unknown nodes left after derivation?
        unknown_after_step3 = sum(1 for n in nodes_iter3 if not n.get("是否已知"))
        if unknown_after_step3 == 0:
            print(f"\n[完成] 所有节点已变为已知")
            break

        # ===== Iterated step 5: LLM derivation =====
        print(f"\n[迭代{iteration}-步骤5] 下一步分析...")
        next_step_result = await analyze_next_step(nodes_iter3, force_llm=force_llm, log_url=log_url)
        candidates_iter4 = next_step_result["下一步候选"]
        high_score_iter4 = [c for c in candidates_iter4 if c["可能性分数"] >= NEXT_STEP_THRESHOLD]
        # Update the nodes (nodes from the same step share one discovery number)
        node_by_name_iter4 = {n["节点名称"]: n for n in nodes_iter3}
        max_order_iter4 = max((n.get("发现编号") or 0) for n in nodes_iter3)
        nodes_iter4 = []
        new_known_iter4 = []
        step_order_iter4 = max_order_iter4 + 1
        for node in nodes_iter3:
            new_node = dict(node)
            name = node["节点名称"]
            matching = [c for c in high_score_iter4 if c["节点名称"] == name]
            if matching and not node.get("是否已知"):
                new_node["是否已知"] = True
                new_node["发现编号"] = step_order_iter4
                new_known_iter4.append(name)
            nodes_iter4.append(new_node)
        # Create new edges ("from" is an array: one edge per source)
        new_edges_iter4 = []
        for c in high_score_iter4:
            target_node = node_by_name_iter4.get(c["节点名称"])
            if not target_node:
                continue
            for source_name in c["推导来源"]:
                source_node = node_by_name_iter4.get(source_name)
                if source_node:
                    new_edges_iter4.append({
                        "来源": source_node["节点ID"],
                        "目标": target_node["节点ID"],
                        "关系类型": "AI推导",
                        "score": c["可能性分数"],
                        "推理说明": c["推理说明"],
                        "推导路径": [
                            {
                                "类型": "节点",
                                "节点ID": source_node["节点ID"],
                                "节点名称": source_node["节点名称"],
                                "节点类型": "标签",
                                "节点维度": source_node["节点维度"],
                                "节点域": get_node_domain(source_node["节点ID"]),
                            },
                            {
                                "类型": "边",
                                "边类型": "AI推导",
                                "分数": c["可能性分数"],
                            },
                            {
                                "类型": "节点",
                                "节点ID": target_node["节点ID"],
                                "节点名称": target_node["节点名称"],
                                "节点类型": "标签",
                                "节点维度": target_node["节点维度"],
                                "节点域": get_node_domain(target_node["节点ID"]),
                            },
                        ],
                    })
        all_edges_iter4 = all_edges_iter3 + new_edges_iter4
        step_iter4 = {
            "步骤": f"迭代{iteration}-下一步分析",
            "输入": {
                "已知点": next_step_result["输入上下文"]["已知点"],
                "未知点": next_step_result["输入上下文"]["未知点"],
            },
            "中间结果": next_step_result["中间结果"],
            "输出": {
                "新的已知节点": new_known_iter4,
                "新的边": new_edges_iter4,
                "节点列表": nodes_iter4,
                "边列表": all_edges_iter4,
            },
            "摘要": {
                "已知点数": sum(1 for n in nodes_iter4 if n.get("是否已知")),
                "新已知数": len(new_known_iter4),
                "新边数": len(new_edges_iter4),
                "未知点数": sum(1 for n in nodes_iter4 if not n.get("是否已知")),
                "model": next_step_result.get("model"),
                "cache_hit": next_step_result.get("cache_hit"),
            },
        }
        steps.append(step_iter4)
        print(f" 新已知: {len(new_known_iter4)} 个")
        print(f" 新边: {len(new_edges_iter4)} 条")
        save_result(post_id, post_detail, steps, config)
        # Stop if this round made no progress
        if len(new_known_step3) == 0 and len(new_known_iter4) == 0:
            print(f"\n[停止] 本轮无新进展,停止循环")
            break
        # Advance to the next round
        current_nodes = nodes_iter4
        current_edges = all_edges_iter4
        iteration += 1
    return {"帖子详情": post_detail, "步骤列表": steps}


# ===== Main =====
async def main(
    post_id: Optional[str] = None,
    all_posts: bool = False,
    force_llm: bool = False,
    max_step: int = 6,
):
    """Entry point."""
    _, log_url = set_trace()
    config = PathConfig()
    print(f"账号: {config.account_name}")
    print(f"Trace URL: {log_url}")
    print(f"输出目录: {OUTPUT_DIR_NAME}")
    # Load the persona graph
    persona_graph_file = config.intermediate_dir / "人设图谱.json"
    if not persona_graph_file.exists():
        print(f"错误: 人设图谱文件不存在: {persona_graph_file}")
        return
    persona_graph = load_json(persona_graph_file)
    print(f"人设图谱节点数: {len(persona_graph.get('nodes', {}))}")
    # Collect the post-graph files
    post_graph_files = get_post_graph_files(config)
    if not post_graph_files:
        print("错误: 没有找到帖子图谱文件")
        return
    # Decide which posts to process
    if post_id:
        target_file = next(
            (f for f in post_graph_files if post_id in f.name),
            None
        )
        if not target_file:
            print(f"错误: 未找到帖子 {post_id}")
            return
        files_to_process = [target_file]
    elif all_posts:
        files_to_process = post_graph_files
    else:
        files_to_process = [post_graph_files[0]]
    print(f"待处理帖子数: {len(files_to_process)}")
    # Process
    results = []
    for i, post_file in enumerate(files_to_process, 1):
        print(f"\n{'#' * 60}")
        print(f"# 处理帖子 {i}/{len(files_to_process)}")
        print(f"{'#' * 60}")
        result = await process_single_post(
            post_file=post_file,
            persona_graph=persona_graph,
            config=config,
            force_llm=force_llm,
            max_step=max_step,
            log_url=log_url,
        )
        results.append(result)
    # Summary (each branch reads only keys that step's 摘要 actually contains)
    print(f"\n{'#' * 60}")
    print(f"# 完成! 共处理 {len(results)} 个帖子")
    print(f"{'#' * 60}")
    print(f"Trace: {log_url}")
    print("\n汇总:")
    for result in results:
        post_id = result["帖子详情"]["postId"]
        steps = result.get("步骤列表", [])
        num_steps = len(steps)
        if num_steps == 1:
            step1_summary = steps[0].get("摘要", {})
            print(f" {post_id}: 节点数={step1_summary.get('节点数', 0)}, "
                  f"人设共现数={step1_summary.get('人设共现数', 0)} (仅数据准备)")
        elif num_steps == 2:
            step2_summary = steps[1].get("摘要", {})
            print(f" {post_id}: 人设常量={step2_summary.get('人设常量数', 0)} (未推导)")
        elif num_steps == 3:
            step3_summary = steps[2].get("摘要", {})
            print(f" {post_id}: 起点={step3_summary.get('新已知数', 0)} (未推导)")
        elif num_steps >= 4:
            step4_summary = steps[3].get("摘要", {})
            print(f" {post_id}: 已知={step4_summary.get('已知点数', 0)}, "
                  f"新已知={step4_summary.get('新已知数', 0)}, "
                  f"新边={step4_summary.get('新边数', 0)}, "
                  f"未知={step4_summary.get('未知点数', 0)}")
        else:
            print(f" {post_id}: 无步骤数据")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="选题点顺序分析")
    parser.add_argument("--post-id", type=str, help="帖子ID")
    parser.add_argument("--all-posts", action="store_true", help="处理所有帖子")
    parser.add_argument("--force-llm", action="store_true", help="强制重新调用LLM(跳过LLM缓存)")
    parser.add_argument("--step", type=int, default=6, choices=[1, 2, 3, 4, 5, 6],
                        help="运行到第几步 (1=数据准备, 2=人设常量判断, 3=起点分析, 4=模式推导, 5=下一步分析, 6=完整循环)")
    args = parser.parse_args()
    asyncio.run(main(
        post_id=args.post_id,
        all_posts=args.all_posts,
        force_llm=args.force_llm,
        max_step=args.step,
    ))