#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Node origin analysis script, V4.

Two-step approach:
1. Step 1 (filter): screen candidate features that could plausibly be sources.
2. Step 2 (evaluate): score how likely each filtered feature is to derive the target.

Input:  post graph files in the post_graph directory
Output: node origin analysis results
"""
import asyncio
import json
import sys
from pathlib import Path
from typing import Dict, List

# Add the project root to sys.path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from agents import Agent, Runner, ModelSettings, trace
from agents.tracing.create import custom_span
from lib.client import get_model
from lib.my_trace import set_trace_smith as set_trace
from script.data_processing.path_config import PathConfig

# Model configuration
MODEL_NAME = "google/gemini-3-pro-preview"
# MODEL_NAME = "deepseek/deepseek-v3.2"
# MODEL_NAME = "anthropic/claude-sonnet-4.5"
# Step 1: filter agent
filter_agent = Agent(
    name="Feature Filter",
    model=get_model(MODEL_NAME),
    model_settings=ModelSettings(
        temperature=0.0,  # deterministic output for reproducible filtering
        max_tokens=16384,
    ),
    tools=[],
)

# Step 2: evaluation agent
evaluate_agent = Agent(
    name="Feature Evaluator",
    model=get_model(MODEL_NAME),
    model_settings=ModelSettings(
        temperature=0.0,
        max_tokens=32768,  # evaluation output is longer: scores plus reasoning
    ),
    tools=[],
)
# ===== Data extraction =====
def get_post_graph_files(config: PathConfig) -> List[Path]:
    """Return all post graph files, sorted by name."""
    post_graph_dir = config.intermediate_dir / "post_graph"
    return sorted(post_graph_dir.glob("*_帖子图谱.json"))


def load_post_graph(file_path: Path) -> Dict:
    """Load a post graph from disk."""
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def extract_tags_from_post_graph(post_graph: Dict) -> List[Dict]:
    """Extract tag nodes (type == 标签, domain == 帖子) from a post graph."""
    tags = []
    for node_id, node in post_graph.get("nodes", {}).items():
        if node.get("type") == "标签" and node.get("domain") == "帖子":
            tags.append({
                "id": node_id,
                "name": node.get("name", ""),
                "dimension": node.get("dimension", ""),
            })
    return tags
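
# For reference, the post-graph shape this script relies on, inferred from the
# field accesses above and in analyze_node_origin (a sketch, not a full schema;
# fields beyond these are not assumed):
#
#   {
#     "meta":  {"postId": "..."},
#     "nodes": {
#       "<node_id>": {"name": "...", "type": "标签", "dimension": "关键点", "domain": "帖子"}
#     }
#   }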
def prepare_analyze_input(post_graph: Dict, target_name: str = None) -> Dict:
    """Prepare the analysis input: resolve the target tag and its candidate set."""
    tags = extract_tags_from_post_graph(post_graph)
    if not tags:
        raise ValueError("No tag nodes found in the post graph")

    # Resolve the target node
    if target_name:
        target_tag = next((t for t in tags if t["name"] == target_name), None)
        if not target_tag:
            raise ValueError(f"Target node not found: {target_name}")
    else:
        key_point_tags = [t for t in tags if t["dimension"] == "关键点"]
        if not key_point_tags:
            raise ValueError("No 关键点 (key point) tags found")
        target_tag = key_point_tags[0]

    # Candidate filtering: for 灵感点/目的点 targets, exclude 关键点 candidates
    target_dimension = target_tag["dimension"]
    candidate_tags = []
    for t in tags:
        if t["name"] == target_tag["name"]:
            continue
        if target_dimension in ["灵感点", "目的点"] and t["dimension"] == "关键点":
            continue
        candidate_tags.append(t)

    return {
        "目标特征": {
            "特征名称": target_tag["name"],
            "特征类型": target_tag["dimension"],
        },
        "候选特征": [
            {
                "特征名称": t["name"],
                "特征类型": t["dimension"],
            }
            for t in candidate_tags
        ],
    }
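
# Example return value (shape only; feature names are illustrative placeholders):
#
#   {
#     "目标特征": {"特征名称": "特征A", "特征类型": "关键点"},
#     "候选特征": [
#       {"特征名称": "特征B", "特征类型": "灵感点"},
#       {"特征名称": "特征C", "特征类型": "目的点"}
#     ]
#   }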
# ===== Prompt construction =====
def build_filter_prompt(input_data: Dict) -> str:
    """Build the Step 1 (filter) prompt."""
    target = input_data["目标特征"]
    candidates = input_data["候选特征"]

    # Render the candidate feature list
    candidates_text = []
    for c in candidates:
        candidates_text.append(f"- {c['特征名称']} ({c['特征类型']})")
    candidates_section = "\n".join(candidates_text)

    # Note: the prompt and the JSON keys it specifies stay in Chinese; the
    # downstream parsing code looks up these exact keys.
    return f'''# 背景
推理一个小红书帖子选题前脑海中的点,在创作者脑中的因果顺序
# Task
请分析【输入数据】与【目标点】的关系,按以下两类筛选证据:
1. **单独推理**:哪个特征单凭自己就能有可能指向目标特征?
2. **组合推理**:哪几个特征必须结合在一起,才能指向目标特征?(缺一不可才算组合)
如果能独立推出则无需组合。
# 筛选原则
1. 实质推形式,而不是形式推实质
2. 因推果而不是果推因
3. 目的推理手段而不是手段推理目的
4. 只有当 A 是 B 的充分必要条件的时候,A 可以推理出 B
**本次分析的目标特征是:{target['特征名称']}({target['特征类型']})**
# 输入数据
{candidates_section}
# 输出格式
请严格按照以下 JSON 结构输出:
```json
{{
"目标特征": "{target['特征名称']}",
"预备分析列表": {{
"单独推理": [
{{
"来源特征": "特征A",
"来源特征类型": "灵感点/目的点/关键点",
"初步理由": "简要说明为什么这个特征可能推导出目标"
}}
],
"组合推理": [
{{
"组合成员": ["特征B", "特征C"],
"成员类型": ["目的点", "关键点"],
"初步理由": "简要说明为什么这些特征需要组合才能推导出目标"
}}
]
}}
}}
```
注意:
- 单独推理的来源特征必须是输入数据中的原话
- 组合推理的成员数量通常为 2-3 个
- 如果某个特征完全无法推导出目标,不要勉强添加
'''.strip()
def build_evaluate_prompt(input_data: Dict, filter_result: Dict) -> str:
    """Build the Step 2 (evaluate) prompt."""
    target = input_data["目标特征"]
    prep_list = filter_result.get("预备分析列表", {})

    # Render the single-inference list
    single_items = prep_list.get("单独推理", [])
    single_text = ""
    if single_items:
        for item in single_items:
            single_text += f"- {item.get('来源特征', '')}({item.get('来源特征类型', '')})\n"
    else:
        single_text = "(无)\n"

    # Render the combination-inference list
    combo_items = prep_list.get("组合推理", [])
    combo_text = ""
    if combo_items:
        for item in combo_items:
            members = " + ".join(item.get("组合成员", []))
            combo_text += f"- {members}\n"
    else:
        combo_text = "(无)\n"

    return f'''# 背景
推理一个小红书帖子选题前的点,在创作者脑中的因果顺序
# Task
请判断以下筛选出的特征推理出【{target['特征名称']}】的可能性
## 待评估的单独推理特征:
{single_text}
## 待评估的组合推理特征:
{combo_text}
# 推理约束
1. 实质推形式,而不是形式推实质
2. 因推果而不是果推因
3. 目的推理手段而不是手段推理目的
4. 只有当 A 是 B 的充分必要条件的时候,A 可以推理出 B
# 评分标准
| 分数范围 | 等级 | 说明 |
|---------|------|------|
| 0.80 - 1.00 | 逻辑必然 | A 是 B 的充分必要条件,必然推导 |
| 0.50 - 0.79 | 高可能性 | A 高度倾向于推导出 B,但非唯一选择 |
| 0.20 - 0.49 | 创意偏好 | A 可以推导出 B,但其他选择同样可行 |
| 0.00 - 0.19 | 弱关联 | A 与 B 关联性很弱,不建议保留 |
# 输出格式
请严格按照以下 JSON 结构输出:
```json
{{
"目标关键特征": "{target['特征名称']}",
"推理分析": {{
"单独推理": [
{{
"来源特征": "特征A",
"来源特征类型": "灵感点/目的点/关键点",
"可能性": 0.xx,
"结论": "详细说明推导逻辑..."
}}
],
"组合推理": [
{{
"组合成员": ["特征B", "特征C"],
"成员类型": ["目的点", "关键点"],
"可能性": 0.xx,
"结论": "详细说明组合推导逻辑..."
}}
]
}}
}}
```
注意:
- 如果某个特征经评估后可能性低于 0.2,可以标注但建议说明原因
- 结论要清晰说明推导逻辑,避免空洞表述
'''.strip()
# ===== Main analysis =====
async def analyze_node_origin(
    post_id: str = None,
    target_name: str = None,
    config: PathConfig = None,
) -> Dict:
    """Analyze which candidate nodes could derive the target node (two-step)."""
    if config is None:
        config = PathConfig()

    # Locate post graph files
    post_graph_files = get_post_graph_files(config)
    if not post_graph_files:
        raise ValueError("No post graph files found")

    # Select the post
    if post_id:
        target_file = next(
            (f for f in post_graph_files if post_id in f.name),
            None,
        )
        if not target_file:
            raise ValueError(f"Post not found: {post_id}")
    else:
        target_file = post_graph_files[0]

    # Load the post graph
    post_graph = load_post_graph(target_file)
    actual_post_id = post_graph.get("meta", {}).get("postId", "unknown")

    # Prepare the input data
    input_data = prepare_analyze_input(post_graph, target_name)
    actual_target_name = input_data["目标特征"]["特征名称"]
    print(f"Post ID: {actual_post_id}")
    print(f"Target feature: {actual_target_name}")
    print(f"Candidate features: {len(input_data['候选特征'])}")

    # ===== Step 1: filter =====
    filter_prompt = build_filter_prompt(input_data)
    with custom_span(
        name=f"Step1 筛选 - {actual_target_name}",
        data={"目标特征": actual_target_name},
    ):
        filter_result_raw = await Runner.run(filter_agent, input=filter_prompt)
        filter_output = filter_result_raw.final_output

    # Parse the filter result
    try:
        filter_result = parse_json_output(filter_output)
    except Exception as e:
        return {
            "帖子id": actual_post_id,
            "目标节点": actual_target_name,
            "模型": MODEL_NAME,
            "输入": input_data,
            "输出": None,
            "错误": f"筛选步骤解析失败: {str(e)}",
            "原始输出_筛选": filter_output,
        }

    # Check whether there is anything to evaluate
    prep_list = filter_result.get("预备分析列表", {})
    single_count = len(prep_list.get("单独推理", []))
    combo_count = len(prep_list.get("组合推理", []))
    if single_count == 0 and combo_count == 0:
        return {
            "帖子id": actual_post_id,
            "目标节点": actual_target_name,
            "模型": MODEL_NAME,
            "输入": input_data,
            "筛选结果": filter_result,
            "输出": {
                "目标关键特征": actual_target_name,
                "推理分析": {
                    "单独推理": [],
                    "组合推理": [],
                },
            },
            "说明": "筛选步骤未找到可推导的特征",
        }
    print(f"  Filter result: {single_count} single, {combo_count} combination")

    # ===== Step 2: evaluate =====
    evaluate_prompt = build_evaluate_prompt(input_data, filter_result)
    with custom_span(
        name=f"Step2 评估 - {actual_target_name}",
        data={"单独推理数": single_count, "组合推理数": combo_count},
    ):
        evaluate_result_raw = await Runner.run(evaluate_agent, input=evaluate_prompt)
        evaluate_output = evaluate_result_raw.final_output

    # Parse the evaluation result
    try:
        evaluate_result = parse_json_output(evaluate_output)
    except Exception as e:
        return {
            "帖子id": actual_post_id,
            "目标节点": actual_target_name,
            "模型": MODEL_NAME,
            "输入": input_data,
            "筛选结果": filter_result,
            "输出": None,
            "错误": f"评估步骤解析失败: {str(e)}",
            "原始输出_评估": evaluate_output,
        }

    return {
        "帖子id": actual_post_id,
        "目标节点": actual_target_name,
        "模型": MODEL_NAME,
        "输入": input_data,
        "筛选结果": filter_result,
        "输出": evaluate_result,
    }
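
# Minimal standalone usage sketch (post id and target name are hypothetical;
# in normal operation process_single_post below drives this concurrently):
#
#   result = asyncio.run(analyze_node_origin(post_id="1234", target_name="某特征"))
#   if result.get("输出"):
#       print(result["输出"]["推理分析"])
#   else:
#       print(result.get("错误") or result.get("说明"))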
def parse_json_output(output: str) -> Dict:
    """Parse JSON from model output, tolerating a ```json fence or bare braces."""
    if "```json" in output:
        # Extract the body of the fenced block
        json_start = output.find("```json") + 7
        json_end = output.find("```", json_start)
        json_str = output[json_start:json_end].strip()
    elif "{" in output and "}" in output:
        # Fall back to the outermost brace pair
        json_start = output.find("{")
        json_end = output.rfind("}") + 1
        json_str = output[json_start:json_end]
    else:
        json_str = output
    return json.loads(json_str)
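
# Examples (hypothetical model replies):
#   parse_json_output('结果如下:\n```json\n{"目标特征": "A"}\n```')  -> {"目标特征": "A"}
#   parse_json_output('前言 {"可能性": 0.5} 后记')                   -> {"可能性": 0.5}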
# ===== Graph construction =====
def build_origin_graph(all_results: List[Dict], post_id: str) -> Dict:
    """Convert analysis results into graph format."""
    nodes = {}
    edges = {}
    # Map feature name -> node ID (corrects mismatched type names returned by the LLM)
    name_to_node_id = {}

    for result in all_results:
        target_input = result.get("输入", {})

        # Add the target node
        target_info = target_input.get("目标特征", {})
        target_name = target_info.get("特征名称", "")
        target_type = target_info.get("特征类型", "关键点")
        node_id = f"帖子:{target_type}:标签:{target_name}"
        if node_id not in nodes:
            nodes[node_id] = {
                "name": target_name,
                "type": "标签",
                "dimension": target_type,
                "domain": "帖子",
                "detail": {},
            }
        name_to_node_id[target_name] = node_id

        # Add candidate feature nodes
        for candidate in target_input.get("候选特征", []):
            c_name = candidate.get("特征名称", "")
            c_type = candidate.get("特征类型", "关键点")
            c_node_id = f"帖子:{c_type}:标签:{c_name}"
            if c_node_id not in nodes:
                nodes[c_node_id] = {
                    "name": c_name,
                    "type": "标签",
                    "dimension": c_type,
                    "domain": "帖子",
                    "detail": {},
                }
            name_to_node_id[c_name] = c_node_id

    # Build derivation edges
    for result in all_results:
        target_name = result.get("目标特征", "")
        # Resolve the correct node ID via the mapping
        target_node_id = name_to_node_id.get(target_name)
        if not target_node_id:
            continue

        # In V4 the reasoning analysis sits at the top level, not under 输出
        reasoning = result.get("推理分析", {})

        # Edges from single inference
        for item in reasoning.get("单独推理", []):
            source_name = item.get("来源特征", "")
            # Resolve via the mapping (not the type name the LLM returned)
            source_node_id = name_to_node_id.get(source_name)
            if not source_node_id:
                continue
            probability = item.get("可能性", 0)
            edge_id = f"{source_node_id}|推导|{target_node_id}"
            edges[edge_id] = {
                "source": source_node_id,
                "target": target_node_id,
                "type": "推导",
                "score": probability,
                "detail": {
                    "推理类型": "单独推理",
                    "结论": item.get("结论", ""),
                },
            }

        # Edges from combination inference
        for item in reasoning.get("组合推理", []):
            members = item.get("组合成员", [])
            probability = item.get("可能性", 0)

            # Verify that every member exists in the mapping
            member_node_ids = []
            valid = True
            for m in members:
                m_node_id = name_to_node_id.get(m)
                if not m_node_id:
                    valid = False
                    break
                member_node_ids.append((m, m_node_id))
            if not valid:
                continue

            # Sort members by name for a stable combo identity
            sorted_member_ids = sorted(member_node_ids, key=lambda x: x[0])

            # Extract each member's actual dimension from its node ID
            # (帖子:灵感点:标签:xxx -> 灵感点)
            combo_parts = []
            for m_name, m_node_id in sorted_member_ids:
                parts = m_node_id.split(":")
                m_dimension = parts[1] if len(parts) > 1 else "关键点"
                combo_parts.append(f"{m_dimension}:{m_name}")
            combo_name = " + ".join(combo_parts)
            combo_node_id = f"帖子:组合:组合:{combo_name}"
            if combo_node_id not in nodes:
                nodes[combo_node_id] = {
                    "name": combo_name,
                    "type": "组合",
                    "dimension": "组合",
                    "domain": "帖子",
                    "detail": {
                        "成员": [m for m, _ in sorted_member_ids],
                        "成员类型": [m_node_id.split(":")[1] for _, m_node_id in sorted_member_ids],
                    },
                }

            # Edge from the combo node to the target
            edge_id = f"{combo_node_id}|推导|{target_node_id}"
            edges[edge_id] = {
                "source": combo_node_id,
                "target": target_node_id,
                "type": "推导",
                "score": probability,
                "detail": {
                    "推理类型": "组合推理",
                    "结论": item.get("结论", ""),
                },
            }

            # Edges from each member to the combo node
            for m_name, m_node_id in sorted_member_ids:
                m_edge_id = f"{m_node_id}|组成|{combo_node_id}"
                if m_edge_id not in edges:
                    edges[m_edge_id] = {
                        "source": m_node_id,
                        "target": combo_node_id,
                        "type": "组成",
                        "score": 1.0,
                        "detail": {},
                    }

    return {
        "meta": {
            "postId": post_id,
            "type": "推导图谱",
            "version": "v4",
            "stats": {
                "nodeCount": len(nodes),
                "edgeCount": len(edges),
            },
        },
        "nodes": nodes,
        "edges": edges,
    }
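
# Shape of the returned graph (illustrative names, following the ID formats
# built above):
#
#   nodes["帖子:灵感点:标签:特征B"]                               tag node
#   nodes["帖子:组合:组合:灵感点:特征B + 目的点:特征C"]           combo node
#   edges["帖子:灵感点:标签:特征B|推导|帖子:关键点:标签:特征A"]
#       == {"type": "推导", "score": <可能性>, ...}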
# ===== Helpers =====
def get_all_target_names(post_graph: Dict, dimensions: List[str] = None) -> List[str]:
    """Return the names of all features eligible as analysis targets."""
    if dimensions is None:
        dimensions = ["灵感点", "目的点", "关键点"]
    tags = extract_tags_from_post_graph(post_graph)
    return [t["name"] for t in tags if t["dimension"] in dimensions]


def get_score_level(score: float) -> str:
    """Map a score to its level label (mirrors the rubric in the evaluate prompt)."""
    if score >= 0.80:
        return "逻辑必然"
    elif score >= 0.50:
        return "高可能性"
    elif score >= 0.20:
        return "创意偏好"
    else:
        return "弱关联"


def display_result(result: Dict):
    """Print a single analysis result."""
    output = result.get("输出")
    if output:
        print(f"\nTarget key feature: {output.get('目标关键特征', 'N/A')}")
        reasoning = output.get("推理分析", {})

        # Show single inference (top 5)
        single = reasoning.get("单独推理", [])
        if single:
            print("  【单独推理】")
            for item in single[:5]:
                score = item.get("可能性", 0)
                level = get_score_level(score)
                print(f"    [{score:.2f} {level}] {item.get('来源特征', '')}")

        # Show combination inference (top 3)
        combo = reasoning.get("组合推理", [])
        if combo:
            print("  【组合推理】")
            for item in combo[:3]:
                members = " + ".join(item.get("组合成员", []))
                score = item.get("可能性", 0)
                level = get_score_level(score)
                print(f"    [{score:.2f} {level}] {members}")
    else:
        error = result.get("错误", "")
        if error:
            print(f"  Analysis failed: {error}")
        else:
            print(f"  {result.get('说明', 'No result')}")
# ===== Single-post processing =====
async def process_single_post(
    post_file: Path,
    config: PathConfig,
    target_name: str = None,
    num_targets: int = 999,
    dimensions: List[str] = None,
):
    """Process a single post."""
    if dimensions is None:
        dimensions = ["灵感点", "目的点", "关键点"]

    # Start an independent trace per post
    current_time, log_url = set_trace()

    # Load the post graph
    post_graph = load_post_graph(post_file)
    actual_post_id = post_graph.get("meta", {}).get("postId", "unknown")
    print(f"\n{'=' * 60}")
    print(f"Post ID: {actual_post_id}")
    print(f"Trace URL: {log_url}")

    # Determine the list of target features to analyze
    if target_name:
        target_names = [target_name]
    else:
        all_targets = get_all_target_names(post_graph, dimensions)
        target_names = all_targets[:num_targets]
    print(f"Target features to analyze: {target_names}")
    print("-" * 60)

    # Output directory
    output_dir = config.intermediate_dir / "node_origin_analysis"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Wrap this post's analysis in one trace context
    with trace(f"节点来源分析 V4 - {actual_post_id}"):
        # Analyze all target features concurrently
        async def analyze_single(name: str, index: int):
            print(f"\n[{index}/{len(target_names)}] Analyzing: {name}")
            result = await analyze_node_origin(
                post_id=actual_post_id,
                target_name=name,
                config=config,
            )
            print(f"[{index}/{len(target_names)}] Done: {name}")
            display_result(result)
            output = result.get("输出", {})
            return {
                "目标特征": result.get("目标节点"),
                "筛选结果": result.get("筛选结果"),
                "推理分析": output.get("推理分析", {}) if output else {},
                "输入": result.get("输入"),
                "错误": result.get("错误"),
                "说明": result.get("说明"),
            }

        # Create the concurrent tasks
        tasks = [
            analyze_single(name, i)
            for i, name in enumerate(target_names, 1)
        ]
        # Run them concurrently
        all_results = await asyncio.gather(*tasks)

    # Merge everything into a single output file
    merged_output = {
        "元数据": {
            "current_time": current_time,
            "log_url": log_url,
            "model": MODEL_NAME,
            "version": "v4",
        },
        "帖子id": actual_post_id,
        "分析结果列表": all_results,
    }
    output_file = output_dir / f"{actual_post_id}_来源分析_v4.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged_output, f, ensure_ascii=False, indent=2)

    # Build the derivation graph
    graph_output = build_origin_graph(all_results, actual_post_id)
    graph_file = output_dir / f"{actual_post_id}_推导图谱_v4.json"
    with open(graph_file, "w", encoding="utf-8") as f:
        json.dump(graph_output, f, ensure_ascii=False, indent=2)

    print(f"\nDone! Analyzed {len(target_names)} target features")
    print(f"Analysis results: {output_file}")
    print(f"Derivation graph: {graph_file}")
    print(f"Trace: {log_url}")
    return actual_post_id
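
# Files written per post, under <intermediate_dir>/node_origin_analysis/:
#   {postId}_来源分析_v4.json   merged analysis results (元数据 + 分析结果列表)
#   {postId}_推导图谱_v4.json   derivation graph built by build_origin_graph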
# ===== Main =====
async def main(
    post_id: str = None,
    target_name: str = None,
    num_targets: int = 999,
    dimensions: List[str] = None,
    all_posts: bool = False,
):
    """Entry point."""
    if dimensions is None:
        dimensions = ["灵感点", "目的点", "关键点"]
    config = PathConfig()
    print(f"Account: {config.account_name}")
    print(f"Model: {MODEL_NAME}")
    print(f"Dimensions: {dimensions}")
    print("Version: V4 (two-step: filter + evaluate)")

    # Locate post graph files
    post_graph_files = get_post_graph_files(config)
    if not post_graph_files:
        print("Error: no post graph files found")
        return

    # Determine which posts to process
    if post_id:
        target_file = next(
            (f for f in post_graph_files if post_id in f.name),
            None,
        )
        if not target_file:
            print(f"Error: post not found: {post_id}")
            return
        files_to_process = [target_file]
    elif all_posts:
        files_to_process = post_graph_files
    else:
        files_to_process = [post_graph_files[0]]
    print(f"Posts to process: {len(files_to_process)}")

    # Process posts one by one
    processed_posts = []
    for i, post_file in enumerate(files_to_process, 1):
        print(f"\n{'#' * 60}")
        print(f"# Processing post {i}/{len(files_to_process)}")
        print(f"{'#' * 60}")
        post_id_result = await process_single_post(
            post_file=post_file,
            config=config,
            target_name=target_name,
            num_targets=num_targets,
            dimensions=dimensions,
        )
        processed_posts.append(post_id_result)

    print(f"\n{'#' * 60}")
    print(f"# All done! Processed {len(processed_posts)} posts")
    print(f"{'#' * 60}")
def rebuild_graph_from_file(analysis_file: Path) -> None:
    """Rebuild the graph from an existing analysis result file."""
    with open(analysis_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    post_id = data.get("帖子id", "unknown")
    all_results = data.get("分析结果列表", [])
    print(f"Rebuilding graph from analysis file: {analysis_file.name}")
    print(f"Post ID: {post_id}")
    print(f"Result count: {len(all_results)}")

    # Build the graph
    graph_output = build_origin_graph(all_results, post_id)

    # Save it next to the analysis file
    graph_file = analysis_file.parent / f"{post_id}_推导图谱_v4.json"
    with open(graph_file, "w", encoding="utf-8") as f:
        json.dump(graph_output, f, ensure_ascii=False, indent=2)
    print(f"Graph saved: {graph_file}")
    print(f"Nodes: {graph_output['meta']['stats']['nodeCount']}")
    print(f"Edges: {graph_output['meta']['stats']['edgeCount']}")
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Analyze node origins (V4 two-step)")
    parser.add_argument("--post-id", type=str, help="post ID (process only this post)")
    parser.add_argument("--target", type=str, help="target node name (analyze only this feature)")
    parser.add_argument("--num", type=int, default=999, help="number of target features to analyze")
    parser.add_argument("--dims", type=str, nargs="+",
                        choices=["灵感点", "目的点", "关键点"],
                        help="dimensions to analyze (default: all)")
    parser.add_argument("--all-posts", action="store_true", help="process all posts")
    parser.add_argument("--rebuild-graph", type=str, metavar="FILE",
                        help="rebuild the graph from an existing analysis file (no re-analysis)")
    args = parser.parse_args()

    # With --rebuild-graph, only rebuild the graph
    if args.rebuild_graph:
        rebuild_graph_from_file(Path(args.rebuild_graph))
    else:
        # Determine dimensions (default: all)
        dimensions = args.dims if args.dims else ["灵感点", "目的点", "关键点"]
        # Run the main entry point
        asyncio.run(main(
            post_id=args.post_id,
            target_name=args.target,
            num_targets=args.num,
            dimensions=dimensions,
            all_posts=args.all_posts,
        ))
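
# Example invocations (IDs and paths are illustrative):
#   python analyze_node_origin_v4.py                          # first post, all dimensions
#   python analyze_node_origin_v4.py --post-id 1234 --dims 关键点
#   python analyze_node_origin_v4.py --all-posts --num 5
#   python analyze_node_origin_v4.py --rebuild-graph 1234_来源分析_v4.json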