build_persona_graph.py 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 构建人设图谱
  5. ================================================================================
  6. 输入文件:
  7. ================================================================================
  8. 1. pattern聚合结果.json - 分类节点、标签节点、属于/包含边
  9. 2. dimension_associations_analysis.json - 分类共现边(跨点)
  10. 3. intra_dimension_associations_analysis.json - 分类共现边(点内)
  11. 4. 历史帖子解构目录/*.json - 标签共现边
  12. ================================================================================
  13. 输出文件: 人设图谱.json
  14. ================================================================================
  15. {
  16. "meta": { # 元信息
  17. "description": "...",
  18. "account": "账号名",
  19. "createdAt": "时间戳",
  20. "stats": { ... } # 统计信息
  21. },
  22. "nodes": { # 节点字典 (nodeId -> nodeData)
  23. "{domain}:{dimension}:{type}:{name}": {
  24. "name": "显示名称",
  25. "type": "人设|灵感点|目的点|关键点|分类|标签",
  26. "domain": "人设",
  27. "dimension": "人设|灵感点|目的点|关键点",
  28. "detail": { ... }
  29. }
  30. },
  31. "edges": { # 边字典 (edgeId -> edgeData)
  32. "{source}|{type}|{target}": {
  33. "source": "源节点ID",
  34. "target": "目标节点ID",
  35. "type": "属于|包含|标签共现|分类共现",
  36. "score": 0.5,
  37. "detail": { ... }
  38. }
  39. },
  40. "index": { # 游走索引
  41. "outEdges": { nodeId: { edgeType: [{ target, score }] } },
  42. "inEdges": { nodeId: { edgeType: [{ source, score }] } }
  43. },
  44. "tree": { ... } # 嵌套树结构(从根节点沿"包含"边构建)
  45. }
  46. ================================================================================
  47. 核心逻辑:
  48. ================================================================================
  49. 1. 提取节点
  50. - 从 pattern 提取分类节点(按维度分组的层级分类)
  51. - 从 pattern 提取标签节点(具体特征标签)
  52. - 添加根节点(人设)和维度节点(灵感点/目的点/关键点)
  53. 2. 提取边
  54. - 属于/包含边:根据节点的 parentPath 构建层级关系
  55. - 分类共现边(跨点):从关联分析结果提取
  56. - 分类共现边(点内):从点内关联分析提取
  57. - 标签共现边:遍历历史帖子,统计标签同现
  58. 3. 构建索引
  59. - outEdges: 从该节点出发能到达的节点
  60. - inEdges: 能到达该节点的源节点
  61. 4. 构建树
  62. - 从根节点开始,沿"包含"边递归构建嵌套树结构
  63. ================================================================================
  64. 节点ID格式: {domain}:{dimension}:{type}:{name}
  65. ================================================================================
  66. - 根节点: 人设:人设:人设:人设
  67. - 维度节点: 人设:灵感点:灵感点:灵感点
  68. - 分类节点: 人设:灵感点:分类:视觉呈现
  69. - 标签节点: 人设:灵感点:标签:手绘风格
  70. ================================================================================
  71. 边类型:
  72. ================================================================================
  73. - 属于: 子节点 -> 父节点(层级关系)
  74. - 包含: 父节点 -> 子节点(层级关系)
  75. - 标签共现: 标签 <-> 标签(同一帖子出现)
  76. - 分类共现: 分类 <-> 分类(跨维度共现)
  77. - 分类共现: 分类 <-> 分类(点内组合共现)
  78. ================================================================================
  79. 图游走函数:
  80. ================================================================================
  81. 1. walk_graph(index, start_node, edge_types, direction, min_score)
  82. - 从起始节点出发,按边类型序列游走N步
  83. - 示例: walk_graph(index, "人设:灵感点:标签:手绘风格", ["属于", "分类共现"])
  84. - 返回: 到达的节点ID集合
  85. 2. get_neighbors(index, node_id, edge_type, direction, min_score)
  86. - 获取节点的邻居
  87. - 示例: get_neighbors(index, "人设:灵感点:分类:视觉呈现", "包含")
  88. - 返回: 邻居列表 [{"target": "...", "score": 0.5}, ...]
  89. ================================================================================
  90. """
  91. import json
  92. from pathlib import Path
  93. from typing import Dict, List, Set, Any
  94. from datetime import datetime
  95. import sys
  96. # 添加项目根目录到路径
  97. project_root = Path(__file__).parent.parent.parent
  98. sys.path.insert(0, str(project_root))
  99. from script.data_processing.path_config import PathConfig
  100. # ==================== 节点和边构建工具 ====================
  101. def build_node_id(domain: str, dimension: str, node_type: str, name: str) -> str:
  102. """构建节点ID"""
  103. return f"{domain}:{dimension}:{node_type}:{name}"
  104. def build_edge_id(source: str, edge_type: str, target: str) -> str:
  105. """构建边ID"""
  106. return f"{source}|{edge_type}|{target}"
  107. def create_node(
  108. domain: str,
  109. dimension: str,
  110. node_type: str,
  111. name: str,
  112. detail: Dict = None
  113. ) -> Dict:
  114. """创建节点"""
  115. return {
  116. "name": name,
  117. "type": node_type,
  118. "dimension": dimension,
  119. "domain": domain,
  120. "detail": detail or {}
  121. }
  122. def create_edge(
  123. source: str,
  124. target: str,
  125. edge_type: str,
  126. score: float = None,
  127. detail: Dict = None
  128. ) -> Dict:
  129. """创建边"""
  130. return {
  131. "source": source,
  132. "target": target,
  133. "type": edge_type,
  134. "score": score,
  135. "detail": detail or {}
  136. }
  137. # ==================== 从 pattern 提取分类节点 ====================
  138. def extract_category_nodes_from_pattern(
  139. pattern_data: Dict,
  140. dimension_key: str,
  141. dimension_name: str
  142. ) -> Dict[str, Dict]:
  143. """
  144. 从 pattern 聚合结果中提取分类节点
  145. Returns:
  146. { nodeId: nodeData }
  147. """
  148. nodes = {}
  149. if dimension_key not in pattern_data:
  150. return nodes
  151. def collect_sources_recursively(node: Dict) -> List[Dict]:
  152. """递归收集节点及其所有子节点的特征来源"""
  153. sources = []
  154. if "特征列表" in node:
  155. for feature in node["特征列表"]:
  156. source = {
  157. "pointName": feature.get("所属点", ""),
  158. "pointDesc": feature.get("点描述", ""),
  159. "postId": feature.get("帖子id", "")
  160. }
  161. sources.append(source)
  162. for key, value in node.items():
  163. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  164. continue
  165. if isinstance(value, dict):
  166. sources.extend(collect_sources_recursively(value))
  167. return sources
  168. def traverse_node(node: Dict, parent_path: List[str]):
  169. """递归遍历节点"""
  170. for key, value in node.items():
  171. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  172. continue
  173. if isinstance(value, dict):
  174. current_path = parent_path + [key]
  175. # 获取帖子列表
  176. post_ids = value.get("帖子列表", [])
  177. # 构建节点来源
  178. node_sources = []
  179. if "特征列表" in value:
  180. for feature in value["特征列表"]:
  181. source = {
  182. "pointName": feature.get("所属点", ""),
  183. "pointDesc": feature.get("点描述", ""),
  184. "postId": feature.get("帖子id", "")
  185. }
  186. node_sources.append(source)
  187. else:
  188. node_sources = collect_sources_recursively(value)
  189. # 计算帖子数
  190. if post_ids:
  191. post_count = len(post_ids)
  192. else:
  193. post_count = len(set(s.get("postId", "") for s in node_sources if s.get("postId")))
  194. # 构建节点
  195. node_id = build_node_id("人设", dimension_name, "分类", key)
  196. nodes[node_id] = create_node(
  197. domain="人设",
  198. dimension=dimension_name,
  199. node_type="分类",
  200. name=key,
  201. detail={
  202. "parentPath": parent_path.copy(),
  203. "postCount": post_count,
  204. "sources": node_sources
  205. }
  206. )
  207. # 递归处理子节点
  208. traverse_node(value, current_path)
  209. traverse_node(pattern_data[dimension_key], [])
  210. return nodes
  211. # ==================== 从 pattern 提取标签节点 ====================
  212. def extract_tag_nodes_from_pattern(
  213. pattern_data: Dict,
  214. dimension_key: str,
  215. dimension_name: str
  216. ) -> Dict[str, Dict]:
  217. """
  218. 从 pattern 聚合结果中提取标签节点
  219. Returns:
  220. { nodeId: nodeData }
  221. """
  222. nodes = {}
  223. tag_map = {} # 用于合并同名标签: tagId -> { sources, postIds, parentPath }
  224. if dimension_key not in pattern_data:
  225. return nodes
  226. def traverse_node(node: Dict, parent_path: List[str]):
  227. """递归遍历节点"""
  228. # 处理特征列表(标签)
  229. if "特征列表" in node:
  230. for feature in node["特征列表"]:
  231. tag_name = feature.get("特征名称", "")
  232. if not tag_name:
  233. continue
  234. source = {
  235. "pointName": feature.get("所属点", ""),
  236. "pointDesc": feature.get("点描述", ""),
  237. "postId": feature.get("帖子id", "")
  238. }
  239. tag_id = build_node_id("人设", dimension_name, "标签", tag_name)
  240. if tag_id not in tag_map:
  241. tag_map[tag_id] = {
  242. "name": tag_name,
  243. "sources": [],
  244. "postIds": set(),
  245. "parentPath": parent_path.copy()
  246. }
  247. tag_map[tag_id]["sources"].append(source)
  248. if source["postId"]:
  249. tag_map[tag_id]["postIds"].add(source["postId"])
  250. # 递归处理子节点
  251. for key, value in node.items():
  252. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  253. continue
  254. if isinstance(value, dict):
  255. current_path = parent_path + [key]
  256. traverse_node(value, current_path)
  257. traverse_node(pattern_data[dimension_key], [])
  258. # 转换为节点
  259. for tag_id, tag_info in tag_map.items():
  260. nodes[tag_id] = create_node(
  261. domain="人设",
  262. dimension=dimension_name,
  263. node_type="标签",
  264. name=tag_info["name"],
  265. detail={
  266. "parentPath": tag_info["parentPath"],
  267. "postCount": len(tag_info["postIds"]),
  268. "sources": tag_info["sources"]
  269. }
  270. )
  271. return nodes
  272. # ==================== 从 pattern 提取属于/包含边 ====================
  273. def extract_belong_contain_edges(
  274. pattern_data: Dict,
  275. dimension_key: str,
  276. dimension_name: str,
  277. nodes: Dict[str, Dict]
  278. ) -> Dict[str, Dict]:
  279. """
  280. 从 pattern 聚合结果中提取属于/包含边
  281. Returns:
  282. { edgeId: edgeData }
  283. """
  284. edges = {}
  285. if dimension_key not in pattern_data:
  286. return edges
  287. # 构建分类名称到ID的映射
  288. category_name_to_id = {}
  289. for node_id, node_data in nodes.items():
  290. if node_data["type"] == "分类" and node_data["dimension"] == dimension_name:
  291. category_name_to_id[node_data["name"]] = node_id
  292. # 为每个节点创建属于边(子→父)
  293. for node_id, node_data in nodes.items():
  294. if node_data["dimension"] != dimension_name:
  295. continue
  296. parent_path = node_data["detail"].get("parentPath", [])
  297. if not parent_path:
  298. continue
  299. # 取最后一个作为直接父分类
  300. parent_name = parent_path[-1]
  301. parent_id = category_name_to_id.get(parent_name)
  302. if parent_id:
  303. # 属于边:子 → 父
  304. edge_id = build_edge_id(node_id, "属于", parent_id)
  305. edges[edge_id] = create_edge(
  306. source=node_id,
  307. target=parent_id,
  308. edge_type="属于",
  309. score=1.0,
  310. detail={}
  311. )
  312. # 包含边:父 → 子
  313. edge_id_contain = build_edge_id(parent_id, "包含", node_id)
  314. edges[edge_id_contain] = create_edge(
  315. source=parent_id,
  316. target=node_id,
  317. edge_type="包含",
  318. score=1.0,
  319. detail={}
  320. )
  321. return edges
  322. # ==================== 从关联分析提取分类共现边(跨点)====================
  323. def extract_category_cooccur_edges(associations_data: Dict) -> Dict[str, Dict]:
  324. """
  325. 从 dimension_associations_analysis.json 中提取分类共现边(跨点)
  326. Returns:
  327. { edgeId: edgeData }
  328. """
  329. edges = {}
  330. if "单维度关联分析" not in associations_data:
  331. return edges
  332. single_dim = associations_data["单维度关联分析"]
  333. # 维度映射
  334. dimension_map = {
  335. "灵感点维度": "灵感点",
  336. "目的点维度": "目的点",
  337. "关键点维度": "关键点"
  338. }
  339. def get_last_segment(path: str) -> str:
  340. """获取路径的最后一段"""
  341. return path.split("/")[-1]
  342. for dim_key, dim_data in single_dim.items():
  343. if dim_key not in dimension_map:
  344. continue
  345. source_dimension = dimension_map[dim_key]
  346. for direction_key, direction_data in dim_data.items():
  347. if direction_key == "说明" or "→" not in direction_key:
  348. continue
  349. for source_path, source_info in direction_data.items():
  350. source_name = get_last_segment(source_path)
  351. source_node_id = build_node_id("人设", source_dimension, "分类", source_name)
  352. for field_name, associations in source_info.items():
  353. if not field_name.startswith("与") or not field_name.endswith("的关联"):
  354. continue
  355. target_dimension = field_name[1:-3]
  356. if not isinstance(associations, list):
  357. continue
  358. for assoc in associations:
  359. target_path = assoc.get("目标分类", "")
  360. if not target_path:
  361. continue
  362. target_name = get_last_segment(target_path)
  363. target_node_id = build_node_id("人设", target_dimension, "分类", target_name)
  364. # 使用 Jaccard 作为 score
  365. jaccard = assoc.get("Jaccard相似度", 0)
  366. edge_id = build_edge_id(source_node_id, "分类共现", target_node_id)
  367. edges[edge_id] = create_edge(
  368. source=source_node_id,
  369. target=target_node_id,
  370. edge_type="分类共现",
  371. score=jaccard,
  372. detail={
  373. "jaccard": jaccard,
  374. "overlapCoef": assoc.get("重叠系数", 0),
  375. "cooccurCount": assoc.get("共同帖子数", 0),
  376. "cooccurPosts": assoc.get("共同帖子ID", [])
  377. }
  378. )
  379. return edges
  380. # ==================== 从关联分析提取分类共现边(点内)====================
  381. def extract_intra_category_cooccur_edges(intra_data: Dict) -> Dict[str, Dict]:
  382. """
  383. 从 intra_dimension_associations_analysis.json 中提取点内分类共现边
  384. Returns:
  385. { edgeId: edgeData }
  386. """
  387. edges = {}
  388. if "叶子分类组合聚类" not in intra_data:
  389. return edges
  390. clusters_by_dim = intra_data["叶子分类组合聚类"]
  391. for dimension, clusters in clusters_by_dim.items():
  392. if dimension not in ("灵感点", "目的点", "关键点"):
  393. continue
  394. for cluster_key, cluster_data in clusters.items():
  395. leaf_categories = cluster_data.get("叶子分类组合", [])
  396. point_count = cluster_data.get("点数", 0)
  397. point_details = cluster_data.get("点详情列表", [])
  398. # 提取点名称列表
  399. point_names = [p.get("点名称", "") for p in point_details if p.get("点名称")]
  400. # 两两组合生成共现边
  401. for i in range(len(leaf_categories)):
  402. for j in range(i + 1, len(leaf_categories)):
  403. cat1 = leaf_categories[i]
  404. cat2 = leaf_categories[j]
  405. cat1_id = build_node_id("人设", dimension, "分类", cat1)
  406. cat2_id = build_node_id("人设", dimension, "分类", cat2)
  407. # 确保顺序一致(按字典序)
  408. if cat1_id > cat2_id:
  409. cat1_id, cat2_id = cat2_id, cat1_id
  410. edge_id = build_edge_id(cat1_id, "分类共现", cat2_id)
  411. if edge_id in edges:
  412. # 累加
  413. edges[edge_id]["detail"]["pointCount"] += point_count
  414. edges[edge_id]["detail"]["pointNames"].extend(point_names)
  415. else:
  416. edges[edge_id] = create_edge(
  417. source=cat1_id,
  418. target=cat2_id,
  419. edge_type="分类共现",
  420. score=point_count, # 先用点数作为 score,后续可归一化
  421. detail={
  422. "pointCount": point_count,
  423. "pointNames": point_names.copy()
  424. }
  425. )
  426. return edges
  427. # ==================== 从历史帖子提取标签共现边 ====================
  428. def extract_tag_cooccur_edges(historical_posts_dir: Path) -> Dict[str, Dict]:
  429. """
  430. 从历史帖子解构结果中提取标签共现边
  431. Returns:
  432. { edgeId: edgeData }
  433. """
  434. edges = {}
  435. cooccur_map = {} # (tag1_id, tag2_id, dimension) -> { cooccurPosts: set() }
  436. if not historical_posts_dir.exists():
  437. print(f" 警告: 历史帖子目录不存在: {historical_posts_dir}")
  438. return edges
  439. json_files = list(historical_posts_dir.glob("*.json"))
  440. print(f" 找到 {len(json_files)} 个历史帖子文件")
  441. def extract_post_id_from_filename(filename: str) -> str:
  442. """从文件名中提取帖子ID"""
  443. import re
  444. match = re.match(r'^([^_]+)_', filename)
  445. return match.group(1) if match else ""
  446. def extract_tags_from_post(post_data: Dict) -> Dict[str, List[str]]:
  447. """从帖子解构结果中提取所有标签"""
  448. tags_by_dimension = {
  449. "灵感点": [],
  450. "目的点": [],
  451. "关键点": []
  452. }
  453. if "三点解构" not in post_data:
  454. return tags_by_dimension
  455. three_points = post_data["三点解构"]
  456. # 灵感点
  457. if "灵感点" in three_points:
  458. inspiration = three_points["灵感点"]
  459. for section in ["全新内容", "共性差异", "共性内容"]:
  460. if section in inspiration and isinstance(inspiration[section], list):
  461. for item in inspiration[section]:
  462. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  463. for feature in item["提取的特征"]:
  464. tag_name = feature.get("特征名称", "")
  465. if tag_name:
  466. tags_by_dimension["灵感点"].append(tag_name)
  467. # 目的点
  468. if "目的点" in three_points:
  469. purpose = three_points["目的点"]
  470. if "purposes" in purpose and isinstance(purpose["purposes"], list):
  471. for item in purpose["purposes"]:
  472. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  473. for feature in item["提取的特征"]:
  474. tag_name = feature.get("特征名称", "")
  475. if tag_name:
  476. tags_by_dimension["目的点"].append(tag_name)
  477. # 关键点
  478. if "关键点" in three_points:
  479. key_points = three_points["关键点"]
  480. if "key_points" in key_points and isinstance(key_points["key_points"], list):
  481. for item in key_points["key_points"]:
  482. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  483. for feature in item["提取的特征"]:
  484. tag_name = feature.get("特征名称", "")
  485. if tag_name:
  486. tags_by_dimension["关键点"].append(tag_name)
  487. return tags_by_dimension
  488. # 遍历所有帖子文件
  489. for file_path in json_files:
  490. post_id = extract_post_id_from_filename(file_path.name)
  491. if not post_id:
  492. continue
  493. try:
  494. with open(file_path, "r", encoding="utf-8") as f:
  495. post_data = json.load(f)
  496. tags_by_dimension = extract_tags_from_post(post_data)
  497. # 对每个维度内的标签两两组合
  498. for dimension, tags in tags_by_dimension.items():
  499. unique_tags = list(set(tags))
  500. for i in range(len(unique_tags)):
  501. for j in range(i + 1, len(unique_tags)):
  502. tag1 = unique_tags[i]
  503. tag2 = unique_tags[j]
  504. tag1_id = build_node_id("人设", dimension, "标签", tag1)
  505. tag2_id = build_node_id("人设", dimension, "标签", tag2)
  506. # 确保顺序一致
  507. if tag1_id > tag2_id:
  508. tag1_id, tag2_id = tag2_id, tag1_id
  509. key = (tag1_id, tag2_id)
  510. if key not in cooccur_map:
  511. cooccur_map[key] = {"cooccurPosts": set()}
  512. cooccur_map[key]["cooccurPosts"].add(post_id)
  513. except Exception as e:
  514. print(f" 警告: 处理文件 {file_path.name} 时出错: {e}")
  515. # 转换为边
  516. for (tag1_id, tag2_id), info in cooccur_map.items():
  517. cooccur_posts = list(info["cooccurPosts"])
  518. cooccur_count = len(cooccur_posts)
  519. edge_id = build_edge_id(tag1_id, "标签共现", tag2_id)
  520. edges[edge_id] = create_edge(
  521. source=tag1_id,
  522. target=tag2_id,
  523. edge_type="标签共现",
  524. score=cooccur_count, # 先用共现次数,后续可归一化
  525. detail={
  526. "cooccurCount": cooccur_count,
  527. "cooccurPosts": cooccur_posts
  528. }
  529. )
  530. return edges
  531. # ==================== 构建嵌套树结构 ====================
  532. def build_nested_tree(nodes: Dict[str, Dict], edges: Dict[str, Dict]) -> Dict:
  533. """
  534. 从根节点开始,沿"包含"边递归构建嵌套树结构
  535. 包含边:父节点 -> 子节点
  536. 从根节点开始,递归找所有包含的子节点
  537. Returns:
  538. 嵌套的树结构
  539. """
  540. # 从"包含"边构建 父节点 -> [子节点] 的映射
  541. parent_to_children = {} # parent_id -> [child_id, ...]
  542. for edge_id, edge_data in edges.items():
  543. if edge_data["type"] == "包含":
  544. parent_id = edge_data["source"]
  545. child_id = edge_data["target"]
  546. if parent_id not in parent_to_children:
  547. parent_to_children[parent_id] = []
  548. parent_to_children[parent_id].append(child_id)
  549. # 递归构建子树
  550. def build_subtree(node_id: str) -> Dict:
  551. node_data = nodes[node_id]
  552. subtree = {
  553. "id": node_id,
  554. "name": node_data["name"],
  555. "type": node_data["type"],
  556. "domain": node_data["domain"],
  557. "dimension": node_data["dimension"],
  558. "detail": node_data.get("detail", {}),
  559. "children": []
  560. }
  561. # 获取子节点
  562. child_ids = parent_to_children.get(node_id, [])
  563. for child_id in child_ids:
  564. if child_id in nodes:
  565. subtree["children"].append(build_subtree(child_id))
  566. return subtree
  567. # 从根节点开始构建
  568. root_id = "人设:人设:人设:人设"
  569. return build_subtree(root_id)
  570. # ==================== 图游走工具 ====================
  571. def walk_graph(
  572. index: Dict,
  573. start_node: str,
  574. edge_types: List[str],
  575. direction: str = "out",
  576. min_score: float = None
  577. ) -> Set[str]:
  578. """
  579. 从起始节点出发,按指定边类型序列游走N步
  580. Args:
  581. index: 游走索引 {"outEdges": {...}, "inEdges": {...}}
  582. start_node: 起始节点ID
  583. edge_types: 边类型序列,如 ["属于", "分类共现"]
  584. direction: 游走方向 "out"(沿出边) / "in"(沿入边)
  585. min_score: 最小分数过滤
  586. Returns:
  587. 到达的节点ID集合
  588. Example:
  589. # 从标签出发,沿"属于"边走1步,再沿"分类共现"边走1步
  590. result = walk_graph(
  591. index,
  592. "人设:灵感点:标签:手绘风格",
  593. ["属于", "分类共现"]
  594. )
  595. """
  596. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  597. target_key = "target" if direction == "out" else "source"
  598. current_nodes = {start_node}
  599. for edge_type in edge_types:
  600. next_nodes = set()
  601. for node in current_nodes:
  602. neighbors = edge_index.get(node, {}).get(edge_type, [])
  603. for neighbor in neighbors:
  604. # 分数过滤
  605. if min_score is not None and neighbor.get("score", 0) < min_score:
  606. continue
  607. next_nodes.add(neighbor[target_key])
  608. current_nodes = next_nodes
  609. if not current_nodes:
  610. break
  611. return current_nodes
  612. def get_neighbors(
  613. index: Dict,
  614. node_id: str,
  615. edge_type: str = None,
  616. direction: str = "out",
  617. min_score: float = None
  618. ) -> List[Dict]:
  619. """
  620. 获取节点的邻居
  621. Args:
  622. index: 游走索引
  623. node_id: 节点ID
  624. edge_type: 边类型(可选,不指定则返回所有类型)
  625. direction: 方向 "out" / "in"
  626. min_score: 最小分数过滤
  627. Returns:
  628. 邻居列表 [{"target": "...", "score": 0.5}, ...]
  629. """
  630. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  631. node_edges = edge_index.get(node_id, {})
  632. if edge_type:
  633. neighbors = node_edges.get(edge_type, [])
  634. else:
  635. neighbors = []
  636. for edges in node_edges.values():
  637. neighbors.extend(edges)
  638. if min_score is not None:
  639. neighbors = [n for n in neighbors if n.get("score", 0) >= min_score]
  640. return neighbors
  641. # ==================== 构建索引 ====================
  642. def build_index(edges: Dict[str, Dict]) -> Dict:
  643. """
  644. 构建游走索引
  645. Returns:
  646. {
  647. "outEdges": { nodeId: { edgeType: [{ target, score }] } },
  648. "inEdges": { nodeId: { edgeType: [{ source, score }] } }
  649. }
  650. """
  651. out_edges = {}
  652. in_edges = {}
  653. for edge_id, edge_data in edges.items():
  654. source = edge_data["source"]
  655. target = edge_data["target"]
  656. edge_type = edge_data["type"]
  657. score = edge_data["score"]
  658. # outEdges
  659. if source not in out_edges:
  660. out_edges[source] = {}
  661. if edge_type not in out_edges[source]:
  662. out_edges[source][edge_type] = []
  663. out_edges[source][edge_type].append({
  664. "target": target,
  665. "score": score
  666. })
  667. # inEdges
  668. if target not in in_edges:
  669. in_edges[target] = {}
  670. if edge_type not in in_edges[target]:
  671. in_edges[target][edge_type] = []
  672. in_edges[target][edge_type].append({
  673. "source": source,
  674. "score": score
  675. })
  676. return {
  677. "outEdges": out_edges,
  678. "inEdges": in_edges
  679. }
  680. # ==================== 主函数 ====================
def main():
    """Build the persona graph and write it to ``人设图谱.json``.

    Steps, in order: resolve paths through ``PathConfig``, load the three
    input JSON files, extract category and tag nodes, extract the edge sets
    (属于/包含, cross-dimension and intra-dimension 分类共现, 标签共现), add
    the root and dimension nodes with their hierarchy edges, build the walk
    index and the nested tree, then dump everything as one JSON document.
    Progress is reported with ``print``.
    """
    config = PathConfig()
    config.ensure_dirs()
    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()
    # Input file paths
    pattern_file = config.pattern_cluster_file
    associations_file = config.account_dir / "pattern相关文件/optimization/dimension_associations_analysis.json"
    intra_associations_file = config.account_dir / "pattern相关文件/optimization/intra_dimension_associations_analysis.json"
    historical_posts_dir = config.historical_posts_dir
    # Output file path
    output_file = config.intermediate_dir / "人设图谱.json"
    print("输入文件:")
    print(f" pattern聚合文件: {pattern_file}")
    print(f" 跨点关联分析文件: {associations_file}")
    print(f" 点内关联分析文件: {intra_associations_file}")
    print(f" 历史帖子目录: {historical_posts_dir}")
    print(f"\n输出文件: {output_file}")
    print()
    # ===== Load data =====
    print("=" * 60)
    print("读取数据...")
    print(" 读取 pattern 聚合结果...")
    with open(pattern_file, "r", encoding="utf-8") as f:
        pattern_data = json.load(f)
    print(" 读取跨点关联分析结果...")
    with open(associations_file, "r", encoding="utf-8") as f:
        associations_data = json.load(f)
    print(" 读取点内关联分析结果...")
    with open(intra_associations_file, "r", encoding="utf-8") as f:
        intra_associations_data = json.load(f)
    # ===== Extract nodes =====
    print("\n" + "=" * 60)
    print("提取节点...")
    all_nodes = {}
    # Key in the pattern data -> dimension name used in node IDs
    dimension_mapping = {
        "灵感点列表": "灵感点",
        "目的点": "目的点",
        "关键点列表": "关键点"
    }
    # Category nodes
    print("\n提取分类节点:")
    for dim_key, dim_name in dimension_mapping.items():
        category_nodes = extract_category_nodes_from_pattern(pattern_data, dim_key, dim_name)
        all_nodes.update(category_nodes)
        print(f" {dim_name}: {len(category_nodes)} 个")
    # Tag nodes
    print("\n提取标签节点:")
    for dim_key, dim_name in dimension_mapping.items():
        tag_nodes = extract_tag_nodes_from_pattern(pattern_data, dim_key, dim_name)
        all_nodes.update(tag_nodes)
        print(f" {dim_name}: {len(tag_nodes)} 个")
    # Totals; computed before the root and dimension nodes are added, so
    # those four nodes are not included in either count.
    category_count = sum(1 for n in all_nodes.values() if n["type"] == "分类")
    tag_count = sum(1 for n in all_nodes.values() if n["type"] == "标签")
    print(f"\n节点总计: {len(all_nodes)} (分类: {category_count}, 标签: {tag_count})")
    # ===== Extract edges =====
    print("\n" + "=" * 60)
    print("提取边...")
    all_edges = {}
    # 属于/包含 (hierarchy) edges
    print("\n提取属于/包含边:")
    for dim_key, dim_name in dimension_mapping.items():
        belong_contain_edges = extract_belong_contain_edges(pattern_data, dim_key, dim_name, all_nodes)
        all_edges.update(belong_contain_edges)
    belong_count = sum(1 for e in all_edges.values() if e["type"] == "属于")
    contain_count = sum(1 for e in all_edges.values() if e["type"] == "包含")
    print(f" 属于边: {belong_count}, 包含边: {contain_count}")
    # 分类共现 edges (cross-dimension)
    print("\n提取分类共现边(跨点):")
    category_cooccur_edges = extract_category_cooccur_edges(associations_data)
    all_edges.update(category_cooccur_edges)
    print(f" 分类共现边: {len(category_cooccur_edges)}")
    # 分类共现 edges (intra-dimension)
    print("\n提取分类共现边(点内):")
    intra_category_edges = extract_intra_category_cooccur_edges(intra_associations_data)
    all_edges.update(intra_category_edges)
    print(f" 分类共现边: {len(intra_category_edges)}")
    # 标签共现 edges
    print("\n提取标签共现边:")
    tag_cooccur_edges = extract_tag_cooccur_edges(historical_posts_dir)
    all_edges.update(tag_cooccur_edges)
    print(f" 标签共现边: {len(tag_cooccur_edges)}")
    # ===== Add the root node and the dimension nodes =====
    print("\n添加根节点和维度节点:")
    # Root node
    root_id = "人设:人设:人设:人设"
    all_nodes[root_id] = create_node(
        domain="人设",
        dimension="人设",
        node_type="人设",
        name="人设",
        detail={}
    )
    # Dimension nodes and their edges
    dimensions = ["灵感点", "目的点", "关键点"]
    for dim in dimensions:
        dim_id = f"人设:{dim}:{dim}:{dim}"
        all_nodes[dim_id] = create_node(
            domain="人设",
            dimension=dim,
            node_type=dim,
            name=dim,
            detail={}
        )
        # 属于 edge: dimension -> root
        edge_id = build_edge_id(dim_id, "属于", root_id)
        all_edges[edge_id] = create_edge(
            source=dim_id,
            target=root_id,
            edge_type="属于",
            score=1.0,
            detail={}
        )
        # 包含 edge: root -> dimension
        edge_id_contain = build_edge_id(root_id, "包含", dim_id)
        all_edges[edge_id_contain] = create_edge(
            source=root_id,
            target=dim_id,
            edge_type="包含",
            score=1.0,
            detail={}
        )
        # Top-level categories of this dimension (empty parentPath) are
        # linked to the dimension node.
        dim_categories = [
            (nid, ndata) for nid, ndata in all_nodes.items()
            if ndata["dimension"] == dim and ndata["type"] == "分类"
            and not ndata["detail"].get("parentPath")
        ]
        for cat_id, cat_data in dim_categories:
            # 属于 edge: top-level category -> dimension
            edge_id = build_edge_id(cat_id, "属于", dim_id)
            all_edges[edge_id] = create_edge(
                source=cat_id,
                target=dim_id,
                edge_type="属于",
                score=1.0,
                detail={}
            )
            # 包含 edge: dimension -> top-level category
            edge_id_contain = build_edge_id(dim_id, "包含", cat_id)
            all_edges[edge_id_contain] = create_edge(
                source=dim_id,
                target=cat_id,
                edge_type="包含",
                score=1.0,
                detail={}
            )
    print(f" 添加节点: 1 根节点 + 3 维度节点 = 4")
    print(f" 添加边: 根↔维度 6条 + 维度↔顶级分类")
    # Edge counts per type, printed in descending order of count
    edge_type_counts = {}
    for edge in all_edges.values():
        t = edge["type"]
        edge_type_counts[t] = edge_type_counts.get(t, 0) + 1
    print(f"\n边总计: {len(all_edges)}")
    for t, count in sorted(edge_type_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {count}")
    # ===== Build the walk index =====
    print("\n" + "=" * 60)
    print("构建索引...")
    index = build_index(all_edges)
    print(f" outEdges 节点数: {len(index['outEdges'])}")
    print(f" inEdges 节点数: {len(index['inEdges'])}")
    # ===== Build the nested tree =====
    print("\n" + "=" * 60)
    print("构建嵌套树...")
    tree = build_nested_tree(all_nodes, all_edges)
    # Count the entries of the nested tree (the entry itself plus all
    # descendants)
    def count_tree_nodes(node):
        count = 1
        for child in node.get("children", []):
            count += count_tree_nodes(child)
        return count
    tree_node_count = count_tree_nodes(tree)
    print(f" 树节点数: {tree_node_count}")
    # ===== Per-dimension statistics =====
    dimension_stats = {}
    for dim_name in ["灵感点", "目的点", "关键点"]:
        dim_categories = sum(1 for n in all_nodes.values() if n["type"] == "分类" and n["dimension"] == dim_name)
        dim_tags = sum(1 for n in all_nodes.values() if n["type"] == "标签" and n["dimension"] == dim_name)
        dimension_stats[dim_name] = {
            "categoryCount": dim_categories,
            "tagCount": dim_tags
        }
    # ===== Assemble and write the output =====
    print("\n" + "=" * 60)
    print("保存结果...")
    output_data = {
        "meta": {
            "description": "人设图谱数据",
            "account": config.account_name,
            "createdAt": datetime.now().isoformat(),
            "stats": {
                "nodeCount": len(all_nodes),
                "edgeCount": len(all_edges),
                "categoryCount": category_count,
                "tagCount": tag_count,
                "treeNodeCount": tree_node_count,
                "dimensions": dimension_stats,
                "edgeTypes": edge_type_counts
            }
        },
        "nodes": all_nodes,
        "edges": all_edges,
        "index": index,
        "tree": tree
    }
    # ensure_ascii=False keeps the Chinese text readable in the output file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)
    print(f"\n输出文件: {output_file}")
    print("\n" + "=" * 60)
    print("完成!")


if __name__ == "__main__":
    main()