build_persona_graph.py 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 构建人设图谱
  5. ================================================================================
  6. 输入文件:
  7. ================================================================================
  8. 1. pattern聚合结果.json - 分类节点、标签节点、属于/包含边
  9. 2. dimension_associations_analysis.json - 分类共现边(跨点)
  10. 3. intra_dimension_associations_analysis.json - 分类共现边(点内)
  11. 4. 历史帖子解构目录/*.json - 标签共现边
  12. ================================================================================
  13. 输出文件: 人设图谱.json
  14. ================================================================================
  15. {
  16. "meta": { # 元信息
  17. "description": "...",
  18. "account": "账号名",
  19. "createdAt": "时间戳",
  20. "stats": { ... } # 统计信息
  21. },
  22. "nodes": { # 节点字典 (nodeId -> nodeData)
  23. "{domain}:{dimension}:{type}:{name}": {
  24. "name": "显示名称",
  25. "type": "人设|灵感点|目的点|关键点|分类|标签",
  26. "domain": "人设",
  27. "dimension": "人设|灵感点|目的点|关键点",
  28. "detail": { ... }
  29. }
  30. },
  31. "edges": { # 边字典 (edgeId -> edgeData)
  32. "{source}|{type}|{target}": {
  33. "source": "源节点ID",
  34. "target": "目标节点ID",
  35. "type": "属于|包含|标签共现|分类共现",
  36. "score": 0.5,
  37. "detail": { ... }
  38. }
  39. },
  40. "index": { # 游走索引
  41. "outEdges": { nodeId: { edgeType: [{ target, score }] } },
  42. "inEdges": { nodeId: { edgeType: [{ source, score }] } }
  43. },
  44. "tree": { ... } # 嵌套树结构(从根节点沿"包含"边构建)
  45. }
  46. ================================================================================
  47. 核心逻辑:
  48. ================================================================================
  49. 1. 提取节点
  50. - 从 pattern 提取分类节点(按维度分组的层级分类)
  51. - 从 pattern 提取标签节点(具体特征标签)
  52. - 添加根节点(人设)和维度节点(灵感点/目的点/关键点)
  53. 2. 提取边
  54. - 属于/包含边:根据节点的 parentPath 构建层级关系
  55. - 分类共现边(跨点):从关联分析结果提取
  56. - 分类共现边(点内):从点内关联分析提取
  57. - 标签共现边:遍历历史帖子,统计标签同现
  58. 3. 构建索引
  59. - outEdges: 从该节点出发能到达的节点
  60. - inEdges: 能到达该节点的源节点
  61. 4. 构建树
  62. - 从根节点开始,沿"包含"边递归构建嵌套树结构
  63. ================================================================================
  64. 节点ID格式: {domain}:{dimension}:{type}:{name}
  65. ================================================================================
  66. - 根节点: 人设:人设:人设:人设
  67. - 维度节点: 人设:灵感点:灵感点:灵感点
  68. - 分类节点: 人设:灵感点:分类:视觉呈现
  69. - 标签节点: 人设:灵感点:标签:手绘风格
  70. ================================================================================
  71. 边类型:
  72. ================================================================================
  73. - 属于: 子节点 -> 父节点(层级关系)
  74. - 包含: 父节点 -> 子节点(层级关系)
  75. - 标签共现: 标签 <-> 标签(同一帖子出现)
  76. - 分类共现: 分类 <-> 分类(跨维度共现)
  77. - 分类共现: 分类 <-> 分类(点内组合共现)
  78. ================================================================================
  79. 图游走函数:
  80. ================================================================================
  81. 1. walk_graph(index, start_node, edge_types, direction, min_score)
  82. - 从起始节点出发,按边类型序列游走N步
  83. - 示例: walk_graph(index, "人设:灵感点:标签:手绘风格", ["属于", "分类共现"])
  84. - 返回: 到达的节点ID集合
  85. 2. get_neighbors(index, node_id, edge_type, direction, min_score)
  86. - 获取节点的邻居
  87. - 示例: get_neighbors(index, "人设:灵感点:分类:视觉呈现", "包含")
  88. - 返回: 邻居列表 [{"target": "...", "score": 0.5}, ...]
  89. ================================================================================
  90. """
  91. import json
  92. from pathlib import Path
  93. from typing import Dict, List, Set, Any
  94. from datetime import datetime
  95. import sys
  96. # 添加项目根目录到路径
  97. project_root = Path(__file__).parent.parent.parent
  98. sys.path.insert(0, str(project_root))
  99. from script.data_processing.path_config import PathConfig
  100. # ==================== 节点和边构建工具 ====================
  101. def build_node_id(domain: str, dimension: str, node_type: str, name: str) -> str:
  102. """构建节点ID"""
  103. return f"{domain}:{dimension}:{node_type}:{name}"
  104. def build_edge_id(source: str, edge_type: str, target: str) -> str:
  105. """构建边ID"""
  106. return f"{source}|{edge_type}|{target}"
  107. def create_node(
  108. domain: str,
  109. dimension: str,
  110. node_type: str,
  111. name: str,
  112. detail: Dict = None
  113. ) -> Dict:
  114. """创建节点"""
  115. return {
  116. "name": name,
  117. "type": node_type,
  118. "dimension": dimension,
  119. "domain": domain,
  120. "detail": detail or {}
  121. }
  122. def create_edge(
  123. source: str,
  124. target: str,
  125. edge_type: str,
  126. score: float = None,
  127. detail: Dict = None
  128. ) -> Dict:
  129. """创建边"""
  130. return {
  131. "source": source,
  132. "target": target,
  133. "type": edge_type,
  134. "score": score,
  135. "detail": detail or {}
  136. }
  137. # ==================== 从 pattern 提取分类节点 ====================
  138. def extract_category_nodes_from_pattern(
  139. pattern_data: Dict,
  140. dimension_key: str,
  141. dimension_name: str
  142. ) -> Dict[str, Dict]:
  143. """
  144. 从 pattern 聚合结果中提取分类节点
  145. Returns:
  146. { nodeId: nodeData }
  147. """
  148. nodes = {}
  149. if dimension_key not in pattern_data:
  150. return nodes
  151. def collect_sources_recursively(node: Dict) -> List[Dict]:
  152. """递归收集节点及其所有子节点的特征来源"""
  153. sources = []
  154. if "特征列表" in node:
  155. for feature in node["特征列表"]:
  156. source = {
  157. "pointName": feature.get("所属点", ""),
  158. "pointDesc": feature.get("点描述", ""),
  159. "postId": feature.get("帖子id", "")
  160. }
  161. sources.append(source)
  162. for key, value in node.items():
  163. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  164. continue
  165. if isinstance(value, dict):
  166. sources.extend(collect_sources_recursively(value))
  167. return sources
  168. def traverse_node(node: Dict, parent_path: List[str]):
  169. """递归遍历节点"""
  170. for key, value in node.items():
  171. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  172. continue
  173. if isinstance(value, dict):
  174. current_path = parent_path + [key]
  175. # 收集帖子ID列表(递归收集当前节点及所有子节点的帖子ID,去重)
  176. all_sources = collect_sources_recursively(value)
  177. unique_post_ids = list(set(s.get("postId", "") for s in all_sources if s.get("postId")))
  178. # 构建节点
  179. node_id = build_node_id("人设", dimension_name, "分类", key)
  180. nodes[node_id] = create_node(
  181. domain="人设",
  182. dimension=dimension_name,
  183. node_type="分类",
  184. name=key,
  185. detail={
  186. "parentPath": parent_path.copy(),
  187. "postIds": unique_post_ids,
  188. "postCount": len(unique_post_ids)
  189. }
  190. )
  191. # 递归处理子节点
  192. traverse_node(value, current_path)
  193. traverse_node(pattern_data[dimension_key], [])
  194. return nodes
  195. # ==================== 从 pattern 提取标签节点 ====================
  196. def extract_tag_nodes_from_pattern(
  197. pattern_data: Dict,
  198. dimension_key: str,
  199. dimension_name: str
  200. ) -> Dict[str, Dict]:
  201. """
  202. 从 pattern 聚合结果中提取标签节点
  203. Returns:
  204. { nodeId: nodeData }
  205. """
  206. nodes = {}
  207. tag_map = {} # 用于合并同名标签: tagId -> { postIds, parentPath }
  208. if dimension_key not in pattern_data:
  209. return nodes
  210. def traverse_node(node: Dict, parent_path: List[str]):
  211. """递归遍历节点"""
  212. # 处理特征列表(标签)
  213. if "特征列表" in node:
  214. for feature in node["特征列表"]:
  215. tag_name = feature.get("特征名称", "")
  216. if not tag_name:
  217. continue
  218. post_id = feature.get("帖子id", "")
  219. tag_id = build_node_id("人设", dimension_name, "标签", tag_name)
  220. if tag_id not in tag_map:
  221. tag_map[tag_id] = {
  222. "name": tag_name,
  223. "postIds": set(),
  224. "parentPath": parent_path.copy()
  225. }
  226. if post_id:
  227. tag_map[tag_id]["postIds"].add(post_id)
  228. # 递归处理子节点
  229. for key, value in node.items():
  230. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  231. continue
  232. if isinstance(value, dict):
  233. current_path = parent_path + [key]
  234. traverse_node(value, current_path)
  235. traverse_node(pattern_data[dimension_key], [])
  236. # 转换为节点
  237. for tag_id, tag_info in tag_map.items():
  238. nodes[tag_id] = create_node(
  239. domain="人设",
  240. dimension=dimension_name,
  241. node_type="标签",
  242. name=tag_info["name"],
  243. detail={
  244. "parentPath": tag_info["parentPath"],
  245. "postIds": list(tag_info["postIds"]),
  246. "postCount": len(tag_info["postIds"])
  247. }
  248. )
  249. return nodes
  250. # ==================== 从 pattern 提取属于/包含边 ====================
  251. def extract_belong_contain_edges(
  252. pattern_data: Dict,
  253. dimension_key: str,
  254. dimension_name: str,
  255. nodes: Dict[str, Dict]
  256. ) -> Dict[str, Dict]:
  257. """
  258. 从 pattern 聚合结果中提取属于/包含边
  259. Returns:
  260. { edgeId: edgeData }
  261. """
  262. edges = {}
  263. if dimension_key not in pattern_data:
  264. return edges
  265. # 构建分类名称到ID的映射
  266. category_name_to_id = {}
  267. for node_id, node_data in nodes.items():
  268. if node_data["type"] == "分类" and node_data["dimension"] == dimension_name:
  269. category_name_to_id[node_data["name"]] = node_id
  270. # 为每个节点创建属于边(子→父)
  271. for node_id, node_data in nodes.items():
  272. if node_data["dimension"] != dimension_name:
  273. continue
  274. parent_path = node_data["detail"].get("parentPath", [])
  275. if not parent_path:
  276. continue
  277. # 取最后一个作为直接父分类
  278. parent_name = parent_path[-1]
  279. parent_id = category_name_to_id.get(parent_name)
  280. if parent_id:
  281. # 获取 source 和 target 的 postIds
  282. child_post_ids = node_data["detail"].get("postIds", [])
  283. parent_post_ids = nodes.get(parent_id, {}).get("detail", {}).get("postIds", [])
  284. # 属于边:子 → 父
  285. edge_id = build_edge_id(node_id, "属于", parent_id)
  286. edges[edge_id] = create_edge(
  287. source=node_id,
  288. target=parent_id,
  289. edge_type="属于",
  290. score=1.0,
  291. detail={
  292. "sourcePostIds": child_post_ids,
  293. "targetPostIds": parent_post_ids
  294. }
  295. )
  296. # 包含边:父 → 子
  297. edge_id_contain = build_edge_id(parent_id, "包含", node_id)
  298. edges[edge_id_contain] = create_edge(
  299. source=parent_id,
  300. target=node_id,
  301. edge_type="包含",
  302. score=1.0,
  303. detail={
  304. "sourcePostIds": parent_post_ids,
  305. "targetPostIds": child_post_ids
  306. }
  307. )
  308. return edges
  309. # ==================== 从关联分析提取分类共现边(跨点)====================
  310. def extract_category_cooccur_edges(associations_data: Dict, nodes: Dict[str, Dict]) -> Dict[str, Dict]:
  311. """
  312. 从 dimension_associations_analysis.json 中提取分类共现边(跨点)
  313. Args:
  314. associations_data: 关联分析数据
  315. nodes: 已构建的节点数据(用于获取节点的 postIds)
  316. Returns:
  317. { edgeId: edgeData }
  318. """
  319. edges = {}
  320. if "单维度关联分析" not in associations_data:
  321. return edges
  322. single_dim = associations_data["单维度关联分析"]
  323. # 维度映射
  324. dimension_map = {
  325. "灵感点维度": "灵感点",
  326. "目的点维度": "目的点",
  327. "关键点维度": "关键点"
  328. }
  329. def get_last_segment(path: str) -> str:
  330. """获取路径的最后一段"""
  331. return path.split("/")[-1]
  332. for dim_key, dim_data in single_dim.items():
  333. if dim_key not in dimension_map:
  334. continue
  335. source_dimension = dimension_map[dim_key]
  336. for direction_key, direction_data in dim_data.items():
  337. if direction_key == "说明" or "→" not in direction_key:
  338. continue
  339. for source_path, source_info in direction_data.items():
  340. source_name = get_last_segment(source_path)
  341. source_node_id = build_node_id("人设", source_dimension, "分类", source_name)
  342. for field_name, associations in source_info.items():
  343. if not field_name.startswith("与") or not field_name.endswith("的关联"):
  344. continue
  345. target_dimension = field_name[1:-3]
  346. if not isinstance(associations, list):
  347. continue
  348. for assoc in associations:
  349. target_path = assoc.get("目标分类", "")
  350. if not target_path:
  351. continue
  352. target_name = get_last_segment(target_path)
  353. target_node_id = build_node_id("人设", target_dimension, "分类", target_name)
  354. # 使用 Jaccard 作为 score
  355. jaccard = assoc.get("Jaccard相似度", 0)
  356. # 获取 source 和 target 的 postIds
  357. source_post_ids = nodes.get(source_node_id, {}).get("detail", {}).get("postIds", [])
  358. target_post_ids = nodes.get(target_node_id, {}).get("detail", {}).get("postIds", [])
  359. edge_id = build_edge_id(source_node_id, "分类共现", target_node_id)
  360. edges[edge_id] = create_edge(
  361. source=source_node_id,
  362. target=target_node_id,
  363. edge_type="分类共现",
  364. score=jaccard,
  365. detail={
  366. "postIds": assoc.get("共同帖子ID", []),
  367. "postCount": assoc.get("共同帖子数", 0),
  368. "jaccard": jaccard,
  369. "overlapCoef": assoc.get("重叠系数", 0),
  370. "sourcePostIds": source_post_ids,
  371. "targetPostIds": target_post_ids
  372. }
  373. )
  374. return edges
  375. # ==================== 从关联分析提取分类共现边(点内)====================
  376. def extract_intra_category_cooccur_edges(intra_data: Dict, nodes: Dict[str, Dict]) -> Dict[str, Dict]:
  377. """
  378. 从 intra_dimension_associations_analysis.json 中提取点内分类共现边
  379. Args:
  380. intra_data: 点内关联分析数据
  381. nodes: 已构建的节点数据(用于获取节点的 postIds)
  382. Returns:
  383. { edgeId: edgeData }
  384. """
  385. edges = {}
  386. if "叶子分类组合聚类" not in intra_data:
  387. return edges
  388. clusters_by_dim = intra_data["叶子分类组合聚类"]
  389. for dimension, clusters in clusters_by_dim.items():
  390. if dimension not in ("灵感点", "目的点", "关键点"):
  391. continue
  392. for cluster_key, cluster_data in clusters.items():
  393. leaf_categories = cluster_data.get("叶子分类组合", [])
  394. point_count = cluster_data.get("点数", 0)
  395. point_details = cluster_data.get("点详情列表", [])
  396. # 提取点名称列表
  397. point_names = [p.get("点名称", "") for p in point_details if p.get("点名称")]
  398. # 两两组合生成共现边
  399. for i in range(len(leaf_categories)):
  400. for j in range(i + 1, len(leaf_categories)):
  401. cat1 = leaf_categories[i]
  402. cat2 = leaf_categories[j]
  403. cat1_id = build_node_id("人设", dimension, "分类", cat1)
  404. cat2_id = build_node_id("人设", dimension, "分类", cat2)
  405. # 确保顺序一致(按字典序)
  406. if cat1_id > cat2_id:
  407. cat1_id, cat2_id = cat2_id, cat1_id
  408. edge_id = build_edge_id(cat1_id, "分类共现", cat2_id)
  409. if edge_id in edges:
  410. # 累加
  411. edges[edge_id]["detail"]["pointCount"] += point_count
  412. edges[edge_id]["detail"]["pointNames"].extend(point_names)
  413. else:
  414. # 获取 source 和 target 的 postIds
  415. cat1_post_ids = nodes.get(cat1_id, {}).get("detail", {}).get("postIds", [])
  416. cat2_post_ids = nodes.get(cat2_id, {}).get("detail", {}).get("postIds", [])
  417. # 计算 Jaccard(基于帖子)
  418. cat1_set = set(cat1_post_ids)
  419. cat2_set = set(cat2_post_ids)
  420. intersection = cat1_set & cat2_set
  421. union = cat1_set | cat2_set
  422. jaccard = round(len(intersection) / len(union), 4) if union else 0
  423. edges[edge_id] = create_edge(
  424. source=cat1_id,
  425. target=cat2_id,
  426. edge_type="分类共现",
  427. score=jaccard,
  428. detail={
  429. "postIds": list(intersection),
  430. "postCount": len(intersection),
  431. "jaccard": jaccard,
  432. "pointCount": point_count,
  433. "pointNames": point_names.copy(),
  434. "sourcePostIds": cat1_post_ids,
  435. "targetPostIds": cat2_post_ids
  436. }
  437. )
  438. return edges
  439. # ==================== 从历史帖子提取标签共现边 ====================
  440. def extract_tag_cooccur_edges(historical_posts_dir: Path, nodes: Dict[str, Dict]) -> Dict[str, Dict]:
  441. """
  442. 从历史帖子解构结果中提取标签共现边
  443. Args:
  444. historical_posts_dir: 历史帖子目录
  445. nodes: 已构建的节点数据(用于获取标签的 postIds 计算 Jaccard)
  446. Returns:
  447. { edgeId: edgeData }
  448. """
  449. edges = {}
  450. cooccur_map = {} # (tag1_id, tag2_id) -> { postIds: set() }
  451. if not historical_posts_dir.exists():
  452. print(f" 警告: 历史帖子目录不存在: {historical_posts_dir}")
  453. return edges
  454. json_files = list(historical_posts_dir.glob("*.json"))
  455. print(f" 找到 {len(json_files)} 个历史帖子文件")
  456. def extract_post_id_from_filename(filename: str) -> str:
  457. """从文件名中提取帖子ID"""
  458. import re
  459. match = re.match(r'^([^_]+)_', filename)
  460. return match.group(1) if match else ""
  461. def extract_tags_from_post(post_data: Dict) -> Dict[str, List[str]]:
  462. """从帖子解构结果中提取所有标签"""
  463. tags_by_dimension = {
  464. "灵感点": [],
  465. "目的点": [],
  466. "关键点": []
  467. }
  468. if "三点解构" not in post_data:
  469. return tags_by_dimension
  470. three_points = post_data["三点解构"]
  471. # 灵感点
  472. if "灵感点" in three_points:
  473. inspiration = three_points["灵感点"]
  474. for section in ["全新内容", "共性差异", "共性内容"]:
  475. if section in inspiration and isinstance(inspiration[section], list):
  476. for item in inspiration[section]:
  477. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  478. for feature in item["提取的特征"]:
  479. tag_name = feature.get("特征名称", "")
  480. if tag_name:
  481. tags_by_dimension["灵感点"].append(tag_name)
  482. # 目的点
  483. if "目的点" in three_points:
  484. purpose = three_points["目的点"]
  485. if "purposes" in purpose and isinstance(purpose["purposes"], list):
  486. for item in purpose["purposes"]:
  487. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  488. for feature in item["提取的特征"]:
  489. tag_name = feature.get("特征名称", "")
  490. if tag_name:
  491. tags_by_dimension["目的点"].append(tag_name)
  492. # 关键点
  493. if "关键点" in three_points:
  494. key_points = three_points["关键点"]
  495. if "key_points" in key_points and isinstance(key_points["key_points"], list):
  496. for item in key_points["key_points"]:
  497. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  498. for feature in item["提取的特征"]:
  499. tag_name = feature.get("特征名称", "")
  500. if tag_name:
  501. tags_by_dimension["关键点"].append(tag_name)
  502. return tags_by_dimension
  503. # 遍历所有帖子文件
  504. for file_path in json_files:
  505. post_id = extract_post_id_from_filename(file_path.name)
  506. if not post_id:
  507. continue
  508. try:
  509. with open(file_path, "r", encoding="utf-8") as f:
  510. post_data = json.load(f)
  511. tags_by_dimension = extract_tags_from_post(post_data)
  512. # 对每个维度内的标签两两组合
  513. for dimension, tags in tags_by_dimension.items():
  514. unique_tags = list(set(tags))
  515. for i in range(len(unique_tags)):
  516. for j in range(i + 1, len(unique_tags)):
  517. tag1 = unique_tags[i]
  518. tag2 = unique_tags[j]
  519. tag1_id = build_node_id("人设", dimension, "标签", tag1)
  520. tag2_id = build_node_id("人设", dimension, "标签", tag2)
  521. # 确保顺序一致
  522. if tag1_id > tag2_id:
  523. tag1_id, tag2_id = tag2_id, tag1_id
  524. key = (tag1_id, tag2_id)
  525. if key not in cooccur_map:
  526. cooccur_map[key] = {"postIds": set()}
  527. cooccur_map[key]["postIds"].add(post_id)
  528. except Exception as e:
  529. print(f" 警告: 处理文件 {file_path.name} 时出错: {e}")
  530. # 转换为边
  531. for (tag1_id, tag2_id), info in cooccur_map.items():
  532. cooccur_post_ids = list(info["postIds"])
  533. cooccur_count = len(cooccur_post_ids)
  534. # 获取两个标签的帖子集合,计算 Jaccard
  535. tag1_post_ids = nodes.get(tag1_id, {}).get("detail", {}).get("postIds", [])
  536. tag2_post_ids = nodes.get(tag2_id, {}).get("detail", {}).get("postIds", [])
  537. union_count = len(set(tag1_post_ids) | set(tag2_post_ids))
  538. jaccard = round(cooccur_count / union_count, 4) if union_count > 0 else 0
  539. edge_id = build_edge_id(tag1_id, "标签共现", tag2_id)
  540. edges[edge_id] = create_edge(
  541. source=tag1_id,
  542. target=tag2_id,
  543. edge_type="标签共现",
  544. score=jaccard,
  545. detail={
  546. "postIds": cooccur_post_ids,
  547. "postCount": cooccur_count,
  548. "jaccard": jaccard,
  549. "sourcePostIds": tag1_post_ids,
  550. "targetPostIds": tag2_post_ids
  551. }
  552. )
  553. return edges
  554. # ==================== 构建嵌套树结构 ====================
  555. def build_nested_tree(nodes: Dict[str, Dict], edges: Dict[str, Dict]) -> Dict:
  556. """
  557. 从根节点开始,沿"包含"边递归构建嵌套树结构
  558. 包含边:父节点 -> 子节点
  559. 从根节点开始,递归找所有包含的子节点
  560. Returns:
  561. 嵌套的树结构
  562. """
  563. # 从"包含"边构建 父节点 -> [子节点] 的映射
  564. parent_to_children = {} # parent_id -> [child_id, ...]
  565. for edge_id, edge_data in edges.items():
  566. if edge_data["type"] == "包含":
  567. parent_id = edge_data["source"]
  568. child_id = edge_data["target"]
  569. if parent_id not in parent_to_children:
  570. parent_to_children[parent_id] = []
  571. parent_to_children[parent_id].append(child_id)
  572. # 递归构建子树
  573. def build_subtree(node_id: str) -> Dict:
  574. node_data = nodes[node_id]
  575. subtree = {
  576. "id": node_id,
  577. "name": node_data["name"],
  578. "type": node_data["type"],
  579. "domain": node_data["domain"],
  580. "dimension": node_data["dimension"],
  581. "detail": node_data.get("detail", {}),
  582. "children": []
  583. }
  584. # 获取子节点
  585. child_ids = parent_to_children.get(node_id, [])
  586. for child_id in child_ids:
  587. if child_id in nodes:
  588. subtree["children"].append(build_subtree(child_id))
  589. return subtree
  590. # 从根节点开始构建
  591. root_id = "人设:人设:人设:人设"
  592. return build_subtree(root_id)
  593. # ==================== 图游走工具 ====================
  594. def walk_graph(
  595. index: Dict,
  596. start_node: str,
  597. edge_types: List[str],
  598. direction: str = "out",
  599. min_score: float = None
  600. ) -> Set[str]:
  601. """
  602. 从起始节点出发,按指定边类型序列游走N步
  603. Args:
  604. index: 游走索引 {"outEdges": {...}, "inEdges": {...}}
  605. start_node: 起始节点ID
  606. edge_types: 边类型序列,如 ["属于", "分类共现"]
  607. direction: 游走方向 "out"(沿出边) / "in"(沿入边)
  608. min_score: 最小分数过滤
  609. Returns:
  610. 到达的节点ID集合
  611. Example:
  612. # 从标签出发,沿"属于"边走1步,再沿"分类共现"边走1步
  613. result = walk_graph(
  614. index,
  615. "人设:灵感点:标签:手绘风格",
  616. ["属于", "分类共现"]
  617. )
  618. """
  619. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  620. target_key = "target" if direction == "out" else "source"
  621. current_nodes = {start_node}
  622. for edge_type in edge_types:
  623. next_nodes = set()
  624. for node in current_nodes:
  625. neighbors = edge_index.get(node, {}).get(edge_type, [])
  626. for neighbor in neighbors:
  627. # 分数过滤
  628. if min_score is not None and neighbor.get("score", 0) < min_score:
  629. continue
  630. next_nodes.add(neighbor[target_key])
  631. current_nodes = next_nodes
  632. if not current_nodes:
  633. break
  634. return current_nodes
  635. def get_neighbors(
  636. index: Dict,
  637. node_id: str,
  638. edge_type: str = None,
  639. direction: str = "out",
  640. min_score: float = None
  641. ) -> List[Dict]:
  642. """
  643. 获取节点的邻居
  644. Args:
  645. index: 游走索引
  646. node_id: 节点ID
  647. edge_type: 边类型(可选,不指定则返回所有类型)
  648. direction: 方向 "out" / "in"
  649. min_score: 最小分数过滤
  650. Returns:
  651. 邻居列表 [{"target": "...", "score": 0.5}, ...]
  652. """
  653. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  654. node_edges = edge_index.get(node_id, {})
  655. if edge_type:
  656. neighbors = node_edges.get(edge_type, [])
  657. else:
  658. neighbors = []
  659. for edges in node_edges.values():
  660. neighbors.extend(edges)
  661. if min_score is not None:
  662. neighbors = [n for n in neighbors if n.get("score", 0) >= min_score]
  663. return neighbors
  664. # ==================== 构建索引 ====================
  665. def build_index(edges: Dict[str, Dict]) -> Dict:
  666. """
  667. 构建游走索引
  668. Returns:
  669. {
  670. "outEdges": { nodeId: { edgeType: [{ target, score }] } },
  671. "inEdges": { nodeId: { edgeType: [{ source, score }] } }
  672. }
  673. """
  674. out_edges = {}
  675. in_edges = {}
  676. for edge_id, edge_data in edges.items():
  677. source = edge_data["source"]
  678. target = edge_data["target"]
  679. edge_type = edge_data["type"]
  680. score = edge_data["score"]
  681. # outEdges
  682. if source not in out_edges:
  683. out_edges[source] = {}
  684. if edge_type not in out_edges[source]:
  685. out_edges[source][edge_type] = []
  686. out_edges[source][edge_type].append({
  687. "target": target,
  688. "score": score
  689. })
  690. # inEdges
  691. if target not in in_edges:
  692. in_edges[target] = {}
  693. if edge_type not in in_edges[target]:
  694. in_edges[target][edge_type] = []
  695. in_edges[target][edge_type].append({
  696. "source": source,
  697. "score": score
  698. })
  699. return {
  700. "outEdges": out_edges,
  701. "inEdges": in_edges
  702. }
  703. # ==================== 主函数 ====================
  704. def main():
  705. config = PathConfig()
  706. config.ensure_dirs()
  707. print(f"账号: {config.account_name}")
  708. print(f"输出版本: {config.output_version}")
  709. print()
  710. # 输入文件路径
  711. pattern_file = config.pattern_cluster_file
  712. associations_file = config.account_dir / "pattern相关文件/optimization/dimension_associations_analysis.json"
  713. intra_associations_file = config.account_dir / "pattern相关文件/optimization/intra_dimension_associations_analysis.json"
  714. historical_posts_dir = config.historical_posts_dir
  715. # 输出文件路径
  716. output_file = config.intermediate_dir / "人设图谱.json"
  717. print("输入文件:")
  718. print(f" pattern聚合文件: {pattern_file}")
  719. print(f" 跨点关联分析文件: {associations_file}")
  720. print(f" 点内关联分析文件: {intra_associations_file}")
  721. print(f" 历史帖子目录: {historical_posts_dir}")
  722. print(f"\n输出文件: {output_file}")
  723. print()
  724. # ===== 读取数据 =====
  725. print("=" * 60)
  726. print("读取数据...")
  727. print(" 读取 pattern 聚合结果...")
  728. with open(pattern_file, "r", encoding="utf-8") as f:
  729. pattern_data = json.load(f)
  730. print(" 读取跨点关联分析结果...")
  731. with open(associations_file, "r", encoding="utf-8") as f:
  732. associations_data = json.load(f)
  733. print(" 读取点内关联分析结果...")
  734. with open(intra_associations_file, "r", encoding="utf-8") as f:
  735. intra_associations_data = json.load(f)
  736. # ===== 提取节点 =====
  737. print("\n" + "=" * 60)
  738. print("提取节点...")
  739. all_nodes = {}
  740. dimension_mapping = {
  741. "灵感点列表": "灵感点",
  742. "目的点": "目的点",
  743. "关键点列表": "关键点"
  744. }
  745. # 分类节点
  746. print("\n提取分类节点:")
  747. for dim_key, dim_name in dimension_mapping.items():
  748. category_nodes = extract_category_nodes_from_pattern(pattern_data, dim_key, dim_name)
  749. all_nodes.update(category_nodes)
  750. print(f" {dim_name}: {len(category_nodes)} 个")
  751. # 标签节点
  752. print("\n提取标签节点:")
  753. for dim_key, dim_name in dimension_mapping.items():
  754. tag_nodes = extract_tag_nodes_from_pattern(pattern_data, dim_key, dim_name)
  755. all_nodes.update(tag_nodes)
  756. print(f" {dim_name}: {len(tag_nodes)} 个")
  757. # 统计
  758. category_count = sum(1 for n in all_nodes.values() if n["type"] == "分类")
  759. tag_count = sum(1 for n in all_nodes.values() if n["type"] == "标签")
  760. print(f"\n节点总计: {len(all_nodes)} (分类: {category_count}, 标签: {tag_count})")
  761. # ===== 提取边 =====
  762. print("\n" + "=" * 60)
  763. print("提取边...")
  764. all_edges = {}
  765. # 属于/包含边
  766. print("\n提取属于/包含边:")
  767. for dim_key, dim_name in dimension_mapping.items():
  768. belong_contain_edges = extract_belong_contain_edges(pattern_data, dim_key, dim_name, all_nodes)
  769. all_edges.update(belong_contain_edges)
  770. belong_count = sum(1 for e in all_edges.values() if e["type"] == "属于")
  771. contain_count = sum(1 for e in all_edges.values() if e["type"] == "包含")
  772. print(f" 属于边: {belong_count}, 包含边: {contain_count}")
  773. # 分类共现边(跨点)
  774. print("\n提取分类共现边(跨点):")
  775. category_cooccur_edges = extract_category_cooccur_edges(associations_data, all_nodes)
  776. all_edges.update(category_cooccur_edges)
  777. print(f" 分类共现边: {len(category_cooccur_edges)}")
  778. # 分类共现边(点内)
  779. print("\n提取分类共现边(点内):")
  780. intra_category_edges = extract_intra_category_cooccur_edges(intra_associations_data, all_nodes)
  781. all_edges.update(intra_category_edges)
  782. print(f" 分类共现边: {len(intra_category_edges)}")
  783. # 标签共现边
  784. print("\n提取标签共现边:")
  785. tag_cooccur_edges = extract_tag_cooccur_edges(historical_posts_dir, all_nodes)
  786. all_edges.update(tag_cooccur_edges)
  787. print(f" 标签共现边: {len(tag_cooccur_edges)}")
  788. # ===== 添加根节点和维度节点 =====
  789. print("\n添加根节点和维度节点:")
  790. # 收集所有帖子ID(用于根节点)
  791. all_post_ids_for_root = set()
  792. for node in all_nodes.values():
  793. post_ids = node["detail"].get("postIds", [])
  794. all_post_ids_for_root.update(post_ids)
  795. # 根节点
  796. root_id = "人设:人设:人设:人设"
  797. root_post_ids = list(all_post_ids_for_root)
  798. all_nodes[root_id] = create_node(
  799. domain="人设",
  800. dimension="人设",
  801. node_type="人设",
  802. name="人设",
  803. detail={
  804. "postIds": root_post_ids,
  805. "postCount": len(root_post_ids)
  806. }
  807. )
  808. # 维度节点 + 边
  809. dimensions = ["灵感点", "目的点", "关键点"]
  810. for dim in dimensions:
  811. # 收集该维度下所有节点的帖子ID
  812. dim_post_ids = set()
  813. for node in all_nodes.values():
  814. if node["dimension"] == dim:
  815. post_ids = node["detail"].get("postIds", [])
  816. dim_post_ids.update(post_ids)
  817. dim_post_ids_list = list(dim_post_ids)
  818. dim_id = f"人设:{dim}:{dim}:{dim}"
  819. all_nodes[dim_id] = create_node(
  820. domain="人设",
  821. dimension=dim,
  822. node_type=dim,
  823. name=dim,
  824. detail={
  825. "postIds": dim_post_ids_list,
  826. "postCount": len(dim_post_ids_list)
  827. }
  828. )
  829. # 维度 -> 根 的属于边
  830. edge_id = build_edge_id(dim_id, "属于", root_id)
  831. all_edges[edge_id] = create_edge(
  832. source=dim_id,
  833. target=root_id,
  834. edge_type="属于",
  835. score=1.0,
  836. detail={
  837. "sourcePostIds": dim_post_ids_list,
  838. "targetPostIds": root_post_ids
  839. }
  840. )
  841. # 根 -> 维度 的包含边
  842. edge_id_contain = build_edge_id(root_id, "包含", dim_id)
  843. all_edges[edge_id_contain] = create_edge(
  844. source=root_id,
  845. target=dim_id,
  846. edge_type="包含",
  847. score=1.0,
  848. detail={
  849. "sourcePostIds": root_post_ids,
  850. "targetPostIds": dim_post_ids_list
  851. }
  852. )
  853. # 找该维度下的顶级分类(没有父节点的分类),添加边
  854. dim_categories = [
  855. (nid, ndata) for nid, ndata in all_nodes.items()
  856. if ndata["dimension"] == dim and ndata["type"] == "分类"
  857. and not ndata["detail"].get("parentPath")
  858. ]
  859. for cat_id, cat_data in dim_categories:
  860. cat_post_ids = cat_data["detail"].get("postIds", [])
  861. # 顶级分类 -> 维度 的属于边
  862. edge_id = build_edge_id(cat_id, "属于", dim_id)
  863. all_edges[edge_id] = create_edge(
  864. source=cat_id,
  865. target=dim_id,
  866. edge_type="属于",
  867. score=1.0,
  868. detail={
  869. "sourcePostIds": cat_post_ids,
  870. "targetPostIds": dim_post_ids_list
  871. }
  872. )
  873. # 维度 -> 顶级分类 的包含边
  874. edge_id_contain = build_edge_id(dim_id, "包含", cat_id)
  875. all_edges[edge_id_contain] = create_edge(
  876. source=dim_id,
  877. target=cat_id,
  878. edge_type="包含",
  879. score=1.0,
  880. detail={
  881. "sourcePostIds": dim_post_ids_list,
  882. "targetPostIds": cat_post_ids
  883. }
  884. )
  885. print(f" 添加节点: 1 根节点 + 3 维度节点 = 4")
  886. print(f" 添加边: 根↔维度 6条 + 维度↔顶级分类")
  887. # 边统计
  888. edge_type_counts = {}
  889. for edge in all_edges.values():
  890. t = edge["type"]
  891. edge_type_counts[t] = edge_type_counts.get(t, 0) + 1
  892. print(f"\n边总计: {len(all_edges)}")
  893. for t, count in sorted(edge_type_counts.items(), key=lambda x: -x[1]):
  894. print(f" {t}: {count}")
  895. # ===== 计算节点概率 =====
  896. print("\n" + "=" * 60)
  897. print("计算节点概率...")
  898. # 1. 计算总帖子数(所有帖子ID的并集)
  899. all_post_ids = set()
  900. for node in all_nodes.values():
  901. post_ids = node["detail"].get("postIds", [])
  902. all_post_ids.update(post_ids)
  903. total_post_count = len(all_post_ids)
  904. print(f" 总帖子数: {total_post_count}")
  905. # 2. 为每个节点计算概率
  906. for node_id, node in all_nodes.items():
  907. post_count = node["detail"].get("postCount", 0)
  908. # 全局概率
  909. if total_post_count > 0:
  910. node["detail"]["probGlobal"] = round(post_count / total_post_count, 4)
  911. else:
  912. node["detail"]["probGlobal"] = 0
  913. # 相对父节点的概率
  914. # 通过"属于"边找父节点
  915. parent_edge_id = None
  916. for edge_id, edge in all_edges.items():
  917. if edge["source"] == node_id and edge["type"] == "属于":
  918. parent_node_id = edge["target"]
  919. parent_node = all_nodes.get(parent_node_id)
  920. if parent_node:
  921. parent_post_count = parent_node["detail"].get("postCount", 0)
  922. if parent_post_count > 0:
  923. node["detail"]["probToParent"] = round(post_count / parent_post_count, 4)
  924. else:
  925. node["detail"]["probToParent"] = 0
  926. break
  927. else:
  928. # 没有父节点(根节点)
  929. node["detail"]["probToParent"] = 1.0
  930. print(f" 已为 {len(all_nodes)} 个节点计算概率")
  931. # 3. 更新"包含"边的分数(使用子节点的 probToParent)
  932. contain_edge_updated = 0
  933. for edge_id, edge in all_edges.items():
  934. if edge["type"] == "包含":
  935. target_node = all_nodes.get(edge["target"])
  936. if target_node:
  937. edge["score"] = target_node["detail"].get("probToParent", 1.0)
  938. contain_edge_updated += 1
  939. print(f" 已更新 {contain_edge_updated} 条包含边的分数")
  940. # ===== 构建索引 =====
  941. print("\n" + "=" * 60)
  942. print("构建索引...")
  943. index = build_index(all_edges)
  944. print(f" outEdges 节点数: {len(index['outEdges'])}")
  945. print(f" inEdges 节点数: {len(index['inEdges'])}")
  946. # ===== 构建嵌套树 =====
  947. print("\n" + "=" * 60)
  948. print("构建嵌套树...")
  949. tree = build_nested_tree(all_nodes, all_edges)
  950. # 统计树节点数
  951. def count_tree_nodes(node):
  952. count = 1
  953. for child in node.get("children", []):
  954. count += count_tree_nodes(child)
  955. return count
  956. tree_node_count = count_tree_nodes(tree)
  957. print(f" 树节点数: {tree_node_count}")
  958. # ===== 统计各维度 =====
  959. dimension_stats = {}
  960. for dim_name in ["灵感点", "目的点", "关键点"]:
  961. dim_categories = sum(1 for n in all_nodes.values() if n["type"] == "分类" and n["dimension"] == dim_name)
  962. dim_tags = sum(1 for n in all_nodes.values() if n["type"] == "标签" and n["dimension"] == dim_name)
  963. dimension_stats[dim_name] = {
  964. "categoryCount": dim_categories,
  965. "tagCount": dim_tags
  966. }
  967. # ===== 构建输出 =====
  968. print("\n" + "=" * 60)
  969. print("保存结果...")
  970. output_data = {
  971. "meta": {
  972. "description": "人设图谱数据",
  973. "account": config.account_name,
  974. "createdAt": datetime.now().isoformat(),
  975. "stats": {
  976. "nodeCount": len(all_nodes),
  977. "edgeCount": len(all_edges),
  978. "categoryCount": category_count,
  979. "tagCount": tag_count,
  980. "treeNodeCount": tree_node_count,
  981. "dimensions": dimension_stats,
  982. "edgeTypes": edge_type_counts
  983. }
  984. },
  985. "nodes": all_nodes,
  986. "edges": all_edges,
  987. "index": index,
  988. "tree": tree
  989. }
  990. with open(output_file, "w", encoding="utf-8") as f:
  991. json.dump(output_data, f, ensure_ascii=False, indent=2)
  992. print(f"\n输出文件: {output_file}")
  993. print("\n" + "=" * 60)
  994. print("完成!")
# Script entry point: call main() only when this module is executed
# directly (its __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()