build_persona_graph.py 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 构建人设图谱
  5. ================================================================================
  6. 输入文件:
  7. ================================================================================
  8. 1. pattern聚合结果.json - 分类节点、标签节点、属于/包含边
  9. 2. dimension_associations_analysis.json - 分类共现边(跨点)
  10. 3. intra_dimension_associations_analysis.json - 分类共现边(点内)
  11. 4. 历史帖子解构目录/*.json - 标签共现边
  12. ================================================================================
  13. 输出文件: 人设图谱.json
  14. ================================================================================
  15. {
  16. "meta": { # 元信息
  17. "description": "...",
  18. "account": "账号名",
  19. "createdAt": "时间戳",
  20. "stats": { ... } # 统计信息
  21. },
  22. "nodes": { # 节点字典 (nodeId -> nodeData)
  23. "{domain}:{dimension}:{type}:{name}": {
  24. "name": "显示名称",
  25. "type": "人设|灵感点|目的点|关键点|分类|标签",
  26. "domain": "人设",
  27. "dimension": "人设|灵感点|目的点|关键点",
  28. "detail": { ... }
  29. }
  30. },
  31. "edges": { # 边字典 (edgeId -> edgeData)
  32. "{source}|{type}|{target}": {
  33. "source": "源节点ID",
  34. "target": "目标节点ID",
  35. "type": "属于|包含|标签共现|分类共现",
  36. "score": 0.5,
  37. "detail": { ... }
  38. }
  39. },
  40. "index": { # 游走索引
  41. "outEdges": { nodeId: { edgeType: [{ target, score }] } },
  42. "inEdges": { nodeId: { edgeType: [{ source, score }] } }
  43. },
  44. "tree": { ... } # 嵌套树结构(从根节点沿"包含"边构建)
  45. }
  46. ================================================================================
  47. 核心逻辑:
  48. ================================================================================
  49. 1. 提取节点
  50. - 从 pattern 提取分类节点(按维度分组的层级分类)
  51. - 从 pattern 提取标签节点(具体特征标签)
  52. - 添加根节点(人设)和维度节点(灵感点/目的点/关键点)
  53. 2. 提取边
  54. - 属于/包含边:根据节点的 parentPath 构建层级关系
  55. - 分类共现边(跨点):从关联分析结果提取
  56. - 分类共现边(点内):从点内关联分析提取
  57. - 标签共现边:遍历历史帖子,统计标签同现
  58. 3. 构建索引
  59. - outEdges: 从该节点出发能到达的节点
  60. - inEdges: 能到达该节点的源节点
  61. 4. 构建树
  62. - 从根节点开始,沿"包含"边递归构建嵌套树结构
  63. ================================================================================
  64. 节点ID格式: {domain}:{dimension}:{type}:{name}
  65. ================================================================================
  66. - 根节点: 人设:人设:人设:人设
  67. - 维度节点: 人设:灵感点:灵感点:灵感点
  68. - 分类节点: 人设:灵感点:分类:视觉呈现
  69. - 标签节点: 人设:灵感点:标签:手绘风格
  70. ================================================================================
  71. 边类型:
  72. ================================================================================
  73. - 属于: 子节点 -> 父节点(层级关系)
  74. - 包含: 父节点 -> 子节点(层级关系)
  75. - 标签共现: 标签 <-> 标签(同一帖子出现)
  76. - 分类共现: 分类 <-> 分类(跨维度共现)
  77. - 分类共现: 分类 <-> 分类(点内组合共现)
  78. ================================================================================
  79. 图游走函数:
  80. ================================================================================
  81. 1. walk_graph(index, start_node, edge_types, direction, min_score)
  82. - 从起始节点出发,按边类型序列游走N步
  83. - 示例: walk_graph(index, "人设:灵感点:标签:手绘风格", ["属于", "分类共现"])
  84. - 返回: 到达的节点ID集合
  85. 2. get_neighbors(index, node_id, edge_type, direction, min_score)
  86. - 获取节点的邻居
  87. - 示例: get_neighbors(index, "人设:灵感点:分类:视觉呈现", "包含")
  88. - 返回: 邻居列表 [{"target": "...", "score": 0.5}, ...]
  89. ================================================================================
  90. """
  91. import json
  92. from pathlib import Path
  93. from typing import Dict, List, Set, Any
  94. from datetime import datetime
  95. import sys
  96. # 添加项目根目录到路径
  97. project_root = Path(__file__).parent.parent.parent
  98. sys.path.insert(0, str(project_root))
  99. from script.data_processing.path_config import PathConfig
  100. # ==================== 节点和边构建工具 ====================
  101. def build_node_id(domain: str, dimension: str, node_type: str, name: str) -> str:
  102. """构建节点ID"""
  103. return f"{domain}:{dimension}:{node_type}:{name}"
  104. def build_edge_id(source: str, edge_type: str, target: str) -> str:
  105. """构建边ID"""
  106. return f"{source}|{edge_type}|{target}"
  107. def create_node(
  108. domain: str,
  109. dimension: str,
  110. node_type: str,
  111. name: str,
  112. detail: Dict = None
  113. ) -> Dict:
  114. """创建节点"""
  115. return {
  116. "name": name,
  117. "type": node_type,
  118. "dimension": dimension,
  119. "domain": domain,
  120. "detail": detail or {}
  121. }
  122. def create_edge(
  123. source: str,
  124. target: str,
  125. edge_type: str,
  126. score: float = None,
  127. detail: Dict = None
  128. ) -> Dict:
  129. """创建边"""
  130. return {
  131. "source": source,
  132. "target": target,
  133. "type": edge_type,
  134. "score": score,
  135. "detail": detail or {}
  136. }
  137. # ==================== 从 pattern 提取分类节点 ====================
  138. def extract_category_nodes_from_pattern(
  139. pattern_data: Dict,
  140. dimension_key: str,
  141. dimension_name: str
  142. ) -> Dict[str, Dict]:
  143. """
  144. 从 pattern 聚合结果中提取分类节点
  145. Returns:
  146. { nodeId: nodeData }
  147. """
  148. nodes = {}
  149. if dimension_key not in pattern_data:
  150. return nodes
  151. def collect_sources_recursively(node: Dict) -> List[Dict]:
  152. """递归收集节点及其所有子节点的特征来源"""
  153. sources = []
  154. if "特征列表" in node:
  155. for feature in node["特征列表"]:
  156. source = {
  157. "pointName": feature.get("所属点", ""),
  158. "pointDesc": feature.get("点描述", ""),
  159. "postId": feature.get("帖子id", "")
  160. }
  161. sources.append(source)
  162. for key, value in node.items():
  163. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  164. continue
  165. if isinstance(value, dict):
  166. sources.extend(collect_sources_recursively(value))
  167. return sources
  168. def traverse_node(node: Dict, parent_path: List[str]):
  169. """递归遍历节点"""
  170. for key, value in node.items():
  171. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  172. continue
  173. if isinstance(value, dict):
  174. current_path = parent_path + [key]
  175. # 构建节点来源(只收集当前节点的特征)
  176. node_sources = []
  177. if "特征列表" in value:
  178. for feature in value["特征列表"]:
  179. source = {
  180. "pointName": feature.get("所属点", ""),
  181. "pointDesc": feature.get("点描述", ""),
  182. "postId": feature.get("帖子id", "")
  183. }
  184. node_sources.append(source)
  185. # 收集帖子ID列表(递归收集当前节点及所有子节点的帖子ID,去重)
  186. all_sources = collect_sources_recursively(value)
  187. unique_post_ids = list(set(s.get("postId", "") for s in all_sources if s.get("postId")))
  188. # 构建节点
  189. node_id = build_node_id("人设", dimension_name, "分类", key)
  190. nodes[node_id] = create_node(
  191. domain="人设",
  192. dimension=dimension_name,
  193. node_type="分类",
  194. name=key,
  195. detail={
  196. "parentPath": parent_path.copy(),
  197. "postIds": unique_post_ids,
  198. "postCount": len(unique_post_ids),
  199. "sources": node_sources
  200. }
  201. )
  202. # 递归处理子节点
  203. traverse_node(value, current_path)
  204. traverse_node(pattern_data[dimension_key], [])
  205. return nodes
  206. # ==================== 从 pattern 提取标签节点 ====================
  207. def extract_tag_nodes_from_pattern(
  208. pattern_data: Dict,
  209. dimension_key: str,
  210. dimension_name: str
  211. ) -> Dict[str, Dict]:
  212. """
  213. 从 pattern 聚合结果中提取标签节点
  214. Returns:
  215. { nodeId: nodeData }
  216. """
  217. nodes = {}
  218. tag_map = {} # 用于合并同名标签: tagId -> { sources, postIds, parentPath }
  219. if dimension_key not in pattern_data:
  220. return nodes
  221. def traverse_node(node: Dict, parent_path: List[str]):
  222. """递归遍历节点"""
  223. # 处理特征列表(标签)
  224. if "特征列表" in node:
  225. for feature in node["特征列表"]:
  226. tag_name = feature.get("特征名称", "")
  227. if not tag_name:
  228. continue
  229. source = {
  230. "pointName": feature.get("所属点", ""),
  231. "pointDesc": feature.get("点描述", ""),
  232. "postId": feature.get("帖子id", "")
  233. }
  234. tag_id = build_node_id("人设", dimension_name, "标签", tag_name)
  235. if tag_id not in tag_map:
  236. tag_map[tag_id] = {
  237. "name": tag_name,
  238. "sources": [],
  239. "postIds": set(),
  240. "parentPath": parent_path.copy()
  241. }
  242. tag_map[tag_id]["sources"].append(source)
  243. if source["postId"]:
  244. tag_map[tag_id]["postIds"].add(source["postId"])
  245. # 递归处理子节点
  246. for key, value in node.items():
  247. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  248. continue
  249. if isinstance(value, dict):
  250. current_path = parent_path + [key]
  251. traverse_node(value, current_path)
  252. traverse_node(pattern_data[dimension_key], [])
  253. # 转换为节点
  254. for tag_id, tag_info in tag_map.items():
  255. nodes[tag_id] = create_node(
  256. domain="人设",
  257. dimension=dimension_name,
  258. node_type="标签",
  259. name=tag_info["name"],
  260. detail={
  261. "parentPath": tag_info["parentPath"],
  262. "postIds": list(tag_info["postIds"]),
  263. "postCount": len(tag_info["postIds"]),
  264. "sources": tag_info["sources"]
  265. }
  266. )
  267. return nodes
  268. # ==================== 从 pattern 提取属于/包含边 ====================
  269. def extract_belong_contain_edges(
  270. pattern_data: Dict,
  271. dimension_key: str,
  272. dimension_name: str,
  273. nodes: Dict[str, Dict]
  274. ) -> Dict[str, Dict]:
  275. """
  276. 从 pattern 聚合结果中提取属于/包含边
  277. Returns:
  278. { edgeId: edgeData }
  279. """
  280. edges = {}
  281. if dimension_key not in pattern_data:
  282. return edges
  283. # 构建分类名称到ID的映射
  284. category_name_to_id = {}
  285. for node_id, node_data in nodes.items():
  286. if node_data["type"] == "分类" and node_data["dimension"] == dimension_name:
  287. category_name_to_id[node_data["name"]] = node_id
  288. # 为每个节点创建属于边(子→父)
  289. for node_id, node_data in nodes.items():
  290. if node_data["dimension"] != dimension_name:
  291. continue
  292. parent_path = node_data["detail"].get("parentPath", [])
  293. if not parent_path:
  294. continue
  295. # 取最后一个作为直接父分类
  296. parent_name = parent_path[-1]
  297. parent_id = category_name_to_id.get(parent_name)
  298. if parent_id:
  299. # 获取 source 和 target 的 postIds
  300. child_post_ids = node_data["detail"].get("postIds", [])
  301. parent_post_ids = nodes.get(parent_id, {}).get("detail", {}).get("postIds", [])
  302. # 属于边:子 → 父
  303. edge_id = build_edge_id(node_id, "属于", parent_id)
  304. edges[edge_id] = create_edge(
  305. source=node_id,
  306. target=parent_id,
  307. edge_type="属于",
  308. score=1.0,
  309. detail={
  310. "sourcePostIds": child_post_ids,
  311. "targetPostIds": parent_post_ids
  312. }
  313. )
  314. # 包含边:父 → 子
  315. edge_id_contain = build_edge_id(parent_id, "包含", node_id)
  316. edges[edge_id_contain] = create_edge(
  317. source=parent_id,
  318. target=node_id,
  319. edge_type="包含",
  320. score=1.0,
  321. detail={
  322. "sourcePostIds": parent_post_ids,
  323. "targetPostIds": child_post_ids
  324. }
  325. )
  326. return edges
  327. # ==================== 从关联分析提取分类共现边(跨点)====================
  328. def extract_category_cooccur_edges(associations_data: Dict, nodes: Dict[str, Dict]) -> Dict[str, Dict]:
  329. """
  330. 从 dimension_associations_analysis.json 中提取分类共现边(跨点)
  331. Args:
  332. associations_data: 关联分析数据
  333. nodes: 已构建的节点数据(用于获取节点的 postIds)
  334. Returns:
  335. { edgeId: edgeData }
  336. """
  337. edges = {}
  338. if "单维度关联分析" not in associations_data:
  339. return edges
  340. single_dim = associations_data["单维度关联分析"]
  341. # 维度映射
  342. dimension_map = {
  343. "灵感点维度": "灵感点",
  344. "目的点维度": "目的点",
  345. "关键点维度": "关键点"
  346. }
  347. def get_last_segment(path: str) -> str:
  348. """获取路径的最后一段"""
  349. return path.split("/")[-1]
  350. for dim_key, dim_data in single_dim.items():
  351. if dim_key not in dimension_map:
  352. continue
  353. source_dimension = dimension_map[dim_key]
  354. for direction_key, direction_data in dim_data.items():
  355. if direction_key == "说明" or "→" not in direction_key:
  356. continue
  357. for source_path, source_info in direction_data.items():
  358. source_name = get_last_segment(source_path)
  359. source_node_id = build_node_id("人设", source_dimension, "分类", source_name)
  360. for field_name, associations in source_info.items():
  361. if not field_name.startswith("与") or not field_name.endswith("的关联"):
  362. continue
  363. target_dimension = field_name[1:-3]
  364. if not isinstance(associations, list):
  365. continue
  366. for assoc in associations:
  367. target_path = assoc.get("目标分类", "")
  368. if not target_path:
  369. continue
  370. target_name = get_last_segment(target_path)
  371. target_node_id = build_node_id("人设", target_dimension, "分类", target_name)
  372. # 使用 Jaccard 作为 score
  373. jaccard = assoc.get("Jaccard相似度", 0)
  374. # 获取 source 和 target 的 postIds
  375. source_post_ids = nodes.get(source_node_id, {}).get("detail", {}).get("postIds", [])
  376. target_post_ids = nodes.get(target_node_id, {}).get("detail", {}).get("postIds", [])
  377. edge_id = build_edge_id(source_node_id, "分类共现", target_node_id)
  378. edges[edge_id] = create_edge(
  379. source=source_node_id,
  380. target=target_node_id,
  381. edge_type="分类共现",
  382. score=jaccard,
  383. detail={
  384. "postIds": assoc.get("共同帖子ID", []),
  385. "postCount": assoc.get("共同帖子数", 0),
  386. "jaccard": jaccard,
  387. "overlapCoef": assoc.get("重叠系数", 0),
  388. "sourcePostIds": source_post_ids,
  389. "targetPostIds": target_post_ids
  390. }
  391. )
  392. return edges
  393. # ==================== 从关联分析提取分类共现边(点内)====================
  394. def extract_intra_category_cooccur_edges(intra_data: Dict, nodes: Dict[str, Dict]) -> Dict[str, Dict]:
  395. """
  396. 从 intra_dimension_associations_analysis.json 中提取点内分类共现边
  397. Args:
  398. intra_data: 点内关联分析数据
  399. nodes: 已构建的节点数据(用于获取节点的 postIds)
  400. Returns:
  401. { edgeId: edgeData }
  402. """
  403. edges = {}
  404. if "叶子分类组合聚类" not in intra_data:
  405. return edges
  406. clusters_by_dim = intra_data["叶子分类组合聚类"]
  407. for dimension, clusters in clusters_by_dim.items():
  408. if dimension not in ("灵感点", "目的点", "关键点"):
  409. continue
  410. for cluster_key, cluster_data in clusters.items():
  411. leaf_categories = cluster_data.get("叶子分类组合", [])
  412. point_count = cluster_data.get("点数", 0)
  413. point_details = cluster_data.get("点详情列表", [])
  414. # 提取点名称列表
  415. point_names = [p.get("点名称", "") for p in point_details if p.get("点名称")]
  416. # 两两组合生成共现边
  417. for i in range(len(leaf_categories)):
  418. for j in range(i + 1, len(leaf_categories)):
  419. cat1 = leaf_categories[i]
  420. cat2 = leaf_categories[j]
  421. cat1_id = build_node_id("人设", dimension, "分类", cat1)
  422. cat2_id = build_node_id("人设", dimension, "分类", cat2)
  423. # 确保顺序一致(按字典序)
  424. if cat1_id > cat2_id:
  425. cat1_id, cat2_id = cat2_id, cat1_id
  426. edge_id = build_edge_id(cat1_id, "分类共现", cat2_id)
  427. if edge_id in edges:
  428. # 累加
  429. edges[edge_id]["detail"]["pointCount"] += point_count
  430. edges[edge_id]["detail"]["pointNames"].extend(point_names)
  431. else:
  432. # 获取 source 和 target 的 postIds
  433. cat1_post_ids = nodes.get(cat1_id, {}).get("detail", {}).get("postIds", [])
  434. cat2_post_ids = nodes.get(cat2_id, {}).get("detail", {}).get("postIds", [])
  435. # 计算 Jaccard(基于帖子)
  436. cat1_set = set(cat1_post_ids)
  437. cat2_set = set(cat2_post_ids)
  438. intersection = cat1_set & cat2_set
  439. union = cat1_set | cat2_set
  440. jaccard = round(len(intersection) / len(union), 4) if union else 0
  441. edges[edge_id] = create_edge(
  442. source=cat1_id,
  443. target=cat2_id,
  444. edge_type="分类共现",
  445. score=jaccard,
  446. detail={
  447. "postIds": list(intersection),
  448. "postCount": len(intersection),
  449. "jaccard": jaccard,
  450. "pointCount": point_count,
  451. "pointNames": point_names.copy(),
  452. "sourcePostIds": cat1_post_ids,
  453. "targetPostIds": cat2_post_ids
  454. }
  455. )
  456. return edges
  457. # ==================== 从历史帖子提取标签共现边 ====================
  458. def extract_tag_cooccur_edges(historical_posts_dir: Path, nodes: Dict[str, Dict]) -> Dict[str, Dict]:
  459. """
  460. 从历史帖子解构结果中提取标签共现边
  461. Args:
  462. historical_posts_dir: 历史帖子目录
  463. nodes: 已构建的节点数据(用于获取标签的 postIds 计算 Jaccard)
  464. Returns:
  465. { edgeId: edgeData }
  466. """
  467. edges = {}
  468. cooccur_map = {} # (tag1_id, tag2_id) -> { postIds: set() }
  469. if not historical_posts_dir.exists():
  470. print(f" 警告: 历史帖子目录不存在: {historical_posts_dir}")
  471. return edges
  472. json_files = list(historical_posts_dir.glob("*.json"))
  473. print(f" 找到 {len(json_files)} 个历史帖子文件")
  474. def extract_post_id_from_filename(filename: str) -> str:
  475. """从文件名中提取帖子ID"""
  476. import re
  477. match = re.match(r'^([^_]+)_', filename)
  478. return match.group(1) if match else ""
  479. def extract_tags_from_post(post_data: Dict) -> Dict[str, List[str]]:
  480. """从帖子解构结果中提取所有标签"""
  481. tags_by_dimension = {
  482. "灵感点": [],
  483. "目的点": [],
  484. "关键点": []
  485. }
  486. if "三点解构" not in post_data:
  487. return tags_by_dimension
  488. three_points = post_data["三点解构"]
  489. # 灵感点
  490. if "灵感点" in three_points:
  491. inspiration = three_points["灵感点"]
  492. for section in ["全新内容", "共性差异", "共性内容"]:
  493. if section in inspiration and isinstance(inspiration[section], list):
  494. for item in inspiration[section]:
  495. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  496. for feature in item["提取的特征"]:
  497. tag_name = feature.get("特征名称", "")
  498. if tag_name:
  499. tags_by_dimension["灵感点"].append(tag_name)
  500. # 目的点
  501. if "目的点" in three_points:
  502. purpose = three_points["目的点"]
  503. if "purposes" in purpose and isinstance(purpose["purposes"], list):
  504. for item in purpose["purposes"]:
  505. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  506. for feature in item["提取的特征"]:
  507. tag_name = feature.get("特征名称", "")
  508. if tag_name:
  509. tags_by_dimension["目的点"].append(tag_name)
  510. # 关键点
  511. if "关键点" in three_points:
  512. key_points = three_points["关键点"]
  513. if "key_points" in key_points and isinstance(key_points["key_points"], list):
  514. for item in key_points["key_points"]:
  515. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  516. for feature in item["提取的特征"]:
  517. tag_name = feature.get("特征名称", "")
  518. if tag_name:
  519. tags_by_dimension["关键点"].append(tag_name)
  520. return tags_by_dimension
  521. # 遍历所有帖子文件
  522. for file_path in json_files:
  523. post_id = extract_post_id_from_filename(file_path.name)
  524. if not post_id:
  525. continue
  526. try:
  527. with open(file_path, "r", encoding="utf-8") as f:
  528. post_data = json.load(f)
  529. tags_by_dimension = extract_tags_from_post(post_data)
  530. # 对每个维度内的标签两两组合
  531. for dimension, tags in tags_by_dimension.items():
  532. unique_tags = list(set(tags))
  533. for i in range(len(unique_tags)):
  534. for j in range(i + 1, len(unique_tags)):
  535. tag1 = unique_tags[i]
  536. tag2 = unique_tags[j]
  537. tag1_id = build_node_id("人设", dimension, "标签", tag1)
  538. tag2_id = build_node_id("人设", dimension, "标签", tag2)
  539. # 确保顺序一致
  540. if tag1_id > tag2_id:
  541. tag1_id, tag2_id = tag2_id, tag1_id
  542. key = (tag1_id, tag2_id)
  543. if key not in cooccur_map:
  544. cooccur_map[key] = {"postIds": set()}
  545. cooccur_map[key]["postIds"].add(post_id)
  546. except Exception as e:
  547. print(f" 警告: 处理文件 {file_path.name} 时出错: {e}")
  548. # 转换为边
  549. for (tag1_id, tag2_id), info in cooccur_map.items():
  550. cooccur_post_ids = list(info["postIds"])
  551. cooccur_count = len(cooccur_post_ids)
  552. # 获取两个标签的帖子集合,计算 Jaccard
  553. tag1_post_ids = nodes.get(tag1_id, {}).get("detail", {}).get("postIds", [])
  554. tag2_post_ids = nodes.get(tag2_id, {}).get("detail", {}).get("postIds", [])
  555. union_count = len(set(tag1_post_ids) | set(tag2_post_ids))
  556. jaccard = round(cooccur_count / union_count, 4) if union_count > 0 else 0
  557. edge_id = build_edge_id(tag1_id, "标签共现", tag2_id)
  558. edges[edge_id] = create_edge(
  559. source=tag1_id,
  560. target=tag2_id,
  561. edge_type="标签共现",
  562. score=jaccard,
  563. detail={
  564. "postIds": cooccur_post_ids,
  565. "postCount": cooccur_count,
  566. "jaccard": jaccard,
  567. "sourcePostIds": tag1_post_ids,
  568. "targetPostIds": tag2_post_ids
  569. }
  570. )
  571. return edges
  572. # ==================== 构建嵌套树结构 ====================
  573. def build_nested_tree(nodes: Dict[str, Dict], edges: Dict[str, Dict]) -> Dict:
  574. """
  575. 从根节点开始,沿"包含"边递归构建嵌套树结构
  576. 包含边:父节点 -> 子节点
  577. 从根节点开始,递归找所有包含的子节点
  578. Returns:
  579. 嵌套的树结构
  580. """
  581. # 从"包含"边构建 父节点 -> [子节点] 的映射
  582. parent_to_children = {} # parent_id -> [child_id, ...]
  583. for edge_id, edge_data in edges.items():
  584. if edge_data["type"] == "包含":
  585. parent_id = edge_data["source"]
  586. child_id = edge_data["target"]
  587. if parent_id not in parent_to_children:
  588. parent_to_children[parent_id] = []
  589. parent_to_children[parent_id].append(child_id)
  590. # 递归构建子树
  591. def build_subtree(node_id: str) -> Dict:
  592. node_data = nodes[node_id]
  593. subtree = {
  594. "id": node_id,
  595. "name": node_data["name"],
  596. "type": node_data["type"],
  597. "domain": node_data["domain"],
  598. "dimension": node_data["dimension"],
  599. "detail": node_data.get("detail", {}),
  600. "children": []
  601. }
  602. # 获取子节点
  603. child_ids = parent_to_children.get(node_id, [])
  604. for child_id in child_ids:
  605. if child_id in nodes:
  606. subtree["children"].append(build_subtree(child_id))
  607. return subtree
  608. # 从根节点开始构建
  609. root_id = "人设:人设:人设:人设"
  610. return build_subtree(root_id)
  611. # ==================== 图游走工具 ====================
  612. def walk_graph(
  613. index: Dict,
  614. start_node: str,
  615. edge_types: List[str],
  616. direction: str = "out",
  617. min_score: float = None
  618. ) -> Set[str]:
  619. """
  620. 从起始节点出发,按指定边类型序列游走N步
  621. Args:
  622. index: 游走索引 {"outEdges": {...}, "inEdges": {...}}
  623. start_node: 起始节点ID
  624. edge_types: 边类型序列,如 ["属于", "分类共现"]
  625. direction: 游走方向 "out"(沿出边) / "in"(沿入边)
  626. min_score: 最小分数过滤
  627. Returns:
  628. 到达的节点ID集合
  629. Example:
  630. # 从标签出发,沿"属于"边走1步,再沿"分类共现"边走1步
  631. result = walk_graph(
  632. index,
  633. "人设:灵感点:标签:手绘风格",
  634. ["属于", "分类共现"]
  635. )
  636. """
  637. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  638. target_key = "target" if direction == "out" else "source"
  639. current_nodes = {start_node}
  640. for edge_type in edge_types:
  641. next_nodes = set()
  642. for node in current_nodes:
  643. neighbors = edge_index.get(node, {}).get(edge_type, [])
  644. for neighbor in neighbors:
  645. # 分数过滤
  646. if min_score is not None and neighbor.get("score", 0) < min_score:
  647. continue
  648. next_nodes.add(neighbor[target_key])
  649. current_nodes = next_nodes
  650. if not current_nodes:
  651. break
  652. return current_nodes
  653. def get_neighbors(
  654. index: Dict,
  655. node_id: str,
  656. edge_type: str = None,
  657. direction: str = "out",
  658. min_score: float = None
  659. ) -> List[Dict]:
  660. """
  661. 获取节点的邻居
  662. Args:
  663. index: 游走索引
  664. node_id: 节点ID
  665. edge_type: 边类型(可选,不指定则返回所有类型)
  666. direction: 方向 "out" / "in"
  667. min_score: 最小分数过滤
  668. Returns:
  669. 邻居列表 [{"target": "...", "score": 0.5}, ...]
  670. """
  671. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  672. node_edges = edge_index.get(node_id, {})
  673. if edge_type:
  674. neighbors = node_edges.get(edge_type, [])
  675. else:
  676. neighbors = []
  677. for edges in node_edges.values():
  678. neighbors.extend(edges)
  679. if min_score is not None:
  680. neighbors = [n for n in neighbors if n.get("score", 0) >= min_score]
  681. return neighbors
  682. # ==================== 构建索引 ====================
  683. def build_index(edges: Dict[str, Dict]) -> Dict:
  684. """
  685. 构建游走索引
  686. Returns:
  687. {
  688. "outEdges": { nodeId: { edgeType: [{ target, score }] } },
  689. "inEdges": { nodeId: { edgeType: [{ source, score }] } }
  690. }
  691. """
  692. out_edges = {}
  693. in_edges = {}
  694. for edge_id, edge_data in edges.items():
  695. source = edge_data["source"]
  696. target = edge_data["target"]
  697. edge_type = edge_data["type"]
  698. score = edge_data["score"]
  699. # outEdges
  700. if source not in out_edges:
  701. out_edges[source] = {}
  702. if edge_type not in out_edges[source]:
  703. out_edges[source][edge_type] = []
  704. out_edges[source][edge_type].append({
  705. "target": target,
  706. "score": score
  707. })
  708. # inEdges
  709. if target not in in_edges:
  710. in_edges[target] = {}
  711. if edge_type not in in_edges[target]:
  712. in_edges[target][edge_type] = []
  713. in_edges[target][edge_type].append({
  714. "source": source,
  715. "score": score
  716. })
  717. return {
  718. "outEdges": out_edges,
  719. "inEdges": in_edges
  720. }
  721. # ==================== 主函数 ====================
  722. def main():
  723. config = PathConfig()
  724. config.ensure_dirs()
  725. print(f"账号: {config.account_name}")
  726. print(f"输出版本: {config.output_version}")
  727. print()
  728. # 输入文件路径
  729. pattern_file = config.pattern_cluster_file
  730. associations_file = config.account_dir / "pattern相关文件/optimization/dimension_associations_analysis.json"
  731. intra_associations_file = config.account_dir / "pattern相关文件/optimization/intra_dimension_associations_analysis.json"
  732. historical_posts_dir = config.historical_posts_dir
  733. # 输出文件路径
  734. output_file = config.intermediate_dir / "人设图谱.json"
  735. print("输入文件:")
  736. print(f" pattern聚合文件: {pattern_file}")
  737. print(f" 跨点关联分析文件: {associations_file}")
  738. print(f" 点内关联分析文件: {intra_associations_file}")
  739. print(f" 历史帖子目录: {historical_posts_dir}")
  740. print(f"\n输出文件: {output_file}")
  741. print()
  742. # ===== 读取数据 =====
  743. print("=" * 60)
  744. print("读取数据...")
  745. print(" 读取 pattern 聚合结果...")
  746. with open(pattern_file, "r", encoding="utf-8") as f:
  747. pattern_data = json.load(f)
  748. print(" 读取跨点关联分析结果...")
  749. with open(associations_file, "r", encoding="utf-8") as f:
  750. associations_data = json.load(f)
  751. print(" 读取点内关联分析结果...")
  752. with open(intra_associations_file, "r", encoding="utf-8") as f:
  753. intra_associations_data = json.load(f)
  754. # ===== 提取节点 =====
  755. print("\n" + "=" * 60)
  756. print("提取节点...")
  757. all_nodes = {}
  758. dimension_mapping = {
  759. "灵感点列表": "灵感点",
  760. "目的点": "目的点",
  761. "关键点列表": "关键点"
  762. }
  763. # 分类节点
  764. print("\n提取分类节点:")
  765. for dim_key, dim_name in dimension_mapping.items():
  766. category_nodes = extract_category_nodes_from_pattern(pattern_data, dim_key, dim_name)
  767. all_nodes.update(category_nodes)
  768. print(f" {dim_name}: {len(category_nodes)} 个")
  769. # 标签节点
  770. print("\n提取标签节点:")
  771. for dim_key, dim_name in dimension_mapping.items():
  772. tag_nodes = extract_tag_nodes_from_pattern(pattern_data, dim_key, dim_name)
  773. all_nodes.update(tag_nodes)
  774. print(f" {dim_name}: {len(tag_nodes)} 个")
  775. # 统计
  776. category_count = sum(1 for n in all_nodes.values() if n["type"] == "分类")
  777. tag_count = sum(1 for n in all_nodes.values() if n["type"] == "标签")
  778. print(f"\n节点总计: {len(all_nodes)} (分类: {category_count}, 标签: {tag_count})")
  779. # ===== 提取边 =====
  780. print("\n" + "=" * 60)
  781. print("提取边...")
  782. all_edges = {}
  783. # 属于/包含边
  784. print("\n提取属于/包含边:")
  785. for dim_key, dim_name in dimension_mapping.items():
  786. belong_contain_edges = extract_belong_contain_edges(pattern_data, dim_key, dim_name, all_nodes)
  787. all_edges.update(belong_contain_edges)
  788. belong_count = sum(1 for e in all_edges.values() if e["type"] == "属于")
  789. contain_count = sum(1 for e in all_edges.values() if e["type"] == "包含")
  790. print(f" 属于边: {belong_count}, 包含边: {contain_count}")
  791. # 分类共现边(跨点)
  792. print("\n提取分类共现边(跨点):")
  793. category_cooccur_edges = extract_category_cooccur_edges(associations_data, all_nodes)
  794. all_edges.update(category_cooccur_edges)
  795. print(f" 分类共现边: {len(category_cooccur_edges)}")
  796. # 分类共现边(点内)
  797. print("\n提取分类共现边(点内):")
  798. intra_category_edges = extract_intra_category_cooccur_edges(intra_associations_data, all_nodes)
  799. all_edges.update(intra_category_edges)
  800. print(f" 分类共现边: {len(intra_category_edges)}")
  801. # 标签共现边
  802. print("\n提取标签共现边:")
  803. tag_cooccur_edges = extract_tag_cooccur_edges(historical_posts_dir, all_nodes)
  804. all_edges.update(tag_cooccur_edges)
  805. print(f" 标签共现边: {len(tag_cooccur_edges)}")
  806. # ===== 添加根节点和维度节点 =====
  807. print("\n添加根节点和维度节点:")
  808. # 收集所有帖子ID(用于根节点)
  809. all_post_ids_for_root = set()
  810. for node in all_nodes.values():
  811. post_ids = node["detail"].get("postIds", [])
  812. all_post_ids_for_root.update(post_ids)
  813. # 根节点
  814. root_id = "人设:人设:人设:人设"
  815. root_post_ids = list(all_post_ids_for_root)
  816. all_nodes[root_id] = create_node(
  817. domain="人设",
  818. dimension="人设",
  819. node_type="人设",
  820. name="人设",
  821. detail={
  822. "postIds": root_post_ids,
  823. "postCount": len(root_post_ids)
  824. }
  825. )
  826. # 维度节点 + 边
  827. dimensions = ["灵感点", "目的点", "关键点"]
  828. for dim in dimensions:
  829. # 收集该维度下所有节点的帖子ID
  830. dim_post_ids = set()
  831. for node in all_nodes.values():
  832. if node["dimension"] == dim:
  833. post_ids = node["detail"].get("postIds", [])
  834. dim_post_ids.update(post_ids)
  835. dim_post_ids_list = list(dim_post_ids)
  836. dim_id = f"人设:{dim}:{dim}:{dim}"
  837. all_nodes[dim_id] = create_node(
  838. domain="人设",
  839. dimension=dim,
  840. node_type=dim,
  841. name=dim,
  842. detail={
  843. "postIds": dim_post_ids_list,
  844. "postCount": len(dim_post_ids_list)
  845. }
  846. )
  847. # 维度 -> 根 的属于边
  848. edge_id = build_edge_id(dim_id, "属于", root_id)
  849. all_edges[edge_id] = create_edge(
  850. source=dim_id,
  851. target=root_id,
  852. edge_type="属于",
  853. score=1.0,
  854. detail={
  855. "sourcePostIds": dim_post_ids_list,
  856. "targetPostIds": root_post_ids
  857. }
  858. )
  859. # 根 -> 维度 的包含边
  860. edge_id_contain = build_edge_id(root_id, "包含", dim_id)
  861. all_edges[edge_id_contain] = create_edge(
  862. source=root_id,
  863. target=dim_id,
  864. edge_type="包含",
  865. score=1.0,
  866. detail={
  867. "sourcePostIds": root_post_ids,
  868. "targetPostIds": dim_post_ids_list
  869. }
  870. )
  871. # 找该维度下的顶级分类(没有父节点的分类),添加边
  872. dim_categories = [
  873. (nid, ndata) for nid, ndata in all_nodes.items()
  874. if ndata["dimension"] == dim and ndata["type"] == "分类"
  875. and not ndata["detail"].get("parentPath")
  876. ]
  877. for cat_id, cat_data in dim_categories:
  878. cat_post_ids = cat_data["detail"].get("postIds", [])
  879. # 顶级分类 -> 维度 的属于边
  880. edge_id = build_edge_id(cat_id, "属于", dim_id)
  881. all_edges[edge_id] = create_edge(
  882. source=cat_id,
  883. target=dim_id,
  884. edge_type="属于",
  885. score=1.0,
  886. detail={
  887. "sourcePostIds": cat_post_ids,
  888. "targetPostIds": dim_post_ids_list
  889. }
  890. )
  891. # 维度 -> 顶级分类 的包含边
  892. edge_id_contain = build_edge_id(dim_id, "包含", cat_id)
  893. all_edges[edge_id_contain] = create_edge(
  894. source=dim_id,
  895. target=cat_id,
  896. edge_type="包含",
  897. score=1.0,
  898. detail={
  899. "sourcePostIds": dim_post_ids_list,
  900. "targetPostIds": cat_post_ids
  901. }
  902. )
  903. print(f" 添加节点: 1 根节点 + 3 维度节点 = 4")
  904. print(f" 添加边: 根↔维度 6条 + 维度↔顶级分类")
  905. # 边统计
  906. edge_type_counts = {}
  907. for edge in all_edges.values():
  908. t = edge["type"]
  909. edge_type_counts[t] = edge_type_counts.get(t, 0) + 1
  910. print(f"\n边总计: {len(all_edges)}")
  911. for t, count in sorted(edge_type_counts.items(), key=lambda x: -x[1]):
  912. print(f" {t}: {count}")
  913. # ===== 计算节点概率 =====
  914. print("\n" + "=" * 60)
  915. print("计算节点概率...")
  916. # 1. 计算总帖子数(所有帖子ID的并集)
  917. all_post_ids = set()
  918. for node in all_nodes.values():
  919. post_ids = node["detail"].get("postIds", [])
  920. all_post_ids.update(post_ids)
  921. total_post_count = len(all_post_ids)
  922. print(f" 总帖子数: {total_post_count}")
  923. # 2. 为每个节点计算概率
  924. for node_id, node in all_nodes.items():
  925. post_count = node["detail"].get("postCount", 0)
  926. # 全局概率
  927. if total_post_count > 0:
  928. node["detail"]["probGlobal"] = round(post_count / total_post_count, 4)
  929. else:
  930. node["detail"]["probGlobal"] = 0
  931. # 相对父节点的概率
  932. # 通过"属于"边找父节点
  933. parent_edge_id = None
  934. for edge_id, edge in all_edges.items():
  935. if edge["source"] == node_id and edge["type"] == "属于":
  936. parent_node_id = edge["target"]
  937. parent_node = all_nodes.get(parent_node_id)
  938. if parent_node:
  939. parent_post_count = parent_node["detail"].get("postCount", 0)
  940. if parent_post_count > 0:
  941. node["detail"]["probToParent"] = round(post_count / parent_post_count, 4)
  942. else:
  943. node["detail"]["probToParent"] = 0
  944. break
  945. else:
  946. # 没有父节点(根节点)
  947. node["detail"]["probToParent"] = 1.0
  948. print(f" 已为 {len(all_nodes)} 个节点计算概率")
  949. # 3. 更新"包含"边的分数(使用子节点的 probToParent)
  950. contain_edge_updated = 0
  951. for edge_id, edge in all_edges.items():
  952. if edge["type"] == "包含":
  953. target_node = all_nodes.get(edge["target"])
  954. if target_node:
  955. edge["score"] = target_node["detail"].get("probToParent", 1.0)
  956. contain_edge_updated += 1
  957. print(f" 已更新 {contain_edge_updated} 条包含边的分数")
  958. # ===== 构建索引 =====
  959. print("\n" + "=" * 60)
  960. print("构建索引...")
  961. index = build_index(all_edges)
  962. print(f" outEdges 节点数: {len(index['outEdges'])}")
  963. print(f" inEdges 节点数: {len(index['inEdges'])}")
  964. # ===== 构建嵌套树 =====
  965. print("\n" + "=" * 60)
  966. print("构建嵌套树...")
  967. tree = build_nested_tree(all_nodes, all_edges)
  968. # 统计树节点数
  969. def count_tree_nodes(node):
  970. count = 1
  971. for child in node.get("children", []):
  972. count += count_tree_nodes(child)
  973. return count
  974. tree_node_count = count_tree_nodes(tree)
  975. print(f" 树节点数: {tree_node_count}")
  976. # ===== 统计各维度 =====
  977. dimension_stats = {}
  978. for dim_name in ["灵感点", "目的点", "关键点"]:
  979. dim_categories = sum(1 for n in all_nodes.values() if n["type"] == "分类" and n["dimension"] == dim_name)
  980. dim_tags = sum(1 for n in all_nodes.values() if n["type"] == "标签" and n["dimension"] == dim_name)
  981. dimension_stats[dim_name] = {
  982. "categoryCount": dim_categories,
  983. "tagCount": dim_tags
  984. }
  985. # ===== 构建输出 =====
  986. print("\n" + "=" * 60)
  987. print("保存结果...")
  988. output_data = {
  989. "meta": {
  990. "description": "人设图谱数据",
  991. "account": config.account_name,
  992. "createdAt": datetime.now().isoformat(),
  993. "stats": {
  994. "nodeCount": len(all_nodes),
  995. "edgeCount": len(all_edges),
  996. "categoryCount": category_count,
  997. "tagCount": tag_count,
  998. "treeNodeCount": tree_node_count,
  999. "dimensions": dimension_stats,
  1000. "edgeTypes": edge_type_counts
  1001. }
  1002. },
  1003. "nodes": all_nodes,
  1004. "edges": all_edges,
  1005. "index": index,
  1006. "tree": tree
  1007. }
  1008. with open(output_file, "w", encoding="utf-8") as f:
  1009. json.dump(output_data, f, ensure_ascii=False, indent=2)
  1010. print(f"\n输出文件: {output_file}")
  1011. print("\n" + "=" * 60)
  1012. print("完成!")
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()