匹配图谱可视化

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Render match-graph data as a single interactive HTML file.

Input: the JSON files in the ``match_graph`` directory.
Output: one HTML file that contains the graphs of all posts; the post to
display is chosen through the generated selector options.
"""

import html
import json
from pathlib import Path
from typing import Dict, List
import sys

# Add the project root to the import path so the ``script`` package resolves
# when this file is run directly.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.data_processing.path_config import PathConfig

# NOTE(review): only the title text of the template is visible in this view.
# generate_combined_html() passes ``tabs_html``, ``all_graph_data`` and
# ``persona_tree_data`` to str.format(); confirm the template carries the
# matching ``{...}`` placeholders and doubles any literal braces.
HTML_TEMPLATE = ''' 匹配图谱可视化 '''

# Titles longer than this are truncated (with "...") in the selector.
_MAX_OPTION_TITLE_LEN = 30


def _read_json(path: Path):
    """Load and return the UTF-8 encoded JSON document stored at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _option_label(post_title: str, index: int) -> str:
    """Return the selector label for the post at zero-based *index*."""
    if not post_title:
        # No title available: fall back to a 1-based placeholder name.
        return f"帖子 {index + 1}"
    if len(post_title) > _MAX_OPTION_TITLE_LEN:
        return post_title[:_MAX_OPTION_TITLE_LEN] + "..."
    return post_title


def _embed_json(data: object) -> str:
    """Serialize *data* to JSON that is safe to place inside HTML.

    ``</`` is written as ``<\\/`` (an equivalent JSON escape) so that text
    such as ``</script>`` inside the data cannot terminate an enclosing
    script element.
    """
    return json.dumps(data, ensure_ascii=False).replace("</", "<\\/")


def generate_combined_html(all_graph_data: List[Dict], persona_tree_data: Dict, output_file: Path):
    """
    Write one HTML file that contains the graphs of all posts.

    Args:
        all_graph_data: Graph data of every post, one dict per post.
        persona_tree_data: The complete persona tree (nodes and edges).
        output_file: Path of the HTML file to write.
    """
    # Build one <option> per post; the first post is preselected.
    # NOTE(review): the option value is the index into all_graph_data --
    # confirm this is what the template's script expects.
    options = []
    for i, data in enumerate(all_graph_data):
        # Titles come from data files, so escape them before embedding.
        label = html.escape(_option_label(data.get("postTitle", ""), i))
        selected_attr = " selected" if i == 0 else ""
        options.append(f'<option value="{i}"{selected_attr}>{label}</option>\n')
    tabs_html = "".join(options)

    html_content = HTML_TEMPLATE.format(
        tabs_html=tabs_html,
        all_graph_data=_embed_json(all_graph_data),
        persona_tree_data=_embed_json(persona_tree_data),
    )

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_content)


def _load_persona_tree(persona_tree_file: Path) -> Dict:
    """Read the persona tree nodes/edges; return empty lists if the file is missing."""
    persona_tree_data = {"nodes": [], "edges": []}
    if not persona_tree_file.exists():
        return persona_tree_data

    print(f"读取人设树数据: {persona_tree_file.name}")
    tree_data = _read_json(persona_tree_file)
    nodes = tree_data.get("nodes", [])
    edges = tree_data.get("edges", [])
    persona_tree_data["nodes"] = nodes
    persona_tree_data["edges"] = edges

    category_count = sum(1 for n in nodes if n.get("节点类型") == "分类")
    tag_count = sum(1 for n in nodes if n.get("节点类型") == "标签")
    print(f" 分类节点: {category_count}, 标签节点: {tag_count}")
    print(f" 边数: {len(edges)}")
    print()
    return persona_tree_data


def _load_post_trees(post_trees_file: Path) -> Dict:
    """Read the prebuilt post trees and index them by ``postId``."""
    if not post_trees_file.exists():
        print(f"警告: 帖子树数据文件不存在: {post_trees_file}")
        print(" 请先运行 build_post_tree.py 生成帖子树数据")
        return {}

    print(f"读取帖子树数据: {post_trees_file.name}")
    trees_data = _read_json(post_trees_file)
    post_trees = {tree["postId"]: tree for tree in trees_data.get("postTrees", [])}
    print(f" 帖子树数量: {len(post_trees)}")
    return post_trees


def _load_post_detail(results_dir: Path, post_id, post_title: str) -> Dict:
    """Return the full post detail from ``<post_id>_how.json``, or a minimal fallback."""
    how_file = results_dir / f"{post_id}_how.json"
    if how_file.exists():
        how_data = _read_json(how_file)
        if "帖子详情" in how_data:
            post_detail = how_data["帖子详情"]
            post_detail["post_id"] = post_id
            return post_detail
    return {"title": post_title, "post_id": post_id}


def _build_graph_entry(match_graph_data: Dict, post_trees: Dict, results_dir: Path) -> Dict:
    """Assemble the per-post record that is embedded in the HTML page."""
    info = match_graph_data["说明"]
    post_id = info["帖子ID"]
    post_title = info.get("帖子标题", "")

    # Prebuilt post tree for this post (empty when none was generated).
    post_tree = post_trees.get(post_id, {})

    # The detail stored in the post tree wins; the results file is only
    # read when the tree does not provide one.
    if "postDetail" in post_tree:
        post_detail = post_tree["postDetail"]
    else:
        post_detail = _load_post_detail(results_dir, post_id, post_title)

    return {
        "postId": post_id,
        "postTitle": post_title,
        "stats": info["统计"],
        "nodes": match_graph_data["节点列表"],
        "edges": match_graph_data["边列表"],
        "personaEdgeToMirrorEdges": match_graph_data.get("人设边到镜像边映射", {}),
        "postTree": post_tree,
        "postDetail": post_detail,
    }


def main():
    """Collect all match-graph inputs and write the combined HTML page."""
    config = PathConfig()

    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()

    # Input directory and output file both live under the intermediate dir.
    match_graph_dir = config.intermediate_dir / "match_graph"
    output_file = config.intermediate_dir / "match_graph.html"

    print(f"输入目录: {match_graph_dir}")
    print(f"输出文件: {output_file}")
    print()

    persona_tree_data = _load_persona_tree(config.intermediate_dir / "persona_tree.json")

    post_trees_data = _load_post_trees(match_graph_dir / "post_trees.json")
    print()

    graph_files = sorted(match_graph_dir.glob("*_match_graph.json"))
    print(f"找到 {len(graph_files)} 个匹配图谱文件")

    # The results directory holds the full post details.
    results_dir = config.intermediate_dir.parent / "results"

    all_graph_data = []
    for i, graph_file in enumerate(graph_files, 1):
        print(f" [{i}/{len(graph_files)}] 读取: {graph_file.name}")
        match_graph_data = _read_json(graph_file)
        all_graph_data.append(
            _build_graph_entry(match_graph_data, post_trees_data, results_dir)
        )

    print("\n生成HTML文件...")
    generate_combined_html(all_graph_data, persona_tree_data, output_file)

    print("\n" + "="*60)
    print("处理完成!")
    print(f"输出文件: {output_file}")


if __name__ == "__main__":
    main()

匹配图谱

匹配关系 (0)

点击节点或边查看详情