#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
将匹配图谱数据可视化为交互式HTML文件
输入:match_graph目录下的JSON文件
输出:单个HTML文件,包含所有帖子的图谱,可通过Tab切换
"""
import json
from pathlib import Path
from typing import Dict, List
import sys
# Add the project root to the import path: take `.parent` three times from
# this file's path and insert the result at the front of sys.path, ahead of
# the existing entries, before the project-local import that follows.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from script.data_processing.path_config import PathConfig
# Page template that generate_combined_html() fills in with str.format(),
# passing tabs_html, all_graph_data and persona_tree_data as keyword arguments.
# NOTE(review): the template text visible here contains no `{...}` replacement
# fields for those keywords — confirm the full markup is present in the file.
HTML_TEMPLATE = '''
匹配图谱可视化
'''
def generate_combined_html(all_graph_data: List[Dict], persona_tree_data: Dict, output_file: Path):
    """Render all post graphs into a single HTML file.

    Args:
        all_graph_data: Graph payloads, one dict per post. Each entry's
            "postTitle" value is read to derive its selector label.
        persona_tree_data: Persona tree payload (nodes and edges); it is
            serialised to JSON unchanged.
        output_file: Path of the HTML file to write (UTF-8).
    """
    fragments = []
    for index, graph in enumerate(all_graph_data):
        title = graph.get("postTitle", "")

        # Label: the post title (cut to 30 characters plus "..." when longer),
        # or a numbered placeholder when the title is empty.
        if not title:
            option_name = f"帖子 {index + 1}"
        elif len(title) > 30:
            option_name = title[:30] + "..."
        else:
            option_name = title

        # Only the first entry is marked as selected.
        selected = "" if index != 0 else "selected"

        # NOTE(review): as written, neither option_name nor selected is
        # interpolated into the fragment below — confirm the option markup
        # has not been lost from this f-string.
        fragments.append(f'\n')

    tabs_html = "".join(fragments)

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    graphs_json = json.dumps(all_graph_data, ensure_ascii=False)
    persona_json = json.dumps(persona_tree_data, ensure_ascii=False)

    html_content = HTML_TEMPLATE.format(
        tabs_html=tabs_html,
        all_graph_data=graphs_json,
        persona_tree_data=persona_json,
    )

    with open(output_file, "w", encoding="utf-8") as out:
        out.write(html_content)
def main():
    """Collect the match-graph inputs and write the combined HTML page.

    Inputs, all located via ``PathConfig().intermediate_dir``:
      * ``persona_tree.json`` (optional): persona tree nodes and edges.
      * ``match_graph/post_trees.json`` (optional): prebuilt post trees,
        indexed here by their "postId".
      * ``match_graph/*_match_graph.json``: one graph payload per file.
      * ``../results/<post_id>_how.json`` (optional): full post details.

    Progress is printed to stdout; the page is written to
    ``match_graph.html`` inside the intermediate directory.
    """
    config = PathConfig()
    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()

    # Input directory and output file.
    match_graph_dir = config.intermediate_dir / "match_graph"
    output_file = config.intermediate_dir / "match_graph.html"
    print(f"输入目录: {match_graph_dir}")
    print(f"输出文件: {output_file}")
    print()

    # Persona tree: stays as empty node/edge lists when the file is absent.
    persona_path = config.intermediate_dir / "persona_tree.json"
    persona_tree_data = {"nodes": [], "edges": []}
    if persona_path.exists():
        print(f"读取人设树数据: {persona_path.name}")
        with open(persona_path, "r", encoding="utf-8") as fh:
            raw_tree = json.load(fh)
        for key in ("nodes", "edges"):
            persona_tree_data[key] = raw_tree.get(key, [])

        # Count nodes by their "节点类型" value for the progress report.
        node_types = [node.get("节点类型") for node in persona_tree_data["nodes"]]
        category_count = node_types.count("分类")
        tag_count = node_types.count("标签")
        print(f" 分类节点: {category_count}, 标签节点: {tag_count}")
        print(f" 边数: {len(persona_tree_data['edges'])}")
    print()

    # Prebuilt post trees, keyed by postId.
    post_trees_path = match_graph_dir / "post_trees.json"
    trees_by_post_id = {}
    if post_trees_path.exists():
        print(f"读取帖子树数据: {post_trees_path.name}")
        with open(post_trees_path, "r", encoding="utf-8") as fh:
            trees_payload = json.load(fh)
        trees_by_post_id = {
            tree["postId"]: tree for tree in trees_payload.get("postTrees", [])
        }
        print(f" 帖子树数量: {len(trees_by_post_id)}")
    else:
        print(f"警告: 帖子树数据文件不存在: {post_trees_path}")
        print(" 请先运行 build_post_tree.py 生成帖子树数据")
    print()

    # Match-graph files, processed in sorted path order.
    graph_files = sorted(match_graph_dir.glob("*_match_graph.json"))
    total = len(graph_files)
    print(f"找到 {total} 个匹配图谱文件")

    # Directory holding the full post details.
    results_dir = config.intermediate_dir.parent / "results"

    all_graph_data = []
    for position, graph_path in enumerate(graph_files, 1):
        print(f" [{position}/{total}] 读取: {graph_path.name}")
        with open(graph_path, "r", encoding="utf-8") as fh:
            match_graph = json.load(fh)

        summary = match_graph["说明"]
        post_id = summary["帖子ID"]

        # Minimal fallback detail; replaced by the "帖子详情" entry of the
        # post's *_how.json file when that file and key exist.
        detail = {"title": summary.get("帖子标题", ""), "post_id": post_id}
        how_path = results_dir / f"{post_id}_how.json"
        if how_path.exists():
            with open(how_path, "r", encoding="utf-8") as fh:
                how_payload = json.load(fh)
            if "帖子详情" in how_payload:
                detail = how_payload["帖子详情"]
                detail["post_id"] = post_id

        # Empty dict when no prebuilt tree exists for this post.
        post_tree = trees_by_post_id.get(post_id, {})

        all_graph_data.append({
            "postId": post_id,
            "postTitle": summary.get("帖子标题", ""),
            "stats": summary["统计"],
            "nodes": match_graph["节点列表"],
            "edges": match_graph["边列表"],
            "personaEdgeToMirrorEdges": match_graph.get("人设边到镜像边映射", {}),
            "postTree": post_tree,
            # A postDetail stored in the prebuilt tree takes precedence.
            "postDetail": post_tree.get("postDetail", detail),
        })

    print("\n生成HTML文件...")
    generate_combined_html(all_graph_data, persona_tree_data, output_file)

    print("\n" + "=" * 60)
    print("处理完成!")
    print(f"输出文件: {output_file}")
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()