#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Generate the cascading tree visualization.

Loads the pipeline's evaluation / deep-analysis / similarity JSON outputs,
aggregates statistics, and renders a four-level cascade HTML page via
``CascadeVisualizer``.
"""
import sys
import json
from pathlib import Path
from datetime import datetime

# Make the project root importable so `src.*` packages resolve when this
# script is run directly from its subdirectory.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))


def load_json(file_path: Path) -> dict:
    """Load and parse a UTF-8 encoded JSON file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def main() -> None:
    """Entry point: load pipeline outputs, compute stats, render the HTML page."""
    # Deferred project import: keeps this module importable (e.g. for testing
    # the pure helpers below) even when the visualizer package is unavailable.
    from src.visualizers.cascade_visualizer import CascadeVisualizer

    print("🎯 级联树形可视化生成器")
    print("=" * 60)

    # Input/output locations relative to the project root.
    output_dir = project_root / "output_v2"
    visualization_dir = project_root / "visualization"

    # --- Evaluation data (required) ---
    evaluated_file = output_dir / "evaluated_results.json"
    print(f"📖 加载评估数据: {evaluated_file}")
    if not evaluated_file.exists():
        print(f"❌ 文件不存在: {evaluated_file}")
        return
    evaluated_data = load_json(evaluated_file)
    print(f"✓ 加载了 {len(evaluated_data)} 个原始特征")

    # --- Deep-analysis (deconstruction) data (required) ---
    deep_analysis_file = output_dir / "deep_analysis_results.json"
    print(f"📖 加载解构数据: {deep_analysis_file}")
    if not deep_analysis_file.exists():
        print(f"❌ 文件不存在: {deep_analysis_file}")
        return
    deep_analysis_full = load_json(deep_analysis_file)
    deep_analysis_data = deep_analysis_full.get('results', [])
    print(f"✓ 加载了 {len(deep_analysis_data)} 个解构结果")

    # --- Stage8 similarity data (optional; defaults used when missing) ---
    similarity_file = output_dir / "similarity_analysis_results.json"
    print(f"📖 加载Stage8数据: {similarity_file}")
    similarity_data = {}
    if similarity_file.exists():
        similarity_full = load_json(similarity_file)
        similarity_data = similarity_full.get('results', {})
        print(f"✓ 加载了 {len(similarity_data)} 个相似度评分")
    else:
        print("⚠️ Stage8数据文件不存在,将使用默认值")

    # Aggregate match statistics across all search groups.
    print("\n📊 计算统计数据...")
    stats = calculate_stats(evaluated_data)
    print("✓ 统计完成:")
    print(f"  - 原始特征: {stats['原始特征数']}")
    print(f"  - 搜索词总数: {stats['搜索词总数']}")
    print(f"  - 帖子总数: {stats['帖子总数']}")
    print(f"  - 完全匹配: {stats['完全匹配']} ({stats['完全匹配率']})")

    # Flatten per-feature records for the visualizer.
    print("\n📊 提取所有特征信息...")
    all_features = extract_all_features(evaluated_data, deep_analysis_data, similarity_data)
    print(f"✓ 提取了 {len(all_features)} 个特征")

    # Bucket features by similarity score for the summary printout.
    high_similarity = sum(1 for f in all_features if f.get('相似度得分', 0) >= 0.8)
    partial_match = sum(1 for f in all_features if 0.5 <= f.get('相似度得分', 0) < 0.8)
    low_similarity = sum(1 for f in all_features if f.get('相似度得分', 0) < 0.5)
    print(f"  - 高相似度特征(≥0.8): {high_similarity} 个")
    print(f"  - 部分匹配特征(0.5-0.8): {partial_match} 个")
    print(f"  - 低相似度特征(<0.5): {low_similarity} 个")

    # Render the cascade HTML page with a timestamped filename.
    print("\n🎨 生成级联可视化页面...")
    visualizer = CascadeVisualizer()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = visualization_dir / f"cascade_results_{timestamp}.html"
    result_file = visualizer.generate_html(all_features, stats, str(output_file))
    print(f"✓ 生成完成: {result_file}")
    print(f"\n🌐 在浏览器中打开查看:")
    print(f"   file://{result_file}")


def calculate_stats(evaluated_data: list) -> dict:
    """Aggregate match statistics over every search in *evaluated_data*.

    Walks each item's ``组合评估结果_分组`` groups and their ``top10_searches``,
    summing the per-search match-level counters and note totals.

    Returns a dict with counter keys (原始特征数, 搜索词总数, 帖子总数,
    完全匹配, 相似匹配, 弱相似, 无匹配, 已过滤) plus ``完全匹配率`` as a
    formatted percentage string.
    """
    stats = {
        '原始特征数': len(evaluated_data),
        '搜索词总数': 0,
        '帖子总数': 0,
        '完全匹配': 0,
        '相似匹配': 0,
        '弱相似': 0,
        '无匹配': 0,
        '已过滤': 0,
    }
    total_notes = 0
    complete_notes = 0

    for item in evaluated_data:
        groups = item.get('组合评估结果_分组', [])
        for group in groups:
            searches = group.get('top10_searches', [])
            stats['搜索词总数'] += len(searches)
            for search in searches:
                eval_data = search.get('evaluation_with_filter', {})
                search_stats = eval_data.get('statistics', {})
                stats['完全匹配'] += search_stats.get('完全匹配(0.8-1.0)', 0)
                stats['相似匹配'] += search_stats.get('相似匹配(0.6-0.79)', 0)
                stats['弱相似'] += search_stats.get('弱相似(0.5-0.59)', 0)
                stats['无匹配'] += search_stats.get('无匹配(≤0.4)', 0)
                stats['已过滤'] += eval_data.get('filtered_count', 0)

                # Total note (post) count comes from the raw search result.
                notes = search.get('search_result', {}).get('data', {}).get('data', [])
                total_notes += len(notes)

                # Count notes whose match level contains "完全匹配".
                notes_with_scores = eval_data.get('notes_with_scores', [])
                for note_eval in notes_with_scores:
                    match_level = note_eval.get('match_level', '')
                    if '完全匹配' in match_level:
                        complete_notes += 1

    stats['帖子总数'] = total_notes
    # Guard against division by zero when no notes were found.
    stats['完全匹配率'] = f"{(complete_notes / total_notes * 100):.1f}%" if total_notes > 0 else "0%"
    return stats


def extract_all_features(evaluated_data: list, deep_analysis_data: list, similarity_data: dict) -> list:
    """Flatten *evaluated_data* into per-feature records for the visualizer.

    NOTE(review): ``deep_analysis_data`` and ``similarity_data`` are accepted
    (and passed by ``main``) but currently unused — the similarity score is
    taken from each evaluation item directly, defaulting to 0.5 (partial
    match) when absent. Presumably these were meant to enrich the records;
    confirm before removing the parameters.
    """
    all_features = []
    for eval_item in evaluated_data:
        post_target_word = eval_item.get('帖子目标词', '')
        persona_feature = eval_item.get('人设特征名称', '')

        # Simplified handling: read the score off the item itself; default
        # to 0.5 (partial match) when it is missing.
        similarity_score = eval_item.get('相似度得分', 0.5)

        all_features.append({
            '帖子目标词': post_target_word,
            '人设特征名称': persona_feature,
            '相似度得分': similarity_score,
            '组合评估结果_分组': eval_item.get('组合评估结果_分组', []),
        })
    return all_features


if __name__ == "__main__":
    main()