  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 生成级联树形可视化
  5. 使用全新的cascade_visualizer生成四层级联展示
  6. """
  7. import sys
  8. import json
  9. from pathlib import Path
  10. from datetime import datetime
  11. # 添加项目根目录到路径
  12. project_root = Path(__file__).parent.parent
  13. sys.path.insert(0, str(project_root))
  14. from src.visualizers.cascade_visualizer import CascadeVisualizer
  15. def load_json(file_path: Path) -> dict:
  16. """加载JSON文件"""
  17. with open(file_path, 'r', encoding='utf-8') as f:
  18. return json.load(f)
  19. def main():
  20. """主函数"""
  21. print("🎯 级联树形可视化生成器")
  22. print("=" * 60)
  23. # 定义路径
  24. output_dir = project_root / "output_v2"
  25. visualization_dir = project_root / "visualization"
  26. # 加载评估数据
  27. evaluated_file = output_dir / "evaluated_results.json"
  28. print(f"📖 加载评估数据: {evaluated_file}")
  29. if not evaluated_file.exists():
  30. print(f"❌ 文件不存在: {evaluated_file}")
  31. return
  32. evaluated_data = load_json(evaluated_file)
  33. print(f"✓ 加载了 {len(evaluated_data)} 个原始特征")
  34. # 加载解构数据
  35. deep_analysis_file = output_dir / "deep_analysis_results.json"
  36. print(f"📖 加载解构数据: {deep_analysis_file}")
  37. if not deep_analysis_file.exists():
  38. print(f"❌ 文件不存在: {deep_analysis_file}")
  39. return
  40. deep_analysis_full = load_json(deep_analysis_file)
  41. deep_analysis_data = deep_analysis_full.get('results', [])
  42. print(f"✓ 加载了 {len(deep_analysis_data)} 个解构结果")
  43. # 加载Stage8数据
  44. similarity_file = output_dir / "similarity_analysis_results.json"
  45. print(f"📖 加载Stage8数据: {similarity_file}")
  46. similarity_data = {}
  47. if similarity_file.exists():
  48. similarity_full = load_json(similarity_file)
  49. similarity_data = similarity_full.get('results', {})
  50. print(f"✓ 加载了 {len(similarity_data)} 个相似度评分")
  51. else:
  52. print("⚠️ Stage8数据文件不存在,将使用默认值")
  53. # 计算统计数据
  54. print("\n📊 计算统计数据...")
  55. stats = calculate_stats(evaluated_data)
  56. print("✓ 统计完成:")
  57. print(f" - 原始特征: {stats['原始特征数']}")
  58. print(f" - 搜索词总数: {stats['搜索词总数']}")
  59. print(f" - 帖子总数: {stats['帖子总数']}")
  60. print(f" - 完全匹配: {stats['完全匹配']} ({stats['完全匹配率']})")
  61. # 提取所有特征信息
  62. print("\n📊 提取所有特征信息...")
  63. all_features = extract_all_features(evaluated_data, deep_analysis_data, similarity_data)
  64. print(f"✓ 提取了 {len(all_features)} 个特征")
  65. # 统计分类
  66. high_similarity = sum(1 for f in all_features if f.get('相似度得分', 0) >= 0.8)
  67. partial_match = sum(1 for f in all_features if 0.5 <= f.get('相似度得分', 0) < 0.8)
  68. low_similarity = sum(1 for f in all_features if f.get('相似度得分', 0) < 0.5)
  69. print(f" - 高相似度特征(≥0.8): {high_similarity} 个")
  70. print(f" - 部分匹配特征(0.5-0.8): {partial_match} 个")
  71. print(f" - 低相似度特征(<0.5): {low_similarity} 个")
  72. # 生成可视化
  73. print("\n🎨 生成级联可视化页面...")
  74. visualizer = CascadeVisualizer()
  75. # 生成输出文件名
  76. timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  77. output_file = visualization_dir / f"cascade_results_{timestamp}.html"
  78. # 生成HTML
  79. result_file = visualizer.generate_html(all_features, stats, str(output_file))
  80. print(f"✓ 生成完成: {result_file}")
  81. print(f"\n🌐 在浏览器中打开查看:")
  82. print(f" file://{result_file}")
  83. def calculate_stats(evaluated_data: list) -> dict:
  84. """计算统计数据"""
  85. stats = {
  86. '原始特征数': len(evaluated_data),
  87. '搜索词总数': 0,
  88. '帖子总数': 0,
  89. '完全匹配': 0,
  90. '相似匹配': 0,
  91. '弱相似': 0,
  92. '无匹配': 0,
  93. '已过滤': 0
  94. }
  95. total_notes = 0
  96. complete_notes = 0
  97. for item in evaluated_data:
  98. groups = item.get('组合评估结果_分组', [])
  99. for group in groups:
  100. searches = group.get('top10_searches', [])
  101. stats['搜索词总数'] += len(searches)
  102. for search in searches:
  103. eval_data = search.get('evaluation_with_filter', {})
  104. search_stats = eval_data.get('statistics', {})
  105. stats['完全匹配'] += search_stats.get('完全匹配(0.8-1.0)', 0)
  106. stats['相似匹配'] += search_stats.get('相似匹配(0.6-0.79)', 0)
  107. stats['弱相似'] += search_stats.get('弱相似(0.5-0.59)', 0)
  108. stats['无匹配'] += search_stats.get('无匹配(≤0.4)', 0)
  109. stats['已过滤'] += eval_data.get('filtered_count', 0)
  110. # 统计帖子总数
  111. notes = search.get('search_result', {}).get('data', {}).get('data', [])
  112. total_notes += len(notes)
  113. # 统计完全匹配的帖子
  114. notes_with_scores = eval_data.get('notes_with_scores', [])
  115. for note_eval in notes_with_scores:
  116. match_level = note_eval.get('match_level', '')
  117. if '完全匹配' in match_level:
  118. complete_notes += 1
  119. stats['帖子总数'] = total_notes
  120. stats['完全匹配率'] = f"{(complete_notes / total_notes * 100):.1f}%" if total_notes > 0 else "0%"
  121. return stats
  122. def extract_all_features(evaluated_data: list, deep_analysis_data: list, similarity_data: list) -> list:
  123. """
  124. 提取所有特征信息,整合评估数据、解构数据和相似度数据
  125. """
  126. all_features = []
  127. # 遍历评估数据
  128. for eval_item in evaluated_data:
  129. post_target_word = eval_item.get('帖子目标词', '')
  130. persona_feature = eval_item.get('人设特征名称', '')
  131. # 简化处理:直接从eval_item中获取相似度得分
  132. # 如果没有,默认为0.5(部分匹配)
  133. similarity_score = eval_item.get('相似度得分', 0.5)
  134. # 整合数据
  135. feature = {
  136. '帖子目标词': post_target_word,
  137. '人设特征名称': persona_feature,
  138. '相似度得分': similarity_score,
  139. '组合评估结果_分组': eval_item.get('组合评估结果_分组', [])
  140. }
  141. all_features.append(feature)
  142. return all_features
  143. if __name__ == "__main__":
  144. main()