| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187 |
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generate the cascade tree visualization.

Uses the new cascade_visualizer to render the four-level cascade display.
"""
import sys
import json
from pathlib import Path
from datetime import datetime

# Add the project root to sys.path so `src.*` packages can be imported
# when this file is run as a standalone script.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from src.visualizers.cascade_visualizer import CascadeVisualizer
def load_json(file_path: Path) -> dict:
    """Load and parse a UTF-8 encoded JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized top-level JSON object.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def main():
    """Entry point: load pipeline outputs and render the cascade HTML page.

    Reads evaluated results, deep-analysis results and (optionally) Stage8
    similarity results from ``output_v2/``, computes summary statistics,
    merges everything into a flat feature list, and writes a timestamped
    HTML visualization into ``visualization/``.
    """
    print("🎯 级联树形可视化生成器")
    print("=" * 60)

    # Resolve input/output directories relative to the project root.
    output_dir = project_root / "output_v2"
    visualization_dir = project_root / "visualization"

    # --- Load evaluation data (required) ---
    evaluated_file = output_dir / "evaluated_results.json"
    print(f"📖 加载评估数据: {evaluated_file}")
    if not evaluated_file.exists():
        print(f"❌ 文件不存在: {evaluated_file}")
        return
    evaluated_data = load_json(evaluated_file)
    print(f"✓ 加载了 {len(evaluated_data)} 个原始特征")

    # --- Load deep-analysis (deconstruction) data (required) ---
    deep_analysis_file = output_dir / "deep_analysis_results.json"
    print(f"📖 加载解构数据: {deep_analysis_file}")
    if not deep_analysis_file.exists():
        print(f"❌ 文件不存在: {deep_analysis_file}")
        return
    deep_analysis_full = load_json(deep_analysis_file)
    deep_analysis_data = deep_analysis_full.get('results', [])
    print(f"✓ 加载了 {len(deep_analysis_data)} 个解构结果")

    # --- Load Stage8 similarity data (optional; defaults used if absent) ---
    similarity_file = output_dir / "similarity_analysis_results.json"
    print(f"📖 加载Stage8数据: {similarity_file}")
    similarity_data = {}
    if similarity_file.exists():
        similarity_full = load_json(similarity_file)
        similarity_data = similarity_full.get('results', {})
        print(f"✓ 加载了 {len(similarity_data)} 个相似度评分")
    else:
        print("⚠️ Stage8数据文件不存在,将使用默认值")

    # --- Compute summary statistics ---
    print("\n📊 计算统计数据...")
    stats = calculate_stats(evaluated_data)
    print("✓ 统计完成:")
    print(f" - 原始特征: {stats['原始特征数']}")
    print(f" - 搜索词总数: {stats['搜索词总数']}")
    print(f" - 帖子总数: {stats['帖子总数']}")
    print(f" - 完全匹配: {stats['完全匹配']} ({stats['完全匹配率']})")

    # --- Merge all sources into one flat feature list ---
    print("\n📊 提取所有特征信息...")
    all_features = extract_all_features(evaluated_data, deep_analysis_data, similarity_data)
    print(f"✓ 提取了 {len(all_features)} 个特征")

    # Bucket features by similarity score for a quick console summary.
    high_similarity = sum(1 for f in all_features if f.get('相似度得分', 0) >= 0.8)
    partial_match = sum(1 for f in all_features if 0.5 <= f.get('相似度得分', 0) < 0.8)
    low_similarity = sum(1 for f in all_features if f.get('相似度得分', 0) < 0.5)
    print(f" - 高相似度特征(≥0.8): {high_similarity} 个")
    print(f" - 部分匹配特征(0.5-0.8): {partial_match} 个")
    print(f" - 低相似度特征(<0.5): {low_similarity} 个")

    # --- Render the HTML page ---
    print("\n🎨 生成级联可视化页面...")
    visualizer = CascadeVisualizer()

    # Timestamped filename so repeated runs never overwrite each other.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = visualization_dir / f"cascade_results_{timestamp}.html"

    result_file = visualizer.generate_html(all_features, stats, str(output_file))
    print(f"✓ 生成完成: {result_file}")
    print(f"\n🌐 在浏览器中打开查看:")
    print(f" file://{result_file}")
def calculate_stats(evaluated_data: list) -> dict:
    """Aggregate match statistics over all evaluated search results.

    Walks every feature -> group -> search in *evaluated_data*, summing the
    per-search match-level counters and counting notes, then derives the
    overall exact-match rate.

    Args:
        evaluated_data: List of evaluated feature dicts; each may contain a
            '组合评估结果_分组' list of groups, each with 'top10_searches'.

    Returns:
        A dict of counters keyed by Chinese labels (feature count, search
        count, note count, match-level tallies, filtered count) plus the
        derived '完全匹配率' percentage string.
    """
    stats = {
        '原始特征数': len(evaluated_data),
        '搜索词总数': 0,
        '帖子总数': 0,
        '完全匹配': 0,
        '相似匹配': 0,
        '弱相似': 0,
        '无匹配': 0,
        '已过滤': 0
    }
    total_notes = 0
    complete_notes = 0

    for item in evaluated_data:
        groups = item.get('组合评估结果_分组', [])
        for group in groups:
            searches = group.get('top10_searches', [])
            stats['搜索词总数'] += len(searches)
            for search in searches:
                eval_data = search.get('evaluation_with_filter', {})
                search_stats = eval_data.get('statistics', {})
                # Accumulate the pre-bucketed match-level counters.
                stats['完全匹配'] += search_stats.get('完全匹配(0.8-1.0)', 0)
                stats['相似匹配'] += search_stats.get('相似匹配(0.6-0.79)', 0)
                stats['弱相似'] += search_stats.get('弱相似(0.5-0.59)', 0)
                stats['无匹配'] += search_stats.get('无匹配(≤0.4)', 0)
                stats['已过滤'] += eval_data.get('filtered_count', 0)

                # Count raw notes returned by this search.
                notes = search.get('search_result', {}).get('data', {}).get('data', [])
                total_notes += len(notes)

                # Count notes whose match level marks them as an exact match.
                notes_with_scores = eval_data.get('notes_with_scores', [])
                for note_eval in notes_with_scores:
                    match_level = note_eval.get('match_level', '')
                    if '完全匹配' in match_level:
                        complete_notes += 1

    stats['帖子总数'] = total_notes
    # Guard against division by zero when no notes were collected.
    stats['完全匹配率'] = f"{(complete_notes / total_notes * 100):.1f}%" if total_notes > 0 else "0%"
    return stats
def extract_all_features(evaluated_data: list, deep_analysis_data: list, similarity_data: dict) -> list:
    """Flatten evaluated features into the list the visualizer consumes.

    Merges evaluation data with (currently unused) deconstruction and
    similarity data. The similarity score is taken directly from each
    evaluated item, defaulting to 0.5 (partial match) when absent.

    Args:
        evaluated_data: Evaluated feature dicts from Stage evaluation.
        deep_analysis_data: Deconstruction results. NOTE(review): accepted
            for future merging but not used by the current implementation.
        similarity_data: Stage8 similarity scores keyed per feature
            (``main`` passes a dict — the previous ``list`` annotation was
            wrong). NOTE(review): also not used yet.

    Returns:
        A list of flat feature dicts with target word, persona feature
        name, similarity score and grouped evaluation results.
    """
    all_features = []

    for eval_item in evaluated_data:
        post_target_word = eval_item.get('帖子目标词', '')
        persona_feature = eval_item.get('人设特征名称', '')

        # Simplified handling: read the similarity score straight off the
        # evaluated item; fall back to 0.5 (partial match) when missing.
        similarity_score = eval_item.get('相似度得分', 0.5)

        feature = {
            '帖子目标词': post_target_word,
            '人设特征名称': persona_feature,
            '相似度得分': similarity_score,
            '组合评估结果_分组': eval_item.get('组合评估结果_分组', [])
        }
        all_features.append(feature)

    return all_features
# Standard script entry-point guard (restored from diff-prefix residue
# that made the original file invalid Python).
if __name__ == "__main__":
    main()
|