#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Stage 6 evaluation-result visualization tool.

Loads the Stage 6 JSON output, aggregates search and evaluation statistics,
and writes an HTML summary page that combines both layers of evaluation
results.
"""

import json
import os
from datetime import datetime
from typing import List, Dict, Any


def load_data(json_path: str) -> List[Dict[str, Any]]:
    """Load and return the parsed JSON document stored at *json_path*.

    The file is read as UTF-8. Errors from ``open`` and ``json.load``
    propagate to the caller unchanged.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _percentage(part: float, whole: float) -> float:
    """Return *part* as a percentage of *whole*, rounded to one decimal.

    Returns 0 when *whole* is not positive, so callers never divide by zero.
    """
    return round(part / whole * 100, 1) if whole > 0 else 0


def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate search and evaluation counters over all features.

    Walks every feature -> group -> search item and counts search words,
    executed / non-executed searches, returned notes (split into video and
    non-video), and the evaluation summary attached to each executed search.

    Containers that are missing *or* explicitly ``null`` in the JSON are
    treated as empty. ``dict.get(key, default)`` only applies the default
    when the key is absent, so a stored ``None`` would otherwise raise when
    it is iterated or when ``.get`` is called on it.

    Args:
        data: Parsed Stage 6 JSON, one dict per original feature.

    Returns:
        A dict of raw counters plus derived percentages (one decimal place,
        0 when the denominator is 0).
    """
    total_features = len(data)
    total_search_words = 0
    searched_count = 0  # search items that carry a search result
    not_searched_count = 0  # search items without a search result
    total_notes = 0
    video_count = 0
    normal_count = 0

    # Evaluation counters
    total_evaluated_notes = 0
    total_filtered = 0
    match_complete = 0  # score 0.8-1.0
    match_similar = 0  # score 0.6-0.79
    match_weak = 0  # score 0.5-0.59
    match_none = 0  # score <= 0.4

    for feature in data:
        for group in feature.get('组合评估结果_分组') or []:
            search_items = group.get('top10_searches') or []
            total_search_words += len(search_items)

            for search_item in search_items:
                search_result = search_item.get('search_result')

                # A missing, null or empty result means the search never ran.
                if not search_result:
                    not_searched_count += 1
                    continue

                searched_count += 1
                notes = (search_result.get('data') or {}).get('data') or []
                total_notes += len(notes)

                # Anything that is not explicitly a video counts as a
                # regular (image/text) note.
                for note in notes:
                    note_type = (note.get('note_card') or {}).get('type', '')
                    if note_type == 'video':
                        video_count += 1
                    else:
                        normal_count += 1

                # The evaluation summary is only read for executed searches.
                evaluation = search_item.get('evaluation_with_filter')
                if evaluation:
                    total_evaluated_notes += evaluation.get('total_notes', 0)
                    total_filtered += evaluation.get('filtered_count', 0)

                    stats = evaluation.get('statistics') or {}
                    match_complete += stats.get('完全匹配(0.8-1.0)', 0)
                    match_similar += stats.get('相似匹配(0.6-0.79)', 0)
                    match_weak += stats.get('弱相似(0.5-0.59)', 0)
                    match_none += stats.get('无匹配(≤0.4)', 0)

    # Notes that survived filtering; denominator for the match rates.
    total_remaining = total_evaluated_notes - total_filtered if total_evaluated_notes > 0 else 0

    return {
        'total_features': total_features,
        'total_search_words': total_search_words,
        'searched_count': searched_count,
        'not_searched_count': not_searched_count,
        'searched_percentage': _percentage(searched_count, total_search_words),
        'total_notes': total_notes,
        'video_count': video_count,
        'normal_count': normal_count,
        'video_percentage': _percentage(video_count, total_notes),
        'normal_percentage': _percentage(normal_count, total_notes),
        # Evaluation statistics
        'total_evaluated': total_evaluated_notes,
        'total_filtered': total_filtered,
        'total_remaining': total_remaining,
        'filter_rate': _percentage(total_filtered, total_evaluated_notes),
        'match_complete': match_complete,
        'match_similar': match_similar,
        'match_weak': match_weak,
        'match_none': match_none,
        'complete_rate': _percentage(match_complete, total_remaining),
        'similar_rate': _percentage(match_similar, total_remaining),
    }


def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path: str) -> None:
    """Render the statistics page and write it to *output_path* as UTF-8.

    Args:
        data: Parsed Stage 6 JSON (serialized for use by the page).
        stats: The dict produced by ``calculate_statistics``.
        output_path: Destination file; overwritten if it already exists.
    """
    # Serialized data intended for the page's JavaScript.
    # NOTE(review): the template as visible here never references
    # `data_json`; the page markup looks truncated. Confirm whether it
    # should be embedded in the page or whether this line can be dropped.
    data_json = json.dumps(data, ensure_ascii=False, indent=2)

    html_content = f''' Stage6 评估结果可视化
📊 {stats['total_features']}
原始特征数
🔍 {stats['total_search_words']}
搜索词总数
✅ {stats['searched_count']}
已搜索 ({stats['searched_percentage']}%)
⏸️ {stats['not_searched_count']}
未搜索
📝 {stats['total_notes']}
帖子总数
🎬 {stats['video_count']}
视频 ({stats['video_percentage']}%)
📷 {stats['normal_count']}
图文 ({stats['normal_percentage']}%)
⚡ {stats['total_evaluated']}
已评估
⚫ {stats['total_filtered']}
已过滤 ({stats['filter_rate']}%)
🟢 {stats['match_complete']}
完全匹配 ({stats['complete_rate']}%)
🟡 {stats['match_similar']}
相似匹配 ({stats['similar_rate']}%)
🟠 {stats['match_weak']}
弱相似
🔴 {stats['match_none']}
无匹配
🔍 筛选显示:
'''

    # Write the rendered page to disk.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(html_content)


def main() -> str:
    """Load the Stage 6 data, print a summary, and generate the HTML page.

    Input is read from ``output_v2/stage6_with_evaluations.json`` next to
    this script; output goes to a timestamped file under ``visualization/``
    (created if needed).

    Returns:
        The path of the generated HTML file.
    """
    # Resolve paths relative to this script's directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    json_path = os.path.join(script_dir, 'output_v2', 'stage6_with_evaluations.json')
    output_dir = os.path.join(script_dir, 'visualization')
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = os.path.join(output_dir, f'stage6_interactive_{timestamp}.html')

    # Load data
    print(f"📖 加载数据: {json_path}")
    data = load_data(json_path)
    print(f"✓ 加载了 {len(data)} 个原始特征")

    # Compute statistics
    print("📊 计算统计数据...")
    stats = calculate_statistics(data)
    print("✓ 统计完成:")
    print(f" - 原始特征: {stats['total_features']}")
    print(f" - 搜索词总数: {stats['total_search_words']}")
    print(f" - 已搜索: {stats['searched_count']} ({stats['searched_percentage']}%)")
    print(f" - 未搜索: {stats['not_searched_count']}")
    print(f" - 帖子总数: {stats['total_notes']}")
    print(f" - 视频: {stats['video_count']} ({stats['video_percentage']}%)")
    print(f" - 图文: {stats['normal_count']} ({stats['normal_percentage']}%)")
    print("\n 评估结果:")
    print(f" - 已评估: {stats['total_evaluated']}")
    print(f" - 已过滤: {stats['total_filtered']} ({stats['filter_rate']}%)")
    print(f" - 完全匹配: {stats['match_complete']} ({stats['complete_rate']}%)")
    print(f" - 相似匹配: {stats['match_similar']} ({stats['similar_rate']}%)")
    print(f" - 弱相似: {stats['match_weak']}")
    print(f" - 无匹配: {stats['match_none']}")

    # Generate the HTML page
    print("\n🎨 生成可视化页面...")
    generate_html(data, stats, output_path)
    print(f"✓ 生成完成: {output_path}")

    # Tell the user how to open the result
    print("\n🌐 在浏览器中打开查看:")
    print(f" file://{output_path}")

    return output_path


if __name__ == '__main__':
    main()