#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Stage 6 evaluation-result statistics.

Analyzes the filtering effectiveness and match quality of the two-layer
evaluation (first-layer query-relevance filter, second-layer match scoring).
"""

import json
from collections import defaultdict
from typing import Any, Dict, List

# Match-quality bucket labels. These are literal JSON keys produced by the
# upstream evaluation stage -- they must NOT be translated or renamed.
_MATCH_BUCKETS = ('完全匹配(8-10)', '相似匹配(6-7)', '弱相似(5-6)', '无匹配(≤4)')


def _empty_distribution() -> Dict[str, int]:
    """Return a fresh zeroed match-quality distribution dict."""
    return {bucket: 0 for bucket in _MATCH_BUCKETS}


def _pct(part: float, whole: float) -> float:
    """Return part/whole as a percentage, 0.0 when whole is 0 (avoids ZeroDivisionError)."""
    return part / whole * 100 if whole else 0.0


def load_stage6_results(file_path: str) -> List[Dict[str, Any]]:
    """Load the Stage 6 evaluation results from a JSON file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def analyze_evaluation_results(data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate evaluation results globally, per original feature, and per search word.

    Args:
        data: List of feature-result dicts; each may contain
            '组合评估结果_分组' groups whose 'top10_searches' items carry an
            'evaluation_with_filter' payload (items without one are skipped).

    Returns:
        Dict with 'global_stats', 'feature_stats' (keyed by original feature
        name), and 'search_word_details' sorted by high-quality match count
        in descending order.
    """
    # Global aggregate across all features / search words.
    global_stats: Dict[str, Any] = {
        'total_search_words': 0,
        'total_notes_evaluated': 0,
        'total_filtered': 0,
        'match_distribution': _empty_distribution(),
    }

    # Per-original-feature aggregates, created lazily on first access.
    feature_stats: Dict[str, Dict[str, Any]] = defaultdict(lambda: {
        'search_words_count': 0,
        'total_notes': 0,
        'total_filtered': 0,
        'match_distribution': _empty_distribution(),
        'search_words': [],
    })

    # Flat list of per-search-word detail records.
    search_word_details: List[Dict[str, Any]] = []

    # Walk every feature -> group -> evaluated search word.
    for feature_result in data:
        original_feature = feature_result.get('原始特征名称', 'Unknown')
        grouped_results = feature_result.get('组合评估结果_分组', [])

        for group in grouped_results:
            base_word = group.get('base_word', '')

            for eval_item in group.get('top10_searches', []):
                # Skip search words that were never evaluated.
                evaluation = eval_item.get('evaluation_with_filter')
                if not evaluation:
                    continue

                search_word = eval_item.get('search_word', '')

                # Extract the evaluation payload (missing fields default to 0/{}).
                total_notes = evaluation.get('total_notes', 0)
                evaluated_notes = evaluation.get('evaluated_notes', 0)
                filtered_count = evaluation.get('filtered_count', 0)
                statistics = evaluation.get('statistics', {})

                # Update the global aggregate.
                global_stats['total_search_words'] += 1
                global_stats['total_notes_evaluated'] += total_notes
                global_stats['total_filtered'] += filtered_count
                for key in global_stats['match_distribution']:
                    global_stats['match_distribution'][key] += statistics.get(key, 0)

                # Update the per-feature aggregate.
                f_stats = feature_stats[original_feature]
                f_stats['search_words_count'] += 1
                f_stats['total_notes'] += total_notes
                f_stats['total_filtered'] += filtered_count
                for key in f_stats['match_distribution']:
                    f_stats['match_distribution'][key] += statistics.get(key, 0)

                # Record the per-search-word detail (shared by both views).
                search_word_info = {
                    'original_feature': original_feature,
                    'base_word': base_word,
                    'search_word': search_word,
                    'total_notes': total_notes,
                    'evaluated_notes': evaluated_notes,
                    'filtered_count': filtered_count,
                    'match_distribution': statistics,
                    'high_quality_count': statistics.get('完全匹配(8-10)', 0),
                    'similar_count': statistics.get('相似匹配(6-7)', 0),
                }
                search_word_details.append(search_word_info)
                f_stats['search_words'].append(search_word_info)

    # Global filter rate (0.0 when nothing was evaluated).
    if global_stats['total_notes_evaluated'] > 0:
        global_stats['filter_rate'] = (
            global_stats['total_filtered'] / global_stats['total_notes_evaluated']
        )
    else:
        global_stats['filter_rate'] = 0.0

    # Per-feature filter rates.
    for stats in feature_stats.values():
        if stats['total_notes'] > 0:
            stats['filter_rate'] = stats['total_filtered'] / stats['total_notes']
        else:
            stats['filter_rate'] = 0.0

    # Best search words first (by count of perfect matches).
    search_word_details.sort(key=lambda x: x['high_quality_count'], reverse=True)

    return {
        'global_stats': global_stats,
        'feature_stats': dict(feature_stats),
        'search_word_details': search_word_details,
    }


def print_statistics(stats: Dict[str, Any]):
    """Pretty-print the aggregated statistics to stdout.

    Output text is intentionally kept in Chinese to match the rest of the
    pipeline's user-facing reports.
    """
    global_stats = stats['global_stats']
    feature_stats = stats['feature_stats']
    search_word_details = stats['search_word_details']

    print("=" * 80)
    print("Stage 6 评估结果统计分析")
    print("=" * 80)

    # --- Global summary ---
    print("\n【全局统计】")
    print(f"  总搜索词数: {global_stats['total_search_words']}")
    print(f"  总评估帖子数: {global_stats['total_notes_evaluated']}")
    print(f"  总过滤帖子数: {global_stats['total_filtered']} (过滤率: {global_stats['filter_rate']*100:.1f}%)")
    print(f"\n  匹配度分布:")
    for match_type, count in global_stats['match_distribution'].items():
        print(f"    {match_type}: {count} 个帖子")

    # --- Per original feature ---
    print("\n" + "=" * 80)
    print("【按原始特征统计】")
    print("=" * 80)
    for feature_name, f_stats in sorted(feature_stats.items()):
        print(f"\n特征: {feature_name}")
        print(f"  搜索词数: {f_stats['search_words_count']}")
        print(f"  总评估帖子: {f_stats['total_notes']}")
        print(f"  总过滤帖子: {f_stats['total_filtered']} (过滤率: {f_stats['filter_rate']*100:.1f}%)")
        print(f"  高质量匹配: {f_stats['match_distribution']['完全匹配(8-10)']} 个帖子")
        print(f"  相似匹配: {f_stats['match_distribution']['相似匹配(6-7)']} 个帖子")

        # Top 3 search words for this feature by perfect-match count.
        best_searches = sorted(
            f_stats['search_words'],
            key=lambda x: x['high_quality_count'],
            reverse=True,
        )[:3]
        if best_searches:
            print(f"  Top 3 最佳搜索词:")
            for idx, sw in enumerate(best_searches, 1):
                print(f"    {idx}. \"{sw['search_word']}\" - {sw['high_quality_count']}个完全匹配")

    # --- Top 10 search words overall ---
    print("\n" + "=" * 80)
    print("【Top 10 最佳搜索词(按完全匹配数排序)】")
    print("=" * 80)
    for idx, sw in enumerate(search_word_details[:10], 1):
        print(f"\n{idx}. \"{sw['search_word']}\"")
        print(f"   原始特征: {sw['original_feature']}")
        print(f"   Base Word: {sw['base_word']}")
        print(f"   评估帖子: {sw['total_notes']}, 过滤: {sw['filtered_count']}")
        print(f"   完全匹配(8-10): {sw['high_quality_count']} 个")
        print(f"   相似匹配(6-7): {sw['similar_count']} 个")

    # --- Filter effectiveness ---
    print("\n" + "=" * 80)
    print("【过滤效果分析】")
    print("=" * 80)
    total_evaluated = global_stats['total_notes_evaluated']
    total_filtered = global_stats['total_filtered']
    total_remaining = total_evaluated - total_filtered
    total_high_quality = global_stats['match_distribution']['完全匹配(8-10)']
    total_similar = global_stats['match_distribution']['相似匹配(6-7)']
    total_weak = global_stats['match_distribution']['弱相似(5-6)']
    total_no_match = global_stats['match_distribution']['无匹配(≤4)']

    print(f"  评估帖子总数: {total_evaluated}")
    # _pct guards against total_evaluated == 0 (no notes evaluated at all).
    print(f"  第一层过滤(Query不相关): {total_filtered} ({_pct(total_filtered, total_evaluated):.1f}%)")
    print(f"  通过过滤的帖子: {total_remaining} ({_pct(total_remaining, total_evaluated):.1f}%)")
    print(f"\n  通过过滤后的质量分布:")
    if total_remaining > 0:
        print(f"    完全匹配(8-10): {total_high_quality} ({total_high_quality/total_remaining*100:.1f}%)")
        print(f"    相似匹配(6-7): {total_similar} ({total_similar/total_remaining*100:.1f}%)")
        print(f"    弱相似(5-6): {total_weak} ({total_weak/total_remaining*100:.1f}%)")
        print(f"    无匹配(≤4): {total_no_match} ({total_no_match/total_remaining*100:.1f}%)")

    print("\n" + "=" * 80)


def save_statistics(stats: Dict[str, Any], output_path: str):
    """Write the statistics dict to a JSON file (UTF-8, human-readable)."""
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"\n统计结果已保存到: {output_path}")


def main():
    """Entry point: load Stage 6 results, analyze, print, and save statistics."""
    input_file = "output_v2/stage6_with_evaluations.json"
    output_file = "output_v2/stage6_statistics.json"

    print("正在加载数据...")
    data = load_stage6_results(input_file)

    print("正在分析评估结果...")
    stats = analyze_evaluation_results(data)

    # Print the report to stdout.
    print_statistics(stats)

    # Persist the aggregates alongside the input.
    save_statistics(stats, output_file)


if __name__ == '__main__':
    main()