#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Stage 6 评估结果统计分析
分析两层评估的过滤效果和匹配质量
"""
import json
from collections import defaultdict
from typing import Any, Dict, List
def load_stage6_results(file_path: str) -> List[Dict[str, Any]]:
    """Read the Stage 6 evaluation results from a UTF-8 JSON file."""
    with open(file_path, 'r', encoding='utf-8') as fp:
        return json.load(fp)
def analyze_evaluation_results(data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate Stage 6 evaluation results globally and per original feature.

    Args:
        data: Per-feature result dicts; grouped evaluations are read from the
            '组合评估结果_分组' key, each group carrying 'top10_searches' items
            whose 'evaluation_with_filter' holds the evaluation payload.

    Returns:
        Dict with three sections:
            'global_stats': totals + 'filter_rate' across all search words.
            'feature_stats': the same aggregates keyed by original feature,
                each with its own 'filter_rate' and list of search words.
            'search_word_details': one record per evaluated search word,
                sorted by high-quality match count (descending).
    """
    # Match-quality buckets as emitted by the upstream evaluator; these keys
    # must match the 'statistics' dict in the input data exactly.
    match_categories = ('完全匹配(8-10)', '相似匹配(6-7)', '弱相似(5-6)', '无匹配(≤4)')

    def _empty_distribution() -> Dict[str, int]:
        # Fresh zeroed counter per caller — never share one dict.
        return {key: 0 for key in match_categories}

    global_stats: Dict[str, Any] = {
        'total_search_words': 0,
        'total_notes_evaluated': 0,
        'total_filtered': 0,
        'match_distribution': _empty_distribution(),
    }

    # Per-original-feature aggregates, created lazily on first touch.
    feature_stats: Dict[str, Dict[str, Any]] = defaultdict(lambda: {
        'search_words_count': 0,
        'total_notes': 0,
        'total_filtered': 0,
        'match_distribution': _empty_distribution(),
        'search_words': [],
    })

    search_word_details: List[Dict[str, Any]] = []

    for feature_result in data:
        original_feature = feature_result.get('原始特征名称', 'Unknown')
        for group in feature_result.get('组合评估结果_分组', []):
            base_word = group.get('base_word', '')
            for eval_item in group.get('top10_searches', []):
                evaluation = eval_item.get('evaluation_with_filter')
                if not evaluation:
                    # Skip search words that were never evaluated.
                    continue

                total_notes = evaluation.get('total_notes', 0)
                evaluated_notes = evaluation.get('evaluated_notes', 0)
                filtered_count = evaluation.get('filtered_count', 0)
                statistics = evaluation.get('statistics', {})

                # Global totals.
                global_stats['total_search_words'] += 1
                global_stats['total_notes_evaluated'] += total_notes
                global_stats['total_filtered'] += filtered_count
                for key in match_categories:
                    global_stats['match_distribution'][key] += statistics.get(key, 0)

                # Per-feature totals (hoist the lookup once per item).
                f_stats = feature_stats[original_feature]
                f_stats['search_words_count'] += 1
                f_stats['total_notes'] += total_notes
                f_stats['total_filtered'] += filtered_count
                for key in match_categories:
                    f_stats['match_distribution'][key] += statistics.get(key, 0)

                # Per-search-word detail record.
                search_word_info = {
                    'original_feature': original_feature,
                    'base_word': base_word,
                    'search_word': eval_item.get('search_word', ''),
                    'total_notes': total_notes,
                    'evaluated_notes': evaluated_notes,
                    'filtered_count': filtered_count,
                    'match_distribution': statistics,
                    'high_quality_count': statistics.get('完全匹配(8-10)', 0),
                    'similar_count': statistics.get('相似匹配(6-7)', 0),
                }
                search_word_details.append(search_word_info)
                f_stats['search_words'].append(search_word_info)

    # Filter rate = filtered / evaluated (0.0 when nothing was evaluated).
    evaluated_total = global_stats['total_notes_evaluated']
    global_stats['filter_rate'] = (
        global_stats['total_filtered'] / evaluated_total if evaluated_total > 0 else 0.0
    )
    for f_stats in feature_stats.values():
        notes = f_stats['total_notes']
        f_stats['filter_rate'] = f_stats['total_filtered'] / notes if notes > 0 else 0.0

    # Most promising search words first.
    search_word_details.sort(key=lambda item: item['high_quality_count'], reverse=True)

    return {
        'global_stats': global_stats,
        'feature_stats': dict(feature_stats),
        'search_word_details': search_word_details,
    }
def print_statistics(stats: Dict[str, Any]):
    """Pretty-print the aggregated Stage 6 statistics to stdout.

    Fix over the original: the percentage computations in the filtering
    section divided by ``total_notes_evaluated`` without a zero guard and
    raised ZeroDivisionError on an empty result set; percentages now fall
    back to 0.0. The per-feature loop variable also no longer shadows the
    ``stats`` parameter.

    Args:
        stats: Output of ``analyze_evaluation_results()``.
    """
    def _pct(part: float, whole: float) -> float:
        # Safe percentage: 0.0 instead of ZeroDivisionError on empty input.
        return part / whole * 100 if whole else 0.0

    global_stats = stats['global_stats']
    feature_stats = stats['feature_stats']
    search_word_details = stats['search_word_details']

    print("=" * 80)
    print("Stage 6 评估结果统计分析")
    print("=" * 80)

    # Global summary.
    print("\n【全局统计】")
    print(f" 总搜索词数: {global_stats['total_search_words']}")
    print(f" 总评估帖子数: {global_stats['total_notes_evaluated']}")
    print(f" 总过滤帖子数: {global_stats['total_filtered']} (过滤率: {global_stats['filter_rate']*100:.1f}%)")
    print(f"\n 匹配度分布:")
    for match_type, count in global_stats['match_distribution'].items():
        print(f" {match_type}: {count} 个帖子")

    # Per original feature breakdown.
    print("\n" + "=" * 80)
    print("【按原始特征统计】")
    print("=" * 80)
    for feature_name, f_stats in sorted(feature_stats.items()):
        print(f"\n特征: {feature_name}")
        print(f" 搜索词数: {f_stats['search_words_count']}")
        print(f" 总评估帖子: {f_stats['total_notes']}")
        print(f" 总过滤帖子: {f_stats['total_filtered']} (过滤率: {f_stats['filter_rate']*100:.1f}%)")
        print(f" 高质量匹配: {f_stats['match_distribution']['完全匹配(8-10)']} 个帖子")
        print(f" 相似匹配: {f_stats['match_distribution']['相似匹配(6-7)']} 个帖子")
        # Top search words within this feature by high-quality match count.
        best_searches = sorted(f_stats['search_words'], key=lambda x: x['high_quality_count'], reverse=True)[:3]
        if best_searches:
            print(f" Top 3 最佳搜索词:")
            for idx, sw in enumerate(best_searches, 1):
                print(f" {idx}. \"{sw['search_word']}\" - {sw['high_quality_count']}个完全匹配")

    # Top-10 search words across all features (input list is pre-sorted).
    print("\n" + "=" * 80)
    print("【Top 10 最佳搜索词(按完全匹配数排序)】")
    print("=" * 80)
    for idx, sw in enumerate(search_word_details[:10], 1):
        print(f"\n{idx}. \"{sw['search_word']}\"")
        print(f" 原始特征: {sw['original_feature']}")
        print(f" Base Word: {sw['base_word']}")
        print(f" 评估帖子: {sw['total_notes']}, 过滤: {sw['filtered_count']}")
        print(f" 完全匹配(8-10): {sw['high_quality_count']} 个")
        print(f" 相似匹配(6-7): {sw['similar_count']} 个")

    # Filtering effectiveness.
    print("\n" + "=" * 80)
    print("【过滤效果分析】")
    print("=" * 80)
    total_evaluated = global_stats['total_notes_evaluated']
    total_filtered = global_stats['total_filtered']
    total_remaining = total_evaluated - total_filtered
    total_high_quality = global_stats['match_distribution']['完全匹配(8-10)']
    total_similar = global_stats['match_distribution']['相似匹配(6-7)']
    total_weak = global_stats['match_distribution']['弱相似(5-6)']
    total_no_match = global_stats['match_distribution']['无匹配(≤4)']
    print(f" 评估帖子总数: {total_evaluated}")
    print(f" 第一层过滤(Query不相关): {total_filtered} ({_pct(total_filtered, total_evaluated):.1f}%)")
    print(f" 通过过滤的帖子: {total_remaining} ({_pct(total_remaining, total_evaluated):.1f}%)")
    print(f"\n 通过过滤后的质量分布:")
    if total_remaining > 0:
        print(f" 完全匹配(8-10): {total_high_quality} ({_pct(total_high_quality, total_remaining):.1f}%)")
        print(f" 相似匹配(6-7): {total_similar} ({_pct(total_similar, total_remaining):.1f}%)")
        print(f" 弱相似(5-6): {total_weak} ({_pct(total_weak, total_remaining):.1f}%)")
        print(f" 无匹配(≤4): {total_no_match} ({_pct(total_no_match, total_remaining):.1f}%)")
    print("\n" + "=" * 80)
def save_statistics(stats: Dict[str, Any], output_path: str):
    """Persist the computed statistics to *output_path* as pretty JSON."""
    payload = json.dumps(stats, ensure_ascii=False, indent=2)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(payload)
    print(f"\n统计结果已保存到: {output_path}")
def main():
    """Entry point: load, analyse, report, and persist Stage 6 statistics."""
    input_file = "output_v2/stage6_with_evaluations.json"
    output_file = "output_v2/stage6_statistics.json"

    print("正在加载数据...")
    data = load_stage6_results(input_file)

    print("正在分析评估结果...")
    stats = analyze_evaluation_results(data)

    # Report to stdout, then persist the same structure to disk.
    print_statistics(stats)
    save_statistics(stats, output_file)


if __name__ == '__main__':
    main()