analyze_stage6_results.py 9.0 KB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Stage 6 评估结果统计分析
分析两层评估的过滤效果和匹配质量
"""
import json
from typing import Dict, List, Any
from collections import defaultdict
  10. def load_stage6_results(file_path: str) -> List[Dict[str, Any]]:
  11. """加载Stage 6评估结果"""
  12. with open(file_path, 'r', encoding='utf-8') as f:
  13. return json.load(f)
  14. def analyze_evaluation_results(data: List[Dict[str, Any]]) -> Dict[str, Any]:
  15. """分析评估结果"""
  16. # 全局统计
  17. global_stats = {
  18. 'total_search_words': 0,
  19. 'total_notes_evaluated': 0,
  20. 'total_filtered': 0,
  21. 'match_distribution': {
  22. '完全匹配(8-10)': 0,
  23. '相似匹配(6-7)': 0,
  24. '弱相似(5-6)': 0,
  25. '无匹配(≤4)': 0
  26. }
  27. }
  28. # 按原始特征分组统计
  29. feature_stats = defaultdict(lambda: {
  30. 'search_words_count': 0,
  31. 'total_notes': 0,
  32. 'total_filtered': 0,
  33. 'match_distribution': {
  34. '完全匹配(8-10)': 0,
  35. '相似匹配(6-7)': 0,
  36. '弱相似(5-6)': 0,
  37. '无匹配(≤4)': 0
  38. },
  39. 'search_words': []
  40. })
  41. # 所有搜索词的详细统计
  42. search_word_details = []
  43. # 遍历所有原始特征
  44. for feature_result in data:
  45. original_feature = feature_result.get('原始特征名称', 'Unknown')
  46. # 从组合评估结果_分组中读取
  47. grouped_results = feature_result.get('组合评估结果_分组', [])
  48. for group in grouped_results:
  49. base_word = group.get('base_word', '')
  50. for eval_item in group.get('top10_searches', []):
  51. # 检查是否有评估结果
  52. evaluation = eval_item.get('evaluation_with_filter')
  53. if not evaluation:
  54. continue
  55. search_word = eval_item.get('search_word', '')
  56. # 提取评估数据
  57. total_notes = evaluation.get('total_notes', 0)
  58. evaluated_notes = evaluation.get('evaluated_notes', 0)
  59. filtered_count = evaluation.get('filtered_count', 0)
  60. statistics = evaluation.get('statistics', {})
  61. # 更新全局统计
  62. global_stats['total_search_words'] += 1
  63. global_stats['total_notes_evaluated'] += total_notes
  64. global_stats['total_filtered'] += filtered_count
  65. for key in global_stats['match_distribution']:
  66. global_stats['match_distribution'][key] += statistics.get(key, 0)
  67. # 更新特征统计
  68. feature_stats[original_feature]['search_words_count'] += 1
  69. feature_stats[original_feature]['total_notes'] += total_notes
  70. feature_stats[original_feature]['total_filtered'] += filtered_count
  71. for key in feature_stats[original_feature]['match_distribution']:
  72. feature_stats[original_feature]['match_distribution'][key] += statistics.get(key, 0)
  73. # 记录搜索词详情
  74. search_word_info = {
  75. 'original_feature': original_feature,
  76. 'base_word': base_word,
  77. 'search_word': search_word,
  78. 'total_notes': total_notes,
  79. 'evaluated_notes': evaluated_notes,
  80. 'filtered_count': filtered_count,
  81. 'match_distribution': statistics,
  82. 'high_quality_count': statistics.get('完全匹配(8-10)', 0),
  83. 'similar_count': statistics.get('相似匹配(6-7)', 0)
  84. }
  85. search_word_details.append(search_word_info)
  86. feature_stats[original_feature]['search_words'].append(search_word_info)
  87. # 计算全局过滤率
  88. if global_stats['total_notes_evaluated'] > 0:
  89. global_stats['filter_rate'] = global_stats['total_filtered'] / global_stats['total_notes_evaluated']
  90. else:
  91. global_stats['filter_rate'] = 0.0
  92. # 计算每个特征的过滤率
  93. for feature_name, stats in feature_stats.items():
  94. if stats['total_notes'] > 0:
  95. stats['filter_rate'] = stats['total_filtered'] / stats['total_notes']
  96. else:
  97. stats['filter_rate'] = 0.0
  98. # 按高质量匹配数排序搜索词
  99. search_word_details.sort(key=lambda x: x['high_quality_count'], reverse=True)
  100. return {
  101. 'global_stats': global_stats,
  102. 'feature_stats': dict(feature_stats),
  103. 'search_word_details': search_word_details
  104. }
  105. def print_statistics(stats: Dict[str, Any]):
  106. """打印统计结果"""
  107. global_stats = stats['global_stats']
  108. feature_stats = stats['feature_stats']
  109. search_word_details = stats['search_word_details']
  110. print("=" * 80)
  111. print("Stage 6 评估结果统计分析")
  112. print("=" * 80)
  113. # 全局统计
  114. print("\n【全局统计】")
  115. print(f" 总搜索词数: {global_stats['total_search_words']}")
  116. print(f" 总评估帖子数: {global_stats['total_notes_evaluated']}")
  117. print(f" 总过滤帖子数: {global_stats['total_filtered']} (过滤率: {global_stats['filter_rate']*100:.1f}%)")
  118. print(f"\n 匹配度分布:")
  119. for match_type, count in global_stats['match_distribution'].items():
  120. print(f" {match_type}: {count} 个帖子")
  121. # 按原始特征统计
  122. print("\n" + "=" * 80)
  123. print("【按原始特征统计】")
  124. print("=" * 80)
  125. for feature_name, stats in sorted(feature_stats.items()):
  126. print(f"\n特征: {feature_name}")
  127. print(f" 搜索词数: {stats['search_words_count']}")
  128. print(f" 总评估帖子: {stats['total_notes']}")
  129. print(f" 总过滤帖子: {stats['total_filtered']} (过滤率: {stats['filter_rate']*100:.1f}%)")
  130. print(f" 高质量匹配: {stats['match_distribution']['完全匹配(8-10)']} 个帖子")
  131. print(f" 相似匹配: {stats['match_distribution']['相似匹配(6-7)']} 个帖子")
  132. # 找出该特征下高质量匹配最多的搜索词
  133. best_searches = sorted(stats['search_words'], key=lambda x: x['high_quality_count'], reverse=True)[:3]
  134. if best_searches:
  135. print(f" Top 3 最佳搜索词:")
  136. for idx, sw in enumerate(best_searches, 1):
  137. print(f" {idx}. \"{sw['search_word']}\" - {sw['high_quality_count']}个完全匹配")
  138. # Top 10 最佳搜索词
  139. print("\n" + "=" * 80)
  140. print("【Top 10 最佳搜索词(按完全匹配数排序)】")
  141. print("=" * 80)
  142. for idx, sw in enumerate(search_word_details[:10], 1):
  143. print(f"\n{idx}. \"{sw['search_word']}\"")
  144. print(f" 原始特征: {sw['original_feature']}")
  145. print(f" Base Word: {sw['base_word']}")
  146. print(f" 评估帖子: {sw['total_notes']}, 过滤: {sw['filtered_count']}")
  147. print(f" 完全匹配(8-10): {sw['high_quality_count']} 个")
  148. print(f" 相似匹配(6-7): {sw['similar_count']} 个")
  149. # 过滤效果分析
  150. print("\n" + "=" * 80)
  151. print("【过滤效果分析】")
  152. print("=" * 80)
  153. total_evaluated = global_stats['total_notes_evaluated']
  154. total_filtered = global_stats['total_filtered']
  155. total_remaining = total_evaluated - total_filtered
  156. total_high_quality = global_stats['match_distribution']['完全匹配(8-10)']
  157. total_similar = global_stats['match_distribution']['相似匹配(6-7)']
  158. total_weak = global_stats['match_distribution']['弱相似(5-6)']
  159. total_no_match = global_stats['match_distribution']['无匹配(≤4)']
  160. print(f" 评估帖子总数: {total_evaluated}")
  161. print(f" 第一层过滤(Query不相关): {total_filtered} ({total_filtered/total_evaluated*100:.1f}%)")
  162. print(f" 通过过滤的帖子: {total_remaining} ({total_remaining/total_evaluated*100:.1f}%)")
  163. print(f"\n 通过过滤后的质量分布:")
  164. if total_remaining > 0:
  165. print(f" 完全匹配(8-10): {total_high_quality} ({total_high_quality/total_remaining*100:.1f}%)")
  166. print(f" 相似匹配(6-7): {total_similar} ({total_similar/total_remaining*100:.1f}%)")
  167. print(f" 弱相似(5-6): {total_weak} ({total_weak/total_remaining*100:.1f}%)")
  168. print(f" 无匹配(≤4): {total_no_match} ({total_no_match/total_remaining*100:.1f}%)")
  169. print("\n" + "=" * 80)
  170. def save_statistics(stats: Dict[str, Any], output_path: str):
  171. """保存统计结果到JSON文件"""
  172. with open(output_path, 'w', encoding='utf-8') as f:
  173. json.dump(stats, f, ensure_ascii=False, indent=2)
  174. print(f"\n统计结果已保存到: {output_path}")
  175. def main():
  176. """主函数"""
  177. input_file = "output_v2/stage6_with_evaluations.json"
  178. output_file = "output_v2/stage6_statistics.json"
  179. print("正在加载数据...")
  180. data = load_stage6_results(input_file)
  181. print("正在分析评估结果...")
  182. stats = analyze_evaluation_results(data)
  183. # 打印统计结果
  184. print_statistics(stats)
  185. # 保存结果
  186. save_statistics(stats, output_file)
  187. if __name__ == '__main__':
  188. main()