# test_evaluation_v3.py (15 KB)
  1. """
  2. 测试评估V3模块
  3. 从现有run_context.json读取帖子,使用V3评估模块重新评估,生成统计报告
  4. """
  5. import asyncio
  6. import json
  7. import sys
  8. from pathlib import Path
  9. from datetime import datetime
  10. from collections import defaultdict
  11. # 导入必要的模块
  12. from knowledge_search_traverse import Post
  13. from post_evaluator_v3 import evaluate_post_v3, apply_evaluation_v3_to_post, two_stage_batch_evaluate
  14. async def test_evaluation_v3(run_context_path: str, max_posts: int = 10):
  15. """
  16. 测试V3评估模块
  17. Args:
  18. run_context_path: run_context.json路径
  19. max_posts: 最多评估的帖子数量(用于快速测试)
  20. """
  21. print(f"\n{'='*80}")
  22. print(f"📊 评估V3测试 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  23. print(f"{'='*80}\n")
  24. # 读取run_context.json
  25. print(f"📂 读取: {run_context_path}")
  26. with open(run_context_path, 'r', encoding='utf-8') as f:
  27. run_context = json.load(f)
  28. # 提取原始query
  29. original_query = run_context.get('o', '')
  30. print(f"🔍 原始Query: {original_query}\n")
  31. # 提取所有帖子 (从rounds -> search_results -> post_list)
  32. post_data_list = []
  33. rounds = run_context.get('rounds', [])
  34. for round_idx, round_data in enumerate(rounds):
  35. search_results = round_data.get('search_results', [])
  36. for search_idx, search in enumerate(search_results):
  37. post_list = search.get('post_list', [])
  38. for post_idx, post_data in enumerate(post_list):
  39. # 生成唯一ID
  40. post_id = f"r{round_idx}_s{search_idx}_p{post_idx}"
  41. post_data_list.append((round_idx, search_idx, post_id, post_data))
  42. total_posts = len(post_data_list)
  43. print(f"📝 找到 {total_posts} 个帖子 (来自 {len(rounds)} 轮)")
  44. # 限制评估数量(快速测试)
  45. if max_posts and max_posts < total_posts:
  46. post_data_list = post_data_list[:max_posts]
  47. print(f"⚡ 快速测试模式: 仅评估前 {max_posts} 个帖子\n")
  48. else:
  49. print()
  50. # 将post_data转换为Post对象
  51. posts = []
  52. for round_idx, search_idx, post_id, post_data in post_data_list:
  53. post = Post(
  54. note_id=post_data.get('note_id', post_id),
  55. title=post_data.get('title', ''),
  56. body_text=post_data.get('body_text', ''),
  57. images=post_data.get('images', []),
  58. type=post_data.get('type', 'normal'),
  59. video=post_data.get('video', ''),
  60. interact_info=post_data.get('interact_info', {}),
  61. note_url=post_data.get('note_url', ''),
  62. author_name=post_data.get('author_name', ''),
  63. author_id=post_data.get('author_id', ''),
  64. publish_time=post_data.get('publish_time', 0),
  65. cdn_images=post_data.get('cdn_images', []),
  66. detail_fetched=post_data.get('detail_fetched', False)
  67. )
  68. posts.append((round_idx, search_idx, post_id, post))
  69. # 提取纯post列表用于两阶段评估
  70. post_list = [post for _, _, _, post in posts]
  71. # 使用两阶段批量评估
  72. await two_stage_batch_evaluate(post_list, original_query, quick_concurrent=15, detail_concurrent=15)
  73. # 处理评估结果(两阶段评估已将结果应用到post对象)
  74. results = []
  75. detailed_reports = [] # 收集详细评估报告
  76. print(f"📊 收集评估报告...\n")
  77. for i, (round_idx, search_idx, post_id, post) in enumerate(posts, 1):
  78. # 从post对象的嵌套字段中提取评估结果
  79. knowledge_eval = post.knowledge_evaluation
  80. content_eval = post.content_knowledge_evaluation
  81. purpose_eval = post.purpose_evaluation
  82. category_eval = post.category_evaluation
  83. # 收集详细报告
  84. if knowledge_eval:
  85. detailed_report = {
  86. 'post_index': i,
  87. 'note_id': post.note_id,
  88. 'title': post.title,
  89. 'final_score': post.final_score,
  90. 'match_level': post.match_level,
  91. 'is_knowledge': post.is_knowledge,
  92. 'is_content_knowledge': post.is_content_knowledge,
  93. 'knowledge_score': post.knowledge_score,
  94. 'evaluations': {
  95. 'knowledge': {
  96. 'conclusion': knowledge_eval.get('conclusion') if isinstance(knowledge_eval, dict) else getattr(knowledge_eval, 'conclusion', None),
  97. 'core_evidence': knowledge_eval.get('core_evidence') if isinstance(knowledge_eval, dict) else getattr(knowledge_eval, 'core_evidence', None),
  98. 'issues': knowledge_eval.get('issues') if isinstance(knowledge_eval, dict) else getattr(knowledge_eval, 'issues', None)
  99. },
  100. 'content_knowledge': {
  101. 'summary': content_eval.get('summary') if isinstance(content_eval, dict) else getattr(content_eval, 'summary', None),
  102. 'final_score': content_eval.get('final_score') if isinstance(content_eval, dict) else getattr(content_eval, 'final_score', None),
  103. 'level': content_eval.get('level') if isinstance(content_eval, dict) else getattr(content_eval, 'level', None)
  104. } if content_eval and post.is_content_knowledge else None,
  105. 'purpose': {
  106. 'score': purpose_eval.get('purpose_score') if isinstance(purpose_eval, dict) else getattr(purpose_eval, 'purpose_score', None),
  107. 'core_motivation': purpose_eval.get('core_motivation') if isinstance(purpose_eval, dict) else getattr(purpose_eval, 'core_motivation', None),
  108. 'core_basis': purpose_eval.get('core_basis') if isinstance(purpose_eval, dict) else getattr(purpose_eval, 'core_basis', None),
  109. 'match_level': purpose_eval.get('match_level') if isinstance(purpose_eval, dict) else getattr(purpose_eval, 'match_level', None)
  110. } if purpose_eval else None,
  111. 'category': {
  112. 'score': category_eval.get('category_score') if isinstance(category_eval, dict) else getattr(category_eval, 'category_score', None),
  113. 'core_basis': category_eval.get('core_basis') if isinstance(category_eval, dict) else getattr(category_eval, 'core_basis', None),
  114. 'match_level': category_eval.get('match_level') if isinstance(category_eval, dict) else getattr(category_eval, 'match_level', None)
  115. } if category_eval else None
  116. }
  117. }
  118. detailed_reports.append(detailed_report)
  119. results.append((round_idx, search_idx, post_id, post))
  120. print(f"\n✅ 评估完成: {len(results)}/{len(posts)} 成功\n")
  121. # 更新run_context.json中的帖子数据
  122. print("💾 更新 run_context.json...")
  123. for round_idx, search_idx, post_id, post in results:
  124. # 定位到对应的post_list
  125. if round_idx < len(rounds):
  126. search_results = rounds[round_idx].get('search_results', [])
  127. if search_idx < len(search_results):
  128. post_list = search_results[search_idx].get('post_list', [])
  129. # 找到对应的帖子并更新
  130. for p in post_list:
  131. if p.get('note_id') == post.note_id:
  132. # 更新V3顶层字段
  133. p['is_knowledge'] = post.is_knowledge
  134. p['is_content_knowledge'] = post.is_content_knowledge
  135. p['knowledge_score'] = post.knowledge_score
  136. p['purpose_score'] = post.purpose_score
  137. p['category_score'] = post.category_score
  138. p['final_score'] = post.final_score
  139. p['match_level'] = post.match_level
  140. p['evaluation_time'] = post.evaluation_time
  141. p['evaluator_version'] = post.evaluator_version
  142. # 更新V3嵌套字段
  143. p['knowledge_evaluation'] = post.knowledge_evaluation
  144. p['content_knowledge_evaluation'] = post.content_knowledge_evaluation
  145. p['purpose_evaluation'] = post.purpose_evaluation
  146. p['category_evaluation'] = post.category_evaluation
  147. break
  148. # 保存更新后的run_context.json
  149. output_path = run_context_path.replace('.json', '_v3.json')
  150. with open(output_path, 'w', encoding='utf-8') as f:
  151. json.dump(run_context, f, ensure_ascii=False, indent=2)
  152. print(f"✅ 已保存: {output_path}")
  153. # 保存详细评估报告
  154. report_path = run_context_path.replace('.json', '_evaluation_report.json')
  155. evaluation_report = {
  156. 'metadata': {
  157. 'original_query': original_query,
  158. 'total_posts': len(results),
  159. 'evaluation_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  160. 'evaluator_version': 'v3.0'
  161. },
  162. 'detailed_reports': detailed_reports
  163. }
  164. with open(report_path, 'w', encoding='utf-8') as f:
  165. json.dump(evaluation_report, f, ensure_ascii=False, indent=2)
  166. print(f"📄 已保存详细评估报告: {report_path}\n")
  167. # 生成统计报告
  168. print(f"\n{'='*80}")
  169. print("📊 统计报告")
  170. print(f"{'='*80}\n")
  171. # Prompt1: 是否是知识
  172. is_knowledge_counts = defaultdict(int)
  173. for _, _, _, post in results:
  174. if post.is_knowledge:
  175. is_knowledge_counts['是知识'] += 1
  176. else:
  177. is_knowledge_counts['非知识'] += 1
  178. total = len(results)
  179. print("🔍 Prompt1 - 是否是知识:")
  180. print(f" 是知识: {is_knowledge_counts['是知识']:3d} / {total} ({is_knowledge_counts['是知识']/total*100:.1f}%)")
  181. print(f" 非知识: {is_knowledge_counts['非知识']:3d} / {total} ({is_knowledge_counts['非知识']/total*100:.1f}%)")
  182. print()
  183. # Prompt2: 是否是内容知识
  184. is_content_knowledge_counts = defaultdict(int)
  185. knowledge_scores = []
  186. for _, _, _, post in results:
  187. if post.is_content_knowledge is not None:
  188. if post.is_content_knowledge:
  189. is_content_knowledge_counts['是内容知识'] += 1
  190. else:
  191. is_content_knowledge_counts['非内容知识'] += 1
  192. if post.knowledge_score is not None:
  193. knowledge_scores.append(post.knowledge_score)
  194. if is_content_knowledge_counts:
  195. content_total = sum(is_content_knowledge_counts.values())
  196. print("📚 Prompt2 - 是否是内容知识:")
  197. print(f" 是内容知识: {is_content_knowledge_counts['是内容知识']:3d} / {content_total} ({is_content_knowledge_counts['是内容知识']/content_total*100:.1f}%)")
  198. if is_content_knowledge_counts['非内容知识'] > 0:
  199. print(f" 非内容知识: {is_content_knowledge_counts['非内容知识']:3d} / {content_total} ({is_content_knowledge_counts['非内容知识']/content_total*100:.1f}%)")
  200. print()
  201. if knowledge_scores:
  202. avg_score = sum(knowledge_scores) / len(knowledge_scores)
  203. print(f" 知识平均得分: {avg_score:.1f}分")
  204. print(f" 知识最高得分: {max(knowledge_scores):.0f}分")
  205. print(f" 知识最低得分: {min(knowledge_scores):.0f}分")
  206. print()
  207. # Prompt3 & Prompt4: 目的性和品类匹配
  208. purpose_scores = []
  209. category_scores = []
  210. final_scores = []
  211. match_level_counts = defaultdict(int)
  212. for _, _, _, post in results:
  213. if post.purpose_score is not None:
  214. purpose_scores.append(post.purpose_score)
  215. if post.category_score is not None:
  216. category_scores.append(post.category_score)
  217. if post.final_score is not None:
  218. final_scores.append(post.final_score)
  219. if post.match_level:
  220. match_level_counts[post.match_level] += 1
  221. if purpose_scores:
  222. avg_purpose = sum(purpose_scores) / len(purpose_scores)
  223. print("🎯 Prompt3 - 目的性匹配:")
  224. print(f" 平均得分: {avg_purpose:.1f}分")
  225. print(f" 最高得分: {max(purpose_scores):.0f}分")
  226. print(f" 最低得分: {min(purpose_scores):.0f}分")
  227. print()
  228. if category_scores:
  229. avg_category = sum(category_scores) / len(category_scores)
  230. print("🏷️ Prompt4 - 品类匹配:")
  231. print(f" 平均得分: {avg_category:.1f}分")
  232. print(f" 最高得分: {max(category_scores):.0f}分")
  233. print(f" 最低得分: {min(category_scores):.0f}分")
  234. print()
  235. if final_scores:
  236. avg_final = sum(final_scores) / len(final_scores)
  237. print("🔥 综合得分 (目的性70% + 品类30%):")
  238. print(f" 平均得分: {avg_final:.2f}分")
  239. print(f" 最高得分: {max(final_scores):.2f}分")
  240. print(f" 最低得分: {min(final_scores):.2f}分")
  241. print()
  242. if match_level_counts:
  243. print("📊 匹配等级分布:")
  244. for level in ['高度匹配', '基本匹配', '部分匹配', '弱匹配', '不匹配']:
  245. count = match_level_counts.get(level, 0)
  246. if count > 0:
  247. bar = '█' * int(count / total * 50)
  248. print(f" {level:8s}: {count:3d} / {total} ({count/total*100:.1f}%) {bar}")
  249. print()
  250. # 综合分析
  251. print("🌟 高质量内容统计:")
  252. # 是知识 + 是内容知识
  253. is_quality_knowledge = sum(
  254. 1 for _, _, _, post in results
  255. if post.is_knowledge and post.is_content_knowledge
  256. )
  257. print(f" 知识内容: {is_quality_knowledge} / {total} ({is_quality_knowledge/total*100:.1f}%)")
  258. # 是知识 + 是内容知识 + 高度匹配
  259. high_match = sum(
  260. 1 for _, _, _, post in results
  261. if post.is_knowledge and post.is_content_knowledge and post.match_level == '高度匹配'
  262. )
  263. print(f" 高度匹配: {high_match} / {total} ({high_match/total*100:.1f}%)")
  264. # 是知识 + 是内容知识 + 综合得分>=70
  265. high_score = sum(
  266. 1 for _, _, _, post in results
  267. if post.is_knowledge and post.is_content_knowledge and post.final_score and post.final_score >= 70
  268. )
  269. print(f" 得分≥70: {high_score} / {total} ({high_score/total*100:.1f}%)")
  270. print()
  271. print(f"{'='*80}\n")
  272. return results
  273. if __name__ == "__main__":
  274. if len(sys.argv) < 2:
  275. print("用法: python3 test_evaluation_v3.py <run_context.json路径> [最大评估数量]")
  276. print()
  277. print("示例:")
  278. print(" python3 test_evaluation_v3.py input/test_case/output/knowledge_search_traverse/20251112/173512_dc/run_context.json")
  279. print(" python3 test_evaluation_v3.py input/test_case/output/knowledge_search_traverse/20251112/173512_dc/run_context.json 20")
  280. sys.exit(1)
  281. run_context_path = sys.argv[1]
  282. max_posts = int(sys.argv[2]) if len(sys.argv) > 2 else None
  283. asyncio.run(test_evaluation_v3(run_context_path, max_posts))