"""Exercise the V2 post evaluator against an existing run.

Reads the posts stored in a ``run_context.json`` file, re-evaluates them with
the V2 evaluation module, writes the updated context next to the input file
(with a ``_v2`` suffix) and prints a statistics report.
"""

import asyncio
import json
import sys
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Optional

# Project modules
from knowledge_search_traverse import Post
from post_evaluator_v2 import evaluate_post_v2, apply_evaluation_v2_to_post

# Relevance score at or above which a post is counted as "high relevance".
HIGH_RELEVANCE_THRESHOLD = 70

# Post attributes copied back into the JSON post entry, in output order:
# the flat top-level fields first, then the two nested evaluation payloads.
_EVALUATION_FIELDS = (
    'is_knowledge',
    'knowledge_reason',
    'knowledge_score',
    'knowledge_level',
    'relevance_score',
    'relevance_level',
    'relevance_reason',
    'relevance_conclusion',
    'evaluation_time',
    'evaluator_version',
    'knowledge_evaluation',
    'relevance_evaluation',
)


def _percent(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
    return count / total * 100 if total else 0.0


def _collect_post_data(rounds):
    """Flatten rounds -> search_results -> post_list into a list of tuples.

    Each tuple is ``(round_idx, search_idx, post_id, post_data)`` where
    ``post_id`` is a positional identifier (``r<round>_s<search>_p<post>``)
    and ``post_data`` is the original dict from the loaded JSON, so it can be
    updated in place later.
    """
    collected = []
    for round_idx, round_data in enumerate(rounds):
        for search_idx, search in enumerate(round_data.get('search_results', [])):
            for post_idx, post_data in enumerate(search.get('post_list', [])):
                post_id = f"r{round_idx}_s{search_idx}_p{post_idx}"
                collected.append((round_idx, search_idx, post_id, post_data))
    return collected


def _print_report(results):
    """Print knowledge / relevance statistics for the evaluated posts.

    ``results`` is the list returned by :func:`test_evaluation_v2`; only the
    ``Post`` object (fourth element of each tuple) is read. An empty list is
    reported as zero counts instead of raising ``ZeroDivisionError``.
    """
    evaluated_posts = [item[3] for item in results]
    total = len(evaluated_posts)

    print(f"\n{'='*80}")
    print("📊 统计报告")
    print(f"{'='*80}\n")

    # --- Knowledge evaluation statistics ---
    knowledge_counts = defaultdict(int)
    knowledge_level_counts = defaultdict(int)
    knowledge_scores = []
    for post in evaluated_posts:
        knowledge_counts['知识内容' if post.is_knowledge else '非知识内容'] += 1
        if post.knowledge_level:
            knowledge_level_counts[post.knowledge_level] += 1
        if post.knowledge_score is not None:
            knowledge_scores.append(post.knowledge_score)

    print("📚 知识评估:")
    print(f" 知识内容: {knowledge_counts['知识内容']:3d} / {total} ({_percent(knowledge_counts['知识内容'], total):.1f}%)")
    print(f" 非知识内容: {knowledge_counts['非知识内容']:3d} / {total} ({_percent(knowledge_counts['非知识内容'], total):.1f}%)")
    print()

    if knowledge_scores:
        avg_score = sum(knowledge_scores) / len(knowledge_scores)
        print(f" 平均得分: {avg_score:.1f}分")
        print(f" 最高得分: {max(knowledge_scores):.0f}分")
        print(f" 最低得分: {min(knowledge_scores):.0f}分")
        print()

    print(" 星级分布:")
    for level in range(1, 6):
        count = knowledge_level_counts.get(level, 0)
        bar = '★' * count
        print(f" {level}星: {count:3d} {bar}")
    print()

    # --- Relevance evaluation statistics ---
    relevance_conclusion_counts = defaultdict(int)
    relevance_scores = []
    purpose_scores = []
    category_scores = []
    for post in evaluated_posts:
        if post.relevance_conclusion:
            relevance_conclusion_counts[post.relevance_conclusion] += 1
        if post.relevance_score is not None:
            relevance_scores.append(post.relevance_score)
        if post.relevance_evaluation:
            if 'purpose_score' in post.relevance_evaluation:
                purpose_scores.append(post.relevance_evaluation['purpose_score'])
            if 'category_score' in post.relevance_evaluation:
                category_scores.append(post.relevance_evaluation['category_score'])

    print("🎯 相关性评估:")
    for conclusion in ['高度匹配', '中度匹配', '低度匹配', '不匹配']:
        count = relevance_conclusion_counts.get(conclusion, 0)
        if count > 0:
            print(f" {conclusion}: {count:3d} / {total} ({_percent(count, total):.1f}%)")
    print()

    if relevance_scores:
        avg_score = sum(relevance_scores) / len(relevance_scores)
        high_relevance = sum(1 for s in relevance_scores if s >= HIGH_RELEVANCE_THRESHOLD)
        print(f" 平均得分: {avg_score:.1f}分")
        print(f" 高相关性: {high_relevance} / {total} ({_percent(high_relevance, total):.1f}%) [≥70分]")
        print(f" 最高得分: {max(relevance_scores):.0f}分")
        print(f" 最低得分: {min(relevance_scores):.0f}分")
        print()

    if purpose_scores and category_scores:
        avg_purpose = sum(purpose_scores) / len(purpose_scores)
        avg_category = sum(category_scores) / len(category_scores)
        print(f" 目的性平均: {avg_purpose:.1f}分 (权重70%)")
        print(f" 品类平均: {avg_category:.1f}分 (权重30%)")
        print()

    # --- Combined view: knowledge content that is also highly relevant ---
    print("🔥 高质量内容 (知识内容 + 高相关性):")
    high_quality = sum(
        1 for post in evaluated_posts
        if post.is_knowledge
        and post.relevance_score
        and post.relevance_score >= HIGH_RELEVANCE_THRESHOLD
    )
    print(f" {high_quality} / {total} ({_percent(high_quality, total):.1f}%)")
    print()
    print(f"{'='*80}\n")


async def test_evaluation_v2(
    run_context_path: str,
    max_posts: Optional[int] = 10,
    concurrency: int = 5,
):
    """Re-evaluate the posts of a saved run with the V2 evaluator.

    Args:
        run_context_path: Path to the ``run_context.json`` file to read.
        max_posts: Maximum number of posts to evaluate (quick-test mode).
            ``None`` or ``0`` evaluates every post found.
        concurrency: Upper bound on evaluations in flight at once; passed to
            ``evaluate_post_v2`` as an ``asyncio.Semaphore``.

    Returns:
        A list of ``(round_idx, search_idx, post_id, post, knowledge_eval,
        relevance_eval)`` tuples, one per successfully evaluated post.

    Raises:
        ValueError: If ``concurrency`` is smaller than 1.

    Side effects:
        Writes the updated context to ``<stem>_v2<suffix>`` next to the input
        file and prints progress plus a statistics report to stdout.
    """
    if concurrency < 1:
        raise ValueError("concurrency must be >= 1")

    print(f"\n{'='*80}")
    print(f"📊 评估V2测试 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*80}\n")

    # Load run_context.json
    print(f"📂 读取: {run_context_path}")
    with open(run_context_path, 'r', encoding='utf-8') as f:
        run_context = json.load(f)

    # The original query is stored under the 'o' key.
    original_query = run_context.get('o', '')
    print(f"🔍 原始Query: {original_query}\n")

    # Gather every post (rounds -> search_results -> post_list)
    rounds = run_context.get('rounds', [])
    post_data_list = _collect_post_data(rounds)
    total_posts = len(post_data_list)
    print(f"📝 找到 {total_posts} 个帖子 (来自 {len(rounds)} 轮)")

    # Optionally cap the number of posts for a quick test run
    if max_posts and max_posts < total_posts:
        post_data_list = post_data_list[:max_posts]
        print(f"⚡ 快速测试模式: 仅评估前 {max_posts} 个帖子\n")
    else:
        print()

    # Build Post objects; the raw dict is kept alongside so the results can be
    # written back to exactly the entry they came from.
    posts = []
    for round_idx, search_idx, post_id, post_data in post_data_list:
        post = Post(
            note_id=post_data.get('note_id', post_id),
            title=post_data.get('title', ''),
            body_text=post_data.get('body_text', ''),
            images=post_data.get('images', []),
            type=post_data.get('type', 'normal'),
        )
        posts.append((round_idx, search_idx, post_id, post_data, post))

    # Batch evaluation. Each coroutine is scheduled as a task immediately so
    # the evaluations overlap (bounded by the semaphore); awaiting bare
    # coroutines one by one would run them strictly sequentially.
    print(f"🚀 开始批量评估 (并发数: {concurrency})...\n")
    semaphore = asyncio.Semaphore(concurrency)
    tasks = [
        (
            round_idx, search_idx, post_id, post_data, post,
            asyncio.create_task(evaluate_post_v2(post, original_query, semaphore)),
        )
        for round_idx, search_idx, post_id, post_data, post in posts
    ]

    results = []
    evaluated_sources = []  # (post_data, post) pairs to write back
    try:
        for i, (round_idx, search_idx, post_id, post_data, post, task) in enumerate(tasks, 1):
            print(f" [{i}/{len(tasks)}] 评估: {post.note_id}")
            knowledge_eval, relevance_eval = await task
            if knowledge_eval:
                # Apply the result; relevance_eval may be missing when only
                # the knowledge evaluation was produced.
                apply_evaluation_v2_to_post(post, knowledge_eval, relevance_eval)
                results.append(
                    (round_idx, search_idx, post_id, post, knowledge_eval, relevance_eval)
                )
                evaluated_sources.append((post_data, post))
                # Details are presumably printed by evaluate_post_v2 itself,
                # so they are not repeated here.
            else:
                print(f" ❌ 评估失败")
    finally:
        # If an evaluation raised, do not leave the remaining tasks running.
        for *_, task in tasks:
            if not task.done():
                task.cancel()

    print(f"\n✅ 评估完成: {len(results)}/{len(posts)} 成功\n")

    # Write the evaluation fields back into the loaded context, directly on
    # the dict each Post was built from (works even when the entry has no
    # 'note_id' or when several entries share one).
    print("💾 更新 run_context.json...")
    for post_data, post in evaluated_sources:
        for field in _EVALUATION_FIELDS:
            post_data[field] = getattr(post, field)

    # Save next to the input as <stem>_v2<suffix>; only the file name is
    # altered, so the input file is never overwritten.
    source_path = Path(run_context_path)
    output_path = source_path.with_name(f"{source_path.stem}_v2{source_path.suffix}")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(run_context, f, ensure_ascii=False, indent=2)
    print(f"✅ 已保存: {output_path}\n")

    _print_report(results)
    return results


def _main():
    """Command-line entry point: ``<run_context.json> [max_posts]``."""
    if len(sys.argv) < 2:
        print("用法: python3 test_evaluation_v2.py <run_context.json路径> [最大评估数量]")
        print()
        print("示例:")
        print(" python3 test_evaluation_v2.py input/test_case/output/knowledge_search_traverse/20251112/173512_dc/run_context.json")
        print(" python3 test_evaluation_v2.py input/test_case/output/knowledge_search_traverse/20251112/173512_dc/run_context.json 20")
        sys.exit(1)

    run_context_path = sys.argv[1]
    # Without a second argument every post is evaluated.
    max_posts = int(sys.argv[2]) if len(sys.argv) > 2 else None
    asyncio.run(test_evaluation_v2(run_context_path, max_posts))


if __name__ == "__main__":
    _main()