"""
Test driver for the V3 post evaluation module.

Reads the posts stored in an existing run_context.json, re-evaluates them with
the V3 evaluator, writes the updated context and a detailed report next to the
input file, and prints a statistics summary.
"""

import asyncio
import json
import sys
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from typing import Optional

# Project modules exercised by this script
from knowledge_search_traverse import Post
from post_evaluator_v3 import evaluate_post_v3, apply_evaluation_v3_to_post


# Post attributes copied back into the raw run_context dicts after evaluation:
# first the V3 top-level fields, then the V3 nested evaluation fields.
_V3_POST_FIELDS = (
    'is_knowledge',
    'is_content_knowledge',
    'knowledge_score',
    'purpose_score',
    'category_score',
    'final_score',
    'match_level',
    'evaluation_time',
    'evaluator_version',
    'knowledge_evaluation',
    'content_knowledge_evaluation',
    'purpose_evaluation',
    'category_evaluation',
)


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


def _derive_output_path(run_context_path, tail):
    """Build a sibling path whose file name ends with ``tail``.

    A trailing ``.json`` extension on the file name is replaced by ``tail``;
    any other name simply gets ``tail`` appended. Only the final path
    component is touched, so directory names containing ``.json`` are left
    alone and the result never equals the input path.
    """
    source = Path(run_context_path)
    if source.suffix.lower() == '.json':
        return str(source.with_name(source.stem + tail))
    return str(source.with_name(source.name + tail))


def _collect_post_data(rounds):
    """Flatten rounds -> search_results -> post_list into a list of tuples.

    Each tuple is ``(round_idx, search_idx, post_id, post_data)`` where
    ``post_id`` is a synthetic unique id derived from the three indices.
    """
    collected = []
    for round_idx, round_data in enumerate(rounds):
        for search_idx, search in enumerate(round_data.get('search_results', [])):
            for post_idx, post_data in enumerate(search.get('post_list', [])):
                post_id = f"r{round_idx}_s{search_idx}_p{post_idx}"
                collected.append((round_idx, search_idx, post_id, post_data))
    return collected


def _build_post(post_data, fallback_id):
    """Create a ``Post`` from a raw dict, defaulting every missing field.

    ``fallback_id`` is used as ``note_id`` when the raw dict has none.
    """
    return Post(
        note_id=post_data.get('note_id', fallback_id),
        title=post_data.get('title', ''),
        body_text=post_data.get('body_text', ''),
        images=post_data.get('images', []),
        type=post_data.get('type', 'normal'),
        video=post_data.get('video', ''),
        interact_info=post_data.get('interact_info', {}),
        note_url=post_data.get('note_url', ''),
        author_name=post_data.get('author_name', ''),
        author_id=post_data.get('author_id', ''),
        publish_time=post_data.get('publish_time', 0),
        cdn_images=post_data.get('cdn_images', []),
        detail_fetched=post_data.get('detail_fetched', False),
    )


def _print_evaluation_outcome(knowledge_eval, content_eval, purpose_eval,
                              category_eval, final_score, match_level):
    """Print the one-line verdict and the per-stage reasons for one post."""
    if final_score is not None:
        print(f" → {match_level} ({final_score:.1f}分)")
    elif content_eval and not content_eval.is_content_knowledge:
        print(" → 非内容知识")
    elif not knowledge_eval.is_knowledge:
        print(" → 非知识")
    else:
        print(" → 评估未完成")

    # Detailed reasons behind each judgement
    print(f" 📝 知识评估: {knowledge_eval.conclusion or '无'}")
    if content_eval and content_eval.is_content_knowledge:
        summary = content_eval.summary[:80] if content_eval.summary else '无'
        print(f" 📚 内容知识: {summary}...")
    if purpose_eval:
        basis = purpose_eval.core_basis[:80] if purpose_eval.core_basis else '无'
        print(f" 🎯 目的匹配: {basis}...")
    if category_eval:
        basis = category_eval.core_basis[:80] if category_eval.core_basis else '无'
        print(f" 🏷️ 品类匹配: {basis}...")
    print()


def _build_detailed_report(index, post, knowledge_eval, content_eval,
                           purpose_eval, category_eval, final_score, match_level):
    """Assemble the JSON-serialisable detailed report entry for one post."""
    content_section = None
    if content_eval and content_eval.is_content_knowledge:
        content_section = {
            'summary': content_eval.summary,
            'final_score': content_eval.final_score,
            'level': content_eval.level,
        }

    purpose_section = None
    if purpose_eval:
        purpose_section = {
            'score': purpose_eval.purpose_score,
            'core_motivation': purpose_eval.core_motivation,
            'core_basis': purpose_eval.core_basis,
            'match_level': purpose_eval.match_level,
        }

    category_section = None
    if category_eval:
        category_section = {
            'score': category_eval.category_score,
            'core_basis': category_eval.core_basis,
            'match_level': category_eval.match_level,
        }

    return {
        'post_index': index,
        'note_id': post.note_id,
        'title': post.title,
        'final_score': final_score,
        'match_level': match_level,
        'is_knowledge': knowledge_eval.is_knowledge if knowledge_eval else None,
        'is_content_knowledge': content_eval.is_content_knowledge if content_eval else None,
        'knowledge_score': content_eval.final_score if content_eval else None,
        'evaluations': {
            'knowledge': {
                'conclusion': knowledge_eval.conclusion if knowledge_eval else None,
                # These two attributes are optional on the evaluation object.
                'core_evidence': getattr(knowledge_eval, 'core_evidence', None) if knowledge_eval else None,
                'issues': getattr(knowledge_eval, 'issues', None) if knowledge_eval else None,
            },
            'content_knowledge': content_section,
            'purpose': purpose_section,
            'category': category_section,
        },
    }


def _print_score_summary(scores, avg_spec, extreme_spec, header=None, prefix=''):
    """Print average / max / min of ``scores``; prints nothing when empty."""
    if not scores:
        return
    if header is not None:
        print(header)
    average = sum(scores) / len(scores)
    print(f" {prefix}平均得分: {format(average, avg_spec)}分")
    print(f" {prefix}最高得分: {format(max(scores), extreme_spec)}分")
    print(f" {prefix}最低得分: {format(min(scores), extreme_spec)}分")
    print()


def _print_statistics(results):
    """Print the statistics report for the successfully evaluated posts.

    ``results`` is the list of ``(round_idx, search_idx, post_id, post)``
    tuples. All ratios use a zero-safe percentage so an empty ``results``
    list produces a report of zeros instead of raising ZeroDivisionError.
    """
    evaluated = [post for _, _, _, post in results]
    total = len(evaluated)

    print(f"\n{'='*80}")
    print("📊 统计报告")
    print(f"{'='*80}\n")

    # Prompt1: is it knowledge?
    knowledge_yes = sum(1 for post in evaluated if post.is_knowledge)
    knowledge_no = total - knowledge_yes
    print("🔍 Prompt1 - 是否是知识:")
    print(f" 是知识: {knowledge_yes:3d} / {total} ({_percent(knowledge_yes, total):.1f}%)")
    print(f" 非知识: {knowledge_no:3d} / {total} ({_percent(knowledge_no, total):.1f}%)")
    print()

    # Prompt2: is it content knowledge? Only posts that reached this stage count.
    content_yes = 0
    content_no = 0
    knowledge_scores = []
    for post in evaluated:
        if post.is_content_knowledge is None:
            continue
        if post.is_content_knowledge:
            content_yes += 1
        else:
            content_no += 1
        if post.knowledge_score is not None:
            knowledge_scores.append(post.knowledge_score)

    content_total = content_yes + content_no
    if content_total:
        print("📚 Prompt2 - 是否是内容知识:")
        print(f" 是内容知识: {content_yes:3d} / {content_total} ({_percent(content_yes, content_total):.1f}%)")
        if content_no > 0:
            print(f" 非内容知识: {content_no:3d} / {content_total} ({_percent(content_no, content_total):.1f}%)")
        print()
        _print_score_summary(knowledge_scores, '.1f', '.0f', prefix='知识')

    # Prompt3 & Prompt4: purpose and category matching
    purpose_scores = [p.purpose_score for p in evaluated if p.purpose_score is not None]
    category_scores = [p.category_score for p in evaluated if p.category_score is not None]
    final_scores = [p.final_score for p in evaluated if p.final_score is not None]
    match_level_counts = defaultdict(int)
    for post in evaluated:
        if post.match_level:
            match_level_counts[post.match_level] += 1

    _print_score_summary(purpose_scores, '.1f', '.0f', header="🎯 Prompt3 - 目的性匹配:")
    _print_score_summary(category_scores, '.1f', '.0f', header="🏷️ Prompt4 - 品类匹配:")
    _print_score_summary(final_scores, '.2f', '.2f', header="🔥 综合得分 (目的性70% + 品类30%):")

    if match_level_counts:
        print("📊 匹配等级分布:")
        for level in ['高度匹配', '基本匹配', '部分匹配', '弱匹配', '不匹配']:
            count = match_level_counts.get(level, 0)
            if count > 0:
                # count > 0 implies total > 0, so this division is safe.
                bar = '█' * int(count / total * 50)
                print(f" {level:8s}: {count:3d} / {total} ({_percent(count, total):.1f}%) {bar}")
        print()

    # Combined view: posts that are both knowledge and content knowledge
    print("🌟 高质量内容统计:")
    quality_posts = [p for p in evaluated if p.is_knowledge and p.is_content_knowledge]

    is_quality_knowledge = len(quality_posts)
    print(f" 知识内容: {is_quality_knowledge} / {total} ({_percent(is_quality_knowledge, total):.1f}%)")

    high_match = sum(1 for p in quality_posts if p.match_level == '高度匹配')
    print(f" 高度匹配: {high_match} / {total} ({_percent(high_match, total):.1f}%)")

    high_score = sum(1 for p in quality_posts if p.final_score and p.final_score >= 70)
    print(f" 得分≥70: {high_score} / {total} ({_percent(high_score, total):.1f}%)")
    print()

    print(f"{'='*80}\n")


async def test_evaluation_v3(run_context_path: str, max_posts: Optional[int] = 10,
                             concurrency: int = 5):
    """
    Re-evaluate the posts of a run_context.json with the V3 evaluator.

    Writes two files next to the input: ``<name>_v3.json`` (the context with
    the V3 fields written into each evaluated post) and
    ``<name>_evaluation_report.json`` (per-post detailed report), then prints
    a statistics summary.

    Args:
        run_context_path: Path of the run_context.json to read.
        max_posts: Maximum number of posts to evaluate (quick-test mode).
            ``None`` or 0 evaluates every post.
        concurrency: Size of the semaphore handed to ``evaluate_post_v3``.

    Returns:
        List of ``(round_idx, search_idx, post_id, post)`` tuples for the
        posts whose evaluation succeeded.
    """
    print(f"\n{'='*80}")
    print(f"📊 评估V3测试 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*80}\n")

    # Load run_context.json
    print(f"📂 读取: {run_context_path}")
    with open(run_context_path, 'r', encoding='utf-8') as f:
        run_context = json.load(f)

    # The original query is read from the 'o' key of the context.
    original_query = run_context.get('o', '')
    print(f"🔍 原始Query: {original_query}\n")

    # Gather every post (rounds -> search_results -> post_list)
    rounds = run_context.get('rounds', [])
    post_data_list = _collect_post_data(rounds)

    total_posts = len(post_data_list)
    print(f"📝 找到 {total_posts} 个帖子 (来自 {len(rounds)} 轮)")

    # Optionally cap the number of posts for a quick run
    if max_posts and max_posts < total_posts:
        post_data_list = post_data_list[:max_posts]
        print(f"⚡ 快速测试模式: 仅评估前 {max_posts} 个帖子\n")
    else:
        print()

    # Convert raw dicts to Post objects. Keep a handle on each raw dict so
    # results are written back to exactly the entry they came from, even when
    # note_id is missing or duplicated within a post_list.
    posts = []
    raw_by_post_id = {}
    for round_idx, search_idx, post_id, post_data in post_data_list:
        posts.append((round_idx, search_idx, post_id, _build_post(post_data, post_id)))
        raw_by_post_id[post_id] = post_data

    # Evaluate all posts concurrently; the semaphore bounds parallelism.
    print(f"🚀 开始并行评估 (最多{len(posts)}个任务,并发限制: {concurrency})...\n")
    semaphore = asyncio.Semaphore(concurrency)
    pending = [
        evaluate_post_v3(post, original_query, semaphore)
        for _, _, _, post in posts
    ]
    # return_exceptions=True: one failing post must not discard the others.
    all_eval_results = await asyncio.gather(*pending, return_exceptions=True)

    # Process the results
    results = []
    detailed_reports = []
    print("📊 处理评估结果...\n")

    for i, ((round_idx, search_idx, post_id, post), eval_result) in enumerate(
            zip(posts, all_eval_results), 1):
        print(f" [{i}/{len(posts)}] {post.note_id} - {post.title[:40]}", end="")

        if isinstance(eval_result, BaseException):
            print(f" → ❌ 评估失败: {eval_result!r}\n")
            continue

        (knowledge_eval, content_eval, purpose_eval,
         category_eval, final_score, match_level) = eval_result

        if not knowledge_eval:
            print(" → ❌ 评估失败\n")
            continue

        _print_evaluation_outcome(
            knowledge_eval, content_eval, purpose_eval,
            category_eval, final_score, match_level
        )

        detailed_reports.append(_build_detailed_report(
            i, post, knowledge_eval, content_eval,
            purpose_eval, category_eval, final_score, match_level
        ))

        # Store the evaluation on the Post object
        apply_evaluation_v3_to_post(
            post, knowledge_eval, content_eval, purpose_eval,
            category_eval, final_score, match_level
        )
        results.append((round_idx, search_idx, post_id, post))

    print(f"\n✅ 评估完成: {len(results)}/{len(posts)} 成功\n")

    # Write the V3 fields back into the raw post dicts of run_context
    print("💾 更新 run_context.json...")
    for _, _, post_id, post in results:
        raw_post = raw_by_post_id[post_id]
        for field in _V3_POST_FIELDS:
            raw_post[field] = getattr(post, field)

    # Save the updated context under a new name (the input is left untouched)
    output_path = _derive_output_path(run_context_path, '_v3.json')
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(run_context, f, ensure_ascii=False, indent=2)
    print(f"✅ 已保存: {output_path}")

    # Save the detailed evaluation report
    report_path = _derive_output_path(run_context_path, '_evaluation_report.json')
    evaluation_report = {
        'metadata': {
            'original_query': original_query,
            'total_posts': len(results),
            'evaluation_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'evaluator_version': 'v3.0'
        },
        'detailed_reports': detailed_reports
    }
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(evaluation_report, f, ensure_ascii=False, indent=2)
    print(f"📄 已保存详细评估报告: {report_path}\n")

    # Print the statistics report
    _print_statistics(results)

    return results


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("用法: python3 test_evaluation_v3.py <run_context.json路径> [最大评估数量]")
        print()
        print("示例:")
        print(" python3 test_evaluation_v3.py input/test_case/output/knowledge_search_traverse/20251112/173512_dc/run_context.json")
        print(" python3 test_evaluation_v3.py input/test_case/output/knowledge_search_traverse/20251112/173512_dc/run_context.json 20")
        sys.exit(1)

    run_context_path = sys.argv[1]
    # No second argument means "evaluate every post".
    max_posts = int(sys.argv[2]) if len(sys.argv) > 2 else None

    asyncio.run(test_evaluation_v3(run_context_path, max_posts))