2 mēneši atpakaļ · 74943c0838
--- a/test_evaluation_v3.py
+++ b/test_evaluation_v3.py
@@ -67,7 +67,15 @@ async def test_evaluation_v3(run_context_path: str, max_posts: int = 10):
 
				             title=post_data.get('title', ''),
			
 
				             body_text=post_data.get('body_text', ''),
			
 
				             images=post_data.get('images', []),
			
 
				-            type=post_data.get('type', 'normal')
			
 
				+            type=post_data.get('type', 'normal'),
			
 
				+            video=post_data.get('video', ''),
			
 
				+            interact_info=post_data.get('interact_info', {}),
			
 
				+            note_url=post_data.get('note_url', ''),
			
 
				+            author_name=post_data.get('author_name', ''),
			
 
				+            author_id=post_data.get('author_id', ''),
			
 
				+            publish_time=post_data.get('publish_time', 0),
			
 
				+            cdn_images=post_data.get('cdn_images', []),
			
 
				+            detail_fetched=post_data.get('detail_fetched', False)
			
 
				         )
			
 
				         posts.append((round_idx, search_idx, post_id, post))
			
 
				 
			
--- a/test_evaluation_v4.py
+++ b/test_evaluation_v4.py
@@ -0,0 +1,362 @@
 
				+"""
			
 
				+测试评估V4模块 (LangGraph + Gemini)
			
 
				+从现有run_context.json读取帖子,使用V4评估模块重新评估,生成统计报告
			
 
				+"""
			
 
				+
			
 
				+import asyncio
			
 
				+import json
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+from datetime import datetime
			
 
				+from collections import defaultdict
			
 
				+
			
 
				+# 导入必要的模块
			
 
				+from knowledge_search_traverse import Post
			
 
				+from post_evaluator_v4_langgraph import evaluate_post_v4, apply_evaluation_v4_to_post
			
 
				+
			
 
				+
			
 
				+async def test_evaluation_v4(run_context_path: str, max_posts: int = 20):
				+    """
				+    Re-evaluate posts from an existing run_context.json with the V4 evaluator.
				+
				+    Loads the run context, collects every post found under
				+    rounds -> search_results -> post_list, evaluates up to ``max_posts`` of
				+    them through ``evaluate_post_v4`` (sharing one ``asyncio.Semaphore(5)``),
				+    writes the evaluation fields back into the loaded post dicts, saves
				+    ``<stem>_v4.json`` and ``<stem>_evaluation_report_v4.json`` next to the
				+    input file, and prints a statistics report.
				+
				+    Args:
				+        run_context_path: Path to run_context.json.
				+        max_posts: Maximum number of posts to evaluate (for quick tests).
				+            A falsy value, or a value not smaller than the number of posts,
				+            evaluates every post.
				+
				+    Returns:
				+        A list of ``(round_idx, search_idx, post_id, post)`` tuples, one per
				+        post whose evaluation produced a knowledge evaluation.
				+    """
				+    def percent(count, denom):
				+        # Every ratio in the report goes through here so that an empty
				+        # result set (no posts, or every evaluation failed) prints 0.0%
				+        # instead of raising ZeroDivisionError.
				+        return count / denom * 100 if denom else 0.0
				+
				+    print(f"\n{'='*80}")
				+    print(f"📊 评估V4测试 (LangGraph + Gemini) - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
				+    print(f"{'='*80}\n")
				+
				+    # Load run_context.json
				+    print(f"📂 读取: {run_context_path}")
				+    with open(run_context_path, 'r', encoding='utf-8') as f:
				+        run_context = json.load(f)
				+
				+    # Original query, read from the 'o' key of the run context
				+    original_query = run_context.get('o', '')
				+    print(f"🔍 原始Query: {original_query}\n")
				+
				+    # Collect all posts (rounds -> search_results -> post_list)
				+    post_data_list = []
				+    rounds = run_context.get('rounds', [])
				+
				+    for round_idx, round_data in enumerate(rounds):
				+        for search_idx, search in enumerate(round_data.get('search_results', [])):
				+            for post_idx, post_data in enumerate(search.get('post_list', [])):
				+                # Positional id, unique within this run context
				+                post_id = f"r{round_idx}_s{search_idx}_p{post_idx}"
				+                post_data_list.append((round_idx, search_idx, post_id, post_data))
				+
				+    total_posts = len(post_data_list)
				+    print(f"📝 找到 {total_posts} 个帖子 (来自 {len(rounds)} 轮)")
				+
				+    # Limit the number of evaluated posts (quick test mode)
				+    if max_posts and max_posts < total_posts:
				+        post_data_list = post_data_list[:max_posts]
				+        print(f"⚡ 快速测试模式: 仅评估前 {max_posts} 个帖子\n")
				+    else:
				+        print()
				+
				+    # Convert the raw post dicts into Post objects. The raw dict of each
				+    # post is remembered by its positional id so results can be written
				+    # back to exactly the dict they came from, even when the dict has no
				+    # 'note_id' or shares a 'note_id' with another entry.
				+    posts = []
				+    raw_by_post_id = {}
				+    for round_idx, search_idx, post_id, post_data in post_data_list:
				+        post = Post(
				+            note_id=post_data.get('note_id', post_id),
				+            title=post_data.get('title', ''),
				+            body_text=post_data.get('body_text', ''),
				+            images=post_data.get('images', []),
				+            type=post_data.get('type', 'normal'),
				+            video=post_data.get('video', ''),
				+            # Same extra fields the v3 test script passes to Post
				+            interact_info=post_data.get('interact_info', {}),
				+            note_url=post_data.get('note_url', ''),
				+            author_name=post_data.get('author_name', ''),
				+            author_id=post_data.get('author_id', ''),
				+            publish_time=post_data.get('publish_time', 0),
				+            cdn_images=post_data.get('cdn_images', []),
				+            detail_fetched=post_data.get('detail_fetched', False)
				+        )
				+        posts.append((round_idx, search_idx, post_id, post))
				+        raw_by_post_id[post_id] = post_data
				+
				+    # Batch evaluation
				+    print(f"🚀 开始并行评估 (最多{len(posts)}个任务,并发限制: 5)...\n")
				+
				+    semaphore = asyncio.Semaphore(5)
				+
				+    # 1. Create one coroutine per post; 2. run them all together.
				+    # gather() returns results in the same order as `posts`.
				+    task_coroutines = [
				+        evaluate_post_v4(post, original_query, semaphore)
				+        for _, _, _, post in posts
				+    ]
				+    all_eval_results = await asyncio.gather(*task_coroutines)
				+
				+    # 3. Process the results
				+    results = []
				+    detailed_reports = []  # detailed per-post evaluation reports
				+    print(f"📊 处理评估结果...\n")
				+    for i, ((round_idx, search_idx, post_id, post), eval_result) in enumerate(zip(posts, all_eval_results), 1):
				+        knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level = eval_result
				+
				+        print(f"  [{i}/{len(posts)}] {post.note_id} - {post.title[:40]}", end="")
				+        if not knowledge_eval:
				+            # No knowledge evaluation is reported as a failed evaluation
				+            print(f" → ❌ 评估失败\n")
				+            continue
				+
				+        if final_score is not None:
				+            print(f" → {match_level} ({final_score:.1f}分)")
				+        elif content_eval and not content_eval.is_content_knowledge:
				+            print(f" → 非内容知识")
				+        elif not knowledge_eval.is_knowledge:
				+            print(f" → 非知识")
				+        else:
				+            print(f" → 评估未完成")
				+
				+        # Print the reasoning behind each verdict
				+        print(f"      📝 知识评估: {knowledge_eval.conclusion if knowledge_eval.conclusion else '无'}")
				+        if content_eval and content_eval.is_content_knowledge:
				+            print(f"      📚 内容知识: {content_eval.summary[:80] if content_eval.summary else '无'}...")
				+        if purpose_eval:
				+            print(f"      🎯 目的匹配: {purpose_eval.core_basis[:80] if purpose_eval.core_basis else '无'}...")
				+        if category_eval:
				+            print(f"      🏷️  品类匹配: {category_eval.core_basis[:80] if category_eval.core_basis else '无'}...")
				+        print()
				+
				+        # Collect the detailed report
				+        detailed_report = {
				+            'post_index': i,
				+            'note_id': post.note_id,
				+            'title': post.title,
				+            'type': post.type,
				+            'final_score': final_score,
				+            'match_level': match_level,
				+            'is_knowledge': knowledge_eval.is_knowledge,
				+            'is_content_knowledge': content_eval.is_content_knowledge if content_eval else None,
				+            'knowledge_score': content_eval.final_score if content_eval else None,
				+            'evaluations': {
				+                'knowledge': {
				+                    'conclusion': knowledge_eval.conclusion,
				+                    # These two attributes are optional on the evaluation object
				+                    'core_evidence': getattr(knowledge_eval, 'core_evidence', None),
				+                    'issues': getattr(knowledge_eval, 'issues', None)
				+                },
				+                'content_knowledge': {
				+                    'summary': content_eval.summary,
				+                    'final_score': content_eval.final_score,
				+                    'level': content_eval.level
				+                } if content_eval and content_eval.is_content_knowledge else None,
				+                'purpose': {
				+                    'score': purpose_eval.purpose_score,
				+                    'core_motivation': purpose_eval.core_motivation,
				+                    'core_basis': purpose_eval.core_basis,
				+                    'match_level': purpose_eval.match_level
				+                } if purpose_eval else None,
				+                'category': {
				+                    'score': category_eval.category_score,
				+                    'core_basis': category_eval.core_basis,
				+                    'match_level': category_eval.match_level
				+                } if category_eval else None
				+            }
				+        }
				+        detailed_reports.append(detailed_report)
				+
				+        # Apply the evaluation results to the Post object
				+        apply_evaluation_v4_to_post(
				+            post,
				+            knowledge_eval,
				+            content_eval,
				+            purpose_eval,
				+            category_eval,
				+            final_score,
				+            match_level
				+        )
				+        results.append((round_idx, search_idx, post_id, post))
				+
				+    print(f"\n✅ 评估完成: {len(results)}/{len(posts)} 成功\n")
				+
				+    # Write the evaluation fields back into the loaded run context.
				+    # V4 top-level fields first, then the V4 nested evaluation fields.
				+    print("💾 更新 run_context.json...")
				+    evaluation_fields = (
				+        'is_knowledge',
				+        'is_content_knowledge',
				+        'knowledge_score',
				+        'purpose_score',
				+        'category_score',
				+        'final_score',
				+        'match_level',
				+        'evaluation_time',
				+        'evaluator_version',
				+        'knowledge_evaluation',
				+        'content_knowledge_evaluation',
				+        'purpose_evaluation',
				+        'category_evaluation',
				+    )
				+    for _, _, post_id, post in results:
				+        # This is the very dict object that lives inside run_context
				+        raw_post = raw_by_post_id[post_id]
				+        for field_name in evaluation_fields:
				+            raw_post[field_name] = getattr(post, field_name)
				+
				+    # Derive the output names from the file stem only, so a '.json' in a
				+    # directory name is left alone and an input without a '.json' suffix
				+    # is never overwritten.
				+    source_path = Path(run_context_path)
				+
				+    # Save the updated run context
				+    output_path = source_path.with_name(f"{source_path.stem}_v4.json")
				+    with open(output_path, 'w', encoding='utf-8') as f:
				+        json.dump(run_context, f, ensure_ascii=False, indent=2)
				+    print(f"✅ 已保存: {output_path}")
				+
				+    # Save the detailed evaluation report
				+    report_path = source_path.with_name(f"{source_path.stem}_evaluation_report_v4.json")
				+    evaluation_report = {
				+        'metadata': {
				+            'original_query': original_query,
				+            'total_posts': len(results),
				+            'evaluation_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
				+            'evaluator_version': 'v4.0_langgraph'
				+        },
				+        'detailed_reports': detailed_reports
				+    }
				+    with open(report_path, 'w', encoding='utf-8') as f:
				+        json.dump(evaluation_report, f, ensure_ascii=False, indent=2)
				+    print(f"📄 已保存详细评估报告: {report_path}\n")
				+
				+    # Statistics report
				+    print(f"\n{'='*80}")
				+    print("📊 统计报告")
				+    print(f"{'='*80}\n")
				+
				+    total = len(results)
				+
				+    # Prompt1: is it knowledge?
				+    knowledge_yes = sum(1 for _, _, _, post in results if post.is_knowledge)
				+    knowledge_no = total - knowledge_yes
				+
				+    print("🔍 Prompt1 - 是否是知识:")
				+    print(f"  是知识: {knowledge_yes:3d} / {total} ({percent(knowledge_yes, total):.1f}%)")
				+    print(f"  非知识: {knowledge_no:3d} / {total} ({percent(knowledge_no, total):.1f}%)")
				+    print()
				+
				+    # Prompt2: is it content knowledge?
				+    is_content_knowledge_counts = defaultdict(int)
				+    knowledge_scores = []
				+    for _, _, _, post in results:
				+        if post.is_content_knowledge is not None:
				+            if post.is_content_knowledge:
				+                is_content_knowledge_counts['是内容知识'] += 1
				+            else:
				+                is_content_knowledge_counts['非内容知识'] += 1
				+
				+        if post.knowledge_score is not None:
				+            knowledge_scores.append(post.knowledge_score)
				+
				+    if is_content_knowledge_counts:
				+        content_total = sum(is_content_knowledge_counts.values())
				+        content_yes = is_content_knowledge_counts['是内容知识']
				+        content_no = is_content_knowledge_counts['非内容知识']
				+        print("📚 Prompt2 - 是否是内容知识:")
				+        print(f"  是内容知识: {content_yes:3d} / {content_total} ({percent(content_yes, content_total):.1f}%)")
				+        if content_no > 0:
				+            print(f"  非内容知识: {content_no:3d} / {content_total} ({percent(content_no, content_total):.1f}%)")
				+        print()
				+
				+    if knowledge_scores:
				+        avg_score = sum(knowledge_scores) / len(knowledge_scores)
				+        print(f"  知识平均得分: {avg_score:.1f}分")
				+        print(f"  知识最高得分: {max(knowledge_scores):.0f}分")
				+        print(f"  知识最低得分: {min(knowledge_scores):.0f}分")
				+        print()
				+
				+    # Prompt3 & Prompt4: purpose and category match
				+    purpose_scores = []
				+    category_scores = []
				+    final_scores = []
				+    match_level_counts = defaultdict(int)
				+
				+    for _, _, _, post in results:
				+        if post.purpose_score is not None:
				+            purpose_scores.append(post.purpose_score)
				+        if post.category_score is not None:
				+            category_scores.append(post.category_score)
				+        if post.final_score is not None:
				+            final_scores.append(post.final_score)
				+        if post.match_level:
				+            match_level_counts[post.match_level] += 1
				+
				+    if purpose_scores:
				+        avg_purpose = sum(purpose_scores) / len(purpose_scores)
				+        print("🎯 Prompt3 - 目的性匹配:")
				+        print(f"  平均得分: {avg_purpose:.1f}分")
				+        print(f"  最高得分: {max(purpose_scores):.0f}分")
				+        print(f"  最低得分: {min(purpose_scores):.0f}分")
				+        print()
				+
				+    if category_scores:
				+        avg_category = sum(category_scores) / len(category_scores)
				+        print("🏷️  Prompt4 - 品类匹配:")
				+        print(f"  平均得分: {avg_category:.1f}分")
				+        print(f"  最高得分: {max(category_scores):.0f}分")
				+        print(f"  最低得分: {min(category_scores):.0f}分")
				+        print()
				+
				+    if final_scores:
				+        avg_final = sum(final_scores) / len(final_scores)
				+        print("🔥 综合得分 (目的性50% + 品类50%):")
				+        print(f"  平均得分: {avg_final:.2f}分")
				+        print(f"  最高得分: {max(final_scores):.2f}分")
				+        print(f"  最低得分: {min(final_scores):.2f}分")
				+        print()
				+
				+    if match_level_counts:
				+        print("📊 匹配等级分布:")
				+        for level in ['高度匹配', '基本匹配', '部分匹配', '弱匹配', '不匹配']:
				+            count = match_level_counts.get(level, 0)
				+            if count > 0:
				+                # Bar length is proportional to the share of all results (max 50)
				+                bar = '█' * int(percent(count, total) / 2)
				+                print(f"  {level:8s}: {count:3d} / {total} ({percent(count, total):.1f}%) {bar}")
				+        print()
				+
				+    # Overall analysis
				+    print("🌟 高质量内容统计:")
				+
				+    # Knowledge + content knowledge
				+    quality_posts = [
				+        post for _, _, _, post in results
				+        if post.is_knowledge and post.is_content_knowledge
				+    ]
				+    is_quality_knowledge = len(quality_posts)
				+    print(f"  知识内容: {is_quality_knowledge} / {total} ({percent(is_quality_knowledge, total):.1f}%)")
				+
				+    # Knowledge + content knowledge + highly matched
				+    high_match = sum(1 for post in quality_posts if post.match_level == '高度匹配')
				+    print(f"  高度匹配: {high_match} / {total} ({percent(high_match, total):.1f}%)")
				+
				+    # Knowledge + content knowledge + final score >= 70
				+    high_score = sum(
				+        1 for post in quality_posts
				+        if post.final_score and post.final_score >= 70
				+    )
				+    print(f"  得分≥70:  {high_score} / {total} ({percent(high_score, total):.1f}%)")
				+    print()
				+
				+    print(f"{'='*80}\n")
				+
				+    return results
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
				+    # Command-line entry point: <run_context.json path> [max posts to evaluate]
				+    cli_args = sys.argv[1:]
				+    if not cli_args:
				+        # No path given: show usage with two example invocations, then exit
				+        usage_lines = [
				+            "用法: python3 test_evaluation_v4.py <run_context.json路径> [最大评估数量]",
				+            "",
				+            "示例:",
				+            "  python3 test_evaluation_v4.py input/test_case/output/knowledge_search_traverse/20251114/005215_b1/run_context.json",
				+            "  python3 test_evaluation_v4.py input/test_case/output/knowledge_search_traverse/20251114/005215_b1/run_context.json 20",
				+        ]
				+        print("\n".join(usage_lines))
				+        sys.exit(1)
				+
				+    run_context_path = cli_args[0]
				+    # Evaluate 20 posts unless a limit is given as the second argument
				+    max_posts = 20 if len(cli_args) < 2 else int(cli_args[1])
				+
				+    asyncio.run(test_evaluation_v4(run_context_path, max_posts))