Ver código fonte

组合词和query词也增加检索

刘立冬 2 semanas atrás
pai
commit
7ddc8edaf1
2 arquivos alterados com 366 adições e 68 exclusões
  1. 281 0
      extract_topn_multimodal.py
  2. 85 68
      knowledge_search_traverse.py

+ 281 - 0
extract_topn_multimodal.py

@@ -0,0 +1,281 @@
+"""
+从 run_context_v3.json 中提取 topN 帖子并进行多模态解析
+
+功能:
+1. 读取 run_context_v3.json
+2. 提取所有帖子,按 final_score 排序,取 topN
+3. 使用 multimodal_extractor 进行图片内容解析
+4. 保存结果到独立的 JSON 文件
+
+参数化配置:
+- top_n: 提取前N个帖子(默认10)
+- max_concurrent: 最大并发数(默认5)
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+
+# 导入必要的模块
+from knowledge_search_traverse import Post
+from multimodal_extractor import extract_all_posts
+
+
def load_run_context(json_path: str) -> dict:
    """Read and parse the run_context_v3.json file at *json_path*."""
    with open(json_path, encoding='utf-8') as fp:
        data = json.load(fp)
    return data
+
+
def extract_all_posts_from_context(context_data: dict) -> list[dict]:
    """Collect every post from the context data, deduplicated by note_id.

    When the same note_id appears in several rounds/search results, the
    copy with the higher final_score wins; a scored copy always replaces
    an unscored one. Posts without a note_id are ignored.
    """
    best_by_id: dict = {}

    for rnd in context_data.get('rounds', []):
        for result in rnd.get('search_results', []):
            for candidate in result.get('post_list', []):
                nid = candidate.get('note_id')
                if not nid:
                    continue

                kept = best_by_id.get(nid)
                if kept is None:
                    # First sighting of this note_id.
                    best_by_id[nid] = candidate
                    continue

                kept_score = kept.get('final_score')
                new_score = candidate.get('final_score')
                # Replace when the kept copy is unscored, or the new copy
                # carries a strictly higher score.
                if kept_score is None or (new_score is not None and new_score > kept_score):
                    best_by_id[nid] = candidate

    return list(best_by_id.values())
+
+
def filter_and_sort_topn(posts: list[dict], top_n: int = 10) -> list[dict]:
    """Return the top_n posts ranked by final_score, highest first.

    Posts whose final_score is null are dropped before ranking; ties keep
    their original relative order (stable sort).
    """
    scored = [post for post in posts if post.get('final_score') is not None]
    scored.sort(key=lambda post: post['final_score'], reverse=True)
    return scored[:top_n]
+
+
def convert_to_post_objects(post_dicts: list[dict]) -> list[Post]:
    """Build Post objects from raw post dicts.

    The source data carries no 'type' field, so every Post is created
    with type='normal'; missing fields fall back to empty defaults.
    """
    return [
        Post(
            note_id=raw.get('note_id', ''),
            note_url=raw.get('note_url', ''),
            title=raw.get('title', ''),
            body_text=raw.get('body_text', ''),
            type='normal',  # default: source dicts lack this field
            images=raw.get('images', []),
            video=raw.get('video', ''),
            interact_info=raw.get('interact_info', {}),
        )
        for raw in post_dicts
    ]
+
+
def save_extraction_results(results: dict, output_path: str, topn_posts: list[dict]):
    """Persist multimodal extraction results to a JSON file.

    Args:
        results: mapping of note_id -> extraction object (expected
            attributes: note_id, note_url, title, body_text, type,
            extraction_time, and images whose items carry image_index,
            original_url, description, extract_text).
        output_path: destination path for the JSON file.
        topn_posts: original post dicts, used to recover each final_score.
    """
    # Index the original posts once so each result is an O(1) lookup,
    # instead of the previous linear scan per extraction (O(n*m)).
    posts_by_id = {post.get('note_id'): post for post in topn_posts}

    output_data = {
        'total_extracted': len(results),
        'extraction_results': []
    }

    for note_id, extraction in results.items():
        original_post = posts_by_id.get(note_id)

        # Flatten the extraction object (plus the original score, when
        # available) into a JSON-serializable entry.
        result_entry = {
            'note_id': extraction.note_id,
            'note_url': extraction.note_url,
            'title': extraction.title,
            'body_text': extraction.body_text,
            'type': extraction.type,
            'extraction_time': extraction.extraction_time,
            'final_score': original_post.get('final_score') if original_post else None,
            'images': [
                {
                    'image_index': img.image_index,
                    'original_url': img.original_url,
                    'description': img.description,
                    'extract_text': img.extract_text
                }
                for img in extraction.images
            ]
        }

        output_data['extraction_results'].append(result_entry)

    # ensure_ascii=False keeps CJK text readable in the output file.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"\n✅ 结果已保存到: {output_path}")
+
+
async def main(context_file_path: str, output_file_path: str, top_n: int = 10,
               max_concurrent: int = 5):
    """Pipeline entry point: load context, pick topN posts, run extraction.

    Args:
        context_file_path: path to run_context_v3.json
        output_file_path: path for the result JSON file
        top_n: number of top-scored posts to extract (default 10)
        max_concurrent: concurrency cap for image extraction (default 5)
    """
    banner = "=" * 80
    print(banner)
    print(f"多模态解析 - Top{top_n} 帖子")
    print(banner)

    # Load the raw run context from disk.
    print(f"\n📂 加载文件: {context_file_path}")
    context_data = load_run_context(context_file_path)

    # Gather all posts across rounds, deduplicated by note_id.
    print(f"\n🔍 提取所有帖子...")
    unique_posts = extract_all_posts_from_context(context_data)
    print(f"   去重后共找到 {len(unique_posts)} 个唯一帖子")

    # Keep only scored posts and rank them.
    print(f"\n📊 筛选 top{top_n} 帖子...")
    selected = filter_and_sort_topn(unique_posts, top_n)

    if not selected:
        print("   ⚠️  没有找到有效的帖子")
        return

    print(f"   Top{top_n} 帖子得分范围: {selected[-1].get('final_score')} ~ {selected[0].get('final_score')}")

    # Show the ranked shortlist before the (slow) extraction step.
    print(f"\n   Top{top_n} 帖子列表:")
    for rank, entry in enumerate(selected, 1):
        print(f"   {rank}. [{entry.get('final_score')}] {entry.get('title')[:40]}... ({entry.get('note_id')})")

    # Convert raw dicts into Post objects for the extractor.
    print(f"\n🔄 转换为 Post 对象...")
    posts = convert_to_post_objects(selected)
    print(f"   成功转换 {len(posts)} 个 Post 对象")

    # Run the multimodal image-content extraction.
    print(f"\n🖼️  开始多模态图片内容解析...")
    print(f"   (并发限制: {max_concurrent})")
    extraction_results = await extract_all_posts(
        posts,
        max_concurrent=max_concurrent
    )

    # Persist the results alongside each post's original final_score.
    print(f"\n💾 保存解析结果...")
    save_extraction_results(extraction_results, output_file_path, selected)

    print("\n" + banner)
    print("✅ 处理完成!")
    print(banner)
+
+
if __name__ == "__main__":
    # Build the command-line argument parser.
    # NOTE: description/epilog/help strings are user-facing CLI output and
    # are kept exactly as authored.
    parser = argparse.ArgumentParser(
        description='从 run_context_v3.json 中提取 topN 帖子并进行多模态解析',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
示例用法:
  # 使用默认参数 (top10, 并发5)
  python3 extract_topn_multimodal.py

  # 提取前20个帖子
  python3 extract_topn_multimodal.py --top-n 20

  # 自定义并发数
  python3 extract_topn_multimodal.py --top-n 15 --max-concurrent 10

  # 指定输入输出文件
  python3 extract_topn_multimodal.py -i input.json -o output.json --top-n 30
        '''
    )

    # Default input/output locations, overridable via -i / -o below.
    DEFAULT_CONTEXT_FILE = "input/test_case/output/knowledge_search_traverse/20251114/005215_b1/run_context_v3.json"
    DEFAULT_OUTPUT_FILE = "input/test_case/output/knowledge_search_traverse/20251114/005215_b1/multimodal_extraction_topn.json"

    # Register the CLI options.
    parser.add_argument(
        '-i', '--input',
        dest='context_file',
        default=DEFAULT_CONTEXT_FILE,
        help=f'输入的 run_context_v3.json 文件路径 (默认: {DEFAULT_CONTEXT_FILE})'
    )
    parser.add_argument(
        '-o', '--output',
        dest='output_file',
        default=DEFAULT_OUTPUT_FILE,
        help=f'输出的 JSON 文件路径 (默认: {DEFAULT_OUTPUT_FILE})'
    )
    parser.add_argument(
        '-n', '--top-n',
        dest='top_n',
        type=int,
        default=10,
        help='提取前N个帖子 (默认: 10)'
    )
    parser.add_argument(
        '-c', '--max-concurrent',
        dest='max_concurrent',
        type=int,
        default=5,
        help='最大并发数 (默认: 5)'
    )

    # Parse the command line.
    args = parser.parse_args()

    # Fail fast when the input file does not exist.
    if not os.path.exists(args.context_file):
        print(f"❌ 错误: 文件不存在 - {args.context_file}")
        sys.exit(1)

    # Echo the effective configuration before starting.
    print(f"\n📋 参数配置:")
    print(f"   输入文件: {args.context_file}")
    print(f"   输出文件: {args.output_file}")
    print(f"   提取数量: Top{args.top_n}")
    print(f"   最大并发: {args.max_concurrent}")
    print()

    # Drive the async pipeline to completion.
    asyncio.run(main(
        args.context_file,
        args.output_file,
        args.top_n,
        args.max_concurrent
    ))

+ 85 - 68
knowledge_search_traverse.py

@@ -3642,91 +3642,85 @@ async def run_round_v2(
                     "type": "sug"
                 })
 
-    # 步骤3: 搜索高分SUG
-    print(f"\n[步骤3] 搜索高分SUG(阈值 > {sug_threshold})...")
-    high_score_sugs = [sug for sug in all_sugs if sug.score_with_o > sug_threshold]
-    print(f"  找到 {len(high_score_sugs)} 个高分SUG")
-
-    search_list = []
-    # extraction_results = {}  # 内容提取流程已断开
-
-    if len(high_score_sugs) > 0:
-        async def search_for_sug(sug: Sug) -> Search:
-            """返回Search结果"""
-            print(f"    搜索: {sug.text}")
-            # post_extractions = {}  # 内容提取流程已断开
-
-            try:
-                search_result = xiaohongshu_search.search(keyword=sug.text)
-                # xiaohongshu_search.search() 已经返回解析后的数据
-                notes = search_result.get("data", {}).get("data", [])
-                post_list = []
-                for note in notes[:10]:
+    # 定义通用搜索函数(供步骤2.5、3、5.5共用)
+    async def search_keyword(text: str, score: float, source_type: str) -> Search:
+        """通用搜索函数"""
+        print(f"    搜索: {text} (来源: {source_type})")
+        try:
+            search_result = xiaohongshu_search.search(keyword=text)
+            notes = search_result.get("data", {}).get("data", [])
+            post_list = []
+
+            for note in notes[:10]:
+                try:
+                    post = process_note_data(note)
+                    post_list.append(post)
+                except Exception as e:
+                    print(f"      ⚠️  解析帖子失败 {note.get('id', 'unknown')}: {str(e)[:50]}")
+
+            # 补充详情信息(仅视频类型需要补充视频URL)
+            video_posts = [p for p in post_list if p.type == "video"]
+            if video_posts:
+                print(f"      补充详情({len(video_posts)}个视频)...")
+                for post in video_posts:
                     try:
-                        post = process_note_data(note)
-
-                        # # 🆕 多模态提取(搜索后立即处理) - 内容提取流程已断开
-                        # if post.type == "normal" and len(post.images) > 0:
-                        #     extraction = await extract_post_images(post)
-                        #     if extraction:
-                        #         post_extractions[post.note_id] = extraction
-
-                        post_list.append(post)
+                        detail_response = xiaohongshu_detail.get_detail(post.note_id)
+                        enrich_post_with_detail(post, detail_response)
                     except Exception as e:
-                        print(f"      ⚠️  解析帖子失败 {note.get('id', 'unknown')}: {str(e)[:50]}")
+                        print(f"        ⚠️  详情补充失败 {post.note_id}: {str(e)[:50]}")
 
-                # 补充详情信息(仅视频类型需要补充视频URL)
-                video_posts = [p for p in post_list if p.type == "video"]
-                if video_posts:
-                    print(f"      补充详情({len(video_posts)}个视频)...")
-                    for post in video_posts:
-                        try:
-                            detail_response = xiaohongshu_detail.get_detail(post.note_id)
-                            enrich_post_with_detail(post, detail_response)
-                        except Exception as e:
-                            print(f"        ⚠️  详情补充失败 {post.note_id}: {str(e)[:50]}")
+            print(f"      → 找到 {len(post_list)} 个帖子")
+            return Search(text=text, score_with_o=score, post_list=post_list)
+        except Exception as e:
+            print(f"      ✗ 搜索失败: {e}")
+            return Search(text=text, score_with_o=score, post_list=[])
 
-                print(f"      → 找到 {len(post_list)} 个帖子")
+    # 初始化search_list
+    search_list = []
 
-                return Search(
-                    text=sug.text,
-                    score_with_o=sug.score_with_o,
-                    from_q=sug.from_q,
-                    post_list=post_list
-                )
-                # , post_extractions  # 内容提取流程已断开
+    # 步骤2.5: 搜索高分query_input
+    print(f"\n[步骤2.5] 搜索高分输入query(阈值 > {sug_threshold})...")
+    high_score_queries = [q for q in query_input if q.score_with_o > sug_threshold]
+    print(f"  找到 {len(high_score_queries)} 个高分输入query")
 
-            except Exception as e:
-                print(f"      ✗ 搜索失败: {e}")
-                return Search(
-                    text=sug.text,
-                    score_with_o=sug.score_with_o,
-                    from_q=sug.from_q,
-                    post_list=[]
-                )
-                # , {}  # 内容提取流程已断开
+    if high_score_queries:
+        query_search_tasks = [search_keyword(q.text, q.score_with_o, "query_input")
+                              for q in high_score_queries]
+        query_searches = await asyncio.gather(*query_search_tasks)
+        search_list.extend(query_searches)
 
-        search_tasks = [search_for_sug(sug) for sug in high_score_sugs]
-        results = await asyncio.gather(*search_tasks)
+        # 评估搜索结果中的帖子
+        if enable_evaluation:
+            print(f"\n[评估] 评估query_input搜索结果中的帖子...")
+            for search in query_searches:
+                if search.post_list:
+                    print(f"  评估来自 '{search.text}' 的 {len(search.post_list)} 个帖子")
+                    for post in search.post_list:
+                        knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level = await evaluate_post_v3(post, o, semaphore=None)
+                        if knowledge_eval:
+                            apply_evaluation_v3_to_post(post, knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level)
 
-        # 收集搜索结果
-        for search in results:
-            search_list.append(search)
-            # extraction_results.update(extractions)  # 内容提取流程已断开
+    # 步骤3: 搜索高分SUG
+    print(f"\n[步骤3] 搜索高分SUG(阈值 > {sug_threshold})...")
+    high_score_sugs = [sug for sug in all_sugs if sug.score_with_o > sug_threshold]
+    print(f"  找到 {len(high_score_sugs)} 个高分SUG")
+
+    if high_score_sugs:
+        sug_search_tasks = [search_keyword(sug.text, sug.score_with_o, "sug")
+                            for sug in high_score_sugs]
+        sug_searches = await asyncio.gather(*sug_search_tasks)
+        search_list.extend(sug_searches)
 
         # 评估搜索结果中的帖子
         if enable_evaluation:
-            print(f"\n[评估] 评估搜索结果中的帖子...")
-            for search in search_list:
+            print(f"\n[评估] 评估SUG搜索结果中的帖子...")
+            for search in sug_searches:
                 if search.post_list:
                     print(f"  评估来自 '{search.text}' 的 {len(search.post_list)} 个帖子")
-                    # 对每个帖子进行评估 (V3)
                     for post in search.post_list:
                         knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level = await evaluate_post_v3(post, o, semaphore=None)
                         if knowledge_eval:
                             apply_evaluation_v3_to_post(post, knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level)
-        else:
-            print(f"\n[评估] 实时评估已关闭 (使用 --enable-evaluation 启用)")
 
     # 步骤4: 生成N域组合
     print(f"\n[步骤4] 生成{round_num}域组合...")
@@ -3824,6 +3818,29 @@ async def run_round_v2(
             comb.score_with_o > score for score in flat_scores
         )
 
+    # 步骤5.5: 搜索高分组合词
+    print(f"\n[步骤5.5] 搜索高分组合词(阈值 > {sug_threshold})...")
+    high_score_combinations = [comb for comb in domain_combinations
+                               if comb.score_with_o > sug_threshold]
+    print(f"  找到 {len(high_score_combinations)} 个高分组合词")
+
+    if high_score_combinations:
+        comb_search_tasks = [search_keyword(comb.text, comb.score_with_o, "combination")
+                             for comb in high_score_combinations]
+        comb_searches = await asyncio.gather(*comb_search_tasks)
+        search_list.extend(comb_searches)
+
+        # 评估搜索结果中的帖子
+        if enable_evaluation:
+            print(f"\n[评估] 评估组合词搜索结果中的帖子...")
+            for search in comb_searches:
+                if search.post_list:
+                    print(f"  评估来自 '{search.text}' 的 {len(search.post_list)} 个帖子")
+                    for post in search.post_list:
+                        knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level = await evaluate_post_v3(post, o, semaphore=None)
+                        if knowledge_eval:
+                            apply_evaluation_v3_to_post(post, knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level)
+
     # 步骤6: 构建 q_list_next(组合 + 高分SUG)
     print(f"\n[步骤6] 生成下轮输入...")
     q_list_next: list[Q] = []