刘立冬 2 주 전
부모
커밋
678f2095d8
6개의 변경된 파일에 793개 추가, 26개 삭제
  1. 184 0
      extract_top10_multimodal.py
  2. 274 24
      post_evaluator_v3.py
  3. 1 1
      post_evaluator_v4_langgraph.py
  4. 1 1
      script/search_recommendations/xiaohongshu_search_recommendations.py
  5. 155 0
      update_functions.py
  6. 178 0
      video_utils.py

+ 184 - 0
extract_top10_multimodal.py

@@ -0,0 +1,184 @@
+"""
+从 run_context_v3.json 中提取 top10 帖子并进行多模态解析
+
+功能:
+1. 读取 run_context_v3.json
+2. 提取所有帖子,按 final_score 排序,取 top10
+3. 使用 multimodal_extractor 进行图片内容解析
+4. 保存结果到独立的 JSON 文件
+"""
+
+import asyncio
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+
+# 导入必要的模块
+from knowledge_search_traverse import Post
+from multimodal_extractor import extract_all_posts
+
+
def load_run_context(json_path: str) -> dict:
    """Load and parse the run_context_v3.json file.

    Args:
        json_path: Path to the JSON context file.

    Returns:
        The parsed JSON content as a dict.
    """
    raw_text = Path(json_path).read_text(encoding='utf-8')
    return json.loads(raw_text)
+
+
def extract_all_posts_from_context(context_data: dict) -> list[dict]:
    """Collect every post nested under rounds -> search_results -> post_list.

    Args:
        context_data: Parsed run_context JSON.

    Returns:
        A flat list of post dicts, in traversal order.
    """
    return [
        post
        for round_data in context_data.get('rounds', [])
        for search_result in round_data.get('search_results', [])
        for post in search_result.get('post_list', [])
    ]
+
+
def filter_and_sort_top10(posts: list[dict], top_n: int = 10) -> list[dict]:
    """Return the highest-scoring posts ordered by final_score descending.

    Posts whose 'final_score' is missing or None are excluded before ranking.

    Args:
        posts: Post dicts, each possibly carrying a 'final_score'.
        top_n: Number of posts to keep (default 10, preserving the original
            behavior; parameterized so other cutoffs can reuse this helper).

    Returns:
        At most top_n post dicts, sorted by final_score descending.
    """
    # Drop posts with no usable score.
    valid_posts = [p for p in posts if p.get('final_score') is not None]

    # In-place sort: every remaining post is guaranteed to have a score.
    valid_posts.sort(key=lambda p: p['final_score'], reverse=True)

    return valid_posts[:top_n]
+
+
def convert_to_post_objects(post_dicts: list[dict]) -> list[Post]:
    """Build Post objects from raw post dicts.

    The source data lacks a 'type' field, so every Post is created with
    type='normal'.

    Args:
        post_dicts: Raw post dicts from the run context.

    Returns:
        One Post per input dict, in the same order.
    """
    return [
        Post(
            note_id=item.get('note_id', ''),
            note_url=item.get('note_url', ''),
            title=item.get('title', ''),
            body_text=item.get('body_text', ''),
            # Default value because the original data lacks this field.
            type='normal',
            images=item.get('images', []),
            video=item.get('video', ''),
            interact_info=item.get('interact_info', {}),
        )
        for item in post_dicts
    ]
+
+
def save_extraction_results(results: dict, output_path: str, top10_posts: list[dict]):
    """Persist multimodal extraction results as a JSON file.

    Args:
        results: Mapping of note_id -> extraction object (attributes used:
            note_id, note_url, title, body_text, type, extraction_time, images;
            each image exposes image_index, original_url, description,
            extract_text).
        output_path: Destination JSON file path.
        top10_posts: Original post dicts, used to recover each post's
            final_score by note_id.
    """
    # Index the original posts once instead of scanning the list for every
    # result (the original code was O(results * posts)).
    posts_by_id = {p.get('note_id'): p for p in top10_posts}

    output_data = {
        'total_extracted': len(results),
        'extraction_results': []
    }

    for note_id, extraction in results.items():
        original_post = posts_by_id.get(note_id)

        result_entry = {
            'note_id': extraction.note_id,
            'note_url': extraction.note_url,
            'title': extraction.title,
            'body_text': extraction.body_text,
            'type': extraction.type,
            'extraction_time': extraction.extraction_time,
            # final_score lives only in the original post data.
            'final_score': original_post.get('final_score') if original_post else None,
            'images': [
                {
                    'image_index': img.image_index,
                    'original_url': img.original_url,
                    'description': img.description,
                    'extract_text': img.extract_text
                }
                for img in extraction.images
            ]
        }

        output_data['extraction_results'].append(result_entry)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"\n✅ 结果已保存到: {output_path}")
+
+
async def main(context_file_path: str, output_file_path: str):
    """Run the full pipeline: load context, rank top10 posts, extract, save.

    Args:
        context_file_path: Path to run_context_v3.json.
        output_file_path: Destination path for the extraction-result JSON.
    """
    print("=" * 80)
    print("多模态解析 - Top10 帖子")
    print("=" * 80)

    # 1. Load the run context
    print(f"\n📂 加载文件: {context_file_path}")
    context_data = load_run_context(context_file_path)

    # 2. Collect all posts
    print(f"\n🔍 提取所有帖子...")
    all_posts = extract_all_posts_from_context(context_data)
    print(f"   共找到 {len(all_posts)} 个帖子")

    # 3. Filter and rank to top10
    print(f"\n📊 筛选 top10 帖子...")
    top10_posts = filter_and_sort_top10(all_posts)
    if not top10_posts:
        # Guard: indexing top10_posts[0]/[-1] below would raise IndexError
        # when no post carries a final_score.
        print("   ⚠️  没有带 final_score 的帖子,流程结束")
        return
    print(f"   Top10 帖子得分范围: {top10_posts[-1].get('final_score')} ~ {top10_posts[0].get('final_score')}")

    print("\n   Top10 帖子列表:")
    for i, post in enumerate(top10_posts, 1):
        # Title may be missing/None; avoid slicing None.
        title = (post.get('title') or '')[:40]
        print(f"   {i}. [{post.get('final_score')}] {title}... ({post.get('note_id')})")

    # 4. Convert to Post objects
    print(f"\n🔄 转换为 Post 对象...")
    post_objects = convert_to_post_objects(top10_posts)
    print(f"   成功转换 {len(post_objects)} 个 Post 对象")

    # 5. Multimodal extraction (the limits mirror extract_all_posts defaults)
    print(f"\n🖼️  开始多模态图片内容解析...")
    print(f"   (并发限制: 5, 每个帖子最多 10 张图片)")
    extraction_results = await extract_all_posts(post_objects, max_concurrent=5)

    # 6. Save results
    print(f"\n💾 保存解析结果...")
    save_extraction_results(extraction_results, output_file_path, top10_posts)

    print("\n" + "=" * 80)
    print("✅ 处理完成!")
    print("=" * 80)
+
+
if __name__ == "__main__":
    # Default path configuration
    DEFAULT_CONTEXT_FILE = "input/test_case/output/knowledge_search_traverse/20251114/005215_b1/run_context_v3.json"
    DEFAULT_OUTPUT_FILE = "input/test_case/output/knowledge_search_traverse/20251114/005215_b1/multimodal_extraction_top10.json"

    # Positional command-line arguments override the defaults.
    argv = sys.argv
    context_file = argv[1] if len(argv) > 1 else DEFAULT_CONTEXT_FILE
    output_file = argv[2] if len(argv) > 2 else DEFAULT_OUTPUT_FILE

    # Bail out early when the input file is missing.
    if not os.path.exists(context_file):
        print(f"❌ 错误: 文件不存在 - {context_file}")
        sys.exit(1)

    asyncio.run(main(context_file, output_file))

+ 274 - 24
post_evaluator_v3.py

@@ -10,8 +10,11 @@
 """
 """
 
 
 import asyncio
 import asyncio
+import base64
 import json
 import json
+import mimetypes
 import os
 import os
+import traceback
 from datetime import datetime
 from datetime import datetime
 from typing import Optional
 from typing import Optional
 from pydantic import BaseModel, Field
 from pydantic import BaseModel, Field
@@ -26,6 +29,13 @@ API_TIMEOUT = 120
 ENABLE_CACHE = True  # 是否启用评估结果缓存
 ENABLE_CACHE = True  # 是否启用评估结果缓存
 CACHE_DIR = ".evaluation_cache"  # 缓存目录
 CACHE_DIR = ".evaluation_cache"  # 缓存目录
 
 
+# 视频处理配置
+MAX_VIDEO_SIZE_MB = 60  # 最大视频大小限制(MB)
+VIDEO_DOWNLOAD_TIMEOUT = 60  # 视频下载超时(秒)
+TEMP_VIDEO_DIR = "/tmp/kg_agent_videos"  # 临时视频存储目录(同时也是缓存目录)
+VIDEO_CHUNK_SIZE = 8192  # 下载分块大小(字节)
+MAX_VIDEO_DOWNLOAD_RETRIES = 2  # 下载重试次数
+
 # ============================================================================
 # ============================================================================
 # 数据模型
 # 数据模型
 # ============================================================================
 # ============================================================================
@@ -987,6 +997,249 @@ PROMPT4_CATEGORY_MATCH = """# Prompt 2: 多模态内容品类匹配评估
 # 辅助函数
 # 辅助函数
 # ============================================================================
 # ============================================================================
 
 
+# 视频处理函数
+async def download_video(video_url: str, note_id: str) -> Optional[str]:
+    """
+    异步下载视频到本地文件(支持缓存)
+
+    Args:
+        video_url: 视频URL
+        note_id: 帖子ID(用于文件命名)
+
+    Returns:
+        本地文件路径,失败返回None
+    """
+    os.makedirs(TEMP_VIDEO_DIR, exist_ok=True)
+    video_path = os.path.join(TEMP_VIDEO_DIR, f"{note_id}.mp4")
+    
+    # 检查视频缓存(如果文件已存在,直接返回)
+    if os.path.exists(video_path):
+        file_size = os.path.getsize(video_path)
+        print(f"      ♻️  使用缓存的视频: {file_size / 1024 / 1024:.2f}MB")
+        return video_path
+
+    for attempt in range(MAX_VIDEO_DOWNLOAD_RETRIES + 1):
+        try:
+            loop = asyncio.get_event_loop()
+            response = await loop.run_in_executor(
+                None,
+                lambda: requests.get(
+                    video_url,
+                    stream=True,
+                    timeout=VIDEO_DOWNLOAD_TIMEOUT
+                )
+            )
+
+            if response.status_code != 200:
+                raise Exception(f"HTTP {response.status_code}")
+
+            # 检查Content-Length header(如果存在)
+            content_length = response.headers.get('content-length')
+            if content_length:
+                size_mb = int(content_length) / 1024 / 1024
+                print(f"      📊 视频大小: {size_mb:.2f}MB")
+                if size_mb > MAX_VIDEO_SIZE_MB:
+                    print(f"      ⚠️  视频超过{MAX_VIDEO_SIZE_MB}MB限制,跳过下载")
+                    return None
+
+            # 流式下载,检查大小
+            current_size = 0
+            max_size = MAX_VIDEO_SIZE_MB * 1024 * 1024
+
+            with open(video_path, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=VIDEO_CHUNK_SIZE):
+                    if chunk:
+                        current_size += len(chunk)
+                        if current_size > max_size:
+                            # 删除不完整的文件
+                            if os.path.exists(video_path):
+                                try:
+                                    os.remove(video_path)
+                                except:
+                                    pass
+                            print(f"      ⚠️  视频超过{MAX_VIDEO_SIZE_MB}MB限制")
+                            return None
+                        f.write(chunk)
+
+            print(f"      📥 视频下载成功: {current_size / 1024 / 1024:.2f}MB")
+            return video_path
+
+        except Exception as e:
+            if attempt < MAX_VIDEO_DOWNLOAD_RETRIES:
+                wait_time = 2 * (attempt + 1)
+                print(f"      ⚠️  下载失败,{wait_time}秒后重试 ({attempt + 1}/{MAX_VIDEO_DOWNLOAD_RETRIES}) - {str(e)[:100]}")
+                await asyncio.sleep(wait_time)
+            else:
+                print(f"      ❌ 视频下载失败: {str(e)[:100]}")
+                print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
+                # 清理可能的不完整文件
+                if os.path.exists(video_path):
+                    try:
+                        os.remove(video_path)
+                    except:
+                        pass
+                return None
+
+
async def encode_video_to_base64(video_path: str) -> Optional[str]:
    """
    Encode a video file as a base64 string.

    Args:
        video_path: Local video file path.

    Returns:
        Base64-encoded string, or None on failure / oversized file.
    """
    try:
        if not os.path.exists(video_path):
            print(f"      ❌ 视频文件不存在: {video_path}")
            return None

        # base64 output is ~33% larger than the input; refuse files whose
        # encoded form would clearly blow past the configured cap.
        file_size = os.path.getsize(video_path)
        estimated_base64_size = file_size * 1.33
        max_memory_size = MAX_VIDEO_SIZE_MB * 1024 * 1024 * 1.5  # headroom

        if estimated_base64_size > max_memory_size:
            print(f"      ⚠️  视频文件过大,无法编码到内存 ({file_size / 1024 / 1024:.2f}MB)")
            return None

        def _encode_video() -> str:
            # Runs in a worker thread so the large read/encode does not
            # block the event loop. Errors propagate to the outer handlers,
            # which do the logging (the original printed twice: once here
            # and once after re-raising).
            with open(video_path, 'rb') as f:
                return base64.b64encode(f.read()).decode('utf-8')

        loop = asyncio.get_running_loop()
        base64_str = await loop.run_in_executor(None, _encode_video)

        if not base64_str:
            # Empty file -> empty encoding; treated as failure, as before.
            return None

        print(f"      🔐 视频编码完成: {len(base64_str) / 1024 / 1024:.2f}MB (base64)")
        return base64_str

    except MemoryError as e:
        print(f"      ❌ 内存不足,无法编码视频: {str(e)[:100]}")
        print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
        return None
    except Exception as e:
        print(f"      ❌ 视频编码失败: {str(e)[:100]}")
        print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
        return None
+
+
def get_video_mime_type(video_path: str) -> str:
    """
    Detect a video file's MIME type from its file name.

    Args:
        video_path: Video file path.

    Returns:
        A 'video/...' MIME type string; "video/mp4" when the extension is
        unknown or maps to a non-video type.
    """
    guessed, _encoding = mimetypes.guess_type(video_path)
    if not (guessed and guessed.startswith('video/')):
        return "video/mp4"
    return guessed
+
+
+# 视频不再清理,保留作为缓存
+# async def cleanup_video(video_path: str):
+#     """
+#     清理临时视频文件
+#
+#     Args:
+#         video_path: 要删除的视频路径
+#     """
+#     try:
+#         if os.path.exists(video_path):
+#             os.remove(video_path)
+#             print(f"      🗑️  清理临时文件: {os.path.basename(video_path)}")
+#     except Exception as e:
+#         print(f"      ⚠️  清理失败: {str(e)[:50]}")
+
+
async def _prepare_media_content(post) -> tuple[list[str], Optional[str], str]:
    """
    Prepare media content (images + video) for a post in one place.

    Args:
        post: Post object (reads .type, .video, .images, .note_id).

    Returns:
        (image_urls, video_base64, video_mime_type); video_base64 is None
        whenever the post has no usable video or any processing step failed,
        in which case evaluation falls back to the cover images only.
    """
    # Extract images (these include video cover frames), capped at
    # MAX_IMAGES_PER_POST.
    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []

    # Video defaults: no payload, generic MIME type.
    video_base64 = None
    video_mime_type = "video/mp4"

    # Only video posts with a non-empty URL go through download + encode.
    if post.type == "video" and post.video:
        print(f"      🎬 检测到视频帖子 (ID: {post.note_id})")
        print(f"      📍 视频URL: {post.video[:80]}...")
        print(f"      🖼️  封面图数量: {len(image_urls)}")
        print(f"      ⏳ 开始下载视频...")

        video_path = None
        try:
            # Download the video (may be served from the local cache).
            video_path = await download_video(post.video, post.note_id)

            if video_path and os.path.exists(video_path):
                try:
                    print(f"      🔄 开始编码视频...")
                    # Encode for inline transmission in the API payload.
                    video_base64 = await encode_video_to_base64(video_path)
                    if video_base64:
                        video_mime_type = get_video_mime_type(video_path)
                        print(f"      ✅ 视频处理成功!类型: {video_mime_type}")
                        print(f"      📦 将使用视频+封面图进行评估")
                    else:
                        print(f"      ⚠️  视频编码失败,降级使用封面图评估")
                except MemoryError as e:
                    # Out of memory: degrade to cover-image evaluation.
                    print(f"      ⚠️  内存不足,无法处理视频: {str(e)[:100]}")
                    print(f"      📦 降级使用封面图评估")
                except Exception as e:
                    print(f"      ⚠️  视频编码异常: {str(e)[:100]}")
                    print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
                    print(f"      📦 降级使用封面图评估")
                # Downloaded file is intentionally kept as a cache (no cleanup).
            else:
                print(f"      ⚠️  视频下载失败,降级使用封面图评估")
        except Exception as e:
            print(f"      ⚠️  视频处理流程异常: {str(e)[:100]}")
            print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
            print(f"      📦 降级使用封面图评估")
            # Downloaded file is intentionally kept as a cache (no cleanup).
    elif post.type == "video" and not post.video:
        print(f"      ⚠️  视频类型帖子但video字段为空 (ID: {post.note_id})")

    # Log which media will actually be sent for evaluation.
    if post.type == "video":
        if video_base64:
            print(f"      📊 最终媒体: {len(image_urls)}张图片 + 1个视频")
        else:
            print(f"      📊 最终媒体: {len(image_urls)}张图片 (视频处理失败)")

    return image_urls, video_base64, video_mime_type
+
+
 def _get_cache_key(note_id: str) -> str:
 def _get_cache_key(note_id: str) -> str:
     """
     """
     生成缓存key
     生成缓存key
@@ -1102,6 +1355,8 @@ def _clean_json_response(content_text: str) -> str:
 async def _call_openrouter_api(
 async def _call_openrouter_api(
     prompt_text: str,
     prompt_text: str,
     image_urls: list[str],
     image_urls: list[str],
+    video_base64: Optional[str] = None,
+    video_mime_type: str = "video/mp4",
     semaphore: Optional[asyncio.Semaphore] = None
     semaphore: Optional[asyncio.Semaphore] = None
 ) -> dict:
 ) -> dict:
     """
     """
@@ -1110,6 +1365,8 @@ async def _call_openrouter_api(
     Args:
     Args:
         prompt_text: Prompt文本
         prompt_text: Prompt文本
         image_urls: 图片URL列表
         image_urls: 图片URL列表
+        video_base64: 视频的base64编码字符串(可选)
+        video_mime_type: 视频MIME类型(默认video/mp4)
         semaphore: 并发控制信号量
         semaphore: 并发控制信号量
 
 
     Returns:
     Returns:
@@ -1123,6 +1380,11 @@ async def _call_openrouter_api(
     for url in image_urls:
     for url in image_urls:
         content.append({"type": "image_url", "image_url": {"url": url}})
         content.append({"type": "image_url", "image_url": {"url": url}})
 
 
+    # 添加视频(如果存在)
+    if video_base64:
+        data_url = f"data:{video_mime_type};base64,{video_base64}"
+        content.append({"type": "video_url", "video_url": {"url": data_url}})
+
     payload = {
     payload = {
         "model": MODEL_NAME,
         "model": MODEL_NAME,
         "messages": [{"role": "user", "content": content}],
         "messages": [{"role": "user", "content": content}],
@@ -1195,10 +1457,8 @@ async def evaluate_is_knowledge(
     Returns:
     Returns:
         KnowledgeEvaluation 或 None(失败时)
         KnowledgeEvaluation 或 None(失败时)
     """
     """
-    if post.type == "video":
-        return None
-
-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
+    # 准备媒体内容(图片+视频)
+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
 
 
     try:
     try:
         prompt_text = PROMPT1_IS_KNOWLEDGE.format(
         prompt_text = PROMPT1_IS_KNOWLEDGE.format(
@@ -1207,7 +1467,7 @@ async def evaluate_is_knowledge(
             num_images=len(image_urls)
             num_images=len(image_urls)
         )
         )
 
 
-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
 
 
         return KnowledgeEvaluation(
         return KnowledgeEvaluation(
             is_knowledge=data.get("is_knowledge", False),
             is_knowledge=data.get("is_knowledge", False),
@@ -1239,10 +1499,8 @@ async def evaluate_is_content_knowledge(
     Returns:
     Returns:
         ContentKnowledgeEvaluation 或 None(失败时)
         ContentKnowledgeEvaluation 或 None(失败时)
     """
     """
-    if post.type == "video":
-        return None
-
-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
+    # 准备媒体内容(图片+视频)
+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
 
 
     try:
     try:
         prompt_text = PROMPT2_IS_CONTENT_KNOWLEDGE.format(
         prompt_text = PROMPT2_IS_CONTENT_KNOWLEDGE.format(
@@ -1251,7 +1509,7 @@ async def evaluate_is_content_knowledge(
             num_images=len(image_urls)
             num_images=len(image_urls)
         )
         )
 
 
-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
 
 
         # 判定是否是内容知识:得分 >= 55 分
         # 判定是否是内容知识:得分 >= 55 分
         final_score = data.get("final_score", 0)
         final_score = data.get("final_score", 0)
@@ -1288,10 +1546,8 @@ async def evaluate_purpose_match(
     Returns:
     Returns:
         PurposeEvaluation 或 None(失败时)
         PurposeEvaluation 或 None(失败时)
     """
     """
-    if post.type == "video":
-        return None
-
-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
+    # 准备媒体内容(图片+视频)
+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
 
 
     try:
     try:
         prompt_text = PROMPT3_PURPOSE_MATCH.format(
         prompt_text = PROMPT3_PURPOSE_MATCH.format(
@@ -1301,7 +1557,7 @@ async def evaluate_purpose_match(
             num_images=len(image_urls)
             num_images=len(image_urls)
         )
         )
 
 
-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
 
 
         # Prompt3的输出在"目的动机评估"键下
         # Prompt3的输出在"目的动机评估"键下
         purpose_data = data.get("目的动机评估", {})
         purpose_data = data.get("目的动机评估", {})
@@ -1336,10 +1592,8 @@ async def evaluate_category_match(
     Returns:
     Returns:
         CategoryEvaluation 或 None(失败时)
         CategoryEvaluation 或 None(失败时)
     """
     """
-    if post.type == "video":
-        return None
-
-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
+    # 准备媒体内容(图片+视频)
+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
 
 
     try:
     try:
         prompt_text = PROMPT4_CATEGORY_MATCH.format(
         prompt_text = PROMPT4_CATEGORY_MATCH.format(
@@ -1349,7 +1603,7 @@ async def evaluate_category_match(
             num_images=len(image_urls)
             num_images=len(image_urls)
         )
         )
 
 
-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
 
 
         # Prompt4的输出在"品类评估"键下
         # Prompt4的输出在"品类评估"键下
         category_data = data.get("品类评估", {})
         category_data = data.get("品类评估", {})
@@ -1416,10 +1670,6 @@ async def evaluate_post_v3(
         (knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level)
         (knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level)
         任一步骤失败,后续结果为None
         任一步骤失败,后续结果为None
     """
     """
-    if post.type == "video":
-        print(f"      ⊗ 跳过视频帖子: {post.note_id}")
-        return (None, None, None, None, None, None)
-
     # 检查缓存
     # 检查缓存
     if ENABLE_CACHE:
     if ENABLE_CACHE:
         cached_result = _load_from_cache(post.note_id)
         cached_result = _load_from_cache(post.note_id)

+ 1 - 1
post_evaluator_v4_langgraph.py

@@ -45,7 +45,7 @@ FILE_PROCESS_TIMEOUT = 180
 
 
 # 代理配置(用于访问 Google File API)
 # 代理配置(用于访问 Google File API)
 HTTP_PROXY = "http://127.0.0.1:29758"
 HTTP_PROXY = "http://127.0.0.1:29758"
-HTTPS_PROXY = "https://127.0.0.1:29758"
+HTTPS_PROXY = "http://127.0.0.1:29758"
 
 
 # 缓存配置
 # 缓存配置
 ENABLE_CACHE = False
 ENABLE_CACHE = False

+ 1 - 1
script/search_recommendations/xiaohongshu_search_recommendations.py

@@ -96,7 +96,7 @@ class XiaohongshuSearchRecommendations:
 
 
         return None
         return None
 
 
-    def get_recommendations(self, keyword: str, timeout: int = 300, max_retries: int = 10, retry_delay: int = 2, use_cache: bool = True) -> Dict[str, Any]:
+    def get_recommendations(self, keyword: str, timeout: int = 300, max_retries: int = 4, retry_delay: int = 7, use_cache: bool = True) -> Dict[str, Any]:
         """
         """
         获取小红书搜索推荐词
         获取小红书搜索推荐词
 
 

+ 155 - 0
update_functions.py

@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+"""
+更新 4 个评估函数,使用新的 system/user prompt 结构
+"""
+
+import re
+
def read_file(filepath):
    """Return the full text content of *filepath* (UTF-8)."""
    with open(filepath, encoding='utf-8') as fh:
        return fh.read()
+
def write_file(filepath, content):
    """Overwrite *filepath* with *content*, encoded as UTF-8."""
    with open(filepath, mode='w', encoding='utf-8') as fh:
        fh.write(content)
+
+def update_evaluate_is_knowledge(content):
+    """更新 evaluate_is_knowledge 函数"""
+    # 查找并替换
+    old_pattern = r'''try:
+        prompt_text = PROMPT1_IS_KNOWLEDGE\.format\(
+            title=post\.title,
+            body_text=post\.body_text or "",
+            num_images=len\(image_urls\)
+        \)
+
+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
+
+    new_code = '''try:
+        user_prompt = PROMPT1_USER_TEMPLATE.format(
+            title=post.title,
+            body_text=post.body_text or "",
+            num_images=len(image_urls)
+        )
+
+        data = await _call_openrouter_api(
+            system_prompt=PROMPT1_SYSTEM,
+            user_prompt=user_prompt,
+            image_urls=image_urls,
+            semaphore=semaphore
+        )'''
+
+    content = re.sub(old_pattern, new_code, content)
+    print("✅ evaluate_is_knowledge 更新完成")
+    return content
+
+def update_evaluate_is_content_knowledge(content):
+    """更新 evaluate_is_content_knowledge 函数"""
+    old_pattern = r'''try:
+        prompt_text = PROMPT2_IS_CONTENT_KNOWLEDGE\.format\(
+            title=post\.title,
+            body_text=post\.body_text or "",
+            num_images=len\(image_urls\)
+        \)
+
+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
+
+    new_code = '''try:
+        user_prompt = PROMPT2_USER_TEMPLATE.format(
+            title=post.title,
+            body_text=post.body_text or "",
+            num_images=len(image_urls)
+        )
+
+        data = await _call_openrouter_api(
+            system_prompt=PROMPT2_SYSTEM,
+            user_prompt=user_prompt,
+            image_urls=image_urls,
+            semaphore=semaphore
+        )'''
+
+    content = re.sub(old_pattern, new_code, content)
+    print("✅ evaluate_is_content_knowledge 更新完成")
+    return content
+
+def update_evaluate_purpose_match(content):
+    """更新 evaluate_purpose_match 函数"""
+    old_pattern = r'''try:
+        prompt_text = PROMPT3_PURPOSE_MATCH\.format\(
+            original_query=original_query,
+            title=post\.title,
+            body_text=post\.body_text or "",
+            num_images=len\(image_urls\)
+        \)
+
+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
+
+    new_code = '''try:
+        user_prompt = PROMPT3_USER_TEMPLATE.format(
+            original_query=original_query,
+            title=post.title,
+            body_text=post.body_text or "",
+            num_images=len(image_urls)
+        )
+
+        data = await _call_openrouter_api(
+            system_prompt=PROMPT3_SYSTEM,
+            user_prompt=user_prompt,
+            image_urls=image_urls,
+            semaphore=semaphore
+        )'''
+
+    content = re.sub(old_pattern, new_code, content)
+    print("✅ evaluate_purpose_match 更新完成")
+    return content
+
+def update_evaluate_category_match(content):
+    """更新 evaluate_category_match 函数"""
+    old_pattern = r'''try:
+        prompt_text = PROMPT4_CATEGORY_MATCH\.format\(
+            original_query=original_query,
+            title=post\.title,
+            body_text=post\.body_text or "",
+            num_images=len\(image_urls\)
+        \)
+
+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
+
+    new_code = '''try:
+        user_prompt = PROMPT4_USER_TEMPLATE.format(
+            original_query=original_query,
+            title=post.title,
+            body_text=post.body_text or "",
+            num_images=len(image_urls)
+        )
+
+        data = await _call_openrouter_api(
+            system_prompt=PROMPT4_SYSTEM,
+            user_prompt=user_prompt,
+            image_urls=image_urls,
+            semaphore=semaphore
+        )'''
+
+    content = re.sub(old_pattern, new_code, content)
+    print("✅ evaluate_category_match 更新完成")
+    return content
+
def main():
    """Apply all four evaluation-function rewrites to post_evaluator_v3.py."""
    filepath = 'post_evaluator_v3.py'

    print("📖 读取文件...")
    content = read_file(filepath)

    print("\n🔧 更新评估函数...")
    # Run the rewrites in sequence, threading the text through each one.
    for updater in (
        update_evaluate_is_knowledge,
        update_evaluate_is_content_knowledge,
        update_evaluate_purpose_match,
        update_evaluate_category_match,
    ):
        content = updater(content)

    print("\n💾 保存文件...")
    write_file(filepath, content)

    print("\n✅ 所有评估函数更新完成!")

if __name__ == '__main__':
    main()

+ 178 - 0
video_utils.py

@@ -0,0 +1,178 @@
+"""
+视频处理工具模块
+
+提供视频下载、Base64 编码等功能,用于支持视频评估
+"""
+
+import asyncio
+import base64
+import hashlib
+import os
+from pathlib import Path
+from typing import Optional
+import requests
+
+
+# 配置
+VIDEO_CACHE_DIR = Path(".video_cache")
+VIDEO_MAX_SIZE_MB = 50  # 最大视频大小(MB)
+VIDEO_DOWNLOAD_TIMEOUT = 120  # 下载超时(秒)
+MAX_RETRIES = 2  # 最大重试次数
+
+
async def download_video(
    video_url: str,
    cache_dir: Path = VIDEO_CACHE_DIR
) -> Optional[Path]:
    """
    Asynchronously download a video file.

    Args:
        video_url: Video URL.
        cache_dir: Cache directory.

    Returns:
        Path to the cached video file, or None on failure / oversized video.
    """
    cache_dir.mkdir(exist_ok=True)

    # Cache file name derived from the URL hash.
    url_hash = hashlib.md5(video_url.encode()).hexdigest()
    cache_path = cache_dir / f"{url_hash}.mp4"
    temp_path = cache_path.with_suffix('.tmp')

    # Cache hit: reuse the existing file.
    if cache_path.exists():
        file_size_mb = cache_path.stat().st_size / (1024 * 1024)
        print(f"      ♻️  使用缓存视频: {file_size_mb:.2f}MB")
        return cache_path

    loop = asyncio.get_running_loop()
    max_bytes = VIDEO_MAX_SIZE_MB * 1024 * 1024

    for attempt in range(MAX_RETRIES + 1):
        response = None
        try:
            print(f"      📥 下载视频... (尝试 {attempt + 1}/{MAX_RETRIES + 1})")

            # Blocking HTTP work runs in a worker thread.
            response = await loop.run_in_executor(
                None,
                lambda: requests.get(
                    video_url,
                    timeout=VIDEO_DOWNLOAD_TIMEOUT,
                    stream=True,
                    headers={"User-Agent": "Mozilla/5.0"}
                )
            )
            response.raise_for_status()

            # Reject early when the server declares an oversized payload.
            content_length = response.headers.get('content-length')
            if content_length:
                size_mb = int(content_length) / (1024 * 1024)
                if size_mb > VIDEO_MAX_SIZE_MB:
                    print(f"      ⚠️  视频过大: {size_mb:.2f}MB > {VIDEO_MAX_SIZE_MB}MB")
                    return None

            def save_chunks() -> int:
                # Enforce the size cap while streaming so a response without
                # Content-Length cannot be downloaded unbounded before the
                # post-hoc size check (the original downloaded it all first).
                written = 0
                with open(temp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            written += len(chunk)
                            if written > max_bytes:
                                return -1  # sentinel: over the cap
                            f.write(chunk)
                return written

            written = await loop.run_in_executor(None, save_chunks)

            if written < 0:
                actual_size_mb = temp_path.stat().st_size / (1024 * 1024)
                print(f"      ⚠️  视频过大: {actual_size_mb:.2f}MB > {VIDEO_MAX_SIZE_MB}MB")
                temp_path.unlink()
                return None

            # Atomic-ish promotion from temp file to cache entry.
            temp_path.rename(cache_path)
            print(f"      ✅ 视频下载成功: {written / (1024 * 1024):.2f}MB")
            return cache_path

        except Exception as e:
            if attempt < MAX_RETRIES:
                wait_time = 2 * (attempt + 1)
                print(f"      ⚠️  下载失败,{wait_time}秒后重试: {str(e)[:50]}")
                await asyncio.sleep(wait_time)
            else:
                print(f"      ❌ 视频下载失败: {str(e)[:100]}")
                # Remove any partial temp file.
                if temp_path.exists():
                    temp_path.unlink()
                return None
        finally:
            # Always release the streamed HTTP connection (the original
            # never closed it).
            if response is not None:
                response.close()

    return None
+
+
async def encode_video_to_base64(video_path: Path) -> Optional[str]:
    """
    Asynchronously encode a video file as a Base64 data URL.

    Args:
        video_path: Video file path.

    Returns:
        A "data:video/mp4;base64,..." URL, or None on failure.
    """
    try:
        loop = asyncio.get_running_loop()

        def _read_and_encode() -> str:
            # Single worker-thread hop for both the read and the encode:
            # keeps the event loop free and avoids shuttling the raw bytes
            # back to the loop between two executor calls (the original
            # made two hops).
            with open(video_path, 'rb') as f:
                encoded = base64.b64encode(f.read()).decode('utf-8')
            return f"data:video/mp4;base64,{encoded}"

        print(f"      🔄 编码视频为 Base64...")
        data_url = await loop.run_in_executor(None, _read_and_encode)

        encoded_size_mb = len(data_url) / (1024 * 1024)
        print(f"      ✅ Base64 编码完成: {encoded_size_mb:.2f}MB")

        return data_url

    except Exception as e:
        print(f"      ❌ Base64 编码失败: {str(e)[:100]}")
        return None
+
+
def cleanup_video_cache(cache_dir: Path = VIDEO_CACHE_DIR, days: int = 7):
    """
    Remove cached videos older than the given number of days.

    Args:
        cache_dir: Cache directory.
        days: Retention period in days.
    """
    import time

    if not cache_dir.exists():
        return

    # Files last modified before this timestamp are considered stale.
    cutoff = time.time() - days * 86400

    stale_files = [
        entry for entry in cache_dir.glob("*.mp4")
        if entry.stat().st_mtime < cutoff
    ]
    for stale in stale_files:
        stale.unlink()

    if stale_files:
        print(f"🗑️  清理了 {len(stale_files)} 个过期视频缓存")