2 недель назад · 678f2095d8
--- a/extract_top10_multimodal.py
+++ b/extract_top10_multimodal.py
@@ -0,0 +1,184 @@
 
				+"""
			
 
				+从 run_context_v3.json 中提取 top10 帖子并进行多模态解析
			
 
				+
			
 
				+功能：
			
 
				+1. 读取 run_context_v3.json
			
 
				+2. 提取所有帖子，按 final_score 排序，取 top10
			
 
				+3. 使用 multimodal_extractor 进行图片内容解析
			
 
				+4. 保存结果到独立的 JSON 文件
			
 
				+"""
			
 
				+
			
 
				+import asyncio
			
 
				+import json
			
 
				+import os
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+from typing import Optional
			
 
				+
			
 
				+# 导入必要的模块
			
 
				+from knowledge_search_traverse import Post
			
 
				+from multimodal_extractor import extract_all_posts
			
 
				+
			
 
				+
			
 
				+def load_run_context(json_path: str) -> dict:
			
 
				+    """加载 run_context_v3.json 文件"""
			
 
				+    with open(json_path, 'r', encoding='utf-8') as f:
			
 
				+        return json.load(f)
			
 
				+
			
 
				+
			
 
				+def extract_all_posts_from_context(context_data: dict) -> list[dict]:
			
 
				+    """从 context 数据中提取所有帖子"""
			
 
				+    all_posts = []
			
 
				+
			
 
				+    # 遍历所有轮次
			
 
				+    for round_data in context_data.get('rounds', []):
			
 
				+        # 遍历搜索结果
			
 
				+        for search_result in round_data.get('search_results', []):
			
 
				+            # 遍历帖子列表
			
 
				+            for post in search_result.get('post_list', []):
			
 
				+                all_posts.append(post)
			
 
				+
			
 
				+    return all_posts
			
 
				+
			
 
				+
			
 
				+def filter_and_sort_top10(posts: list[dict]) -> list[dict]:
			
 
				+    """过滤并排序，获取 final_score top10 的帖子"""
			
 
				+    # 过滤掉 final_score 为 null 的帖子
			
 
				+    valid_posts = [p for p in posts if p.get('final_score') is not None]
			
 
				+
			
 
				+    # 按 final_score 降序排序
			
 
				+    sorted_posts = sorted(valid_posts, key=lambda x: x.get('final_score', 0), reverse=True)
			
 
				+
			
 
				+    # 取前10个
			
 
				+    top10 = sorted_posts[:10]
			
 
				+
			
 
				+    return top10
			
 
				+
			
 
				+
			
 
				+def convert_to_post_objects(post_dicts: list[dict]) -> list[Post]:
			
 
				+    """将字典数据转换为 Post 对象"""
			
 
				+    post_objects = []
			
 
				+
			
 
				+    for post_dict in post_dicts:
			
 
				+        # 创建 Post 对象，设置默认 type="normal"
			
 
				+        post = Post(
			
 
				+            note_id=post_dict.get('note_id', ''),
			
 
				+            note_url=post_dict.get('note_url', ''),
			
 
				+            title=post_dict.get('title', ''),
			
 
				+            body_text=post_dict.get('body_text', ''),
			
 
				+            type='normal',  # 默认值，因为原数据缺少此字段
			
 
				+            images=post_dict.get('images', []),
			
 
				+            video=post_dict.get('video', ''),
			
 
				+            interact_info=post_dict.get('interact_info', {}),
			
 
				+        )
			
 
				+        post_objects.append(post)
			
 
				+
			
 
				+    return post_objects
			
 
				+
			
 
				+
			
 
				+def save_extraction_results(results: dict, output_path: str, top10_posts: list[dict]):
			
 
				+    """保存多模态解析结果到 JSON 文件"""
			
 
				+    # 构建输出数据
			
 
				+    output_data = {
			
 
				+        'total_extracted': len(results),
			
 
				+        'extraction_results': []
			
 
				+    }
			
 
				+
			
 
				+    # 遍历每个解析结果
			
 
				+    for note_id, extraction in results.items():
			
 
				+        # 找到对应的原始帖子数据
			
 
				+        original_post = None
			
 
				+        for post in top10_posts:
			
 
				+            if post.get('note_id') == note_id:
			
 
				+                original_post = post
			
 
				+                break
			
 
				+
			
 
				+        # 构建结果条目
			
 
				+        result_entry = {
			
 
				+            'note_id': extraction.note_id,
			
 
				+            'note_url': extraction.note_url,
			
 
				+            'title': extraction.title,
			
 
				+            'body_text': extraction.body_text,
			
 
				+            'type': extraction.type,
			
 
				+            'extraction_time': extraction.extraction_time,
			
 
				+            'final_score': original_post.get('final_score') if original_post else None,
			
 
				+            'images': [
			
 
				+                {
			
 
				+                    'image_index': img.image_index,
			
 
				+                    'original_url': img.original_url,
			
 
				+                    'description': img.description,
			
 
				+                    'extract_text': img.extract_text
			
 
				+                }
			
 
				+                for img in extraction.images
			
 
				+            ]
			
 
				+        }
			
 
				+
			
 
				+        output_data['extraction_results'].append(result_entry)
			
 
				+
			
 
				+    # 保存到文件
			
 
				+    with open(output_path, 'w', encoding='utf-8') as f:
			
 
				+        json.dump(output_data, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+    print(f"\n✅ 结果已保存到: {output_path}")
			
 
				+
			
 
				+
			
 
				+async def main(context_file_path: str, output_file_path: str):
			
 
				+    """主函数"""
			
 
				+    print("=" * 80)
			
 
				+    print("多模态解析 - Top10 帖子")
			
 
				+    print("=" * 80)
			
 
				+
			
 
				+    # 1. 加载数据
			
 
				+    print(f"\n📂 加载文件: {context_file_path}")
			
 
				+    context_data = load_run_context(context_file_path)
			
 
				+
			
 
				+    # 2. 提取所有帖子
			
 
				+    print(f"\n🔍 提取所有帖子...")
			
 
				+    all_posts = extract_all_posts_from_context(context_data)
			
 
				+    print(f"   共找到 {len(all_posts)} 个帖子")
			
 
				+
			
 
				+    # 3. 过滤并排序获取 top10
			
 
				+    print(f"\n📊 筛选 top10 帖子...")
			
 
				+    top10_posts = filter_and_sort_top10(all_posts)
			
 
				+    print(f"   Top10 帖子得分范围: {top10_posts[-1].get('final_score')} ~ {top10_posts[0].get('final_score')}")
			
 
				+
			
 
				+    # 打印 top10 列表
			
 
				+    print("\n   Top10 帖子列表:")
			
 
				+    for i, post in enumerate(top10_posts, 1):
			
 
				+        print(f"   {i}. [{post.get('final_score')}] {post.get('title')[:40]}... ({post.get('note_id')})")
			
 
				+
			
 
				+    # 4. 转换为 Post 对象
			
 
				+    print(f"\n🔄 转换为 Post 对象...")
			
 
				+    post_objects = convert_to_post_objects(top10_posts)
			
 
				+    print(f"   成功转换 {len(post_objects)} 个 Post 对象")
			
 
				+
			
 
				+    # 5. 进行多模态解析
			
 
				+    print(f"\n🖼️  开始多模态图片内容解析...")
			
 
				+    print(f"   （并发限制: 5, 每张图片最多 10 张）")
			
 
				+    extraction_results = await extract_all_posts(post_objects, max_concurrent=5)
			
 
				+
			
 
				+    # 6. 保存结果
			
 
				+    print(f"\n💾 保存解析结果...")
			
 
				+    save_extraction_results(extraction_results, output_file_path, top10_posts)
			
 
				+
			
 
				+    print("\n" + "=" * 80)
			
 
				+    print("✅ 处理完成！")
			
 
				+    print("=" * 80)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # 默认路径配置
			
 
				+    DEFAULT_CONTEXT_FILE = "input/test_case/output/knowledge_search_traverse/20251114/005215_b1/run_context_v3.json"
			
 
				+    DEFAULT_OUTPUT_FILE = "input/test_case/output/knowledge_search_traverse/20251114/005215_b1/multimodal_extraction_top10.json"
			
 
				+
			
 
				+    # 可以通过命令行参数覆盖
			
 
				+    context_file = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_CONTEXT_FILE
			
 
				+    output_file = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_OUTPUT_FILE
			
 
				+
			
 
				+    # 检查文件是否存在
			
 
				+    if not os.path.exists(context_file):
			
 
				+        print(f"❌ 错误: 文件不存在 - {context_file}")
			
 
				+        sys.exit(1)
			
 
				+
			
 
				+    # 运行主函数
			
 
				+    asyncio.run(main(context_file, output_file))
			
--- a/post_evaluator_v3.py
+++ b/post_evaluator_v3.py
@@ -10,8 +10,11 @@
 
				 """
			
 
				 
			
 
				 import asyncio
			
 
				+import base64
			
 
				 import json
			
 
				+import mimetypes
			
 
				 import os
			
 
				+import traceback
			
 
				 from datetime import datetime
			
 
				 from typing import Optional
			
 
				 from pydantic import BaseModel, Field
			
@@ -26,6 +29,13 @@ API_TIMEOUT = 120
 
				 ENABLE_CACHE = True  # 是否启用评估结果缓存
			
 
				 CACHE_DIR = ".evaluation_cache"  # 缓存目录
			
 
				 
			
 
# Video processing configuration
MAX_VIDEO_SIZE_MB = 60  # maximum allowed video size (MB)
VIDEO_DOWNLOAD_TIMEOUT = 60  # video download timeout (seconds)
TEMP_VIDEO_DIR = "/tmp/kg_agent_videos"  # temporary video directory (also serves as the cache directory)
VIDEO_CHUNK_SIZE = 8192  # download chunk size (bytes)
MAX_VIDEO_DOWNLOAD_RETRIES = 2  # number of download retries
			
 
				+
			
 
				 # ============================================================================
			
 
				 # 数据模型
			
 
				 # ============================================================================
			
@@ -987,6 +997,249 @@ PROMPT4_CATEGORY_MATCH = """# Prompt 2: 多模态内容品类匹配评估
 
				 # 辅助函数
			
 
				 # ============================================================================
			
 
				 
			
 
				+# 视频处理函数
			
 
				+async def download_video(video_url: str, note_id: str) -> Optional[str]:
			
 
				+    """
			
 
				+    异步下载视频到本地文件（支持缓存）
			
 
				+
			
 
				+    Args:
			
 
				+        video_url: 视频URL
			
 
				+        note_id: 帖子ID（用于文件命名）
			
 
				+
			
 
				+    Returns:
			
 
				+        本地文件路径，失败返回None
			
 
				+    """
			
 
				+    os.makedirs(TEMP_VIDEO_DIR, exist_ok=True)
			
 
				+    video_path = os.path.join(TEMP_VIDEO_DIR, f"{note_id}.mp4")
			
 
				+    
			
 
				+    # 检查视频缓存（如果文件已存在，直接返回）
			
 
				+    if os.path.exists(video_path):
			
 
				+        file_size = os.path.getsize(video_path)
			
 
				+        print(f"      ♻️  使用缓存的视频: {file_size / 1024 / 1024:.2f}MB")
			
 
				+        return video_path
			
 
				+
			
 
				+    for attempt in range(MAX_VIDEO_DOWNLOAD_RETRIES + 1):
			
 
				+        try:
			
 
				+            loop = asyncio.get_event_loop()
			
 
				+            response = await loop.run_in_executor(
			
 
				+                None,
			
 
				+                lambda: requests.get(
			
 
				+                    video_url,
			
 
				+                    stream=True,
			
 
				+                    timeout=VIDEO_DOWNLOAD_TIMEOUT
			
 
				+                )
			
 
				+            )
			
 
				+
			
 
				+            if response.status_code != 200:
			
 
				+                raise Exception(f"HTTP {response.status_code}")
			
 
				+
			
 
				+            # 检查Content-Length header（如果存在）
			
 
				+            content_length = response.headers.get('content-length')
			
 
				+            if content_length:
			
 
				+                size_mb = int(content_length) / 1024 / 1024
			
 
				+                print(f"      📊 视频大小: {size_mb:.2f}MB")
			
 
				+                if size_mb > MAX_VIDEO_SIZE_MB:
			
 
				+                    print(f"      ⚠️  视频超过{MAX_VIDEO_SIZE_MB}MB限制，跳过下载")
			
 
				+                    return None
			
 
				+
			
 
				+            # 流式下载，检查大小
			
 
				+            current_size = 0
			
 
				+            max_size = MAX_VIDEO_SIZE_MB * 1024 * 1024
			
 
				+
			
 
				+            with open(video_path, 'wb') as f:
			
 
				+                for chunk in response.iter_content(chunk_size=VIDEO_CHUNK_SIZE):
			
 
				+                    if chunk:
			
 
				+                        current_size += len(chunk)
			
 
				+                        if current_size > max_size:
			
 
				+                            # 删除不完整的文件
			
 
				+                            if os.path.exists(video_path):
			
 
				+                                try:
			
 
				+                                    os.remove(video_path)
			
 
				+                                except:
			
 
				+                                    pass
			
 
				+                            print(f"      ⚠️  视频超过{MAX_VIDEO_SIZE_MB}MB限制")
			
 
				+                            return None
			
 
				+                        f.write(chunk)
			
 
				+
			
 
				+            print(f"      📥 视频下载成功: {current_size / 1024 / 1024:.2f}MB")
			
 
				+            return video_path
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            if attempt < MAX_VIDEO_DOWNLOAD_RETRIES:
			
 
				+                wait_time = 2 * (attempt + 1)
			
 
				+                print(f"      ⚠️  下载失败，{wait_time}秒后重试 ({attempt + 1}/{MAX_VIDEO_DOWNLOAD_RETRIES}) - {str(e)[:100]}")
			
 
				+                await asyncio.sleep(wait_time)
			
 
				+            else:
			
 
				+                print(f"      ❌ 视频下载失败: {str(e)[:100]}")
			
 
				+                print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
			
 
				+                # 清理可能的不完整文件
			
 
				+                if os.path.exists(video_path):
			
 
				+                    try:
			
 
				+                        os.remove(video_path)
			
 
				+                    except:
			
 
				+                        pass
			
 
				+                return None
			
 
				+
			
 
				+
			
 
				+async def encode_video_to_base64(video_path: str) -> Optional[str]:
			
 
				+    """
			
 
				+    将视频文件编码为base64字符串
			
 
				+
			
 
				+    Args:
			
 
				+        video_path: 本地视频文件路径
			
 
				+
			
 
				+    Returns:
			
 
				+        base64编码字符串，失败返回None
			
 
				+    """
			
 
				+    try:
			
 
				+        # 检查文件是否存在
			
 
				+        if not os.path.exists(video_path):
			
 
				+            print(f"      ❌ 视频文件不存在: {video_path}")
			
 
				+            return None
			
 
				+        
			
 
				+        # 检查文件大小（base64编码会增加约33%的大小）
			
 
				+        file_size = os.path.getsize(video_path)
			
 
				+        estimated_base64_size = file_size * 1.33  # base64编码后的大小估算
			
 
				+        max_memory_size = MAX_VIDEO_SIZE_MB * 1024 * 1024 * 1.5  # 允许一些余量
			
 
				+        
			
 
				+        if estimated_base64_size > max_memory_size:
			
 
				+            print(f"      ⚠️  视频文件过大，无法编码到内存 ({file_size / 1024 / 1024:.2f}MB)")
			
 
				+            return None
			
 
				+        
			
 
				+        loop = asyncio.get_event_loop()
			
 
				+        
			
 
				+        def _encode_video():
			
 
				+            """同步编码函数"""
			
 
				+            try:
			
 
				+                with open(video_path, 'rb') as f:
			
 
				+                    video_data = f.read()
			
 
				+                    base64_str = base64.b64encode(video_data).decode('utf-8')
			
 
				+                    return base64_str
			
 
				+            except MemoryError as e:
			
 
				+                print(f"      ❌ 内存不足，无法编码视频: {str(e)[:100]}")
			
 
				+                raise
			
 
				+            except Exception as e:
			
 
				+                print(f"      ❌ 读取/编码视频文件失败: {str(e)[:100]}")
			
 
				+                raise
			
 
				+        
			
 
				+        base64_str = await loop.run_in_executor(None, _encode_video)
			
 
				+        
			
 
				+        if base64_str:
			
 
				+            print(f"      🔐 视频编码完成: {len(base64_str) / 1024 / 1024:.2f}MB (base64)")
			
 
				+            return base64_str
			
 
				+        else:
			
 
				+            return None
			
 
				+            
			
 
				+    except MemoryError as e:
			
 
				+        print(f"      ❌ 内存不足，无法编码视频: {str(e)[:100]}")
			
 
				+        print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
			
 
				+        return None
			
 
				+    except Exception as e:
			
 
				+        print(f"      ❌ 视频编码失败: {str(e)[:100]}")
			
 
				+        print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def get_video_mime_type(video_path: str) -> str:
			
 
				+    """
			
 
				+    检测视频的MIME类型
			
 
				+
			
 
				+    Args:
			
 
				+        video_path: 视频文件路径
			
 
				+
			
 
				+    Returns:
			
 
				+        MIME类型字符串 (默认 "video/mp4")
			
 
				+    """
			
 
				+    mime_type, _ = mimetypes.guess_type(video_path)
			
 
				+    if mime_type and mime_type.startswith('video/'):
			
 
				+        return mime_type
			
 
				+    return "video/mp4"
			
 
				+
			
 
				+
			
 
				+# 视频不再清理，保留作为缓存
			
 
				+# async def cleanup_video(video_path: str):
			
 
				+#     """
			
 
				+#     清理临时视频文件
			
 
				+#
			
 
				+#     Args:
			
 
				+#         video_path: 要删除的视频路径
			
 
				+#     """
			
 
				+#     try:
			
 
				+#         if os.path.exists(video_path):
			
 
				+#             os.remove(video_path)
			
 
				+#             print(f"      🗑️  清理临时文件: {os.path.basename(video_path)}")
			
 
				+#     except Exception as e:
			
 
				+#         print(f"      ⚠️  清理失败: {str(e)[:50]}")
			
 
				+
			
 
				+
			
 
				+async def _prepare_media_content(post) -> tuple[list[str], Optional[str], str]:
			
 
				+    """
			
 
				+    统一准备媒体内容（图片+视频）
			
 
				+
			
 
				+    Args:
			
 
				+        post: Post对象
			
 
				+
			
 
				+    Returns:
			
 
				+        (image_urls, video_base64, video_mime_type)
			
 
				+    """
			
 
				+    # 提取图片（包括视频封面图）
			
 
				+    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
			
 
				+
			
 
				+    # 初始化视频相关变量
			
 
				+    video_base64 = None
			
 
				+    video_mime_type = "video/mp4"
			
 
				+
			
 
				+    # 处理视频
			
 
				+    if post.type == "video" and post.video:
			
 
				+        print(f"      🎬 检测到视频帖子 (ID: {post.note_id})")
			
 
				+        print(f"      📍 视频URL: {post.video[:80]}...")
			
 
				+        print(f"      🖼️  封面图数量: {len(image_urls)}")
			
 
				+        print(f"      ⏳ 开始下载视频...")
			
 
				+
			
 
				+        video_path = None
			
 
				+        try:
			
 
				+            # 下载视频
			
 
				+            video_path = await download_video(post.video, post.note_id)
			
 
				+
			
 
				+            if video_path and os.path.exists(video_path):
			
 
				+                try:
			
 
				+                    print(f"      🔄 开始编码视频...")
			
 
				+                    # 编码视频
			
 
				+                    video_base64 = await encode_video_to_base64(video_path)
			
 
				+                    if video_base64:
			
 
				+                        video_mime_type = get_video_mime_type(video_path)
			
 
				+                        print(f"      ✅ 视频处理成功！类型: {video_mime_type}")
			
 
				+                        print(f"      📦 将使用视频+封面图进行评估")
			
 
				+                    else:
			
 
				+                        print(f"      ⚠️  视频编码失败，降级使用封面图评估")
			
 
				+                except MemoryError as e:
			
 
				+                    print(f"      ⚠️  内存不足，无法处理视频: {str(e)[:100]}")
			
 
				+                    print(f"      📦 降级使用封面图评估")
			
 
				+                except Exception as e:
			
 
				+                    print(f"      ⚠️  视频编码异常: {str(e)[:100]}")
			
 
				+                    print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
			
 
				+                    print(f"      📦 降级使用封面图评估")
			
 
				+                # 视频不再清理，保留作为缓存
			
 
				+            else:
			
 
				+                print(f"      ⚠️  视频下载失败，降级使用封面图评估")
			
 
				+        except Exception as e:
			
 
				+            print(f"      ⚠️  视频处理流程异常: {str(e)[:100]}")
			
 
				+            print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
			
 
				+            print(f"      📦 降级使用封面图评估")
			
 
				+            # 视频不再清理，保留作为缓存
			
 
				+    elif post.type == "video" and not post.video:
			
 
				+        print(f"      ⚠️  视频类型帖子但video字段为空 (ID: {post.note_id})")
			
 
				+
			
 
				+    # 打印最终使用的媒体
			
 
				+    if post.type == "video":
			
 
				+        if video_base64:
			
 
				+            print(f"      📊 最终媒体: {len(image_urls)}张图片 + 1个视频")
			
 
				+        else:
			
 
				+            print(f"      📊 最终媒体: {len(image_urls)}张图片 (视频处理失败)")
			
 
				+
			
 
				+    return image_urls, video_base64, video_mime_type
			
 
				+
			
 
				+
			
 
				 def _get_cache_key(note_id: str) -> str:
			
 
				     """
			
 
				     生成缓存key
			
@@ -1102,6 +1355,8 @@ def _clean_json_response(content_text: str) -> str:
 
				 async def _call_openrouter_api(
			
 
				     prompt_text: str,
			
 
				     image_urls: list[str],
			
 
				+    video_base64: Optional[str] = None,
			
 
				+    video_mime_type: str = "video/mp4",
			
 
				     semaphore: Optional[asyncio.Semaphore] = None
			
 
				 ) -> dict:
			
 
				     """
			
@@ -1110,6 +1365,8 @@ async def _call_openrouter_api(
 
				     Args:
			
 
				         prompt_text: Prompt文本
			
 
				         image_urls: 图片URL列表
			
 
				+        video_base64: 视频的base64编码字符串（可选）
			
 
				+        video_mime_type: 视频MIME类型（默认video/mp4）
			
 
				         semaphore: 并发控制信号量
			
 
				 
			
 
				     Returns:
			
@@ -1123,6 +1380,11 @@ async def _call_openrouter_api(
 
				     for url in image_urls:
			
 
				         content.append({"type": "image_url", "image_url": {"url": url}})
			
 
				 
			
 
				+    # 添加视频（如果存在）
			
 
				+    if video_base64:
			
 
				+        data_url = f"data:{video_mime_type};base64,{video_base64}"
			
 
				+        content.append({"type": "video_url", "video_url": {"url": data_url}})
			
 
				+
			
 
				     payload = {
			
 
				         "model": MODEL_NAME,
			
 
				         "messages": [{"role": "user", "content": content}],
			
@@ -1195,10 +1457,8 @@ async def evaluate_is_knowledge(
 
				     Returns:
			
 
				         KnowledgeEvaluation 或 None（失败时）
			
 
				     """
			
 
				-    if post.type == "video":
			
 
				-        return None
			
 
				-
			
 
				-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
			
 
				+    # 准备媒体内容（图片+视频）
			
 
				+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
			
 
				 
			
 
				     try:
			
 
				         prompt_text = PROMPT1_IS_KNOWLEDGE.format(
			
@@ -1207,7 +1467,7 @@ async def evaluate_is_knowledge(
 
				             num_images=len(image_urls)
			
 
				         )
			
 
				 
			
 
				-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
			
 
				+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
			
 
				 
			
 
				         return KnowledgeEvaluation(
			
 
				             is_knowledge=data.get("is_knowledge", False),
			
@@ -1239,10 +1499,8 @@ async def evaluate_is_content_knowledge(
 
				     Returns:
			
 
				         ContentKnowledgeEvaluation 或 None（失败时）
			
 
				     """
			
 
				-    if post.type == "video":
			
 
				-        return None
			
 
				-
			
 
				-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
			
 
				+    # 准备媒体内容（图片+视频）
			
 
				+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
			
 
				 
			
 
				     try:
			
 
				         prompt_text = PROMPT2_IS_CONTENT_KNOWLEDGE.format(
			
@@ -1251,7 +1509,7 @@ async def evaluate_is_content_knowledge(
 
				             num_images=len(image_urls)
			
 
				         )
			
 
				 
			
 
				-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
			
 
				+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
			
 
				 
			
 
				         # 判定是否是内容知识：得分 >= 55 分
			
 
				         final_score = data.get("final_score", 0)
			
@@ -1288,10 +1546,8 @@ async def evaluate_purpose_match(
 
				     Returns:
			
 
				         PurposeEvaluation 或 None（失败时）
			
 
				     """
			
 
				-    if post.type == "video":
			
 
				-        return None
			
 
				-
			
 
				-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
			
 
				+    # 准备媒体内容（图片+视频）
			
 
				+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
			
 
				 
			
 
				     try:
			
 
				         prompt_text = PROMPT3_PURPOSE_MATCH.format(
			
@@ -1301,7 +1557,7 @@ async def evaluate_purpose_match(
 
				             num_images=len(image_urls)
			
 
				         )
			
 
				 
			
 
				-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
			
 
				+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
			
 
				 
			
 
				         # Prompt3的输出在"目的动机评估"键下
			
 
				         purpose_data = data.get("目的动机评估", {})
			
@@ -1336,10 +1592,8 @@ async def evaluate_category_match(
 
				     Returns:
			
 
				         CategoryEvaluation 或 None（失败时）
			
 
				     """
			
 
				-    if post.type == "video":
			
 
				-        return None
			
 
				-
			
 
				-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
			
 
				+    # 准备媒体内容（图片+视频）
			
 
				+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
			
 
				 
			
 
				     try:
			
 
				         prompt_text = PROMPT4_CATEGORY_MATCH.format(
			
@@ -1349,7 +1603,7 @@ async def evaluate_category_match(
 
				             num_images=len(image_urls)
			
 
				         )
			
 
				 
			
 
				-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
			
 
				+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
			
 
				 
			
 
				         # Prompt4的输出在"品类评估"键下
			
 
				         category_data = data.get("品类评估", {})
			
@@ -1416,10 +1670,6 @@ async def evaluate_post_v3(
 
				         (knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level)
			
 
				         任一步骤失败，后续结果为None
			
 
				     """
			
 
				-    if post.type == "video":
			
 
				-        print(f"      ⊗ 跳过视频帖子: {post.note_id}")
			
 
				-        return (None, None, None, None, None, None)
			
 
				-
			
 
				     # 检查缓存
			
 
				     if ENABLE_CACHE:
			
 
				         cached_result = _load_from_cache(post.note_id)
			
--- a/post_evaluator_v4_langgraph.py
+++ b/post_evaluator_v4_langgraph.py
@@ -45,7 +45,7 @@ FILE_PROCESS_TIMEOUT = 180
 
				 
			
 
				 # 代理配置（用于访问 Google File API）
			
 
				 HTTP_PROXY = "http://127.0.0.1:29758"
			
 
				-HTTPS_PROXY = "https://127.0.0.1:29758"
			
 
				+HTTPS_PROXY = "http://127.0.0.1:29758"
			
 
				 
			
 
				 # 缓存配置
			
 
				 ENABLE_CACHE = False
			
--- a/script/search_recommendations/xiaohongshu_search_recommendations.py
+++ b/script/search_recommendations/xiaohongshu_search_recommendations.py
@@ -96,7 +96,7 @@ class XiaohongshuSearchRecommendations:
 
				 
			
 
				         return None
			
 
				 
			
 
				-    def get_recommendations(self, keyword: str, timeout: int = 300, max_retries: int = 10, retry_delay: int = 2, use_cache: bool = True) -> Dict[str, Any]:
			
 
				+    def get_recommendations(self, keyword: str, timeout: int = 300, max_retries: int = 4, retry_delay: int = 7, use_cache: bool = True) -> Dict[str, Any]:
			
 
				         """
			
 
				         获取小红书搜索推荐词
			
 
				 
			
--- a/update_functions.py
+++ b/update_functions.py
@@ -0,0 +1,155 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+更新 4 个评估函数,使用新的 system/user prompt 结构
			
 
				+"""
			
 
				+
			
 
				+import re
			
 
				+
			
 
				+def read_file(filepath):
			
 
				+    with open(filepath, 'r', encoding='utf-8') as f:
			
 
				+        return f.read()
			
 
				+
			
 
				+def write_file(filepath, content):
			
 
				+    with open(filepath, 'w', encoding='utf-8') as f:
			
 
				+        f.write(content)
			
 
				+
			
 
				+def update_evaluate_is_knowledge(content):
			
 
				+    """更新 evaluate_is_knowledge 函数"""
			
 
				+    # 查找并替换
			
 
				+    old_pattern = r'''try:
			
 
				+        prompt_text = PROMPT1_IS_KNOWLEDGE\.format\(
			
 
				+            title=post\.title,
			
 
				+            body_text=post\.body_text or "",
			
 
				+            num_images=len\(image_urls\)
			
 
				+        \)
			
 
				+
			
 
				+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
			
 
				+
			
 
				+    new_code = '''try:
			
 
				+        user_prompt = PROMPT1_USER_TEMPLATE.format(
			
 
				+            title=post.title,
			
 
				+            body_text=post.body_text or "",
			
 
				+            num_images=len(image_urls)
			
 
				+        )
			
 
				+
			
 
				+        data = await _call_openrouter_api(
			
 
				+            system_prompt=PROMPT1_SYSTEM,
			
 
				+            user_prompt=user_prompt,
			
 
				+            image_urls=image_urls,
			
 
				+            semaphore=semaphore
			
 
				+        )'''
			
 
				+
			
 
				+    content = re.sub(old_pattern, new_code, content)
			
 
				+    print("✅ evaluate_is_knowledge 更新完成")
			
 
				+    return content
			
 
				+
			
 
				+def update_evaluate_is_content_knowledge(content):
			
 
				+    """更新 evaluate_is_content_knowledge 函数"""
			
 
				+    old_pattern = r'''try:
			
 
				+        prompt_text = PROMPT2_IS_CONTENT_KNOWLEDGE\.format\(
			
 
				+            title=post\.title,
			
 
				+            body_text=post\.body_text or "",
			
 
				+            num_images=len\(image_urls\)
			
 
				+        \)
			
 
				+
			
 
				+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
			
 
				+
			
 
				+    new_code = '''try:
			
 
				+        user_prompt = PROMPT2_USER_TEMPLATE.format(
			
 
				+            title=post.title,
			
 
				+            body_text=post.body_text or "",
			
 
				+            num_images=len(image_urls)
			
 
				+        )
			
 
				+
			
 
				+        data = await _call_openrouter_api(
			
 
				+            system_prompt=PROMPT2_SYSTEM,
			
 
				+            user_prompt=user_prompt,
			
 
				+            image_urls=image_urls,
			
 
				+            semaphore=semaphore
			
 
				+        )'''
			
 
				+
			
 
				+    content = re.sub(old_pattern, new_code, content)
			
 
				+    print("✅ evaluate_is_content_knowledge 更新完成")
			
 
				+    return content
			
 
				+
			
 
				+def update_evaluate_purpose_match(content):
			
 
				+    """更新 evaluate_purpose_match 函数"""
			
 
				+    old_pattern = r'''try:
			
 
				+        prompt_text = PROMPT3_PURPOSE_MATCH\.format\(
			
 
				+            original_query=original_query,
			
 
				+            title=post\.title,
			
 
				+            body_text=post\.body_text or "",
			
 
				+            num_images=len\(image_urls\)
			
 
				+        \)
			
 
				+
			
 
				+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
			
 
				+
			
 
				+    new_code = '''try:
			
 
				+        user_prompt = PROMPT3_USER_TEMPLATE.format(
			
 
				+            original_query=original_query,
			
 
				+            title=post.title,
			
 
				+            body_text=post.body_text or "",
			
 
				+            num_images=len(image_urls)
			
 
				+        )
			
 
				+
			
 
				+        data = await _call_openrouter_api(
			
 
				+            system_prompt=PROMPT3_SYSTEM,
			
 
				+            user_prompt=user_prompt,
			
 
				+            image_urls=image_urls,
			
 
				+            semaphore=semaphore
			
 
				+        )'''
			
 
				+
			
 
				+    content = re.sub(old_pattern, new_code, content)
			
 
				+    print("✅ evaluate_purpose_match 更新完成")
			
 
				+    return content
			
 
				+
			
 
				+def update_evaluate_category_match(content):
			
 
				+    """更新 evaluate_category_match 函数"""
			
 
				+    old_pattern = r'''try:
			
 
				+        prompt_text = PROMPT4_CATEGORY_MATCH\.format\(
			
 
				+            original_query=original_query,
			
 
				+            title=post\.title,
			
 
				+            body_text=post\.body_text or "",
			
 
				+            num_images=len\(image_urls\)
			
 
				+        \)
			
 
				+
			
 
				+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
			
 
				+
			
 
				+    new_code = '''try:
			
 
				+        user_prompt = PROMPT4_USER_TEMPLATE.format(
			
 
				+            original_query=original_query,
			
 
				+            title=post.title,
			
 
				+            body_text=post.body_text or "",
			
 
				+            num_images=len(image_urls)
			
 
				+        )
			
 
				+
			
 
				+        data = await _call_openrouter_api(
			
 
				+            system_prompt=PROMPT4_SYSTEM,
			
 
				+            user_prompt=user_prompt,
			
 
				+            image_urls=image_urls,
			
 
				+            semaphore=semaphore
			
 
				+        )'''
			
 
				+
			
 
				+    content = re.sub(old_pattern, new_code, content)
			
 
				+    print("✅ evaluate_category_match 更新完成")
			
 
				+    return content
			
 
				+
			
 
				+def main():
			
 
				+    filepath = 'post_evaluator_v3.py'
			
 
				+
			
 
				+    print("📖 读取文件...")
			
 
				+    content = read_file(filepath)
			
 
				+
			
 
				+    print("\n🔧 更新评估函数...")
			
 
				+    content = update_evaluate_is_knowledge(content)
			
 
				+    content = update_evaluate_is_content_knowledge(content)
			
 
				+    content = update_evaluate_purpose_match(content)
			
 
				+    content = update_evaluate_category_match(content)
			
 
				+
			
 
				+    print("\n💾 保存文件...")
			
 
				+    write_file(filepath, content)
			
 
				+
			
 
				+    print("\n✅ 所有评估函数更新完成!")
			
 
				+
			
 
# Run the in-place update only when executed as a script, not on import.
if __name__ == '__main__':
    main()
			
--- a/video_utils.py
+++ b/video_utils.py
@@ -0,0 +1,178 @@
 
				+"""
			
 
				+视频处理工具模块
			
 
				+
			
 
				+提供视频下载、Base64 编码等功能，用于支持视频评估
			
 
				+"""
			
 
				+
			
 
				+import asyncio
			
 
				+import base64
			
 
				+import hashlib
			
 
				+import os
			
 
				+from pathlib import Path
			
 
				+from typing import Optional
			
 
				+import requests
			
 
				+
			
 
				+
			
 
# Configuration
# Relative path, so the cache directory is resolved against the current
# working directory of the process.
VIDEO_CACHE_DIR = Path(".video_cache")
VIDEO_MAX_SIZE_MB = 50  # Maximum accepted video size (MB)
VIDEO_DOWNLOAD_TIMEOUT = 120  # Download timeout passed to requests (seconds)
MAX_RETRIES = 2  # Maximum number of retries (total attempts = MAX_RETRIES + 1)
			
 
				+
			
 
				+
			
 
				+async def download_video(
			
 
				+    video_url: str,
			
 
				+    cache_dir: Path = VIDEO_CACHE_DIR
			
 
				+) -> Optional[Path]:
			
 
				+    """
			
 
				+    异步下载视频文件
			
 
				+
			
 
				+    Args:
			
 
				+        video_url: 视频URL
			
 
				+        cache_dir: 缓存目录
			
 
				+
			
 
				+    Returns:
			
 
				+        视频文件路径，失败返回 None
			
 
				+    """
			
 
				+    # 创建缓存目录
			
 
				+    cache_dir.mkdir(exist_ok=True)
			
 
				+
			
 
				+    # 生成缓存文件名（基于URL hash）
			
 
				+    url_hash = hashlib.md5(video_url.encode()).hexdigest()
			
 
				+    cache_path = cache_dir / f"{url_hash}.mp4"
			
 
				+
			
 
				+    # 检查缓存
			
 
				+    if cache_path.exists():
			
 
				+        file_size_mb = cache_path.stat().st_size / (1024 * 1024)
			
 
				+        print(f"      ♻️  使用缓存视频: {file_size_mb:.2f}MB")
			
 
				+        return cache_path
			
 
				+
			
 
				+    # 异步下载
			
 
				+    loop = asyncio.get_event_loop()
			
 
				+
			
 
				+    for attempt in range(MAX_RETRIES + 1):
			
 
				+        try:
			
 
				+            print(f"      📥 下载视频... (尝试 {attempt + 1}/{MAX_RETRIES + 1})")
			
 
				+
			
 
				+            # 使用 executor 执行同步下载
			
 
				+            response = await loop.run_in_executor(
			
 
				+                None,
			
 
				+                lambda: requests.get(
			
 
				+                    video_url,
			
 
				+                    timeout=VIDEO_DOWNLOAD_TIMEOUT,
			
 
				+                    stream=True,
			
 
				+                    headers={"User-Agent": "Mozilla/5.0"}
			
 
				+                )
			
 
				+            )
			
 
				+
			
 
				+            response.raise_for_status()
			
 
				+
			
 
				+            # 检查文件大小
			
 
				+            content_length = response.headers.get('content-length')
			
 
				+            if content_length:
			
 
				+                size_mb = int(content_length) / (1024 * 1024)
			
 
				+                if size_mb > VIDEO_MAX_SIZE_MB:
			
 
				+                    print(f"      ⚠️  视频过大: {size_mb:.2f}MB > {VIDEO_MAX_SIZE_MB}MB")
			
 
				+                    return None
			
 
				+
			
 
				+            # 保存到临时文件
			
 
				+            temp_path = cache_path.with_suffix('.tmp')
			
 
				+
			
 
				+            def save_chunks():
			
 
				+                with open(temp_path, 'wb') as f:
			
 
				+                    for chunk in response.iter_content(chunk_size=8192):
			
 
				+                        if chunk:
			
 
				+                            f.write(chunk)
			
 
				+
			
 
				+            await loop.run_in_executor(None, save_chunks)
			
 
				+
			
 
				+            # 检查实际文件大小
			
 
				+            actual_size_mb = temp_path.stat().st_size / (1024 * 1024)
			
 
				+            if actual_size_mb > VIDEO_MAX_SIZE_MB:
			
 
				+                print(f"      ⚠️  视频过大: {actual_size_mb:.2f}MB > {VIDEO_MAX_SIZE_MB}MB")
			
 
				+                temp_path.unlink()
			
 
				+                return None
			
 
				+
			
 
				+            # 重命名为正式文件
			
 
				+            temp_path.rename(cache_path)
			
 
				+
			
 
				+            print(f"      ✅ 视频下载成功: {actual_size_mb:.2f}MB")
			
 
				+            return cache_path
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            if attempt < MAX_RETRIES:
			
 
				+                wait_time = 2 * (attempt + 1)
			
 
				+                print(f"      ⚠️  下载失败，{wait_time}秒后重试: {str(e)[:50]}")
			
 
				+                await asyncio.sleep(wait_time)
			
 
				+            else:
			
 
				+                print(f"      ❌ 视频下载失败: {str(e)[:100]}")
			
 
				+                # 清理临时文件
			
 
				+                if cache_path.with_suffix('.tmp').exists():
			
 
				+                    cache_path.with_suffix('.tmp').unlink()
			
 
				+                return None
			
 
				+
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+async def encode_video_to_base64(video_path: Path) -> Optional[str]:
			
 
				+    """
			
 
				+    异步将视频文件编码为 Base64 data URL
			
 
				+
			
 
				+    Args:
			
 
				+        video_path: 视频文件路径
			
 
				+
			
 
				+    Returns:
			
 
				+        Base64 编码的 data URL，失败返回 None
			
 
				+    """
			
 
				+    try:
			
 
				+        loop = asyncio.get_event_loop()
			
 
				+
			
 
				+        # 异步读取文件
			
 
				+        def read_file():
			
 
				+            with open(video_path, 'rb') as f:
			
 
				+                return f.read()
			
 
				+
			
 
				+        print(f"      🔄 编码视频为 Base64...")
			
 
				+        video_bytes = await loop.run_in_executor(None, read_file)
			
 
				+
			
 
				+        # Base64 编码
			
 
				+        def encode():
			
 
				+            base64_str = base64.b64encode(video_bytes).decode('utf-8')
			
 
				+            return f"data:video/mp4;base64,{base64_str}"
			
 
				+
			
 
				+        data_url = await loop.run_in_executor(None, encode)
			
 
				+
			
 
				+        encoded_size_mb = len(data_url) / (1024 * 1024)
			
 
				+        print(f"      ✅ Base64 编码完成: {encoded_size_mb:.2f}MB")
			
 
				+
			
 
				+        return data_url
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        print(f"      ❌ Base64 编码失败: {str(e)[:100]}")
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def cleanup_video_cache(cache_dir: Path = VIDEO_CACHE_DIR, days: int = 7):
			
 
				+    """
			
 
				+    清理超过指定天数的视频缓存
			
 
				+
			
 
				+    Args:
			
 
				+        cache_dir: 缓存目录
			
 
				+        days: 保留天数
			
 
				+    """
			
 
				+    import time
			
 
				+
			
 
				+    if not cache_dir.exists():
			
 
				+        return
			
 
				+
			
 
				+    now = time.time()
			
 
				+    cutoff = now - (days * 24 * 60 * 60)
			
 
				+
			
 
				+    removed_count = 0
			
 
				+    for file_path in cache_dir.glob("*.mp4"):
			
 
				+        if file_path.stat().st_mtime < cutoff:
			
 
				+            file_path.unlink()
			
 
				+            removed_count += 1
			
 
				+
			
 
				+    if removed_count > 0:
			
 
				+        print(f"🗑️  清理了 {removed_count} 个过期视频缓存")