刘立冬 2 주 전
부모
커밋
678f2095d8
6개의 변경된 파일에 793개 추가, 26개 삭제
  1. 184 0
      extract_top10_multimodal.py
  2. 274 24
      post_evaluator_v3.py
  3. 1 1
      post_evaluator_v4_langgraph.py
  4. 1 1
      script/search_recommendations/xiaohongshu_search_recommendations.py
  5. 155 0
      update_functions.py
  6. 178 0
      video_utils.py

+ 184 - 0
extract_top10_multimodal.py

@@ -0,0 +1,184 @@
+"""
+从 run_context_v3.json 中提取 top10 帖子并进行多模态解析
+
+功能:
+1. 读取 run_context_v3.json
+2. 提取所有帖子,按 final_score 排序,取 top10
+3. 使用 multimodal_extractor 进行图片内容解析
+4. 保存结果到独立的 JSON 文件
+"""
+
+import asyncio
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+
+# 导入必要的模块
+from knowledge_search_traverse import Post
+from multimodal_extractor import extract_all_posts
+
+
def load_run_context(json_path: str) -> dict:
    """Load and parse the run_context_v3.json file.

    Args:
        json_path: Path to the JSON context file.

    Returns:
        The parsed JSON content as a dict.
    """
    raw_text = Path(json_path).read_text(encoding='utf-8')
    return json.loads(raw_text)
+
+
def extract_all_posts_from_context(context_data: dict) -> list[dict]:
    """Collect every post nested under rounds -> search_results -> post_list.

    Args:
        context_data: Parsed run_context JSON.

    Returns:
        A flat list of post dicts, in traversal order.
    """
    return [
        post
        for round_data in context_data.get('rounds', [])
        for search_result in round_data.get('search_results', [])
        for post in search_result.get('post_list', [])
    ]
+
+
def filter_and_sort_top10(posts: list[dict], top_n: int = 10) -> list[dict]:
    """Return the highest-scoring posts ordered by final_score descending.

    Posts whose 'final_score' is missing or None are excluded before ranking.

    Args:
        posts: Post dicts, each possibly carrying a 'final_score'.
        top_n: Number of posts to keep (default 10, preserving the original
            behavior; parameterized so other cutoffs can reuse this helper).

    Returns:
        At most top_n post dicts, sorted by final_score descending.
    """
    # Drop posts with no usable score.
    valid_posts = [p for p in posts if p.get('final_score') is not None]

    # In-place sort: every remaining post is guaranteed to have a score.
    valid_posts.sort(key=lambda p: p['final_score'], reverse=True)

    return valid_posts[:top_n]
+
+
def convert_to_post_objects(post_dicts: list[dict]) -> list[Post]:
    """Build Post objects from raw post dicts.

    The source data lacks a 'type' field, so every Post is created with
    type='normal'.

    Args:
        post_dicts: Raw post dicts from the run context.

    Returns:
        One Post per input dict, in the same order.
    """
    return [
        Post(
            note_id=item.get('note_id', ''),
            note_url=item.get('note_url', ''),
            title=item.get('title', ''),
            body_text=item.get('body_text', ''),
            # Default value because the original data lacks this field.
            type='normal',
            images=item.get('images', []),
            video=item.get('video', ''),
            interact_info=item.get('interact_info', {}),
        )
        for item in post_dicts
    ]
+
+
def save_extraction_results(results: dict, output_path: str, top10_posts: list[dict]):
    """Persist multimodal extraction results as a JSON file.

    Args:
        results: Mapping of note_id -> extraction object (attributes used:
            note_id, note_url, title, body_text, type, extraction_time, images;
            each image exposes image_index, original_url, description,
            extract_text).
        output_path: Destination JSON file path.
        top10_posts: Original post dicts, used to recover each post's
            final_score by note_id.
    """
    # Index the original posts once instead of scanning the list for every
    # result (the original code was O(results * posts)).
    posts_by_id = {p.get('note_id'): p for p in top10_posts}

    output_data = {
        'total_extracted': len(results),
        'extraction_results': []
    }

    for note_id, extraction in results.items():
        original_post = posts_by_id.get(note_id)

        result_entry = {
            'note_id': extraction.note_id,
            'note_url': extraction.note_url,
            'title': extraction.title,
            'body_text': extraction.body_text,
            'type': extraction.type,
            'extraction_time': extraction.extraction_time,
            # final_score lives only in the original post data.
            'final_score': original_post.get('final_score') if original_post else None,
            'images': [
                {
                    'image_index': img.image_index,
                    'original_url': img.original_url,
                    'description': img.description,
                    'extract_text': img.extract_text
                }
                for img in extraction.images
            ]
        }

        output_data['extraction_results'].append(result_entry)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"\n✅ 结果已保存到: {output_path}")
+
+
async def main(context_file_path: str, output_file_path: str):
    """Run the full pipeline: load context, rank top10 posts, extract, save.

    Args:
        context_file_path: Path to run_context_v3.json.
        output_file_path: Destination path for the extraction-result JSON.
    """
    print("=" * 80)
    print("多模态解析 - Top10 帖子")
    print("=" * 80)

    # 1. Load the run context
    print(f"\n📂 加载文件: {context_file_path}")
    context_data = load_run_context(context_file_path)

    # 2. Collect all posts
    print(f"\n🔍 提取所有帖子...")
    all_posts = extract_all_posts_from_context(context_data)
    print(f"   共找到 {len(all_posts)} 个帖子")

    # 3. Filter and rank to top10
    print(f"\n📊 筛选 top10 帖子...")
    top10_posts = filter_and_sort_top10(all_posts)
    if not top10_posts:
        # Guard: indexing top10_posts[0]/[-1] below would raise IndexError
        # when no post carries a final_score.
        print("   ⚠️  没有带 final_score 的帖子,流程结束")
        return
    print(f"   Top10 帖子得分范围: {top10_posts[-1].get('final_score')} ~ {top10_posts[0].get('final_score')}")

    print("\n   Top10 帖子列表:")
    for i, post in enumerate(top10_posts, 1):
        # Title may be missing/None; avoid slicing None.
        title = (post.get('title') or '')[:40]
        print(f"   {i}. [{post.get('final_score')}] {title}... ({post.get('note_id')})")

    # 4. Convert to Post objects
    print(f"\n🔄 转换为 Post 对象...")
    post_objects = convert_to_post_objects(top10_posts)
    print(f"   成功转换 {len(post_objects)} 个 Post 对象")

    # 5. Multimodal extraction (the limits mirror extract_all_posts defaults)
    print(f"\n🖼️  开始多模态图片内容解析...")
    print(f"   (并发限制: 5, 每个帖子最多 10 张图片)")
    extraction_results = await extract_all_posts(post_objects, max_concurrent=5)

    # 6. Save results
    print(f"\n💾 保存解析结果...")
    save_extraction_results(extraction_results, output_file_path, top10_posts)

    print("\n" + "=" * 80)
    print("✅ 处理完成!")
    print("=" * 80)
+
+
if __name__ == "__main__":
    # Default path configuration
    DEFAULT_CONTEXT_FILE = "input/test_case/output/knowledge_search_traverse/20251114/005215_b1/run_context_v3.json"
    DEFAULT_OUTPUT_FILE = "input/test_case/output/knowledge_search_traverse/20251114/005215_b1/multimodal_extraction_top10.json"

    # Positional command-line arguments override the defaults.
    argv = sys.argv
    context_file = argv[1] if len(argv) > 1 else DEFAULT_CONTEXT_FILE
    output_file = argv[2] if len(argv) > 2 else DEFAULT_OUTPUT_FILE

    # Bail out early when the input file is missing.
    if not os.path.exists(context_file):
        print(f"❌ 错误: 文件不存在 - {context_file}")
        sys.exit(1)

    asyncio.run(main(context_file, output_file))

+ 274 - 24
post_evaluator_v3.py

@@ -10,8 +10,11 @@
 """
 """
 
 
 import asyncio
 import asyncio
+import base64
 import json
 import json
+import mimetypes
 import os
 import os
+import traceback
 from datetime import datetime
 from datetime import datetime
 from typing import Optional
 from typing import Optional
 from pydantic import BaseModel, Field
 from pydantic import BaseModel, Field
@@ -26,6 +29,13 @@ API_TIMEOUT = 120
 ENABLE_CACHE = True  # 是否启用评估结果缓存
 ENABLE_CACHE = True  # 是否启用评估结果缓存
 CACHE_DIR = ".evaluation_cache"  # 缓存目录
 CACHE_DIR = ".evaluation_cache"  # 缓存目录
 
 
+# 视频处理配置
+MAX_VIDEO_SIZE_MB = 60  # 最大视频大小限制(MB)
+VIDEO_DOWNLOAD_TIMEOUT = 60  # 视频下载超时(秒)
+TEMP_VIDEO_DIR = "/tmp/kg_agent_videos"  # 临时视频存储目录(同时也是缓存目录)
+VIDEO_CHUNK_SIZE = 8192  # 下载分块大小(字节)
+MAX_VIDEO_DOWNLOAD_RETRIES = 2  # 下载重试次数
+
 # ============================================================================
 # ============================================================================
 # 数据模型
 # 数据模型
 # ============================================================================
 # ============================================================================
@@ -987,6 +997,249 @@ PROMPT4_CATEGORY_MATCH = """# Prompt 2: 多模态内容品类匹配评估
 # 辅助函数
 # 辅助函数
 # ============================================================================
 # ============================================================================
 
 
+# 视频处理函数
+async def download_video(video_url: str, note_id: str) -> Optional[str]:
+    """
+    异步下载视频到本地文件(支持缓存)
+
+    Args:
+        video_url: 视频URL
+        note_id: 帖子ID(用于文件命名)
+
+    Returns:
+        本地文件路径,失败返回None
+    """
+    os.makedirs(TEMP_VIDEO_DIR, exist_ok=True)
+    video_path = os.path.join(TEMP_VIDEO_DIR, f"{note_id}.mp4")
+    
+    # 检查视频缓存(如果文件已存在,直接返回)
+    if os.path.exists(video_path):
+        file_size = os.path.getsize(video_path)
+        print(f"      ♻️  使用缓存的视频: {file_size / 1024 / 1024:.2f}MB")
+        return video_path
+
+    for attempt in range(MAX_VIDEO_DOWNLOAD_RETRIES + 1):
+        try:
+            loop = asyncio.get_event_loop()
+            response = await loop.run_in_executor(
+                None,
+                lambda: requests.get(
+                    video_url,
+                    stream=True,
+                    timeout=VIDEO_DOWNLOAD_TIMEOUT
+                )
+            )
+
+            if response.status_code != 200:
+                raise Exception(f"HTTP {response.status_code}")
+
+            # 检查Content-Length header(如果存在)
+            content_length = response.headers.get('content-length')
+            if content_length:
+                size_mb = int(content_length) / 1024 / 1024
+                print(f"      📊 视频大小: {size_mb:.2f}MB")
+                if size_mb > MAX_VIDEO_SIZE_MB:
+                    print(f"      ⚠️  视频超过{MAX_VIDEO_SIZE_MB}MB限制,跳过下载")
+                    return None
+
+            # 流式下载,检查大小
+            current_size = 0
+            max_size = MAX_VIDEO_SIZE_MB * 1024 * 1024
+
+            with open(video_path, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=VIDEO_CHUNK_SIZE):
+                    if chunk:
+                        current_size += len(chunk)
+                        if current_size > max_size:
+                            # 删除不完整的文件
+                            if os.path.exists(video_path):
+                                try:
+                                    os.remove(video_path)
+                                except:
+                                    pass
+                            print(f"      ⚠️  视频超过{MAX_VIDEO_SIZE_MB}MB限制")
+                            return None
+                        f.write(chunk)
+
+            print(f"      📥 视频下载成功: {current_size / 1024 / 1024:.2f}MB")
+            return video_path
+
+        except Exception as e:
+            if attempt < MAX_VIDEO_DOWNLOAD_RETRIES:
+                wait_time = 2 * (attempt + 1)
+                print(f"      ⚠️  下载失败,{wait_time}秒后重试 ({attempt + 1}/{MAX_VIDEO_DOWNLOAD_RETRIES}) - {str(e)[:100]}")
+                await asyncio.sleep(wait_time)
+            else:
+                print(f"      ❌ 视频下载失败: {str(e)[:100]}")
+                print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
+                # 清理可能的不完整文件
+                if os.path.exists(video_path):
+                    try:
+                        os.remove(video_path)
+                    except:
+                        pass
+                return None
+
+
async def encode_video_to_base64(video_path: str) -> Optional[str]:
    """
    Encode a video file as a base64 string.

    Args:
        video_path: Local video file path.

    Returns:
        Base64-encoded string, or None on failure / oversized file.
    """
    try:
        if not os.path.exists(video_path):
            print(f"      ❌ 视频文件不存在: {video_path}")
            return None

        # base64 output is ~33% larger than the input; refuse files whose
        # encoded form would clearly blow past the configured cap.
        file_size = os.path.getsize(video_path)
        estimated_base64_size = file_size * 1.33
        max_memory_size = MAX_VIDEO_SIZE_MB * 1024 * 1024 * 1.5  # headroom

        if estimated_base64_size > max_memory_size:
            print(f"      ⚠️  视频文件过大,无法编码到内存 ({file_size / 1024 / 1024:.2f}MB)")
            return None

        def _encode_video() -> str:
            # Runs in a worker thread so the large read/encode does not
            # block the event loop. Errors propagate to the outer handlers,
            # which do the logging (the original printed twice: once here
            # and once after re-raising).
            with open(video_path, 'rb') as f:
                return base64.b64encode(f.read()).decode('utf-8')

        loop = asyncio.get_running_loop()
        base64_str = await loop.run_in_executor(None, _encode_video)

        if not base64_str:
            # Empty file -> empty encoding; treated as failure, as before.
            return None

        print(f"      🔐 视频编码完成: {len(base64_str) / 1024 / 1024:.2f}MB (base64)")
        return base64_str

    except MemoryError as e:
        print(f"      ❌ 内存不足,无法编码视频: {str(e)[:100]}")
        print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
        return None
    except Exception as e:
        print(f"      ❌ 视频编码失败: {str(e)[:100]}")
        print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
        return None
+
+
def get_video_mime_type(video_path: str) -> str:
    """
    Detect a video file's MIME type from its file name.

    Args:
        video_path: Video file path.

    Returns:
        A 'video/...' MIME type string; "video/mp4" when the extension is
        unknown or maps to a non-video type.
    """
    guessed, _encoding = mimetypes.guess_type(video_path)
    if not (guessed and guessed.startswith('video/')):
        return "video/mp4"
    return guessed
+
+
+# 视频不再清理,保留作为缓存
+# async def cleanup_video(video_path: str):
+#     """
+#     清理临时视频文件
+#
+#     Args:
+#         video_path: 要删除的视频路径
+#     """
+#     try:
+#         if os.path.exists(video_path):
+#             os.remove(video_path)
+#             print(f"      🗑️  清理临时文件: {os.path.basename(video_path)}")
+#     except Exception as e:
+#         print(f"      ⚠️  清理失败: {str(e)[:50]}")
+
+
async def _prepare_media_content(post) -> tuple[list[str], Optional[str], str]:
    """
    Prepare media content (images + video) for a post in one place.

    Args:
        post: Post object (reads .type, .video, .images, .note_id).

    Returns:
        (image_urls, video_base64, video_mime_type); video_base64 is None
        whenever the post has no usable video or any processing step failed,
        in which case evaluation falls back to the cover images only.
    """
    # Extract images (these include video cover frames), capped at
    # MAX_IMAGES_PER_POST.
    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []

    # Video defaults: no payload, generic MIME type.
    video_base64 = None
    video_mime_type = "video/mp4"

    # Only video posts with a non-empty URL go through download + encode.
    if post.type == "video" and post.video:
        print(f"      🎬 检测到视频帖子 (ID: {post.note_id})")
        print(f"      📍 视频URL: {post.video[:80]}...")
        print(f"      🖼️  封面图数量: {len(image_urls)}")
        print(f"      ⏳ 开始下载视频...")

        video_path = None
        try:
            # Download the video (may be served from the local cache).
            video_path = await download_video(post.video, post.note_id)

            if video_path and os.path.exists(video_path):
                try:
                    print(f"      🔄 开始编码视频...")
                    # Encode for inline transmission in the API payload.
                    video_base64 = await encode_video_to_base64(video_path)
                    if video_base64:
                        video_mime_type = get_video_mime_type(video_path)
                        print(f"      ✅ 视频处理成功!类型: {video_mime_type}")
                        print(f"      📦 将使用视频+封面图进行评估")
                    else:
                        print(f"      ⚠️  视频编码失败,降级使用封面图评估")
                except MemoryError as e:
                    # Out of memory: degrade to cover-image evaluation.
                    print(f"      ⚠️  内存不足,无法处理视频: {str(e)[:100]}")
                    print(f"      📦 降级使用封面图评估")
                except Exception as e:
                    print(f"      ⚠️  视频编码异常: {str(e)[:100]}")
                    print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
                    print(f"      📦 降级使用封面图评估")
                # Downloaded file is intentionally kept as a cache (no cleanup).
            else:
                print(f"      ⚠️  视频下载失败,降级使用封面图评估")
        except Exception as e:
            print(f"      ⚠️  视频处理流程异常: {str(e)[:100]}")
            print(f"      📋 错误详情: {traceback.format_exc()[:300]}")
            print(f"      📦 降级使用封面图评估")
            # Downloaded file is intentionally kept as a cache (no cleanup).
    elif post.type == "video" and not post.video:
        print(f"      ⚠️  视频类型帖子但video字段为空 (ID: {post.note_id})")

    # Log which media will actually be sent for evaluation.
    if post.type == "video":
        if video_base64:
            print(f"      📊 最终媒体: {len(image_urls)}张图片 + 1个视频")
        else:
            print(f"      📊 最终媒体: {len(image_urls)}张图片 (视频处理失败)")

    return image_urls, video_base64, video_mime_type
+
+
 def _get_cache_key(note_id: str) -> str:
 def _get_cache_key(note_id: str) -> str:
     """
     """
     生成缓存key
     生成缓存key
@@ -1102,6 +1355,8 @@ def _clean_json_response(content_text: str) -> str:
 async def _call_openrouter_api(
 async def _call_openrouter_api(
     prompt_text: str,
     prompt_text: str,
     image_urls: list[str],
     image_urls: list[str],
+    video_base64: Optional[str] = None,
+    video_mime_type: str = "video/mp4",
     semaphore: Optional[asyncio.Semaphore] = None
     semaphore: Optional[asyncio.Semaphore] = None
 ) -> dict:
 ) -> dict:
     """
     """
@@ -1110,6 +1365,8 @@ async def _call_openrouter_api(
     Args:
     Args:
         prompt_text: Prompt文本
         prompt_text: Prompt文本
         image_urls: 图片URL列表
         image_urls: 图片URL列表
+        video_base64: 视频的base64编码字符串(可选)
+        video_mime_type: 视频MIME类型(默认video/mp4)
         semaphore: 并发控制信号量
         semaphore: 并发控制信号量
 
 
     Returns:
     Returns:
@@ -1123,6 +1380,11 @@ async def _call_openrouter_api(
     for url in image_urls:
     for url in image_urls:
         content.append({"type": "image_url", "image_url": {"url": url}})
         content.append({"type": "image_url", "image_url": {"url": url}})
 
 
+    # 添加视频(如果存在)
+    if video_base64:
+        data_url = f"data:{video_mime_type};base64,{video_base64}"
+        content.append({"type": "video_url", "video_url": {"url": data_url}})
+
     payload = {
     payload = {
         "model": MODEL_NAME,
         "model": MODEL_NAME,
         "messages": [{"role": "user", "content": content}],
         "messages": [{"role": "user", "content": content}],
@@ -1195,10 +1457,8 @@ async def evaluate_is_knowledge(
     Returns:
     Returns:
         KnowledgeEvaluation 或 None(失败时)
         KnowledgeEvaluation 或 None(失败时)
     """
     """
-    if post.type == "video":
-        return None
-
-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
+    # 准备媒体内容(图片+视频)
+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
 
 
     try:
     try:
         prompt_text = PROMPT1_IS_KNOWLEDGE.format(
         prompt_text = PROMPT1_IS_KNOWLEDGE.format(
@@ -1207,7 +1467,7 @@ async def evaluate_is_knowledge(
             num_images=len(image_urls)
             num_images=len(image_urls)
         )
         )
 
 
-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
 
 
         return KnowledgeEvaluation(
         return KnowledgeEvaluation(
             is_knowledge=data.get("is_knowledge", False),
             is_knowledge=data.get("is_knowledge", False),
@@ -1239,10 +1499,8 @@ async def evaluate_is_content_knowledge(
     Returns:
     Returns:
         ContentKnowledgeEvaluation 或 None(失败时)
         ContentKnowledgeEvaluation 或 None(失败时)
     """
     """
-    if post.type == "video":
-        return None
-
-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
+    # 准备媒体内容(图片+视频)
+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
 
 
     try:
     try:
         prompt_text = PROMPT2_IS_CONTENT_KNOWLEDGE.format(
         prompt_text = PROMPT2_IS_CONTENT_KNOWLEDGE.format(
@@ -1251,7 +1509,7 @@ async def evaluate_is_content_knowledge(
             num_images=len(image_urls)
             num_images=len(image_urls)
         )
         )
 
 
-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
 
 
         # 判定是否是内容知识:得分 >= 55 分
         # 判定是否是内容知识:得分 >= 55 分
         final_score = data.get("final_score", 0)
         final_score = data.get("final_score", 0)
@@ -1288,10 +1546,8 @@ async def evaluate_purpose_match(
     Returns:
     Returns:
         PurposeEvaluation 或 None(失败时)
         PurposeEvaluation 或 None(失败时)
     """
     """
-    if post.type == "video":
-        return None
-
-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
+    # 准备媒体内容(图片+视频)
+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
 
 
     try:
     try:
         prompt_text = PROMPT3_PURPOSE_MATCH.format(
         prompt_text = PROMPT3_PURPOSE_MATCH.format(
@@ -1301,7 +1557,7 @@ async def evaluate_purpose_match(
             num_images=len(image_urls)
             num_images=len(image_urls)
         )
         )
 
 
-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
 
 
         # Prompt3的输出在"目的动机评估"键下
         # Prompt3的输出在"目的动机评估"键下
         purpose_data = data.get("目的动机评估", {})
         purpose_data = data.get("目的动机评估", {})
@@ -1336,10 +1592,8 @@ async def evaluate_category_match(
     Returns:
     Returns:
         CategoryEvaluation 或 None(失败时)
         CategoryEvaluation 或 None(失败时)
     """
     """
-    if post.type == "video":
-        return None
-
-    image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
+    # 准备媒体内容(图片+视频)
+    image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
 
 
     try:
     try:
         prompt_text = PROMPT4_CATEGORY_MATCH.format(
         prompt_text = PROMPT4_CATEGORY_MATCH.format(
@@ -1349,7 +1603,7 @@ async def evaluate_category_match(
             num_images=len(image_urls)
             num_images=len(image_urls)
         )
         )
 
 
-        data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
+        data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
 
 
         # Prompt4的输出在"品类评估"键下
         # Prompt4的输出在"品类评估"键下
         category_data = data.get("品类评估", {})
         category_data = data.get("品类评估", {})
@@ -1416,10 +1670,6 @@ async def evaluate_post_v3(
         (knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level)
         (knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level)
         任一步骤失败,后续结果为None
         任一步骤失败,后续结果为None
     """
     """
-    if post.type == "video":
-        print(f"      ⊗ 跳过视频帖子: {post.note_id}")
-        return (None, None, None, None, None, None)
-
     # 检查缓存
     # 检查缓存
     if ENABLE_CACHE:
     if ENABLE_CACHE:
         cached_result = _load_from_cache(post.note_id)
         cached_result = _load_from_cache(post.note_id)

+ 1 - 1
post_evaluator_v4_langgraph.py

@@ -45,7 +45,7 @@ FILE_PROCESS_TIMEOUT = 180
 
 
 # 代理配置(用于访问 Google File API)
 # 代理配置(用于访问 Google File API)
 HTTP_PROXY = "http://127.0.0.1:29758"
 HTTP_PROXY = "http://127.0.0.1:29758"
-HTTPS_PROXY = "https://127.0.0.1:29758"
+HTTPS_PROXY = "http://127.0.0.1:29758"
 
 
 # 缓存配置
 # 缓存配置
 ENABLE_CACHE = False
 ENABLE_CACHE = False

+ 1 - 1
script/search_recommendations/xiaohongshu_search_recommendations.py

@@ -96,7 +96,7 @@ class XiaohongshuSearchRecommendations:
 
 
         return None
         return None
 
 
-    def get_recommendations(self, keyword: str, timeout: int = 300, max_retries: int = 10, retry_delay: int = 2, use_cache: bool = True) -> Dict[str, Any]:
+    def get_recommendations(self, keyword: str, timeout: int = 300, max_retries: int = 4, retry_delay: int = 7, use_cache: bool = True) -> Dict[str, Any]:
         """
         """
         获取小红书搜索推荐词
         获取小红书搜索推荐词
 
 

+ 155 - 0
update_functions.py

@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+"""
+更新 4 个评估函数,使用新的 system/user prompt 结构
+"""
+
+import re
+
def read_file(filepath):
    """Return the full text content of *filepath* (UTF-8)."""
    with open(filepath, encoding='utf-8') as fh:
        return fh.read()
+
def write_file(filepath, content):
    """Overwrite *filepath* with *content*, encoded as UTF-8."""
    with open(filepath, mode='w', encoding='utf-8') as fh:
        fh.write(content)
+
+def update_evaluate_is_knowledge(content):
+    """更新 evaluate_is_knowledge 函数"""
+    # 查找并替换
+    old_pattern = r'''try:
+        prompt_text = PROMPT1_IS_KNOWLEDGE\.format\(
+            title=post\.title,
+            body_text=post\.body_text or "",
+            num_images=len\(image_urls\)
+        \)
+
+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
+
+    new_code = '''try:
+        user_prompt = PROMPT1_USER_TEMPLATE.format(
+            title=post.title,
+            body_text=post.body_text or "",
+            num_images=len(image_urls)
+        )
+
+        data = await _call_openrouter_api(
+            system_prompt=PROMPT1_SYSTEM,
+            user_prompt=user_prompt,
+            image_urls=image_urls,
+            semaphore=semaphore
+        )'''
+
+    content = re.sub(old_pattern, new_code, content)
+    print("✅ evaluate_is_knowledge 更新完成")
+    return content
+
+def update_evaluate_is_content_knowledge(content):
+    """更新 evaluate_is_content_knowledge 函数"""
+    old_pattern = r'''try:
+        prompt_text = PROMPT2_IS_CONTENT_KNOWLEDGE\.format\(
+            title=post\.title,
+            body_text=post\.body_text or "",
+            num_images=len\(image_urls\)
+        \)
+
+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
+
+    new_code = '''try:
+        user_prompt = PROMPT2_USER_TEMPLATE.format(
+            title=post.title,
+            body_text=post.body_text or "",
+            num_images=len(image_urls)
+        )
+
+        data = await _call_openrouter_api(
+            system_prompt=PROMPT2_SYSTEM,
+            user_prompt=user_prompt,
+            image_urls=image_urls,
+            semaphore=semaphore
+        )'''
+
+    content = re.sub(old_pattern, new_code, content)
+    print("✅ evaluate_is_content_knowledge 更新完成")
+    return content
+
+def update_evaluate_purpose_match(content):
+    """更新 evaluate_purpose_match 函数"""
+    old_pattern = r'''try:
+        prompt_text = PROMPT3_PURPOSE_MATCH\.format\(
+            original_query=original_query,
+            title=post\.title,
+            body_text=post\.body_text or "",
+            num_images=len\(image_urls\)
+        \)
+
+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
+
+    new_code = '''try:
+        user_prompt = PROMPT3_USER_TEMPLATE.format(
+            original_query=original_query,
+            title=post.title,
+            body_text=post.body_text or "",
+            num_images=len(image_urls)
+        )
+
+        data = await _call_openrouter_api(
+            system_prompt=PROMPT3_SYSTEM,
+            user_prompt=user_prompt,
+            image_urls=image_urls,
+            semaphore=semaphore
+        )'''
+
+    content = re.sub(old_pattern, new_code, content)
+    print("✅ evaluate_purpose_match 更新完成")
+    return content
+
+def update_evaluate_category_match(content):
+    """更新 evaluate_category_match 函数"""
+    old_pattern = r'''try:
+        prompt_text = PROMPT4_CATEGORY_MATCH\.format\(
+            original_query=original_query,
+            title=post\.title,
+            body_text=post\.body_text or "",
+            num_images=len\(image_urls\)
+        \)
+
+        data = await _call_openrouter_api\(prompt_text, image_urls, semaphore\)'''
+
+    new_code = '''try:
+        user_prompt = PROMPT4_USER_TEMPLATE.format(
+            original_query=original_query,
+            title=post.title,
+            body_text=post.body_text or "",
+            num_images=len(image_urls)
+        )
+
+        data = await _call_openrouter_api(
+            system_prompt=PROMPT4_SYSTEM,
+            user_prompt=user_prompt,
+            image_urls=image_urls,
+            semaphore=semaphore
+        )'''
+
+    content = re.sub(old_pattern, new_code, content)
+    print("✅ evaluate_category_match 更新完成")
+    return content
+
def main():
    """Apply all four evaluation-function rewrites to post_evaluator_v3.py."""
    filepath = 'post_evaluator_v3.py'

    print("📖 读取文件...")
    content = read_file(filepath)

    print("\n🔧 更新评估函数...")
    # Run the rewrites in sequence, threading the text through each one.
    for updater in (
        update_evaluate_is_knowledge,
        update_evaluate_is_content_knowledge,
        update_evaluate_purpose_match,
        update_evaluate_category_match,
    ):
        content = updater(content)

    print("\n💾 保存文件...")
    write_file(filepath, content)

    print("\n✅ 所有评估函数更新完成!")

if __name__ == '__main__':
    main()

+ 178 - 0
video_utils.py

@@ -0,0 +1,178 @@
+"""
+视频处理工具模块
+
+提供视频下载、Base64 编码等功能,用于支持视频评估
+"""
+
+import asyncio
+import base64
+import hashlib
+import os
+from pathlib import Path
+from typing import Optional
+import requests
+
+
+# 配置
+VIDEO_CACHE_DIR = Path(".video_cache")
+VIDEO_MAX_SIZE_MB = 50  # 最大视频大小(MB)
+VIDEO_DOWNLOAD_TIMEOUT = 120  # 下载超时(秒)
+MAX_RETRIES = 2  # 最大重试次数
+
+
async def download_video(
    video_url: str,
    cache_dir: Path = VIDEO_CACHE_DIR
) -> Optional[Path]:
    """
    Asynchronously download a video file.

    Args:
        video_url: Video URL.
        cache_dir: Cache directory.

    Returns:
        Path to the cached video file, or None on failure / oversized video.
    """
    cache_dir.mkdir(exist_ok=True)

    # Cache file name derived from the URL hash.
    url_hash = hashlib.md5(video_url.encode()).hexdigest()
    cache_path = cache_dir / f"{url_hash}.mp4"
    temp_path = cache_path.with_suffix('.tmp')

    # Cache hit: reuse the existing file.
    if cache_path.exists():
        file_size_mb = cache_path.stat().st_size / (1024 * 1024)
        print(f"      ♻️  使用缓存视频: {file_size_mb:.2f}MB")
        return cache_path

    loop = asyncio.get_running_loop()
    max_bytes = VIDEO_MAX_SIZE_MB * 1024 * 1024

    for attempt in range(MAX_RETRIES + 1):
        response = None
        try:
            print(f"      📥 下载视频... (尝试 {attempt + 1}/{MAX_RETRIES + 1})")

            # Blocking HTTP work runs in a worker thread.
            response = await loop.run_in_executor(
                None,
                lambda: requests.get(
                    video_url,
                    timeout=VIDEO_DOWNLOAD_TIMEOUT,
                    stream=True,
                    headers={"User-Agent": "Mozilla/5.0"}
                )
            )
            response.raise_for_status()

            # Reject early when the server declares an oversized payload.
            content_length = response.headers.get('content-length')
            if content_length:
                size_mb = int(content_length) / (1024 * 1024)
                if size_mb > VIDEO_MAX_SIZE_MB:
                    print(f"      ⚠️  视频过大: {size_mb:.2f}MB > {VIDEO_MAX_SIZE_MB}MB")
                    return None

            def save_chunks() -> int:
                # Enforce the size cap while streaming so a response without
                # Content-Length cannot be downloaded unbounded before the
                # post-hoc size check (the original downloaded it all first).
                written = 0
                with open(temp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            written += len(chunk)
                            if written > max_bytes:
                                return -1  # sentinel: over the cap
                            f.write(chunk)
                return written

            written = await loop.run_in_executor(None, save_chunks)

            if written < 0:
                actual_size_mb = temp_path.stat().st_size / (1024 * 1024)
                print(f"      ⚠️  视频过大: {actual_size_mb:.2f}MB > {VIDEO_MAX_SIZE_MB}MB")
                temp_path.unlink()
                return None

            # Atomic-ish promotion from temp file to cache entry.
            temp_path.rename(cache_path)
            print(f"      ✅ 视频下载成功: {written / (1024 * 1024):.2f}MB")
            return cache_path

        except Exception as e:
            if attempt < MAX_RETRIES:
                wait_time = 2 * (attempt + 1)
                print(f"      ⚠️  下载失败,{wait_time}秒后重试: {str(e)[:50]}")
                await asyncio.sleep(wait_time)
            else:
                print(f"      ❌ 视频下载失败: {str(e)[:100]}")
                # Remove any partial temp file.
                if temp_path.exists():
                    temp_path.unlink()
                return None
        finally:
            # Always release the streamed HTTP connection (the original
            # never closed it).
            if response is not None:
                response.close()

    return None
+
+
async def encode_video_to_base64(video_path: Path) -> Optional[str]:
    """
    Asynchronously encode a video file as a Base64 data URL.

    Args:
        video_path: Video file path.

    Returns:
        A "data:video/mp4;base64,..." URL, or None on failure.
    """
    try:
        loop = asyncio.get_running_loop()

        def _read_and_encode() -> str:
            # Single worker-thread hop for both the read and the encode:
            # keeps the event loop free and avoids shuttling the raw bytes
            # back to the loop between two executor calls (the original
            # made two hops).
            with open(video_path, 'rb') as f:
                encoded = base64.b64encode(f.read()).decode('utf-8')
            return f"data:video/mp4;base64,{encoded}"

        print(f"      🔄 编码视频为 Base64...")
        data_url = await loop.run_in_executor(None, _read_and_encode)

        encoded_size_mb = len(data_url) / (1024 * 1024)
        print(f"      ✅ Base64 编码完成: {encoded_size_mb:.2f}MB")

        return data_url

    except Exception as e:
        print(f"      ❌ Base64 编码失败: {str(e)[:100]}")
        return None
+
+
def cleanup_video_cache(cache_dir: Path = VIDEO_CACHE_DIR, days: int = 7):
    """
    Remove cached videos older than the given number of days.

    Args:
        cache_dir: Cache directory.
        days: Retention period in days.
    """
    import time

    if not cache_dir.exists():
        return

    # Files last modified before this timestamp are considered stale.
    cutoff = time.time() - days * 86400

    stale_files = [
        entry for entry in cache_dir.glob("*.mp4")
        if entry.stat().st_mtime < cutoff
    ]
    for stale in stale_files:
        stale.unlink()

    if stale_files:
        print(f"🗑️  清理了 {len(stale_files)} 个过期视频缓存")