
Persistent cache

刘立冬 committed 2 weeks ago
Commit d4d9e47372
2 files changed, 74 insertions, 3 deletions
  1. knowledge_search_traverse.py  (+73 −2)
  2. post_evaluator_v3.py  (+1 −1)

knowledge_search_traverse.py  (+73 −2)

@@ -3,6 +3,8 @@ import json
 import os
 import sys
 import argparse
+import time
+import hashlib
 from datetime import datetime
 from typing import Literal, Optional
 
@@ -15,6 +17,8 @@ from lib.client import get_model
 MODEL_NAME = "google/gemini-2.5-flash"
 # Score-gain threshold: a sug or combined term must outscore its source query by at least this margin to advance to the next round
 REQUIRED_SCORE_GAIN = 0.02
+SUG_CACHE_TTL = 24 * 3600  # 24 hours
+SUG_CACHE_DIR = os.path.join(os.path.dirname(__file__), "data", "sug_cache")
 from script.search_recommendations.xiaohongshu_search_recommendations import XiaohongshuSearchRecommendations
 from script.search.xiaohongshu_search import XiaohongshuSearch
 from script.search.xiaohongshu_detail import XiaohongshuDetail
@@ -1781,6 +1785,73 @@ scope_category_evaluator = Agent[None](
 # Helper functions added in v121
 # ============================================================================
 
+def _ensure_sug_cache_dir():
+    """确保SUG缓存目录存在"""
+    os.makedirs(SUG_CACHE_DIR, exist_ok=True)
+
+
+def _sug_cache_path(keyword: str) -> str:
+    """根据关键词生成缓存文件路径"""
+    key_hash = hashlib.md5(keyword.encode("utf-8")).hexdigest()
+    return os.path.join(SUG_CACHE_DIR, f"{key_hash}.json")
+
+
+def load_sug_cache(keyword: str) -> Optional[list[str]]:
+    """从持久化缓存中读取SUG结果"""
+    if not keyword:
+        return None
+
+    cache_path = _sug_cache_path(keyword)
+    if not os.path.exists(cache_path):
+        return None
+
+    file_age = time.time() - os.path.getmtime(cache_path)
+    if file_age > SUG_CACHE_TTL:
+        return None
+
+    try:
+        with open(cache_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        suggestions = data.get("suggestions")
+        if isinstance(suggestions, list):
+            return suggestions
+    except Exception as exc:
+        print(f"  ⚠️  读取SUG缓存失败({keyword}): {exc}")
+    return None
+
+
+def save_sug_cache(keyword: str, suggestions: list[str]):
+    """将SUG结果写入持久化缓存"""
+    if not keyword or not isinstance(suggestions, list):
+        return
+
+    _ensure_sug_cache_dir()
+    cache_path = _sug_cache_path(keyword)
+    try:
+        payload = {
+            "keyword": keyword,
+            "suggestions": suggestions,
+            "timestamp": datetime.now().isoformat()
+        }
+        with open(cache_path, "w", encoding="utf-8") as f:
+            json.dump(payload, f, ensure_ascii=False, indent=2)
+    except Exception as exc:
+        print(f"  ⚠️  写入SUG缓存失败({keyword}): {exc}")
+
+
+def get_suggestions_with_cache(keyword: str, api: XiaohongshuSearchRecommendations) -> list[str]:
+    """带持久化缓存的SUG获取"""
+    cached = load_sug_cache(keyword)
+    if cached is not None:
+        print(f"    📦 SUG缓存命中: {keyword} ({len(cached)} 个)")
+        return cached
+
+    suggestions = api.get_recommendations(keyword=keyword)
+    if suggestions:
+        save_sug_cache(keyword, suggestions)
+    return suggestions
+
+
 def get_ordered_subsets(words: list[str], min_len: int = 1) -> list[list[str]]:
     """
     生成words的所有有序子集(可跳过但不可重排)
@@ -2841,7 +2912,7 @@ async def run_round(
     sug_list_list = []  # list of list
     for q in q_list:
         print(f"\n  处理q: {q.text}")
-        suggestions = xiaohongshu_api.get_recommendations(keyword=q.text)
+        suggestions = get_suggestions_with_cache(q.text, xiaohongshu_api)
 
         q_sug_list = []
         if suggestions:
@@ -3530,7 +3601,7 @@ async def run_round_v2(
     sug_details = {}
 
     for q in query_input:
-        suggestions = xiaohongshu_api.get_recommendations(keyword=q.text)
+        suggestions = get_suggestions_with_cache(q.text, xiaohongshu_api)
         if suggestions:
             print(f"  {q.text}: 获取到 {len(suggestions)} 个SUG")
             for sug_text in suggestions:
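
For context on how the new helpers fit together, here is a minimal sketch of the cache round-trip, assuming knowledge_search_traverse is importable and that a stub object exposing get_recommendations(keyword=...) can stand in for the real XiaohongshuSearchRecommendations client; the stub class and the sample keyword are illustrative only, not part of the commit.

# Illustrative sketch, not part of the commit: exercises the persistent SUG cache.
from knowledge_search_traverse import get_suggestions_with_cache

class StubRecommendations:
    """Counts API calls so the cache hit is observable."""
    def __init__(self):
        self.calls = 0

    def get_recommendations(self, keyword: str) -> list[str]:
        self.calls += 1
        return [f"{keyword} ideas", f"{keyword} checklist"]

api = StubRecommendations()
first = get_suggestions_with_cache("camping gear", api)   # miss: calls the API, writes <md5>.json under SUG_CACHE_DIR
second = get_suggestions_with_cache("camping gear", api)  # hit: served from disk, no second API call
assert first == second
# On a fresh cache api.calls == 1; re-running the script hits the cache on the first call too.
# Files older than SUG_CACHE_TTL (24 h, judged by mtime) are treated as misses,
# so load_sug_cache returns None and the next call falls through to the API.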

post_evaluator_v3.py  (+1 −1)

@@ -30,7 +30,7 @@ ENABLE_CACHE = True  # Whether to cache evaluation results
 CACHE_DIR = ".evaluation_cache"  # Cache directory
 
 # Video processing configuration
-MAX_VIDEO_SIZE_MB = 60  # Maximum video size limit (MB)
+MAX_VIDEO_SIZE_MB = 10  # Maximum video size limit (MB)
 VIDEO_DOWNLOAD_TIMEOUT = 60  # Video download timeout (seconds)
 TEMP_VIDEO_DIR = "/tmp/kg_agent_videos"  # Temporary video storage directory (also used as the cache dir)
 VIDEO_CHUNK_SIZE = 8192  # Download chunk size (bytes)
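
The post_evaluator_v3.py change only tightens the size cap; the code that enforces it is not shown in this diff. As a hedged sketch of the pattern such a cap usually implies, assuming a requests-based HEAD probe (the is_video_small_enough helper is hypothetical, not code from the repository):

# Hypothetical guard illustrating how a MAX_VIDEO_SIZE_MB cap is commonly enforced;
# post_evaluator_v3.py may implement this differently.
import requests

MAX_VIDEO_SIZE_MB = 10       # mirrors the new cap above
VIDEO_DOWNLOAD_TIMEOUT = 60  # seconds, mirrors the existing setting

def is_video_small_enough(url: str) -> bool:
    """Return True if the Content-Length header fits within the cap."""
    resp = requests.head(url, allow_redirects=True, timeout=VIDEO_DOWNLOAD_TIMEOUT)
    size = int(resp.headers.get("Content-Length", 0))
    # Servers that omit Content-Length report 0 here and would need a streaming fallback.
    return 0 < size <= MAX_VIDEO_SIZE_MB * 1024 * 1024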