Explorar o código

1. 添加工程环境依赖文件 2. 增加评估缓存

刘立冬 hai 3 semanas
pai
achega
d53fcfd6a9
Modificáronse 4 ficheiros con 106 adicións e 9 borrados
  1. 8 0
      requirements-dev.txt
  2. 9 0
      requirements-optional.txt
  3. 15 0
      requirements.txt
  4. 74 9
      sug_v6_1_2_115.py

+ 8 - 0
requirements-dev.txt

@@ -0,0 +1,8 @@
+# 开发依赖 - Knowledge Agent 项目
+# 用于开发时的类型检查和代码质量工具
+
+# 类型检查桩文件
+types-requests==2.32.4.20250913
+
+# 安装方式:
+# pip3 install -r requirements-dev.txt

+ 9 - 0
requirements-optional.txt

@@ -0,0 +1,9 @@
+# 可选依赖 - Knowledge Agent 项目
+# 用于日志追踪和监控的可选功能
+
+# Logfire 日志追踪系统
+logfire==4.14.2
+logfire-api==4.14.2
+
+# 安装方式:
+# pip3 install -r requirements-optional.txt

+ 15 - 0
requirements.txt

@@ -0,0 +1,15 @@
+# 核心依赖 - Knowledge Agent 项目
+# 用于运行主要功能所需的必需依赖
+
+# OpenAI Agents 框架
+openai-agents==0.4.2
+
+# OpenAI API 客户端
+openai==2.6.1
+
+# 数据验证和设置管理
+pydantic==2.12.3
+pydantic-settings==2.11.0
+
+# HTTP 请求库
+requests==2.32.5

+ 74 - 9
sug_v6_1_2_115.py

@@ -98,6 +98,10 @@ class RunContext(BaseModel):
     # 最终结果
     final_output: str | None = None
 
+    # 评估缓存:避免重复评估相同文本
+    evaluation_cache: dict[str, tuple[float, str]] = Field(default_factory=dict)
+    # key: 文本, value: (score, reason)
+
 
 # ============================================================================
 # Agent 定义
@@ -441,6 +445,13 @@ word_selection_instructions = """
   * combined_query: 组合后的新query(只包含seed和word的原始文本)
   * reasoning: 选择理由(说明为什么选这个词)
 - overall_reasoning: 整体选择思路(说明这5个词的选择逻辑)
+
+## JSON输出规范
+1. **格式要求**:必须输出标准的、完整的JSON格式
+2. **字符限制**:不要在JSON中使用任何不可见的特殊字符或控制字符
+3. **引号规范**:字符串中如需表达引用或强调,使用书名号《》或单书名号「」,不要使用英文引号或中文引号""
+4. **编码规范**:所有文本使用UTF-8编码,不要包含二进制或转义序列
+5. **完整性**:确保JSON的开始和结束括号完整匹配,所有字段都正确闭合
 """.strip()
 
 word_selector = Agent[None](
@@ -492,6 +503,13 @@ def calculate_final_score(motivation_score: float, category_score: float) -> flo
     return base_score
 
 
+def clean_json_string(text: str) -> str:
+    """Strip illegal control characters from JSON text (tab, LF and CR are kept)."""
+    import re
+    # Remove every control character in 0x00-0x1F except tab (0x09), LF (0x0A) and CR (0x0D).
+    return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', text)
+
+
 def process_note_data(note: dict) -> Post:
     """处理搜索接口返回的帖子数据"""
     note_card = note.get("note_card", {})
@@ -555,7 +573,7 @@ def process_note_data(note: dict) -> Post:
     )
 
 
-async def evaluate_with_o(text: str, o: str) -> tuple[float, str]:
+async def evaluate_with_o(text: str, o: str, cache: dict[str, tuple[float, str]] | None = None) -> tuple[float, str]:
     """评估文本与原始问题o的相关度
 
     采用两阶段评估 + 代码计算规则:
@@ -563,9 +581,20 @@ async def evaluate_with_o(text: str, o: str) -> tuple[float, str]:
     2. 品类维度评估(权重30%)
     3. 应用规则A/B/C调整得分
 
+    Args:
+        text: 待评估的文本
+        o: 原始问题
+        cache: 评估缓存(可选),用于避免重复评估
+
     Returns:
         tuple[float, str]: (最终相关度分数, 综合评估理由)
     """
+    # 检查缓存
+    if cache is not None and text in cache:
+        cached_score, cached_reason = cache[text]
+        print(f"  ⚡ 缓存命中: {text} -> {cached_score:.2f}")
+        return cached_score, cached_reason
+
     # 准备输入
     eval_input = f"""
 <原始问题>
@@ -630,6 +659,10 @@ async def evaluate_with_o(text: str, o: str) -> tuple[float, str]:
                 elif motivation_score <= 0.2:
                     combined_reason += "(应用规则B:动机低分限制机制)"
 
+            # 存入缓存
+            if cache is not None:
+                cache[text] = (final_score, combined_reason)
+
             return final_score, combined_reason
 
         except Exception as e:
@@ -684,7 +717,7 @@ async def initialize(o: str, context: RunContext) -> tuple[list[Seg], list[Word]
 
     async def evaluate_seg(seg: Seg) -> Seg:
         async with seg_semaphore:
-            seg.score_with_o, seg.reason = await evaluate_with_o(seg.text, o)
+            seg.score_with_o, seg.reason = await evaluate_with_o(seg.text, o, context.evaluation_cache)
             return seg
 
     if seg_list:
@@ -813,7 +846,7 @@ async def run_round(
 
     async def evaluate_sug(sug: Sug) -> Sug:
         async with semaphore:  # 限制并发数
-            sug.score_with_o, sug.reason = await evaluate_with_o(sug.text, o)
+            sug.score_with_o, sug.reason = await evaluate_with_o(sug.text, o, context.evaluation_cache)
             return sug
 
     if all_sugs:
@@ -887,6 +920,7 @@ async def run_round(
     # 4. 构建q_list_next
     print(f"\n[步骤4] 构建q_list_next...")
     q_list_next = []
+    existing_q_texts = set()  # 用于去重
     add_word_details = {}  # 保存每个seed对应的组合词列表
     all_seed_combinations = []  # 保存本轮所有seed的组合词(用于后续构建seed_list_next)
 
@@ -912,7 +946,7 @@ async def run_round(
 
         print(f"      候选词数量: {len(candidate_words)}")
 
-        # 调用Agent一次性选择并组合Top 5
+        # 调用Agent一次性选择并组合Top 5(添加重试机制)
         candidate_words_text = ', '.join([w.text for w in candidate_words])
         selection_input = f"""
 <原始问题>
@@ -929,15 +963,34 @@ async def run_round(
 
 请从候选词列表中选择最多5个最合适的词,分别与当前seed组合成新的query。
 """
-        result = await Runner.run(word_selector, selection_input)
-        selection_result: WordSelectionTop5 = result.final_output
+
+        # 重试机制
+        max_retries = 2
+        selection_result = None
+        for attempt in range(max_retries):
+            try:
+                result = await Runner.run(word_selector, selection_input)
+                selection_result = result.final_output
+                break  # 成功则跳出
+            except Exception as e:
+                error_msg = str(e)
+                if attempt < max_retries - 1:
+                    print(f"      ⚠️  选词失败 (尝试 {attempt+1}/{max_retries}): {error_msg[:100]}")
+                    await asyncio.sleep(1)
+                else:
+                    print(f"      ❌ 选词失败,跳过该seed: {error_msg[:100]}")
+                    break
+
+        if selection_result is None:
+            print(f"      跳过seed: {seed.text}")
+            continue
 
         print(f"      Agent选择了 {len(selection_result.combinations)} 个组合")
         print(f"      整体选择思路: {selection_result.overall_reasoning}")
 
         # 并发评估所有组合的相关度
         async def evaluate_combination(comb: WordCombination) -> dict:
-            score, reason = await evaluate_with_o(comb.combined_query, o)
+            score, reason = await evaluate_with_o(comb.combined_query, o, context.evaluation_cache)
             return {
                 'word': comb.selected_word,
                 'query': comb.combined_query,
@@ -951,8 +1004,13 @@ async def run_round(
 
         print(f"      评估完成,得到 {len(top_5)} 个组合")
 
-        # 将Top 5全部加入q_list_next
+        # 将Top 5全部加入q_list_next(去重检查)
         for comb in top_5:
+            # 去重检查
+            if comb['query'] in existing_q_texts:
+                print(f"        ⊗ 跳过重复: {comb['query']}")
+                continue
+
             print(f"        ✓ {comb['query']} (分数: {comb['score']:.2f})")
 
             new_q = Q(
@@ -962,6 +1020,7 @@ async def run_round(
                 from_source="add"
             )
             q_list_next.append(new_q)
+            existing_q_texts.add(comb['query'])  # 记录到去重集合
 
             # 记录已添加的词
             seed.added_words.append(comb['word'])
@@ -980,10 +1039,15 @@ async def run_round(
         # 保存到all_seed_combinations(用于构建seed_list_next)
         all_seed_combinations.extend(top_5)
 
-    # 4.2 对于sug_list_list中,每个sug大于来自的query分数,加到q_list_next
+    # 4.2 对于sug_list_list中,每个sug大于来自的query分数,加到q_list_next(去重检查)
     print(f"\n  4.2 将高分sug加入q_list_next...")
     for sug in all_sugs:
         if sug.from_q and sug.score_with_o > sug.from_q.score_with_o:
+            # 去重检查
+            if sug.text in existing_q_texts:
+                print(f"    ⊗ 跳过重复: {sug.text}")
+                continue
+
             new_q = Q(
                 text=sug.text,
                 score_with_o=sug.score_with_o,
@@ -991,6 +1055,7 @@ async def run_round(
                 from_source="sug"
             )
             q_list_next.append(new_q)
+            existing_q_texts.add(sug.text)  # 记录到去重集合
             print(f"    ✓ {sug.text} (分数: {sug.score_with_o:.2f} > {sug.from_q.score_with_o:.2f})")
 
     # 5. 构建seed_list_next(关键修改:不保留上一轮的seed)