Pārlūkot izejas kodu

feat: 优化Step4匹配逻辑和增强批处理功能

主要改进:
1. Step4匹配优化:将title+desc合并作为完整A内容参与匹配
   - 之前只用title作为A,desc作为A_Context
   - 现在将两者标识后拼接:"标题:xxx\n\n描述:xxx"
   - 更准确地评估灵感与帖子的匹配度

2. 增加--start参数支持分批处理
   - 默认值为1,支持指定起始位置
   - 配合--count使用,如:--start 11 --count 10
   - 支持与--sort-by-score、--shuffle等参数组合

3. 修复--force参数在search-and-match模式下不生效的问题
   - 在调用step4前删除旧文件
   - 确保force=True时能重新执行匹配

4. match_analyzer添加ModelSettings配置
   - 明确设置temperature=0.0和max_tokens=65536

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
yangxiaohui 2 nedēļas atpakaļ
vecāks
revīzija
25ad27c8e5
3 mainīti faili ar 57 papildinājumiem un 13 dzēšanām
  1. 6 1
      lib/match_analyzer.py
  2. 40 7
      run_inspiration_analysis.py
  3. 11 5
      step4_search_result_match.py

+ 6 - 1
lib/match_analyzer.py

@@ -15,11 +15,12 @@
 """
 import json
 from typing import List
-from agents import Agent, Runner
+from agents import Agent, Runner, ModelSettings
 from agents.tracing.create import custom_span
 from lib.client import get_model
 
 
+
 # ========== System Prompt ==========
 MATCH_SYSTEM_PROMPT = """
 # 任务
@@ -114,6 +115,10 @@ def create_match_agent(model_name: str) -> Agent:
         name="Information Match Expert",
         instructions=MATCH_SYSTEM_PROMPT,
         model=get_model(model_name),
+        model_settings=ModelSettings(
+            temperature=0.0,
+            max_tokens=65536,
+        ),
         tools=[],
     )
 

+ 40 - 7
run_inspiration_analysis.py

@@ -314,6 +314,12 @@ async def run_full_analysis(
             print(f"Step4: 搜索结果与灵感匹配")
             print(f"{'─' * 80}\n")
 
+            # 如果 force=True,删除已有的 step4 文件
+            if force and step4_exists:
+                for f in step4_files:
+                    os.remove(f)
+                    print(f"✓ 已删除旧文件: {f}")
+
             # 临时修改 sys.argv 来传递参数给 step4
             sys.argv = [
                 "step4_search_result_match.py",
@@ -493,6 +499,9 @@ async def main():
   # 搜索并匹配模式:基于已有 Step1 结果,执行搜索和 Step4 匹配
   python run_inspiration_analysis.py --search-and-match --count 10
 
+  # 从第11个开始,处理10个灵感
+  python run_inspiration_analysis.py --search-and-match --start 11 --count 10
+
   # 处理所有灵感,强制重新执行
   python run_inspiration_analysis.py --count all --force
 
@@ -520,6 +529,13 @@ async def main():
         help="处理的灵感数量,可以是数字或 'all' (默认: 1, 当指定 --inspiration 时忽略)"
     )
 
+    parser.add_argument(
+        "--start",
+        type=int,
+        default=1,
+        help="起始位置(从1开始),与 --count 配合使用 (默认: 1)"
+    )
+
     parser.add_argument(
         "--max-tasks",
         type=str,
@@ -572,6 +588,7 @@ async def main():
     enable_step2 = args.enable_step2
     search_only = args.search_only
     search_and_match = args.search_and_match
+    start_index = args.start
 
     # 互斥检查
     if search_only and search_and_match:
@@ -619,11 +636,14 @@ async def main():
         skip_selection = False
         # 确定要处理的灵感数量
         if args.count == "all":
-            inspiration_count = len(inspiration_list)
-            print(f"处理灵感: 全部 ({inspiration_count} 个)")
+            inspiration_count = len(inspiration_list) - (start_index - 1)
+            print(f"处理灵感: 从第 {start_index} 个到最后 (共 {inspiration_count} 个)")
         else:
             inspiration_count = int(args.count)
-            print(f"处理灵感: 前 {inspiration_count} 个")
+            if start_index > 1:
+                print(f"处理灵感: 从第 {start_index} 个开始,共 {inspiration_count} 个")
+            else:
+                print(f"处理灵感: 前 {inspiration_count} 个")
 
     if max_tasks:
         print(f"Step1 任务数限制: {max_tasks}")
@@ -648,18 +668,31 @@ async def main():
 
     # 选择要处理的灵感列表(如果没有指定 --inspiration)
     if not skip_selection:
+        # 验证 start_index
+        if start_index < 1:
+            print(f"❌ 错误: --start 必须 >= 1")
+            sys.exit(1)
+
         if sort_by_score:
             # 根据 Step1 结果分数排序
             sorted_list = sort_inspirations_by_score(persona_dir, inspiration_list, max_tasks)
-            inspirations_to_process = sorted_list[:inspiration_count]
+            # 应用 start 和 count(start 从 1 开始,转换为 0 索引)
+            start_idx = start_index - 1
+            end_idx = start_idx + inspiration_count
+            inspirations_to_process = sorted_list[start_idx:end_idx]
         elif shuffle:
             # 随机打乱灵感列表后选择
             shuffled_list = inspiration_list.copy()
             random.shuffle(shuffled_list)
-            inspirations_to_process = shuffled_list[:inspiration_count]
+            # 应用 start 和 count
+            start_idx = start_index - 1
+            end_idx = start_idx + inspiration_count
+            inspirations_to_process = shuffled_list[start_idx:end_idx]
         else:
-            # 按顺序选择前 N 个
-            inspirations_to_process = inspiration_list[:inspiration_count]
+            # 按顺序选择,应用 start 和 count
+            start_idx = start_index - 1
+            end_idx = start_idx + inspiration_count
+            inspirations_to_process = inspiration_list[start_idx:end_idx]
 
     print(f"\n将处理以下灵感:")
     for i, insp in enumerate(inspirations_to_process, 1):

+ 11 - 5
step4_search_result_match.py

@@ -41,22 +41,28 @@ async def match_single_note(
     desc = note.get("desc", "") or ""
     channel_content_id = note.get("channel_content_id", "") or ""
 
+    # 构建完整的帖子内容,标识标题和描述
+    if desc:
+        post_content = f"标题:{title}\n\n描述:{desc}"
+    else:
+        post_content = f"标题:{title}"
+
     # 调用通用匹配模块
-    # B = 灵感, A = 帖子标题, A_Context = 帖子描述
+    # B = 灵感, A = 帖子完整内容(标题+描述)
     match_result = await match_single(
         b_content=inspiration,
-        a_content=title,
+        a_content=post_content,
         model_name=MODEL_NAME,
-        a_context=desc
+        a_context=""
     )
 
     # 构建完整结果
     full_result = {
         "输入信息": {
             "B": inspiration,
-            "A": title,
+            "A": post_content,
             "B_Context": "",
-            "A_Context": desc
+            "A_Context": ""
         },
         "匹配结果": match_result,
         "业务信息": {