Browse Source

feat: 添加Step4搜索结果匹配评估功能

新增功能:
- 创建 step4_search_result_match.py 评估搜索结果与灵感的匹配度
- 使用帖子标题作为匹配要素,描述作为上下文
- 并发处理,按匹配分数降序排序
- 输出保存到 search/ 目录
- 完整的 LangSmith trace 支持
- 统计高/中/低匹配度分布

技术实现:
- 调用 lib.match_analyzer.match_single() 进行匹配评估
- 使用 process_tasks_with_semaphore 实现并发处理
- 自动读取最新的搜索结果文件
- 保存 channel_content_id 用于后续数据关联

输出字段:
- 业务信息: 包含 channel_content_id, title, likes, user_nickname
- 匹配结果: 包含 score, reason, tags 等评估信息
- 元数据: 包含 current_time, log_url, model 等

使用示例:
  python step4_search_result_match.py data/阿里多多酱/out/人设_1110 内容植入品牌推广

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
yangxiaohui cách đây 2 tuần
mục cha
commit
ec767b66a8
1 tập tin đã thay đổi với 256 bổ sung và 0 xóa
  1. 256 0
      step4_search_result_match.py

+ 256 - 0
step4_search_result_match.py

@@ -0,0 +1,256 @@
+"""
+搜索结果与灵感匹配分析
+
+评估搜索到的帖子与当前灵感的匹配度
+- 帖子标题(title)作为匹配要素
+- 帖子描述(desc)作为上下文
+"""
+import asyncio
+import json
+import os
+import sys
+from typing import List, Dict, Optional
+from pathlib import Path
+
+from agents import trace
+from lib.my_trace import set_trace_smith as set_trace
+from lib.async_utils import process_tasks_with_semaphore
+from lib.match_analyzer import match_single
+from lib.data_loader import load_inspiration_list, select_inspiration
+
+# 模型配置
+MODEL_NAME = "google/gemini-2.5-pro"
+
+
async def match_single_note(
    inspiration: str,
    note: dict,
    _index: int
) -> dict:
    """Evaluate how well one search-result post matches the inspiration.

    Args:
        inspiration: Inspiration text (the "B" side of the match).
        note: Post data; expected keys include title, desc,
            channel_content_id, like_count, channel_account_name.
        _index: Task index supplied by async_utils (unused here).

    Returns:
        Dict with input info ("输入信息"), the raw match result
        ("匹配结果"), and business metadata ("业务信息").
    """
    title = note.get("title", "")
    desc = note.get("desc", "")

    # Delegate scoring to the shared matcher:
    # B = inspiration, A = post title, A_Context = post description.
    match_result = await match_single(
        b_content=inspiration,
        a_content=title,
        model_name=MODEL_NAME,
        a_context=desc
    )

    # Assemble the full record around the matcher's verdict.
    return {
        "输入信息": {
            "B": inspiration,
            "A": title,
            "B_Context": "",
            "A_Context": desc
        },
        "匹配结果": match_result,
        "业务信息": {
            "灵感": inspiration,
            "channel_content_id": note.get("channel_content_id", ""),
            "title": title,
            "likes": note.get("like_count", 0),
            "user_nickname": note.get("channel_account_name", "")
        }
    }
+
+
def find_search_result_file(persona_dir: str, inspiration: str, max_tasks: Optional[int] = None) -> Optional[str]:
    """Locate the newest search-result JSON file for an inspiration.

    Args:
        persona_dir: Persona directory root.
        inspiration: Inspiration name (used as a sub-directory name).
        max_tasks: Task-count limit; when given, selects files with the
            "top{N}" prefix, otherwise the "all" prefix.

    Returns:
        Path to the most recently modified matching file, or None when
        the search directory does not exist or no file matches.
    """
    search_dir = Path(persona_dir) / "how" / "灵感点" / inspiration / "search"

    if not search_dir.exists():
        return None

    # Prefix mirrors the one used when the search step wrote its output.
    scope_prefix = f"top{max_tasks}" if max_tasks is not None else "all"
    candidates = list(search_dir.glob(f"{scope_prefix}_search_*.json"))
    if not candidates:
        return None

    # Newest by modification time; max() avoids sorting the whole list.
    return str(max(candidates, key=lambda p: p.stat().st_mtime))
+
+
async def main(current_time: Optional[str] = None, log_url: Optional[str] = None, force: bool = False):
    """Entry point: match search-result posts against a chosen inspiration.

    Reads the newest search-result file, scores each post concurrently via
    match_single_note, sorts by score, and writes a JSON report to the
    search/ directory.

    Args:
        current_time: Timestamp recorded in the output metadata.
        log_url: Trace/log URL recorded in the output metadata.
        force: Re-run and overwrite even when the output file already exists.
    """
    # Parse command-line arguments
    if len(sys.argv) < 3:
        print("用法: python step4_search_result_match.py <persona_dir> <inspiration> [max_tasks]")
        print("\n示例:")
        print("  python step4_search_result_match.py data/阿里多多酱/out/人设_1110 内容植入品牌推广")
        print("  python step4_search_result_match.py data/阿里多多酱/out/人设_1110 0 20")
        sys.exit(1)

    persona_dir = sys.argv[1]
    inspiration_arg = sys.argv[2]
    # Optional third arg limits task count; "all" (or absence) means no limit.
    max_tasks = int(sys.argv[3]) if len(sys.argv) > 3 and sys.argv[3] != "all" else None

    # Load the persona's inspiration list
    inspiration_list = load_inspiration_list(persona_dir)

    # Resolve the argument (name or index) to a concrete inspiration
    inspiration = select_inspiration(inspiration_arg, inspiration_list)

    print(f"{'=' * 80}")
    print(f"Step4: 搜索结果与灵感匹配分析")
    print(f"{'=' * 80}")
    print(f"人设目录: {persona_dir}")
    print(f"灵感: {inspiration}")
    print(f"模型: {MODEL_NAME}")
    print()

    # Locate the newest search-result file produced by the search step
    search_file = find_search_result_file(persona_dir, inspiration, max_tasks)
    if not search_file:
        print(f"❌ 错误: 找不到搜索结果文件")
        print(f"请先运行搜索步骤: python run_inspiration_analysis.py --search-only --count 1")
        sys.exit(1)

    print(f"搜索结果文件: {search_file}\n")

    # Load the search results
    with open(search_file, 'r', encoding='utf-8') as f:
        search_data = json.load(f)

    notes = search_data.get("notes", [])
    search_keyword = search_data.get("search_params", {}).get("keyword", "")

    if not notes:
        print(f"⚠️  警告: 搜索结果为空")
        sys.exit(0)

    print(f"搜索关键词: {search_keyword}")
    print(f"搜索结果数: {len(notes)}")
    print()

    # Skip work if the output already exists (unless force=True).
    # Output lands in the same search/ directory as the input file.
    output_dir = os.path.join(persona_dir, "how", "灵感点", inspiration, "search")
    os.makedirs(output_dir, exist_ok=True)

    # NOTE: prefix logic mirrors find_search_result_file so input/output pair up.
    scope_prefix = f"top{max_tasks}" if max_tasks is not None else "all"
    model_short = MODEL_NAME.replace("google/", "").replace("/", "_")
    output_file = os.path.join(output_dir, f"{scope_prefix}_step4_搜索结果匹配_{model_short}.json")

    if os.path.exists(output_file) and not force:
        print(f"✓ 输出文件已存在: {output_file}")
        print(f"使用 force=True 可强制重新执行")
        return

    # Run the match analysis
    print(f"{'─' * 80}")
    print(f"开始匹配分析...")
    print(f"{'─' * 80}\n")

    # One match task per post
    tasks = [
        {"inspiration": inspiration, "note": note}
        for note in notes
    ]

    # Execute match tasks concurrently, bounded by a semaphore
    results = await process_tasks_with_semaphore(
        tasks=tasks,
        process_func=lambda task, idx: match_single_note(
            inspiration=task["inspiration"],
            note=task["note"],
            _index=idx
        ),
        max_concurrent=10,
        show_progress=True
    )

    # Sort by match score, best first (missing score treated as 0)
    results_sorted = sorted(
        results,
        key=lambda x: x.get("匹配结果", {}).get("score", 0),
        reverse=True
    )

    print(f"\n{'─' * 80}")
    print(f"匹配完成")
    print(f"{'─' * 80}\n")

    # Show the top-5 results
    print("Top 5 匹配结果:")
    for i, result in enumerate(results_sorted[:5], 1):
        score = result.get("匹配结果", {}).get("score", 0)
        title = result.get("业务信息", {}).get("title", "")
        channel_content_id = result.get("业务信息", {}).get("channel_content_id", "")
        print(f"  {i}. [score={score:.2f}] {title[:50]}... (ID: {channel_content_id})")
    print()

    # Persist results
    output_data = {
        "元数据": {
            "current_time": current_time,
            "log_url": log_url,
            "model": MODEL_NAME,
            "step": "step4_搜索结果匹配"
        },
        "输入信息": {
            "灵感": inspiration,
            "搜索关键词": search_keyword,
            "搜索结果数": len(notes),
            "搜索结果文件": search_file
        },
        "匹配结果列表": results_sorted
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"✓ 结果已保存: {output_file}")
    print()

    # Score-distribution statistics (high / medium / low buckets)
    high_score_count = sum(1 for r in results_sorted if r.get("匹配结果", {}).get("score", 0) >= 0.7)
    medium_score_count = sum(1 for r in results_sorted if 0.4 <= r.get("匹配结果", {}).get("score", 0) < 0.7)
    low_score_count = sum(1 for r in results_sorted if r.get("匹配结果", {}).get("score", 0) < 0.4)

    print(f"匹配统计:")
    print(f"  高匹配 (≥0.7): {high_score_count} 个")
    print(f"  中匹配 (0.4-0.7): {medium_score_count} 个")
    print(f"  低匹配 (<0.4): {low_score_count} 个")
+
+
if __name__ == "__main__":
    # Set up LangSmith tracing; returns the run timestamp and trace log URL
    current_time, log_url = set_trace()

    # Run the whole pipeline inside a named trace span
    with trace("Step4: 搜索结果匹配"):
        asyncio.run(main(current_time, log_url))