2 hafta önce · dd33bd5b57
--- a/extract_inspirations.py
+++ b/extract_inspirations.py
@@ -0,0 +1,143 @@
 
				+"""
			
 
				+从 what 解构结果中提取灵感点列表
			
 
				+
			
 
				+读取指定文件夹中的所有 JSON 文件，提取灵感点，保存到同级目录
			
 
				+"""
			
 
				+import json
			
 
				+import os
			
 
				+from pathlib import Path
			
 
				+from typing import List
			
 
				+from lib.utils import read_json
			
 
				+
			
 
				+
			
 
				+def extract_inspirations_from_file(file_path: str) -> List[dict]:
			
 
				+    """从单个 what 解构文件中提取所有灵感点
			
 
				+
			
 
				+    Args:
			
 
				+        file_path: JSON 文件路径
			
 
				+
			
 
				+    Returns:
			
 
				+        灵感点列表，每个元素包含 灵感点 和 meta 字段
			
 
				+    """
			
 
				+    # 从文件名提取 note_id（第一个下划线之前的部分）
			
 
				+    filename = os.path.basename(file_path)
			
 
				+    note_id = filename.split('_')[0]
			
 
				+
			
 
				+    try:
			
 
				+        data = read_json(file_path)
			
 
				+    except Exception as e:
			
 
				+        print(f"⚠️  读取文件失败: {file_path} - {e}")
			
 
				+        return []
			
 
				+
			
 
				+    inspirations = []
			
 
				+
			
 
				+    # 提取灵感点
			
 
				+    san_dian = data.get("三点解构", {})
			
 
				+    ling_gan_dian = san_dian.get("灵感点", {})
			
 
				+
			
 
				+    # 三个类别：全新内容、共性差异、共性内容
			
 
				+    for category in ["全新内容", "共性差异", "共性内容"]:
			
 
				+        items = ling_gan_dian.get(category, [])
			
 
				+        for item in items:
			
 
				+            inspiration_text = item.get("灵感点", "")
			
 
				+            if inspiration_text:
			
 
				+                # 构建 meta 字段：原有字段 + note_id + category + what文件路径，但排除"灵感点"字段
			
 
				+                meta = {k: v for k, v in item.items() if k != "灵感点"}
			
 
				+                meta["note_id"] = note_id
			
 
				+                meta["category"] = category
			
 
				+                meta["what_file"] = file_path
			
 
				+
			
 
				+                inspirations.append({
			
 
				+                    "灵感点": inspiration_text,
			
 
				+                    "meta": meta
			
 
				+                })
			
 
				+
			
 
				+    return inspirations
			
 
				+
			
 
				+
			
 
				+def extract_inspirations_from_folder(folder_path: str) -> List[dict]:
			
 
				+    """从文件夹中提取所有灵感点
			
 
				+
			
 
				+    Args:
			
 
				+        folder_path: what 解构结果文件夹路径
			
 
				+
			
 
				+    Returns:
			
 
				+        灵感点列表（保留所有，不去重）
			
 
				+    """
			
 
				+    folder = Path(folder_path)
			
 
				+    if not folder.exists():
			
 
				+        raise FileNotFoundError(f"文件夹不存在: {folder_path}")
			
 
				+
			
 
				+    # 收集所有 JSON 文件
			
 
				+    json_files = sorted(list(folder.glob("*.json")))
			
 
				+    print(f"\n找到 {len(json_files)} 个 JSON 文件")
			
 
				+
			
 
				+    # 提取所有灵感点
			
 
				+    all_inspirations = []
			
 
				+    for json_file in json_files:
			
 
				+        inspirations = extract_inspirations_from_file(str(json_file))
			
 
				+        all_inspirations.extend(inspirations)
			
 
				+        if inspirations:
			
 
				+            print(f"  ✓ {json_file.name}: {len(inspirations)} 个灵感点")
			
 
				+
			
 
				+    print(f"\n总计提取: {len(all_inspirations)} 个灵感点")
			
 
				+
			
 
				+    return all_inspirations
			
 
				+
			
 
				+
			
 
				+def save_inspirations(inspirations: List[dict], output_dir: str):
			
 
				+    """保存灵感点列表
			
 
				+
			
 
				+    Args:
			
 
				+        inspirations: 灵感点列表（包含 灵感点 和 meta 字段）
			
 
				+        output_dir: 输出目录
			
 
				+    """
			
 
				+    output_file = os.path.join(output_dir, "灵感点.json")
			
 
				+    with open(output_file, 'w', encoding='utf-8') as f:
			
 
				+        json.dump(inspirations, f, ensure_ascii=False, indent=2)
			
 
				+    print(f"\n✓ 灵感点列表已保存到: {output_file}")
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    import sys
			
 
				+
			
 
				+    # 命令行参数：what 解构结果文件夹路径
			
 
				+    if len(sys.argv) > 1:
			
 
				+        what_folder = sys.argv[1]
			
 
				+    else:
			
 
				+        what_folder = "data/阿里多多酱/out/人设_v2/what解构结果"
			
 
				+
			
 
				+    print(f"{'=' * 80}")
			
 
				+    print(f"从 what 解构结果中提取灵感点")
			
 
				+    print(f"{'=' * 80}")
			
 
				+    print(f"输入文件夹: {what_folder}")
			
 
				+
			
 
				+    try:
			
 
				+        # 提取灵感点
			
 
				+        inspirations = extract_inspirations_from_folder(what_folder)
			
 
				+
			
 
				+        # 确定输出目录（输入文件夹的父目录，即同级目录）
			
 
				+        what_folder_path = Path(what_folder)
			
 
				+        output_dir = what_folder_path.parent  # data/阿里多多酱/out/人设_v2
			
 
				+
			
 
				+        # 保存结果
			
 
				+        save_inspirations(inspirations, str(output_dir))
			
 
				+
			
 
				+        # 显示前10个灵感点
			
 
				+        print(f"\n{'=' * 80}")
			
 
				+        print(f"灵感点预览（前10个）:")
			
 
				+        print(f"{'=' * 80}")
			
 
				+        for i, item in enumerate(inspirations[:10], 1):
			
 
				+            print(f"{i}. {item['灵感点']}")
			
 
				+        if len(inspirations) > 10:
			
 
				+            print(f"... 还有 {len(inspirations) - 10} 个")
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        print(f"\n❌ 错误: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
--- a/how_decode_v1.py
+++ b/how_decode_v1.py
@@ -0,0 +1,240 @@
 
				+import asyncio
			
 
				+import json
			
 
				+
			
 
				+from agents import Agent, Runner
			
 
				+from lib.my_trace import set_trace_smith as set_trace
			
 
				+from lib.utils import read_json
			
 
				+from agents import Agent, Runner, trace
			
 
				+
			
 
				+from lib.client import get_model
			
 
				+MODEL_NAME = "google/gemini-2.5-flash"
			
 
				+
			
 
				+# 系统提示词：定义角色、能力、输出要求
			
 
				+SYSTEM_PROMPT = """
			
 
				+你是一个小红书帖子创作方法论分析专家，擅长分析创作过程中的"手段"和"方法"。
			
 
				+
			
 
				+你的能力：
			
 
				+- 分析灵感点的获取途径和具体手段
			
 
				+- 分析目的点的确定方法和决策逻辑
			
 
				+- 分析关键点的形成过程和推导方法
			
 
				+- 分析选题的整合逻辑和形成路径
			
 
				+
			
 
				+分析要求：
			
 
				+1. 基于已有的 what 解构结果（灵感点、目的点、关键点）
			
 
				+2. 深入分析"HOW"——创作者是通过什么手段、方法获得这些点的
			
 
				+3. 分析各个点之间的关联逻辑和推导过程
			
 
				+4. 总结可复用的创作方法论
			
 
				+
			
 
				+输出格式：
			
 
				+- 结构化的 JSON 格式
			
 
				+- 包含具体的手段、方法、步骤
			
 
				+- 包含推导逻辑和决策原理
			
 
				+""".strip()
			
 
				+
			
 
				+instructions = SYSTEM_PROMPT
			
 
				+
			
 
				+agent = Agent(
			
 
				+    name="How Deconstruction Expert",
			
 
				+    instructions=instructions,
			
 
				+    model=get_model(MODEL_NAME),
			
 
				+    tools=[],
			
 
				+)
			
 
				+
			
 
				+
			
 
				+def build_how_analysis_message(post_data, what_result):
			
 
				+    """
			
 
				+    构造 how 分析的提示词
			
 
				+
			
 
				+    Args:
			
 
				+        post_data: 原始帖子数据
			
 
				+        what_result: what 解构结果
			
 
				+
			
 
				+    Returns:
			
 
				+        多模态消息列表
			
 
				+    """
			
 
				+    images = post_data.get('images', [])
			
 
				+
			
 
				+    # 提取 what 结果的关键信息
			
 
				+    inspiration_points = what_result.get('三点解构', {}).get('灵感点', {}).get('points', [])
			
 
				+    purpose = what_result.get('三点解构', {}).get('目的点', {})
			
 
				+    key_points = what_result.get('三点解构', {}).get('关键点', {}).get('key_points', [])
			
 
				+
			
 
				+    # 构造灵感点列表
			
 
				+    inspiration_list = []
			
 
				+    for idx, insp in enumerate(inspiration_points, 1):
			
 
				+        inspiration_list.append(f"{idx}. {insp.get('灵感点', '')}\n   描述：{insp.get('描述', '')}")
			
 
				+
			
 
				+    # 构造关键点列表
			
 
				+    key_point_list = []
			
 
				+    for idx, kp in enumerate(key_points, 1):
			
 
				+        key_point_list.append(f"{idx}. {kp.get('关键点', '')} ({kp.get('类型', '')})")
			
 
				+
			
 
				+    content = []
			
 
				+
			
 
				+    # 添加说明
			
 
				+    content.append({
			
 
				+        "type": "input_text",
			
 
				+        "text": "# HOW 业务解构任务\n\n你需要分析创作者是通过什么手段和方法完成创作的。"
			
 
				+    })
			
 
				+
			
 
				+    # 添加原始帖子图片
			
 
				+    content.append({
			
 
				+        "type": "input_text",
			
 
				+        "text": f"\n## 原始帖子（共{len(images)}张图片）"
			
 
				+    })
			
 
				+
			
 
				+    for img_url in images:
			
 
				+        content.append({
			
 
				+            "type": "input_image",
			
 
				+            "detail": "auto",
			
 
				+            "image_url": img_url
			
 
				+        })
			
 
				+
			
 
				+    # 添加帖子文本
			
 
				+    content.append({
			
 
				+        "type": "input_text",
			
 
				+        "text": f"""
			
 
				+## 帖子基本信息
			
 
				+标题：{post_data.get('title', '')}
			
 
				+正文：{post_data.get('body_text', '')}
			
 
				+"""
			
 
				+    })
			
 
				+
			
 
				+    # 添加 what 解构结果
			
 
				+    content.append({
			
 
				+        "type": "input_text",
			
 
				+        "text": f"""
			
 
				+## WHAT 解构结果
			
 
				+
			
 
				+### 灵感点列表
			
 
				+{chr(10).join(inspiration_list)}
			
 
				+
			
 
				+### 目的点
			
 
				+{purpose.get('main_purpose', '')}
			
 
				+描述：{purpose.get('description', '')}
			
 
				+
			
 
				+### 关键点列表
			
 
				+{chr(10).join(key_point_list)}
			
 
				+"""
			
 
				+    })
			
 
				+
			
 
				+    # 添加 how 分析任务
			
 
				+    content.append({
			
 
				+        "type": "input_text",
			
 
				+        "text": """
			
 
				+## 你的任务
			
 
				+
			
 
				+请基于以上信息，深入分析创作过程中的"手段"和"方法"，输出以下内容：
			
 
				+
			
 
				+### 1. 灵感点获取分析
			
 
				+对每个灵感点，分析：
			
 
				+- **来源途径**：这个灵感是从哪里来的？（生活观察、素材积累、热点追踪等）
			
 
				+- **获取手段**：创作者用了什么具体方法获得这个灵感？
			
 
				+  - 主要方法（如：日常行为模式观察、表情包库浏览等）
			
 
				+  - 具体手段（列举3-4个具体步骤或操作）
			
 
				+  - 触发点（什么时刻产生的灵感）
			
 
				+
			
 
				+### 2. 目的点确定分析
			
 
				+分析创作者如何确定这个目的：
			
 
				+- **来源途径**：这个目的是基于什么确定的？
			
 
				+- **确定手段**：
			
 
				+  - 主要方法（如：内容价值评估、受众分析等）
			
 
				+  - 具体手段（列举3-5个分析步骤）
			
 
				+  - 决策逻辑（如何从灵感点推导到目的点）
			
 
				+
			
 
				+### 3. 关键点形成分析
			
 
				+分析灵感点和目的点如何推导出关键点：
			
 
				+- **形成逻辑**：整体的推导思路
			
 
				+- **推导方法**：
			
 
				+  - 方法论名称（如：目标分解法、手段枚举法）
			
 
				+  - 具体过程（对每个关键点，说明如何从灵感点/目的点推导出来）
			
 
				+- **关键点分类**：按内容层、形式层、传播层分类
			
 
				+
			
 
				+### 4. 选题形成分析
			
 
				+分析三点如何整合成选题：
			
 
				+- **形成路径**：灵感点 → 目的点 → 关键点 → 选题
			
 
				+- **整合方法**：
			
 
				+  - 方法论（如：要素提取、逻辑串联、主题凝练）
			
 
				+  - 具体步骤（5个左右的步骤，每步说明手段、操作、结果）
			
 
				+- **整合原理**：说明三点在选题中的作用和验证标准
			
 
				+
			
 
				+### 5. 创作流程总结
			
 
				+- 完整流程（从灵感获取到选题确定的全流程）
			
 
				+- 核心方法论（每个阶段的关键方法）
			
 
				+- 关键能力要求（需要具备什么能力）
			
 
				+
			
 
				+请以 JSON 格式输出，结构清晰，包含所有分析维度。
			
 
				+"""
			
 
				+    })
			
 
				+
			
 
				+    return [{
			
 
				+        "role": "user",
			
 
				+        "content": content
			
 
				+    }]
			
 
				+
			
 
				+
			
 
				+async def main():
			
 
				+    # 读取帖子数据
			
 
				+    example_id = '689bf685000000001d0021d3'
			
 
				+    in_dir = f'examples/{example_id}/输入'
			
 
				+    out_dir = f'examples/{example_id}/输出'
			
 
				+
			
 
				+    target_note_file = f'{in_dir}/待解构帖子.json'
			
 
				+    what_result_file = f'{in_dir}/what_解构结果.json'
			
 
				+
			
 
				+    target_note = read_json(target_note_file)
			
 
				+    what_result = read_json(what_result_file)
			
 
				+
			
 
				+    print("=" * 80)
			
 
				+    print("开始 HOW 业务解构")
			
 
				+    print("=" * 80)
			
 
				+
			
 
				+    # 构造 how 分析消息
			
 
				+    multimodal_messages = build_how_analysis_message(
			
 
				+        post_data=target_note,
			
 
				+        what_result=what_result
			
 
				+    )
			
 
				+
			
 
				+    # 运行 agent
			
 
				+    print("\n正在分析创作手段和方法...")
			
 
				+    result = await Runner.run(agent, input=multimodal_messages)
			
 
				+
			
 
				+    print("\n" + "=" * 80)
			
 
				+    print("HOW 解构结果")
			
 
				+    print("=" * 80)
			
 
				+    print(result.final_output)
			
 
				+
			
 
				+    # 尝试解析并保存结果
			
 
				+    try:
			
 
				+        # 提取 JSON 内容（如果包含在 markdown 代码块中）
			
 
				+        output_text = result.final_output
			
 
				+        if "```json" in output_text:
			
 
				+            json_start = output_text.index("```json") + 7
			
 
				+            json_end = output_text.index("```", json_start)
			
 
				+            json_text = output_text[json_start:json_end].strip()
			
 
				+        elif "```" in output_text:
			
 
				+            json_start = output_text.index("```") + 3
			
 
				+            json_end = output_text.index("```", json_start)
			
 
				+            json_text = output_text[json_start:json_end].strip()
			
 
				+        else:
			
 
				+            json_text = output_text
			
 
				+
			
 
				+        # 解析 JSON
			
 
				+        how_result = json.loads(json_text)
			
 
				+
			
 
				+        # 保存结果
			
 
				+        output_file = f'{out_dir}/how_解构结果_生成.json'
			
 
				+        with open(output_file, 'w', encoding='utf-8') as f:
			
 
				+            json.dump(how_result, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+        print(f"\n✓ 结果已保存到：{output_file}")
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        print(f"\n⚠ 无法解析 JSON 结果：{e}")
			
 
				+        print("请手动检查输出内容")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    set_trace()
			
 
				+    with trace("how decode"): 
			
 
				+        asyncio.run(main())
			
--- a/how_decode_v8_new_structure.py
+++ b/how_decode_v8_new_structure.py
@@ -1,818 +0,0 @@
 
				-"""
			
 
				-HOW 解构 V8 - 适配新的输入输出结构
			
 
				-
			
 
				-新的输入结构：
			
 
				-- 帖子信息：examples_new/阿里多多酱/作者历史帖子/{帖子ID}.json
			
 
				-- what解构结果：examples_new/阿里多多酱/output/{帖子ID}_{运行日期}_{运行时间}.json
			
 
				-
			
 
				-新的输出结构：
			
 
				-- examples_new/阿里多多酱/how_output/{帖子ID}_{运行日期}_{运行时间}.json
			
 
				-
			
 
				-核心方法：
			
 
				-- 拆步骤分析：
			
 
				-  - Step 1: 来源类型初筛 - 找出可能的来源
			
 
				-  - Step 2: 解构每个来源 - 明确标识【输入→处理→输出】
			
 
				-  - Step 3: 路径验证 - 评分和证据
			
 
				-  - Step 4: 综合结论 - 最可能的路径
			
 
				-- 多模态方式传递历史帖子
			
 
				-- 只分析灵感点，且只使用名称（避免what中间过程的干扰）
			
 
				-"""
			
 
				-
			
 
				-import asyncio
			
 
				-import json
			
 
				-import os
			
 
				-from typing import Dict, List
			
 
				-from datetime import datetime
			
 
				-
			
 
				-from agents import Agent, Runner
			
 
				-from lib.my_trace import set_trace
			
 
				-from lib.utils import read_json
			
 
				-from lib.client import get_model
			
 
				-
			
 
				-MODEL_NAME = "google/gemini-2.5-flash"
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# 多模态消息构建
			
 
				-# ============================================================================
			
 
				-
			
 
				-def build_post_multimodal_content(post_data: Dict) -> List[Dict]:
			
 
				-    """构建单个帖子的多模态内容"""
			
 
				-    images = post_data.get('images', [])
			
 
				-    image_count = len(images)
			
 
				-
			
 
				-    content = []
			
 
				-
			
 
				-    if images:
			
 
				-        content.append({
			
 
				-            "type": "input_text",
			
 
				-            "text": f"[帖子图集：{image_count}张图片，第一张是封面]"
			
 
				-        })
			
 
				-
			
 
				-    for img_url in images:
			
 
				-        content.append({
			
 
				-            "type": "input_image",
			
 
				-            "detail": "auto",
			
 
				-            "image_url": img_url
			
 
				-        })
			
 
				-
			
 
				-    post_info = f"""
			
 
				-<标题>
			
 
				-{post_data.get('title', '')}
			
 
				-</标题>
			
 
				-
			
 
				-<正文>
			
 
				-{post_data.get('body_text', '')}
			
 
				-</正文>
			
 
				-
			
 
				-<发布时间>
			
 
				-{post_data.get('publish_time', '')}
			
 
				-</发布时间>
			
 
				-
			
 
				-<互动数据>
			
 
				-点赞: {post_data.get('like_count', 0)}, 收藏: {post_data.get('collect_count', 0)}
			
 
				-</互动数据>
			
 
				-"""
			
 
				-    content.append({
			
 
				-        "type": "input_text",
			
 
				-        "text": post_info.strip()
			
 
				-    })
			
 
				-
			
 
				-    return content
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# Step 1: 来源类型初筛 Agent
			
 
				-# ============================================================================
			
 
				-
			
 
				-STEP1_PROMPT = """
			
 
				-你是一个创作溯源分析专家。
			
 
				-
			
 
				-你的任务：对给定的灵感点，**广召回**所有可能的来源，并给出初步推测路径。
			
 
				-
			
 
				-## 关键要求
			
 
				-
			
 
				-1. **广召回**：对每个来源类型（A/B/C/D）都要分析，即使可能性很低也要列出
			
 
				-2. **可能性评估**：给出高/中/低的评级
			
 
				-3. **初步推测路径**：用3-5步描述从原始点到灵感点的推导过程
			
 
				-
			
 
				-## 推测路径的要求
			
 
				-
			
 
				-### 原始点
			
 
				-- **B类来源**：原始点是博主历史帖子数据
			
 
				-- **C类来源**：原始点是外部平台信息（小红书/微博/知乎等）
			
 
				-- **A类来源**：原始点是当前帖子的其他灵感点
			
 
				-- **D类来源**：原始点是混合的
			
 
				-
			
 
				-### 可用操作类型
			
 
				-只能使用这三种操作：
			
 
				-1. **从内搜**：搜索/浏览博主历史帖子、回忆过往经验
			
 
				-2. **从外搜**：搜索外部平台、浏览热点话题、查询知识
			
 
				-3. **信息处理**：观察、对比、提取、归纳、联想、组合、类比
			
 
				-
			
 
				-### 步骤格式
			
 
				-每步必须明确：操作类型 + 具体做什么 + 输出什么
			
 
				-
			
 
				-示例：
			
 
				-- `步骤1 [从内搜]: 浏览历史帖子 → 发现经常发"胖猫穿衣"内容`
			
 
				-- `步骤2 [信息处理]: 观察这些帖子的共同特征 → 体型圆滚滚`
			
 
				-- `步骤3 [信息处理]: 联想类比体型特征 → 像煤气罐`
			
 
				-
			
 
				-### 注意事项
			
 
				-- **不能跳步骤**：关键词、概念的来源必须说清楚
			
 
				-- **不能有黑盒**：不能用"突然想到"、"产生灵感"等说法
			
 
				-- **数字世界操作**：只能操作数字化的数据，不能有物理世界交互
			
 
				-- **合适的粒度**：3-5步说清楚，不要太细（不说底层实现），不要太粗（不能黑盒）
			
 
				-
			
 
				-## 来源类型分类
			
 
				-
			
 
				-**A. 从其他点推导**
			
 
				-- 从当前帖子的其他灵感点推导得出
			
 
				-
			
 
				-**B. 从博主账号历史**
			
 
				-- 从历史帖子中的内容、风格、经验推导
			
 
				-
			
 
				-**C. 从外部信息**
			
 
				-- 从平台热点、流行梗、社会现象推导
			
 
				-
			
 
				-**D. 混合输入**
			
 
				-- 由多个来源融合创新
			
 
				-
			
 
				-## 输出格式
			
 
				-
			
 
				-```json
			
 
				-{
			
 
				-  "可能的来源": {
			
 
				-    "A_其他点推导": {
			
 
				-      "可能性": "高/中/低",
			
 
				-      "理由": "为什么这个来源是可能的（1-2句话）",
			
 
				-      "初步推测路径": [
			
 
				-        "步骤1 [操作类型]: 具体操作 → 输出结果",
			
 
				-        "步骤2 [操作类型]: 具体操作 → 输出结果",
			
 
				-        "步骤3 [操作类型]: 具体操作 → 输出结果"
			
 
				-      ]
			
 
				-    },
			
 
				-    "B_博主历史": {
			
 
				-      "可能性": "高/中/低",
			
 
				-      "理由": "为什么这个来源是可能的（1-2句话）",
			
 
				-      "初步推测路径": [
			
 
				-        "原始点: 博主历史帖子数据",
			
 
				-        "步骤1 [从内搜]: 具体操作 → 输出结果",
			
 
				-        "步骤2 [信息处理]: 具体操作 → 输出结果",
			
 
				-        "步骤3 [信息处理]: 具体操作 → 输出结果"
			
 
				-      ]
			
 
				-    },
			
 
				-    "C_外部信息": {
			
 
				-      "可能性": "高/中/低",
			
 
				-      "理由": "为什么这个来源是可能的（1-2句话）",
			
 
				-      "初步推测路径": [
			
 
				-        "原始点: 外部平台信息",
			
 
				-        "步骤1 [从外搜]: 具体操作 → 输出结果",
			
 
				-        "步骤2 [信息处理]: 具体操作 → 输出结果",
			
 
				-        "步骤3 [信息处理]: 具体操作 → 输出结果"
			
 
				-      ]
			
 
				-    },
			
 
				-    "D_混合输入": {
			
 
				-      "可能性": "高/中/低",
			
 
				-      "理由": "可能混合了哪些来源",
			
 
				-      "初步推测路径": [
			
 
				-        "原始点: 混合（历史数据+外部信息）",
			
 
				-        "步骤1 [操作类型]: 具体操作 → 输出结果",
			
 
				-        "步骤2 [操作类型]: 具体操作 → 输出结果",
			
 
				-        "步骤3 [操作类型]: 具体操作 → 输出结果"
			
 
				-      ]
			
 
				-    }
			
 
				-  }
			
 
				-}
			
 
				-```
			
 
				-"""
			
 
				-
			
 
				-step1_agent = Agent(
			
 
				-    name="Source Type Filter",
			
 
				-    instructions=STEP1_PROMPT,
			
 
				-    model=get_model(MODEL_NAME),
			
 
				-    tools=[],
			
 
				-)
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# Step 2: 深入分析 Agent
			
 
				-# ============================================================================
			
 
				-
			
 
				-STEP2_B_PROMPT = """
			
 
				-你是一个创作溯源分析专家。
			
 
				-
			
 
				-你的任务：解构从博主历史如何一步步得到这个灵感点。
			
 
				-
			
 
				-## 核心要求：明确标识 输入 → 处理 → 输出
			
 
				-
			
 
				-### 输入
			
 
				-- 具体是博主历史中的哪个/哪些帖子？
			
 
				-- 这些帖子里有什么内容？（图片/文字/主题）
			
 
				-
			
 
				-### 处理过程（一步步推导）
			
 
				-- 步骤1：创作者观察/接收到什么信息？
			
 
				-- 步骤2：产生了什么联想/思考？
			
 
				-- 步骤3：如何转化为具体的灵感？
			
 
				-- （可以有更多步骤）
			
 
				-
			
 
				-### 输出
			
 
				-- 最终得到的灵感点
			
 
				-
			
 
				-## 输出要求
			
 
				-
			
 
				-输出JSON格式：
			
 
				-```json
			
 
				-{
			
 
				-  "输入_博主历史帖子": {
			
 
				-    "相关帖子": [
			
 
				-      {
			
 
				-        "帖子序号": "历史帖子X/总数",
			
 
				-        "标题": "...",
			
 
				-        "关键内容": "具体是图片中什么/文字里什么"
			
 
				-      }
			
 
				-    ]
			
 
				-  },
			
 
				-  "处理_从输入到灵感的推导": {
			
 
				-    "步骤1": {
			
 
				-      "动作": "观察/接收",
			
 
				-      "内容": "创作者看到/注意到了什么"
			
 
				-    },
			
 
				-    "步骤2": {
			
 
				-      "动作": "联想/思考",
			
 
				-      "内容": "产生了什么想法/联系"
			
 
				-    },
			
 
				-    "步骤3": {
			
 
				-      "动作": "转化/形成",
			
 
				-      "内容": "如何变成具体的灵感"
			
 
				-    }
			
 
				-  },
			
 
				-  "输出_最终灵感": "灵感点名称"
			
 
				-}
			
 
				-```
			
 
				-"""
			
 
				-
			
 
				-STEP2_C_PROMPT = """
			
 
				-你是一个创作溯源分析专家。
			
 
				-
			
 
				-你的任务：解构从外部信息如何一步步得到这个灵感点。
			
 
				-
			
 
				-## 核心要求：明确标识 输入 → 处理 → 输出
			
 
				-
			
 
				-### 输入
			
 
				-- 具体是什么外部信息？（热点话题/流行梗/社会现象）
			
 
				-- 这些信息的具体内容是什么？
			
 
				-
			
 
				-### 处理过程（一步步推导）
			
 
				-- 步骤1：创作者接触到什么外部信息？
			
 
				-- 步骤2：如何理解/解读这个信息？
			
 
				-- 步骤3：如何与自己的内容结合？
			
 
				-- 步骤4：如何转化为具体的灵感？
			
 
				-- （可以有更多步骤）
			
 
				-
			
 
				-### 输出
			
 
				-- 最终得到的灵感点
			
 
				-
			
 
				-## 输出要求
			
 
				-
			
 
				-输出JSON格式：
			
 
				-```json
			
 
				-{
			
 
				-  "输入_外部信息": {
			
 
				-    "信息类型": "平台热点/流行梗/社会现象",
			
 
				-    "具体内容": "是什么话题/梗/现象",
			
 
				-    "信息来源": "在哪里看到/了解到"
			
 
				-  },
			
 
				-  "处理_从输入到灵感的推导": {
			
 
				-    "步骤1": {
			
 
				-      "动作": "接触/了解",
			
 
				-      "内容": "创作者看到/听到了什么"
			
 
				-    },
			
 
				-    "步骤2": {
			
 
				-      "动作": "理解/解读",
			
 
				-      "内容": "如何理解这个信息"
			
 
				-    },
			
 
				-    "步骤3": {
			
 
				-      "动作": "结合/融合",
			
 
				-      "内容": "如何与自己的内容结合"
			
 
				-    },
			
 
				-    "步骤4": {
			
 
				-      "动作": "转化/形成",
			
 
				-      "内容": "如何变成具体的灵感"
			
 
				-    }
			
 
				-  },
			
 
				-  "输出_最终灵感": "灵感点名称"
			
 
				-}
			
 
				-```
			
 
				-"""
			
 
				-
			
 
				-step2_b_agent = Agent(
			
 
				-    name="Blogger History Analyzer",
			
 
				-    instructions=STEP2_B_PROMPT,
			
 
				-    model=get_model(MODEL_NAME),
			
 
				-    tools=[],
			
 
				-)
			
 
				-
			
 
				-step2_c_agent = Agent(
			
 
				-    name="External Info Analyzer",
			
 
				-    instructions=STEP2_C_PROMPT,
			
 
				-    model=get_model(MODEL_NAME),
			
 
				-    tools=[],
			
 
				-)
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# Step 3: 路径验证 Agent
			
 
				-# ============================================================================
			
 
				-
			
 
				-STEP3_PROMPT = """
			
 
				-你是一个创作溯源分析专家。
			
 
				-
			
 
				-你的任务：对每个来源路径进行验证和评分。
			
 
				-
			
 
				-## 验证维度
			
 
				-
			
 
				-1. **支持证据**（3-5条具体证据）
			
 
				-2. **反驳点**（如果有不支持的因素）
			
 
				-3. **可能性评分**（1-10分，基于证据强度）
			
 
				-
			
 
				-## 输出要求
			
 
				-
			
 
				-输出JSON格式：
			
 
				-```json
			
 
				-{
			
 
				-  "来源类型": "B",
			
 
				-  "支持证据": [
			
 
				-    "证据1: ...",
			
 
				-    "证据2: ...",
			
 
				-    "证据3: ..."
			
 
				-  ],
			
 
				-  "反驳点": [
			
 
				-    "反驳1: ..."
			
 
				-  ],
			
 
				-  "可能性评分": 8,
			
 
				-  "评分说明": "为什么给这个分数"
			
 
				-}
			
 
				-```
			
 
				-"""
			
 
				-
			
 
				-step3_agent = Agent(
			
 
				-    name="Path Validator",
			
 
				-    instructions=STEP3_PROMPT,
			
 
				-    model=get_model(MODEL_NAME),
			
 
				-    tools=[],
			
 
				-)
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# Step 4: 综合结论 Agent
			
 
				-# ============================================================================
			
 
				-
			
 
				-STEP4_PROMPT = """
			
 
				-你是一个创作溯源分析专家。
			
 
				-
			
 
				-你的任务：基于前面的分析，给出综合结论。
			
 
				-
			
 
				-## 输出要求
			
 
				-
			
 
				-输出JSON格式：
			
 
				-```json
			
 
				-{
			
 
				-  "最可能的来源路径": "...",
			
 
				-  "各来源的占比": {
			
 
				-    "B_博主历史": "60%",
			
 
				-    "C_外部信息": "40%"
			
 
				-  },
			
 
				-  "完整推导路径": "从...到...最终形成...",
			
 
				-  "关键转折点": "...",
			
 
				-  "整体置信度": 85
			
 
				-}
			
 
				-```
			
 
				-"""
			
 
				-
			
 
				-step4_agent = Agent(
			
 
				-    name="Conclusion Synthesizer",
			
 
				-    instructions=STEP4_PROMPT,
			
 
				-    model=get_model(MODEL_NAME),
			
 
				-    tools=[],
			
 
				-)
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# 从新格式的what结果中提取所有点
			
 
				-# ============================================================================
			
 
				-
			
 
				-def extract_all_points_v8(what_result: Dict) -> List[Dict]:
			
 
				-    """
			
 
				-    从新格式的 what 解构结果中提取所有的点
			
 
				-
			
 
				-    重要：只提取灵感点，且只保留名称字段
			
 
				-    - 描述、在帖子中的体现等都是what的中间过程，会干扰how的解构
			
 
				-    """
			
 
				-    points = []
			
 
				-
			
 
				-    # 只提取灵感点，且只保留名称
			
 
				-    inspiration_points = what_result.get('三点解构', {}).get('灵感点', {}).get('points', [])
			
 
				-    for idx, point in enumerate(inspiration_points, 1):
			
 
				-        points.append({
			
 
				-            'type': '灵感点',
			
 
				-            'id': f'灵感点{idx}',
			
 
				-            'name': point.get('灵感点', '')  # 只要名称，不要其他字段
			
 
				-        })
			
 
				-
			
 
				-    return points
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# 加载博主历史数据
			
 
				-# ============================================================================
			
 
				-
			
 
				-def load_blogger_history_v8(history_dir: str, target_post_id: str) -> Dict:
			
 
				-    """加载博主历史数据 - V8版本"""
			
 
				-    history_posts = []
			
 
				-
			
 
				-    for filename in os.listdir(history_dir):
			
 
				-        if filename.endswith('.json'):
			
 
				-            post_id = filename.replace('.json', '')
			
 
				-            # 只过滤掉当前帖子本身（按ID）
			
 
				-            if post_id != target_post_id:
			
 
				-                filepath = os.path.join(history_dir, filename)
			
 
				-                with open(filepath, 'r', encoding='utf-8') as f:
			
 
				-                    data = json.load(f)
			
 
				-                    history_posts.append(data)
			
 
				-
			
 
				-    # 按时间排序
			
 
				-    history_posts.sort(key=lambda x: x.get('publish_timestamp', 0))
			
 
				-
			
 
				-    return {
			
 
				-        "历史帖子数": len(history_posts),
			
 
				-        "历史帖子列表": history_posts
			
 
				-    }
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# 拆步骤分析（复用之前的逻辑）
			
 
				-# ============================================================================
			
 
				-
			
 
				-async def analyze_point_step_by_step(
			
 
				-    point: Dict,
			
 
				-    all_points: List[Dict],
			
 
				-    blogger_history: Dict,
			
 
				-    account_name: str
			
 
				-):
			
 
				-    """拆步骤分析单个点"""
			
 
				-
			
 
				-    print(f"\n{'='*80}")
			
 
				-    print(f"拆步骤溯源分析: {point['id']} - {point['name']}")
			
 
				-    print(f"{'='*80}")
			
 
				-
			
 
				-    # ========== 准备基础上下文 ==========
			
 
				-    content = []
			
 
				-
			
 
				-    # 待溯源的点信息（只有名称，避免what中间过程的干扰）
			
 
				-    content.append({
			
 
				-        "type": "input_text",
			
 
				-        "text": f"""
			
 
				-# 待溯源的灵感点
			
 
				-
			
 
				-**名称**: {point['name']}
			
 
				-
			
 
				-**说明**: 这是从what解构中提取的灵感点名称，请分析这个灵感点是如何产生的。
			
 
				-"""
			
 
				-    })
			
 
				-
			
 
				-    # 其他点信息
			
 
				-    other_points_info = []
			
 
				-    for p in all_points:
			
 
				-        if p['id'] != point['id']:
			
 
				-            other_points_info.append(f"- {p['id']}: {p['name']}")
			
 
				-
			
 
				-    content.append({
			
 
				-        "type": "input_text",
			
 
				-        "text": f"""
			
 
				----
			
 
				-
			
 
				-# 其他点（可能的输入来源A）
			
 
				-
			
 
				-{chr(10).join(other_points_info)}
			
 
				-"""
			
 
				-    })
			
 
				-
			
 
				-    # 博主历史信息（多模态）
			
 
				-    history_posts = blogger_history.get('历史帖子列表', [])
			
 
				-    content.append({
			
 
				-        "type": "input_text",
			
 
				-        "text": f"""
			
 
				----
			
 
				-
			
 
				-# 博主历史信息（可能的输入来源B）
			
 
				-
			
 
				-**账号名称**: {account_name}
			
 
				-**历史帖子数量**: {len(history_posts)} 个
			
 
				-
			
 
				-以下是博主的所有历史帖子（按发布时间排序）：
			
 
				-"""
			
 
				-    })
			
 
				-
			
 
				-    # 为每个历史帖子构建多模态内容
			
 
				-    for idx, hist_post in enumerate(history_posts, 1):
			
 
				-        content.append({
			
 
				-            "type": "input_text",
			
 
				-            "text": f"\n## 历史帖子 {idx}/{len(history_posts)}\n"
			
 
				-        })
			
 
				-        hist_post_content = build_post_multimodal_content(hist_post)
			
 
				-        content.extend(hist_post_content)
			
 
				-
			
 
				-    # ========== Step 1: 来源类型初筛 ==========
			
 
				-    print(f"\n{'='*60}")
			
 
				-    print("Step 1: 来源类型初筛")
			
 
				-    print(f"{'='*60}")
			
 
				-
			
 
				-    step1_messages = [{
			
 
				-        "role": "user",
			
 
				-        "content": content + [{
			
 
				-            "type": "input_text",
			
 
				-            "text": "\n---\n\n请根据以上信息，判断这个点最可能来自哪些来源类型（只选1-3个最可能的）。"
			
 
				-        }]
			
 
				-    }]
			
 
				-
			
 
				-    result1 = await Runner.run(step1_agent, input=step1_messages)
			
 
				-    print(f"\n✅ Step 1 结果:\n{result1.final_output[:300]}...\n")
			
 
				-
			
 
				-    step1_result = extract_json(result1.final_output)
			
 
				-
			
 
				-    # 暂时只返回 Step 1 结果，不继续后面的步骤
			
 
				-    return {
			
 
				-        "灵感点": point['name'],
			
 
				-        "step1_来源可能性分析": step1_result
			
 
				-    }
			
 
				-
			
 
				-    # TODO: 后续步骤暂时注释掉，先把 Step 1 做扎实
			
 
				-    # selected_types = step1_result.get('选择的来源类型', [])
			
 
				-    #
			
 
				-    # # ========== Step 2: 解构每个来源（输入→处理→输出） ==========
			
 
				-    # print(f"\n{'='*60}")
			
 
				-    # print(f"Step 2: 解构每个来源 - 输入→处理→输出 (针对类型: {', '.join(selected_types)})")
			
 
				-    # print(f"{'='*60}")
			
 
				-    #
			
 
				-    # step2_results = {}
			
 
				-    #
			
 
				-    # for source_type in selected_types:
			
 
				-    #     if source_type == "B":
			
 
				-    #         print(f"\n▶ Step 2.B: 解构从博主历史如何得到灵感")
			
 
				-    #         step2_messages = [{
			
 
				-    #             "role": "user",
			
 
				-    #             "content": content + [{
			
 
				-    #                 "type": "input_text",
			
 
				-    #                 "text": f"""
			
 
				-    # ---
			
 
				-    #
			
 
				-    # Step 1 已确定：这个灵感点可能来自"博主账号历史"
			
 
				-    #
			
 
				-    # 请解构：从博主历史如何一步步得到这个灵感点？
			
 
				-    #
			
 
				-    # 要求明确标识：
			
 
				-    # - 输入：具体是哪个历史帖子？里面有什么内容？
			
 
				-    # - 处理：如何从输入一步步推导到灵感？（步骤1、2、3...）
			
 
				-    # - 输出：最终得到的灵感点
			
 
				-    # """
			
 
				-    #             }]
			
 
				-    #         }]
			
 
				-    #         result2b = await Runner.run(step2_b_agent, input=step2_messages)
			
 
				-    #         print(f"\n✅ Step 2.B 结果:\n{result2b.final_output[:300]}...\n")
			
 
				-    #         step2_results['B'] = extract_json(result2b.final_output)
			
 
				-    #
			
 
				-    #     elif source_type == "C":
			
 
				-    #         print(f"\n▶ Step 2.C: 解构从外部信息如何得到灵感")
			
 
				-    #         step2_messages = [{
			
 
				-    #             "role": "user",
			
 
				-    #             "content": content + [{
			
 
				-    #                 "type": "input_text",
			
 
				-    #                 "text": f"""
			
 
				-    # ---
			
 
				-    #
			
 
				-    # Step 1 已确定：这个灵感点可能来自"外部信息"
			
 
				-    #
			
 
				-    # 请解构：从外部信息如何一步步得到这个灵感点？
			
 
				-    #
			
 
				-    # 要求明确标识：
			
 
				-    # - 输入：具体是什么外部信息？（热点/梗/现象）
			
 
				-    # - 处理：如何从输入一步步推导到灵感？（步骤1、2、3、4...）
			
 
				-    # - 输出：最终得到的灵感点
			
 
				-    # """
			
 
				-    #             }]
			
 
				-    #         }]
			
 
				-    #         result2c = await Runner.run(step2_c_agent, input=step2_messages)
			
 
				-    #         print(f"\n✅ Step 2.C 结果:\n{result2c.final_output[:300]}...\n")
			
 
				-    #         step2_results['C'] = extract_json(result2c.final_output)
			
 
				-    #
			
 
				-    # # ========== Step 3: 路径验证 ==========
			
 
				-    # print(f"\n{'='*60}")
			
 
				-    # print("Step 3: 路径验证")
			
 
				-    # print(f"{'='*60}")
			
 
				-    #
			
 
				-    # step3_results = []
			
 
				-    #
			
 
				-    # for source_type in selected_types:
			
 
				-    #     print(f"\n▶ Step 3.{source_type}: 验证来源路径")
			
 
				-    #
			
 
				-    #     step2_analysis = json.dumps(step2_results.get(source_type, {}), ensure_ascii=False, indent=2)
			
 
				-    #
			
 
				-    #     step3_messages = [{
			
 
				-    #         "role": "user",
			
 
				-    #         "content": [{
			
 
				-    #             "type": "input_text",
			
 
				-    #             "text": f"""
			
 
				-    # 基于前面的分析：
			
 
				-    #
			
 
				-    # Step 1: 初筛选择了来源类型 {source_type}
			
 
				-    # Step 2: 深入分析结果：
			
 
				-    # {step2_analysis}
			
 
				-    #
			
 
				-    # 请对这个来源路径进行验证：列出支持证据、反驳点、给出评分。
			
 
				-    # """
			
 
				-    #         }]
			
 
				-    #     }]
			
 
				-    #
			
 
				-    #     result3 = await Runner.run(step3_agent, input=step3_messages)
			
 
				-    #     print(f"\n✅ Step 3.{source_type} 结果:\n{result3.final_output[:300]}...\n")
			
 
				-    #     step3_results.append(extract_json(result3.final_output))
			
 
				-    #
			
 
				-    # # ========== Step 4: 综合结论 ==========
			
 
				-    # print(f"\n{'='*60}")
			
 
				-    # print("Step 4: 综合结论")
			
 
				-    # print(f"{'='*60}")
			
 
				-    #
			
 
				-    # all_analysis = {
			
 
				-    #     "step1": step1_result,
			
 
				-    #     "step2": step2_results,
			
 
				-    #     "step3": step3_results
			
 
				-    # }
			
 
				-    #
			
 
				-    # step4_messages = [{
			
 
				-    #     "role": "user",
			
 
				-    #     "content": [{
			
 
				-    #         "type": "input_text",
			
 
				-    #         "text": f"""
			
 
				-    # 基于前面所有步骤的分析：
			
 
				-    #
			
 
				-    # {json.dumps(all_analysis, ensure_ascii=False, indent=2)}
			
 
				-    #
			
 
				-    # 请给出综合结论：最可能的来源路径、各来源占比、完整推导过程。
			
 
				-    # """
			
 
				-    #     }]
			
 
				-    # }]
			
 
				-    #
			
 
				-    # result4 = await Runner.run(step4_agent, input=step4_messages)
			
 
				-    # print(f"\n✅ Step 4 结果:\n{result4.final_output[:300]}...\n")
			
 
				-    #
			
 
				-    # final_result = {
			
 
				-    #     "灵感点": point['name'],
			
 
				-    #     "step1_来源类型初筛": step1_result,
			
 
				-    #     "step2_深入分析": step2_results,
			
 
				-    #     "step3_路径验证": step3_results,
			
 
				-    #     "step4_综合结论": extract_json(result4.final_output)
			
 
				-    # }
			
 
				-    #
			
 
				-    # return final_result
			
 
				-
			
 
				-
			
 
				-def extract_json(text: str) -> Dict:
			
 
				-    """从文本中提取JSON"""
			
 
				-    try:
			
 
				-        if "```json" in text:
			
 
				-            json_start = text.index("```json") + 7
			
 
				-            json_end = text.index("```", json_start)
			
 
				-            json_text = text[json_start:json_end].strip()
			
 
				-        elif "```" in text:
			
 
				-            json_start = text.index("```") + 3
			
 
				-            json_end = text.index("```", json_start)
			
 
				-            json_text = text[json_start:json_end].strip()
			
 
				-        else:
			
 
				-            json_text = text
			
 
				-        return json.loads(json_text)
			
 
				-    except:
			
 
				-        return {"原始输出": text}
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# Main
			
 
				-# ============================================================================
			
 
				-
			
 
				-async def main(current_time, log_url):
			
 
				-    import sys
			
 
				-
			
 
				-    # 参数解析
			
 
				-    if len(sys.argv) < 2:
			
 
				-        print("用法: python how_decode_v8_new_structure.py <what_result_file>")
			
 
				-        print("示例: python how_decode_v8_new_structure.py examples_new/阿里多多酱/output/685b593800000000120141d3_20251104_111017.json")
			
 
				-        sys.exit(1)
			
 
				-
			
 
				-    what_result_file = sys.argv[1]
			
 
				-
			
 
				-    # 解析文件名：{帖子ID}_{运行日期}_{运行时间}.json
			
 
				-    filename = os.path.basename(what_result_file)
			
 
				-    filename_without_ext = filename.replace('.json', '')
			
 
				-    parts = filename_without_ext.split('_')
			
 
				-
			
 
				-    if len(parts) < 3:
			
 
				-        print(f"❌ 文件名格式不正确: {filename}")
			
 
				-        print("期望格式: {帖子ID}_{运行日期}_{运行时间}.json")
			
 
				-        sys.exit(1)
			
 
				-
			
 
				-    post_id = parts[0]
			
 
				-    run_date = parts[1]
			
 
				-    run_time = parts[2]
			
 
				-
			
 
				-    print("="*80)
			
 
				-    print("HOW 解构 V8 - 拆步骤溯源分析")
			
 
				-    print("="*80)
			
 
				-    print(f"\n目标帖子ID: {post_id}")
			
 
				-    print(f"运行日期: {run_date}")
			
 
				-    print(f"运行时间: {run_time}")
			
 
				-
			
 
				-    # 读取 what 解构结果
			
 
				-    what_result = read_json(what_result_file)
			
 
				-    if not what_result:
			
 
				-        print(f"❌ 无法读取文件: {what_result_file}")
			
 
				-        sys.exit(1)
			
 
				-
			
 
				-    # 从输入路径中提取账号名称
			
 
				-    # 路径格式: examples_new/{账号名}/output/{帖子ID}_{运行日期}_{运行时间}.json
			
 
				-    path_parts = what_result_file.split('/')
			
 
				-    if len(path_parts) >= 3 and path_parts[0] == 'examples_new':
			
 
				-        author_name = path_parts[1]
			
 
				-    else:
			
 
				-        print(f"❌ 无法从路径中提取账号名称: {what_result_file}")
			
 
				-        sys.exit(1)
			
 
				-
			
 
				-    # 构建路径
			
 
				-    base_dir = f"examples_new/{author_name}"
			
 
				-    history_dir = f"{base_dir}/作者历史帖子"
			
 
				-
			
 
				-    # 读取目标帖子信息（用于获取账号名称）
			
 
				-    target_post_file = f"{history_dir}/{post_id}.json"
			
 
				-    target_post = read_json(target_post_file)
			
 
				-    account_name = target_post.get('channel_account_name', author_name) if target_post else author_name
			
 
				-
			
 
				-    # 加载博主历史数据
			
 
				-    print(f"\n加载博主历史数据...")
			
 
				-    blogger_history = load_blogger_history_v8(history_dir, post_id)
			
 
				-    print(f"✓ 已加载 {blogger_history['历史帖子数']} 个历史帖子")
			
 
				-
			
 
				-    # 提取所有灵感点（只提取灵感点，且只保留名称）
			
 
				-    all_points = extract_all_points_v8(what_result)
			
 
				-    print(f"\n从 WHAT 解构中提取了 {len(all_points)} 个灵感点：")
			
 
				-    for p in all_points:
			
 
				-        print(f"  - {p['id']}: {p['name']}")
			
 
				-
			
 
				-    # 对每个灵感点进行溯源分析
			
 
				-    source_analysis_results = []
			
 
				-
			
 
				-    for point in all_points:
			
 
				-        result = await analyze_point_step_by_step(
			
 
				-            point, all_points, blogger_history, account_name
			
 
				-        )
			
 
				-        source_analysis_results.append(result)
			
 
				-
			
 
				-        # 添加延迟避免API限流
			
 
				-        await asyncio.sleep(2)
			
 
				-
			
 
				-    # 保存结果
			
 
				-    now = datetime.now()
			
 
				-    output_filename = f"{post_id}_{now.strftime('%Y%m%d')}_{now.strftime('%H%M%S')}.json"
			
 
				-    output_dir = f"{base_dir}/how_output"
			
 
				-    os.makedirs(output_dir, exist_ok=True)
			
 
				-    output_file = f"{output_dir}/{output_filename}"
			
 
				-
			
 
				-    final_result = {
			
 
				-        "how_解构_V8": {
			
 
				-            "版本说明": "V8 - 拆步骤溯源分析，适配新的输入输出结构",
			
 
				-            "目标帖子ID": post_id,
			
 
				-            "运行时间": now.strftime('%Y-%m-%d %H:%M:%S'),
			
 
				-            "log_url": log_url,
			
 
				-            "历史数据统计": {
			
 
				-                "历史帖子数": blogger_history['历史帖子数'],
			
 
				-                "数据格式": "多模态（图片 + 结构化文本）"
			
 
				-            },
			
 
				-            "分析范围": "只分析灵感点",
			
 
				-            "灵感点数量": len(all_points),
			
 
				-            "分析方法": "拆步骤（Step1初筛 -> Step2深入 -> Step3验证 -> Step4结论），只使用灵感点名称，避免what中间过程的干扰",
			
 
				-            "灵感点溯源分析": source_analysis_results
			
 
				-        }
			
 
				-    }
			
 
				-
			
 
				-    with open(output_file, 'w', encoding='utf-8') as f:
			
 
				-        json.dump(final_result, f, ensure_ascii=False, indent=2)
			
 
				-
			
 
				-    print("\n" + "="*80)
			
 
				-    print(f"✓ V8 溯源分析完成！结果已保存到：")
			
 
				-    print(f"  {output_file}")
			
 
				-    print("="*80)
			
 
				-
			
 
				-
			
 
if __name__ == "__main__":
    # Script entry point: the tracing helper is imported only on this path.
    from agents import trace
    # set_trace() returns a pair that is passed straight through to main()
    # as (current_time, log_url); it is called before the trace context opens.
    current_time, log_url = set_trace()
    # Run the whole async pipeline inside one trace named "how decode v8".
    with trace("how decode v8"):
        asyncio.run(main(current_time, log_url))
			
--- a/how_decode_v9_point_dependency.py
+++ b/how_decode_v9_point_dependency.py
@@ -1,946 +0,0 @@
 
				-"""
			
 
				-HOW 解构 V9 - 点依赖关系处理
			
 
				-
			
 
				-V9 新特性：
			
 
				-- 正确处理点的依赖关系：
			
 
				-  - 灵感点 ↔ 目的点（双向互推）
			
 
				-  - 灵感点,目的点 → 关键点（单向推导）
			
 
				-  - 同类型点不能互推
			
 
				-
			
 
				-- 提取所有三类点（即使只分析灵感点）
			
 
				-- 根据点类型动态提供可推导来源
			
 
				-- 支持机器可模拟的推测路径
			
 
				-
			
 
				-输入输出结构：
			
 
				-- 帖子信息：examples_new/{账号}/作者历史帖子/{帖子ID}.json
			
 
				-- what解构结果：examples_new/{账号}/output/{帖子ID}_{运行日期}_{运行时间}.json
			
 
				-- 输出结果：examples_new/{账号}/how_output/{帖子ID}_{运行日期}_{运行时间}.json
			
 
				-"""
			
 
				-
			
 
				-import asyncio
			
 
				-import json
			
 
				-import os
			
 
				-from typing import Dict, List
			
 
				-from datetime import datetime
			
 
				-
			
 
				-from agents import Agent, Runner, trace
			
 
				-from agents.tracing.create import custom_span
			
 
				-from lib.my_trace import set_trace
			
 
				-from lib.utils import read_json
			
 
				-from lib.client import get_model
			
 
				-
			
 
# Model identifier handed to get_model() when the agents in this module are built.
MODEL_NAME = "google/gemini-2.5-flash"
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# 多模态消息构建
			
 
				-# ============================================================================
			
 
				-
			
 
				-def build_post_multimodal_content(post_data: Dict) -> List[Dict]:
			
 
				-    """构建单个帖子的多模态内容"""
			
 
				-    images = post_data.get('images', [])
			
 
				-    image_count = len(images)
			
 
				-
			
 
				-    content = []
			
 
				-
			
 
				-    if images:
			
 
				-        content.append({
			
 
				-            "type": "input_text",
			
 
				-            "text": f"[帖子图集：{image_count}张图片，第一张是封面]"
			
 
				-        })
			
 
				-
			
 
				-    for img_url in images:
			
 
				-        content.append({
			
 
				-            "type": "input_image",
			
 
				-            "detail": "auto",
			
 
				-            "image_url": img_url
			
 
				-        })
			
 
				-
			
 
				-    post_info = f"""
			
 
				-<标题>
			
 
				-{post_data.get('title', '')}
			
 
				-</标题>
			
 
				-
			
 
				-<正文>
			
 
				-{post_data.get('body_text', '')}
			
 
				-</正文>
			
 
				-
			
 
				-<发布时间>
			
 
				-{post_data.get('publish_time', '')}
			
 
				-</发布时间>
			
 
				-
			
 
				-<互动数据>
			
 
				-点赞: {post_data.get('like_count', 0)}, 收藏: {post_data.get('collect_count', 0)}
			
 
				-</互动数据>
			
 
				-"""
			
 
				-    content.append({
			
 
				-        "type": "input_text",
			
 
				-        "text": post_info.strip()
			
 
				-    })
			
 
				-
			
 
				-    return content
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# Step 1: 来源类型初筛 Agent
			
 
				-# ============================================================================
			
 
				-
			
 
				-STEP1_PROMPT = """
			
 
				-你是一个创作溯源分析专家。
			
 
				-
			
 
				-你的任务：对给定的灵感点，**广召回**所有可能的来源，并给出初步推测路径。
			
 
				-
			
 
				-## 关键要求
			
 
				-
			
 
				-1. **广召回**：对每个来源类型（A/B/C/D）都要分析，即使可能性很低也要列出
			
 
				-2. **可能性评估**：给出高/中/低的评级
			
 
				-3. **初步推测路径**：用3-5步描述从原始点到灵感点的推导过程
			
 
				-
			
 
				-## 推测路径的要求
			
 
				-
			
 
				-### 原始点
			
 
				-- **A类来源**：原始点是上下文中"可推导来源（A类来源）"提供的其他类型的点
			
 
				-- **B类来源**：原始点是博主历史帖子数据
			
 
				-- **C类来源**：原始点是外部平台信息（小红书/微博/知乎等）
			
 
				-- **D类来源**：原始点是混合的（多种来源组合）
			
 
				-
			
 
				-### 可用操作类型
			
 
				-只能使用这三种操作：
			
 
				-1. **从内搜**：搜索/浏览博主历史帖子、回忆过往经验
			
 
				-2. **从外搜**：搜索外部平台、浏览热点话题、查询知识
			
 
				-3. **信息处理**：观察、对比、提取、归纳、联想、组合、类比
			
 
				-
			
 
				-### 步骤格式
			
 
				-每步必须明确：操作类型 + 具体做什么 + 输出什么
			
 
				-
			
 
				-格式：`步骤X [操作类型]: 具体操作 → 输出结果`
			
 
				-
			
 
				-### 注意事项
			
 
				-- **不能跳步骤**：关键词、概念的来源必须说清楚
			
 
				-- **不能有黑盒**：不能用"突然想到"、"产生灵感"等说法
			
 
				-- **数字世界操作**：只能操作数字化的数据，不能有物理世界交互
			
 
				-- **合适的粒度**：3-5步说清楚，不要太细（不说底层实现），不要太粗（不能黑盒）
			
 
				-
			
 
				-## 来源类型分类
			
 
				-
			
 
				-**A. 从其他点推导**
			
 
				-- 从上下文中"可推导来源（A类来源）"部分提供的点推导
			
 
				-- 根据点依赖关系：
			
 
				-  - 灵感点可从目的点推导
			
 
				-  - 目的点可从灵感点推导
			
 
				-  - 关键点可从灵感点和目的点推导
			
 
				-- 如果上下文说明"没有可用于推导的其他类型的点"，则A类可能性为"无"
			
 
				-
			
 
				-**B. 从博主账号历史**
			
 
				-- 从历史帖子中的内容、风格、经验推导
			
 
				-
			
 
				-**C. 从外部信息**
			
 
				-- 从平台热点、流行梗、社会现象推导
			
 
				-
			
 
				-**D. 混合输入**
			
 
				-- 由多个来源融合创新
			
 
				-
			
 
				-## 输出格式
			
 
				-
			
 
				-**注意**：
			
 
				-- 如果上下文说明没有可推导来源，则A类来源可能性为"无"，理由说明原因
			
 
				-- 对所有A/B/C/D来源都要分析，即使可能性很低
			
 
				-
			
 
				-```json
			
 
				-{
			
 
				-  "可能的来源": {
			
 
				-    "A_其他点推导": {
			
 
				-      "可能性": "高/中/低/无",
			
 
				-      "理由": "为什么这个来源是可能的（1-2句话），如果可能性为'无'，说明为什么没有可推导来源",
			
 
				-      "初步推测路径": [
			
 
				-        "步骤1 [操作类型]: 具体操作 → 输出结果",
			
 
				-        "步骤2 [操作类型]: 具体操作 → 输出结果",
			
 
				-        "步骤3 [操作类型]: 具体操作 → 输出结果"
			
 
				-      ]
			
 
				-    },
			
 
				-    "B_博主历史": {
			
 
				-      "可能性": "高/中/低",
			
 
				-      "理由": "为什么这个来源是可能的（1-2句话）",
			
 
				-      "初步推测路径": [
			
 
				-        "原始点: 博主历史帖子数据",
			
 
				-        "步骤1 [从内搜]: 具体操作 → 输出结果",
			
 
				-        "步骤2 [信息处理]: 具体操作 → 输出结果",
			
 
				-        "步骤3 [信息处理]: 具体操作 → 输出结果"
			
 
				-      ]
			
 
				-    },
			
 
				-    "C_外部信息": {
			
 
				-      "可能性": "高/中/低",
			
 
				-      "理由": "为什么这个来源是可能的（1-2句话）",
			
 
				-      "初步推测路径": [
			
 
				-        "原始点: 外部平台信息",
			
 
				-        "步骤1 [从外搜]: 具体操作 → 输出结果",
			
 
				-        "步骤2 [信息处理]: 具体操作 → 输出结果",
			
 
				-        "步骤3 [信息处理]: 具体操作 → 输出结果"
			
 
				-      ]
			
 
				-    },
			
 
				-    "D_混合输入": {
			
 
				-      "可能性": "高/中/低",
			
 
				-      "理由": "可能混合了哪些来源",
			
 
				-      "初步推测路径": [
			
 
				-        "原始点: 混合（历史数据+外部信息）",
			
 
				-        "步骤1 [操作类型]: 具体操作 → 输出结果",
			
 
				-        "步骤2 [操作类型]: 具体操作 → 输出结果",
			
 
				-        "步骤3 [操作类型]: 具体操作 → 输出结果"
			
 
				-      ]
			
 
				-    }
			
 
				-  }
			
 
				-}
			
 
				-```
			
 
				-"""
			
 
				-
			
 
				-step1_agent = Agent(
			
 
				-    name="Source Type Filter",
			
 
				-    instructions=STEP1_PROMPT,
			
 
				-    model=get_model(MODEL_NAME),
			
 
				-    tools=[],
			
 
				-)
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# Step 2: 深入分析 Agent
			
 
				-# ============================================================================
			
 
				-
			
 
				-STEP2_B_PROMPT = """
			
 
				-你是一个创作溯源分析专家。
			
 
				-
			
 
				-你的任务：解构从博主历史如何一步步得到这个灵感点。
			
 
				-
			
 
				-## 核心要求：明确标识 输入 → 处理 → 输出
			
 
				-
			
 
				-### 输入
			
 
				-- 具体是博主历史中的哪个/哪些帖子？
			
 
				-- 这些帖子里有什么内容？（图片/文字/主题）
			
 
				-
			
 
				-### 处理过程（一步步推导）
			
 
				-- 步骤1：创作者观察/接收到什么信息？
			
 
				-- 步骤2：产生了什么联想/思考？
			
 
				-- 步骤3：如何转化为具体的灵感？
			
 
				-- （可以有更多步骤）
			
 
				-
			
 
				-### 输出
			
 
				-- 最终得到的灵感点
			
 
				-
			
 
				-## 输出要求
			
 
				-
			
 
				-输出JSON格式：
			
 
				-```json
			
 
				-{
			
 
				-  "输入_博主历史帖子": {
			
 
				-    "相关帖子": [
			
 
				-      {
			
 
				-        "帖子序号": "历史帖子X/总数",
			
 
				-        "标题": "...",
			
 
				-        "关键内容": "具体是图片中什么/文字里什么"
			
 
				-      }
			
 
				-    ]
			
 
				-  },
			
 
				-  "处理_从输入到灵感的推导": {
			
 
				-    "步骤1": {
			
 
				-      "动作": "观察/接收",
			
 
				-      "内容": "创作者看到/注意到了什么"
			
 
				-    },
			
 
				-    "步骤2": {
			
 
				-      "动作": "联想/思考",
			
 
				-      "内容": "产生了什么想法/联系"
			
 
				-    },
			
 
				-    "步骤3": {
			
 
				-      "动作": "转化/形成",
			
 
				-      "内容": "如何变成具体的灵感"
			
 
				-    }
			
 
				-  },
			
 
				-  "输出_最终灵感": "灵感点名称"
			
 
				-}
			
 
				-```
			
 
				-"""
			
 
				-
			
 
				-STEP2_C_PROMPT = """
			
 
				-你是一个创作溯源分析专家。
			
 
				-
			
 
				-你的任务：解构从外部信息如何一步步得到这个灵感点。
			
 
				-
			
 
				-## 核心要求：明确标识 输入 → 处理 → 输出
			
 
				-
			
 
				-### 输入
			
 
				-- 具体是什么外部信息？（热点话题/流行梗/社会现象）
			
 
				-- 这些信息的具体内容是什么？
			
 
				-
			
 
				-### 处理过程（一步步推导）
			
 
				-- 步骤1：创作者接触到什么外部信息？
			
 
				-- 步骤2：如何理解/解读这个信息？
			
 
				-- 步骤3：如何与自己的内容结合？
			
 
				-- 步骤4：如何转化为具体的灵感？
			
 
				-- （可以有更多步骤）
			
 
				-
			
 
				-### 输出
			
 
				-- 最终得到的灵感点
			
 
				-
			
 
				-## 输出要求
			
 
				-
			
 
				-输出JSON格式：
			
 
				-```json
			
 
				-{
			
 
				-  "输入_外部信息": {
			
 
				-    "信息类型": "平台热点/流行梗/社会现象",
			
 
				-    "具体内容": "是什么话题/梗/现象",
			
 
				-    "信息来源": "在哪里看到/了解到"
			
 
				-  },
			
 
				-  "处理_从输入到灵感的推导": {
			
 
				-    "步骤1": {
			
 
				-      "动作": "接触/了解",
			
 
				-      "内容": "创作者看到/听到了什么"
			
 
				-    },
			
 
				-    "步骤2": {
			
 
				-      "动作": "理解/解读",
			
 
				-      "内容": "如何理解这个信息"
			
 
				-    },
			
 
				-    "步骤3": {
			
 
				-      "动作": "结合/融合",
			
 
				-      "内容": "如何与自己的内容结合"
			
 
				-    },
			
 
				-    "步骤4": {
			
 
				-      "动作": "转化/形成",
			
 
				-      "内容": "如何变成具体的灵感"
			
 
				-    }
			
 
				-  },
			
 
				-  "输出_最终灵感": "灵感点名称"
			
 
				-}
			
 
				-```
			
 
				-"""
			
 
				-
			
 
				-step2_b_agent = Agent(
			
 
				-    name="Blogger History Analyzer",
			
 
				-    instructions=STEP2_B_PROMPT,
			
 
				-    model=get_model(MODEL_NAME),
			
 
				-    tools=[],
			
 
				-)
			
 
				-
			
 
				-step2_c_agent = Agent(
			
 
				-    name="External Info Analyzer",
			
 
				-    instructions=STEP2_C_PROMPT,
			
 
				-    model=get_model(MODEL_NAME),
			
 
				-    tools=[],
			
 
				-)
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# Step 3: 路径验证 Agent
			
 
				-# ============================================================================
			
 
				-
			
 
				-STEP3_PROMPT = """
			
 
				-你是一个创作溯源分析专家。
			
 
				-
			
 
				-你的任务：对每个来源路径进行验证和评分。
			
 
				-
			
 
				-## 验证维度
			
 
				-
			
 
				-1. **支持证据**（3-5条具体证据）
			
 
				-2. **反驳点**（如果有不支持的因素）
			
 
				-3. **可能性评分**（1-10分，基于证据强度）
			
 
				-
			
 
				-## 输出要求
			
 
				-
			
 
				-输出JSON格式：
			
 
				-```json
			
 
				-{
			
 
				-  "来源类型": "B",
			
 
				-  "支持证据": [
			
 
				-    "证据1: ...",
			
 
				-    "证据2: ...",
			
 
				-    "证据3: ..."
			
 
				-  ],
			
 
				-  "反驳点": [
			
 
				-    "反驳1: ..."
			
 
				-  ],
			
 
				-  "可能性评分": 8,
			
 
				-  "评分说明": "为什么给这个分数"
			
 
				-}
			
 
				-```
			
 
				-"""
			
 
				-
			
 
				-step3_agent = Agent(
			
 
				-    name="Path Validator",
			
 
				-    instructions=STEP3_PROMPT,
			
 
				-    model=get_model(MODEL_NAME),
			
 
				-    tools=[],
			
 
				-)
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# Step 4: 综合结论 Agent
			
 
				-# ============================================================================
			
 
				-
			
 
				-STEP4_PROMPT = """
			
 
				-你是一个创作溯源分析专家。
			
 
				-
			
 
				-你的任务：基于前面的分析，给出综合结论。
			
 
				-
			
 
				-## 输出要求
			
 
				-
			
 
				-输出JSON格式：
			
 
				-```json
			
 
				-{
			
 
				-  "最可能的来源路径": "...",
			
 
				-  "各来源的占比": {
			
 
				-    "B_博主历史": "60%",
			
 
				-    "C_外部信息": "40%"
			
 
				-  },
			
 
				-  "完整推导路径": "从...到...最终形成...",
			
 
				-  "关键转折点": "...",
			
 
				-  "整体置信度": 85
			
 
				-}
			
 
				-```
			
 
				-"""
			
 
				-
			
 
				-step4_agent = Agent(
			
 
				-    name="Conclusion Synthesizer",
			
 
				-    instructions=STEP4_PROMPT,
			
 
				-    model=get_model(MODEL_NAME),
			
 
				-    tools=[],
			
 
				-)
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# 从新格式的what结果中提取所有点
			
 
				-# ============================================================================
			
 
				-
			
 
				-def extract_all_points_v9(what_result: Dict) -> Dict[str, List[Dict]]:
			
 
				-    """
			
 
				-    从新格式的 what 解构结果中提取所有三类点
			
 
				-
			
 
				-    V9版本：从选题理解.explicit_elements提取所有三类点，只保留名称
			
 
				-    - 描述、在帖子中的体现等都是what的中间过程，会干扰how的解构
			
 
				-
			
 
				-    返回格式：
			
 
				-    {
			
 
				-        '灵感点': [{'id': '灵感点1', 'name': 'xxx'}, ...],
			
 
				-        '目的点': [{'id': '目的点1', 'name': 'xxx'}, ...],
			
 
				-        '关键点': [{'id': '关键点1', 'name': 'xxx'}, ...]
			
 
				-    }
			
 
				-    """
			
 
				-    all_points = {
			
 
				-        '灵感点': [],
			
 
				-        '目的点': [],
			
 
				-        '关键点': []
			
 
				-    }
			
 
				-
			
 
				-    # 从选题理解.explicit_elements提取
			
 
				-    explicit_elements = what_result.get('选题理解', {}).get('explicit_elements', {})
			
 
				-
			
 
				-    # 提取灵感点列表（数组）
			
 
				-    inspiration_list = explicit_elements.get('灵感点列表', [])
			
 
				-    for idx, name in enumerate(inspiration_list, 1):
			
 
				-        all_points['灵感点'].append({
			
 
				-            'type': '灵感点',
			
 
				-            'id': f'灵感点{idx}',
			
 
				-            'name': name
			
 
				-        })
			
 
				-
			
 
				-    # 提取目的点（单个字符串）
			
 
				-    purpose_name = explicit_elements.get('目的点', '')
			
 
				-    if purpose_name:
			
 
				-        all_points['目的点'].append({
			
 
				-            'type': '目的点',
			
 
				-            'id': '目的点1',
			
 
				-            'name': purpose_name
			
 
				-        })
			
 
				-
			
 
				-    # 提取关键点列表（数组）
			
 
				-    key_list = explicit_elements.get('关键点列表', [])
			
 
				-    for idx, name in enumerate(key_list, 1):
			
 
				-        all_points['关键点'].append({
			
 
				-            'type': '关键点',
			
 
				-            'id': f'关键点{idx}',
			
 
				-            'name': name
			
 
				-        })
			
 
				-
			
 
				-    return all_points
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# 加载博主历史数据
			
 
				-# ============================================================================
			
 
				-
			
 
				-def load_blogger_history_v8(history_dir: str, target_post_id: str) -> Dict:
			
 
				-    """加载博主历史数据 - V8版本"""
			
 
				-    history_posts = []
			
 
				-
			
 
				-    for filename in os.listdir(history_dir):
			
 
				-        if filename.endswith('.json'):
			
 
				-            post_id = filename.replace('.json', '')
			
 
				-            # 只过滤掉当前帖子本身（按ID）
			
 
				-            if post_id != target_post_id:
			
 
				-                filepath = os.path.join(history_dir, filename)
			
 
				-                with open(filepath, 'r', encoding='utf-8') as f:
			
 
				-                    data = json.load(f)
			
 
				-                    history_posts.append(data)
			
 
				-
			
 
				-    # 按时间排序
			
 
				-    history_posts.sort(key=lambda x: x.get('publish_timestamp', 0))
			
 
				-
			
 
				-    return {
			
 
				-        "历史帖子数": len(history_posts),
			
 
				-        "历史帖子列表": history_posts
			
 
				-    }
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# 拆步骤分析（复用之前的逻辑）
			
 
				-# ============================================================================
			
 
				-
			
 
				-async def analyze_point_step_by_step(
			
 
				-    point: Dict,
			
 
				-    all_points: Dict[str, List[Dict]],
			
 
				-    blogger_history: Dict,
			
 
				-    account_name: str
			
 
				-):
			
 
				-    """
			
 
				-    拆步骤分析单个点
			
 
				-
			
 
				-    V9版本：根据点的依赖关系动态提供可推导来源
			
 
				-    - 灵感点：可从目的点推导
			
 
				-    - 目的点：可从灵感点推导
			
 
				-    - 关键点：可从灵感点和目的点推导
			
 
				-    """
			
 
				-
			
 
				-    print(f"\n{'='*80}")
			
 
				-    print(f"拆步骤溯源分析: {point['id']} - {point['name']}")
			
 
				-    print(f"{'='*80}")
			
 
				-
			
 
				-    # ========== 根据点类型确定可推导来源 ==========
			
 
				-    point_type = point['type']
			
 
				-    derivable_sources = {}
			
 
				-
			
 
				-    if point_type == '灵感点':
			
 
				-        # 灵感点可从目的点推导
			
 
				-        derivable_sources['目的点'] = all_points.get('目的点', [])
			
 
				-    elif point_type == '目的点':
			
 
				-        # 目的点可从灵感点推导
			
 
				-        derivable_sources['灵感点'] = all_points.get('灵感点', [])
			
 
				-    elif point_type == '关键点':
			
 
				-        # 关键点可从灵感点和目的点推导
			
 
				-        derivable_sources['灵感点'] = all_points.get('灵感点', [])
			
 
				-        derivable_sources['目的点'] = all_points.get('目的点', [])
			
 
				-
			
 
				-    # ========== 准备基础上下文 ==========
			
 
				-    content = []
			
 
				-
			
 
				-    # 待溯源的点信息（只有名称，避免what中间过程的干扰）
			
 
				-    content.append({
			
 
				-        "type": "input_text",
			
 
				-        "text": f"""
			
 
				-# 待溯源的{point_type}
			
 
				-
			
 
				-**名称**: {point['name']}
			
 
				-
			
 
				-**说明**: 这是从what解构中提取的{point_type}名称，请分析这个{point_type}是如何产生的。
			
 
				-"""
			
 
				-    })
			
 
				-
			
 
				-    # 可推导来源的点信息（A类来源）
			
 
				-    if derivable_sources:
			
 
				-        sources_info = []
			
 
				-        for source_type, source_points in derivable_sources.items():
			
 
				-            if source_points:
			
 
				-                sources_info.append(f"\n## {source_type}")
			
 
				-                for p in source_points:
			
 
				-                    sources_info.append(f"- {p['id']}: {p['name']}")
			
 
				-
			
 
				-        if sources_info:
			
 
				-            content.append({
			
 
				-                "type": "input_text",
			
 
				-                "text": f"""
			
 
				----
			
 
				-
			
 
				-# 可推导来源（A类来源）
			
 
				-
			
 
				-根据点依赖关系，当前{point_type}可以从以下类型的点推导：
			
 
				-{''.join(sources_info)}
			
 
				-
			
 
				-**注意**: 同类型的点不能互相推导。
			
 
				-"""
			
 
				-            })
			
 
				-    else:
			
 
				-        # 如果没有可推导来源（例如灵感点且没有目的点）
			
 
				-        content.append({
			
 
				-            "type": "input_text",
			
 
				-            "text": f"""
			
 
				----
			
 
				-
			
 
				-# 可推导来源（A类来源）
			
 
				-
			
 
				-当前帖子中没有可用于推导{point_type}的其他类型的点。
			
 
				-因此A类来源可能性为：无
			
 
				-"""
			
 
				-        })
			
 
				-
			
 
				-    # 博主历史信息（多模态）
			
 
				-    history_posts = blogger_history.get('历史帖子列表', [])
			
 
				-    content.append({
			
 
				-        "type": "input_text",
			
 
				-        "text": f"""
			
 
				----
			
 
				-
			
 
				-# 博主历史信息（可能的输入来源B）
			
 
				-
			
 
				-**账号名称**: {account_name}
			
 
				-**历史帖子数量**: {len(history_posts)} 个
			
 
				-
			
 
				-以下是博主的所有历史帖子（按发布时间排序）：
			
 
				-"""
			
 
				-    })
			
 
				-
			
 
				-    # 为每个历史帖子构建多模态内容
			
 
				-    for idx, hist_post in enumerate(history_posts, 1):
			
 
				-        content.append({
			
 
				-            "type": "input_text",
			
 
				-            "text": f"\n## 历史帖子 {idx}/{len(history_posts)}\n"
			
 
				-        })
			
 
				-        hist_post_content = build_post_multimodal_content(hist_post)
			
 
				-        content.extend(hist_post_content)
			
 
				-
			
 
				-    # ========== Step 1: 来源类型初筛 ==========
			
 
				-    print(f"\n{'='*60}")
			
 
				-    print("Step 1: 来源类型初筛")
			
 
				-    print(f"{'='*60}")
			
 
				-
			
 
				-    step1_messages = [{
			
 
				-        "role": "user",
			
 
				-        "content": content + [{
			
 
				-            "type": "input_text",
			
 
				-            "text": "\n---\n\n请根据以上信息，判断这个点最可能来自哪些来源类型（只选1-3个最可能的）。"
			
 
				-        }]
			
 
				-    }]
			
 
				-
			
 
				-    # 使用custom_span添加更多元数据
			
 
				-    with custom_span(
			
 
				-        name=f"Step1: {point['id']}",
			
 
				-        data={
			
 
				-            "point_id": point['id'],
			
 
				-            "point_name": point['name'],
			
 
				-            "point_type": point_type,
			
 
				-            "step": "来源类型初筛",
			
 
				-            "可推导来源数": sum(len(v) for v in derivable_sources.values())
			
 
				-        }
			
 
				-    ):
			
 
				-        result1 = await Runner.run(step1_agent, input=step1_messages)
			
 
				-    print(f"\n✅ Step 1 结果:\n{result1.final_output[:300]}...\n")
			
 
				-
			
 
				-    step1_result = extract_json(result1.final_output)
			
 
				-
			
 
				-    # 暂时只返回 Step 1 结果，不继续后面的步骤
			
 
				-    return {
			
 
				-        "灵感点": point['name'],
			
 
				-        "step1_来源可能性分析": step1_result
			
 
				-    }
			
 
				-
			
 
				-    # TODO: 后续步骤暂时注释掉，先把 Step 1 做扎实
			
 
				-    # selected_types = step1_result.get('选择的来源类型', [])
			
 
				-    #
			
 
				-    # # ========== Step 2: 解构每个来源（输入→处理→输出） ==========
			
 
				-    # print(f"\n{'='*60}")
			
 
				-    # print(f"Step 2: 解构每个来源 - 输入→处理→输出 (针对类型: {', '.join(selected_types)})")
			
 
				-    # print(f"{'='*60}")
			
 
				-    #
			
 
				-    # step2_results = {}
			
 
				-    #
			
 
				-    # for source_type in selected_types:
			
 
				-    #     if source_type == "B":
			
 
				-    #         print(f"\n▶ Step 2.B: 解构从博主历史如何得到灵感")
			
 
				-    #         step2_messages = [{
			
 
				-    #             "role": "user",
			
 
				-    #             "content": content + [{
			
 
				-    #                 "type": "input_text",
			
 
				-    #                 "text": f"""
			
 
				-    # ---
			
 
				-    #
			
 
				-    # Step 1 已确定：这个灵感点可能来自"博主账号历史"
			
 
				-    #
			
 
				-    # 请解构：从博主历史如何一步步得到这个灵感点？
			
 
				-    #
			
 
				-    # 要求明确标识：
			
 
				-    # - 输入：具体是哪个历史帖子？里面有什么内容？
			
 
				-    # - 处理：如何从输入一步步推导到灵感？（步骤1、2、3...）
			
 
				-    # - 输出：最终得到的灵感点
			
 
				-    # """
			
 
				-    #             }]
			
 
				-    #         }]
			
 
				-    #         result2b = await Runner.run(step2_b_agent, input=step2_messages)
			
 
				-    #         print(f"\n✅ Step 2.B 结果:\n{result2b.final_output[:300]}...\n")
			
 
				-    #         step2_results['B'] = extract_json(result2b.final_output)
			
 
				-    #
			
 
				-    #     elif source_type == "C":
			
 
				-    #         print(f"\n▶ Step 2.C: 解构从外部信息如何得到灵感")
			
 
				-    #         step2_messages = [{
			
 
				-    #             "role": "user",
			
 
				-    #             "content": content + [{
			
 
				-    #                 "type": "input_text",
			
 
				-    #                 "text": f"""
			
 
				-    # ---
			
 
				-    #
			
 
				-    # Step 1 已确定：这个灵感点可能来自"外部信息"
			
 
				-    #
			
 
				-    # 请解构：从外部信息如何一步步得到这个灵感点？
			
 
				-    #
			
 
				-    # 要求明确标识：
			
 
				-    # - 输入：具体是什么外部信息？（热点/梗/现象）
			
 
				-    # - 处理：如何从输入一步步推导到灵感？（步骤1、2、3、4...）
			
 
				-    # - 输出：最终得到的灵感点
			
 
				-    # """
			
 
				-    #             }]
			
 
				-    #         }]
			
 
				-    #         result2c = await Runner.run(step2_c_agent, input=step2_messages)
			
 
				-    #         print(f"\n✅ Step 2.C 结果:\n{result2c.final_output[:300]}...\n")
			
 
				-    #         step2_results['C'] = extract_json(result2c.final_output)
			
 
				-    #
			
 
				-    # # ========== Step 3: 路径验证 ==========
			
 
				-    # print(f"\n{'='*60}")
			
 
				-    # print("Step 3: 路径验证")
			
 
				-    # print(f"{'='*60}")
			
 
				-    #
			
 
				-    # step3_results = []
			
 
				-    #
			
 
				-    # for source_type in selected_types:
			
 
				-    #     print(f"\n▶ Step 3.{source_type}: 验证来源路径")
			
 
				-    #
			
 
				-    #     step2_analysis = json.dumps(step2_results.get(source_type, {}), ensure_ascii=False, indent=2)
			
 
				-    #
			
 
				-    #     step3_messages = [{
			
 
				-    #         "role": "user",
			
 
				-    #         "content": [{
			
 
				-    #             "type": "input_text",
			
 
				-    #             "text": f"""
			
 
				-    # 基于前面的分析：
			
 
				-    #
			
 
				-    # Step 1: 初筛选择了来源类型 {source_type}
			
 
				-    # Step 2: 深入分析结果：
			
 
				-    # {step2_analysis}
			
 
				-    #
			
 
				-    # 请对这个来源路径进行验证：列出支持证据、反驳点、给出评分。
			
 
				-    # """
			
 
				-    #         }]
			
 
				-    #     }]
			
 
				-    #
			
 
				-    #     result3 = await Runner.run(step3_agent, input=step3_messages)
			
 
				-    #     print(f"\n✅ Step 3.{source_type} 结果:\n{result3.final_output[:300]}...\n")
			
 
				-    #     step3_results.append(extract_json(result3.final_output))
			
 
				-    #
			
 
				-    # # ========== Step 4: 综合结论 ==========
			
 
				-    # print(f"\n{'='*60}")
			
 
				-    # print("Step 4: 综合结论")
			
 
				-    # print(f"{'='*60}")
			
 
				-    #
			
 
				-    # all_analysis = {
			
 
				-    #     "step1": step1_result,
			
 
				-    #     "step2": step2_results,
			
 
				-    #     "step3": step3_results
			
 
				-    # }
			
 
				-    #
			
 
				-    # step4_messages = [{
			
 
				-    #     "role": "user",
			
 
				-    #     "content": [{
			
 
				-    #         "type": "input_text",
			
 
				-    #         "text": f"""
			
 
				-    # 基于前面所有步骤的分析：
			
 
				-    #
			
 
				-    # {json.dumps(all_analysis, ensure_ascii=False, indent=2)}
			
 
				-    #
			
 
				-    # 请给出综合结论：最可能的来源路径、各来源占比、完整推导过程。
			
 
				-    # """
			
 
				-    #     }]
			
 
				-    # }]
			
 
				-    #
			
 
				-    # result4 = await Runner.run(step4_agent, input=step4_messages)
			
 
				-    # print(f"\n✅ Step 4 结果:\n{result4.final_output[:300]}...\n")
			
 
				-    #
			
 
				-    # final_result = {
			
 
				-    #     "灵感点": point['name'],
			
 
				-    #     "step1_来源类型初筛": step1_result,
			
 
				-    #     "step2_深入分析": step2_results,
			
 
				-    #     "step3_路径验证": step3_results,
			
 
				-    #     "step4_综合结论": extract_json(result4.final_output)
			
 
				-    # }
			
 
				-    #
			
 
				-    # return final_result
			
 
				-
			
 
				-
			
 
				-def extract_json(text: str) -> Dict:
			
 
				-    """从文本中提取JSON"""
			
 
				-    try:
			
 
				-        if "```json" in text:
			
 
				-            json_start = text.index("```json") + 7
			
 
				-            json_end = text.index("```", json_start)
			
 
				-            json_text = text[json_start:json_end].strip()
			
 
				-        elif "```" in text:
			
 
				-            json_start = text.index("```") + 3
			
 
				-            json_end = text.index("```", json_start)
			
 
				-            json_text = text[json_start:json_end].strip()
			
 
				-        else:
			
 
				-            json_text = text
			
 
				-        return json.loads(json_text)
			
 
				-    except:
			
 
				-        return {"原始输出": text}
			
 
				-
			
 
				-
			
 
				-# ============================================================================
			
 
				-# Main
			
 
				-# ============================================================================
			
 
				-
			
 
				-async def main(current_time, log_url):
			
 
				-    import sys
			
 
				-
			
 
				-    # 默认测试文件
			
 
				-    DEFAULT_TEST_FILE = "examples_new/阿里多多酱/output/685b593800000000120141d3_20251104_111017.json"
			
 
				-
			
 
				-    # 参数解析
			
 
				-    if len(sys.argv) < 2:
			
 
				-        print(f"未提供参数，使用默认测试文件: {DEFAULT_TEST_FILE}")
			
 
				-        what_result_file = DEFAULT_TEST_FILE
			
 
				-    else:
			
 
				-        what_result_file = sys.argv[1]
			
 
				-
			
 
				-    # 解析文件名：{帖子ID}_{运行日期}_{运行时间}.json
			
 
				-    filename = os.path.basename(what_result_file)
			
 
				-    filename_without_ext = filename.replace('.json', '')
			
 
				-    parts = filename_without_ext.split('_')
			
 
				-
			
 
				-    if len(parts) < 3:
			
 
				-        print(f"❌ 文件名格式不正确: {filename}")
			
 
				-        print("期望格式: {帖子ID}_{运行日期}_{运行时间}.json")
			
 
				-        sys.exit(1)
			
 
				-
			
 
				-    post_id = parts[0]
			
 
				-    run_date = parts[1]
			
 
				-    run_time = parts[2]
			
 
				-
			
 
				-    print("="*80)
			
 
				-    print("HOW 解构 V9 - 点依赖关系处理")
			
 
				-    print("="*80)
			
 
				-    print(f"\n目标帖子ID: {post_id}")
			
 
				-    print(f"运行日期: {run_date}")
			
 
				-    print(f"运行时间: {run_time}")
			
 
				-
			
 
				-    # 读取 what 解构结果
			
 
				-    what_result = read_json(what_result_file)
			
 
				-    if not what_result:
			
 
				-        print(f"❌ 无法读取文件: {what_result_file}")
			
 
				-        sys.exit(1)
			
 
				-
			
 
				-    # 从输入路径中提取账号名称
			
 
				-    # 路径格式: examples_new/{账号名}/output/{帖子ID}_{运行日期}_{运行时间}.json
			
 
				-    path_parts = what_result_file.split('/')
			
 
				-    if len(path_parts) >= 3 and path_parts[0] == 'examples_new':
			
 
				-        author_name = path_parts[1]
			
 
				-    else:
			
 
				-        print(f"❌ 无法从路径中提取账号名称: {what_result_file}")
			
 
				-        sys.exit(1)
			
 
				-
			
 
				-    # 构建路径
			
 
				-    base_dir = f"examples_new/{author_name}"
			
 
				-    history_dir = f"{base_dir}/作者历史帖子"
			
 
				-
			
 
				-    # 读取目标帖子信息（用于获取账号名称）
			
 
				-    target_post_file = f"{history_dir}/{post_id}.json"
			
 
				-    target_post = read_json(target_post_file)
			
 
				-    account_name = target_post.get('channel_account_name', author_name) if target_post else author_name
			
 
				-
			
 
				-    # 加载博主历史数据
			
 
				-    print(f"\n加载博主历史数据...")
			
 
				-    blogger_history = load_blogger_history_v8(history_dir, post_id)
			
 
				-    print(f"✓ 已加载 {blogger_history['历史帖子数']} 个历史帖子")
			
 
				-
			
 
				-    # 提取所有三类点（即使只分析部分类型，也需要提取所有点用于依赖关系判断）
			
 
				-    all_points = extract_all_points_v9(what_result)
			
 
				-
			
 
				-    # 配置：决定分析哪些类型的点
			
 
				-    analyze_types = ['灵感点', '目的点', '关键点']  # 分析所有三类点
			
 
				-
			
 
				-    # 统计提取的点
			
 
				-    print(f"\n从 WHAT 解构中提取的点：")
			
 
				-    for point_type in ['灵感点', '目的点', '关键点']:
			
 
				-        count = len(all_points[point_type])
			
 
				-        print(f"  - {point_type}: {count} 个")
			
 
				-
			
 
				-    # 显示要分析的点
			
 
				-    print(f"\n本次分析范围: {', '.join(analyze_types)}")
			
 
				-    points_to_analyze = []
			
 
				-    for point_type in analyze_types:
			
 
				-        for point in all_points[point_type]:
			
 
				-            points_to_analyze.append(point)
			
 
				-            print(f"  - {point['id']}: {point['name']}")
			
 
				-
			
 
				-    # 对每个点进行溯源分析
			
 
				-    source_analysis_results = []
			
 
				-
			
 
				-    for idx, point in enumerate(points_to_analyze, 1):
			
 
				-        # 为每个点创建一个自定义span
			
 
				-        point_name_short = point['name'][:30] + "..." if len(point['name']) > 30 else point['name']
			
 
				-        with custom_span(
			
 
				-            name=f"{point['id']}: {point_name_short}",
			
 
				-            data={
			
 
				-                "point_index": f"{idx}/{len(points_to_analyze)}",
			
 
				-                "point_id": point['id'],
			
 
				-                "point_type": point['type'],
			
 
				-                "point_name": point['name'],
			
 
				-                "analysis_stage": "HOW解构溯源"
			
 
				-            }
			
 
				-        ):
			
 
				-            result = await analyze_point_step_by_step(
			
 
				-                point, all_points, blogger_history, account_name
			
 
				-            )
			
 
				-        source_analysis_results.append(result)
			
 
				-
			
 
				-        # 添加延迟避免API限流
			
 
				-        await asyncio.sleep(2)
			
 
				-
			
 
				-    # 保存结果
			
 
				-    now = datetime.now()
			
 
				-    output_filename = f"{post_id}_{now.strftime('%Y%m%d')}_{now.strftime('%H%M%S')}.json"
			
 
				-    output_dir = f"{base_dir}/how_output"
			
 
				-    os.makedirs(output_dir, exist_ok=True)
			
 
				-    output_file = f"{output_dir}/{output_filename}"
			
 
				-
			
 
				-    # 统计各类型点的数量
			
 
				-    points_stats = {
			
 
				-        point_type: len(all_points[point_type])
			
 
				-        for point_type in ['灵感点', '目的点', '关键点']
			
 
				-    }
			
 
				-
			
 
				-    final_result = {
			
 
				-        "how_解构_V9": {
			
 
				-            "版本说明": "V9 - 点依赖关系处理，支持机器可模拟的推测路径",
			
 
				-            "目标帖子ID": post_id,
			
 
				-            "运行时间": now.strftime('%Y-%m-%d %H:%M:%S'),
			
 
				-            "log_url": log_url,
			
 
				-            "历史数据统计": {
			
 
				-                "历史帖子数": blogger_history['历史帖子数'],
			
 
				-                "数据格式": "多模态（图片 + 结构化文本）"
			
 
				-            },
			
 
				-            "点统计": points_stats,
			
 
				-            "分析范围": ', '.join(analyze_types),
			
 
				-            "分析数量": len(points_to_analyze),
			
 
				-            "点依赖关系": {
			
 
				-                "说明": "灵感点 ↔ 目的点（双向互推），灵感点,目的点 → 关键点（单向推导）",
			
 
				-                "规则": "同类型的点不能互相推导"
			
 
				-            },
			
 
				-            "溯源分析结果": source_analysis_results
			
 
				-        }
			
 
				-    }
			
 
				-
			
 
				-    with open(output_file, 'w', encoding='utf-8') as f:
			
 
				-        json.dump(final_result, f, ensure_ascii=False, indent=2)
			
 
				-
			
 
				-    print("\n" + "="*80)
			
 
				-    print(f"✓ V9 溯源分析完成！结果已保存到：")
			
 
				-    print(f"  {output_file}")
			
 
				-    print("="*80)
			
 
				-
			
 
				-
			
 
if __name__ == "__main__":
    # Set up tracing first; set_trace() hands back the run timestamp and the
    # log URL that main() records in its output.
    current_time, log_url = set_trace()
    # Run the async entry point inside the trace("how decode v9") context.
    with trace("how decode v9"):
        asyncio.run(main(current_time, log_url))
			
--- a/lib/README_async_utils.md
+++ b/lib/README_async_utils.md
@@ -0,0 +1,114 @@
 
				+# 异步并发处理工具
			
 
				+
			
 
				+## 文件说明
			
 
				+
			
 
				+`lib/async_utils.py` - 提供通用的异步任务并发执行功能
			
 
				+
			
 
				+## 功能列表
			
 
				+
			
 
				+### 1. `process_tasks_with_semaphore`
			
 
				+
			
 
				+基本的并发处理函数，使用信号量控制并发数量。
			
 
				+
			
 
				+#### 参数
			
 
				+
			
 
				+- `tasks`: 任务列表
			
 
				+- `process_func`: 处理单个任务的异步函数，签名为 `async def func(task, index) -> result`
			
 
				+- `max_concurrent`: 最大并发数（默认: 3）
			
 
				+- `show_progress`: 是否显示进度信息（默认: True）
			
 
				+
			
 
				+#### 使用示例
			
 
				+
			
 
				+```python
			
 
				+from lib.async_utils import process_tasks_with_semaphore
			
 
				+
			
 
				+# 定义处理单个任务的函数
			
 
				+async def process_one_task(task: dict, index: int) -> dict:
			
 
				+    # 你的处理逻辑
			
 
				+    result = await some_async_operation(task)
			
 
				+    return result
			
 
				+
			
 
				+# 准备任务列表
			
 
				+tasks = [task1, task2, task3, ...]
			
 
				+
			
 
				+# 并发处理所有任务
			
 
				+results = await process_tasks_with_semaphore(
			
 
				+    tasks,
			
 
				+    process_one_task,
			
 
				+    max_concurrent=3,
			
 
				+    show_progress=True
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+### 2. `process_tasks_with_semaphore_retry`
			
 
				+
			
 
				+支持重试的并发处理函数，适用于不稳定的网络请求。
			
 
				+
			
 
				+#### 参数
			
 
				+
			
 
				+- `tasks`: 任务列表
			
 
				+- `process_func`: 处理单个任务的异步函数
			
 
				+- `max_concurrent`: 最大并发数（默认: 3）
			
 
				+- `max_retries`: 最大尝试次数（含首次执行，默认: 3，即失败后最多再重试 2 次）
			
 
				+- `show_progress`: 是否显示进度信息（默认: True）
			
 
				+
			
 
				+#### 使用示例
			
 
				+
			
 
				+```python
			
 
				+from lib.async_utils import process_tasks_with_semaphore_retry
			
 
				+
			
 
				+# 定义可能失败的异步任务
			
 
				+async def unstable_task(task: dict, index: int) -> dict:
			
 
				+    # 可能会抛出异常的操作
			
 
				+    result = await api_call(task)
			
 
				+    return result
			
 
				+
			
 
				+# 并发处理，失败时自动重试
			
 
				+results = await process_tasks_with_semaphore_retry(
			
 
				+    tasks,
			
 
				+    unstable_task,
			
 
				+    max_concurrent=3,
			
 
				+    max_retries=3,
			
 
				+    show_progress=True
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+## 在 match_inspiration_to_persona.py 中的使用
			
 
				+
			
 
				+```python
			
 
				+# 1. 导入工具
			
 
				+from lib.async_utils import process_tasks_with_semaphore
			
 
				+
			
 
				+# 2. 定义处理函数
			
 
				+async def process_match_task_with_error_handling(task: dict, index: int) -> dict:
			
 
				+    try:
			
 
				+        result = await match_single_task(task)
			
 
				+        return result
			
 
				+    except Exception as e:
			
 
				+        # 错误处理逻辑
			
 
				+        return error_result
			
 
				+
			
 
				+# 3. 并发处理任务
			
 
				+results = await process_tasks_with_semaphore(
			
 
				+    test_tasks,
			
 
				+    process_match_task_with_error_handling,
			
 
				+    max_concurrent=3,
			
 
				+    show_progress=True
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+## 特点
			
 
				+
			
 
				+1. **通用性**: 可用于任何需要并发处理的异步任务
			
 
				+2. **并发控制**: 使用信号量控制并发数量，避免资源耗尽
			
 
				+3. **顺序保证**: 返回结果与输入任务的顺序一致
			
 
				+4. **进度显示**: 可选的进度显示功能
			
 
				+5. **重试支持**: 第二个函数支持自动重试机制；重试耗尽后，该任务的异常对象会作为结果返回（`return_exceptions=True`），不会中断其他任务，调用方需自行检查结果中是否包含异常
			
 
				+
			
 
				+## 适用场景
			
 
				+
			
 
				+- API 批量请求
			
 
				+- 文件批量处理
			
 
				+- 数据库批量操作
			
 
				+- LLM 批量推理
			
 
				+- 任何需要并发控制的异步操作
			
--- a/lib/async_utils.py
+++ b/lib/async_utils.py
@@ -0,0 +1,99 @@
 
				+"""
			
 
				+异步并发处理工具模块
			
 
				+
			
 
				+提供通用的异步任务并发执行功能
			
 
				+"""
			
 
				+import asyncio
			
 
				+from typing import List, Callable, Any, Awaitable
			
 
				+
			
 
				+
			
 
				+async def process_tasks_with_semaphore(
			
 
				+        tasks: List[Any],
			
 
				+        process_func: Callable[[Any, int], Awaitable[Any]],
			
 
				+        max_concurrent: int = 3,
			
 
				+        show_progress: bool = True
			
 
				+) -> List[Any]:
			
 
				+    """使用信号量控制并发数量处理任务
			
 
				+
			
 
				+    Args:
			
 
				+        tasks: 任务列表
			
 
				+        process_func: 处理单个任务的异步函数，签名为 async def func(task, index) -> result
			
 
				+        max_concurrent: 最大并发数
			
 
				+        show_progress: 是否显示进度信息
			
 
				+
			
 
				+    Returns:
			
 
				+        结果列表（保持原始顺序）
			
 
				+
			
 
				+    Example:
			
 
				+        async def process_one(task, index):
			
 
				+            result = await some_async_operation(task)
			
 
				+            return result
			
 
				+
			
 
				+        tasks = [task1, task2, task3]
			
 
				+        results = await process_tasks_with_semaphore(tasks, process_one, max_concurrent=3)
			
 
				+    """
			
 
				+    semaphore = asyncio.Semaphore(max_concurrent)
			
 
				+
			
 
				+    async def process_with_semaphore(task: Any, index: int):
			
 
				+        """包装处理函数，添加信号量控制"""
			
 
				+        async with semaphore:
			
 
				+            result = await process_func(task, index)
			
 
				+            if show_progress:
			
 
				+                print(f"[{index + 1}/{len(tasks)}] 任务完成")
			
 
				+            return result
			
 
				+
			
 
				+    # 并发处理所有任务
			
 
				+    results = await asyncio.gather(
			
 
				+        *[process_with_semaphore(task, i) for i, task in enumerate(tasks)]
			
 
				+    )
			
 
				+
			
 
				+    return results
			
 
				+
			
 
				+
			
 
				+async def process_tasks_with_semaphore_retry(
			
 
				+        tasks: List[Any],
			
 
				+        process_func: Callable[[Any, int], Awaitable[Any]],
			
 
				+        max_concurrent: int = 3,
			
 
				+        max_retries: int = 3,
			
 
				+        show_progress: bool = True
			
 
				+) -> List[Any]:
			
 
				+    """使用信号量控制并发数量处理任务（支持重试）
			
 
				+
			
 
				+    Args:
			
 
				+        tasks: 任务列表
			
 
				+        process_func: 处理单个任务的异步函数，签名为 async def func(task, index) -> result
			
 
				+        max_concurrent: 最大并发数
			
 
				+        max_retries: 最大重试次数
			
 
				+        show_progress: 是否显示进度信息
			
 
				+
			
 
				+    Returns:
			
 
				+        结果列表（保持原始顺序）
			
 
				+    """
			
 
				+    semaphore = asyncio.Semaphore(max_concurrent)
			
 
				+
			
 
				+    async def process_with_semaphore_and_retry(task: Any, index: int):
			
 
				+        """包装处理函数，添加信号量控制和重试逻辑"""
			
 
				+        async with semaphore:
			
 
				+            for attempt in range(max_retries):
			
 
				+                try:
			
 
				+                    result = await process_func(task, index)
			
 
				+                    if show_progress:
			
 
				+                        print(f"[{index + 1}/{len(tasks)}] 任务完成")
			
 
				+                    return result
			
 
				+                except Exception as e:
			
 
				+                    if attempt < max_retries - 1:
			
 
				+                        if show_progress:
			
 
				+                            print(f"[{index + 1}/{len(tasks)}] 重试 {attempt + 1}/{max_retries - 1}: {e}")
			
 
				+                        await asyncio.sleep(1)  # 重试前等待1秒
			
 
				+                    else:
			
 
				+                        if show_progress:
			
 
				+                            print(f"[{index + 1}/{len(tasks)}] 失败（已重试 {max_retries} 次）: {e}")
			
 
				+                        raise
			
 
				+
			
 
				+    # 并发处理所有任务
			
 
				+    results = await asyncio.gather(
			
 
				+        *[process_with_semaphore_and_retry(task, i) for i, task in enumerate(tasks)],
			
 
				+        return_exceptions=True  # 返回异常而不是抛出
			
 
				+    )
			
 
				+
			
 
				+    return results
			
--- a/lib/client.py
+++ b/lib/client.py
@@ -0,0 +1,17 @@
 
				+
			
 
				+from agents import Agent, Runner, OpenAIChatCompletionsModel
			
 
				+from openai import AsyncOpenAI
			
 
				+import os
			
 
# Base URL of the OpenAI-compatible endpoint; EXAMPLE_BASE_URL overrides the
# OpenRouter default when it is set to a non-empty value.
BASE_URL = os.getenv("EXAMPLE_BASE_URL") or "https://openrouter.ai/api/v1"
# API key read from the environment; None when OPENROUTER_API_KEY is unset.
API_KEY = os.getenv("OPENROUTER_API_KEY")
# Model used by get_model() when the caller does not name one.
MODEL_NAME = "google/gemini-2.5-flash"
# Shared async client, created once at import time and reused by get_model().
client = AsyncOpenAI(
    base_url=BASE_URL,
    api_key=API_KEY,
    max_retries=5,
)
			
 
				+def get_model(model_name=MODEL_NAME):
			
 
				+    return OpenAIChatCompletionsModel(
			
 
				+        openai_client=client,
			
 
				+        model=model_name,
			
 
				+    )
			
--- a/lib/data_loader.py
+++ b/lib/data_loader.py
@@ -0,0 +1,154 @@
 
				+"""
			
 
				+通用数据加载模块
			
 
				+
			
 
				+提供项目中常用的数据加载函数
			
 
				+"""
			
 
				+import os
			
 
				+import sys
			
 
				+from typing import List
			
 
				+from lib.utils import read_json
			
 
				+
			
 
				+
			
 
				+def load_persona_data(persona_dir: str) -> dict:
			
 
				+    """加载人设数据
			
 
				+
			
 
				+    Args:
			
 
				+        persona_dir: 人设目录路径
			
 
				+
			
 
				+    Returns:
			
 
				+        人设数据字典
			
 
				+
			
 
				+    Raises:
			
 
				+        SystemExit: 文件不存在时退出
			
 
				+    """
			
 
				+    persona_data_path = os.path.join(persona_dir, "人设.json")
			
 
				+    try:
			
 
				+        return read_json(persona_data_path)
			
 
				+    except FileNotFoundError:
			
 
				+        print(f"❌ 找不到人设数据文件: {persona_data_path}")
			
 
				+        print(f"请检查路径是否正确: {persona_dir}")
			
 
				+        sys.exit(1)
			
 
				+
			
 
				+
			
 
				+def load_inspiration_list(persona_dir: str) -> List[str]:
			
 
				+    """加载灵感点列表
			
 
				+
			
 
				+    Args:
			
 
				+        persona_dir: 人设目录路径
			
 
				+
			
 
				+    Returns:
			
 
				+        灵感点文本列表
			
 
				+
			
 
				+    Raises:
			
 
				+        SystemExit: 文件不存在或格式错误时退出
			
 
				+    """
			
 
				+    inspiration_list_path = os.path.join(persona_dir, "灵感点.json")
			
 
				+    try:
			
 
				+        inspiration_data = read_json(inspiration_list_path)
			
 
				+        if not isinstance(inspiration_data, list) or len(inspiration_data) == 0:
			
 
				+            print(f"❌ 灵感文件格式错误或为空: {inspiration_list_path}")
			
 
				+            sys.exit(1)
			
 
				+        return [item["灵感点"] for item in inspiration_data]
			
 
				+    except FileNotFoundError:
			
 
				+        print(f"❌ 找不到灵感文件: {inspiration_list_path}")
			
 
				+        print("请先运行 extract_inspirations.py 生成灵感点文件")
			
 
				+        sys.exit(1)
			
 
				+
			
 
				+
			
 
				+def load_inspiration_data(persona_dir: str) -> List[dict]:
			
 
				+    """加载完整的灵感点数据（包含 meta 信息）
			
 
				+
			
 
				+    Args:
			
 
				+        persona_dir: 人设目录路径
			
 
				+
			
 
				+    Returns:
			
 
				+        灵感点数据列表，每项包含 {"灵感点": str, "meta": dict}
			
 
				+
			
 
				+    Raises:
			
 
				+        SystemExit: 文件不存在或格式错误时退出
			
 
				+    """
			
 
				+    inspiration_list_path = os.path.join(persona_dir, "灵感点.json")
			
 
				+    try:
			
 
				+        inspiration_data = read_json(inspiration_list_path)
			
 
				+        if not isinstance(inspiration_data, list) or len(inspiration_data) == 0:
			
 
				+            print(f"❌ 灵感文件格式错误或为空: {inspiration_list_path}")
			
 
				+            sys.exit(1)
			
 
				+        return inspiration_data
			
 
				+    except FileNotFoundError:
			
 
				+        print(f"❌ 找不到灵感文件: {inspiration_list_path}")
			
 
				+        print("请先运行 extract_inspirations.py 生成灵感点文件")
			
 
				+        sys.exit(1)
			
 
				+
			
 
				+
			
 
				+def select_inspiration(inspiration_arg: str, inspiration_list: List[str]) -> str:
			
 
				+    """根据参数选择灵感
			
 
				+
			
 
				+    Args:
			
 
				+        inspiration_arg: 灵感参数（数字索引或灵感名称）
			
 
				+        inspiration_list: 灵感点文本列表
			
 
				+
			
 
				+    Returns:
			
 
				+        选中的灵感点文本
			
 
				+
			
 
				+    Raises:
			
 
				+        SystemExit: 选择失败时退出
			
 
				+    """
			
 
				+    try:
			
 
				+        # 尝试作为索引解析
			
 
				+        inspiration_index = int(inspiration_arg)
			
 
				+        if 0 <= inspiration_index < len(inspiration_list):
			
 
				+            inspiration = inspiration_list[inspiration_index]
			
 
				+            print(f"使用灵感[{inspiration_index}]: {inspiration}")
			
 
				+            return inspiration
			
 
				+        else:
			
 
				+            print(f"❌ 灵感索引超出范围: {inspiration_index} (有效范围: 0-{len(inspiration_list)-1})")
			
 
				+    except ValueError:
			
 
				+        # 不是数字，当作灵感名称
			
 
				+        if inspiration_arg in inspiration_list:
			
 
				+            print(f"使用灵感: {inspiration_arg}")
			
 
				+            return inspiration_arg
			
 
				+        else:
			
 
				+            print(f"❌ 找不到灵感: {inspiration_arg}")
			
 
				+
			
 
				+    # 显示可用灵感列表后退出
			
 
				+    print(f"可用灵感列表:")
			
 
				+    for i, insp in enumerate(inspiration_list[:10]):
			
 
				+        print(f"  {i}: {insp}")
			
 
				+    if len(inspiration_list) > 10:
			
 
				+        print(f"  ... 还有 {len(inspiration_list) - 10} 个")
			
 
				+    sys.exit(1)
			
 
				+
			
 
				+
			
 
				+def load_step1_result(persona_dir: str, inspiration: str, model_name: str, scope: str = "all") -> dict:
			
 
				+    """加载 step1 匹配结果
			
 
				+
			
 
				+    Args:
			
 
				+        persona_dir: 人设目录路径
			
 
				+        inspiration: 灵感点名称
			
 
				+        model_name: 模型名称（如 "google/gemini-2.5-pro"）
			
 
				+        scope: 范围标识（"all" 或 "top10" 等）
			
 
				+
			
 
				+    Returns:
			
 
				+        step1 结果字典
			
 
				+
			
 
				+    Raises:
			
 
				+        SystemExit: 文件不存在时退出
			
 
				+    """
			
 
				+    # 提取模型简称
			
 
				+    model_name_short = model_name.replace("google/", "").replace("/", "_")
			
 
				+
			
 
				+    # 构建文件路径
			
 
				+    step1_file = os.path.join(
			
 
				+        persona_dir,
			
 
				+        "how",
			
 
				+        "灵感点",
			
 
				+        inspiration,
			
 
				+        f"{scope}_step1_灵感人设匹配_{model_name_short}.json"
			
 
				+    )
			
 
				+
			
 
				+    try:
			
 
				+        return read_json(step1_file)
			
 
				+    except FileNotFoundError:
			
 
				+        print(f"❌ 找不到 step1 结果文件: {step1_file}")
			
 
				+        print(f"请先运行 step1_inspiration_match.py 生成结果")
			
 
				+        sys.exit(1)
			
--- a/lib/match_analyzer.py
+++ b/lib/match_analyzer.py
@@ -0,0 +1,348 @@
 
				+"""
			
 
				+通用的信息匹配分析模块
			
 
				+
			
 
				+分析 <B> 在 <A> 中的字面语义匹配关系
			
 
				+适用于任何信息匹配场景
			
 
				+
			
 
				+提供两个接口：
			
 
				+1. match_single(b_content, a_content, model_name, b_context="", a_context="") - 单个匹配
			
 
				+2. match_batch(b_items, a_content, model_name, b_context="", a_context="") - 批量匹配
			
 
				+
			
 
				+支持可选的 Context 参数：
			
 
				+- b_context: B 的补充上下文（帮助理解 B）
			
 
				+- a_context: A 的补充上下文（帮助理解 A）
			
 
				+- Context 默认为空，不提供时不会出现在 prompt 中
			
 
				+"""
			
 
				+import json
			
 
				+from typing import List
			
 
				+from agents import Agent, Runner
			
 
				+from agents.tracing.create import custom_span
			
 
				+from lib.client import get_model
			
 
				+
			
 
				+
			
 
				+# ========== System Prompt ==========
			
 
				+MATCH_SYSTEM_PROMPT = """
			
 
				+# 任务
			
 
				+分析 <B> 在 <A> 中的字面语义匹配关系。
			
 
				+
			
 
				+## 输入说明
			
 
				+
			
 
				+- **<B></B>**: 待匹配的内容（必选）
			
 
				+- **<A></A>**: 上下文内容（必选）
			
 
				+- **<B_Context></B_Context>**: B 的补充上下文（可选，帮助理解 B）
			
 
				+- **<A_Context></A_Context>**: A 的补充上下文（可选，帮助理解 A）
			
 
				+
			
 
				+**重要**：匹配分析发生在 <B> 和 <A> 之间，Context 仅作为补充理解的辅助信息。
			
 
				+
			
 
				+## 分析方法
			
 
				+
			
 
				+### 核心原则：字面语义匹配
			
 
				+只关注 <B> 和 <A> 在**字面词语和概念**上的重叠度，不考虑抽象关系。
			
 
				+
			
 
				+### 分析步骤
			
 
				+
			
 
				+1. **提取关键词/概念**
			
 
				+   - 从 <B> 中提取：关键词语和核心概念
			
 
				+   - 从 <A> 中提取：关键词语和核心概念
			
 
				+
			
 
				+2. **识别相同部分**
			
 
				+   - 完全相同的词语（字面一致）
			
 
				+   - 同义词或近义词
			
 
				+
			
 
				+3. **识别增量部分**
			
 
				+   - <B> 中有，但 <A> 中没有的词语/概念
			
 
				+   - 这些是 <B> 相对于 <A> 的额外信息
			
 
				+
			
 
				+4. **计算匹配分数**
			
 
				+   - 基于相同部分的覆盖度
			
 
				+   - 考虑词语/概念的重要性
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 评分标准（0-1分）
			
 
				+
			
 
				+**字面匹配度评分：**
			
 
				+- **0.9-1.0**：<B> 和 <A> 几乎完全一致，词语高度重叠
			
 
				+- **0.7-0.8**：大部分核心词语/概念匹配，少量增量
			
 
				+- **0.5-0.6**：部分核心词语/概念匹配，有一定增量
			
 
				+- **0.3-0.4**：少量词语/概念匹配，大部分不同
			
 
				+- **0.1-0.2**：几乎无字面匹配，仅有概念联系
			
 
				+- **0.0**：完全无关
			
 
				+
			
 
				+**重要原则：**
			
 
				+- 如果 <A> 是抽象/元级别的描述，而 <B> 是具体内容，字面上无词语重叠，应给低分（0.1-0.3）
			
 
				+- 优先考虑具体词语的匹配，而非抽象概念的包含关系
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 输出格式（严格JSON）
			
 
				+```json
			
 
				+{
			
 
				+  "score": 0.75,
			
 
				+  "score说明": "简要说明分数是如何计算的，基于哪些词语/概念的匹配",
			
 
				+  "相同部分": {
			
 
				+    "B中的词1": "与A中的'某词'完全相同",
			
 
				+    "B中的词2": "与A中的'某词'同义"
			
 
				+  },
			
 
				+  "增量部分": {
			
 
				+    "B中的词3": "A中无此概念"
			
 
				+  }
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+**输出要求**：
			
 
				+1. 必须严格按照上述JSON格式输出（score 和 score说明在最前面）
			
 
				+2. 所有字段都必须填写
			
 
				+3. **score字段**：必须是0-1之间的浮点数，保留2位小数
			
 
				+4. **score说明**：必须简洁说明评分依据（基于相同部分的覆盖度）
			
 
				+5. **相同部分**：字典格式，key是<B>中的词语，value说明它与<A>中哪个词的关系（完全相同/同义）；如果没有则填写空字典 {}
			
 
				+6. **增量部分**：字典格式，key是<B>中的词语，value说明为什么是增量（如"A中无此概念"）；如果没有增量部分，填写空字典 {}
			
 
				+7. **关键约束**：相同部分和增量部分的key必须只能是<B>中的词语，不能是<A>中的词语
			
 
				+""".strip()
			
 
				+
			
 
				+
			
 
				+def create_match_agent(model_name: str) -> Agent:
			
 
				+    """创建信息匹配分析的 Agent
			
 
				+
			
 
				+    Args:
			
 
				+        model_name: 模型名称
			
 
				+
			
 
				+    Returns:
			
 
				+        Agent 实例
			
 
				+    """
			
 
				+    agent = Agent(
			
 
				+        name="Information Match Expert",
			
 
				+        instructions=MATCH_SYSTEM_PROMPT,
			
 
				+        model=get_model(model_name),
			
 
				+        tools=[],
			
 
				+    )
			
 
				+
			
 
				+    return agent
			
 
				+
			
 
				+
			
 
				+def parse_match_response(response_content: str) -> dict:
			
 
				+    """解析匹配响应
			
 
				+
			
 
				+    Args:
			
 
				+        response_content: Agent 返回的响应内容
			
 
				+
			
 
				+    Returns:
			
 
				+        解析后的字典
			
 
				+    """
			
 
				+    try:
			
 
				+        # 如果响应包含在 markdown 代码块中，提取 JSON 部分
			
 
				+        if "```json" in response_content:
			
 
				+            json_start = response_content.index("```json") + 7
			
 
				+            json_end = response_content.index("```", json_start)
			
 
				+            json_text = response_content[json_start:json_end].strip()
			
 
				+        elif "```" in response_content:
			
 
				+            json_start = response_content.index("```") + 3
			
 
				+            json_end = response_content.index("```", json_start)
			
 
				+            json_text = response_content[json_start:json_end].strip()
			
 
				+        else:
			
 
				+            json_text = response_content.strip()
			
 
				+
			
 
				+        return json.loads(json_text)
			
 
				+    except Exception as e:
			
 
				+        print(f"解析响应失败: {e}")
			
 
				+        return {
			
 
				+            "相同部分": {},
			
 
				+            "增量部分": {},
			
 
				+            "score": 0.0,
			
 
				+            "score说明": f"解析失败: {str(e)}"
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def _create_batch_agent(model_name: str) -> Agent:
			
 
				+    """创建批量匹配的 Agent
			
 
				+
			
 
				+    Args:
			
 
				+        model_name: 模型名称
			
 
				+
			
 
				+    Returns:
			
 
				+        Agent 实例
			
 
				+    """
			
 
				+    # 批量匹配的 System Prompt（在单个匹配基础上修改输出格式）
			
 
				+    batch_prompt = MATCH_SYSTEM_PROMPT.replace(
			
 
				+        "## 输出格式（严格JSON）",
			
 
				+        "## 输出格式（JSON数组）\n对每个 <B> 输出一个匹配结果："
			
 
				+    ).replace(
			
 
				+        "```json\n{",
			
 
				+        "```json\n[{"
			
 
				+    ).replace(
			
 
				+        "}\n```",
			
 
				+        "}]\n```"
			
 
				+    ) + "\n\n**额外要求**：数组长度必须等于 <B> 的数量，顺序对应"
			
 
				+
			
 
				+    agent = Agent(
			
 
				+        name="Batch Information Match Expert",
			
 
				+        instructions=batch_prompt,
			
 
				+        model=get_model(model_name),
			
 
				+        tools=[],
			
 
				+    )
			
 
				+
			
 
				+    return agent
			
 
				+
			
 
				+
			
 
				+async def _run_match_agent(
			
 
				+    agent: Agent,
			
 
				+    b_content: str,
			
 
				+    a_content: str,
			
 
				+    request_desc: str,
			
 
				+    b_context: str = "",
			
 
				+    a_context: str = ""
			
 
				+) -> str:
			
 
				+    """运行匹配 Agent 的公共逻辑
			
 
				+
			
 
				+    Args:
			
 
				+        agent: Agent 实例
			
 
				+        b_content: B 的内容
			
 
				+        a_content: A 的内容
			
 
				+        request_desc: 请求描述（如"并输出 JSON 格式"或"并输出 JSON 数组格式"）
			
 
				+        b_context: B 的上下文（可选）
			
 
				+        a_context: A 的上下文（可选）
			
 
				+
			
 
				+    Returns:
			
 
				+        Agent 的原始输出
			
 
				+    """
			
 
				+    # 构建任务描述
			
 
				+    b_section = f"<B>\n{b_content}\n</B>"
			
 
				+    if b_context:
			
 
				+        b_section += f"\n\n<B_Context>\n{b_context}\n</B_Context>"
			
 
				+
			
 
				+    a_section = f"<A>\n{a_content}\n</A>"
			
 
				+    if a_context:
			
 
				+        a_section += f"\n\n<A_Context>\n{a_context}\n</A_Context>"
			
 
				+
			
 
				+    task_description = f"""## 本次分析任务
			
 
				+
			
 
				+{b_section}
			
 
				+
			
 
				+{a_section}
			
 
				+
			
 
				+请严格按照系统提示中的要求分析 <B> 在 <A> 中的字面语义匹配关系，{request_desc}的结果。"""
			
 
				+
			
 
				+    # 构造消息
			
 
				+    messages = [{
			
 
				+        "role": "user",
			
 
				+        "content": [
			
 
				+            {
			
 
				+                "type": "input_text",
			
 
				+                "text": task_description
			
 
				+            }
			
 
				+        ]
			
 
				+    }]
			
 
				+
			
 
				+    # 使用 custom_span 追踪匹配过程
			
 
				+    # 截断显示内容，避免 span name 过长
			
 
				+    b_short = (b_content[:40] + "...") if len(b_content) > 40 else b_content
			
 
				+    a_short = (a_content[:40] + "...") if len(a_content) > 40 else a_content
			
 
				+
			
 
				+    with custom_span(
			
 
				+        name=f"匹配分析: {b_short} in {a_short}",
			
 
				+        data={
			
 
				+            "B": b_content,
			
 
				+            "A": a_content,
			
 
				+            "B_Context": b_context if b_context else None,
			
 
				+            "A_Context": a_context if a_context else None,
			
 
				+            "模式": request_desc
			
 
				+        }
			
 
				+    ):
			
 
				+        # 运行 Agent
			
 
				+        result = await Runner.run(agent, input=messages)
			
 
				+
			
 
				+    return result.final_output
			
 
				+
			
 
				+
			
 
				+async def match_single(
			
 
				+    b_content: str,
			
 
				+    a_content: str,
			
 
				+    model_name: str,
			
 
				+    b_context: str = "",
			
 
				+    a_context: str = ""
			
 
				+) -> dict:
			
 
				+    """单个匹配：分析一个 B 在 A 中的匹配
			
 
				+
			
 
				+    Args:
			
 
				+        b_content: B（待匹配）的内容
			
 
				+        a_content: A（上下文）的内容
			
 
				+        model_name: 使用的模型名称
			
 
				+        b_context: B 的补充上下文（可选，默认为空）
			
 
				+        a_context: A 的补充上下文（可选，默认为空）
			
 
				+
			
 
				+    Returns:
			
 
				+        匹配结果字典：{"相同部分": {}, "增量部分": {}, "score": 0.0, "score说明": ""}
			
 
				+    """
			
 
				+    try:
			
 
				+        # 创建 Agent
			
 
				+        agent = create_match_agent(model_name)
			
 
				+
			
 
				+        # 运行匹配
			
 
				+        output = await _run_match_agent(
			
 
				+            agent, b_content, a_content, "并输出 JSON 格式",
			
 
				+            b_context=b_context, a_context=a_context
			
 
				+        )
			
 
				+
			
 
				+        # 解析响应
			
 
				+        parsed_result = parse_match_response(output)
			
 
				+
			
 
				+        return parsed_result
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        return {
			
 
				+            "相同部分": {},
			
 
				+            "增量部分": {},
			
 
				+            "score": 0.0,
			
 
				+            "score说明": f"匹配过程出错: {str(e)}"
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+async def match_batch(
			
 
				+    b_items: List[str],
			
 
				+    a_content: str,
			
 
				+    model_name: str,
			
 
				+    b_context: str = "",
			
 
				+    a_context: str = ""
			
 
				+) -> List[dict]:
			
 
				+    """批量匹配：分析多个 B 在 A 中的匹配（一次调用）
			
 
				+
			
 
				+    Args:
			
 
				+        b_items: B列表（多个待匹配项）
			
 
				+        a_content: A（上下文）的内容
			
 
				+        model_name: 使用的模型名称
			
 
				+        b_context: B 的补充上下文（可选，默认为空）
			
 
				+        a_context: A 的补充上下文（可选，默认为空）
			
 
				+
			
 
				+    Returns:
			
 
				+        匹配结果列表：[{"相同部分": {}, "增量部分": {}, "score": 0.0, "score说明": ""}, ...]
			
 
				+    """
			
 
				+    try:
			
 
				+        # 创建批量匹配 Agent
			
 
				+        agent = _create_batch_agent(model_name)
			
 
				+
			
 
				+        # 构建 B 列表字符串
			
 
				+        b_list_str = "\n".join([f"- {item}" for item in b_items])
			
 
				+
			
 
				+        # 运行匹配
			
 
				+        output = await _run_match_agent(
			
 
				+            agent, b_list_str, a_content, "并输出 JSON 数组格式",
			
 
				+            b_context=b_context, a_context=a_context
			
 
				+        )
			
 
				+
			
 
				+        # 解析响应（期望是数组）
			
 
				+        parsed_result = parse_match_response(output)
			
 
				+
			
 
				+        # 如果返回的是数组，直接返回；如果是单个对象，包装成数组
			
 
				+        if isinstance(parsed_result, list):
			
 
				+            return parsed_result
			
 
				+        else:
			
 
				+            return [parsed_result]
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        # 返回错误信息（为每个 B 创建一个错误条目）
			
 
				+        return [{
			
 
				+            "相同部分": {},
			
 
				+            "增量部分": {},
			
 
				+            "score": 0.0,
			
 
				+            "score说明": f"匹配过程出错: {str(e)}"
			
 
				+        } for _ in b_items]
			
--- a/lib/my_trace.py
+++ b/lib/my_trace.py
@@ -0,0 +1,80 @@
 
				+from datetime import datetime
			
 
				+import logging
			
 
				+
			
 
				+def get_current_time():
			
 
				+    import uuid
			
 
				+    random_uuid = str(uuid.uuid4())
			
 
				+    return datetime.now().strftime("%Y%m%d-%H%M%S") + "_" + random_uuid[:2]
			
 
				+
			
 
				+def set_trace_logfire():
			
 
				+    from agents.tracing.setup import GLOBAL_TRACE_PROVIDER
			
 
				+    GLOBAL_TRACE_PROVIDER.shutdown()
			
 
				+    import logfire
			
 
				+    current_time = get_current_time()
			
 
				+    logfire.configure(service_name=f'{current_time}')
			
 
				+    logfire.instrument_openai_agents()
			
 
				+    import urllib.parse
			
 
				+    current_time_encoded = urllib.parse.quote(current_time)
			
 
				+    import logging
			
 
				+    LOG_LEVEL = "WARNING"
			
 
				+    # 设置日志
			
 
				+    logging.basicConfig(
			
 
				+        level=getattr(logging, LOG_LEVEL),
			
 
				+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
			
 
				+    )
			
 
				+    logger = logging.getLogger(__name__)
			
 
				+    log_url = f'https://logfire-us.pydantic.dev/semsevens/test?q=service_name+%3D+%27{current_time_encoded}%27&last=30d'
			
 
				+    logger.warning(f"任务日志链接: {log_url}")
			
 
				+    return current_time, log_url
			
 
				+
			
 
				+def set_trace():
			
 
				+    # 设置全局logging级别，覆盖所有子模块
			
 
				+    logging.basicConfig(level=logging.WARNING, force=True)
			
 
				+    # 确保根logger级别生效
			
 
				+    logging.getLogger().setLevel(logging.WARNING)
			
 
				+    return set_trace_smith()
			
 
				+    # return set_trace_logfire()
			
 
				+
			
 
				+
			
 
				+def set_trace_smith():
			
 
				+    from agents.tracing.setup import GLOBAL_TRACE_PROVIDER
			
 
				+    GLOBAL_TRACE_PROVIDER.shutdown()
			
 
				+    from agents import set_trace_processors
			
 
				+    from langsmith.wrappers import OpenAIAgentsTracingProcessor
			
 
				+    import logging
			
 
				+    current_time = get_current_time()
			
 
				+    set_trace_processors([OpenAIAgentsTracingProcessor(name=f'{current_time}')])
			
 
				+    import urllib.parse
			
 
				+    LOG_LEVEL = "WARNING"
			
 
				+    # 设置日志
			
 
				+    logging.basicConfig(
			
 
				+        level=getattr(logging, LOG_LEVEL),
			
 
				+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
			
 
				+    )
			
 
				+    logger = logging.getLogger(__name__)
			
 
				+    current_time_encoded = urllib.parse.quote(current_time)
			
 
				+    log_url = f'https://smith.langchain.com/o/3ebe0715-9709-4594-a0aa-40a77a4e10bd/projects/p/611fa0d6-5510-4f60-b693-87e2ccc2ea5f?timeModel=%7B%22duration%22%3A%227d%22%7D&searchModel=%7B%22filter%22%3A%22and%28eq%28is_root%2C+true%29%2C+eq%28name%2C+%5C%22{current_time_encoded}%5C%22%29%29%22%2C%22searchFilter%22%3A%22eq%28is_root%2C+true%29%22%7D'
			
 
				+    LOG_LEVEL = "WARNING"
			
 
				+    logger.warning(f"任务日志链接: {log_url}")
			
 
				+    return current_time, log_url
			
 
				+        
			
 
				+def set_debug():
			
 
				+    import logging
			
 
				+    # 设置全局日志级别为DEBUG，确保所有模块生效
			
 
				+    logging.basicConfig(
			
 
				+        level=logging.DEBUG,
			
 
				+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
			
 
				+    )
			
 
				+    # 确保根日志记录器也设置为DEBUG级别
			
 
				+    logging.getLogger().setLevel(logging.DEBUG)
			
 
				+
			
 
				+def set_info():
			
 
				+    import logging
			
 
				+    # 设置全局日志级别为INFO，确保所有模块生效
			
 
				+    logging.basicConfig(
			
 
				+        level=logging.INFO,
			
 
				+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
			
 
				+    )
			
 
				+    # 确保根日志记录器也设置为INFO级别
			
 
				+    logging.getLogger().setLevel(logging.INFO)
			
 
				+    
			
--- a/lib/structured_logger.py
+++ b/lib/structured_logger.py
@@ -0,0 +1,305 @@
 
				+"""
			
 
				+结构化日志记录器
			
 
				+提供步骤化、可追溯、易于可视化的日志记录功能
			
 
				+"""
			
 
				+import json
			
 
				+import os
			
 
				+from datetime import datetime
			
 
				+from typing import Any, Optional
			
 
				+from pathlib import Path
			
 
				+
			
 
				+
			
 
				+class StructuredLogger:
			
 
				+    """
			
 
				+    结构化日志记录器
			
 
				+
			
 
				+    特点：
			
 
				+    1. 每个步骤独立保存文件
			
 
				+    2. 记录完整的时间线
			
 
				+    3. 支持嵌套步骤（树形结构）
			
 
				+    4. 便于可视化和debug
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, log_dir: str, run_id: str):
			
 
				+        """
			
 
				+        初始化日志记录器
			
 
				+
			
 
				+        Args:
			
 
				+            log_dir: 日志根目录
			
 
				+            run_id: 本次运行的唯一标识
			
 
				+        """
			
 
				+        self.log_dir = Path(log_dir)
			
 
				+        self.run_id = run_id
			
 
				+
			
 
				+        # 创建目录结构
			
 
				+        self.steps_dir = self.log_dir / "steps"
			
 
				+        self.timeline_dir = self.log_dir / "timeline"
			
 
				+        self.artifacts_dir = self.log_dir / "artifacts"
			
 
				+
			
 
				+        for dir_path in [self.steps_dir, self.timeline_dir, self.artifacts_dir]:
			
 
				+            dir_path.mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+        # 时间线记录
			
 
				+        self.timeline = []
			
 
				+        self.step_counter = 0
			
 
				+        self.step_stack = []  # 用于嵌套步骤
			
 
				+
			
 
				+        # 初始化元数据
			
 
				+        self.metadata = {
			
 
				+            "run_id": run_id,
			
 
				+            "start_time": datetime.now().isoformat(),
			
 
				+            "status": "running",
			
 
				+            "steps_count": 0,
			
 
				+            "log_dir": str(self.log_dir),
			
 
				+        }
			
 
				+        self._save_metadata()
			
 
				+
			
 
				+    def start_step(
			
 
				+        self,
			
 
				+        step_name: str,
			
 
				+        step_type: str,
			
 
				+        description: str = "",
			
 
				+        input_data: Any = None
			
 
				+    ) -> int:
			
 
				+        """
			
 
				+        开始一个新步骤
			
 
				+
			
 
				+        Args:
			
 
				+            step_name: 步骤名称（如："extract_keywords", "explore_level_1"）
			
 
				+            step_type: 步骤类型（如："extraction", "exploration", "analysis", "evaluation"）
			
 
				+            description: 步骤描述
			
 
				+            input_data: 输入数据
			
 
				+
			
 
				+        Returns:
			
 
				+            step_id: 步骤ID
			
 
				+        """
			
 
				+        self.step_counter += 1
			
 
				+        step_id = self.step_counter
			
 
				+
			
 
				+        # 计算层级（基于栈深度）
			
 
				+        level = len(self.step_stack)
			
 
				+        parent_id = self.step_stack[-1] if self.step_stack else None
			
 
				+
			
 
				+        step_info = {
			
 
				+            "step_id": step_id,
			
 
				+            "step_name": step_name,
			
 
				+            "step_type": step_type,
			
 
				+            "description": description,
			
 
				+            "level": level,
			
 
				+            "parent_id": parent_id,
			
 
				+            "status": "running",
			
 
				+            "start_time": datetime.now().isoformat(),
			
 
				+            "end_time": None,
			
 
				+            "duration_seconds": None,
			
 
				+            "input": self._serialize(input_data),
			
 
				+            "output": None,
			
 
				+            "error": None,
			
 
				+        }
			
 
				+
			
 
				+        # 压入栈
			
 
				+        self.step_stack.append(step_id)
			
 
				+
			
 
				+        # 保存步骤开始信息
			
 
				+        self._save_step(step_id, step_info)
			
 
				+
			
 
				+        # 添加到时间线
			
 
				+        self.timeline.append({
			
 
				+            "timestamp": step_info["start_time"],
			
 
				+            "event": "step_start",
			
 
				+            "step_id": step_id,
			
 
				+            "step_name": step_name,
			
 
				+            "step_type": step_type,
			
 
				+        })
			
 
				+        self._save_timeline()
			
 
				+
			
 
				+        print(f"\n{'  ' * level}[Step {step_id}] {step_name} - {description}")
			
 
				+
			
 
				+        return step_id
			
 
				+
			
 
				+    def end_step(
			
 
				+        self,
			
 
				+        step_id: int,
			
 
				+        output_data: Any = None,
			
 
				+        status: str = "success",
			
 
				+        error: Optional[str] = None
			
 
				+    ):
			
 
				+        """
			
 
				+        结束一个步骤
			
 
				+
			
 
				+        Args:
			
 
				+            step_id: 步骤ID
			
 
				+            output_data: 输出数据
			
 
				+            status: 步骤状态（"success", "error", "skipped"）
			
 
				+            error: 错误信息（如果有）
			
 
				+        """
			
 
				+        # 从栈中弹出
			
 
				+        if self.step_stack and self.step_stack[-1] == step_id:
			
 
				+            self.step_stack.pop()
			
 
				+
			
 
				+        # 读取步骤信息
			
 
				+        step_info = self._load_step(step_id)
			
 
				+
			
 
				+        # 更新步骤信息
			
 
				+        end_time = datetime.now()
			
 
				+        start_time = datetime.fromisoformat(step_info["start_time"])
			
 
				+        duration = (end_time - start_time).total_seconds()
			
 
				+
			
 
				+        step_info.update({
			
 
				+            "status": status,
			
 
				+            "end_time": end_time.isoformat(),
			
 
				+            "duration_seconds": duration,
			
 
				+            "output": self._serialize(output_data),
			
 
				+            "error": error,
			
 
				+        })
			
 
				+
			
 
				+        # 保存步骤结束信息
			
 
				+        self._save_step(step_id, step_info)
			
 
				+
			
 
				+        # 添加到时间线
			
 
				+        self.timeline.append({
			
 
				+            "timestamp": step_info["end_time"],
			
 
				+            "event": "step_end",
			
 
				+            "step_id": step_id,
			
 
				+            "step_name": step_info["step_name"],
			
 
				+            "status": status,
			
 
				+            "duration_seconds": duration,
			
 
				+        })
			
 
				+        self._save_timeline()
			
 
				+
			
 
				+        level = len(self.step_stack)
			
 
				+        status_emoji = "✅" if status == "success" else "❌" if status == "error" else "⏭️"
			
 
				+        print(f"{'  ' * level}{status_emoji} [Step {step_id}] Completed in {duration:.2f}s")
			
 
				+
			
 
				+    def log_artifact(
			
 
				+        self,
			
 
				+        step_id: int,
			
 
				+        artifact_name: str,
			
 
				+        artifact_data: Any,
			
 
				+        artifact_type: str = "json"
			
 
				+    ) -> str:
			
 
				+        """
			
 
				+        保存步骤的关联产物（如：API响应、中间结果等）
			
 
				+
			
 
				+        Args:
			
 
				+            step_id: 步骤ID
			
 
				+            artifact_name: 产物名称
			
 
				+            artifact_data: 产物数据
			
 
				+            artifact_type: 产物类型（"json", "text", "image"等）
			
 
				+
			
 
				+        Returns:
			
 
				+            artifact_path: 产物文件路径
			
 
				+        """
			
 
				+        artifact_dir = self.artifacts_dir / f"step_{step_id:04d}"
			
 
				+        artifact_dir.mkdir(exist_ok=True)
			
 
				+
			
 
				+        if artifact_type == "json":
			
 
				+            artifact_path = artifact_dir / f"{artifact_name}.json"
			
 
				+            with open(artifact_path, "w", encoding="utf-8") as f:
			
 
				+                json.dump(artifact_data, f, ensure_ascii=False, indent=2)
			
 
				+        elif artifact_type == "text":
			
 
				+            artifact_path = artifact_dir / f"{artifact_name}.txt"
			
 
				+            with open(artifact_path, "w", encoding="utf-8") as f:
			
 
				+                f.write(str(artifact_data))
			
 
				+        else:
			
 
				+            artifact_path = artifact_dir / artifact_name
			
 
				+            with open(artifact_path, "wb") as f:
			
 
				+                f.write(artifact_data)
			
 
				+
			
 
				+        print(f"  📎 Artifact saved: {artifact_path.name}")
			
 
				+        return str(artifact_path)
			
 
				+
			
 
				+    def finalize(self, final_status: str = "success", final_output: Any = None):
			
 
				+        """
			
 
				+        完成整个运行，生成最终摘要
			
 
				+
			
 
				+        Args:
			
 
				+            final_status: 最终状态
			
 
				+            final_output: 最终输出
			
 
				+        """
			
 
				+        self.metadata.update({
			
 
				+            "end_time": datetime.now().isoformat(),
			
 
				+            "status": final_status,
			
 
				+            "steps_count": self.step_counter,
			
 
				+            "final_output": self._serialize(final_output),
			
 
				+        })
			
 
				+        self._save_metadata()
			
 
				+
			
 
				+        # 生成摘要
			
 
				+        self._generate_summary()
			
 
				+
			
 
				+        print(f"\n{'='*60}")
			
 
				+        print(f"Run completed: {final_status}")
			
 
				+        print(f"Total steps: {self.step_counter}")
			
 
				+        print(f"Log directory: {self.log_dir}")
			
 
				+        print(f"{'='*60}")
			
 
				+
			
 
				+    def _save_step(self, step_id: int, step_info: dict):
			
 
				+        """保存步骤信息"""
			
 
				+        step_file = self.steps_dir / f"step_{step_id:04d}.json"
			
 
				+        with open(step_file, "w", encoding="utf-8") as f:
			
 
				+            json.dump(step_info, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+    def _load_step(self, step_id: int) -> dict:
			
 
				+        """加载步骤信息"""
			
 
				+        step_file = self.steps_dir / f"step_{step_id:04d}.json"
			
 
				+        with open(step_file, "r", encoding="utf-8") as f:
			
 
				+            return json.load(f)
			
 
				+
			
 
				+    def _save_timeline(self):
			
 
				+        """保存时间线"""
			
 
				+        timeline_file = self.timeline_dir / "timeline.json"
			
 
				+        with open(timeline_file, "w", encoding="utf-8") as f:
			
 
				+            json.dump(self.timeline, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+    def _save_metadata(self):
			
 
				+        """保存元数据"""
			
 
				+        metadata_file = self.log_dir / "metadata.json"
			
 
				+        with open(metadata_file, "w", encoding="utf-8") as f:
			
 
				+            json.dump(self.metadata, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+    def _serialize(self, data: Any) -> Any:
			
 
				+        """序列化数据（处理Pydantic模型等）"""
			
 
				+        if data is None:
			
 
				+            return None
			
 
				+
			
 
				+        # 处理Pydantic模型
			
 
				+        if hasattr(data, "model_dump"):
			
 
				+            return data.model_dump()
			
 
				+
			
 
				+        # 处理字典
			
 
				+        if isinstance(data, dict):
			
 
				+            return {k: self._serialize(v) for k, v in data.items()}
			
 
				+
			
 
				+        # 处理列表
			
 
				+        if isinstance(data, list):
			
 
				+            return [self._serialize(item) for item in data]
			
 
				+
			
 
				+        # 其他类型直接返回
			
 
				+        return data
			
 
				+
			
 
				+    def _generate_summary(self):
			
 
				+        """生成运行摘要"""
			
 
				+        summary = {
			
 
				+            "run_id": self.run_id,
			
 
				+            "status": self.metadata["status"],
			
 
				+            "start_time": self.metadata["start_time"],
			
 
				+            "end_time": self.metadata["end_time"],
			
 
				+            "total_steps": self.step_counter,
			
 
				+            "steps_overview": [],
			
 
				+        }
			
 
				+
			
 
				+        # 汇总所有步骤
			
 
				+        for step_id in range(1, self.step_counter + 1):
			
 
				+            step_info = self._load_step(step_id)
			
 
				+            summary["steps_overview"].append({
			
 
				+                "step_id": step_id,
			
 
				+                "step_name": step_info["step_name"],
			
 
				+                "step_type": step_info["step_type"],
			
 
				+                "status": step_info["status"],
			
 
				+                "duration_seconds": step_info["duration_seconds"],
			
 
				+            })
			
 
				+
			
 
				+        # 保存摘要
			
 
				+        summary_file = self.log_dir / "summary.json"
			
 
				+        with open(summary_file, "w", encoding="utf-8") as f:
			
 
				+            json.dump(summary, f, ensure_ascii=False, indent=2)
			
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -0,0 +1,624 @@
 
				+from typing import List, Dict, Any
			
 
				+import json
			
 
				+from .my_trace import get_current_time
			
 
				+import re
			
 
				+import uuid
			
 
				+import datetime
			
 
				+
			
 
				+def parse_json_from_text(text: str) -> dict:
			
 
				+    """
			
 
				+    从文本中解析JSON，支持多种格式的JSON代码块
			
 
				+    
			
 
				+    Args:
			
 
				+        text (str): 包含JSON的文本
			
 
				+    
			
 
				+    Returns:
			
 
				+        dict: 解析后的JSON数据，解析失败返回空字典
			
 
				+    """
			
 
				+    if not text or not isinstance(text, str):
			
 
				+        return {}
			
 
				+    
			
 
				+    # 去除首尾空白字符
			
 
				+    text = text.strip()
			
 
				+    
			
 
				+    # 定义可能的JSON代码块标记
			
 
				+    json_markers = [
			
 
				+        ("'''json", "'''"),
			
 
				+        ('"""json', '"""'),
			
 
				+        ("```json", "```"),
			
 
				+        ("```", "```")
			
 
				+    ]
			
 
				+    
			
 
				+    # 尝试提取JSON代码块
			
 
				+    json_content = text
			
 
				+    for start_marker, end_marker in json_markers:
			
 
				+        if text.startswith(start_marker):
			
 
				+            # 找到开始标记，查找结束标记
			
 
				+            start_pos = len(start_marker)
			
 
				+            end_pos = text.find(end_marker, start_pos)
			
 
				+            if end_pos != -1:
			
 
				+                json_content = text[start_pos:end_pos].strip()
			
 
				+                break
			
 
				+    
			
 
				+    # 如果没有找到代码块标记，检查是否以结束标记结尾并移除
			
 
				+    if json_content == text:
			
 
				+        for _, end_marker in json_markers:
			
 
				+            if text.endswith(end_marker):
			
 
				+                json_content = text[:-len(end_marker)].strip()
			
 
				+                break
			
 
				+    
			
 
				+    # 尝试解析JSON
			
 
				+    try:
			
 
				+        return json.loads(json_content)
			
 
				+    except json.JSONDecodeError as e:
			
 
				+        print(f"JSON解析失败: {e}")
			
 
				+        # 如果直接解析失败，尝试查找第一个{到最后一个}的内容
			
 
				+        try:
			
 
				+            first_brace = json_content.find('{')
			
 
				+            last_brace = json_content.rfind('}')
			
 
				+            if first_brace != -1 and last_brace != -1 and first_brace < last_brace:
			
 
				+                json_part = json_content[first_brace:last_brace + 1]
			
 
				+                return json.loads(json_part)
			
 
				+        except json.JSONDecodeError:
			
 
				+            pass
			
 
				+        
			
 
				+        return {}
			
 
				+
			
 
				+
			
 
				+def get_safe_filename(filename: str) -> str:
			
 
				+    """
			
 
				+    生成安全的文件名，移除不安全字符
			
 
				+    
			
 
				+    Args:
			
 
				+        filename: 原始文件名
			
 
				+        
			
 
				+    Returns:
			
 
				+        str: 安全的文件名
			
 
				+    """
			
 
				+    # 移除不安全的字符，只保留字母、数字、下划线、连字符和点
			
 
				+    return re.sub(r'[^\w\-\./]', '_', filename)
			
 
				+
			
 
				+
			
 
				+def generate_image_filename(mime_type: str, prefix: str = "gemini_img") -> str:
			
 
				+    """
			
 
				+    生成合理的图片文件名
			
 
				+
			
 
				+    Args:
			
 
				+        mime_type: 文件MIME类型
			
 
				+        prefix: 文件名前缀
			
 
				+
			
 
				+    Returns:
			
 
				+        str: 生成的文件名
			
 
				+    """
			
 
				+    # 获取当前时间戳
			
 
				+    timestamp = datetime.datetime.now().strftime("%Y%m%d/%H%M%S")
			
 
				+
			
 
				+    # 获取文件扩展名
			
 
				+    extension = mime_type.split('/')[-1]
			
 
				+    if extension == "jpeg":
			
 
				+        extension = "jpg"
			
 
				+
			
 
				+    # 生成唯一ID (短UUID)
			
 
				+    unique_id = str(uuid.uuid4())[:4]
			
 
				+
			
 
				+    # 组合文件名
			
 
				+    filename = f"{prefix}/{timestamp}_{unique_id}.{extension}"
			
 
				+
			
 
				+    # 确保文件名安全
			
 
				+    return get_safe_filename(filename)
			
 
				+
			
 
				+def parse_multimodal_content(content: str) -> List[Dict[str, Any]]:
			
 
				+    """解析多模态内容，保持上下文顺序，适用于AI参数传递 """
			
 
				+    
			
 
				+    result = []
			
 
				+    lines = content.split('\n')
			
 
				+    role = ''
			
 
				+    
			
 
				+    for line in lines:
			
 
				+        line = line.strip()
			
 
				+        if not line:
			
 
				+            continue
			
 
				+            
			
 
				+        # 分割前缀和内容
			
 
				+        if ':' in line:
			
 
				+            prefix, content = line.split(':', 1)
			
 
				+            prefix = prefix.strip().lower()
			
 
				+            content = content.strip()
			
 
				+            row = {}
			
 
				+            if prefix == 'image':
			
 
				+                row = {
			
 
				+                    "type": "image_url",
			
 
				+                    "image_url": {
			
 
				+                        "url": content
			
 
				+                    }
			
 
				+                }
			
 
				+            elif prefix == 'text':
			
 
				+                row = {
			
 
				+                    "type": "text",
			
 
				+                    "text": content
			
 
				+                }
			
 
				+            elif prefix == 'role':
			
 
				+                role = content
			
 
				+            if row:
			
 
				+                if role:
			
 
				+                    row['role'] = role
			
 
				+                    role = ''
			
 
				+                result.append(row)
			
 
				+    
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def read_json(file_path):
			
 
				+    """
			
 
				+    读取JSON文件并返回解析后的数据
			
 
				+    
			
 
				+    Args:
			
 
				+        file_path: JSON文件路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        解析后的JSON数据
			
 
				+    """
			
 
				+    try:
			
 
				+        with open(file_path, 'r', encoding='utf-8') as f:
			
 
				+            return json.load(f)
			
 
				+    except Exception as e:
			
 
				+        print(f"读取JSON文件时出错: {e}")
			
 
				+        return None
			
 
				+
			
 
				+def save_json(data, file_path):
			
 
				+    """
			
 
				+    保存数据到JSON文件
			
 
				+    
			
 
				+    Args:
			
 
				+        data: 要保存的数据
			
 
				+        file_path: 保存路径
			
 
				+    """
			
 
				+    with open(file_path, 'w', encoding='utf-8') as f:
			
 
				+        json.dump(data, f, ensure_ascii=False, indent=2)
			
 
				+        
			
 
				+
			
 
				+def get_script_data(file_path):
			
 
				+    """
			
 
				+    读取JSON文件并返回解析后的数据
			
 
				+    
			
 
				+    Args:
			
 
				+        file_path: JSON文件路径
			
 
				+    """
			
 
				+    return read_json(file_path)['脚本']
			
 
				+
			
 
				+import os
			
 
				+import xml.etree.ElementTree as ET
			
 
				+from typing import Dict, List, Any
			
 
				+import re
			
 
				+import unicodedata
			
 
				+
			
 
				+
			
 
				+def get_model(model_name):
			
 
				+    # return 'gemini/gemini-2.5-flash'
			
 
				+    # return 'litellm/gemini/gemini-2.5-flash'
			
 
				+    if model_name.startswith('litellm'):
			
 
				+        return model_name
			
 
				+    else:
			
 
				+        from openai import AsyncOpenAI
			
 
				+        from agents import OpenAIChatCompletionsModel
			
 
				+        BASE_URL = os.getenv("EXAMPLE_BASE_URL") or "https://openrouter.ai/api/v1"
			
 
				+        API_KEY = os.getenv("OPENROUTER_API_KEY") or ""
			
 
				+        client = AsyncOpenAI(
			
 
				+            base_url=BASE_URL,
			
 
				+            api_key=API_KEY,
			
 
				+        )
			
 
				+        return OpenAIChatCompletionsModel(
			
 
				+            # model='google/gemini-2.5-pro-preview',
			
 
				+            # model='google/gemini-2.5-flash-preview-05-20',
			
 
				+            # model='google/gemini-2.5-flash-preview-05-20',
			
 
				+            # model='google/gemini-2.5-flash',
			
 
				+            # model='google/gemini-2.5-flash',
			
 
				+            # model='google/gemini-2.5-flash-preview-05-20:thinking',
			
 
				+            # model='google/gemini-2.0-flash-001',
			
 
				+            model=model_name,
			
 
				+            openai_client=client,
			
 
				+        )
			
 
				+
			
 
				+def read_file_as_string(file_path):
			
 
				+    """读取文件内容并返回字符串"""
			
 
				+    try:
			
 
				+        with open(file_path, 'r', encoding='utf-8') as file:
			
 
				+            content = file.read().strip()
			
 
				+        return content
			
 
				+    except Exception as e:
			
 
				+        print(f"读取文件时出错: {e}")
			
 
				+        return None
			
 
				+def save_file_as_string(file_path, content):
			
 
				+    """将字符串内容写入文件"""
			
 
				+    with open(file_path, 'w', encoding='utf-8') as f:
			
 
				+        f.write(content)
			
 
				+
			
 
				+def extract_html_from_markdown(text):
			
 
				+    """
			
 
				+    从可能包含markdown或其他代码块的文本中提取HTML内容
			
 
				+    
			
 
				+    参数:
			
 
				+        text: 可能包含各种格式的文本
			
 
				+        
			
 
				+    返回:
			
 
				+        提取出的纯HTML内容
			
 
				+    """
			
 
				+    # 处理```html```格式（反引号）
			
 
				+    backtick_pattern = r"```(?:html)?\s*([\s\S]*?)```"
			
 
				+    backtick_matches = re.findall(backtick_pattern, text)
			
 
				+    
			
 
				+    # 处理'''html'''格式（单引号）
			
 
				+    single_quote_pattern = r"'''(?:html)?\s*([\s\S]*?)'''"
			
 
				+    single_quote_matches = re.findall(single_quote_pattern, text)
			
 
				+    
			
 
				+    # 处理"""html"""格式（双引号）
			
 
				+    double_quote_pattern = r'"""(?:html)?\s*([\s\S]*?)"""'
			
 
				+    double_quote_matches = re.findall(double_quote_pattern, text)
			
 
				+    
			
 
				+    if backtick_matches:
			
 
				+        # 优先使用反引号格式
			
 
				+        return backtick_matches[0].strip()
			
 
				+    elif single_quote_matches:
			
 
				+        # 其次使用单引号格式
			
 
				+        return single_quote_matches[0].strip()
			
 
				+    elif double_quote_matches:
			
 
				+        # 再次使用双引号格式
			
 
				+        return double_quote_matches[0].strip()
			
 
				+    else:
			
 
				+        # 如果没有代码块格式，直接返回原get_current_time始文本
			
 
				+        return text
			
 
				+    
			
 
				+def create_workspace_dir(current_time=None, make_dir=True):
			
 
				+    if not current_time:
			
 
				+        current_time = get_current_time()
			
 
				+    task_dir = f"result/{current_time}"
			
 
				+    if make_dir:
			
 
				+        os.makedirs(task_dir, exist_ok=True)
			
 
				+    task_dir_absolute = os.path.abspath(task_dir)
			
 
				+    # print(f"任务目录的绝对路径: {task_dir_absolute}")
			
 
				+    return task_dir_absolute, str(current_time)
			
 
				+
			
 
				+
			
 
				+def extract_tag_content(text, tag_name):
			
 
				+    """
			
 
				+    从文本中提取指定标签内的内容
			
 
				+    
			
 
				+    参数:
			
 
				+        text (str): 要处理的文本
			
 
				+        tag_name (str): 要提取的标签名称
			
 
				+    
			
 
				+    返回:
			
 
				+        str: 标签内的内容，如果未找到则返回空字符串
			
 
				+    """
			
 
				+    import re
			
 
				+    pattern = f"<{tag_name}>(.*?)</{tag_name}>"
			
 
				+    match = re.search(pattern, text, re.DOTALL)
			
 
				+    if match:
			
 
				+        return match.group(1).strip()
			
 
				+    return ""
			
 
				+
			
 
				+from typing import Dict, List, Optional
			
 
				+def parse_tasks(tasks_xml: str) -> List[Dict]:
			
 
				+    """Parse XML tasks into a list of task dictionaries."""
			
 
				+    tasks = []
			
 
				+    current_task = {}
			
 
				+    
			
 
				+    for line in tasks_xml.split('\n'):
			
 
				+        line = line.strip()
			
 
				+        if not line:
			
 
				+            continue
			
 
				+            
			
 
				+        if line.startswith("<task>"):
			
 
				+            current_task = {}
			
 
				+        elif line.startswith("<name>"):
			
 
				+            current_task["name"] = line[6:-7].strip()
			
 
				+        elif line.startswith("<output>"):
			
 
				+            current_task["output"] = line[12:-13].strip()
			
 
				+        elif line.startswith("</task>"):
			
 
				+            if "description" in current_task:
			
 
				+                if "type" not in current_task:
			
 
				+                    current_task["type"] = "default"
			
 
				+                tasks.append(current_task)
			
 
				+    
			
 
				+    return tasks
			
 
				+    
			
 
				+    
			
 
				+def parse_xml_content(xml_string: str) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    将XML字符串解析成字典，提取main_task、thoughts、tasks和resources
			
 
				+    
			
 
				+    参数:
			
 
				+        xml_string: 包含任务信息的XML字符串
			
 
				+        
			
 
				+    返回:
			
 
				+        包含所有解析信息的字典
			
 
				+    """
			
 
				+    # 创建结果字典
			
 
				+    result = {
			
 
				+        "main_task": {},
			
 
				+        "thoughts": "",
			
 
				+        "tasks": [],
			
 
				+        "resources": []
			
 
				+    }
			
 
				+    
			
 
				+    try:
			
 
				+        # 提取thoughts内容
			
 
				+        thoughts_match = re.search(r'<thoughts>(.*?)</thoughts>', xml_string, re.DOTALL)
			
 
				+        if thoughts_match:
			
 
				+            result["thoughts"] = thoughts_match.group(1).strip()
			
 
				+        
			
 
				+        # 提取main_task内容
			
 
				+        main_task_match = re.search(r'<main_task>(.*?)</main_task>', xml_string, re.DOTALL)
			
 
				+        if main_task_match:
			
 
				+            main_task_content = main_task_match.group(1)
			
 
				+            main_task = {}
			
 
				+            
			
 
				+            # 获取主任务名称
			
 
				+            name_match = re.search(r'<name>(.*?)</name>', main_task_content, re.DOTALL)
			
 
				+            if name_match:
			
 
				+                main_task['name'] = name_match.group(1).strip()
			
 
				+            
			
 
				+            # 获取主任务输出
			
 
				+            output_match = re.search(r'<output>(.*?)</output>', main_task_content, re.DOTALL)
			
 
				+            if output_match:
			
 
				+                main_task['output'] = output_match.group(1).strip()
			
 
				+            
			
 
				+            # 获取主任务描述
			
 
				+            description_match = re.search(r'<description>(.*?)</description>', main_task_content, re.DOTALL)
			
 
				+            if description_match:
			
 
				+                main_task['description'] = description_match.group(1).strip()
			
 
				+            
			
 
				+            result["main_task"] = main_task
			
 
				+        
			
 
				+        # 提取<tasks>...</tasks>部分
			
 
				+        tasks_pattern = re.compile(r'<tasks>(.*?)</tasks>', re.DOTALL)
			
 
				+        tasks_match = tasks_pattern.search(xml_string)
			
 
				+        
			
 
				+        if tasks_match:
			
 
				+            tasks_content = tasks_match.group(1)
			
 
				+            
			
 
				+            # 提取每个task块
			
 
				+            task_pattern = re.compile(r'<task>(.*?)</task>', re.DOTALL)
			
 
				+            task_matches = task_pattern.finditer(tasks_content)
			
 
				+            
			
 
				+            for task_match in task_matches:
			
 
				+                task_content = task_match.group(1)
			
 
				+                task_dict = {}
			
 
				+                
			
 
				+                # 获取任务名称
			
 
				+                name_match = re.search(r'<name>(.*?)</name>', task_content, re.DOTALL)
			
 
				+                if not name_match:
			
 
				+                    continue  # 跳过没有名称的任务
			
 
				+                
			
 
				+                name = name_match.group(1).strip()
			
 
				+                task_dict['name'] = name
			
 
				+                # 获取输出信息
			
 
				+                output_match = re.search(r'<output>(.*?)</output>', task_content, re.DOTALL)
			
 
				+                task_dict['output'] = output_match.group(1).strip() if output_match else ""
			
 
				+                
			
 
				+                # 获取描述信息
			
 
				+                description_match = re.search(r'<description>(.*?)</description>', task_content, re.DOTALL)
			
 
				+                task_dict['description'] = description_match.group(1).strip() if description_match else ""
			
 
				+                
			
 
				+                # 获取依赖任务列表
			
 
				+                depend_tasks = []
			
 
				+                depend_tasks_section = re.search(r'<depend_tasks>(.*?)</depend_tasks>', task_content, re.DOTALL)
			
 
				+                if depend_tasks_section:
			
 
				+                    depend_task_matches = re.finditer(r'<depend_task>(.*?)</depend_task>', 
			
 
				+                                                   depend_tasks_section.group(1), re.DOTALL)
			
 
				+                    for dt_match in depend_task_matches:
			
 
				+                        if dt_match.group(1).strip():
			
 
				+                            depend_tasks.append(dt_match.group(1).strip())
			
 
				+                
			
 
				+                task_dict['depend_tasks'] = depend_tasks
			
 
				+                
			
 
				+                # 获取依赖资源列表
			
 
				+                depend_resources = []
			
 
				+                resources_match = re.search(r'<depend_resources>(.*?)</depend_resources>', task_content, re.DOTALL)
			
 
				+                if resources_match and resources_match.group(1).strip():
			
 
				+                    resources_text = resources_match.group(1).strip()
			
 
				+                    depend_resources = [res.strip() for res in resources_text.split(',') if res.strip()]
			
 
				+                
			
 
				+                task_dict['depend_resources'] = depend_resources
			
 
				+                
			
 
				+                # 将任务添加到结果字典
			
 
				+                result["tasks"].append(task_dict)
			
 
				+        
			
 
				+        # 提取resources内容
			
 
				+        resources_pattern = re.compile(r'<resources>(.*?)</resources>', re.DOTALL)
			
 
				+        resources_match = resources_pattern.search(xml_string)
			
 
				+        
			
 
				+        if resources_match:
			
 
				+            resources_content = resources_match.group(1).strip()
			
 
				+            result["resources"] = resources_content
			
 
				+        return result
			
 
				+    
			
 
				+    except Exception as e:
			
 
				+        raise ValueError(f"处理XML数据时发生错误: {e}")
			
 
				+
			
 
				+
			
 
				+def parse_planner_result(result):
			
 
				+    """
			
 
				+    解析规划结果，并为每个任务添加任务目录名
			
 
				+    
			
 
				+    参数:
			
 
				+        result: 包含thoughts、main_task、tasks和resources的规划结果字符串
			
 
				+        
			
 
				+    返回:
			
 
				+        解析后的完整规划信息字典
			
 
				+    """
			
 
				+    # 使用parse_xml_content解析完整内容
			
 
				+    parsed_result = parse_xml_content(result)
			
 
				+    task_name_to_index = {}
			
 
				+    task_dict = {
			
 
				+        'tasks': {},
			
 
				+        'max_index': 1,
			
 
				+    }
			
 
				+    
			
 
				+    # 为每个任务添加task_dir字段
			
 
				+    for i, task_info in enumerate(parsed_result["tasks"]):
			
 
				+        # 使用sanitize_filename生成目录名
			
 
				+        task_name = task_info.get("name", "task")
			
 
				+        depend_tasks_dir = []
			
 
				+        task_info['task_dir'] = get_task_dir(task_name, task_dict)
			
 
				+        for depend_task in task_info.get("depend_tasks", []):
			
 
				+            depend_tasks_dir.append(get_task_dir(depend_task, task_dict))
			
 
				+        task_info['depend_tasks_dir'] = depend_tasks_dir
			
 
				+        task_info['status'] = 'todo' # 任务状态，todo: 未开始，doing: 进行中，success: 已完成，fail: 失败
			
 
				+        task_name_to_index[task_name] = i
			
 
				+    
			
 
				+    # 为主任务也添加task_dir字段
			
 
				+    if parsed_result["main_task"]:
			
 
				+        main_task_name = parsed_result["main_task"].get("name", "main_task")
			
 
				+        parsed_result["main_task"]["task_dir"] = sanitize_filename(main_task_name)
			
 
				+    
			
 
				+    return parsed_result, task_name_to_index
			
 
				+def get_task_dir(task_name, task_dict, append_index=True):
			
 
				+    max_index = task_dict.get('max_index', 1)
			
 
				+    if task_name in task_dict['tasks']:
			
 
				+        return task_dict['tasks'][task_name]
			
 
				+    max_index_str = f"{max_index:02d}"
			
 
				+    task_dir_raw = sanitize_filename(task_name)
			
 
				+    if append_index:
			
 
				+        task_dir = f"{max_index_str}_{task_dir_raw}"
			
 
				+    else:
			
 
				+        task_dir = task_dir_raw
			
 
				+    task_dict['tasks'][task_name] = task_dir
			
 
				+    task_dict['max_index'] = max_index + 1
			
 
				+    return task_dir
			
 
				+    
			
 
				+def sanitize_filename(task_name: str, max_length: int = 20) -> str:
			
 
				+    """
			
 
				+    将任务名称转换为适合作为文件夹名称的字符串
			
 
				+    
			
 
				+    参数:
			
 
				+        task_name: 需要转换的任务名称
			
 
				+        max_length: 文件名最大长度限制，默认80个字符
			
 
				+        
			
 
				+    返回:
			
 
				+        处理后适合作为文件名/文件夹名的字符串
			
 
				+    """
			
 
				+    # 替换Windows和Unix系统中不允许的文件名字符
			
 
				+    # 替换 / \ : * ? " < > | 等字符为下划线
			
 
				+    sanitized = re.sub(r'[\\/*?:"<>|]', '_', task_name)
			
 
				+    
			
 
				+    # 替换连续的空白字符为单个下划线
			
 
				+    sanitized = re.sub(r'\s+', '_', sanitized)
			
 
				+    
			
 
				+    # 移除开头和结尾的点和空格
			
 
				+    sanitized = sanitized.strip('. ')
			
 
				+    
			
 
				+    # 如果名称过长，截断它
			
 
				+    if len(sanitized) > max_length:
			
 
				+        # 保留前面的部分和后面的部分，中间用...连接
			
 
				+        half_length = (max_length - 3) // 2
			
 
				+        sanitized = sanitized[:half_length] + '...' + sanitized[-half_length:]
			
 
				+    
			
 
				+    # 确保名称不为空
			
 
				+    if not sanitized:
			
 
				+        sanitized = "unnamed_task"
			
 
				+    
			
 
				+    return sanitized
			
 
				+
			
 
				+def write_json(data, file_path: str) -> None:
			
 
				+    """
			
 
				+    将数据写入JSON文件
			
 
				+    
			
 
				+    参数:
			
 
				+        data: 要写入的数据对象
			
 
				+        file_path: 目标文件路径
			
 
				+        
			
 
				+    返回:
			
 
				+        无
			
 
				+    """
			
 
				+    import json
			
 
				+    with open(file_path, 'w', encoding='utf-8') as f:
			
 
				+        json.dump(data, f, ensure_ascii=False, indent=2)
			
 
				+def write_string_to_file(content: str, file_path: str) -> None:
			
 
				+    """
			
 
				+    将字符串内容写入文件
			
 
				+    
			
 
				+    参数:
			
 
				+        content: 要写入的字符串内容
			
 
				+        file_path: 目标文件路径
			
 
				+        
			
 
				+    返回:
			
 
				+        无
			
 
				+    """
			
 
				+    with open(file_path, 'w', encoding='utf-8') as f:
			
 
				+        f.write(content)
			
 
				+
			
 
				+def pretty_process(result):
			
 
				+    def format_output(in_str):
			
 
				+        return in_str.replace('\n\n', '\n').replace('\\"', '"')
			
 
				+    process_list = []
			
 
				+    i = 0
			
 
				+    call_dict = {}
			
 
				+    
			
 
				+    # 首先收集所有工具调用输出
			
 
				+    for row in result:
			
 
				+        if isinstance(row, list):
			
 
				+            # 处理列表：递归处理列表中的每个项目
			
 
				+            for item in row:
			
 
				+                if isinstance(item, dict) and item.get('type', '') == 'function_call_output':
			
 
				+                    call_id = item['call_id']
			
 
				+                    call_dict[call_id] = item['output']
			
 
				+        elif isinstance(row, dict) and row.get('type', '') == 'function_call_output':
			
 
				+            call_id = row['call_id']
			
 
				+            call_dict[call_id] = row['output']
			
 
				+    
			
 
				+    # 然后处理每一行
			
 
				+    for row in result:
			
 
				+        if isinstance(row, list):
			
 
				+            # 递归处理列表中的每个项目
			
 
				+            for item in row:
			
 
				+                if isinstance(item, dict):
			
 
				+                    process_row(item, process_list, call_dict, i)
			
 
				+                    i += 1
			
 
				+        else:
			
 
				+            # 直接处理字典项
			
 
				+            process_row(row, process_list, call_dict, i)
			
 
				+            i += 1
			
 
				+    
			
 
				+    process_str = '\n'.join(process_list)
			
 
				+    return process_str
			
 
				+
			
 
				+def process_row(row, process_list, call_dict, i):
			
 
				+    """处理单个行项目，添加到处理列表中"""
			
 
				+    def format_output(in_str):
			
 
				+        return in_str.replace('\n\n', '\n').replace('\\"', '"')
			
 
				+    
			
 
				+    if not isinstance(row, dict):
			
 
				+        return
			
 
				+        
			
 
				+    action = ''
			
 
				+    out = ''
			
 
				+    call_id = ''
			
 
				+    role_ = row.get('role', '')
			
 
				+    type_ = row.get('type', '')
			
 
				+    
			
 
				+    if type_ == 'function_call':
			
 
				+        action = f'工具调用-{row.get("name")}'
			
 
				+        out = row['arguments']
			
 
				+        call_id = row['call_id']
			
 
				+    elif type_ == 'function_call_output':
			
 
				+        return  # 跳过函数调用输出，它们已经被收集到call_dict中
			
 
				+    elif role_ in ('user', 'assistant'):
			
 
				+        action = role_
			
 
				+        if isinstance(row['content'], str):
			
 
				+            out = row['content']
			
 
				+        else:
			
 
				+            content_text = ""
			
 
				+            for this_c in row['content']:
			
 
				+                if isinstance(this_c, dict) and 'text' in this_c:
			
 
				+                    content_text += this_c['text']
			
 
				+            out = content_text
			
 
				+    
			
 
				+    process_list.append('\n\n' + f'{i+1}. ' + '## ' + action + ' ' * 4 + '-' * 32 + '\n')
			
 
				+    process_list.append(format_output(str(out)))
			
 
				+    
			
 
				+    # 如果存在对应的工具输出，添加它
			
 
				+    if call_id and call_id in call_dict:
			
 
				+        process_list.append('\n\n' + f'{i+2}. ' + '## ' + '工具输出' + ' ' * 4 + '-' * 32 + '\n')
			
 
				+        process_list.append(format_output(call_dict[call_id]))
			
 
				+
			
--- a/scripts/__init__.py
+++ b/scripts/__init__.py
@@ -0,0 +1,3 @@
 
				+"""
			
 
				+小红书数据获取脚本集合
			
 
				+"""
			
--- a/scripts/batch_fetch_accounts.py
+++ b/scripts/batch_fetch_accounts.py
@@ -0,0 +1,527 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+批量处理账号数据脚本
			
 
				+功能：根据账号整理.json文件批量获取账号数据
			
 
				+输出目录结构：examples/[品类名称]/[tag名称]/[账号名称]/[帖子ID]/输入/
			
 
				+"""
			
 
				+
			
 
				+import json
			
 
				+import time
			
 
				+from pathlib import Path
			
 
				+import sys
			
 
				+import argparse
			
 
				+import shutil
			
 
				+
			
 
				+# 导入共享工具模块
			
 
				+from xhs_utils import (
			
 
				+    get_note_detail,
			
 
				+    get_author_history_notes,
			
 
				+    merge_note_data
			
 
				+)
			
 
				+
			
 
				+
			
 
				+def extract_account_id_from_url(url: str) -> str:
			
 
				+    """
			
 
				+    从小红书账号URL中提取account_id
			
 
				+
			
 
				+    Args:
			
 
				+        url: 小红书账号URL
			
 
				+
			
 
				+    Returns:
			
 
				+        account_id: 账号ID
			
 
				+    """
			
 
				+    import re
			
 
				+    # 尝试从URL路径中提取
			
 
				+    pattern = r'/user/profile/([a-f0-9]+)'
			
 
				+    match = re.search(pattern, url)
			
 
				+
			
 
				+    if match:
			
 
				+        return match.group(1)
			
 
				+
			
 
				+    # 如果直接传入的是account_id，则直接返回
			
 
				+    if re.match(r'^[a-f0-9]{24}$', url):
			
 
				+        return url
			
 
				+
			
 
				+    raise ValueError(f"无法从URL中提取account_id: {url}")
			
 
				+
			
 
				+
			
 
				+def save_note_to_file(note_data: dict, file_path: Path):
			
 
				+    """
			
 
				+    将帖子数据保存到JSON文件
			
 
				+
			
 
				+    Args:
			
 
				+        note_data: 帖子数据
			
 
				+        file_path: 文件路径
			
 
				+    """
			
 
				+    # 确保目录存在
			
 
				+    file_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+    # 保存JSON文件
			
 
				+    with open(file_path, 'w', encoding='utf-8') as f:
			
 
				+        json.dump(note_data, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+    print(f"  已保存: {file_path}")
			
 
				+
			
 
				+
			
 
				+def check_note_data_integrity(note_data: dict) -> bool:
			
 
				+    """
			
 
				+    检查帖子数据的完整性
			
 
				+
			
 
				+    Args:
			
 
				+        note_data: 帖子数据字典
			
 
				+
			
 
				+    Returns:
			
 
				+        bool: 如果 images 或 video 字段至少一个不为空，返回 True，否则返回 False
			
 
				+    """
			
 
				+    images = note_data.get("images", [])
			
 
				+    video = note_data.get("video")
			
 
				+
			
 
				+    # 检查 images 是否为非空列表
			
 
				+    has_images = isinstance(images, list) and len(images) > 0
			
 
				+
			
 
				+    # 检查 video 是否存在且不为空（字符串或字典都可以）
			
 
				+    has_video = video is not None and video != "" and video != {}
			
 
				+
			
 
				+    return has_images or has_video
			
 
				+
			
 
				+
			
 
				+def check_account_data_exists(category_name: str, tag_name: str, account_name: str,
			
 
				+                               note_id: str = None, output_dir: str = "examples") -> dict:
			
 
				+    """
			
 
				+    检查账号数据是否已经存在且完整
			
 
				+
			
 
				+    Args:
			
 
				+        category_name: 品类名称
			
 
				+        tag_name: tag名称
			
 
				+        account_name: 账号名称
			
 
				+        note_id: 帖子ID（可选，如果提供则检查该特定帖子）
			
 
				+        output_dir: 输出根目录
			
 
				+
			
 
				+    Returns:
			
 
				+        dict: 包含检查结果的字典
			
 
				+            {
			
 
				+                "exists": bool,  # 数据是否存在
			
 
				+                "complete": bool,  # 数据是否完整
			
 
				+                "target_note_path": Path or None,  # 待解构帖子路径
			
 
				+                "history_notes_path": Path or None,  # 历史帖子目录路径
			
 
				+                "incomplete_files": list,  # 不完整的文件列表
			
 
				+                "note_id": str or None  # 如果已存在，返回帖子ID
			
 
				+            }
			
 
				+    """
			
 
				+    result = {
			
 
				+        "exists": False,
			
 
				+        "complete": False,
			
 
				+        "target_note_path": None,
			
 
				+        "history_notes_path": None,
			
 
				+        "incomplete_files": [],
			
 
				+        "note_id": None
			
 
				+    }
			
 
				+
			
 
				+    # 如果没有提供note_id，需要先查找账号目录下是否有数据
			
 
				+    base_dir = Path(output_dir) / category_name / tag_name / account_name
			
 
				+
			
 
				+    if not base_dir.exists():
			
 
				+        return result
			
 
				+
			
 
				+    # 如果没有提供note_id，尝试查找现有的note_id目录
			
 
				+    if note_id is None:
			
 
				+        # 查找第一个存在的note_id目录
			
 
				+        note_dirs = [d for d in base_dir.iterdir() if d.is_dir()]
			
 
				+        if not note_dirs:
			
 
				+            return result
			
 
				+        # 使用第一个找到的目录
			
 
				+        note_id = note_dirs[0].name
			
 
				+
			
 
				+    result["note_id"] = note_id
			
 
				+
			
 
				+    # 构建路径
			
 
				+    input_dir = base_dir / note_id / "输入"
			
 
				+    target_note_path = input_dir / "待解构帖子.json"
			
 
				+    history_notes_path = input_dir / "作者历史帖子"
			
 
				+
			
 
				+    result["target_note_path"] = target_note_path
			
 
				+    result["history_notes_path"] = history_notes_path
			
 
				+
			
 
				+    # 检查输入目录是否存在
			
 
				+    if not input_dir.exists():
			
 
				+        return result
			
 
				+
			
 
				+    result["exists"] = True
			
 
				+
			
 
				+    # 检查待解构帖子是否存在且完整
			
 
				+    if not target_note_path.exists():
			
 
				+        result["incomplete_files"].append(str(target_note_path))
			
 
				+        return result
			
 
				+
			
 
				+    try:
			
 
				+        with open(target_note_path, 'r', encoding='utf-8') as f:
			
 
				+            target_note_data = json.load(f)
			
 
				+
			
 
				+        if not check_note_data_integrity(target_note_data):
			
 
				+            result["incomplete_files"].append(str(target_note_path))
			
 
				+    except Exception as e:
			
 
				+        result["incomplete_files"].append(f"{target_note_path} (读取错误: {e})")
			
 
				+
			
 
				+    # 检查历史帖子目录
			
 
				+    if not history_notes_path.exists():
			
 
				+        result["incomplete_files"].append(str(history_notes_path))
			
 
				+        return result
			
 
				+
			
 
				+    # 检查历史帖子文件的完整性
			
 
				+    history_files = list(history_notes_path.glob("*.json"))
			
 
				+
			
 
				+    if len(history_files) == 0:
			
 
				+        result["incomplete_files"].append(f"{history_notes_path} (没有历史帖子文件)")
			
 
				+    else:
			
 
				+        # 统计有效的历史帖子数量
			
 
				+        valid_history_count = 0
			
 
				+
			
 
				+        for history_file in history_files:
			
 
				+            try:
			
 
				+                with open(history_file, 'r', encoding='utf-8') as f:
			
 
				+                    history_note_data = json.load(f)
			
 
				+
			
 
				+                if not check_note_data_integrity(history_note_data):
			
 
				+                    result["incomplete_files"].append(str(history_file))
			
 
				+                else:
			
 
				+                    valid_history_count += 1
			
 
				+            except Exception as e:
			
 
				+                result["incomplete_files"].append(f"{history_file} (读取错误: {e})")
			
 
				+
			
 
				+        # 验证历史帖子数量必须大于4
			
 
				+        if valid_history_count <= 4:
			
 
				+            result["incomplete_files"].append(f"{history_notes_path} (有效历史帖子数量 {valid_history_count} ≤ 4，不满足要求)")
			
 
				+
			
 
				+    # 如果没有不完整的文件，则数据完整
			
 
				+    result["complete"] = len(result["incomplete_files"]) == 0
			
 
				+
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def delete_incomplete_data(category_name: str, tag_name: str, account_name: str,
			
 
				+                           note_id: str, output_dir: str = "examples") -> bool:
			
 
				+    """
			
 
				+    删除不完整的账号数据目录
			
 
				+
			
 
				+    Args:
			
 
				+        category_name: 品类名称
			
 
				+        tag_name: tag名称
			
 
				+        account_name: 账号名称
			
 
				+        note_id: 帖子ID
			
 
				+        output_dir: 输出根目录
			
 
				+
			
 
				+    Returns:
			
 
				+        bool: 删除成功返回True，否则返回False
			
 
				+    """
			
 
				+    try:
			
 
				+        # 构建要删除的目录路径：examples/[品类]/[tag]/[账号]/[帖子ID]
			
 
				+        target_dir = Path(output_dir) / category_name / tag_name / account_name / note_id
			
 
				+
			
 
				+        if target_dir.exists():
			
 
				+            shutil.rmtree(target_dir)
			
 
				+            print(f"  ✓ 已删除不完整数据目录: {target_dir}")
			
 
				+            return True
			
 
				+        else:
			
 
				+            print(f"  ⚠️  目录不存在: {target_dir}")
			
 
				+            return False
			
 
				+    except Exception as e:
			
 
				+        print(f"  ✗ 删除目录失败: {e}")
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def process_account(category_name: str, tag_name: str, account_info: dict,
			
 
				+                    output_dir: str = "examples", check_only: bool = False,
			
 
				+                    skip_if_exists: bool = True, clean_incomplete: bool = False):
			
 
				+    """
			
 
				+    处理单个账号的数据获取
			
 
				+
			
 
				+    Args:
			
 
				+        category_name: 品类名称
			
 
				+        tag_name: tag名称
			
 
				+        account_info: 账号信息字典，包含name和url
			
 
				+        output_dir: 输出根目录
			
 
				+        check_only: 如果为True，只检查数据是否存在，不执行获取操作
			
 
				+        skip_if_exists: 如果为True且数据已存在且完整，则跳过获取
			
 
				+        clean_incomplete: 如果为True，检测到不完整数据时自动删除
			
 
				+    """
			
 
				+    account_name = account_info.get("name", "未知账号")
			
 
				+    account_url = account_info.get("url", "")
			
 
				+
			
 
				+    if not account_url:
			
 
				+        print(f"⚠️  账号 {account_name} 没有URL，跳过")
			
 
				+        return
			
 
				+
			
 
				+    print(f"\n{'='*80}")
			
 
				+    print(f"{'[检查模式]' if check_only else '[处理模式]'} 账号: {account_name}")
			
 
				+    print(f"  品类: {category_name}")
			
 
				+    print(f"  Tag: {tag_name}")
			
 
				+    print(f"  URL: {account_url}")
			
 
				+    print(f"{'='*80}")
			
 
				+
			
 
				+    # 先检查数据是否已存在
			
 
				+    check_result = check_account_data_exists(category_name, tag_name, account_name, output_dir=output_dir)
			
 
				+
			
 
				+    if check_result["exists"]:
			
 
				+        if check_result["complete"]:
			
 
				+            print(f"✓ 数据已存在且完整")
			
 
				+            print(f"  帖子ID: {check_result['note_id']}")
			
 
				+            print(f"  待解构帖子: {check_result['target_note_path']}")
			
 
				+            print(f"  历史帖子目录: {check_result['history_notes_path']}")
			
 
				+
			
 
				+            if check_only or skip_if_exists:
			
 
				+                print(f"{'  [检查模式] 跳过获取' if check_only else '  [跳过] 数据已完整'}")
			
 
				+                return
			
 
				+        else:
			
 
				+            print(f"⚠️  数据存在但不完整")
			
 
				+            print(f"  帖子ID: {check_result['note_id']}")
			
 
				+            print(f"  不完整的文件:")
			
 
				+            for incomplete_file in check_result["incomplete_files"]:
			
 
				+                print(f"    - {incomplete_file}")
			
 
				+
			
 
				+            # 如果启用了清理不完整数据的功能
			
 
				+            if clean_incomplete:
			
 
				+                print(f"  [清理模式] 删除不完整数据...")
			
 
				+                delete_incomplete_data(category_name, tag_name, account_name,
			
 
				+                                     check_result['note_id'], output_dir)
			
 
				+
			
 
				+            if check_only:
			
 
				+                print(f"  [检查模式] 需要重新获取")
			
 
				+                return
			
 
				+            else:
			
 
				+                print(f"  将重新获取数据...")
			
 
				+    else:
			
 
				+        print(f"ℹ️  数据不存在")
			
 
				+        if check_only:
			
 
				+            print(f"  [检查模式] 需要获取")
			
 
				+            return
			
 
				+
			
 
				+    # 如果是检查模式，到这里就结束了
			
 
				+    if check_only:
			
 
				+        return
			
 
				+
			
 
				+    try:
			
 
				+        # 1. 提取account_id
			
 
				+        account_id = extract_account_id_from_url(account_url)
			
 
				+        print(f"✓ 提取到account_id: {account_id}")
			
 
				+
			
 
				+        # 2. 获取账号的所有历史帖子
			
 
				+        print(f"正在获取历史帖子...")
			
 
				+        history_notes = get_author_history_notes(account_id)
			
 
				+
			
 
				+        if not history_notes or len(history_notes) == 0:
			
 
				+            print(f"⚠️  未找到历史帖子")
			
 
				+            return
			
 
				+
			
 
				+        print(f"✓ 找到 {len(history_notes)} 个历史帖子")
			
 
				+
			
 
				+        # 3. 找出点赞数最高的帖子
			
 
				+        max_like_note = max(history_notes, key=lambda x: x.get("like_count", 0))
			
 
				+        max_like_note_id = max_like_note.get("note_id", "")
			
 
				+        max_like_count = max_like_note.get("like_count", 0)
			
 
				+
			
 
				+        print(f"✓ 点赞数最高的帖子:")
			
 
				+        print(f"    - 帖子ID: {max_like_note_id}")
			
 
				+        print(f"    - 标题: {max_like_note.get('title', '无标题')}")
			
 
				+        print(f"    - 点赞数: {max_like_count}")
			
 
				+
			
 
				+        # 4. 处理点赞数最高的帖子（待解构帖子）
			
 
				+        print(f"正在处理待解构帖子...")
			
 
				+        need_detail = not (max_like_note.get("desc") or max_like_note.get("note_text") or max_like_note.get("body_text"))
			
 
				+
			
 
				+        target_note_detail = None
			
 
				+        if need_detail:
			
 
				+            target_note_detail = get_note_detail(max_like_note_id)
			
 
				+
			
 
				+        # 合并历史API和详情API的数据
			
 
				+        transformed_target = merge_note_data(max_like_note, target_note_detail)
			
 
				+
			
 
				+        # 5. 创建新的目录结构：examples/[品类名称]/[tag名称]/[账号名称]/[帖子ID]/输入/
			
 
				+        base_path = Path(output_dir) / category_name / tag_name / account_name / max_like_note_id / "输入"
			
 
				+        history_path = base_path / "作者历史帖子"
			
 
				+
			
 
				+        # 6. 保存待解构帖子
			
 
				+        target_note_path = base_path / "待解构帖子.json"
			
 
				+        save_note_to_file(transformed_target, target_note_path)
			
 
				+
			
 
				+        # 7. 为每个历史帖子处理数据并保存
			
 
				+        print(f"正在处理所有历史帖子...")
			
 
				+
			
 
				+        success_count = 0
			
 
				+        for idx, note in enumerate(history_notes, 1):
			
 
				+            history_note_id = note.get("note_id", "")
			
 
				+
			
 
				+            if history_note_id:
			
 
				+                try:
			
 
				+                    # 检查历史API数据是否缺少关键字段（主要是body_text）
			
 
				+                    need_detail = not (note.get("desc") or note.get("note_text") or note.get("body_text"))
			
 
				+
			
 
				+                    detail_data = None
			
 
				+                    if need_detail:
			
 
				+                        detail_data = get_note_detail(history_note_id)
			
 
				+                        # 添加请求间隔，避免频繁调用
			
 
				+                        if idx < len(history_notes):
			
 
				+                            time.sleep(0.5)
			
 
				+
			
 
				+                    # 合并历史API和详情API的数据
			
 
				+                    merged_note = merge_note_data(note, detail_data)
			
 
				+
			
 
				+                    # 保存到文件
			
 
				+                    history_note_path = history_path / f"{history_note_id}.json"
			
 
				+                    save_note_to_file(merged_note, history_note_path)
			
 
				+                    success_count += 1
			
 
				+                except Exception as e:
			
 
				+                    print(f"  ⚠️  处理帖子 {history_note_id} 失败: {e}")
			
 
				+                    continue
			
 
				+
			
 
				+        print(f"\n✓ 账号 {account_name} 处理完成！")
			
 
				+        print(f"✓ 待解构帖子: {max_like_note_id}")
			
 
				+        print(f"✓ 共保存 {success_count} 个历史帖子")
			
 
				+        print(f"✓ 输出目录: {base_path}")
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        print(f"✗ 处理账号 {account_name} 失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    # 解析命令行参数
			
 
				+    parser = argparse.ArgumentParser(
			
 
				+        description='批量处理账号数据脚本',
			
 
				+        formatter_class=argparse.RawDescriptionHelpFormatter,
			
 
				+        epilog="""
			
 
				+使用示例:
			
 
				+  # 默认模式：获取数据，如果已存在且完整则跳过
			
 
				+  python batch_fetch_accounts.py
			
 
				+
			
 
				+  # 只检查模式：只检查数据是否存在且完整，不获取数据
			
 
				+  python batch_fetch_accounts.py --check-only
			
 
				+
			
 
				+  # 检查并清理不完整数据
			
 
				+  python batch_fetch_accounts.py --check-only --clean-incomplete
			
 
				+
			
 
				+  # 强制获取模式：即使数据已存在也重新获取
			
 
				+  python batch_fetch_accounts.py --no-skip-if-exists
			
 
				+
			
 
				+  # 指定配置文件
			
 
				+  python batch_fetch_accounts.py --config 账号整理.json
			
 
				+        """
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--config',
			
 
				+        type=str,
			
 
				+        default='账号整理.json',
			
 
				+        help='配置文件路径 (默认: 账号整理.json)'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--check-only',
			
 
				+        action='store_true',
			
 
				+        help='只检查数据是否存在且完整，不执行获取操作'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--no-skip-if-exists',
			
 
				+        action='store_true',
			
 
				+        help='即使数据已存在且完整也重新获取'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--clean-incomplete',
			
 
				+        action='store_true',
			
 
				+        help='自动删除检测到的不完整数据目录'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--output-dir',
			
 
				+        type=str,
			
 
				+        default='examples',
			
 
				+        help='输出根目录 (默认: examples)'
			
 
				+    )
			
 
				+
			
 
				+    args = parser.parse_args()
			
 
				+
			
 
				+    config_file = args.config
			
 
				+    check_only = args.check_only
			
 
				+    skip_if_exists = not args.no_skip_if_exists
			
 
				+    clean_incomplete = args.clean_incomplete
			
 
				+    output_dir = args.output_dir
			
 
				+
			
 
				+    print(f"{'='*80}")
			
 
				+    print(f"批量账号数据{'检查' if check_only else '获取'}脚本")
			
 
				+    print(f"{'='*80}")
			
 
				+    print(f"配置文件: {config_file}")
			
 
				+    print(f"模式: {'只检查' if check_only else '获取数据'}")
			
 
				+    print(f"跳过已存在: {'是' if skip_if_exists else '否'}")
			
 
				+    print(f"清理不完整数据: {'是' if clean_incomplete else '否'}")
			
 
				+    print(f"输出目录: {output_dir}")
			
 
				+    print(f"{'='*80}\n")
			
 
				+
			
 
				+    try:
			
 
				+        with open(config_file, 'r', encoding='utf-8') as f:
			
 
				+            config = json.load(f)
			
 
				+    except FileNotFoundError:
			
 
				+        print(f"错误: 找不到文件 {config_file}")
			
 
				+        return 1
			
 
				+    except json.JSONDecodeError as e:
			
 
				+        print(f"错误: JSON格式错误 - {e}")
			
 
				+        return 1
			
 
				+
			
 
				+    # 解析配置文件
			
 
				+    categories = config.get("categories", [])
			
 
				+
			
 
				+    if not categories:
			
 
				+        print("错误: 配置文件中没有找到 categories 数据")
			
 
				+        return 1
			
 
				+
			
 
				+    # 统计信息
			
 
				+    total_accounts = 0
			
 
				+    processed_accounts = 0
			
 
				+
			
 
				+    # 遍历所有品类
			
 
				+    for category in categories:
			
 
				+        category_name = category.get("name", "未知品类")
			
 
				+        tags = category.get("tags", [])
			
 
				+
			
 
				+        # 遍历所有tag
			
 
				+        for tag_info in tags:
			
 
				+            tag_name = tag_info.get("tag", "未知tag")
			
 
				+            accounts = tag_info.get("accounts", [])
			
 
				+
			
 
				+            # 遍历所有账号
			
 
				+            for account in accounts:
			
 
				+                total_accounts += 1
			
 
				+
			
 
				+                try:
			
 
				+                    process_account(
			
 
				+                        category_name,
			
 
				+                        tag_name,
			
 
				+                        account,
			
 
				+                        output_dir=output_dir,
			
 
				+                        check_only=check_only,
			
 
				+                        skip_if_exists=skip_if_exists,
			
 
				+                        clean_incomplete=clean_incomplete
			
 
				+                    )
			
 
				+                    processed_accounts += 1
			
 
				+                except Exception as e:
			
 
				+                    print(f"处理账号失败: {e}")
			
 
				+                    continue
			
 
				+
			
 
				+                # 账号之间添加延迟（检查模式不需要延迟）
			
 
				+                if not check_only:
			
 
				+                    time.sleep(1)
			
 
				+
			
 
				+    print(f"\n{'='*80}")
			
 
				+    print(f"批处理完成！")
			
 
				+    print(f"总共: {total_accounts} 个账号")
			
 
				+    print(f"成功: {processed_accounts} 个账号")
			
 
				+    print(f"失败: {total_accounts - processed_accounts} 个账号")
			
 
				+    print(f"{'='*80}")
			
 
				+
			
 
				+    return 0
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    exit(main())
			
--- a/scripts/fetch_xhs_data.py
+++ b/scripts/fetch_xhs_data.py
@@ -0,0 +1,415 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+小红书帖子数据获取脚本
			
 
				+功能：根据帖子链接获取帖子详情和作者历史帖子，并保存到本地目录
			
 
				+"""
			
 
				+
			
 
				+import json
			
 
				+import re
			
 
				+from pathlib import Path
			
 
				+from typing import Dict
			
 
				+import sys
			
 
				+import argparse
			
 
				+import shutil
			
 
				+
			
 
				+# 导入共享工具模块
			
 
				+from xhs_utils import (
			
 
				+    get_note_detail,
			
 
				+    get_author_history_notes,
			
 
				+    merge_note_data,
			
 
				+    transform_note_data
			
 
				+)
			
 
				+
			
 
				+
			
 
				+def extract_note_id_from_url(url: str) -> str:
			
 
				+    """
			
 
				+    从小红书URL中提取note_id
			
 
				+
			
 
				+    Args:
			
 
				+        url: 小红书帖子URL
			
 
				+
			
 
				+    Returns:
			
 
				+        note_id: 帖子ID
			
 
				+
			
 
				+    Example:
			
 
				+        https://www.xiaohongshu.com/explore/68c6a924000000001b0336d0?xsec_token=...
			
 
				+        返回: 68c6a924000000001b0336d0
			
 
				+    """
			
 
				+    # 尝试从URL路径中提取
			
 
				+    pattern = r'/explore/([a-f0-9]+)'
			
 
				+    match = re.search(pattern, url)
			
 
				+
			
 
				+    if match:
			
 
				+        return match.group(1)
			
 
				+
			
 
				+    # 如果直接传入的是note_id，则直接返回
			
 
				+    if re.match(r'^[a-f0-9]{24}$', url):
			
 
				+        return url
			
 
				+
			
 
				+    raise ValueError(f"无法从URL中提取note_id: {url}")
			
 
				+
			
 
				+
			
 
				+def save_note_to_file(note_data: Dict, file_path: Path):
			
 
				+    """
			
 
				+    将帖子数据保存到JSON文件
			
 
				+
			
 
				+    Args:
			
 
				+        note_data: 帖子数据
			
 
				+        file_path: 文件路径
			
 
				+    """
			
 
				+    # 确保目录存在
			
 
				+    file_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+    # 保存JSON文件
			
 
				+    with open(file_path, 'w', encoding='utf-8') as f:
			
 
				+        json.dump(note_data, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+    print(f"已保存: {file_path}")
			
 
				+
			
 
				+
			
 
				+def check_note_data_integrity(note_data: dict) -> bool:
			
 
				+    """
			
 
				+    检查帖子数据的完整性
			
 
				+
			
 
				+    Args:
			
 
				+        note_data: 帖子数据字典
			
 
				+
			
 
				+    Returns:
			
 
				+        bool: 如果 images 或 video 字段至少一个不为空，返回 True，否则返回 False
			
 
				+    """
			
 
				+    images = note_data.get("images", [])
			
 
				+    video = note_data.get("video")
			
 
				+
			
 
				+    # 检查 images 是否为非空列表
			
 
				+    has_images = isinstance(images, list) and len(images) > 0
			
 
				+
			
 
				+    # 检查 video 是否存在且不为空（字符串或字典都可以）
			
 
				+    has_video = video is not None and video != "" and video != {}
			
 
				+
			
 
				+    return has_images or has_video
			
 
				+
			
 
				+
			
 
				+def check_data_exists(note_id: str, output_dir: str = "examples") -> dict:
			
 
				+    """
			
 
				+    检查数据是否已经存在且完整
			
 
				+
			
 
				+    Args:
			
 
				+        note_id: 帖子ID
			
 
				+        output_dir: 输出根目录
			
 
				+
			
 
				+    Returns:
			
 
				+        dict: 包含检查结果的字典
			
 
				+    """
			
 
				+    result = {
			
 
				+        "exists": False,
			
 
				+        "complete": False,
			
 
				+        "target_note_path": None,
			
 
				+        "history_notes_path": None,
			
 
				+        "incomplete_files": [],
			
 
				+        "note_id": note_id
			
 
				+    }
			
 
				+
			
 
				+    # 构建路径
			
 
				+    input_dir = Path(output_dir) / note_id / "输入"
			
 
				+    target_note_path = input_dir / "待解构帖子.json"
			
 
				+    history_notes_path = input_dir / "作者历史帖子"
			
 
				+
			
 
				+    result["target_note_path"] = target_note_path
			
 
				+    result["history_notes_path"] = history_notes_path
			
 
				+
			
 
				+    # 检查输入目录是否存在
			
 
				+    if not input_dir.exists():
			
 
				+        return result
			
 
				+
			
 
				+    result["exists"] = True
			
 
				+
			
 
				+    # 检查待解构帖子是否存在且完整
			
 
				+    if not target_note_path.exists():
			
 
				+        result["incomplete_files"].append(str(target_note_path))
			
 
				+        return result
			
 
				+
			
 
				+    try:
			
 
				+        with open(target_note_path, 'r', encoding='utf-8') as f:
			
 
				+            target_note_data = json.load(f)
			
 
				+
			
 
				+        if not check_note_data_integrity(target_note_data):
			
 
				+            result["incomplete_files"].append(str(target_note_path))
			
 
				+    except Exception as e:
			
 
				+        result["incomplete_files"].append(f"{target_note_path} (读取错误: {e})")
			
 
				+
			
 
				+    # 检查历史帖子目录
			
 
				+    if not history_notes_path.exists():
			
 
				+        result["incomplete_files"].append(str(history_notes_path))
			
 
				+        return result
			
 
				+
			
 
				+    # 检查历史帖子文件的完整性
			
 
				+    history_files = list(history_notes_path.glob("*.json"))
			
 
				+
			
 
				+    if len(history_files) == 0:
			
 
				+        result["incomplete_files"].append(f"{history_notes_path} (没有历史帖子文件)")
			
 
				+    else:
			
 
				+        # 统计有效的历史帖子数量
			
 
				+        valid_history_count = 0
			
 
				+
			
 
				+        for history_file in history_files:
			
 
				+            try:
			
 
				+                with open(history_file, 'r', encoding='utf-8') as f:
			
 
				+                    history_note_data = json.load(f)
			
 
				+
			
 
				+                if not check_note_data_integrity(history_note_data):
			
 
				+                    result["incomplete_files"].append(str(history_file))
			
 
				+                else:
			
 
				+                    valid_history_count += 1
			
 
				+            except Exception as e:
			
 
				+                result["incomplete_files"].append(f"{history_file} (读取错误: {e})")
			
 
				+
			
 
				+        # 验证历史帖子数量必须大于4
			
 
				+        if valid_history_count <= 4:
			
 
				+            result["incomplete_files"].append(f"{history_notes_path} (有效历史帖子数量 {valid_history_count} ≤ 4，不满足要求)")
			
 
				+
			
 
				+    # 如果没有不完整的文件，则数据完整
			
 
				+    result["complete"] = len(result["incomplete_files"]) == 0
			
 
				+
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def delete_incomplete_data(note_id: str, output_dir: str = "examples") -> bool:
			
 
				+    """
			
 
				+    删除不完整的数据目录
			
 
				+
			
 
				+    Args:
			
 
				+        note_id: 帖子ID
			
 
				+        output_dir: 输出根目录
			
 
				+
			
 
				+    Returns:
			
 
				+        bool: 删除成功返回True，否则返回False
			
 
				+    """
			
 
				+    try:
			
 
				+        target_dir = Path(output_dir) / note_id
			
 
				+
			
 
				+        if target_dir.exists():
			
 
				+            shutil.rmtree(target_dir)
			
 
				+            print(f"  ✓ 已删除不完整数据目录: {target_dir}")
			
 
				+            return True
			
 
				+        else:
			
 
				+            print(f"  ⚠️  目录不存在: {target_dir}")
			
 
				+            return False
			
 
				+    except Exception as e:
			
 
				+        print(f"  ✗ 删除目录失败: {e}")
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def fetch_and_save_xhs_data(url: str, output_dir: str = "examples",
			
 
				+                             check_only: bool = False, skip_if_exists: bool = True,
			
 
				+                             clean_incomplete: bool = False):
			
 
				+    """
			
 
				+    获取小红书帖子数据并保存到本地
			
 
				+
			
 
				+    Args:
			
 
				+        url: 小红书帖子URL
			
 
				+        output_dir: 输出目录，默认为examples
			
 
				+        check_only: 如果为True，只检查数据是否存在，不执行获取操作
			
 
				+        skip_if_exists: 如果为True且数据已存在且完整，则跳过获取
			
 
				+        clean_incomplete: 如果为True，检测到不完整数据时自动删除
			
 
				+    """
			
 
				+    print(f"\n{'='*80}")
			
 
				+    print(f"{'[检查模式]' if check_only else '[处理模式]'} 根据帖子URL获取数据")
			
 
				+    print(f"{'='*80}")
			
 
				+
			
 
				+    # 1. 提取note_id
			
 
				+    print(f"正在解析URL: {url}")
			
 
				+    note_id = extract_note_id_from_url(url)
			
 
				+    print(f"提取到note_id: {note_id}")
			
 
				+
			
 
				+    # 先检查数据是否已存在
			
 
				+    check_result = check_data_exists(note_id, output_dir=output_dir)
			
 
				+
			
 
				+    if check_result["exists"]:
			
 
				+        if check_result["complete"]:
			
 
				+            print(f"\n✓ 数据已存在且完整")
			
 
				+            print(f"  待解构帖子: {check_result['target_note_path']}")
			
 
				+            print(f"  历史帖子目录: {check_result['history_notes_path']}")
			
 
				+
			
 
				+            if check_only or skip_if_exists:
			
 
				+                print(f"{'  [检查模式] 跳过获取' if check_only else '  [跳过] 数据已完整'}")
			
 
				+                return
			
 
				+        else:
			
 
				+            print(f"\n⚠️  数据存在但不完整")
			
 
				+            print(f"  不完整的文件:")
			
 
				+            for incomplete_file in check_result["incomplete_files"]:
			
 
				+                print(f"    - {incomplete_file}")
			
 
				+
			
 
				+            # 如果启用了清理不完整数据的功能
			
 
				+            if clean_incomplete:
			
 
				+                print(f"  [清理模式] 删除不完整数据...")
			
 
				+                delete_incomplete_data(note_id, output_dir)
			
 
				+
			
 
				+            if check_only:
			
 
				+                print(f"  [检查模式] 需要重新获取")
			
 
				+                return
			
 
				+            else:
			
 
				+                print(f"  将重新获取数据...")
			
 
				+    else:
			
 
				+        print(f"\nℹ️  数据不存在")
			
 
				+        if check_only:
			
 
				+            print(f"  [检查模式] 需要获取")
			
 
				+            return
			
 
				+
			
 
				+    # 如果是检查模式，到这里就结束了
			
 
				+    if check_only:
			
 
				+        return
			
 
				+
			
 
				+    # 2. 获取帖子详情
			
 
				+    print(f"正在获取帖子详情...")
			
 
				+    note_detail = get_note_detail(note_id)
			
 
				+
			
 
				+    # 3. 转换数据格式
			
 
				+    transformed_note = transform_note_data(note_detail)
			
 
				+    account_id = transformed_note["channel_account_id"]
			
 
				+
			
 
				+    # 4. 创建目录结构
			
 
				+    base_path = Path(output_dir) / note_id / "输入"
			
 
				+    history_path = base_path / "作者历史帖子"
			
 
				+
			
 
				+    # 5. 保存待解构帖子
			
 
				+    target_note_path = base_path / "待解构帖子.json"
			
 
				+    save_note_to_file(transformed_note, target_note_path)
			
 
				+
			
 
				+    # 6. 获取作者历史帖子
			
 
				+    if account_id:
			
 
				+        print(f"正在获取作者历史帖子 (账号ID: {account_id})...")
			
 
				+        history_notes = get_author_history_notes(account_id)
			
 
				+
			
 
				+        # 7. 为每个历史帖子处理数据并保存
			
 
				+        if isinstance(history_notes, list):
			
 
				+            print(f"找到 {len(history_notes)} 个历史帖子，正在处理...")
			
 
				+
			
 
				+            for idx, note in enumerate(history_notes, 1):
			
 
				+                # 从历史帖子列表中提取note_id
			
 
				+                history_note_id = note.get("note_id", "")
			
 
				+
			
 
				+                if history_note_id:
			
 
				+                    print(f"  [{idx}/{len(history_notes)}] 处理帖子: {history_note_id}")
			
 
				+
			
 
				+                    try:
			
 
				+                        # 检查历史API数据是否缺少关键字段（主要是body_text）
			
 
				+                        need_detail = not (note.get("desc") or note.get("note_text") or note.get("body_text"))
			
 
				+
			
 
				+                        detail_data = None
			
 
				+                        if need_detail:
			
 
				+                            print(f"    → 缺少正文，调用详情API补充...")
			
 
				+                            detail_data = get_note_detail(history_note_id)
			
 
				+
			
 
				+                        # 合并历史API和详情API的数据
			
 
				+                        merged_note = merge_note_data(note, detail_data)
			
 
				+
			
 
				+                        # 保存到文件
			
 
				+                        history_note_path = history_path / f"{history_note_id}.json"
			
 
				+                        save_note_to_file(merged_note, history_note_path)
			
 
				+                        if transformed_note['channel_content_id'] == merged_note['channel_content_id']:
			
 
				+                            save_note_to_file(merged_note, target_note_path)
			
 
				+                    except Exception as e:
			
 
				+                        print(f"  ⚠️  处理帖子 {history_note_id} 失败: {e}")
			
 
				+                        continue
			
 
				+
			
 
				+            print(f"\n共成功保存 {len(history_notes)} 个历史帖子")
			
 
				+        else:
			
 
				+            print("历史帖子数据格式不正确")
			
 
				+    else:
			
 
				+        print("未找到账号ID，跳过获取历史帖子")
			
 
				+
			
 
				+    print(f"\n✓ 数据获取完成！")
			
 
				+    print(f"输出目录: {base_path}")
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    # 解析命令行参数
			
 
				+    parser = argparse.ArgumentParser(
			
 
				+        description='小红书帖子数据获取脚本',
			
 
				+        formatter_class=argparse.RawDescriptionHelpFormatter,
			
 
				+        epilog="""
			
 
				+使用示例:
			
 
				+  # 获取帖子数据
			
 
				+  python fetch_xhs_data.py <帖子URL>
			
 
				+
			
 
				+  # 只检查数据是否存在且完整
			
 
				+  python fetch_xhs_data.py <帖子URL> --check-only
			
 
				+
			
 
				+  # 检查并清理不完整数据
			
 
				+  python fetch_xhs_data.py <帖子URL> --check-only --clean-incomplete
			
 
				+
			
 
				+  # 强制重新获取（即使数据已存在）
			
 
				+  python fetch_xhs_data.py <帖子URL> --no-skip-if-exists
			
 
				+        """
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        'url',
			
 
				+        nargs='?',
			
 
				+        default='https://www.xiaohongshu.com/explore/68c6a924000000001b0336d0',
			
 
				+        help='小红书帖子URL（可选，默认使用示例URL）'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--check-only',
			
 
				+        action='store_true',
			
 
				+        help='只检查数据是否存在且完整，不执行获取操作'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--no-skip-if-exists',
			
 
				+        action='store_true',
			
 
				+        help='即使数据已存在且完整也重新获取'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--clean-incomplete',
			
 
				+        action='store_true',
			
 
				+        help='自动删除检测到的不完整数据目录'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--output-dir',
			
 
				+        type=str,
			
 
				+        default='examples',
			
 
				+        help='输出根目录 (默认: examples)'
			
 
				+    )
			
 
				+
			
 
				+    args = parser.parse_args()
			
 
				+
			
 
				+    url = args.url
			
 
				+    check_only = args.check_only
			
 
				+    skip_if_exists = not args.no_skip_if_exists
			
 
				+    clean_incomplete = args.clean_incomplete
			
 
				+    output_dir = args.output_dir
			
 
				+
			
 
				+    print(f"{'='*80}")
			
 
				+    print(f"小红书帖子数据{'检查' if check_only else '获取'}脚本")
			
 
				+    print(f"{'='*80}")
			
 
				+    print(f"帖子URL: {url}")
			
 
				+    print(f"模式: {'只检查' if check_only else '获取数据'}")
			
 
				+    print(f"跳过已存在: {'是' if skip_if_exists else '否'}")
			
 
				+    print(f"清理不完整数据: {'是' if clean_incomplete else '否'}")
			
 
				+    print(f"输出目录: {output_dir}")
			
 
				+    print(f"{'='*80}")
			
 
				+
			
 
				+    try:
			
 
				+        fetch_and_save_xhs_data(
			
 
				+            url,
			
 
				+            output_dir=output_dir,
			
 
				+            check_only=check_only,
			
 
				+            skip_if_exists=skip_if_exists,
			
 
				+            clean_incomplete=clean_incomplete
			
 
				+        )
			
 
				+    except Exception as e:
			
 
				+        print(f"错误: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        return 1
			
 
				+
			
 
				+    return 0
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    exit(main())
			
--- a/scripts/fetch_xhs_data_by_account.py
+++ b/scripts/fetch_xhs_data_by_account.py
@@ -0,0 +1,434 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+小红书账号数据获取脚本
			
 
				+功能：根据账号链接获取该账号的所有历史帖子，选择点赞数最高的作为待解构帖子
			
 
				+"""
			
 
				+
			
 
				+import json
			
 
				+import re
			
 
				+import time
			
 
				+from pathlib import Path
			
 
				+from typing import Dict
			
 
				+import sys
			
 
				+import argparse
			
 
				+import shutil
			
 
				+
			
 
				+# 导入共享工具模块
			
 
				+from xhs_utils import (
			
 
				+    get_note_detail,
			
 
				+    get_author_history_notes,
			
 
				+    merge_note_data,
			
 
				+    transform_note_data
			
 
				+)
			
 
				+
			
 
				+
			
 
				+def extract_account_id_from_url(url: str) -> str:
			
 
				+    """
			
 
				+    从小红书账号URL中提取account_id
			
 
				+
			
 
				+    Args:
			
 
				+        url: 小红书账号URL
			
 
				+
			
 
				+    Returns:
			
 
				+        account_id: 账号ID
			
 
				+
			
 
				+    Example:
			
 
				+        https://www.xiaohongshu.com/user/profile/5ff3e96a000000000100995a?xsec_token=...
			
 
				+        返回: 5ff3e96a000000000100995a
			
 
				+    """
			
 
				+    # 尝试从URL路径中提取
			
 
				+    pattern = r'/user/profile/([a-f0-9]+)'
			
 
				+    match = re.search(pattern, url)
			
 
				+
			
 
				+    if match:
			
 
				+        return match.group(1)
			
 
				+
			
 
				+    # 如果直接传入的是account_id，则直接返回
			
 
				+    if re.match(r'^[a-f0-9]{24}$', url):
			
 
				+        return url
			
 
				+
			
 
				+    raise ValueError(f"无法从URL中提取account_id: {url}")
			
 
				+
			
 
				+
			
 
				+def save_note_to_file(note_data: Dict, file_path: Path):
			
 
				+    """
			
 
				+    将帖子数据保存到JSON文件
			
 
				+
			
 
				+    Args:
			
 
				+        note_data: 帖子数据
			
 
				+        file_path: 文件路径
			
 
				+    """
			
 
				+    # 确保目录存在
			
 
				+    file_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+    # 保存JSON文件
			
 
				+    with open(file_path, 'w', encoding='utf-8') as f:
			
 
				+        json.dump(note_data, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+    print(f"已保存: {file_path}")
			
 
				+
			
 
				+
			
 
				+def check_note_data_integrity(note_data: dict) -> bool:
			
 
				+    """
			
 
				+    检查帖子数据的完整性
			
 
				+
			
 
				+    Args:
			
 
				+        note_data: 帖子数据字典
			
 
				+
			
 
				+    Returns:
			
 
				+        bool: 如果 images 或 video 字段至少一个不为空，返回 True，否则返回 False
			
 
				+    """
			
 
				+    images = note_data.get("images", [])
			
 
				+    video = note_data.get("video")
			
 
				+
			
 
				+    # 检查 images 是否为非空列表
			
 
				+    has_images = isinstance(images, list) and len(images) > 0
			
 
				+
			
 
				+    # 检查 video 是否存在且不为空（字符串或字典都可以）
			
 
				+    has_video = video is not None and video != "" and video != {}
			
 
				+
			
 
				+    return has_images or has_video
			
 
				+
			
 
				+
			
 
				+def check_account_data_exists(note_id: str, output_dir: str = "examples") -> dict:
			
 
				+    """
			
 
				+    检查账号数据是否已经存在且完整
			
 
				+
			
 
				+    Args:
			
 
				+        note_id: 帖子ID
			
 
				+        output_dir: 输出根目录
			
 
				+
			
 
				+    Returns:
			
 
				+        dict: 包含检查结果的字典
			
 
				+    """
			
 
				+    result = {
			
 
				+        "exists": False,
			
 
				+        "complete": False,
			
 
				+        "target_note_path": None,
			
 
				+        "history_notes_path": None,
			
 
				+        "incomplete_files": [],
			
 
				+        "note_id": note_id
			
 
				+    }
			
 
				+
			
 
				+    # 构建路径
			
 
				+    input_dir = Path(output_dir) / note_id / "输入"
			
 
				+    target_note_path = input_dir / "待解构帖子.json"
			
 
				+    history_notes_path = input_dir / "作者历史帖子"
			
 
				+
			
 
				+    result["target_note_path"] = target_note_path
			
 
				+    result["history_notes_path"] = history_notes_path
			
 
				+
			
 
				+    # 检查输入目录是否存在
			
 
				+    if not input_dir.exists():
			
 
				+        return result
			
 
				+
			
 
				+    result["exists"] = True
			
 
				+
			
 
				+    # 检查待解构帖子是否存在且完整
			
 
				+    if not target_note_path.exists():
			
 
				+        result["incomplete_files"].append(str(target_note_path))
			
 
				+        return result
			
 
				+
			
 
				+    try:
			
 
				+        with open(target_note_path, 'r', encoding='utf-8') as f:
			
 
				+            target_note_data = json.load(f)
			
 
				+
			
 
				+        if not check_note_data_integrity(target_note_data):
			
 
				+            result["incomplete_files"].append(str(target_note_path))
			
 
				+    except Exception as e:
			
 
				+        result["incomplete_files"].append(f"{target_note_path} (读取错误: {e})")
			
 
				+
			
 
				+    # 检查历史帖子目录
			
 
				+    if not history_notes_path.exists():
			
 
				+        result["incomplete_files"].append(str(history_notes_path))
			
 
				+        return result
			
 
				+
			
 
				+    # 检查历史帖子文件的完整性
			
 
				+    history_files = list(history_notes_path.glob("*.json"))
			
 
				+
			
 
				+    if len(history_files) == 0:
			
 
				+        result["incomplete_files"].append(f"{history_notes_path} (没有历史帖子文件)")
			
 
				+    else:
			
 
				+        # 统计有效的历史帖子数量
			
 
				+        valid_history_count = 0
			
 
				+
			
 
				+        for history_file in history_files:
			
 
				+            try:
			
 
				+                with open(history_file, 'r', encoding='utf-8') as f:
			
 
				+                    history_note_data = json.load(f)
			
 
				+
			
 
				+                if not check_note_data_integrity(history_note_data):
			
 
				+                    result["incomplete_files"].append(str(history_file))
			
 
				+                else:
			
 
				+                    valid_history_count += 1
			
 
				+            except Exception as e:
			
 
				+                result["incomplete_files"].append(f"{history_file} (读取错误: {e})")
			
 
				+
			
 
				+        # 验证历史帖子数量必须大于4
			
 
				+        if valid_history_count <= 4:
			
 
				+            result["incomplete_files"].append(f"{history_notes_path} (有效历史帖子数量 {valid_history_count} ≤ 4，不满足要求)")
			
 
				+
			
 
				+    # 如果没有不完整的文件，则数据完整
			
 
				+    result["complete"] = len(result["incomplete_files"]) == 0
			
 
				+
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def delete_incomplete_data(note_id: str, output_dir: str = "examples") -> bool:
			
 
				+    """
			
 
				+    删除不完整的数据目录
			
 
				+
			
 
				+    Args:
			
 
				+        note_id: 帖子ID
			
 
				+        output_dir: 输出根目录
			
 
				+
			
 
				+    Returns:
			
 
				+        bool: 删除成功返回True，否则返回False
			
 
				+    """
			
 
				+    try:
			
 
				+        target_dir = Path(output_dir) / note_id
			
 
				+
			
 
				+        if target_dir.exists():
			
 
				+            shutil.rmtree(target_dir)
			
 
				+            print(f"  ✓ 已删除不完整数据目录: {target_dir}")
			
 
				+            return True
			
 
				+        else:
			
 
				+            print(f"  ⚠️  目录不存在: {target_dir}")
			
 
				+            return False
			
 
				+    except Exception as e:
			
 
				+        print(f"  ✗ 删除目录失败: {e}")
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def fetch_and_save_by_account(account_url: str, output_dir: str = "examples",
			
 
				+                               check_only: bool = False, skip_if_exists: bool = True,
			
 
				+                               clean_incomplete: bool = False):
			
 
				+    """
			
 
				+    根据账号URL获取数据并保存到本地
			
 
				+    选择点赞数最高的帖子作为待解构帖子
			
 
				+
			
 
				+    Args:
			
 
				+        account_url: 小红书账号URL
			
 
				+        output_dir: 输出目录，默认为examples
			
 
				+        check_only: 如果为True，只检查数据是否存在，不执行获取操作
			
 
				+        skip_if_exists: 如果为True且数据已存在且完整，则跳过获取
			
 
				+        clean_incomplete: 如果为True，检测到不完整数据时自动删除
			
 
				+    """
			
 
				+    print(f"\n{'='*80}")
			
 
				+    print(f"{'[检查模式]' if check_only else '[处理模式]'} 根据账号URL获取数据")
			
 
				+    print(f"{'='*80}")
			
 
				+
			
 
				+    # 1. 提取account_id
			
 
				+    print(f"正在解析账号URL: {account_url}")
			
 
				+    account_id = extract_account_id_from_url(account_url)
			
 
				+    print(f"提取到account_id: {account_id}")
			
 
				+
			
 
				+    # 2. 获取账号的所有历史帖子
			
 
				+    print(f"\n正在获取账号历史帖子...")
			
 
				+    history_notes = get_author_history_notes(account_id)
			
 
				+
			
 
				+    if not history_notes or len(history_notes) == 0:
			
 
				+        print("未找到历史帖子")
			
 
				+        return
			
 
				+
			
 
				+    print(f"找到 {len(history_notes)} 个历史帖子")
			
 
				+
			
 
				+    # 3. 找出点赞数最高的帖子
			
 
				+    print("\n正在分析点赞数...")
			
 
				+    max_like_note = max(history_notes, key=lambda x: x.get("like_count", 0))
			
 
				+    max_like_note_id = max_like_note.get("note_id", "")
			
 
				+    max_like_count = max_like_note.get("like_count", 0)
			
 
				+
			
 
				+    print(f"点赞数最高的帖子:")
			
 
				+    print(f"  - 帖子ID: {max_like_note_id}")
			
 
				+    print(f"  - 标题: {max_like_note.get('title', '无标题')}")
			
 
				+    print(f"  - 点赞数: {max_like_count}")
			
 
				+
			
 
				+    # 先检查数据是否已存在
			
 
				+    check_result = check_account_data_exists(max_like_note_id, output_dir=output_dir)
			
 
				+
			
 
				+    if check_result["exists"]:
			
 
				+        if check_result["complete"]:
			
 
				+            print(f"\n✓ 数据已存在且完整")
			
 
				+            print(f"  待解构帖子: {check_result['target_note_path']}")
			
 
				+            print(f"  历史帖子目录: {check_result['history_notes_path']}")
			
 
				+
			
 
				+            if check_only or skip_if_exists:
			
 
				+                print(f"{'  [检查模式] 跳过获取' if check_only else '  [跳过] 数据已完整'}")
			
 
				+                return
			
 
				+        else:
			
 
				+            print(f"\n⚠️  数据存在但不完整")
			
 
				+            print(f"  不完整的文件:")
			
 
				+            for incomplete_file in check_result["incomplete_files"]:
			
 
				+                print(f"    - {incomplete_file}")
			
 
				+
			
 
				+            # 如果启用了清理不完整数据的功能
			
 
				+            if clean_incomplete:
			
 
				+                print(f"  [清理模式] 删除不完整数据...")
			
 
				+                delete_incomplete_data(max_like_note_id, output_dir)
			
 
				+
			
 
				+            if check_only:
			
 
				+                print(f"  [检查模式] 需要重新获取")
			
 
				+                return
			
 
				+            else:
			
 
				+                print(f"  将重新获取数据...")
			
 
				+    else:
			
 
				+        print(f"\nℹ️  数据不存在")
			
 
				+        if check_only:
			
 
				+            print(f"  [检查模式] 需要获取")
			
 
				+            return
			
 
				+
			
 
				+    # 如果是检查模式，到这里就结束了
			
 
				+    if check_only:
			
 
				+        return
			
 
				+
			
 
				+    # 4. 处理点赞数最高的帖子（待解构帖子）
			
 
				+    print(f"\n正在处理待解构帖子...")
			
 
				+
			
 
				+    # 检查是否需要调用详情API补充数据
			
 
				+    need_detail = not (max_like_note.get("desc") or max_like_note.get("note_text") or max_like_note.get("body_text"))
			
 
				+
			
 
				+    target_note_detail = None
			
 
				+    if need_detail:
			
 
				+        print(f"  → 缺少正文，调用详情API补充...")
			
 
				+        target_note_detail = get_note_detail(max_like_note_id)
			
 
				+
			
 
				+    # 合并历史API和详情API的数据
			
 
				+    transformed_target = merge_note_data(max_like_note, target_note_detail)
			
 
				+
			
 
				+    # 5. 创建目录结构
			
 
				+    base_path = Path(output_dir) / max_like_note_id / "输入"
			
 
				+    history_path = base_path / "作者历史帖子"
			
 
				+
			
 
				+    # 6. 保存待解构帖子（点赞数最高的）
			
 
				+    target_note_path = base_path / "待解构帖子.json"
			
 
				+    save_note_to_file(transformed_target, target_note_path)
			
 
				+
			
 
				+    # 7. 为每个历史帖子处理数据并保存
			
 
				+    print(f"\n正在处理所有历史帖子...")
			
 
				+
			
 
				+    for idx, note in enumerate(history_notes, 1):
			
 
				+        history_note_id = note.get("note_id", "")
			
 
				+
			
 
				+        if history_note_id:
			
 
				+            print(f"  [{idx}/{len(history_notes)}] 处理帖子: {history_note_id}")
			
 
				+
			
 
				+            try:
			
 
				+                # 检查历史API数据是否缺少关键字段（主要是body_text）
			
 
				+                need_detail = not (note.get("desc") or note.get("note_text") or note.get("body_text"))
			
 
				+
			
 
				+                detail_data = None
			
 
				+                if need_detail:
			
 
				+                    print(f"    → 缺少正文，调用详情API补充...")
			
 
				+                    detail_data = get_note_detail(history_note_id)
			
 
				+                    # 添加请求间隔，避免频繁调用
			
 
				+                    if idx < len(history_notes):
			
 
				+                        time.sleep(0.5)
			
 
				+
			
 
				+                # 合并历史API和详情API的数据
			
 
				+                merged_note = merge_note_data(note, detail_data)
			
 
				+
			
 
				+                # 保存到文件
			
 
				+                history_note_path = history_path / f"{history_note_id}.json"
			
 
				+                save_note_to_file(merged_note, history_note_path)
			
 
				+            except Exception as e:
			
 
				+                print(f"  ⚠️  处理帖子 {history_note_id} 失败: {e}")
			
 
				+                continue
			
 
				+
			
 
				+    print(f"\n✓ 数据获取完成！")
			
 
				+    print(f"✓ 待解构帖子（点赞数最高）: {max_like_note_id}")
			
 
				+    print(f"✓ 共保存 {len(history_notes)} 个历史帖子详情")
			
 
				+    print(f"✓ 输出目录: {base_path}")
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    # 解析命令行参数
			
 
				+    parser = argparse.ArgumentParser(
			
 
				+        description='小红书账号数据获取脚本',
			
 
				+        formatter_class=argparse.RawDescriptionHelpFormatter,
			
 
				+        epilog="""
			
 
				+使用示例:
			
 
				+  # 获取账号数据
			
 
				+  python fetch_xhs_data_by_account.py <账号URL>
			
 
				+
			
 
				+  # 只检查数据是否存在且完整
			
 
				+  python fetch_xhs_data_by_account.py <账号URL> --check-only
			
 
				+
			
 
				+  # 检查并清理不完整数据
			
 
				+  python fetch_xhs_data_by_account.py <账号URL> --check-only --clean-incomplete
			
 
				+
			
 
				+  # 强制重新获取（即使数据已存在）
			
 
				+  python fetch_xhs_data_by_account.py <账号URL> --no-skip-if-exists
			
 
				+        """
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        'url',
			
 
				+        nargs='?',
			
 
				+        default='https://www.xiaohongshu.com/user/profile/5ff3e96a000000000100995a',
			
 
				+        help='小红书账号URL（可选，默认使用示例URL）'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--check-only',
			
 
				+        action='store_true',
			
 
				+        help='只检查数据是否存在且完整，不执行获取操作'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--no-skip-if-exists',
			
 
				+        action='store_true',
			
 
				+        help='即使数据已存在且完整也重新获取'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--clean-incomplete',
			
 
				+        action='store_true',
			
 
				+        help='自动删除检测到的不完整数据目录'
			
 
				+    )
			
 
				+
			
 
				+    parser.add_argument(
			
 
				+        '--output-dir',
			
 
				+        type=str,
			
 
				+        default='examples',
			
 
				+        help='输出根目录 (默认: examples)'
			
 
				+    )
			
 
				+
			
 
				+    args = parser.parse_args()
			
 
				+
			
 
				+    url = args.url
			
 
				+    check_only = args.check_only
			
 
				+    skip_if_exists = not args.no_skip_if_exists
			
 
				+    clean_incomplete = args.clean_incomplete
			
 
				+    output_dir = args.output_dir
			
 
				+
			
 
				+    print(f"{'='*80}")
			
 
				+    print(f"小红书账号数据{'检查' if check_only else '获取'}脚本")
			
 
				+    print(f"{'='*80}")
			
 
				+    print(f"账号URL: {url}")
			
 
				+    print(f"模式: {'只检查' if check_only else '获取数据'}")
			
 
				+    print(f"跳过已存在: {'是' if skip_if_exists else '否'}")
			
 
				+    print(f"清理不完整数据: {'是' if clean_incomplete else '否'}")
			
 
				+    print(f"输出目录: {output_dir}")
			
 
				+    print(f"{'='*80}")
			
 
				+
			
 
				+    try:
			
 
				+        fetch_and_save_by_account(
			
 
				+            url,
			
 
				+            output_dir=output_dir,
			
 
				+            check_only=check_only,
			
 
				+            skip_if_exists=skip_if_exists,
			
 
				+            clean_incomplete=clean_incomplete
			
 
				+        )
			
 
				+    except Exception as e:
			
 
				+        print(f"错误: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        return 1
			
 
				+
			
 
				+    return 0
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    exit(main())
			
--- a/scripts/xhs_utils.py
+++ b/scripts/xhs_utils.py
@@ -0,0 +1,245 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+小红书数据获取工具模块
			
 
				+包含共用的API调用、数据处理等函数
			
 
				+"""
			
 
				+
			
 
				+import json
			
 
				+import time
			
 
				+from typing import Dict, List
			
 
				+from datetime import datetime
			
 
				+import requests
			
 
				+
			
 
				+
			
 
				+# API配置
			
 
				+BASE_URL = "http://47.84.182.56:8001"
			
 
				+API_GET_DETAIL = f"{BASE_URL}/tools/call/get_xhs_detail_by_note_id"
			
 
				+API_GET_HISTORY = f"{BASE_URL}/tools/call/get_xhs_history_note_list_by_account_id"
			
 
				+
			
 
				+
			
 
				+def call_api(api_url: str, params: Dict, max_retries: int = 3) -> Dict:
			
 
				+    """
			
 
				+    调用API（带重试机制）
			
 
				+
			
 
				+    Args:
			
 
				+        api_url: API地址
			
 
				+        params: 请求参数
			
 
				+        max_retries: 最大重试次数
			
 
				+
			
 
				+    Returns:
			
 
				+        响应数据
			
 
				+    """
			
 
				+    for attempt in range(max_retries):
			
 
				+        try:
			
 
				+            print(f"调用API: {api_url}，参数: {params} (尝试 {attempt + 1}/{max_retries})")
			
 
				+            print(params)
			
 
				+            response = requests.post(api_url, json=params, timeout=600)
			
 
				+            response.raise_for_status()
			
 
				+            return response.json()
			
 
				+        except requests.exceptions.RequestException as e:
			
 
				+            if attempt < max_retries - 1:
			
 
				+                print(f"  API调用失败，{2}秒后重试... (尝试 {attempt + 1}/{max_retries})")
			
 
				+                time.sleep(2)
			
 
				+            else:
			
 
				+                print(f"API调用失败: {e}")
			
 
				+                raise
			
 
				+
			
 
				+
			
 
				+def get_note_detail(note_id: str) -> Dict:
			
 
				+    """
			
 
				+    获取帖子详情
			
 
				+
			
 
				+    Args:
			
 
				+        note_id: 帖子ID
			
 
				+
			
 
				+    Returns:
			
 
				+        帖子详情数据
			
 
				+    """
			
 
				+    params = {"note_id": note_id}
			
 
				+    result = call_api(API_GET_DETAIL, params)
			
 
				+
			
 
				+    # 解析API返回的数据结构
			
 
				+    try:
			
 
				+        if result.get("success") and result.get("result"):
			
 
				+            # result字段是一个JSON字符串，需要解析
			
 
				+            result_data = json.loads(result["result"])
			
 
				+            if isinstance(result_data, list) and len(result_data) > 0:
			
 
				+                # 返回第一个元素的data字段
			
 
				+                return result_data[0].get("data", {})
			
 
				+    except:
			
 
				+        print(result)
			
 
				+        raise
			
 
				+    return {}
			
 
				+
			
 
				+
			
 
				+def format_timestamp(timestamp_ms) -> str:
			
 
				+    """
			
 
				+    将毫秒时间戳转换为年月日时分秒格式
			
 
				+
			
 
				+    Args:
			
 
				+        timestamp_ms: 毫秒级时间戳
			
 
				+
			
 
				+    Returns:
			
 
				+        格式化的时间字符串 (YYYY-MM-DD HH:MM:SS)
			
 
				+    """
			
 
				+    try:
			
 
				+        if timestamp_ms:
			
 
				+            # 将毫秒时间戳转换为秒
			
 
				+            timestamp_s = int(timestamp_ms) / 1000
			
 
				+            dt = datetime.fromtimestamp(timestamp_s)
			
 
				+            return dt.strftime("%Y-%m-%d %H:%M:%S")
			
 
				+    except (ValueError, TypeError, OSError):
			
 
				+        pass
			
 
				+    return ""
			
 
				+
			
 
				+
			
 
				+def get_author_history_notes(account_id: str) -> List[Dict]:
			
 
				+    """
			
 
				+    获取作者历史帖子列表
			
 
				+
			
 
				+    Args:
			
 
				+        account_id: 账号ID
			
 
				+
			
 
				+    Returns:
			
 
				+        历史帖子列表
			
 
				+    """
			
 
				+    params = {"account_id": account_id}
			
 
				+    result = call_api(API_GET_HISTORY, params)
			
 
				+
			
 
				+    # 解析API返回的数据结构
			
 
				+    if result.get("success") and result.get("result"):
			
 
				+        # result字段是一个JSON字符串，需要解析
			
 
				+        result_data = json.loads(result["result"])
			
 
				+
			
 
				+        if isinstance(result_data, list) and len(result_data) > 0:
			
 
				+            # 历史帖子API返回格式: [{'data': [note1, note2, ...]}]
			
 
				+            # 提取第一个元素的data字段，它是一个帖子列表
			
 
				+            first_item = result_data[0]
			
 
				+            if isinstance(first_item, dict) and "data" in first_item:
			
 
				+                data = first_item.get("data")
			
 
				+                if isinstance(data, list):
			
 
				+                    return data
			
 
				+
			
 
				+    return []
			
 
				+
			
 
				+
			
 
				+def merge_note_data(history_data: Dict, detail_data: Dict) -> Dict:
			
 
				+    """
			
 
				+    合并历史API和详情API的数据，优先使用历史API数据
			
 
				+
			
 
				+    Args:
			
 
				+        history_data: 历史API返回的数据
			
 
				+        detail_data: 详情API返回的数据
			
 
				+
			
 
				+    Returns:
			
 
				+        合并后的数据
			
 
				+    """
			
 
				+    # 从历史数据提取基本信息
			
 
				+    note_id = history_data.get("note_id", "")
			
 
				+
			
 
				+    # 优先使用详情API的完整链接（包含token），否则用note_id拼接简单链接
			
 
				+    if detail_data and detail_data.get("content_link"):
			
 
				+        link = detail_data.get("content_link")
			
 
				+    else:
			
 
				+        link = f"https://www.xiaohongshu.com/explore/{note_id}" if note_id else ""
			
 
				+
			
 
				+    # 提取用户信息
			
 
				+    user_info = history_data.get("user", {})
			
 
				+    user_id = user_info.get("user_id", "") if isinstance(user_info, dict) else ""
			
 
				+    nickname = user_info.get("nickname", "") if isinstance(user_info, dict) else ""
			
 
				+
			
 
				+    # 提取图片列表（优先使用历史API的图片）
			
 
				+    images = []
			
 
				+    if "image_url_list" in history_data and isinstance(history_data["image_url_list"], list):
			
 
				+        images = [img.get("cdn_url") or img.get("url", "") for img in history_data["image_url_list"]]
			
 
				+    elif "cover" in history_data and isinstance(history_data["cover"], dict):
			
 
				+        cover_url = history_data["cover"].get("cdn_url") or history_data["cover"].get("url", "")
			
 
				+        if cover_url:
			
 
				+            images.append(cover_url)
			
 
				+
			
 
				+    # 如果历史API没有图片，尝试从详情API获取
			
 
				+    if detail_data:
			
 
				+        if "images" in detail_data and isinstance(detail_data["images"], list) and len(detail_data["images"]) > 0:
			
 
				+            images = [img.get("cdn_url") or img.get("url", "") for img in detail_data["images"]]
			
 
				+
			
 
				+    # 去重：保留第一次出现的图片，过滤空字符串
			
 
				+    seen = set()
			
 
				+    unique_images = []
			
 
				+    for img_url in images:
			
 
				+        if img_url and img_url not in seen:
			
 
				+            seen.add(img_url)
			
 
				+            unique_images.append(img_url)
			
 
				+    images = unique_images
			
 
				+
			
 
				+    # 提取发布时间戳（优先使用历史API数据）
			
 
				+    publish_timestamp = history_data.get("publish_timestamp") or (detail_data.get("publish_timestamp") if detail_data else None)
			
 
				+    publish_time = format_timestamp(publish_timestamp)
			
 
				+
			
 
				+    # 优先使用历史API的数据，缺失时从详情API补充
			
 
				+    merged = {
			
 
				+        "channel_content_id": note_id,
			
 
				+        "link": link,
			
 
				+        "comment_count": history_data.get("comment_count", detail_data.get("comment_count", 0) if detail_data else 0),
			
 
				+        "images": images,
			
 
				+        "like_count": history_data.get("like_count", detail_data.get("like_count", 0) if detail_data else 0),
			
 
				+        "body_text": history_data.get("desc") or history_data.get("note_text") or (detail_data.get("body_text", "") if detail_data else ""),
			
 
				+        "title": history_data.get("title", detail_data.get("title", "") if detail_data else ""),
			
 
				+        "collect_count": history_data.get("collecte_count") or history_data.get("collect_count", detail_data.get("collect_count", 0) if detail_data else 0),
			
 
				+        "channel_account_id": user_id or (detail_data.get("channel_account_id", "") if detail_data else ""),
			
 
				+        "channel_account_name": nickname or (detail_data.get("channel_account_name", "") if detail_data else ""),
			
 
				+        "content_type": history_data.get("type", detail_data.get("content_type", "") if detail_data else ""),
			
 
				+        "video": history_data.get("video", detail_data.get("video", {}) if detail_data else {}),
			
 
				+        "publish_timestamp": publish_timestamp if publish_timestamp else 0,
			
 
				+        "publish_time": publish_time
			
 
				+    }
			
 
				+
			
 
				+    return merged
			
 
				+
			
 
				+
			
 
				+def transform_note_data(note_data: Dict) -> Dict:
			
 
				+    """
			
 
				+    将详情API返回的数据转换为目标格式
			
 
				+
			
 
				+    Args:
			
 
				+        note_data: 详情API返回的原始数据
			
 
				+
			
 
				+    Returns:
			
 
				+        转换后的数据
			
 
				+    """
			
 
				+    # 提取图片URL列表
			
 
				+    images = []
			
 
				+    if "images" in note_data and isinstance(note_data["images"], list):
			
 
				+        # 优先取cdn_url，否则取url
			
 
				+        images = [img.get("cdn_url") or img.get("url", "") for img in note_data["images"]]
			
 
				+
			
 
				+    # 去重：保留第一次出现的图片，过滤空字符串
			
 
				+    seen = set()
			
 
				+    unique_images = []
			
 
				+    for img_url in images:
			
 
				+        if img_url and img_url not in seen:
			
 
				+            seen.add(img_url)
			
 
				+            unique_images.append(img_url)
			
 
				+    images = unique_images
			
 
				+
			
 
				+    # 提取发布时间戳并格式化
			
 
				+    publish_timestamp = note_data.get("publish_timestamp")
			
 
				+    publish_time = format_timestamp(publish_timestamp)
			
 
				+
			
 
				+    transformed = {
			
 
				+        "channel_content_id": note_data.get("channel_content_id", ""),
			
 
				+        "link": note_data.get("content_link", ""),
			
 
				+        "comment_count": note_data.get("comment_count", 0),
			
 
				+        "images": images,
			
 
				+        "like_count": note_data.get("like_count", 0),
			
 
				+        "body_text": note_data.get("body_text", ""),
			
 
				+        "title": note_data.get("title", ""),
			
 
				+        "collect_count": note_data.get("collect_count", 0),
			
 
				+        "channel_account_id": note_data.get("channel_account_id", ""),
			
 
				+        "channel_account_name": note_data.get("channel_account_name", ""),
			
 
				+        "content_type": note_data.get("content_type", ""),
			
 
				+        "video": note_data.get("video", {}),
			
 
				+        "publish_timestamp": publish_timestamp if publish_timestamp else 0,
			
 
				+        "publish_time": publish_time
			
 
				+    }
			
 
				+
			
 
				+    return transformed
			
--- a/step1_inspiration_match.py
+++ b/step1_inspiration_match.py
@@ -0,0 +1,280 @@
 
				+"""
			
 
				+灵感点与人设匹配分析 - Agent 框架版
			
 
				+
			
 
				+基于 how_decode_v1.py 的 Agent 框架实现
			
 
				+参考 step1_match_inspiration_to_persona_v11.py 的业务逻辑
			
 
				+"""
			
 
				+import asyncio
			
 
				+import json
			
 
				+import os
			
 
				+import sys
			
 
				+from typing import List, Dict
			
 
				+
			
 
				+from agents import trace
			
 
				+from agents.tracing.create import custom_span
			
 
				+from lib.my_trace import set_trace_smith as set_trace
			
 
				+from lib.async_utils import process_tasks_with_semaphore
			
 
				+from lib.match_analyzer import match_single
			
 
				+from lib.data_loader import load_persona_data, load_inspiration_list, select_inspiration
			
 
				+
			
 
				+# 模型配置
			
 
				+MODEL_NAME = "google/gemini-2.5-pro"
			
 
				+
			
 
				+
			
 
				+def build_context_str(perspective_name: str, level1_name: str = None) -> str:
			
 
				+    """构建上下文字符串
			
 
				+
			
 
				+    Args:
			
 
				+        perspective_name: 视角名称
			
 
				+        level1_name: 一级分类名称（仅在匹配二级分类时提供）
			
 
				+
			
 
				+    Returns:
			
 
				+        上下文字符串
			
 
				+    """
			
 
				+    if level1_name:
			
 
				+        # 匹配二级分类：包含视角和一级分类
			
 
				+        return f"""所属视角: {perspective_name}
			
 
				+一级分类: {level1_name}"""
			
 
				+    else:
			
 
				+        # 匹配一级分类：只包含视角
			
 
				+        return f"""所属视角: {perspective_name}"""
			
 
				+
			
 
				+
			
 
				+# ========== 核心匹配逻辑 ==========
			
 
				+async def match_single_task(task: dict, _index: int) -> dict:
			
 
				+    """执行单个匹配任务（异步版本）
			
 
				+
			
 
				+    Args:
			
 
				+        task: 匹配任务，包含：
			
 
				+            - 灵感: 灵感点文本
			
 
				+            - 要素: 要素名称
			
 
				+            - 要素类型: "一级分类" 或 "二级分类"
			
 
				+            - 上下文: 上下文字符串
			
 
				+        _index: 任务索引（由 async_utils 传入，此处未使用）
			
 
				+
			
 
				+    Returns:
			
 
				+        匹配结果
			
 
				+    """
			
 
				+    inspiration = task["灵感"]
			
 
				+    element = task["要素"]
			
 
				+    context_str = task["上下文"]
			
 
				+
			
 
				+    # 调用通用匹配模块（内部已包含错误处理和 custom_span 追踪）
			
 
				+    # B = 灵感, A = 要素, A_Context = 上下文
			
 
				+    match_result = await match_single(
			
 
				+        b_content=inspiration,
			
 
				+        a_content=element,
			
 
				+        model_name=MODEL_NAME,
			
 
				+        a_context=context_str  # 要素的上下文
			
 
				+    )
			
 
				+
			
 
				+    # 构建完整结果（通用字段 + 业务信息统一存储在最后）
			
 
				+    full_result = {
			
 
				+        "输入信息": {
			
 
				+            "B": inspiration,         # 待匹配：灵感
			
 
				+            "A": element,             # 上下文：要素
			
 
				+            "B_Context": "",          # B的上下文（暂时为空）
			
 
				+            "A_Context": context_str  # A的上下文：所属视角/一级分类
			
 
				+        },
			
 
				+        "匹配结果": match_result,     # {"相同部分": {}, "增量部分": {}, "score": 0.0, "score说明": ""}
			
 
				+        "业务信息": {                 # 业务语义信息（统一存储在最后）
			
 
				+            "灵感": inspiration,
			
 
				+            "匹配要素": element
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    return full_result
			
 
				+
			
 
				+
			
 
				+# ========== 任务构建 ==========
			
 
				+def build_match_tasks(
			
 
				+        persona_data: dict,
			
 
				+        inspiration: str,
			
 
				+        max_tasks: int = None
			
 
				+) -> List[dict]:
			
 
				+    """构建匹配任务列表
			
 
				+
			
 
				+    Args:
			
 
				+        persona_data: 人设数据
			
 
				+        inspiration: 灵感点
			
 
				+        max_tasks: 最大任务数（None 表示不限制）
			
 
				+
			
 
				+    Returns:
			
 
				+        任务列表
			
 
				+    """
			
 
				+    tasks = []
			
 
				+
			
 
				+    # 从"灵感点列表"中提取任务
			
 
				+    for perspective in persona_data.get("灵感点列表", []):
			
 
				+        if max_tasks is not None and len(tasks) >= max_tasks:
			
 
				+            break
			
 
				+
			
 
				+        perspective_name = perspective.get("视角名称", "")
			
 
				+
			
 
				+        for pattern in perspective.get("模式列表", []):
			
 
				+            if max_tasks is not None and len(tasks) >= max_tasks:
			
 
				+                break
			
 
				+
			
 
				+            level1_name = pattern.get("分类名称", "")
			
 
				+
			
 
				+            # 添加一级分类任务
			
 
				+            context_str = build_context_str(perspective_name)
			
 
				+            tasks.append({
			
 
				+                "灵感": inspiration,
			
 
				+                "要素": level1_name,
			
 
				+                "要素类型": "一级分类",
			
 
				+                "上下文": context_str
			
 
				+            })
			
 
				+
			
 
				+            # 添加该一级下的所有二级分类任务
			
 
				+            for level2 in pattern.get("二级细分", []):
			
 
				+                if max_tasks is not None and len(tasks) >= max_tasks:
			
 
				+                    break
			
 
				+
			
 
				+                level2_name = level2.get("分类名称", "")
			
 
				+
			
 
				+                context_str = build_context_str(perspective_name, level1_name)
			
 
				+                tasks.append({
			
 
				+                    "灵感": inspiration,
			
 
				+                    "要素": level2_name,
			
 
				+                    "要素类型": "二级分类",
			
 
				+                    "上下文": context_str
			
 
				+                })
			
 
				+
			
 
				+    return tasks
			
 
				+
			
 
				+
			
 
				+# ========== 核心业务逻辑 ==========
			
 
				+async def process_inspiration_match(
			
 
				+        persona_data: dict,
			
 
				+        inspiration: str,
			
 
				+        max_tasks: int = None,
			
 
				+        max_concurrent: int = 3,
			
 
				+        current_time: str = None,
			
 
				+        log_url: str = None
			
 
				+) -> dict:
			
 
				+    """执行灵感与人设匹配分析（核心业务逻辑）
			
 
				+
			
 
				+    Args:
			
 
				+        persona_data: 人设数据字典
			
 
				+        inspiration: 灵感点文本
			
 
				+        max_tasks: 最大任务数（None 表示不限制）
			
 
				+        max_concurrent: 最大并发数
			
 
				+        current_time: 当前时间戳
			
 
				+        log_url: 日志链接
			
 
				+
			
 
				+    Returns:
			
 
				+        匹配结果字典，包含元数据和匹配结果列表
			
 
				+    """
			
 
				+    # 构建匹配任务
			
 
				+    test_tasks = build_match_tasks(persona_data, inspiration, max_tasks)
			
 
				+
			
 
				+    print(f"\n开始匹配分析: {inspiration}")
			
 
				+    print(f"任务数: {len(test_tasks)}, 模型: {MODEL_NAME}\n")
			
 
				+
			
 
				+    # 使用 custom_span 标识整个匹配流程
			
 
				+    with custom_span(
			
 
				+        name=f"Step1: 灵感与人设匹配 - {inspiration}",
			
 
				+        data={
			
 
				+            "灵感": inspiration,
			
 
				+            "任务总数": len(test_tasks),
			
 
				+            "模型": MODEL_NAME,
			
 
				+            "并发数": max_concurrent,
			
 
				+            "步骤": "字面语义匹配分析"
			
 
				+        }
			
 
				+    ):
			
 
				+        # 异步并发执行匹配（match_single_task 内部已处理所有错误）
			
 
				+        results = await process_tasks_with_semaphore(
			
 
				+            test_tasks,
			
 
				+            match_single_task,
			
 
				+            max_concurrent=max_concurrent,
			
 
				+            show_progress=True
			
 
				+        )
			
 
				+
			
 
				+        # 按 score 降序排序
			
 
				+        results.sort(key=lambda x: x.get('匹配结果', {}).get('score', 0), reverse=True)
			
 
				+
			
 
				+    # 构建输出结果
			
 
				+    output = {
			
 
				+        "元数据": {
			
 
				+            "current_time": current_time,
			
 
				+            "log_url": log_url,
			
 
				+            "model": MODEL_NAME
			
 
				+        },
			
 
				+        "灵感": inspiration,
			
 
				+        "匹配结果列表": results
			
 
				+    }
			
 
				+
			
 
				+    return output
			
 
				+
			
 
				+
			
 
				+# ========== 主函数 ==========
			
 
				+async def main(current_time: str = None, log_url: str = None):
			
 
				+    """主函数：负责参数解析、文件读取、结果保存
			
 
				+
			
 
				+    Args:
			
 
				+        current_time: 当前时间戳（从外部传入）
			
 
				+        log_url: 日志链接（从外部传入）
			
 
				+    """
			
 
				+    # 解析命令行参数
			
 
				+    # 第一个参数：人设文件夹路径（默认值）
			
 
				+    if len(sys.argv) > 1:
			
 
				+        persona_dir = sys.argv[1]
			
 
				+    else:
			
 
				+        persona_dir = "data/阿里多多酱/out/人设_1110"
			
 
				+
			
 
				+    # 第二个参数：灵感索引（数字）或灵感名称（字符串），默认为 0
			
 
				+    inspiration_arg = sys.argv[2] if len(sys.argv) > 2 else "0"
			
 
				+
			
 
				+    # 第三个参数：任务数限制，默认为 None（所有任务）
			
 
				+    max_tasks = None if len(sys.argv) > 3 and sys.argv[3] == "all" else (
			
 
				+        int(sys.argv[3]) if len(sys.argv) > 3 else None
			
 
				+    )
			
 
				+
			
 
				+    # 加载数据（使用辅助函数，失败时自动退出）
			
 
				+    persona_data = load_persona_data(persona_dir)
			
 
				+    inspiration_list = load_inspiration_list(persona_dir)
			
 
				+    test_inspiration = select_inspiration(inspiration_arg, inspiration_list)
			
 
				+
			
 
				+    # 执行核心业务逻辑
			
 
				+    output = await process_inspiration_match(
			
 
				+        persona_data=persona_data,
			
 
				+        inspiration=test_inspiration,
			
 
				+        max_tasks=max_tasks,
			
 
				+        max_concurrent=5,
			
 
				+        current_time=current_time,
			
 
				+        log_url=log_url
			
 
				+    )
			
 
				+
			
 
				+    # 保存结果文件
			
 
				+    # 路径格式：how/灵感点/[灵感点名称]/[top_n?]_[步骤名称中文]_[模型名称].json
			
 
				+    output_dir = os.path.join(persona_dir, "how", "灵感点", test_inspiration)
			
 
				+
			
 
				+    # 提取模型名称
			
 
				+    model_name_short = MODEL_NAME.replace("google/", "").replace("/", "_")
			
 
				+    step_name_cn = "灵感人设匹配"
			
 
				+
			
 
				+    # 构建文件名：范围标识（all 或 top_n）+ step1 + 步骤名称中文 + 模型名称
			
 
				+    scope_prefix = f"top{max_tasks}" if max_tasks is not None else "all"
			
 
				+    output_filename = f"{scope_prefix}_step1_{step_name_cn}_{model_name_short}.json"
			
 
				+
			
 
				+    # 确保目录存在
			
 
				+    os.makedirs(output_dir, exist_ok=True)
			
 
				+    output_file = os.path.join(output_dir, output_filename)
			
 
				+
			
 
				+    # 保存结果
			
 
				+    with open(output_file, 'w', encoding='utf-8') as f:
			
 
				+        json.dump(output, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+    print(f"\n完成！结果已保存到: {output_file}")
			
 
				+    if log_url:
			
 
				+        print(f"Trace: {log_url}\n")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # 设置 trace
			
 
				+    current_time, log_url = set_trace()
			
 
				+
			
 
				+    # 使用 trace 上下文包裹整个执行流程
			
 
				+    with trace("灵感与人设匹配"):
			
 
				+        asyncio.run(main(current_time, log_url))
			
--- a/step2_incremental_match.py
+++ b/step2_incremental_match.py
@@ -0,0 +1,273 @@
 
				+"""
			
 
				+Step2: 增量词在人设中的匹配分析
			
 
				+
			
 
				+基于 Step1 的匹配结果（取 Top1），分析增量词在人设系统中的匹配情况
			
 
				+"""
			
 
				+import os
			
 
				+import sys
			
 
				+import json
			
 
				+import asyncio
			
 
				+from pathlib import Path
			
 
				+
			
 
				+from agents import trace
			
 
				+from agents.tracing.create import custom_span
			
 
				+from lib.my_trace import set_trace_smith as set_trace
			
 
				+from lib.match_analyzer import match_batch
			
 
				+from lib.data_loader import load_persona_data, load_inspiration_data, select_inspiration
			
 
				+
			
 
				+# 模型配置
			
 
				+MODEL_NAME = "google/gemini-2.5-pro"
			
 
				+
			
 
				+
			
 
				+def format_persona_system(persona_data: dict) -> str:
			
 
				+    """格式化完整人设系统为参考文本
			
 
				+
			
 
				+    Args:
			
 
				+        persona_data: 人设数据
			
 
				+
			
 
				+    Returns:
			
 
				+        格式化的人设系统文本
			
 
				+    """
			
 
				+    lines = ["# 人设系统"]
			
 
				+
			
 
				+    # 处理三个部分：灵感点列表、目的点、关键点列表
			
 
				+    for section_key, section_title in [
			
 
				+        ("灵感点列表", "【灵感点】灵感的来源和性质"),
			
 
				+        ("目的点", "【目的点】创作的目的和价值导向"),
			
 
				+        ("关键点列表", "【关键点】内容的核心主体和表达方式")
			
 
				+    ]:
			
 
				+        section_data = persona_data.get(section_key, [])
			
 
				+        if not section_data:
			
 
				+            continue
			
 
				+
			
 
				+        lines.append(f"\n## {section_title}\n")
			
 
				+
			
 
				+        for perspective in section_data:
			
 
				+            perspective_name = perspective.get("视角名称", "")
			
 
				+            lines.append(f"\n### 视角：{perspective_name}")
			
 
				+
			
 
				+            for pattern in perspective.get("模式列表", []):
			
 
				+                pattern_name = pattern.get("分类名称", "")
			
 
				+                pattern_def = pattern.get("核心定义", "")
			
 
				+                lines.append(f"\n  【一级】{pattern_name}")
			
 
				+                if pattern_def:
			
 
				+                    lines.append(f"    定义：{pattern_def}")
			
 
				+
			
 
				+                # 二级细分
			
 
				+                for sub in pattern.get("二级细分", []):
			
 
				+                    sub_name = sub.get("分类名称", "")
			
 
				+                    sub_def = sub.get("分类定义", "")
			
 
				+                    lines.append(f"    【二级】{sub_name}：{sub_def}")
			
 
				+
			
 
				+    return "\n".join(lines)
			
 
				+
			
 
				+
			
 
				+def find_step1_file(persona_dir: str, inspiration: str, model_name: str) -> str:
			
 
				+    """查找 step1 输出文件
			
 
				+
			
 
				+    Args:
			
 
				+        persona_dir: 人设目录
			
 
				+        inspiration: 灵感点名称
			
 
				+        model_name: 模型名称
			
 
				+
			
 
				+    Returns:
			
 
				+        step1 文件路径
			
 
				+
			
 
				+    Raises:
			
 
				+        SystemExit: 找不到文件时退出
			
 
				+    """
			
 
				+    step1_dir = os.path.join(persona_dir, "how", "灵感点", inspiration)
			
 
				+    model_name_short = model_name.replace("google/", "").replace("/", "_")
			
 
				+    step1_file_pattern = f"*_step1_*_{model_name_short}.json"
			
 
				+
			
 
				+    step1_files = list(Path(step1_dir).glob(step1_file_pattern))
			
 
				+    if not step1_files:
			
 
				+        print(f"❌ 找不到 step1 输出文件")
			
 
				+        print(f"查找路径: {step1_dir}/{step1_file_pattern}")
			
 
				+        sys.exit(1)
			
 
				+
			
 
				+    return str(step1_files[0])
			
 
				+
			
 
				+
			
 
				+async def process_step2_incremental_match(
			
 
				+    step1_top1: dict,
			
 
				+    persona_data: dict,
			
 
				+    inspiration: str,
			
 
				+    current_time: str = None,
			
 
				+    log_url: str = None
			
 
				+) -> dict:
			
 
				+    """执行增量词匹配分析（核心业务逻辑）
			
 
				+
			
 
				+    Args:
			
 
				+        step1_top1: step1 的 top1 匹配结果
			
 
				+        persona_data: 人设数据
			
 
				+        inspiration: 灵感名称
			
 
				+        current_time: 当前时间戳
			
 
				+        log_url: trace URL
			
 
				+
			
 
				+    Returns:
			
 
				+        匹配结果字典
			
 
				+    """
			
 
				+    # 从 step1 结果中提取信息
			
 
				+    business_info = step1_top1.get("业务信息", {})
			
 
				+    match_result = step1_top1.get("匹配结果", {})
			
 
				+
			
 
				+    step1_inspiration = business_info.get("灵感", "")
			
 
				+    matched_element = business_info.get("匹配要素", "")
			
 
				+    incremental_parts = match_result.get("增量部分", {})
			
 
				+    incremental_words = list(incremental_parts.keys())
			
 
				+
			
 
				+    # 格式化人设系统
			
 
				+    persona_system_text = format_persona_system(persona_data)
			
 
				+
			
 
				+    # 构建补充上下文（B_Context - 统一构造一次）
			
 
				+    b_context = f"""这些增量词来自灵感「{step1_inspiration}」，
			
 
				+在 step1 匹配中，与人设要素「{matched_element}」匹配时产生的增量部分。"""
			
 
				+
			
 
				+    if not incremental_words:
			
 
				+        print("⚠️  Top1 结果没有增量词，跳过分析")
			
 
				+        return {
			
 
				+            "元数据": {
			
 
				+                "current_time": current_time,
			
 
				+                "log_url": log_url,
			
 
				+                "model": MODEL_NAME,
			
 
				+                "步骤": "Step2: 增量词在人设中的匹配"
			
 
				+            },
			
 
				+            "灵感": step1_inspiration,
			
 
				+            "输入信息": {
			
 
				+                "B": [],
			
 
				+                "A": persona_system_text,
			
 
				+                "B_Context": b_context,  # 使用统一构造的 context
			
 
				+                "A_Context": ""
			
 
				+            },
			
 
				+            "step1_结果": step1_top1,
			
 
				+            "匹配结果": []
			
 
				+        }
			
 
				+
			
 
				+    print(f"\n开始增量词匹配分析: {step1_inspiration}")
			
 
				+    print(f"匹配要素: {matched_element}")
			
 
				+    print(f"增量词数量: {len(incremental_words)}, 模型: {MODEL_NAME}\n")
			
 
				+
			
 
				+    # 使用 custom_span 标识整个流程
			
 
				+    with custom_span(
			
 
				+        name=f"Step2: 增量词匹配 - {step1_inspiration}",
			
 
				+        data={
			
 
				+            "灵感": step1_inspiration,
			
 
				+            "匹配要素": matched_element,
			
 
				+            "增量词数量": len(incremental_words),
			
 
				+            "模型": MODEL_NAME,
			
 
				+            "步骤": "增量词在人设中的匹配分析"
			
 
				+        }
			
 
				+    ):
			
 
				+        # 调用通用批量匹配模块
			
 
				+        match_results = await match_batch(
			
 
				+            b_items=incremental_words,
			
 
				+            a_content=persona_system_text,
			
 
				+            model_name=MODEL_NAME,
			
 
				+            b_context=b_context
			
 
				+        )
			
 
				+
			
 
				+    # 按 score 降序排序
			
 
				+    if isinstance(match_results, list):
			
 
				+        match_results.sort(key=lambda x: x.get('score', 0), reverse=True)
			
 
				+
			
 
				+    # 构建输出（使用统一构造的变量）
			
 
				+    return {
			
 
				+        "元数据": {
			
 
				+            "current_time": current_time,
			
 
				+            "log_url": log_url,
			
 
				+            "model": MODEL_NAME,
			
 
				+            "步骤": "Step2: 增量词在人设中的匹配"
			
 
				+        },
			
 
				+        "灵感": step1_inspiration,
			
 
				+        "输入信息": {
			
 
				+            "B": incremental_words,
			
 
				+            "A": persona_system_text,
			
 
				+            "B_Context": b_context,  # 使用统一构造的 context
			
 
				+            "A_Context": ""
			
 
				+        },
			
 
				+        "匹配结果": match_results,
			
 
				+        "step1_结果": step1_top1,
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+async def main(current_time: str, log_url: str):
			
 
				+    """主函数"""
			
 
				+    # 解析命令行参数
			
 
				+    persona_dir = sys.argv[1] if len(sys.argv) > 1 else "data/阿里多多酱/out/人设_1110"
			
 
				+    inspiration_arg = sys.argv[2] if len(sys.argv) > 2 else "0"
			
 
				+
			
 
				+    print(f"{'=' * 80}")
			
 
				+    print(f"Step2: 增量词在人设中的匹配分析（Top1）")
			
 
				+    print(f"{'=' * 80}")
			
 
				+    print(f"人设目录: {persona_dir}")
			
 
				+    print(f"灵感参数: {inspiration_arg}")
			
 
				+
			
 
				+    # 加载数据
			
 
				+    persona_data = load_persona_data(persona_dir)
			
 
				+    inspiration_data = load_inspiration_data(persona_dir)
			
 
				+    inspiration_list = [item["灵感点"] for item in inspiration_data]
			
 
				+    test_inspiration = select_inspiration(inspiration_arg, inspiration_list)
			
 
				+
			
 
				+    # 查找并加载 step1 结果
			
 
				+    step1_file = find_step1_file(persona_dir, test_inspiration, MODEL_NAME)
			
 
				+    step1_filename = os.path.basename(step1_file)
			
 
				+    step1_basename = os.path.splitext(step1_filename)[0]
			
 
				+
			
 
				+    print(f"Step1 输入文件: {step1_file}")
			
 
				+
			
 
				+    with open(step1_file, 'r', encoding='utf-8') as f:
			
 
				+        step1_data = json.load(f)
			
 
				+
			
 
				+    actual_inspiration = step1_data.get("灵感", "")
			
 
				+    step1_results = step1_data.get("匹配结果列表", [])
			
 
				+
			
 
				+    if not step1_results:
			
 
				+        print("❌ step1 结果为空")
			
 
				+        sys.exit(1)
			
 
				+
			
 
				+    print(f"灵感: {actual_inspiration}")
			
 
				+
			
 
				+    # 默认处理 top1，后续可以支持指定第几个
			
 
				+    result_index = 0  # 使用第 1 个匹配结果（top1）
			
 
				+    selected_result = step1_results[result_index]
			
 
				+    print(f"处理第 {result_index + 1} 个匹配结果（Top{result_index + 1}）\n")
			
 
				+
			
 
				+    # 执行核心业务逻辑
			
 
				+    output = await process_step2_incremental_match(
			
 
				+        step1_top1=selected_result,
			
 
				+        persona_data=persona_data,
			
 
				+        inspiration=actual_inspiration,
			
 
				+        current_time=current_time,
			
 
				+        log_url=log_url
			
 
				+    )
			
 
				+
			
 
				+    # 在元数据中添加 step1 匹配索引
			
 
				+    output["元数据"]["step1_匹配索引"] = result_index + 1
			
 
				+
			
 
				+    # 保存结果
			
 
				+    output_dir = os.path.join(persona_dir, "how", "灵感点", test_inspiration)
			
 
				+    model_name_short = MODEL_NAME.replace("google/", "").replace("/", "_")
			
 
				+
			
 
				+    # 提取 step1 的范围标识（all 或 top10 等）
			
 
				+    scope_prefix = step1_basename.split("_")[0]  # 提取 "all" 或 "top10" 等
			
 
				+    output_filename = f"{scope_prefix}_step2_top{result_index + 1}_增量词匹配_{model_name_short}.json"
			
 
				+
			
 
				+    os.makedirs(output_dir, exist_ok=True)
			
 
				+    output_file = os.path.join(output_dir, output_filename)
			
 
				+
			
 
				+    with open(output_file, 'w', encoding='utf-8') as f:
			
 
				+        json.dump(output, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+    print(f"\n完成！结果已保存到: {output_file}")
			
 
				+    if log_url:
			
 
				+        print(f"Trace: {log_url}\n")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # 设置 trace
			
 
				+    current_time, log_url = set_trace()
			
 
				+
			
 
				+    # 使用 trace 上下文包裹整个执行流程
			
 
				+    with trace("Step2: 增量词匹配分析"):
			
 
				+        asyncio.run(main(current_time, log_url))
			
--- a/test_how_decode_v9.py
+++ b/test_how_decode_v9.py
@@ -1,101 +0,0 @@
 
				-"""
			
 
				-HOW解构V9测试脚本
			
 
				-
			
 
				-测试集：从三个账号各选2个帖子，共6个测试用例
			
 
				-"""
			
 
				-
			
 
				-import asyncio
			
 
				-import sys
			
 
				-import os
			
 
				-
			
 
				-# 添加当前目录到path
			
 
				-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
			
 
				-
			
 
				-from how_decode_v9_point_dependency import main
			
 
				-
			
 
# Test data set: two posts picked from each account.
# Each entry pairs an account name with the input JSON files to run for it.
TEST_CASES = [
    # 阿里多多酱 (pet-themed account)
    {
        "account": "阿里多多酱",
        "files": [
            "examples_new/阿里多多酱/output/685b593800000000120141d3_20251104_111017.json",
            "examples_new/阿里多多酱/output/6865e3ac00000000100251b6_20251104_111021.json",
        ]
    },
    # 白流苏 (nail-art account)
    {
        "account": "白流苏",
        "files": [
            "examples_new/白流苏/output/682c53fd000000000303c64f_20251104_112822.json",
            "examples_new/白流苏/output/6839827d0000000003039f7e_20251104_112821.json",
        ]
    },
    # 摸鱼阿希
    {
        "account": "摸鱼阿希",
        "files": [
            "examples_new/摸鱼阿希/output/61bdc28b0000000001024896_20251104_132959.json",
            "examples_new/摸鱼阿希/output/66619827000000000600486f_20251104_133004.json",
        ]
    }
]
			
 
				-
			
 
				-
			
 
				-async def run_test_case(file_path: str, case_num: int, total: int):
			
 
				-    """运行单个测试用例"""
			
 
				-    print("\n" + "="*100)
			
 
				-    print(f"测试用例 {case_num}/{total}: {file_path}")
			
 
				-    print("="*100)
			
 
				-
			
 
				-    # 临时修改sys.argv来模拟命令行参数
			
 
				-    original_argv = sys.argv.copy()
			
 
				-    sys.argv = ["test_script", file_path]
			
 
				-
			
 
				-    try:
			
 
				-        from lib.my_trace import set_trace
			
 
				-        current_time, log_url = set_trace()
			
 
				-        from agents import trace
			
 
				-        with trace(f"test case {case_num}"):
			
 
				-            await main(current_time, log_url)
			
 
				-        print(f"\n✅ 测试用例 {case_num} 完成")
			
 
				-    except Exception as e:
			
 
				-        print(f"\n❌ 测试用例 {case_num} 失败: {e}")
			
 
				-        import traceback
			
 
				-        traceback.print_exc()
			
 
				-    finally:
			
 
				-        # 恢复原始参数
			
 
				-        sys.argv = original_argv
			
 
				-
			
 
				-
			
 
				-async def run_all_tests():
			
 
				-    """运行所有测试用例"""
			
 
				-    all_files = []
			
 
				-
			
 
				-    # 收集所有测试文件
			
 
				-    for account_data in TEST_CASES:
			
 
				-        account = account_data["account"]
			
 
				-        for file_path in account_data["files"]:
			
 
				-            all_files.append((account, file_path))
			
 
				-
			
 
				-    total = len(all_files)
			
 
				-    print(f"\n{'='*100}")
			
 
				-    print(f"HOW解构V9测试 - 共 {total} 个测试用例")
			
 
				-    print(f"{'='*100}")
			
 
				-
			
 
				-    for idx, (account, file_path) in enumerate(all_files, 1):
			
 
				-        print(f"\n账号: {account}")
			
 
				-        await run_test_case(file_path, idx, total)
			
 
				-
			
 
				-        # 添加延迟避免API限流
			
 
				-        if idx < total:
			
 
				-            print(f"\n⏸ 等待2秒后继续下一个测试用例...")
			
 
				-            await asyncio.sleep(2)
			
 
				-
			
 
				-    print("\n" + "="*100)
			
 
				-    print(f"✅ 所有 {total} 个测试用例完成！")
			
 
				-    print("="*100)
			
 
				-
			
 
				-
			
 
if __name__ == "__main__":
    # Script entry point: drive the async test runner to completion.
    asyncio.run(run_all_tests())
			
--- a/test_how_decode_v9.sh
+++ b/test_how_decode_v9.sh
@@ -1,68 +0,0 @@
 
				-#!/bin/bash
			
 
				-
			
 
				-# HOW解构V9测试脚本
			
 
				-# 测试集：从三个账号各选2个帖子，共6个测试用例
			
 
				-
			
 
				-echo "========================================================================================================"
			
 
				-echo "HOW解构V9测试 - 共 6 个测试用例"
			
 
				-echo "========================================================================================================"
			
 
				-
			
 
				-# 测试用例数组
			
 
				-test_cases=(
			
 
				-    # 阿里多多酱（宠物类账号）
			
 
				-    # "阿里多多酱|examples_new/阿里多多酱/output/685b593800000000120141d3_20251104_111017.json"
			
 
				-    # "阿里多多酱|examples_new/阿里多多酱/output/6865e3ac00000000100251b6_20251104_111021.json"
			
 
				-
			
 
				-    # 白流苏（美甲类账号）
			
 
				-    "白流苏|examples_new/白流苏/output/682c53fd000000000303c64f_20251104_112822.json"
			
 
				-    "白流苏|examples_new/白流苏/output/6839827d0000000003039f7e_20251104_112821.json"
			
 
				-
			
 
				-    # 摸鱼阿希
			
 
				-    "摸鱼阿希|examples_new/摸鱼阿希/output/61bdc28b0000000001024896_20251104_132959.json"
			
 
				-    "摸鱼阿希|examples_new/摸鱼阿希/output/66619827000000000600486f_20251104_133004.json"
			
 
				-)
			
 
				-
			
 
				-total=${#test_cases[@]}
			
 
				-current=1
			
 
				-success_count=0
			
 
				-failed_count=0
			
 
				-
			
 
				-for test_case in "${test_cases[@]}"; do
			
 
				-    # 分割账号和文件路径
			
 
				-    IFS='|' read -r account file_path <<< "$test_case"
			
 
				-
			
 
				-    echo ""
			
 
				-    echo "========================================================================================================"
			
 
				-    echo "测试用例 $current/$total"
			
 
				-    echo "账号: $account"
			
 
				-    echo "文件: $file_path"
			
 
				-    echo "========================================================================================================"
			
 
				-
			
 
				-    # 运行测试
			
 
				-    if python how_decode_v9_point_dependency.py "$file_path"; then
			
 
				-        echo ""
			
 
				-        echo "✅ 测试用例 $current 完成"
			
 
				-        ((success_count++))
			
 
				-    else
			
 
				-        echo ""
			
 
				-        echo "❌ 测试用例 $current 失败"
			
 
				-        ((failed_count++))
			
 
				-    fi
			
 
				-
			
 
				-    # 除了最后一个，都等待2秒
			
 
				-    if [ $current -lt $total ]; then
			
 
				-        echo ""
			
 
				-        echo "⏸  等待2秒后继续下一个测试用例..."
			
 
				-        sleep 2
			
 
				-    fi
			
 
				-
			
 
				-    ((current++))
			
 
				-done
			
 
				-
			
 
				-echo ""
			
 
				-echo "========================================================================================================"
			
 
				-echo "测试完成！"
			
 
				-echo "总计: $total 个用例"
			
 
				-echo "成功: $success_count 个"
			
 
				-echo "失败: $failed_count 个"
			
 
				-echo "========================================================================================================"