
feat: add next-step analysis and iterative derivation

- Add step 4 (next-step analysis) and step 5 (full iteration loop)
- Add a persona global-ratio field to refine the derivation
- Add a category attribute on nodes to carry classification info
- Add helper analysis scripts (analyze_creation_origin etc.)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
yangxiaohui, 23 hours ago
parent commit f10b5afffc

+ 687 - 0
script/data_processing/analyze_creation_origin.py

@@ -0,0 +1,687 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Creation-origin analysis.
+
+Combines the two-step flow of data preparation + AI analysis:
+1. Prepare the data to analyze from the post graph + persona graph
+2. Call the AI to analyze the creative origin
+
+Input: post graph + persona graph
+Output: origin-analysis result
+"""
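+# Usage (flags are defined in the __main__ block at the bottom of this file):
+#   python script/data_processing/analyze_creation_origin.py --post-id <id>
+#   python script/data_processing/analyze_creation_origin.py --all-posts --force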
+
+import asyncio
+import json
+from pathlib import Path
+from typing import Dict, List, Optional
+import sys
+
+# Add the project root to sys.path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from agents import Agent, Runner, ModelSettings, trace
+from agents.tracing.create import custom_span
+from lib.client import get_model
+from lib.my_trace import set_trace_smith as set_trace
+from script.data_processing.path_config import PathConfig
+
+
+# ===== Configuration =====
+MODEL_NAME = "google/gemini-3-pro-preview"
+# MODEL_NAME = "anthropic/claude-sonnet-4"
+
+MATCH_SCORE_THRESHOLD = 0.8  # persona-match score threshold
+GLOBAL_RATIO_THRESHOLD = 0.8  # persona global-ratio threshold
+
+agent = Agent(
+    name="Creation Origin Analyzer",
+    model=get_model(MODEL_NAME),
+    model_settings=ModelSettings(
+        temperature=0.0,
+        max_tokens=8192,
+    ),
+    tools=[],
+)
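+# temperature=0.0 with a fixed prompt keeps repeated scoring runs as
+# reproducible as the backend allows; max_tokens=8192 bounds the JSON answer.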
+
+
+# ===== Data loading =====
+
+def load_json(file_path: Path) -> Dict:
+    """Load a JSON file."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def get_post_graph_files(config: PathConfig) -> List[Path]:
+    """Return all post-graph files."""
+    post_graph_dir = config.intermediate_dir / "post_graph"
+    return sorted(post_graph_dir.glob("*_帖子图谱.json"))
+
+
+def get_result_file(config: PathConfig, post_id: str) -> Path:
+    """Return the path to a post's analysis-result file."""
+    return config.intermediate_dir / "origin_analysis_result" / f"{post_id}_起点分析.json"
+
+
+def is_already_processed(config: PathConfig, post_id: str) -> bool:
+    """Check whether the post has already been processed."""
+    result_file = get_result_file(config, post_id)
+    return result_file.exists()
+
+
+# ===== Step 1: data preparation =====
+
+def extract_post_detail(post_graph: Dict) -> Dict:
+    """Extract post details (keep original field names)."""
+    meta = post_graph.get("meta", {})
+    post_detail = meta.get("postDetail", {})
+
+    return {
+        "postId": meta.get("postId", ""),
+        "postTitle": meta.get("postTitle", ""),
+        "body_text": post_detail.get("body_text", ""),
+        "images": post_detail.get("images", []),
+        "video": post_detail.get("video"),
+        "publish_time": post_detail.get("publish_time", ""),
+        "like_count": post_detail.get("like_count", 0),
+        "collect_count": post_detail.get("collect_count", 0),
+    }
+
+
+def extract_analysis_nodes(post_graph: Dict, persona_graph: Dict) -> tuple:
+    """
+    Extract the list of nodes to analyze.
+
+    Nodes to analyze = inspiration points + purpose points + key points
+    """
+    nodes = post_graph.get("nodes", {})
+    edges = post_graph.get("edges", {})
+    persona_nodes = persona_graph.get("nodes", {})
+    persona_index = persona_graph.get("index", {})
+
+    # 1. Collect key-point info (used as supporting information)
+    keypoints = {}
+    for node_id, node in nodes.items():
+        if node.get("type") == "标签" and node.get("dimension") == "关键点":
+            keypoints[node_id] = {
+                "名称": node.get("name", ""),
+                "描述": node.get("detail", {}).get("description", ""),
+            }
+
+    # 2. Analyze support relations: key point → inspiration/purpose point
+    support_map = {}  # {target_node_id: [supporting key-point info]}
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "支撑":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            if source_id in keypoints:
+                if target_id not in support_map:
+                    support_map[target_id] = []
+                support_map[target_id].append(keypoints[source_id])
+
+    # 3. Analyze association relations
+    relation_map = {}  # {node_id: [names of associated nodes]}
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "关联":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            source_name = nodes.get(source_id, {}).get("name", "")
+            target_name = nodes.get(target_id, {}).get("name", "")
+
+            # Record in both directions
+            if source_id not in relation_map:
+                relation_map[source_id] = []
+            relation_map[source_id].append(target_name)
+
+            if target_id not in relation_map:
+                relation_map[target_id] = []
+            relation_map[target_id].append(source_name)
+
+    # 4. Analyze persona matches
+    match_map = {}  # {node_id: match info}
+    persona_out_edges = persona_index.get("outEdges", {})
+
+    def get_node_info(node_id: str) -> Optional[Dict]:
+        """Return the standard info of a persona node."""
+        node = persona_nodes.get(node_id, {})
+        if not node:
+            return None
+        detail = node.get("detail", {})
+        parent_path = detail.get("parentPath", [])
+        return {
+            "节点ID": node_id,
+            "节点名称": node.get("name", ""),
+            "节点分类": "/".join(parent_path) if parent_path else "",
+            "节点维度": node.get("dimension", ""),
+            "节点类型": node.get("type", ""),
+            "人设全局占比": detail.get("probGlobal", 0),
+            "父类下占比": detail.get("probToParent", 0),
+        }
+
+    def get_parent_category_id(node_id: str) -> Optional[str]:
+        """Follow the 属于 (belongs-to) edge to the parent category node ID."""
+        belong_edges = persona_out_edges.get(node_id, {}).get("属于", [])
+        for edge in belong_edges:
+            target_id = edge.get("target", "")
+            target_node = persona_nodes.get(target_id, {})
+            if target_node.get("type") == "分类":
+                return target_id
+        return None
+
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "匹配":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+
+            # Only handle matches from post nodes → persona nodes
+            if source_id.startswith("帖子:") and target_id.startswith("人设:"):
+                match_score = edge.get("score", 0)
+                persona_node = persona_nodes.get(target_id, {})
+
+                if persona_node:
+                    node_type = persona_node.get("type", "")
+
+                    # Get the matched node's info
+                    match_node_info = get_node_info(target_id)
+                    if not match_node_info:
+                        continue
+
+                    # Determine the owning category node
+                    if node_type == "标签":
+                        # Tag: use its parent category
+                        category_id = get_parent_category_id(target_id)
+                    else:
+                        # Category: the node itself
+                        category_id = target_id
+
+                    # Get the owning category's info and common co-occurrences
+                    category_info = None
+                    if category_id:
+                        category_node = persona_nodes.get(category_id, {})
+                        if category_node:
+                            category_detail = category_node.get("detail", {})
+                            category_path = category_detail.get("parentPath", [])
+                            category_info = {
+                                "节点ID": category_id,
+                                "节点名称": category_node.get("name", ""),
+                                "节点分类": "/".join(category_path) if category_path else "",
+                                "节点维度": category_node.get("dimension", ""),
+                                "节点类型": "分类",
+                                "人设全局占比": category_detail.get("probGlobal", 0),
+                                "父类下占比": category_detail.get("probToParent", 0),
+                                "历史共现分类": [],
+                            }
+
+                            # Category co-occurrence nodes, sorted by score (descending)
+                            co_occur_edges = persona_out_edges.get(category_id, {}).get("分类共现", [])
+                            co_occur_edges_sorted = sorted(co_occur_edges, key=lambda x: x.get("score", 0), reverse=True)
+                            for co_edge in co_occur_edges_sorted[:5]:  # keep the top 5
+                                co_target_id = co_edge.get("target", "")
+                                co_score = co_edge.get("score", 0)
+                                co_node = persona_nodes.get(co_target_id, {})
+                                if co_node:
+                                    co_detail = co_node.get("detail", {})
+                                    co_path = co_detail.get("parentPath", [])
+                                    category_info["历史共现分类"].append({
+                                        "节点ID": co_target_id,
+                                        "节点名称": co_node.get("name", ""),
+                                        "节点分类": "/".join(co_path) if co_path else "",
+                                        "节点维度": co_node.get("dimension", ""),
+                                        "节点类型": "分类",
+                                        "人设全局占比": co_detail.get("probGlobal", 0),
+                                        "父类下占比": co_detail.get("probToParent", 0),
+                                        "共现度": round(co_score, 4),
+                                    })
+
+                    match_map[source_id] = {
+                        "匹配节点": match_node_info,
+                        "匹配分数": round(match_score, 4),
+                        "所属分类": category_info,
+                    }
+
+    # 5. Build the list of nodes to analyze (inspiration, purpose, key points)
+    analysis_nodes = []
+    for node_id, node in nodes.items():
+        if node.get("type") == "标签" and node.get("domain") == "帖子":
+            dimension = node.get("dimension", "")
+            if dimension in ["灵感点", "目的点", "关键点"]:
+                # Persona-match info
+                match_info = match_map.get(node_id)
+
+                analysis_nodes.append({
+                    "节点ID": node_id,
+                    "节点名称": node.get("name", ""),
+                    "节点分类": node.get("category", ""),  # root category: 意图/实质/形式 (intent/substance/form)
+                    "节点维度": dimension,
+                    "节点类型": node.get("type", ""),
+                    "节点描述": node.get("detail", {}).get("description", ""),
+                    "人设匹配": match_info,
+                })
+
+    # 6. Build the list of candidate relations
+    relation_list = []
+
+    # Support relations: key point → inspiration/purpose point
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "支撑":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            if source_id in keypoints:
+                relation_list.append({
+                    "来源节点": source_id,
+                    "目标节点": target_id,
+                    "关系类型": "支撑",
+                })
+
+    # Association relations between nodes (deduplicated, recorded once)
+    seen_relations = set()
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "关联":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            # Deduplicate with a sorted tuple as the key
+            key = tuple(sorted([source_id, target_id]))
+            if key not in seen_relations:
+                seen_relations.add(key)
+                relation_list.append({
+                    "来源节点": source_id,
+                    "目标节点": target_id,
+                    "关系类型": "关联",
+                })
+
+    return analysis_nodes, relation_list
+
+
+def prepare_analysis_data(post_graph: Dict, persona_graph: Dict) -> Dict:
+    """
+    Assemble the full analysis payload.
+
+    Returns:
+        {
+            "帖子详情": {...},
+            "待分析节点列表": [...],
+            "可能的关系列表": [...]
+        }
+    """
+    analysis_nodes, relation_list = extract_analysis_nodes(post_graph, persona_graph)
+    return {
+        "帖子详情": extract_post_detail(post_graph),
+        "待分析节点列表": analysis_nodes,
+        "可能的关系列表": relation_list,
+    }
+
+
+# ===== Step 2: AI analysis =====
+
+def build_context(data: Dict) -> Dict:
+    """
+    Build the context for the AI analysis.
+
+    Returns:
+        {
+            "all_points": [...],  # all creative points (with details)
+            "candidates": [...],  # origin candidate set (names)
+            "constants": [...],   # persona constants (names)
+        }
+    """
+    nodes = data.get("待分析节点列表", [])
+
+    # All creative points (with details)
+    all_points = []
+    for node in nodes:
+        match_info = node.get("人设匹配")
+        match_score = 0
+        category_global_ratio = 0
+        if match_info:
+            match_score = match_info.get("匹配分数", 0)
+            category_info = match_info.get("所属分类", {})
+            if category_info:
+                category_global_ratio = category_info.get("人设全局占比", 0)
+
+        all_points.append({
+            "名称": node["节点名称"],
+            "分类": node.get("节点分类", ""),
+            "维度": node.get("节点维度", ""),
+            "描述": node.get("节点描述", ""),
+            "人设匹配度": round(match_score, 2),
+            "所属分类全局占比": round(category_global_ratio, 2),
+        })
+
+    # Origin candidate set (inspiration + purpose points)
+    candidates = [
+        node["节点名称"]
+        for node in nodes
+        if node["节点维度"] in ["灵感点", "目的点"]
+    ]
+
+    # Persona constants (match score > 0.8 and global ratio > 0.8)
+    constants = []
+    for node in nodes:
+        match_info = node.get("人设匹配")
+        if match_info:
+            match_score = match_info.get("匹配分数", 0)
+            match_node = match_info.get("匹配节点", {})
+            global_ratio = match_node.get("人设全局占比", 0)
+
+            if match_score > MATCH_SCORE_THRESHOLD and global_ratio > GLOBAL_RATIO_THRESHOLD:
+                constants.append(node["节点名称"])
+
+    return {
+        "all_points": all_points,
+        "candidates": candidates,
+        "constants": constants,
+    }
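+# Illustrative shape of one all_points entry (values are made up):
+#   {"名称": "example-tag", "分类": "意图", "维度": "灵感点", "描述": "...",
+#    "人设匹配度": 0.85, "所属分类全局占比": 0.4}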
+
+
+def format_prompt(context: Dict) -> str:
+    """Format the context into the AI prompt."""
+    all_points = context["all_points"]
+    candidates = context["candidates"]
+    constants = context["constants"]
+
+    # Render all creative points as readable text
+    points_text = ""
+    for p in all_points:
+        points_text += f"- {p['名称']}\n"
+        points_text += f"  Dimension: {p['维度']} | Category: {p['分类']}\n"
+        points_text += f"  Description: {p['描述']}\n"
+        points_text += f"  Persona match: {p['人设匹配度']} | Category global ratio: {p['所属分类全局占比']}\n"
+        points_text += "\n"
+
+    # Render the origin candidate set
+    candidates_text = "、".join(candidates)
+
+    # Render the persona constants
+    constants_text = "、".join(constants) if constants else "none"
+
+    prompt = f"""# Role
+You are a "reverse-engineering" expert on viral Xiaohongshu posts. Your core skill is seeing past a post's surface (visual/form) to recover the creator's original train of thought (motivation/substance).
+
+# Task
+I provide one post's [creative tags] and an [origin candidate set].
+Infer which candidates are the true **creative origins**.
+
+
+# Input Data
+
+## All creative points
+
+{points_text}
+
+## Origin candidate set
+{candidates_text}
+
+## Constants from the persona
+{constants_text}
+
+
+# Reasoning Constraints
+
+1. Substance implies form, not form substance, unless form is the origin of the whole idea
+2. Reason from cause to effect, never effect to cause
+3. A point that cannot be derived from the other points or the persona is an origin
+
+# Output Format
+
+Output standard JSON.
+- Key: a word from the candidate set.
+- Value: an object containing:
+  - `score`: a float from 0.0 to 1.0 (how likely this is an origin).
+  - `analysis`: one sentence of reasoning"""
+
+    return prompt
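+# Expected model answer (illustrative), keyed by candidate name:
+#   {"example-candidate": {"score": 0.9, "analysis": "..."}}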
+
+
+# ===== Display functions =====
+
+def display_context(context: Dict, post_id: str):
+    """Print the constructed context."""
+    print(f"\nPost: {post_id}")
+    print(f"\nAll creative points ({len(context['all_points'])}):")
+    for p in context['all_points']:
+        print(f"  - {p['名称']} ({p['维度']}/{p['分类']}) match={p['人设匹配度']}, category ratio={p['所属分类全局占比']}")
+    print(f"\nOrigin candidates ({len(context['candidates'])}):")
+    print(f"  {context['candidates']}")
+    print(f"\nPersona constants ({len(context['constants'])}):")
+    print(f"  {context['constants']}")
+
+
+def display_result(result: Dict):
+    """Print the analysis result."""
+    output = result.get("输出")
+    if output:
+        print("\nOrigin analysis result:")
+        # Sort by score, descending
+        sorted_items = sorted(output.items(), key=lambda x: x[1].get("score", 0), reverse=True)
+        for name, info in sorted_items:
+            score = info.get("score", 0)
+            analysis = info.get("analysis", "")
+            marker = "★" if score >= 0.7 else "○"
+            print(f"  {marker} {name}: {score:.2f}")
+            print(f"      {analysis}")
+    else:
+        print(f"  Analysis failed: {result.get('错误', 'N/A')}")
+
+
+# ===== Processing functions =====
+
+async def process_single_post(
+    post_file: Path,
+    persona_graph: Dict,
+    config: PathConfig,
+    current_time: str = None,
+    log_url: str = None,
+    force: bool = False,
+) -> Dict:
+    """Process a single post (data preparation + AI analysis)."""
+    # Load the post graph
+    post_graph = load_json(post_file)
+    post_id = post_graph.get("meta", {}).get("postId", "unknown")
+
+    # Skip if already processed
+    if not force and is_already_processed(config, post_id):
+        print(f"\nSkipping post {post_id} (already processed; use --force to re-analyze)")
+        # Return the existing result
+        result_file = get_result_file(config, post_id)
+        return load_json(result_file)
+
+    print(f"\n{'=' * 60}")
+    print(f"Processing post: {post_id}")
+    print("-" * 60)
+
+    # Step 1: prepare the data
+    data = prepare_analysis_data(post_graph, persona_graph)
+
+    # Build the context
+    context = build_context(data)
+    display_context(context, post_id)
+
+    # Format the prompt
+    prompt = format_prompt(context)
+
+    # Step 2: call the AI
+    print("\nCalling the AI...")
+    with custom_span(
+        name=f"Creation origin analysis - {post_id}",
+        data={
+            "帖子id": post_id,
+            "候选数": len(context["candidates"]),
+            "模型": MODEL_NAME
+        }
+    ):
+        result = await Runner.run(agent, input=prompt)
+        output_text = result.final_output
+
+    # Parse JSON from the model output
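+    # Extraction order: prefer a fenced ```json block, then fall back to the
+    # outermost brace pair, else treat the whole output as JSON.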
+    try:
+        if "```json" in output_text:
+            json_start = output_text.find("```json") + 7
+            json_end = output_text.find("```", json_start)
+            json_str = output_text[json_start:json_end].strip()
+        elif "{" in output_text and "}" in output_text:
+            json_start = output_text.find("{")
+            json_end = output_text.rfind("}") + 1
+            json_str = output_text[json_start:json_end]
+        else:
+            json_str = output_text
+
+        analysis_result = json.loads(json_str)
+
+        result_data = {
+            "帖子id": post_id,
+            "模型": MODEL_NAME,
+            "输入": context,
+            "输出": analysis_result
+        }
+    except Exception as e:
+        result_data = {
+            "帖子id": post_id,
+            "模型": MODEL_NAME,
+            "输入": context,
+            "输出": None,
+            "错误": str(e),
+            "原始输出": output_text
+        }
+
+    # Show the result
+    display_result(result_data)
+
+    # Save the result
+    output_dir = config.intermediate_dir / "origin_analysis_result"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    output_with_meta = {
+        "元数据": {
+            "current_time": current_time,
+            "log_url": log_url,
+            "model": MODEL_NAME
+        },
+        **result_data
+    }
+
+    output_file = output_dir / f"{post_id}_起点分析.json"
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(output_with_meta, f, ensure_ascii=False, indent=2)
+
+    print(f"\nSaved: {output_file.name}")
+
+    return result_data
+
+
+# ===== Main =====
+
+async def main(
+    post_id: str = None,
+    all_posts: bool = False,
+    force: bool = False,
+):
+    """
+    Entry point.
+
+    Args:
+        post_id: post ID, optional
+        all_posts: process every post
+        force: re-analyze posts that were already processed
+    """
+    # Set up tracing
+    current_time, log_url = set_trace()
+
+    config = PathConfig()
+
+    print(f"Account: {config.account_name}")
+    print(f"Model: {MODEL_NAME}")
+    print(f"Trace URL: {log_url}")
+
+    # Load the persona graph
+    persona_graph_file = config.intermediate_dir / "人设图谱.json"
+    if not persona_graph_file.exists():
+        print(f"Error: persona graph file not found: {persona_graph_file}")
+        return
+
+    persona_graph = load_json(persona_graph_file)
+    print(f"Persona graph nodes: {len(persona_graph.get('nodes', {}))}")
+
+    # Collect the post-graph files
+    post_graph_files = get_post_graph_files(config)
+    if not post_graph_files:
+        print("Error: no post-graph files found")
+        return
+
+    # Decide which posts to process
+    if post_id:
+        target_file = next(
+            (f for f in post_graph_files if post_id in f.name),
+            None
+        )
+        if not target_file:
+            print(f"Error: post {post_id} not found")
+            return
+        files_to_process = [target_file]
+    elif all_posts:
+        files_to_process = post_graph_files
+    else:
+        files_to_process = [post_graph_files[0]]
+
+    print(f"Posts to process: {len(files_to_process)}")
+
+    # Process
+    with trace("Creation origin analysis"):
+        results = []
+        skipped = 0
+        for i, post_file in enumerate(files_to_process, 1):
+            print(f"\n{'#' * 60}")
+            print(f"# Post {i}/{len(files_to_process)}")
+            print(f"{'#' * 60}")
+
+            result = await process_single_post(
+                post_file=post_file,
+                persona_graph=persona_graph,
+                config=config,
+                current_time=current_time,
+                log_url=log_url,
+                force=force,
+            )
+
+            # A result loaded from disk (i.e. skipped) carries the 元数据 key
+            if not force and "元数据" in result:
+                skipped += 1
+
+            results.append(result)
+
+    # Summary
+    print(f"\n{'#' * 60}")
+    print(f"# Done! Processed {len(results)} posts ({skipped} skipped as already processed)")
+    print(f"{'#' * 60}")
+    print(f"Trace: {log_url}")
+
+    print("\nSummary (origins with score >= 0.7):")
+    for result in results:
+        post_id = result.get("帖子id")
+        output = result.get("输出")
+        if output:
+            origins = [f"{k}({v['score']:.2f})" for k, v in output.items() if v.get("score", 0) >= 0.7]
+            print(f"  {post_id}: {', '.join(origins) if origins else 'no high-confidence origin'}")
+        else:
+            print(f"  {post_id}: analysis failed")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Creation-origin analysis")
+    parser.add_argument("--post-id", type=str, help="post ID")
+    parser.add_argument("--all-posts", action="store_true", help="process all posts")
+    parser.add_argument("--force", action="store_true", help="re-analyze posts that were already processed")
+    args = parser.parse_args()
+
+    asyncio.run(main(
+        post_id=args.post_id,
+        all_posts=args.all_posts,
+        force=args.force,
+    ))

+ 417 - 9
script/data_processing/analyze_creation_pattern.py

@@ -405,6 +405,7 @@ def build_origin_context(nodes: List[Dict]) -> Dict:
             "维度": node.get("节点维度", ""),
             "描述": node.get("节点描述", ""),
             "人设匹配度": round(get_match_score(node), 2),
+            "人设全局占比": round(get_category_global_ratio(node), 2),
         })
 
     # Origin candidate set (inspiration + purpose points)
@@ -439,7 +440,7 @@ def format_origin_prompt(context: Dict) -> str:
         points_text += f"- {p['名称']}\n"
         points_text += f"  Dimension: {p['维度']} | Category: {p['分类']}\n"
         points_text += f"  Description: {p['描述']}\n"
-        points_text += f"  Persona match: {p['人设匹配度']}\n"
+        points_text += f"  Persona match: {p['人设匹配度']} | Persona global ratio: {p['人设全局占比']}\n"
         points_text += "\n"
 
     candidates_text = "、".join(candidates)
@@ -673,6 +674,171 @@ def derive_patterns(
     }
 
 
+# ===== Step 4: next-step analysis =====
+
+def build_next_step_context(known_nodes: List[Dict], unknown_nodes: List[Dict], all_nodes: List[Dict]) -> Dict:
+    """Build the context for the next-step analysis."""
+
+    # Known points, sorted by discovery order
+    known_sorted = sorted(known_nodes, key=lambda n: n.get("发现编号") or 999)
+    known_info = []
+    for n in known_sorted:
+        info = {
+            "名称": n["节点名称"],
+            "维度": n["节点维度"],
+            "分类": n.get("节点分类", ""),
+            "描述": n.get("节点描述", ""),
+            "人设匹配度": round(get_match_score(n), 2),
+            "人设全局占比": round(get_category_global_ratio(n), 2),
+            "发现编号": n.get("发现编号"),
+        }
+        # Attach the origin analysis, if any
+        if n.get("起点分析"):
+            info["起点说明"] = n["起点分析"].get("说明", "")
+        known_info.append(info)
+
+    # 未知点信息
+    unknown_info = []
+    for n in unknown_nodes:
+        unknown_info.append({
+            "名称": n["节点名称"],
+            "维度": n["节点维度"],
+            "分类": n.get("节点分类", ""),
+            "描述": n.get("节点描述", ""),
+            "人设匹配度": round(get_match_score(n), 2),
+            "人设全局占比": round(get_category_global_ratio(n), 2),
+        })
+
+    # Persona constants (filtered from all nodes)
+    constants = [
+        n["节点名称"]
+        for n in all_nodes
+        if is_persona_constant(n)
+    ]
+
+    return {
+        "known_nodes": known_info,
+        "unknown_nodes": unknown_info,
+        "constants": constants,
+    }
+
+
+def format_next_step_prompt(context: Dict) -> str:
+    """Format the prompt for the next-step analysis."""
+
+    known_text = ""
+    for i, n in enumerate(context["known_nodes"], 1):
+        known_text += f"{i}. {n['名称']} ({n['维度']})\n"
+        known_text += f"   Category: {n['分类']}\n"
+        known_text += f"   Description: {n['描述']}\n"
+        known_text += f"   Persona match: {n['人设匹配度']} | Persona global ratio: {n['人设全局占比']}\n"
+        if n.get("起点说明"):
+            known_text += f"   Origin note: {n['起点说明']}\n"
+        known_text += "\n"
+
+    unknown_text = ""
+    for n in context["unknown_nodes"]:
+        unknown_text += f"- {n['名称']} ({n['维度']})\n"
+        unknown_text += f"  Category: {n['分类']}\n"
+        unknown_text += f"  Description: {n['描述']}\n"
+        unknown_text += f"  Persona match: {n['人设匹配度']} | Persona global ratio: {n['人设全局占比']}\n\n"
+
+    constants = context.get("constants", [])
+    constants_text = "、".join(constants) if constants else "none"
+
+    prompt = f"""# Role
+You are a "reverse-engineering" expert on viral Xiaohongshu posts. Your task is to reconstruct the creator's thought path.
+
+# Task
+Given the known creative points, infer which unknown points the creator most likely thought of **directly next**.
+Several points may be thought of at once (if they are logically parallel).
+
+## Known points (in discovery order)
+{known_text}
+## Unknown points (to infer)
+{unknown_text}
+## Persona constants
+{constants_text}
+
+# Reasoning Constraints
+1. The creator's thinking is logical: motivation/purpose first, then form/technique
+2. Key points usually exist to support an inspiration or purpose point
+3. Persona constants are the creator's built-in style and need no inference
+4. Output only points reachable "in one direct step", not every unknown point
+
+# Output Format
+Output JSON, scoring each unknown point:
+- Key: the unknown point's name
+- Value: an object containing:
+  - `score`: 0.0-1.0 (likelihood of being thought of next)
+  - `from`: the known point it derives from (by name)
+  - `reason`: how it follows from that known point (one sentence)"""
+
+    return prompt
+
+
+async def analyze_next_step(
+    nodes: List[Dict],
+    force_llm: bool = False
+) -> Dict:
+    """
+    Run the next-step analysis.
+
+    Input: node list (mix of known and unknown)
+    Output: the most likely next-step points
+    """
+    # Split into known and unknown
+    known_nodes = [n for n in nodes if n.get("是否已知")]
+    unknown_nodes = [n for n in nodes if not n.get("是否已知")]
+
+    if not unknown_nodes:
+        return {
+            "输入上下文": {"已知点": [], "未知点": [], "人设常量": []},
+            "中间结果": {},
+            "下一步候选": [],  # same key as the non-empty return below
+        }
+
+    context = build_next_step_context(known_nodes, unknown_nodes, nodes)
+    prompt = format_next_step_prompt(context)
+
+    print(f"\n  Known points: {len(known_nodes)}")
+    print(f"  Unknown points: {len(unknown_nodes)}")
+
+    result = await analyze(
+        prompt=prompt,
+        task_name=f"{TASK_NAME}/next_step",
+        force=force_llm,
+        parse_json=True,
+    )
+
+    # Parse the result (a {name: {score, from, reason}} mapping)
+    llm_result = result.data or {}
+
+    # Build the candidate list, sorted by score
+    candidates = []
+    for name, info in llm_result.items():
+        candidates.append({
+            "节点名称": name,
+            "可能性分数": info.get("score", 0),
+            "推导来源": info.get("from", ""),
+            "推理说明": info.get("reason", ""),
+        })
+    candidates.sort(key=lambda x: x["可能性分数"], reverse=True)
+
+    return {
+        "输入上下文": {
+            "已知点": context["known_nodes"],
+            "未知点": context["unknown_nodes"],
+            "人设常量": context["constants"],
+        },
+        "中间结果": llm_result,
+        "下一步候选": candidates,
+        "cache_hit": result.cache_hit,
+        "model": result.model_name,
+        "log_url": result.log_url,
+    }
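+# Note: the empty-input early return above uses the same keys as this full
+# return, so callers can index "下一步候选" unconditionally.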
+
+
+# ===== Full pipeline =====
 
 def save_result(post_id: str, post_detail: Dict, steps: List, config: PathConfig) -> Path:
@@ -855,6 +1021,244 @@ async def process_single_post(
     # Step 3 done; save
     save_result(post_id, post_detail, steps, config)
 
+    if max_step == 3:
+        return {"帖子详情": post_detail, "步骤列表": steps}
+
+    # ===== Step 4: next-step analysis =====
+    print("\n[Step 4] Next-step analysis...")
+    next_step_result = await analyze_next_step(nodes_step3, force_llm=force_llm)
+
+    # Get the candidate list
+    candidates = next_step_result["下一步候选"]
+
+    # Keep the high-scoring candidates (>= 0.8)
+    NEXT_STEP_THRESHOLD = 0.8
+    high_score_candidates = [c for c in candidates if c["可能性分数"] >= NEXT_STEP_THRESHOLD]
+
+    # Map node name → node
+    node_by_name = {n["节点名称"]: n for n in nodes_step3}
+
+    # Find the current maximum discovery order
+    max_order = max((n.get("发现编号") or 0) for n in nodes_step3)
+
+    # Update nodes: mark high-scoring candidates as known
+    nodes_step4 = []
+    new_known_names = []
+    current_order = max_order + 1
+
+    for node in nodes_step3:
+        new_node = dict(node)
+        name = node["节点名称"]
+
+        # Is this node among the high-scoring candidates?
+        matching = [c for c in high_score_candidates if c["节点名称"] == name]
+        if matching and not node.get("是否已知"):
+            new_node["是否已知"] = True
+            new_node["发现编号"] = current_order
+            current_order += 1
+            new_known_names.append(name)
+
+        nodes_step4.append(new_node)
+
+    # Create new (derivation) edges
+    new_edges = []
+    for c in high_score_candidates:
+        target_node = node_by_name.get(c["节点名称"])
+        source_name = c["推导来源"]
+        source_node = node_by_name.get(source_name)
+        if target_node and source_node:
+            new_edges.append({
+                "来源": source_node["节点ID"],
+                "目标": target_node["节点ID"],
+                "关系类型": "AI推导",
+                "可能性分数": c["可能性分数"],
+                "推理说明": c["推理说明"],
+            })
+
+    # Merge the edge lists
+    all_edges_step4 = all_edges + new_edges
+
+    step4 = {
+        "步骤": "下一步分析",
+        "输入": {
+            "已知点": next_step_result["输入上下文"]["已知点"],
+            "未知点": next_step_result["输入上下文"]["未知点"],
+            "人设常量": next_step_result["输入上下文"]["人设常量"],
+        },
+        "中间结果": next_step_result["中间结果"],
+        "输出": {
+            "新的已知节点": new_known_names,
+            "新的边": new_edges,
+            "节点列表": nodes_step4,
+            "边列表": all_edges_step4,
+        },
+        "摘要": {
+            "已知点数": sum(1 for n in nodes_step4 if n.get("是否已知")),
+            "新已知数": len(new_known_names),
+            "新边数": len(new_edges),
+            "未知点数": sum(1 for n in nodes_step4 if not n.get("是否已知")),
+            "model": next_step_result.get("model"),
+            "cache_hit": next_step_result.get("cache_hit"),
+            "log_url": next_step_result.get("log_url"),
+        },
+    }
+    steps.append(step4)
+
+    # Print the high-scoring candidates
+    print(f"  Candidates: {len(candidates)}")
+    print(f"  High-scoring (>={NEXT_STEP_THRESHOLD}): {len(high_score_candidates)}")
+    for c in high_score_candidates:
+        print(f"    ★ {c['节点名称']} ({c['可能性分数']:.2f}) ← {c['推导来源']}")
+        print(f"      {c['推理说明']}")
+
+    # Step 4 done; save
+    save_result(post_id, post_detail, steps, config)
+
+    if max_step == 4:
+        return {"帖子详情": post_detail, "步骤列表": steps}
+
+    # ===== Loop steps 3 → 4 until everything is known =====
+    iteration = 1
+    current_nodes = nodes_step4
+    current_edges = all_edges_step4
+    MAX_ITERATIONS = 10  # guard against an infinite loop
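+    # The loop exits on any of three conditions: every node is known, the
+    # iteration cap is reached, or a full round adds no new known nodes.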
+
+    while True:
+        # Any unknown nodes left?
+        unknown_count = sum(1 for n in current_nodes if not n.get("是否已知"))
+        if unknown_count == 0:
+            print(f"\n[Done] All nodes are now known")
+            break
+
+        if iteration > MAX_ITERATIONS:
+            print(f"\n[Warning] Hit the iteration cap of {MAX_ITERATIONS}; stopping")
+            break
+
+        # ===== Iteration step 3: co-occurrence derivation =====
+        print(f"\n[Iteration {iteration} / step 3] Pattern derivation...")
+        derivation_result = derive_patterns(current_nodes, persona_co_occur)
+        nodes_iter3 = derivation_result["输出节点"]
+        edges_iter3 = derivation_result["推导边列表"]
+
+        # Count the newly derived nodes
+        prev_known_names = {n["节点名称"] for n in current_nodes if n.get("是否已知")}
+        new_known_step3 = [n["节点名称"] for n in nodes_iter3 if n.get("是否已知") and n["节点名称"] not in prev_known_names]
+        new_edges_step3 = edges_iter3  # derive_patterns returns only this round's new edges
+
+        all_edges_iter3 = current_edges + new_edges_step3
+
+        step_iter3 = {
+            "步骤": f"迭代{iteration}-模式推导",
+            "输入": {
+                "节点列表": current_nodes,
+                "人设共现关系": persona_co_occur,
+            },
+            "输出": {
+                "新的已知节点": new_known_step3,
+                "新的边": new_edges_step3,
+                "节点列表": nodes_iter3,
+                "边列表": all_edges_iter3,
+            },
+            "摘要": {
+                "已知点数": sum(1 for n in nodes_iter3 if n.get("是否已知")),
+                "新已知数": len(new_known_step3),
+                "新边数": len(new_edges_step3),
+                "未知点数": sum(1 for n in nodes_iter3 if not n.get("是否已知")),
+            },
+        }
+        steps.append(step_iter3)
+
+        print(f"  New known: {len(new_known_step3)}")
+        print(f"  New edges: {len(new_edges_step3)}")
+
+        save_result(post_id, post_detail, steps, config)
+
+        # Any unknown nodes left after step 3?
+        unknown_after_step3 = sum(1 for n in nodes_iter3 if not n.get("是否已知"))
+        if unknown_after_step3 == 0:
+            print(f"\n[Done] All nodes are now known")
+            break
+
+        # ===== Iteration step 4: AI derivation =====
+        print(f"\n[Iteration {iteration} / step 4] Next-step analysis...")
+        next_step_result = await analyze_next_step(nodes_iter3, force_llm=force_llm)
+        candidates_iter4 = next_step_result["下一步候选"]
+        high_score_iter4 = [c for c in candidates_iter4 if c["可能性分数"] >= NEXT_STEP_THRESHOLD]
+
+        # Update the nodes
+        node_by_name_iter4 = {n["节点名称"]: n for n in nodes_iter3}
+        max_order_iter4 = max((n.get("发现编号") or 0) for n in nodes_iter3)
+        nodes_iter4 = []
+        new_known_iter4 = []
+        current_order_iter4 = max_order_iter4 + 1
+
+        for node in nodes_iter3:
+            new_node = dict(node)
+            name = node["节点名称"]
+            matching = [c for c in high_score_iter4 if c["节点名称"] == name]
+            if matching and not node.get("是否已知"):
+                new_node["是否已知"] = True
+                new_node["发现编号"] = current_order_iter4
+                current_order_iter4 += 1
+                new_known_iter4.append(name)
+            nodes_iter4.append(new_node)
+
+        # Create new edges
+        new_edges_iter4 = []
+        for c in high_score_iter4:
+            target_node = node_by_name_iter4.get(c["节点名称"])
+            source_node = node_by_name_iter4.get(c["推导来源"])
+            if target_node and source_node:
+                new_edges_iter4.append({
+                    "来源": source_node["节点ID"],
+                    "目标": target_node["节点ID"],
+                    "关系类型": "AI推导",
+                    "可能性分数": c["可能性分数"],
+                    "推理说明": c["推理说明"],
+                })
+
+        all_edges_iter4 = all_edges_iter3 + new_edges_iter4
+
+        step_iter4 = {
+            "步骤": f"迭代{iteration}-下一步分析",
+            "输入": {
+                "已知点": next_step_result["输入上下文"]["已知点"],
+                "未知点": next_step_result["输入上下文"]["未知点"],
+                "人设常量": next_step_result["输入上下文"]["人设常量"],
+            },
+            "中间结果": next_step_result["中间结果"],
+            "输出": {
+                "新的已知节点": new_known_iter4,
+                "新的边": new_edges_iter4,
+                "节点列表": nodes_iter4,
+                "边列表": all_edges_iter4,
+            },
+            "摘要": {
+                "已知点数": sum(1 for n in nodes_iter4 if n.get("是否已知")),
+                "新已知数": len(new_known_iter4),
+                "新边数": len(new_edges_iter4),
+                "未知点数": sum(1 for n in nodes_iter4 if not n.get("是否已知")),
+                "model": next_step_result.get("model"),
+                "cache_hit": next_step_result.get("cache_hit"),
+            },
+        }
+        steps.append(step_iter4)
+
+        print(f"  New known: {len(new_known_iter4)}")
+        print(f"  New edges: {len(new_edges_iter4)}")
+
+        save_result(post_id, post_detail, steps, config)
+
+        # Stop if this round made no progress
+        if len(new_known_step3) == 0 and len(new_known_iter4) == 0:
+            print(f"\n[Stop] No progress this round; ending the loop")
+            break
+
+        # Carry the state into the next round
+        current_nodes = nodes_iter4
+        current_edges = all_edges_iter4
+        iteration += 1
+
     return {"帖子详情": post_detail, "步骤列表": steps}
 
 
@@ -939,13 +1343,17 @@ async def main(
             print(f"  {post_id}: nodes={step1_summary.get('节点数', 0)} (data prep only)")
         elif num_steps == 2:
             step2_summary = steps[1].get("摘要", {})
-            print(f"  {post_id}: origins={step2_summary.get('高分起点数', 0)} (not derived)")
-        elif num_steps >= 3:
-            step2_summary = steps[1].get("摘要", {})
+            print(f"  {post_id}: origins={step2_summary.get('新已知数', 0)} (not derived)")
+        elif num_steps == 3:
             step3_summary = steps[2].get("摘要", {})
-            print(f"  {post_id}: origins={step2_summary.get('高分起点数', 0)}, "
-                  f"known={step3_summary.get('已知点数', 0)}, "
-                  f"derived_edges={step3_summary.get('推导边数', 0)}")
+            print(f"  {post_id}: known={step3_summary.get('已知点数', 0)}, "
+                  f"unknown={step3_summary.get('未知点数', 0)}")
+        elif num_steps >= 4:
+            step4_summary = steps[3].get("摘要", {})
+            print(f"  {post_id}: known={step4_summary.get('已知点数', 0)}, "
+                  f"new_known={step4_summary.get('新已知数', 0)}, "
+                  f"new_edges={step4_summary.get('新边数', 0)}, "
+                  f"unknown={step4_summary.get('未知点数', 0)}")
         else:
             print(f"  {post_id}: no step data")
 
@@ -957,8 +1365,8 @@ if __name__ == "__main__":
     parser.add_argument("--post-id", type=str, help="post ID")
     parser.add_argument("--all-posts", action="store_true", help="process all posts")
     parser.add_argument("--force-llm", action="store_true", help="force a fresh LLM call (skip the LLM cache)")
-    parser.add_argument("--step", type=int, default=3, choices=[1, 2, 3],
-                        help="run up to this step (1=data prep, 2=origin analysis, 3=pattern derivation)")
+    parser.add_argument("--step", type=int, default=5, choices=[1, 2, 3, 4, 5],
+                        help="run up to this step (1=data prep, 2=origin analysis, 3=pattern derivation, 4=next-step analysis, 5=full loop)")
     args = parser.parse_args()
 
     asyncio.run(main(

+ 9 - 3
script/data_processing/build_post_graph.py

@@ -121,16 +121,20 @@ def create_node(
     dimension: str,
     node_type: str,
     name: str,
-    detail: Dict = None
+    detail: Dict = None,
+    category: str = None
+) -> Dict:
     """Create a node."""
-    return {
+    node = {
         "name": name,
         "type": node_type,
         "dimension": dimension,
         "domain": domain,
         "detail": detail or {}
     }
+    if category:
+        node["category"] = category
+    return node
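+# Illustrative call of the extended signature (argument values are made up;
+# the root categories referenced below are 意图/实质/形式):
+#   create_node(domain="帖子", dimension="灵感点", node_type="标签",
+#               name="example-tag", category="意图")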
 
 
 def create_edge(
@@ -187,6 +191,7 @@ def extract_tags_and_matches(filtered_data: Dict) -> tuple:
             tag_name = point.get("名称", "")
             tag_desc = point.get("描述", "")
             point_id = point.get("ID", "")
+            point_category = point.get("类型", "")  # root category: 意图/实质/形式 (intent/substance/form)
 
             if not tag_name:
                 continue
@@ -201,7 +206,8 @@ def extract_tags_and_matches(filtered_data: Dict) -> tuple:
                 detail={
                     "description": tag_desc,
                     "pointId": point_id
-                }
+                },
+                category=point_category
             )
 
             # 建立 ID 映射

+ 411 - 0
script/data_processing/derive_pattern_relations.py

@@ -0,0 +1,411 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Creation-pattern derivation - step 2: iterative derivation over co-occurrence relations.
+
+Input: origin-analysis results + prepared node data
+Output: derivation result (known-point set + derived relations)
+
+Algorithm:
+1. Initialize: points with score >= 0.8 in the origin analysis → known set
+2. Iterate:
+   - From the points added last round, keep those with persona-match score >= 0.8
+   - Collect the historical co-occurrence category IDs of their owning categories
+   - For each unknown point (persona match >= 0.8), check whether its owning
+     category ID appears in that co-occurrence list
+   - If it does, add it to the known set and record a relation
+3. Stop when no new points are added
+"""
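+# One derivation round on made-up data: if known point A has persona match 0.9
+# and its owning category historically co-occurs with category C2, and unknown
+# point B (persona match 0.85) belongs to C2, then B becomes known in round 1
+# and a 共现推导 relation A → B is recorded.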
+
+import json
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import Dict, List, Set, Optional
+import sys
+
+# Add the project root to sys.path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from script.data_processing.path_config import PathConfig
+
+
+# ===== Configuration =====
+ORIGIN_SCORE_THRESHOLD = 0.8  # origin-score threshold
+MATCH_SCORE_THRESHOLD = 0.8   # persona-match score threshold
+
+
+# ===== Data structures =====
+
+@dataclass
+class AnalysisNode:
+    """A node to analyze."""
+    节点ID: str
+    节点名称: str
+    节点分类: str
+    节点维度: str
+    人设匹配分数: float
+    所属分类ID: Optional[str]
+    历史共现分类: Dict[str, float] = field(default_factory=dict)  # {category ID: co-occurrence score}
+
+    @classmethod
+    def from_raw(cls, raw: Dict) -> "AnalysisNode":
+        """Construct from a raw dict."""
+        match_info = raw.get("人设匹配") or {}
+        match_score = match_info.get("匹配分数", 0)
+
+        category_info = match_info.get("所属分类") or {}
+        category_id = category_info.get("节点ID")
+
+        co_occur_list = category_info.get("历史共现分类", [])
+        co_occur_map = {
+            c.get("节点ID"): c.get("共现度", 0)
+            for c in co_occur_list
+            if c.get("节点ID")
+        }
+
+        return cls(
+            节点ID=raw.get("节点ID", ""),
+            节点名称=raw.get("节点名称", ""),
+            节点分类=raw.get("节点分类", ""),
+            节点维度=raw.get("节点维度", ""),
+            人设匹配分数=match_score,
+            所属分类ID=category_id,
+            历史共现分类=co_occur_map,
+        )
+
+
+@dataclass
+class DerivedRelation:
+    """A derived relation."""
+    来源节点ID: str
+    来源节点名称: str
+    目标节点ID: str
+    目标节点名称: str
+    关系类型: str  # "共现推导"
+    推导轮次: int
+    共现分类ID: str  # the co-occurrence category that created the link
+    共现度: float  # co-occurrence score
+
+
+@dataclass
+    """Derivation result."""
+    """推导结果"""
+    帖子ID: str
+    起点列表: List[Dict]  # {节点ID, 节点名称, 起点分数}
+    已知点列表: List[Dict]  # {节点ID, 节点名称, 加入轮次, 加入原因}
+    推导关系列表: List[Dict]  # DerivedRelation as dicts
+    推导轮次: int
+    未知点列表: List[Dict]  # points that were never derived
+
+
+# ===== Data loading =====
+
+def load_json(file_path: Path) -> Dict:
+    """Load a JSON file."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def get_origin_result_files(config: PathConfig) -> List[Path]:
+    """Return all origin-analysis result files."""
+    result_dir = config.intermediate_dir / "origin_analysis_result"
+    return sorted(result_dir.glob("*_起点分析.json"))
+
+
+def get_prepared_file(config: PathConfig, post_id: str) -> Optional[Path]:
+    """Return the prepared-data file for a post."""
+    prepared_dir = config.intermediate_dir / "origin_analysis_prepared"
+    files = list(prepared_dir.glob(f"{post_id}_待分析数据.json"))
+    return files[0] if files else None
+
+
+# ===== Core algorithm =====
+
+def derive_patterns(
+    nodes: List[AnalysisNode],
+    origin_scores: Dict[str, float],  # {node name: origin score}
+) -> DerivationResult:
+    """
+    Iterative derivation over co-occurrence relations.
+
+    Args:
+        nodes: all nodes to analyze
+        origin_scores: origin-analysis scores {node name: score}
+
+    Returns:
+        DerivationResult
+    """
+    # Build indexes
+    node_by_name: Dict[str, AnalysisNode] = {n.节点名称: n for n in nodes}
+    node_by_id: Dict[str, AnalysisNode] = {n.节点ID: n for n in nodes}
+
+    # 1. Initialize the known set (origin score >= 0.8)
+    known_names: Set[str] = set()
+    known_info: List[Dict] = []  # {节点ID, 节点名称, 加入轮次, 加入原因}
+    origins: List[Dict] = []
+
+    for name, score in origin_scores.items():
+        if score >= ORIGIN_SCORE_THRESHOLD:
+            known_names.add(name)
+            node = node_by_name.get(name)
+            if node:
+                origins.append({
+                    "节点ID": node.节点ID,
+                    "节点名称": name,
+                    "起点分数": score,
+                })
+                known_info.append({
+                    "节点ID": node.节点ID,
+                    "节点名称": name,
+                    "加入轮次": 0,
+                    "加入原因": f"起点(score={score:.2f})",
+                })
+
+    # The unknown set
+    unknown_names: Set[str] = set(node_by_name.keys()) - known_names
+
+    # Derived relations
+    relations: List[DerivedRelation] = []
+
+    # 2. Iterate
+    round_num = 0
+    new_known_this_round = known_names.copy()  # round 0's additions are the origins
+
+    while new_known_this_round:
+        round_num += 1
+        print(f"\n  Derivation round {round_num}...")
+
+        # Points newly added this round
+        new_known_next_round: Set[str] = set()
+
+        # Walk the points added in the previous round
+        for known_name in new_known_this_round:
+            known_node = node_by_name.get(known_name)
+            if not known_node:
+                continue
+
+            # Filter: persona-match score >= 0.8
+            if known_node.人设匹配分数 < MATCH_SCORE_THRESHOLD:
+                continue
+
+            # Historical co-occurrence categories {ID: score}
+            co_occur_map = known_node.历史共现分类
+            if not co_occur_map:
+                continue
+
+            # Walk the unknown points
+            for unknown_name in list(unknown_names):
+                unknown_node = node_by_name.get(unknown_name)
+                if not unknown_node:
+                    continue
+
+                # Filter: persona-match score >= 0.8
+                if unknown_node.人设匹配分数 < MATCH_SCORE_THRESHOLD:
+                    continue
+
+                # Check: is the unknown point's owning category ID in the co-occurrence list?
+                if unknown_node.所属分类ID and unknown_node.所属分类ID in co_occur_map:
+                    # Found a link!
+                    co_occur_score = co_occur_map[unknown_node.所属分类ID]
+                    new_known_next_round.add(unknown_name)
+
+                    # Record the relation
+                    relations.append(DerivedRelation(
+                        来源节点ID=known_node.节点ID,
+                        来源节点名称=known_name,
+                        目标节点ID=unknown_node.节点ID,
+                        目标节点名称=unknown_name,
+                        关系类型="共现推导",
+                        推导轮次=round_num,
+                        共现分类ID=unknown_node.所属分类ID,
+                        共现度=co_occur_score,
+                    ))
+
+                    print(f"    {known_name} → {unknown_name} (co-occurrence: {co_occur_score:.2f})")
+
+        # Update the sets
+        for name in new_known_next_round:
+            node = node_by_name.get(name)
+            if node:
+                known_info.append({
+                    "节点ID": node.节点ID,
+                    "节点名称": name,
+                    "加入轮次": round_num,
+                    "加入原因": "共现推导",
+                })
+
+        known_names.update(new_known_next_round)
+        unknown_names -= new_known_next_round
+        new_known_this_round = new_known_next_round
+
+        if not new_known_next_round:
+            print(f"    No new points added; derivation finished")
+            break
+
+    # 3. Build the unknown-point list
+    unknown_list = []
+    for name in unknown_names:
+        node = node_by_name.get(name)
+        if node:
+            unknown_list.append({
+                "节点ID": node.节点ID,
+                "节点名称": name,
+                "节点维度": node.节点维度,
+                "人设匹配分数": node.人设匹配分数,
+                "未加入原因": "人设匹配分数不足" if node.人设匹配分数 < MATCH_SCORE_THRESHOLD else "无共现关联",
+            })
+
+    return DerivationResult(
+        帖子ID="",  # set by the caller
+        起点列表=origins,
+        已知点列表=known_info,
+        推导关系列表=[asdict(r) for r in relations],
+        推导轮次=round_num,
+        未知点列表=unknown_list,
+    )
+
+
+# ===== Processing functions =====
+
+def process_single_post(
+    origin_file: Path,
+    config: PathConfig,
+) -> Optional[Dict]:
+    """Process a single post."""
+    # Load the origin-analysis result
+    origin_data = load_json(origin_file)
+    post_id = origin_data.get("帖子id", "unknown")
+
+    print(f"\n{'=' * 60}")
+    print(f"Processing post: {post_id}")
+    print("-" * 60)
+
+    # Origin scores
+    origin_output = origin_data.get("输出", {})
+    if not origin_output:
+        print("  Error: origin-analysis output is empty")
+        return None
+
+    origin_scores = {name: info.get("score", 0) for name, info in origin_output.items()}
+
+    # Load the prepared data (full node info)
+    prepared_file = get_prepared_file(config, post_id)
+    if not prepared_file:
+        print(f"  Error: prepared-data file not found")
+        return None
+
+    prepared_data = load_json(prepared_file)
+    raw_nodes = prepared_data.get("待分析节点列表", [])
+
+    # Convert to AnalysisNode
+    nodes = [AnalysisNode.from_raw(raw) for raw in raw_nodes]
+    print(f"  Nodes: {len(nodes)}")
+
+    # Show the origins
+    origins = [(name, score) for name, score in origin_scores.items() if score >= ORIGIN_SCORE_THRESHOLD]
+    print(f"  Origins (score >= {ORIGIN_SCORE_THRESHOLD}): {len(origins)}")
+    for name, score in sorted(origins, key=lambda x: -x[1]):
+        print(f"    ★ {name}: {score:.2f}")
+
+    # Run the derivation
+    result = derive_patterns(nodes, origin_scores)
+    result.帖子ID = post_id
+
+    # Show the result
+    print(f"\n  Derivation rounds: {result.推导轮次}")
+    print(f"  Known points: {len(result.已知点列表)}")
+    print(f"  Derived relations: {len(result.推导关系列表)}")
+    print(f"  Unknown points: {len(result.未知点列表)}")
+
+    # Save the result
+    output_dir = config.intermediate_dir / "pattern_derivation"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    output_file = output_dir / f"{post_id}_模式推导.json"
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(asdict(result), f, ensure_ascii=False, indent=2)
+
+    print(f"\n  Saved: {output_file.name}")
+
+    return asdict(result)
+
+
+# ===== Main =====
+
+def main(
+    post_id: str = None,
+    all_posts: bool = False,
+):
+    """
+    Entry point.
+
+    Args:
+        post_id: post ID, optional
+        all_posts: process every post
+    """
+    config = PathConfig()
+
+    print(f"Account: {config.account_name}")
+    print(f"Origin-score threshold: {ORIGIN_SCORE_THRESHOLD}")
+    print(f"Match-score threshold: {MATCH_SCORE_THRESHOLD}")
+
+    # Collect the origin-analysis result files
+    origin_files = get_origin_result_files(config)
+    if not origin_files:
+        print("Error: no origin-analysis results found; run analyze_creation_origin.py first")
+        return
+
+    # Decide which posts to process
+    if post_id:
+        target_file = next(
+            (f for f in origin_files if post_id in f.name),
+            None
+        )
+        if not target_file:
+            print(f"Error: no origin-analysis result for post {post_id}")
+            return
+        files_to_process = [target_file]
+    elif all_posts:
+        files_to_process = origin_files
+    else:
+        files_to_process = [origin_files[0]]
+
+    print(f"Posts to process: {len(files_to_process)}")
+
+    # Process
+    results = []
+    for i, origin_file in enumerate(files_to_process, 1):
+        print(f"\n{'#' * 60}")
+        print(f"# Post {i}/{len(files_to_process)}")
+        print(f"{'#' * 60}")
+
+        result = process_single_post(origin_file, config)
+        if result:
+            results.append(result)
+
+    # Summary
+    print(f"\n{'#' * 60}")
+    print(f"# Done! Processed {len(results)} posts")
+    print(f"{'#' * 60}")
+
+    print("\nSummary:")
+    for result in results:
+        post_id = result.get("帖子ID")
+        known_count = len(result.get("已知点列表", []))
+        relation_count = len(result.get("推导关系列表", []))
+        unknown_count = len(result.get("未知点列表", []))
+        print(f"  {post_id}: known={known_count}, relations={relation_count}, unknown={unknown_count}")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Creation-pattern derivation")
+    parser.add_argument("--post-id", type=str, help="post ID")
+    parser.add_argument("--all-posts", action="store_true", help="process all posts")
+    args = parser.parse_args()
+
+    main(
+        post_id=args.post_id,
+        all_posts=args.all_posts,
+    )

+ 453 - 0
script/data_processing/prepare_origin_analysis.py

@@ -0,0 +1,453 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Creation-origin analysis - data preparation.
+
+Step 1: compress the post graph + persona graph into the nodes to analyze.
+
+Input: post graph + persona graph
+Output: prepared analysis data
+"""
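+# Likely invocation (the CLI block is outside this excerpt; flags assumed to
+# mirror the sibling scripts):
+#   python script/data_processing/prepare_origin_analysis.py --post-id <id>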
+
+import json
+from pathlib import Path
+from typing import Dict, List, Optional
+import sys
+
+# Add the project root to sys.path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from script.data_processing.path_config import PathConfig
+
+
+# ===== Data-loading functions =====
+
+def load_json(file_path: Path) -> Dict:
+    """Load a JSON file."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def get_post_graph_files(config: PathConfig) -> List[Path]:
+    """Return all post-graph files."""
+    post_graph_dir = config.intermediate_dir / "post_graph"
+    return sorted(post_graph_dir.glob("*_帖子图谱.json"))
+
+
+# ===== Data-extraction functions =====
+
+def extract_post_detail(post_graph: Dict) -> Dict:
+    """Extract post details (keep original field names)."""
+    meta = post_graph.get("meta", {})
+    post_detail = meta.get("postDetail", {})
+
+    return {
+        "postId": meta.get("postId", ""),
+        "postTitle": meta.get("postTitle", ""),
+        "body_text": post_detail.get("body_text", ""),
+        "images": post_detail.get("images", []),
+        "video": post_detail.get("video"),
+        "publish_time": post_detail.get("publish_time", ""),
+        "like_count": post_detail.get("like_count", 0),
+        "collect_count": post_detail.get("collect_count", 0),
+    }
+
+
+def extract_analysis_nodes(post_graph: Dict, persona_graph: Dict) -> tuple:
+    """
+    Extract the list of nodes to analyze.
+
+    Nodes to analyze = inspiration points + purpose points + key points
+    (the key points double as supporting info for the other two)
+    """
+    nodes = post_graph.get("nodes", {})
+    edges = post_graph.get("edges", {})
+    persona_nodes = persona_graph.get("nodes", {})
+    persona_index = persona_graph.get("index", {})
+
+    # 1. Collect key-point info (used as supporting information)
+    keypoints = {}
+    for node_id, node in nodes.items():
+        if node.get("type") == "标签" and node.get("dimension") == "关键点":
+            keypoints[node_id] = {
+                "名称": node.get("name", ""),
+                "描述": node.get("detail", {}).get("description", ""),
+            }
+
+    # 2. Analyze support relations: key point → inspiration/purpose point
+    support_map = {}  # {target_node_id: [supporting key-point info]}
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "支撑":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            if source_id in keypoints:
+                if target_id not in support_map:
+                    support_map[target_id] = []
+                support_map[target_id].append(keypoints[source_id])
+
+    # 3. Analyze association relations
+    relation_map = {}  # {node_id: [names of associated nodes]}
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "关联":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            source_name = nodes.get(source_id, {}).get("name", "")
+            target_name = nodes.get(target_id, {}).get("name", "")
+
+            # Record in both directions
+            if source_id not in relation_map:
+                relation_map[source_id] = []
+            relation_map[source_id].append(target_name)
+
+            if target_id not in relation_map:
+                relation_map[target_id] = []
+            relation_map[target_id].append(source_name)
+
+    # 4. Analyze persona matches
+    match_map = {}  # {node_id: match info}
+    persona_out_edges = persona_index.get("outEdges", {})
+
+    def get_node_info(node_id: str) -> Optional[Dict]:
+        """Return the standard info of a persona node."""
+        node = persona_nodes.get(node_id, {})
+        if not node:
+            return None
+        detail = node.get("detail", {})
+        parent_path = detail.get("parentPath", [])
+        return {
+            "节点ID": node_id,
+            "节点名称": node.get("name", ""),
+            "节点分类": "/".join(parent_path) if parent_path else "",
+            "节点维度": node.get("dimension", ""),
+            "节点类型": node.get("type", ""),
+            "人设全局占比": detail.get("probGlobal", 0),
+            "父类下占比": detail.get("probToParent", 0),
+        }
+
+    def get_parent_category_id(node_id: str) -> Optional[str]:
+        """Follow the 属于 (belongs-to) edge to the parent category node ID."""
+        belong_edges = persona_out_edges.get(node_id, {}).get("属于", [])
+        for edge in belong_edges:
+            target_id = edge.get("target", "")
+            target_node = persona_nodes.get(target_id, {})
+            if target_node.get("type") == "分类":
+                return target_id
+        return None
+
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "匹配":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+
+            # Only handle matches from post nodes → persona nodes
+            if source_id.startswith("帖子:") and target_id.startswith("人设:"):
+                match_score = edge.get("score", 0)
+                persona_node = persona_nodes.get(target_id, {})
+
+                if persona_node:
+                    node_type = persona_node.get("type", "")
+
+                    # Get the matched node's info
+                    match_node_info = get_node_info(target_id)
+                    if not match_node_info:
+                        continue
+
+                    # Determine the owning category node
+                    if node_type == "标签":
+                        # Tag: use its parent category
+                        category_id = get_parent_category_id(target_id)
+                    else:
+                        # Category: the node itself
+                        category_id = target_id
+
+                    # Get the owning category's info and common co-occurrences
+                    category_info = None
+                    if category_id:
+                        category_node = persona_nodes.get(category_id, {})
+                        if category_node:
+                            category_detail = category_node.get("detail", {})
+                            category_path = category_detail.get("parentPath", [])
+                            category_info = {
+                                "节点ID": category_id,
+                                "节点名称": category_node.get("name", ""),
+                                "节点分类": "/".join(category_path) if category_path else "",
+                                "节点维度": category_node.get("dimension", ""),
+                                "节点类型": "分类",
+                                "人设全局占比": category_detail.get("probGlobal", 0),
+                                "父类下占比": category_detail.get("probToParent", 0),
+                                "历史共现分类": [],
+                            }
+
+                            # Category co-occurrence nodes, sorted by score (descending)
+                            co_occur_edges = persona_out_edges.get(category_id, {}).get("分类共现", [])
+                            co_occur_edges_sorted = sorted(co_occur_edges, key=lambda x: x.get("score", 0), reverse=True)
+                            for co_edge in co_occur_edges_sorted[:5]:  # keep the top 5
+                                co_target_id = co_edge.get("target", "")
+                                co_score = co_edge.get("score", 0)
+                                co_node = persona_nodes.get(co_target_id, {})
+                                if co_node:
+                                    co_detail = co_node.get("detail", {})
+                                    co_path = co_detail.get("parentPath", [])
+                                    category_info["历史共现分类"].append({
+                                        "节点ID": co_target_id,
+                                        "节点名称": co_node.get("name", ""),
+                                        "节点分类": "/".join(co_path) if co_path else "",
+                                        "节点维度": co_node.get("dimension", ""),
+                                        "节点类型": "分类",
+                                        "人设全局占比": co_detail.get("probGlobal", 0),
+                                        "父类下占比": co_detail.get("probToParent", 0),
+                                        "共现度": round(co_score, 4),
+                                    })
+
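+                    # If a post node has several match edges, later ones overwrite
+                    # earlier entries here; only one match is kept per node.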
+                    match_map[source_id] = {
+                        "匹配节点": match_node_info,
+                        "匹配分数": round(match_score, 4),
+                        "所属分类": category_info,
+                    }
+
+    # 5. Build the list of nodes to analyze (inspiration, purpose, and key points)
+    analysis_nodes = []
+    for node_id, node in nodes.items():
+        if node.get("type") == "标签" and node.get("domain") == "帖子":
+            dimension = node.get("dimension", "")
+            if dimension in ["灵感点", "目的点", "关键点"]:
+                # Persona match info (None if no match edge)
+                match_info = match_map.get(node_id)
+
+                analysis_nodes.append({
+                    "节点ID": node_id,
+                    "节点名称": node.get("name", ""),
+                    "节点分类": node.get("category", ""),  # 根分类:意图/实质/形式
+                    "节点维度": dimension,
+                    "节点类型": node.get("type", ""),
+                    "节点描述": node.get("detail", {}).get("description", ""),
+                    "人设匹配": match_info,
+                })
+
+    # 6. Build the list of candidate relations
+    relation_list = []
+
+    # Support relations: key point → inspiration/purpose point
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "支撑":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            if source_id in keypoints:
+                relation_list.append({
+                    "来源节点": source_id,
+                    "目标节点": target_id,
+                    "关系类型": "支撑",
+                })
+
+    # Association relations between nodes (deduplicated; each pair recorded once)
+    seen_relations = set()
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "关联":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            # Deduplicate using the sorted node-ID pair as key
+            key = tuple(sorted([source_id, target_id]))
+            if key not in seen_relations:
+                seen_relations.add(key)
+                relation_list.append({
+                    "来源节点": source_id,
+                    "目标节点": target_id,
+                    "关系类型": "关联",
+                })
+
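+    # Illustrative shape of one analysis_nodes entry (placeholder values):
+    #   {"节点ID": "...", "节点名称": "...", "节点分类": "意图", "节点维度": "灵感点",
+    #    "节点类型": "标签", "节点描述": "...", "人设匹配": {...} or None}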
+    return analysis_nodes, relation_list
+
+
+def prepare_analysis_data(post_graph: Dict, persona_graph: Dict) -> Dict:
+    """
+    Prepare the complete analysis data.
+
+    Returns:
+        {
+            "帖子详情": {...},
+            "待分析节点列表": [...],
+            "可能的关系列表": [...]
+        }
+    """
+    analysis_nodes, relation_list = extract_analysis_nodes(post_graph, persona_graph)
+    return {
+        "帖子详情": extract_post_detail(post_graph),
+        "待分析节点列表": analysis_nodes,
+        "可能的关系列表": relation_list,
+    }
+
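+# A minimal standalone sketch of the data-preparation step (assumes the graph
+# files already exist under config.intermediate_dir; see main() for the full flow):
+#
+#     config = PathConfig()
+#     persona = load_json(config.intermediate_dir / "人设图谱.json")
+#     post = load_json(get_post_graph_files(config)[0])
+#     data = prepare_analysis_data(post, persona)
+#     print(len(data["待分析节点列表"]), "nodes,", len(data["可能的关系列表"]), "relations")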
+
+# ===== Display helpers =====
+
+def display_prepared_data(data: Dict):
+    """显示准备好的数据"""
+    post = data["帖子详情"]
+    nodes = data["待分析节点列表"]
+    relations = data["可能的关系列表"]
+
+    print(f"\n帖子: {post['postId']}")
+    print(f"标题: {post['postTitle']}")
+    print(f"正文: {post['body_text'][:100]}...")
+
+    print(f"\n待分析节点 ({len(nodes)} 个):")
+    for node in nodes:
+        match = node.get("人设匹配")
+        category = node.get('节点分类', '')
+        print(f"  - [{node['节点ID']}] {node['节点名称']} ({node['节点维度']}/{category})")
+
+        if match:
+            match_node = match.get("匹配节点", {})
+            category_node = match.get("所属分类", {})
+            print(f"    匹配: {match_node.get('节点名称', '')} ({match_node.get('节点类型', '')}, 全局占比={match_node.get('人设全局占比', 0):.2%})")
+            if category_node:
+                co_count = len(category_node.get("历史共现分类", []))
+                print(f"    所属分类: {category_node.get('节点名称', '')} (全局占比={category_node.get('人设全局占比', 0):.2%}, {co_count}个历史共现分类)")
+        else:
+            print(f"    人设: 无匹配")
+
+    print(f"\n可能的关系 ({len(relations)} 条):")
+    for rel in relations:
+        rel_type = rel["关系类型"]
+        if rel_type == "支撑":
+            print(f"  - {rel['来源节点']} → {rel['目标节点']} [支撑]")
+        else:
+            print(f"  - {rel['来源节点']} ↔ {rel['目标节点']} [关联]")
+
+
+# ===== Processing =====
+
+def process_single_post(
+    post_file: Path,
+    persona_graph: Dict,
+    config: PathConfig,
+    save: bool = True,
+) -> Dict:
+    """
+    处理单个帖子
+
+    Args:
+        post_file: 帖子图谱文件路径
+        persona_graph: 人设图谱数据
+        config: 路径配置
+        save: 是否保存结果
+
+    Returns:
+        准备好的分析数据
+    """
+    # Load the post graph
+    post_graph = load_json(post_file)
+    post_id = post_graph.get("meta", {}).get("postId", "unknown")
+
+    print(f"\n{'=' * 60}")
+    print(f"Processing post: {post_id}")
+    print("-" * 60)
+
+    # Prepare the analysis data
+    data = prepare_analysis_data(post_graph, persona_graph)
+
+    # Display a summary
+    display_prepared_data(data)
+
+    # Save the prepared data
+    if save:
+        output_dir = config.intermediate_dir / "origin_analysis_prepared"
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        output_file = output_dir / f"{post_id}_待分析数据.json"
+        with open(output_file, "w", encoding="utf-8") as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+
+        print(f"\nSaved: {output_file.name}")
+
+    return data
+
+
+# ===== Main =====
+
+def main(
+    post_id: Optional[str] = None,
+    all_posts: bool = False,
+    save: bool = True,
+):
+    """
+    Entry point for step 1 (data preparation).
+
+    Args:
+        post_id: optional post ID; when given, only the matching post is processed
+        all_posts: process all posts
+        save: whether to save the results
+    """
+    config = PathConfig()
+
+    print(f"账号: {config.account_name}")
+
+    # 加载人设图谱
+    persona_graph_file = config.intermediate_dir / "人设图谱.json"
+    if not persona_graph_file.exists():
+        print(f"错误: 人设图谱文件不存在: {persona_graph_file}")
+        return
+
+    persona_graph = load_json(persona_graph_file)
+    print(f"人设图谱节点数: {len(persona_graph.get('nodes', {}))}")
+
+    # 获取帖子图谱文件
+    post_graph_files = get_post_graph_files(config)
+    if not post_graph_files:
+        print("错误: 没有找到帖子图谱文件")
+        return
+
+    # Decide which posts to process
+    if post_id:
+        target_file = next(
+            (f for f in post_graph_files if post_id in f.name),
+            None,
+        )
+        if not target_file:
+            print(f"Error: post {post_id} not found")
+            return
+        files_to_process = [target_file]
+    elif all_posts:
+        files_to_process = post_graph_files
+    else:
+        # Default: process only the first post
+        files_to_process = [post_graph_files[0]]
+
+    print(f"Posts to process: {len(files_to_process)}")
+
+    # Process each post
+    results = []
+    for i, post_file in enumerate(files_to_process, 1):
+        print(f"\n{'#' * 60}")
+        print(f"# Processing post {i}/{len(files_to_process)}")
+        print(f"{'#' * 60}")
+
+        data = process_single_post(
+            post_file=post_file,
+            persona_graph=persona_graph,
+            config=config,
+            save=save,
+        )
+        results.append(data)
+
+    print(f"\n{'#' * 60}")
+    print(f"# 完成! 共处理 {len(results)} 个帖子")
+    print(f"{'#' * 60}")
+
+    return results
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Creation origin analysis - data preparation")
+    parser.add_argument("--post-id", type=str, help="post ID to process")
+    parser.add_argument("--all-posts", action="store_true", help="process all posts")
+    parser.add_argument("--no-save", action="store_true", help="do not save results")
+    args = parser.parse_args()
+
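+    # Example invocations (<POST_ID> is a placeholder):
+    #   python script/data_processing/analyze_creation_origin.py                  # first post only
+    #   python script/data_processing/analyze_creation_origin.py --all-posts
+    #   python script/data_processing/analyze_creation_origin.py --post-id <POST_ID> --no-save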
+    main(
+        post_id=args.post_id,
+        all_posts=args.all_posts,
+        save=not args.no_save,
+    )