
feat: add next-step analysis and iterative derivation

- Add step 4 (next-step analysis) and step 5 (full iteration loop)
- Add a persona global-ratio field to refine the derivation
- Add a category attribute on nodes to carry classification info
- Add helper analysis scripts (analyze_creation_origin etc.)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
yangxiaohui, 23 hours ago
parent commit f10b5afffc

+ 687 - 0
script/data_processing/analyze_creation_origin.py

@@ -0,0 +1,687 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Creation-origin analysis.
+
+Combines the two-step flow of data preparation + AI analysis:
+1. Prepare the data to analyze from the post graph + persona graph
+2. Call the AI to analyze the creative origin
+
+Input: post graph + persona graph
+Output: origin-analysis result
+"""
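+# Usage (flags are defined in the __main__ block at the bottom of this file):
+#   python script/data_processing/analyze_creation_origin.py --post-id <id>
+#   python script/data_processing/analyze_creation_origin.py --all-posts --force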
+
+import asyncio
+import json
+from pathlib import Path
+from typing import Dict, List, Optional
+import sys
+
+# Add the project root to sys.path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from agents import Agent, Runner, ModelSettings, trace
+from agents.tracing.create import custom_span
+from lib.client import get_model
+from lib.my_trace import set_trace_smith as set_trace
+from script.data_processing.path_config import PathConfig
+
+
+# ===== Configuration =====
+MODEL_NAME = "google/gemini-3-pro-preview"
+# MODEL_NAME = "anthropic/claude-sonnet-4"
+
+MATCH_SCORE_THRESHOLD = 0.8  # persona-match score threshold
+GLOBAL_RATIO_THRESHOLD = 0.8  # persona global-ratio threshold
+
+agent = Agent(
+    name="Creation Origin Analyzer",
+    model=get_model(MODEL_NAME),
+    model_settings=ModelSettings(
+        temperature=0.0,
+        max_tokens=8192,
+    ),
+    tools=[],
+)
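+# temperature=0.0 with a fixed prompt keeps repeated scoring runs as
+# reproducible as the backend allows; max_tokens=8192 bounds the JSON answer.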
+
+
+# ===== Data loading =====
+
+def load_json(file_path: Path) -> Dict:
+    """Load a JSON file."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def get_post_graph_files(config: PathConfig) -> List[Path]:
+    """Return all post-graph files."""
+    post_graph_dir = config.intermediate_dir / "post_graph"
+    return sorted(post_graph_dir.glob("*_帖子图谱.json"))
+
+
+def get_result_file(config: PathConfig, post_id: str) -> Path:
+    """Return the path to a post's analysis-result file."""
+    return config.intermediate_dir / "origin_analysis_result" / f"{post_id}_起点分析.json"
+
+
+def is_already_processed(config: PathConfig, post_id: str) -> bool:
+    """Check whether the post has already been processed."""
+    result_file = get_result_file(config, post_id)
+    return result_file.exists()
+
+
+# ===== Step 1: data preparation =====
+
+def extract_post_detail(post_graph: Dict) -> Dict:
+    """Extract post details (keep original field names)."""
+    meta = post_graph.get("meta", {})
+    post_detail = meta.get("postDetail", {})
+
+    return {
+        "postId": meta.get("postId", ""),
+        "postTitle": meta.get("postTitle", ""),
+        "body_text": post_detail.get("body_text", ""),
+        "images": post_detail.get("images", []),
+        "video": post_detail.get("video"),
+        "publish_time": post_detail.get("publish_time", ""),
+        "like_count": post_detail.get("like_count", 0),
+        "collect_count": post_detail.get("collect_count", 0),
+    }
+
+
+def extract_analysis_nodes(post_graph: Dict, persona_graph: Dict) -> tuple:
+    """
+    Extract the list of nodes to analyze.
+
+    Nodes to analyze = inspiration points + purpose points + key points
+    """
+    nodes = post_graph.get("nodes", {})
+    edges = post_graph.get("edges", {})
+    persona_nodes = persona_graph.get("nodes", {})
+    persona_index = persona_graph.get("index", {})
+
+    # 1. Collect key-point info (used as supporting information)
+    keypoints = {}
+    for node_id, node in nodes.items():
+        if node.get("type") == "标签" and node.get("dimension") == "关键点":
+            keypoints[node_id] = {
+                "名称": node.get("name", ""),
+                "描述": node.get("detail", {}).get("description", ""),
+            }
+
+    # 2. Analyze support relations: key point → inspiration/purpose point
+    support_map = {}  # {target_node_id: [supporting key-point info]}
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "支撑":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            if source_id in keypoints:
+                if target_id not in support_map:
+                    support_map[target_id] = []
+                support_map[target_id].append(keypoints[source_id])
+
+    # 3. Analyze association relations
+    relation_map = {}  # {node_id: [names of associated nodes]}
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "关联":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            source_name = nodes.get(source_id, {}).get("name", "")
+            target_name = nodes.get(target_id, {}).get("name", "")
+
+            # Record in both directions
+            if source_id not in relation_map:
+                relation_map[source_id] = []
+            relation_map[source_id].append(target_name)
+
+            if target_id not in relation_map:
+                relation_map[target_id] = []
+            relation_map[target_id].append(source_name)
+
+    # 4. Analyze persona matches
+    match_map = {}  # {node_id: match info}
+    persona_out_edges = persona_index.get("outEdges", {})
+
+    def get_node_info(node_id: str) -> Optional[Dict]:
+        """Return the standard info of a persona node."""
+        node = persona_nodes.get(node_id, {})
+        if not node:
+            return None
+        detail = node.get("detail", {})
+        parent_path = detail.get("parentPath", [])
+        return {
+            "节点ID": node_id,
+            "节点名称": node.get("name", ""),
+            "节点分类": "/".join(parent_path) if parent_path else "",
+            "节点维度": node.get("dimension", ""),
+            "节点类型": node.get("type", ""),
+            "人设全局占比": detail.get("probGlobal", 0),
+            "父类下占比": detail.get("probToParent", 0),
+        }
+
+    def get_parent_category_id(node_id: str) -> Optional[str]:
+        """Follow the 属于 (belongs-to) edge to the parent category node ID."""
+        belong_edges = persona_out_edges.get(node_id, {}).get("属于", [])
+        for edge in belong_edges:
+            target_id = edge.get("target", "")
+            target_node = persona_nodes.get(target_id, {})
+            if target_node.get("type") == "分类":
+                return target_id
+        return None
+
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "匹配":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+
+            # Only handle matches from post nodes → persona nodes
+            if source_id.startswith("帖子:") and target_id.startswith("人设:"):
+                match_score = edge.get("score", 0)
+                persona_node = persona_nodes.get(target_id, {})
+
+                if persona_node:
+                    node_type = persona_node.get("type", "")
+
+                    # Get the matched node's info
+                    match_node_info = get_node_info(target_id)
+                    if not match_node_info:
+                        continue
+
+                    # Determine the owning category node
+                    if node_type == "标签":
+                        # Tag: use its parent category
+                        category_id = get_parent_category_id(target_id)
+                    else:
+                        # Category: the node itself
+                        category_id = target_id
+
+                    # Get the owning category's info and common co-occurrences
+                    category_info = None
+                    if category_id:
+                        category_node = persona_nodes.get(category_id, {})
+                        if category_node:
+                            category_detail = category_node.get("detail", {})
+                            category_path = category_detail.get("parentPath", [])
+                            category_info = {
+                                "节点ID": category_id,
+                                "节点名称": category_node.get("name", ""),
+                                "节点分类": "/".join(category_path) if category_path else "",
+                                "节点维度": category_node.get("dimension", ""),
+                                "节点类型": "分类",
+                                "人设全局占比": category_detail.get("probGlobal", 0),
+                                "父类下占比": category_detail.get("probToParent", 0),
+                                "历史共现分类": [],
+                            }
+
+                            # Category co-occurrence nodes, sorted by score (descending)
+                            co_occur_edges = persona_out_edges.get(category_id, {}).get("分类共现", [])
+                            co_occur_edges_sorted = sorted(co_occur_edges, key=lambda x: x.get("score", 0), reverse=True)
+                            for co_edge in co_occur_edges_sorted[:5]:  # keep the top 5
+                                co_target_id = co_edge.get("target", "")
+                                co_score = co_edge.get("score", 0)
+                                co_node = persona_nodes.get(co_target_id, {})
+                                if co_node:
+                                    co_detail = co_node.get("detail", {})
+                                    co_path = co_detail.get("parentPath", [])
+                                    category_info["历史共现分类"].append({
+                                        "节点ID": co_target_id,
+                                        "节点名称": co_node.get("name", ""),
+                                        "节点分类": "/".join(co_path) if co_path else "",
+                                        "节点维度": co_node.get("dimension", ""),
+                                        "节点类型": "分类",
+                                        "人设全局占比": co_detail.get("probGlobal", 0),
+                                        "父类下占比": co_detail.get("probToParent", 0),
+                                        "共现度": round(co_score, 4),
+                                    })
+
+                    match_map[source_id] = {
+                        "匹配节点": match_node_info,
+                        "匹配分数": round(match_score, 4),
+                        "所属分类": category_info,
+                    }
+
+    # 5. Build the list of nodes to analyze (inspiration, purpose, key points)
+    analysis_nodes = []
+    for node_id, node in nodes.items():
+        if node.get("type") == "标签" and node.get("domain") == "帖子":
+            dimension = node.get("dimension", "")
+            if dimension in ["灵感点", "目的点", "关键点"]:
+                # Persona-match info
+                match_info = match_map.get(node_id)
+
+                analysis_nodes.append({
+                    "节点ID": node_id,
+                    "节点名称": node.get("name", ""),
+                    "节点分类": node.get("category", ""),  # root category: 意图/实质/形式 (intent/substance/form)
+                    "节点维度": dimension,
+                    "节点类型": node.get("type", ""),
+                    "节点描述": node.get("detail", {}).get("description", ""),
+                    "人设匹配": match_info,
+                })
+
+    # 6. Build the list of candidate relations
+    relation_list = []
+
+    # Support relations: key point → inspiration/purpose point
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "支撑":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            if source_id in keypoints:
+                relation_list.append({
+                    "来源节点": source_id,
+                    "目标节点": target_id,
+                    "关系类型": "支撑",
+                })
+
+    # Association relations between nodes (deduplicated, recorded once)
+    seen_relations = set()
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "关联":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            # Deduplicate with a sorted tuple as the key
+            key = tuple(sorted([source_id, target_id]))
+            if key not in seen_relations:
+                seen_relations.add(key)
+                relation_list.append({
+                    "来源节点": source_id,
+                    "目标节点": target_id,
+                    "关系类型": "关联",
+                })
+
+    return analysis_nodes, relation_list
+
+
+def prepare_analysis_data(post_graph: Dict, persona_graph: Dict) -> Dict:
+    """
+    Assemble the full analysis payload.
+
+    Returns:
+        {
+            "帖子详情": {...},
+            "待分析节点列表": [...],
+            "可能的关系列表": [...]
+        }
+    """
+    analysis_nodes, relation_list = extract_analysis_nodes(post_graph, persona_graph)
+    return {
+        "帖子详情": extract_post_detail(post_graph),
+        "待分析节点列表": analysis_nodes,
+        "可能的关系列表": relation_list,
+    }
+
+
+# ===== Step 2: AI analysis =====
+
+def build_context(data: Dict) -> Dict:
+    """
+    Build the context for the AI analysis.
+
+    Returns:
+        {
+            "all_points": [...],  # all creative points (with details)
+            "candidates": [...],  # origin candidate set (names)
+            "constants": [...],   # persona constants (names)
+        }
+    """
+    nodes = data.get("待分析节点列表", [])
+
+    # All creative points (with details)
+    all_points = []
+    for node in nodes:
+        match_info = node.get("人设匹配")
+        match_score = 0
+        category_global_ratio = 0
+        if match_info:
+            match_score = match_info.get("匹配分数", 0)
+            category_info = match_info.get("所属分类", {})
+            if category_info:
+                category_global_ratio = category_info.get("人设全局占比", 0)
+
+        all_points.append({
+            "名称": node["节点名称"],
+            "分类": node.get("节点分类", ""),
+            "维度": node.get("节点维度", ""),
+            "描述": node.get("节点描述", ""),
+            "人设匹配度": round(match_score, 2),
+            "所属分类全局占比": round(category_global_ratio, 2),
+        })
+
+    # Origin candidate set (inspiration + purpose points)
+    candidates = [
+        node["节点名称"]
+        for node in nodes
+        if node["节点维度"] in ["灵感点", "目的点"]
+    ]
+
+    # Persona constants (match score > 0.8 and global ratio > 0.8)
+    constants = []
+    for node in nodes:
+        match_info = node.get("人设匹配")
+        if match_info:
+            match_score = match_info.get("匹配分数", 0)
+            match_node = match_info.get("匹配节点", {})
+            global_ratio = match_node.get("人设全局占比", 0)
+
+            if match_score > MATCH_SCORE_THRESHOLD and global_ratio > GLOBAL_RATIO_THRESHOLD:
+                constants.append(node["节点名称"])
+
+    return {
+        "all_points": all_points,
+        "candidates": candidates,
+        "constants": constants,
+    }
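+# Illustrative shape of one all_points entry (values are made up):
+#   {"名称": "example-tag", "分类": "意图", "维度": "灵感点", "描述": "...",
+#    "人设匹配度": 0.85, "所属分类全局占比": 0.4}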
+
+
+def format_prompt(context: Dict) -> str:
+    """Format the context into the AI prompt."""
+    all_points = context["all_points"]
+    candidates = context["candidates"]
+    constants = context["constants"]
+
+    # Render all creative points as readable text
+    points_text = ""
+    for p in all_points:
+        points_text += f"- {p['名称']}\n"
+        points_text += f"  Dimension: {p['维度']} | Category: {p['分类']}\n"
+        points_text += f"  Description: {p['描述']}\n"
+        points_text += f"  Persona match: {p['人设匹配度']} | Category global ratio: {p['所属分类全局占比']}\n"
+        points_text += "\n"
+
+    # Render the origin candidate set
+    candidates_text = "、".join(candidates)
+
+    # Render the persona constants
+    constants_text = "、".join(constants) if constants else "none"
+
+    prompt = f"""# Role
+You are a "reverse-engineering" expert on viral Xiaohongshu posts. Your core skill is seeing past a post's surface (visual/form) to recover the creator's original train of thought (motivation/substance).
+
+# Task
+I provide one post's [creative tags] and an [origin candidate set].
+Infer which candidates are the true **creative origins**.
+
+
+# Input Data
+
+## All creative points
+
+{points_text}
+
+## Origin candidate set
+{candidates_text}
+
+## Constants from the persona
+{constants_text}
+
+
+# Reasoning Constraints
+
+1. Substance implies form, not form substance, unless form is the origin of the whole idea
+2. Reason from cause to effect, never effect to cause
+3. A point that cannot be derived from the other points or the persona is an origin
+
+# Output Format
+
+Output standard JSON.
+- Key: a word from the candidate set.
+- Value: an object containing:
+  - `score`: a float from 0.0 to 1.0 (how likely this is an origin).
+  - `analysis`: one sentence of reasoning"""
+
+    return prompt
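+# Expected model answer (illustrative), keyed by candidate name:
+#   {"example-candidate": {"score": 0.9, "analysis": "..."}}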
+
+
+# ===== Display functions =====
+
+def display_context(context: Dict, post_id: str):
+    """Print the constructed context."""
+    print(f"\nPost: {post_id}")
+    print(f"\nAll creative points ({len(context['all_points'])}):")
+    for p in context['all_points']:
+        print(f"  - {p['名称']} ({p['维度']}/{p['分类']}) match={p['人设匹配度']}, category ratio={p['所属分类全局占比']}")
+    print(f"\nOrigin candidates ({len(context['candidates'])}):")
+    print(f"  {context['candidates']}")
+    print(f"\nPersona constants ({len(context['constants'])}):")
+    print(f"  {context['constants']}")
+
+
+def display_result(result: Dict):
+    """Print the analysis result."""
+    output = result.get("输出")
+    if output:
+        print("\nOrigin analysis result:")
+        # Sort by score, descending
+        sorted_items = sorted(output.items(), key=lambda x: x[1].get("score", 0), reverse=True)
+        for name, info in sorted_items:
+            score = info.get("score", 0)
+            analysis = info.get("analysis", "")
+            marker = "★" if score >= 0.7 else "○"
+            print(f"  {marker} {name}: {score:.2f}")
+            print(f"      {analysis}")
+    else:
+        print(f"  Analysis failed: {result.get('错误', 'N/A')}")
+
+
+# ===== Processing functions =====
+
+async def process_single_post(
+    post_file: Path,
+    persona_graph: Dict,
+    config: PathConfig,
+    current_time: str = None,
+    log_url: str = None,
+    force: bool = False,
+) -> Dict:
+    """Process a single post (data preparation + AI analysis)."""
+    # Load the post graph
+    post_graph = load_json(post_file)
+    post_id = post_graph.get("meta", {}).get("postId", "unknown")
+
+    # Skip if already processed
+    if not force and is_already_processed(config, post_id):
+        print(f"\nSkipping post {post_id} (already processed; use --force to re-analyze)")
+        # Return the existing result
+        result_file = get_result_file(config, post_id)
+        return load_json(result_file)
+
+    print(f"\n{'=' * 60}")
+    print(f"Processing post: {post_id}")
+    print("-" * 60)
+
+    # Step 1: prepare the data
+    data = prepare_analysis_data(post_graph, persona_graph)
+
+    # Build the context
+    context = build_context(data)
+    display_context(context, post_id)
+
+    # Format the prompt
+    prompt = format_prompt(context)
+
+    # Step 2: call the AI
+    print("\nCalling the AI...")
+    with custom_span(
+        name=f"Creation origin analysis - {post_id}",
+        data={
+            "帖子id": post_id,
+            "候选数": len(context["candidates"]),
+            "模型": MODEL_NAME
+        }
+    ):
+        result = await Runner.run(agent, input=prompt)
+        output_text = result.final_output
+
+    # Parse JSON from the model output
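+    # Extraction order: prefer a fenced ```json block, then fall back to the
+    # outermost brace pair, else treat the whole output as JSON.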
+    try:
+        if "```json" in output_text:
+            json_start = output_text.find("```json") + 7
+            json_end = output_text.find("```", json_start)
+            json_str = output_text[json_start:json_end].strip()
+        elif "{" in output_text and "}" in output_text:
+            json_start = output_text.find("{")
+            json_end = output_text.rfind("}") + 1
+            json_str = output_text[json_start:json_end]
+        else:
+            json_str = output_text
+
+        analysis_result = json.loads(json_str)
+
+        result_data = {
+            "帖子id": post_id,
+            "模型": MODEL_NAME,
+            "输入": context,
+            "输出": analysis_result
+        }
+    except Exception as e:
+        result_data = {
+            "帖子id": post_id,
+            "模型": MODEL_NAME,
+            "输入": context,
+            "输出": None,
+            "错误": str(e),
+            "原始输出": output_text
+        }
+
+    # Show the result
+    display_result(result_data)
+
+    # Save the result
+    output_dir = config.intermediate_dir / "origin_analysis_result"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    output_with_meta = {
+        "元数据": {
+            "current_time": current_time,
+            "log_url": log_url,
+            "model": MODEL_NAME
+        },
+        **result_data
+    }
+
+    output_file = output_dir / f"{post_id}_起点分析.json"
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(output_with_meta, f, ensure_ascii=False, indent=2)
+
+    print(f"\nSaved: {output_file.name}")
+
+    return result_data
+
+
+# ===== Main =====
+
+async def main(
+    post_id: str = None,
+    all_posts: bool = False,
+    force: bool = False,
+):
+    """
+    Entry point.
+
+    Args:
+        post_id: post ID, optional
+        all_posts: process every post
+        force: re-analyze posts that were already processed
+    """
+    # Set up tracing
+    current_time, log_url = set_trace()
+
+    config = PathConfig()
+
+    print(f"Account: {config.account_name}")
+    print(f"Model: {MODEL_NAME}")
+    print(f"Trace URL: {log_url}")
+
+    # Load the persona graph
+    persona_graph_file = config.intermediate_dir / "人设图谱.json"
+    if not persona_graph_file.exists():
+        print(f"Error: persona graph file not found: {persona_graph_file}")
+        return
+
+    persona_graph = load_json(persona_graph_file)
+    print(f"Persona graph nodes: {len(persona_graph.get('nodes', {}))}")
+
+    # Collect the post-graph files
+    post_graph_files = get_post_graph_files(config)
+    if not post_graph_files:
+        print("Error: no post-graph files found")
+        return
+
+    # Decide which posts to process
+    if post_id:
+        target_file = next(
+            (f for f in post_graph_files if post_id in f.name),
+            None
+        )
+        if not target_file:
+            print(f"Error: post {post_id} not found")
+            return
+        files_to_process = [target_file]
+    elif all_posts:
+        files_to_process = post_graph_files
+    else:
+        files_to_process = [post_graph_files[0]]
+
+    print(f"Posts to process: {len(files_to_process)}")
+
+    # Process
+    with trace("Creation origin analysis"):
+        results = []
+        skipped = 0
+        for i, post_file in enumerate(files_to_process, 1):
+            print(f"\n{'#' * 60}")
+            print(f"# Post {i}/{len(files_to_process)}")
+            print(f"{'#' * 60}")
+
+            result = await process_single_post(
+                post_file=post_file,
+                persona_graph=persona_graph,
+                config=config,
+                current_time=current_time,
+                log_url=log_url,
+                force=force,
+            )
+
+            # A result loaded from disk (i.e. skipped) carries the 元数据 key
+            if not force and "元数据" in result:
+                skipped += 1
+
+            results.append(result)
+
+    # Summary
+    print(f"\n{'#' * 60}")
+    print(f"# Done! Processed {len(results)} posts ({skipped} skipped as already processed)")
+    print(f"{'#' * 60}")
+    print(f"Trace: {log_url}")
+
+    print("\nSummary (origins with score >= 0.7):")
+    for result in results:
+        post_id = result.get("帖子id")
+        output = result.get("输出")
+        if output:
+            origins = [f"{k}({v['score']:.2f})" for k, v in output.items() if v.get("score", 0) >= 0.7]
+            print(f"  {post_id}: {', '.join(origins) if origins else 'no high-confidence origin'}")
+        else:
+            print(f"  {post_id}: analysis failed")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Creation-origin analysis")
+    parser.add_argument("--post-id", type=str, help="post ID")
+    parser.add_argument("--all-posts", action="store_true", help="process all posts")
+    parser.add_argument("--force", action="store_true", help="re-analyze posts that were already processed")
+    args = parser.parse_args()
+
+    asyncio.run(main(
+        post_id=args.post_id,
+        all_posts=args.all_posts,
+        force=args.force,
+    ))

+ 417 - 9
script/data_processing/analyze_creation_pattern.py

@@ -405,6 +405,7 @@ def build_origin_context(nodes: List[Dict]) -> Dict:
             "维度": node.get("节点维度", ""),
             "描述": node.get("节点描述", ""),
             "人设匹配度": round(get_match_score(node), 2),
+            "人设全局占比": round(get_category_global_ratio(node), 2),
         })
 
     # Origin candidate set (inspiration + purpose points)
@@ -439,7 +440,7 @@ def format_origin_prompt(context: Dict) -> str:
         points_text += f"- {p['名称']}\n"
         points_text += f"  Dimension: {p['维度']} | Category: {p['分类']}\n"
         points_text += f"  Description: {p['描述']}\n"
-        points_text += f"  Persona match: {p['人设匹配度']}\n"
+        points_text += f"  Persona match: {p['人设匹配度']} | Persona global ratio: {p['人设全局占比']}\n"
         points_text += "\n"
 
     candidates_text = "、".join(candidates)
@@ -673,6 +674,171 @@ def derive_patterns(
     }
 
 
+# ===== Step 4: next-step analysis =====
+
+def build_next_step_context(known_nodes: List[Dict], unknown_nodes: List[Dict], all_nodes: List[Dict]) -> Dict:
+    """Build the context for the next-step analysis."""
+
+    # Known points, sorted by discovery order
+    known_sorted = sorted(known_nodes, key=lambda n: n.get("发现编号") or 999)
+    known_info = []
+    for n in known_sorted:
+        info = {
+            "名称": n["节点名称"],
+            "维度": n["节点维度"],
+            "分类": n.get("节点分类", ""),
+            "描述": n.get("节点描述", ""),
+            "人设匹配度": round(get_match_score(n), 2),
+            "人设全局占比": round(get_category_global_ratio(n), 2),
+            "发现编号": n.get("发现编号"),
+        }
+        # Attach the origin analysis, if any
+        if n.get("起点分析"):
+            info["起点说明"] = n["起点分析"].get("说明", "")
+        known_info.append(info)
+
+    # 未知点信息
+    unknown_info = []
+    for n in unknown_nodes:
+        unknown_info.append({
+            "名称": n["节点名称"],
+            "维度": n["节点维度"],
+            "分类": n.get("节点分类", ""),
+            "描述": n.get("节点描述", ""),
+            "人设匹配度": round(get_match_score(n), 2),
+            "人设全局占比": round(get_category_global_ratio(n), 2),
+        })
+
+    # Persona constants (filtered from all nodes)
+    constants = [
+        n["节点名称"]
+        for n in all_nodes
+        if is_persona_constant(n)
+    ]
+
+    return {
+        "known_nodes": known_info,
+        "unknown_nodes": unknown_info,
+        "constants": constants,
+    }
+
+
+def format_next_step_prompt(context: Dict) -> str:
+    """Format the prompt for the next-step analysis."""
+
+    known_text = ""
+    for i, n in enumerate(context["known_nodes"], 1):
+        known_text += f"{i}. {n['名称']} ({n['维度']})\n"
+        known_text += f"   Category: {n['分类']}\n"
+        known_text += f"   Description: {n['描述']}\n"
+        known_text += f"   Persona match: {n['人设匹配度']} | Persona global ratio: {n['人设全局占比']}\n"
+        if n.get("起点说明"):
+            known_text += f"   Origin note: {n['起点说明']}\n"
+        known_text += "\n"
+
+    unknown_text = ""
+    for n in context["unknown_nodes"]:
+        unknown_text += f"- {n['名称']} ({n['维度']})\n"
+        unknown_text += f"  Category: {n['分类']}\n"
+        unknown_text += f"  Description: {n['描述']}\n"
+        unknown_text += f"  Persona match: {n['人设匹配度']} | Persona global ratio: {n['人设全局占比']}\n\n"
+
+    constants = context.get("constants", [])
+    constants_text = "、".join(constants) if constants else "none"
+
+    prompt = f"""# Role
+You are a "reverse-engineering" expert on viral Xiaohongshu posts. Your task is to reconstruct the creator's thought path.
+
+# Task
+Given the known creative points, infer which unknown points the creator most likely thought of **directly next**.
+Several points may be thought of at once (if they are logically parallel).
+
+## Known points (in discovery order)
+{known_text}
+## Unknown points (to infer)
+{unknown_text}
+## Persona constants
+{constants_text}
+
+# Reasoning Constraints
+1. The creator's thinking is logical: motivation/purpose first, then form/technique
+2. Key points usually exist to support an inspiration or purpose point
+3. Persona constants are the creator's built-in style and need no inference
+4. Output only points reachable "in one direct step", not every unknown point
+
+# Output Format
+Output JSON, scoring each unknown point:
+- Key: the unknown point's name
+- Value: an object containing:
+  - `score`: 0.0-1.0 (likelihood of being thought of next)
+  - `from`: the known point it derives from (by name)
+  - `reason`: how it follows from that known point (one sentence)"""
+
+    return prompt
+
+
+async def analyze_next_step(
+    nodes: List[Dict],
+    force_llm: bool = False
+) -> Dict:
+    """
+    Run the next-step analysis.
+
+    Input: node list (mix of known and unknown)
+    Output: the most likely next-step points
+    """
+    # Split into known and unknown
+    known_nodes = [n for n in nodes if n.get("是否已知")]
+    unknown_nodes = [n for n in nodes if not n.get("是否已知")]
+
+    if not unknown_nodes:
+        return {
+            "输入上下文": {"已知点": [], "未知点": [], "人设常量": []},
+            "中间结果": {},
+            "下一步候选": [],  # same key as the non-empty return below
+        }
+
+    context = build_next_step_context(known_nodes, unknown_nodes, nodes)
+    prompt = format_next_step_prompt(context)
+
+    print(f"\n  Known points: {len(known_nodes)}")
+    print(f"  Unknown points: {len(unknown_nodes)}")
+
+    result = await analyze(
+        prompt=prompt,
+        task_name=f"{TASK_NAME}/next_step",
+        force=force_llm,
+        parse_json=True,
+    )
+
+    # Parse the result (a {name: {score, from, reason}} mapping)
+    llm_result = result.data or {}
+
+    # Build the candidate list, sorted by score
+    candidates = []
+    for name, info in llm_result.items():
+        candidates.append({
+            "节点名称": name,
+            "可能性分数": info.get("score", 0),
+            "推导来源": info.get("from", ""),
+            "推理说明": info.get("reason", ""),
+        })
+    candidates.sort(key=lambda x: x["可能性分数"], reverse=True)
+
+    return {
+        "输入上下文": {
+            "已知点": context["known_nodes"],
+            "未知点": context["unknown_nodes"],
+            "人设常量": context["constants"],
+        },
+        "中间结果": llm_result,
+        "下一步候选": candidates,
+        "cache_hit": result.cache_hit,
+        "model": result.model_name,
+        "log_url": result.log_url,
+    }
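+# Note: the empty-input early return above uses the same keys as this full
+# return, so callers can index "下一步候选" unconditionally.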
+
+
+# ===== Full pipeline =====
 
 def save_result(post_id: str, post_detail: Dict, steps: List, config: PathConfig) -> Path:
@@ -855,6 +1021,244 @@ async def process_single_post(
     # Step 3 done; save
     save_result(post_id, post_detail, steps, config)
 
+    if max_step == 3:
+        return {"帖子详情": post_detail, "步骤列表": steps}
+
+    # ===== Step 4: next-step analysis =====
+    print("\n[Step 4] Next-step analysis...")
+    next_step_result = await analyze_next_step(nodes_step3, force_llm=force_llm)
+
+    # Get the candidate list
+    candidates = next_step_result["下一步候选"]
+
+    # Keep the high-scoring candidates (>= 0.8)
+    NEXT_STEP_THRESHOLD = 0.8
+    high_score_candidates = [c for c in candidates if c["可能性分数"] >= NEXT_STEP_THRESHOLD]
+
+    # Map node name → node
+    node_by_name = {n["节点名称"]: n for n in nodes_step3}
+
+    # Find the current maximum discovery order
+    max_order = max((n.get("发现编号") or 0) for n in nodes_step3)
+
+    # Update nodes: mark high-scoring candidates as known
+    nodes_step4 = []
+    new_known_names = []
+    current_order = max_order + 1
+
+    for node in nodes_step3:
+        new_node = dict(node)
+        name = node["节点名称"]
+
+        # Is this node among the high-scoring candidates?
+        matching = [c for c in high_score_candidates if c["节点名称"] == name]
+        if matching and not node.get("是否已知"):
+            new_node["是否已知"] = True
+            new_node["发现编号"] = current_order
+            current_order += 1
+            new_known_names.append(name)
+
+        nodes_step4.append(new_node)
+
+    # Create new (derivation) edges
+    new_edges = []
+    for c in high_score_candidates:
+        target_node = node_by_name.get(c["节点名称"])
+        source_name = c["推导来源"]
+        source_node = node_by_name.get(source_name)
+        if target_node and source_node:
+            new_edges.append({
+                "来源": source_node["节点ID"],
+                "目标": target_node["节点ID"],
+                "关系类型": "AI推导",
+                "可能性分数": c["可能性分数"],
+                "推理说明": c["推理说明"],
+            })
+
+    # Merge the edge lists
+    all_edges_step4 = all_edges + new_edges
+
+    step4 = {
+        "步骤": "下一步分析",
+        "输入": {
+            "已知点": next_step_result["输入上下文"]["已知点"],
+            "未知点": next_step_result["输入上下文"]["未知点"],
+            "人设常量": next_step_result["输入上下文"]["人设常量"],
+        },
+        "中间结果": next_step_result["中间结果"],
+        "输出": {
+            "新的已知节点": new_known_names,
+            "新的边": new_edges,
+            "节点列表": nodes_step4,
+            "边列表": all_edges_step4,
+        },
+        "摘要": {
+            "已知点数": sum(1 for n in nodes_step4 if n.get("是否已知")),
+            "新已知数": len(new_known_names),
+            "新边数": len(new_edges),
+            "未知点数": sum(1 for n in nodes_step4 if not n.get("是否已知")),
+            "model": next_step_result.get("model"),
+            "cache_hit": next_step_result.get("cache_hit"),
+            "log_url": next_step_result.get("log_url"),
+        },
+    }
+    steps.append(step4)
+
+    # Print the high-scoring candidates
+    print(f"  Candidates: {len(candidates)}")
+    print(f"  High-scoring (>={NEXT_STEP_THRESHOLD}): {len(high_score_candidates)}")
+    for c in high_score_candidates:
+        print(f"    ★ {c['节点名称']} ({c['可能性分数']:.2f}) ← {c['推导来源']}")
+        print(f"      {c['推理说明']}")
+
+    # Step 4 done; save
+    save_result(post_id, post_detail, steps, config)
+
+    if max_step == 4:
+        return {"帖子详情": post_detail, "步骤列表": steps}
+
+    # ===== Loop steps 3 → 4 until everything is known =====
+    iteration = 1
+    current_nodes = nodes_step4
+    current_edges = all_edges_step4
+    MAX_ITERATIONS = 10  # guard against an infinite loop
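+    # The loop exits on any of three conditions: every node is known, the
+    # iteration cap is reached, or a full round adds no new known nodes.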
+
+    while True:
+        # Any unknown nodes left?
+        unknown_count = sum(1 for n in current_nodes if not n.get("是否已知"))
+        if unknown_count == 0:
+            print(f"\n[Done] All nodes are now known")
+            break
+
+        if iteration > MAX_ITERATIONS:
+            print(f"\n[Warning] Hit the iteration cap of {MAX_ITERATIONS}; stopping")
+            break
+
+        # ===== Iteration step 3: co-occurrence derivation =====
+        print(f"\n[Iteration {iteration} / step 3] Pattern derivation...")
+        derivation_result = derive_patterns(current_nodes, persona_co_occur)
+        nodes_iter3 = derivation_result["输出节点"]
+        edges_iter3 = derivation_result["推导边列表"]
+
+        # Count the newly derived nodes
+        prev_known_names = {n["节点名称"] for n in current_nodes if n.get("是否已知")}
+        new_known_step3 = [n["节点名称"] for n in nodes_iter3 if n.get("是否已知") and n["节点名称"] not in prev_known_names]
+        new_edges_step3 = edges_iter3  # derive_patterns returns only this round's new edges
+
+        all_edges_iter3 = current_edges + new_edges_step3
+
+        step_iter3 = {
+            "步骤": f"迭代{iteration}-模式推导",
+            "输入": {
+                "节点列表": current_nodes,
+                "人设共现关系": persona_co_occur,
+            },
+            "输出": {
+                "新的已知节点": new_known_step3,
+                "新的边": new_edges_step3,
+                "节点列表": nodes_iter3,
+                "边列表": all_edges_iter3,
+            },
+            "摘要": {
+                "已知点数": sum(1 for n in nodes_iter3 if n.get("是否已知")),
+                "新已知数": len(new_known_step3),
+                "新边数": len(new_edges_step3),
+                "未知点数": sum(1 for n in nodes_iter3 if not n.get("是否已知")),
+            },
+        }
+        steps.append(step_iter3)
+
+        print(f"  New known: {len(new_known_step3)}")
+        print(f"  New edges: {len(new_edges_step3)}")
+
+        save_result(post_id, post_detail, steps, config)
+
+        # Any unknown nodes left after step 3?
+        unknown_after_step3 = sum(1 for n in nodes_iter3 if not n.get("是否已知"))
+        if unknown_after_step3 == 0:
+            print(f"\n[Done] All nodes are now known")
+            break
+
+        # ===== Iteration step 4: AI derivation =====
+        print(f"\n[Iteration {iteration} / step 4] Next-step analysis...")
+        next_step_result = await analyze_next_step(nodes_iter3, force_llm=force_llm)
+        candidates_iter4 = next_step_result["下一步候选"]
+        high_score_iter4 = [c for c in candidates_iter4 if c["可能性分数"] >= NEXT_STEP_THRESHOLD]
+
+        # Update the nodes
+        node_by_name_iter4 = {n["节点名称"]: n for n in nodes_iter3}
+        max_order_iter4 = max((n.get("发现编号") or 0) for n in nodes_iter3)
+        nodes_iter4 = []
+        new_known_iter4 = []
+        current_order_iter4 = max_order_iter4 + 1
+
+        for node in nodes_iter3:
+            new_node = dict(node)
+            name = node["节点名称"]
+            matching = [c for c in high_score_iter4 if c["节点名称"] == name]
+            if matching and not node.get("是否已知"):
+                new_node["是否已知"] = True
+                new_node["发现编号"] = current_order_iter4
+                current_order_iter4 += 1
+                new_known_iter4.append(name)
+            nodes_iter4.append(new_node)
+
+        # Create new edges
+        new_edges_iter4 = []
+        for c in high_score_iter4:
+            target_node = node_by_name_iter4.get(c["节点名称"])
+            source_node = node_by_name_iter4.get(c["推导来源"])
+            if target_node and source_node:
+                new_edges_iter4.append({
+                    "来源": source_node["节点ID"],
+                    "目标": target_node["节点ID"],
+                    "关系类型": "AI推导",
+                    "可能性分数": c["可能性分数"],
+                    "推理说明": c["推理说明"],
+                })
+
+        all_edges_iter4 = all_edges_iter3 + new_edges_iter4
+
+        step_iter4 = {
+            "步骤": f"迭代{iteration}-下一步分析",
+            "输入": {
+                "已知点": next_step_result["输入上下文"]["已知点"],
+                "未知点": next_step_result["输入上下文"]["未知点"],
+                "人设常量": next_step_result["输入上下文"]["人设常量"],
+            },
+            "中间结果": next_step_result["中间结果"],
+            "输出": {
+                "新的已知节点": new_known_iter4,
+                "新的边": new_edges_iter4,
+                "节点列表": nodes_iter4,
+                "边列表": all_edges_iter4,
+            },
+            "摘要": {
+                "已知点数": sum(1 for n in nodes_iter4 if n.get("是否已知")),
+                "新已知数": len(new_known_iter4),
+                "新边数": len(new_edges_iter4),
+                "未知点数": sum(1 for n in nodes_iter4 if not n.get("是否已知")),
+                "model": next_step_result.get("model"),
+                "cache_hit": next_step_result.get("cache_hit"),
+            },
+        }
+        steps.append(step_iter4)
+
+        print(f"  New known: {len(new_known_iter4)}")
+        print(f"  New edges: {len(new_edges_iter4)}")
+
+        save_result(post_id, post_detail, steps, config)
+
+        # Stop if this round made no progress
+        if len(new_known_step3) == 0 and len(new_known_iter4) == 0:
+            print(f"\n[Stop] No progress this round; ending the loop")
+            break
+
+        # Carry the state into the next round
+        current_nodes = nodes_iter4
+        current_edges = all_edges_iter4
+        iteration += 1
+
     return {"帖子详情": post_detail, "步骤列表": steps}
 
 
@@ -939,13 +1343,17 @@ async def main(
             print(f"  {post_id}: nodes={step1_summary.get('节点数', 0)} (data prep only)")
         elif num_steps == 2:
             step2_summary = steps[1].get("摘要", {})
-            print(f"  {post_id}: origins={step2_summary.get('高分起点数', 0)} (not derived)")
-        elif num_steps >= 3:
-            step2_summary = steps[1].get("摘要", {})
+            print(f"  {post_id}: origins={step2_summary.get('新已知数', 0)} (not derived)")
+        elif num_steps == 3:
             step3_summary = steps[2].get("摘要", {})
-            print(f"  {post_id}: origins={step2_summary.get('高分起点数', 0)}, "
-                  f"known={step3_summary.get('已知点数', 0)}, "
-                  f"derived_edges={step3_summary.get('推导边数', 0)}")
+            print(f"  {post_id}: known={step3_summary.get('已知点数', 0)}, "
+                  f"unknown={step3_summary.get('未知点数', 0)}")
+        elif num_steps >= 4:
+            step4_summary = steps[3].get("摘要", {})
+            print(f"  {post_id}: known={step4_summary.get('已知点数', 0)}, "
+                  f"new_known={step4_summary.get('新已知数', 0)}, "
+                  f"new_edges={step4_summary.get('新边数', 0)}, "
+                  f"unknown={step4_summary.get('未知点数', 0)}")
         else:
             print(f"  {post_id}: no step data")
 
@@ -957,8 +1365,8 @@ if __name__ == "__main__":
     parser.add_argument("--post-id", type=str, help="post ID")
     parser.add_argument("--all-posts", action="store_true", help="process all posts")
     parser.add_argument("--force-llm", action="store_true", help="force a fresh LLM call (skip the LLM cache)")
-    parser.add_argument("--step", type=int, default=3, choices=[1, 2, 3],
-                        help="run up to this step (1=data prep, 2=origin analysis, 3=pattern derivation)")
+    parser.add_argument("--step", type=int, default=5, choices=[1, 2, 3, 4, 5],
+                        help="run up to this step (1=data prep, 2=origin analysis, 3=pattern derivation, 4=next-step analysis, 5=full loop)")
     args = parser.parse_args()
 
     asyncio.run(main(

+ 9 - 3
script/data_processing/build_post_graph.py

@@ -121,16 +121,20 @@ def create_node(
     dimension: str,
     node_type: str,
     name: str,
-    detail: Dict = None
+    detail: Dict = None,
+    category: str = None
+) -> Dict:
     """Create a node."""
-    return {
+    node = {
         "name": name,
         "type": node_type,
         "dimension": dimension,
         "domain": domain,
         "detail": detail or {}
     }
+    if category:
+        node["category"] = category
+    return node
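+# Illustrative call of the extended signature (argument values are made up;
+# the root categories referenced below are 意图/实质/形式):
+#   create_node(domain="帖子", dimension="灵感点", node_type="标签",
+#               name="example-tag", category="意图")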
 
 
 def create_edge(
@@ -187,6 +191,7 @@ def extract_tags_and_matches(filtered_data: Dict) -> tuple:
             tag_name = point.get("名称", "")
             tag_desc = point.get("描述", "")
             point_id = point.get("ID", "")
+            point_category = point.get("类型", "")  # root category: 意图/实质/形式 (intent/substance/form)
 
             if not tag_name:
                 continue
@@ -201,7 +206,8 @@ def extract_tags_and_matches(filtered_data: Dict) -> tuple:
                 detail={
                     "description": tag_desc,
                     "pointId": point_id
-                }
+                },
+                category=point_category
             )
 
             # 建立 ID 映射

+ 411 - 0
script/data_processing/derive_pattern_relations.py

@@ -0,0 +1,411 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Creation-pattern derivation - step 2: iterative derivation over co-occurrence relations.
+
+Input: origin-analysis results + prepared node data
+Output: derivation result (known-point set + derived relations)
+
+Algorithm:
+1. Initialize: points with score >= 0.8 in the origin analysis → known set
+2. Iterate:
+   - From the points added last round, keep those with persona-match score >= 0.8
+   - Collect the historical co-occurrence category IDs of their owning categories
+   - For each unknown point (persona match >= 0.8), check whether its owning
+     category ID appears in that co-occurrence list
+   - If it does, add it to the known set and record a relation
+3. Stop when no new points are added
+"""
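+# One derivation round on made-up data: if known point A has persona match 0.9
+# and its owning category historically co-occurs with category C2, and unknown
+# point B (persona match 0.85) belongs to C2, then B becomes known in round 1
+# and a 共现推导 relation A → B is recorded.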
+
+import json
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import Dict, List, Set, Optional
+import sys
+
+# Add the project root to sys.path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from script.data_processing.path_config import PathConfig
+
+
+# ===== Configuration =====
+ORIGIN_SCORE_THRESHOLD = 0.8  # origin-score threshold
+MATCH_SCORE_THRESHOLD = 0.8   # persona-match score threshold
+
+
+# ===== Data structures =====
+
+@dataclass
+class AnalysisNode:
+    """A node to analyze."""
+    节点ID: str
+    节点名称: str
+    节点分类: str
+    节点维度: str
+    人设匹配分数: float
+    所属分类ID: Optional[str]
+    历史共现分类: Dict[str, float] = field(default_factory=dict)  # {category ID: co-occurrence score}
+
+    @classmethod
+    def from_raw(cls, raw: Dict) -> "AnalysisNode":
+        """Construct from a raw dict."""
+        match_info = raw.get("人设匹配") or {}
+        match_score = match_info.get("匹配分数", 0)
+
+        category_info = match_info.get("所属分类") or {}
+        category_id = category_info.get("节点ID")
+
+        co_occur_list = category_info.get("历史共现分类", [])
+        co_occur_map = {
+            c.get("节点ID"): c.get("共现度", 0)
+            for c in co_occur_list
+            if c.get("节点ID")
+        }
+
+        return cls(
+            节点ID=raw.get("节点ID", ""),
+            节点名称=raw.get("节点名称", ""),
+            节点分类=raw.get("节点分类", ""),
+            节点维度=raw.get("节点维度", ""),
+            人设匹配分数=match_score,
+            所属分类ID=category_id,
+            历史共现分类=co_occur_map,
+        )
+
+
+@dataclass
+class DerivedRelation:
+    """A derived relation."""
+    来源节点ID: str
+    来源节点名称: str
+    目标节点ID: str
+    目标节点名称: str
+    关系类型: str  # "共现推导"
+    推导轮次: int
+    共现分类ID: str  # the co-occurrence category that created the link
+    共现度: float  # co-occurrence score
+
+
+@dataclass
+    """Derivation result."""
+    """推导结果"""
+    帖子ID: str
+    起点列表: List[Dict]  # {节点ID, 节点名称, 起点分数}
+    已知点列表: List[Dict]  # {节点ID, 节点名称, 加入轮次, 加入原因}
+    推导关系列表: List[Dict]  # DerivedRelation as dicts
+    推导轮次: int
+    未知点列表: List[Dict]  # points that were never derived
+
+
+# ===== Data loading =====
+
+def load_json(file_path: Path) -> Dict:
+    """Load a JSON file."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def get_origin_result_files(config: PathConfig) -> List[Path]:
+    """Return all origin-analysis result files."""
+    result_dir = config.intermediate_dir / "origin_analysis_result"
+    return sorted(result_dir.glob("*_起点分析.json"))
+
+
+def get_prepared_file(config: PathConfig, post_id: str) -> Optional[Path]:
+    """Return the prepared-data file for a post."""
+    prepared_dir = config.intermediate_dir / "origin_analysis_prepared"
+    files = list(prepared_dir.glob(f"{post_id}_待分析数据.json"))
+    return files[0] if files else None
+
+
+# ===== Core algorithm =====
+
+def derive_patterns(
+    nodes: List[AnalysisNode],
+    origin_scores: Dict[str, float],  # {node name: origin score}
+) -> DerivationResult:
+    """
+    Iterative derivation over co-occurrence relations.
+
+    Args:
+        nodes: all nodes to analyze
+        origin_scores: origin-analysis scores {node name: score}
+
+    Returns:
+        DerivationResult
+    """
+    # Build indexes
+    node_by_name: Dict[str, AnalysisNode] = {n.节点名称: n for n in nodes}
+    node_by_id: Dict[str, AnalysisNode] = {n.节点ID: n for n in nodes}
+
+    # 1. Initialize the known set (origin score >= 0.8)
+    known_names: Set[str] = set()
+    known_info: List[Dict] = []  # {节点ID, 节点名称, 加入轮次, 加入原因}
+    origins: List[Dict] = []
+
+    for name, score in origin_scores.items():
+        if score >= ORIGIN_SCORE_THRESHOLD:
+            known_names.add(name)
+            node = node_by_name.get(name)
+            if node:
+                origins.append({
+                    "节点ID": node.节点ID,
+                    "节点名称": name,
+                    "起点分数": score,
+                })
+                known_info.append({
+                    "节点ID": node.节点ID,
+                    "节点名称": name,
+                    "加入轮次": 0,
+                    "加入原因": f"起点(score={score:.2f})",
+                })
+
+    # The unknown set
+    unknown_names: Set[str] = set(node_by_name.keys()) - known_names
+
+    # Derived relations
+    relations: List[DerivedRelation] = []
+
+    # 2. Iterate
+    round_num = 0
+    new_known_this_round = known_names.copy()  # round 0's additions are the origins
+
+    while new_known_this_round:
+        round_num += 1
+        print(f"\n  Derivation round {round_num}...")
+
+        # Points newly added this round
+        new_known_next_round: Set[str] = set()
+
+        # Walk the points added in the previous round
+        for known_name in new_known_this_round:
+            known_node = node_by_name.get(known_name)
+            if not known_node:
+                continue
+
+            # Filter: persona-match score >= 0.8
+            if known_node.人设匹配分数 < MATCH_SCORE_THRESHOLD:
+                continue
+
+            # Historical co-occurrence categories {ID: score}
+            co_occur_map = known_node.历史共现分类
+            if not co_occur_map:
+                continue
+
+            # Walk the unknown points
+            for unknown_name in list(unknown_names):
+                unknown_node = node_by_name.get(unknown_name)
+                if not unknown_node:
+                    continue
+
+                # Filter: persona-match score >= 0.8
+                if unknown_node.人设匹配分数 < MATCH_SCORE_THRESHOLD:
+                    continue
+
+                # Check: is the unknown point's owning category ID in the co-occurrence list?
+                if unknown_node.所属分类ID and unknown_node.所属分类ID in co_occur_map:
+                    # Found a link!
+                    co_occur_score = co_occur_map[unknown_node.所属分类ID]
+                    new_known_next_round.add(unknown_name)
+
+                    # Record the relation
+                    relations.append(DerivedRelation(
+                        来源节点ID=known_node.节点ID,
+                        来源节点名称=known_name,
+                        目标节点ID=unknown_node.节点ID,
+                        目标节点名称=unknown_name,
+                        关系类型="共现推导",
+                        推导轮次=round_num,
+                        共现分类ID=unknown_node.所属分类ID,
+                        共现度=co_occur_score,
+                    ))
+
+                    print(f"    {known_name} → {unknown_name} (co-occurrence: {co_occur_score:.2f})")
+
+        # Update the sets
+        for name in new_known_next_round:
+            node = node_by_name.get(name)
+            if node:
+                known_info.append({
+                    "节点ID": node.节点ID,
+                    "节点名称": name,
+                    "加入轮次": round_num,
+                    "加入原因": "共现推导",
+                })
+
+        known_names.update(new_known_next_round)
+        unknown_names -= new_known_next_round
+        new_known_this_round = new_known_next_round
+
+        if not new_known_next_round:
+            print(f"    No new points added; derivation finished")
+            break
+
+    # 3. Build the unknown-point list
+    unknown_list = []
+    for name in unknown_names:
+        node = node_by_name.get(name)
+        if node:
+            unknown_list.append({
+                "节点ID": node.节点ID,
+                "节点名称": name,
+                "节点维度": node.节点维度,
+                "人设匹配分数": node.人设匹配分数,
+                "未加入原因": "人设匹配分数不足" if node.人设匹配分数 < MATCH_SCORE_THRESHOLD else "无共现关联",
+            })
+
+    return DerivationResult(
+        帖子ID="",  # set by the caller
+        起点列表=origins,
+        已知点列表=known_info,
+        推导关系列表=[asdict(r) for r in relations],
+        推导轮次=round_num,
+        未知点列表=unknown_list,
+    )
+
+
+# ===== Processing functions =====
+
+def process_single_post(
+    origin_file: Path,
+    config: PathConfig,
+) -> Optional[Dict]:
+    """Process a single post."""
+    # Load the origin-analysis result
+    origin_data = load_json(origin_file)
+    post_id = origin_data.get("帖子id", "unknown")
+
+    print(f"\n{'=' * 60}")
+    print(f"Processing post: {post_id}")
+    print("-" * 60)
+
+    # Origin scores
+    origin_output = origin_data.get("输出", {})
+    if not origin_output:
+        print("  Error: origin-analysis output is empty")
+        return None
+
+    origin_scores = {name: info.get("score", 0) for name, info in origin_output.items()}
+
+    # Load the prepared data (full node info)
+    prepared_file = get_prepared_file(config, post_id)
+    if not prepared_file:
+        print(f"  Error: prepared-data file not found")
+        return None
+
+    prepared_data = load_json(prepared_file)
+    raw_nodes = prepared_data.get("待分析节点列表", [])
+
+    # Convert to AnalysisNode
+    nodes = [AnalysisNode.from_raw(raw) for raw in raw_nodes]
+    print(f"  Nodes: {len(nodes)}")
+
+    # Show the origins
+    origins = [(name, score) for name, score in origin_scores.items() if score >= ORIGIN_SCORE_THRESHOLD]
+    print(f"  Origins (score >= {ORIGIN_SCORE_THRESHOLD}): {len(origins)}")
+    for name, score in sorted(origins, key=lambda x: -x[1]):
+        print(f"    ★ {name}: {score:.2f}")
+
+    # Run the derivation
+    result = derive_patterns(nodes, origin_scores)
+    result.帖子ID = post_id
+
+    # Show the result
+    print(f"\n  Derivation rounds: {result.推导轮次}")
+    print(f"  Known points: {len(result.已知点列表)}")
+    print(f"  Derived relations: {len(result.推导关系列表)}")
+    print(f"  Unknown points: {len(result.未知点列表)}")
+
+    # Save the result
+    output_dir = config.intermediate_dir / "pattern_derivation"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    output_file = output_dir / f"{post_id}_模式推导.json"
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(asdict(result), f, ensure_ascii=False, indent=2)
+
+    print(f"\n  Saved: {output_file.name}")
+
+    return asdict(result)
+
+
+# ===== Main =====
+
+def main(
+    post_id: str = None,
+    all_posts: bool = False,
+):
+    """
+    Entry point.
+
+    Args:
+        post_id: post ID, optional
+        all_posts: process every post
+    """
+    config = PathConfig()
+
+    print(f"Account: {config.account_name}")
+    print(f"Origin-score threshold: {ORIGIN_SCORE_THRESHOLD}")
+    print(f"Match-score threshold: {MATCH_SCORE_THRESHOLD}")
+
+    # Collect the origin-analysis result files
+    origin_files = get_origin_result_files(config)
+    if not origin_files:
+        print("Error: no origin-analysis results found; run analyze_creation_origin.py first")
+        return
+
+    # Decide which posts to process
+    if post_id:
+        target_file = next(
+            (f for f in origin_files if post_id in f.name),
+            None
+        )
+        if not target_file:
+            print(f"Error: no origin-analysis result for post {post_id}")
+            return
+        files_to_process = [target_file]
+    elif all_posts:
+        files_to_process = origin_files
+    else:
+        files_to_process = [origin_files[0]]
+
+    print(f"Posts to process: {len(files_to_process)}")
+
+    # Process
+    results = []
+    for i, origin_file in enumerate(files_to_process, 1):
+        print(f"\n{'#' * 60}")
+        print(f"# Post {i}/{len(files_to_process)}")
+        print(f"{'#' * 60}")
+
+        result = process_single_post(origin_file, config)
+        if result:
+            results.append(result)
+
+    # Summary
+    print(f"\n{'#' * 60}")
+    print(f"# Done! Processed {len(results)} posts")
+    print(f"{'#' * 60}")
+
+    print("\nSummary:")
+    for result in results:
+        post_id = result.get("帖子ID")
+        known_count = len(result.get("已知点列表", []))
+        relation_count = len(result.get("推导关系列表", []))
+        unknown_count = len(result.get("未知点列表", []))
+        print(f"  {post_id}: known={known_count}, relations={relation_count}, unknown={unknown_count}")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Creation-pattern derivation")
+    parser.add_argument("--post-id", type=str, help="post ID")
+    parser.add_argument("--all-posts", action="store_true", help="process all posts")
+    args = parser.parse_args()
+
+    main(
+        post_id=args.post_id,
+        all_posts=args.all_posts,
+    )

+ 453 - 0
script/data_processing/prepare_origin_analysis.py

@@ -0,0 +1,453 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Creation-origin analysis - data preparation.
+
+Step 1: compress the post graph + persona graph into the nodes to analyze.
+
+Input: post graph + persona graph
+Output: prepared analysis data
+"""
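+# Likely invocation (the CLI block is outside this excerpt; flags assumed to
+# mirror the sibling scripts):
+#   python script/data_processing/prepare_origin_analysis.py --post-id <id>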
+
+import json
+from pathlib import Path
+from typing import Dict, List, Optional
+import sys
+
+# Add the project root to sys.path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from script.data_processing.path_config import PathConfig
+
+
+# ===== Data-loading functions =====
+
+def load_json(file_path: Path) -> Dict:
+    """Load a JSON file."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def get_post_graph_files(config: PathConfig) -> List[Path]:
+    """Return all post-graph files."""
+    post_graph_dir = config.intermediate_dir / "post_graph"
+    return sorted(post_graph_dir.glob("*_帖子图谱.json"))
+
+
+# ===== Data-extraction functions =====
+
+def extract_post_detail(post_graph: Dict) -> Dict:
+    """Extract post details (keep original field names)."""
+    meta = post_graph.get("meta", {})
+    post_detail = meta.get("postDetail", {})
+
+    return {
+        "postId": meta.get("postId", ""),
+        "postTitle": meta.get("postTitle", ""),
+        "body_text": post_detail.get("body_text", ""),
+        "images": post_detail.get("images", []),
+        "video": post_detail.get("video"),
+        "publish_time": post_detail.get("publish_time", ""),
+        "like_count": post_detail.get("like_count", 0),
+        "collect_count": post_detail.get("collect_count", 0),
+    }
+
+
+def extract_analysis_nodes(post_graph: Dict, persona_graph: Dict) -> tuple:
+    """
+    Extract the list of nodes to analyze.
+
+    Nodes to analyze = inspiration points + purpose points + key points
+    (the key points double as supporting info for the other two)
+    """
+    nodes = post_graph.get("nodes", {})
+    edges = post_graph.get("edges", {})
+    persona_nodes = persona_graph.get("nodes", {})
+    persona_index = persona_graph.get("index", {})
+
+    # 1. Collect key-point info (used as supporting information)
+    keypoints = {}
+    for node_id, node in nodes.items():
+        if node.get("type") == "标签" and node.get("dimension") == "关键点":
+            keypoints[node_id] = {
+                "名称": node.get("name", ""),
+                "描述": node.get("detail", {}).get("description", ""),
+            }
+
+    # 2. Analyze support relations: key point → inspiration/purpose point
+    support_map = {}  # {target_node_id: [supporting key-point info]}
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "支撑":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            if source_id in keypoints:
+                if target_id not in support_map:
+                    support_map[target_id] = []
+                support_map[target_id].append(keypoints[source_id])
+
+    # 3. Analyze association relations
+    relation_map = {}  # {node_id: [names of associated nodes]}
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "关联":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            source_name = nodes.get(source_id, {}).get("name", "")
+            target_name = nodes.get(target_id, {}).get("name", "")
+
+            # Record in both directions
+            if source_id not in relation_map:
+                relation_map[source_id] = []
+            relation_map[source_id].append(target_name)
+
+            if target_id not in relation_map:
+                relation_map[target_id] = []
+            relation_map[target_id].append(source_name)
+
+    # 4. Analyze persona matches
+    match_map = {}  # {node_id: match info}
+    persona_out_edges = persona_index.get("outEdges", {})
+
+    def get_node_info(node_id: str) -> Optional[Dict]:
+        """Return the standard info of a persona node."""
+        node = persona_nodes.get(node_id, {})
+        if not node:
+            return None
+        detail = node.get("detail", {})
+        parent_path = detail.get("parentPath", [])
+        return {
+            "节点ID": node_id,
+            "节点名称": node.get("name", ""),
+            "节点分类": "/".join(parent_path) if parent_path else "",
+            "节点维度": node.get("dimension", ""),
+            "节点类型": node.get("type", ""),
+            "人设全局占比": detail.get("probGlobal", 0),
+            "父类下占比": detail.get("probToParent", 0),
+        }
+
+    def get_parent_category_id(node_id: str) -> Optional[str]:
+        """Follow the 属于 (belongs-to) edge to the parent category node ID."""
+        belong_edges = persona_out_edges.get(node_id, {}).get("属于", [])
+        for edge in belong_edges:
+            target_id = edge.get("target", "")
+            target_node = persona_nodes.get(target_id, {})
+            if target_node.get("type") == "分类":
+                return target_id
+        return None
+
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "匹配":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+
+            # Only handle matches from post nodes → persona nodes
+            if source_id.startswith("帖子:") and target_id.startswith("人设:"):
+                match_score = edge.get("score", 0)
+                persona_node = persona_nodes.get(target_id, {})
+
+                if persona_node:
+                    node_type = persona_node.get("type", "")
+
+                    # Get the matched node's info
+                    match_node_info = get_node_info(target_id)
+                    if not match_node_info:
+                        continue
+
+                    # Determine the owning category node
+                    if node_type == "标签":
+                        # Tag: use its parent category
+                        category_id = get_parent_category_id(target_id)
+                    else:
+                        # Category: the node itself
+                        category_id = target_id
+
+                    # Get the owning category's info and common co-occurrences
+                    category_info = None
+                    if category_id:
+                        category_node = persona_nodes.get(category_id, {})
+                        if category_node:
+                            category_detail = category_node.get("detail", {})
+                            category_path = category_detail.get("parentPath", [])
+                            category_info = {
+                                "节点ID": category_id,
+                                "节点名称": category_node.get("name", ""),
+                                "节点分类": "/".join(category_path) if category_path else "",
+                                "节点维度": category_node.get("dimension", ""),
+                                "节点类型": "分类",
+                                "人设全局占比": category_detail.get("probGlobal", 0),
+                                "父类下占比": category_detail.get("probToParent", 0),
+                                "历史共现分类": [],
+                            }
+
+                            # Category co-occurrence nodes, sorted by score (descending)
+                            co_occur_edges = persona_out_edges.get(category_id, {}).get("分类共现", [])
+                            co_occur_edges_sorted = sorted(co_occur_edges, key=lambda x: x.get("score", 0), reverse=True)
+                            for co_edge in co_occur_edges_sorted[:5]:  # keep the top 5
+                                co_target_id = co_edge.get("target", "")
+                                co_score = co_edge.get("score", 0)
+                                co_node = persona_nodes.get(co_target_id, {})
+                                if co_node:
+                                    co_detail = co_node.get("detail", {})
+                                    co_path = co_detail.get("parentPath", [])
+                                    category_info["历史共现分类"].append({
+                                        "节点ID": co_target_id,
+                                        "节点名称": co_node.get("name", ""),
+                                        "节点分类": "/".join(co_path) if co_path else "",
+                                        "节点维度": co_node.get("dimension", ""),
+                                        "节点类型": "分类",
+                                        "人设全局占比": co_detail.get("probGlobal", 0),
+                                        "父类下占比": co_detail.get("probToParent", 0),
+                                        "共现度": round(co_score, 4),
+                                    })
+
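+                    # If a post node has several match edges, later ones overwrite
+                    # earlier entries here; only one match is kept per node.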
+                    match_map[source_id] = {
+                        "匹配节点": match_node_info,
+                        "匹配分数": round(match_score, 4),
+                        "所属分类": category_info,
+                    }
+
+    # 5. Build the list of nodes to analyze (inspiration, purpose, and key points)
+    analysis_nodes = []
+    for node_id, node in nodes.items():
+        if node.get("type") == "标签" and node.get("domain") == "帖子":
+            dimension = node.get("dimension", "")
+            if dimension in ["灵感点", "目的点", "关键点"]:
+                # Persona match info (None if no match edge)
+                match_info = match_map.get(node_id)
+
+                analysis_nodes.append({
+                    "节点ID": node_id,
+                    "节点名称": node.get("name", ""),
+                    "节点分类": node.get("category", ""),  # 根分类:意图/实质/形式
+                    "节点维度": dimension,
+                    "节点类型": node.get("type", ""),
+                    "节点描述": node.get("detail", {}).get("description", ""),
+                    "人设匹配": match_info,
+                })
+
+    # 6. Build the list of candidate relations
+    relation_list = []
+
+    # Support relations: key point → inspiration/purpose point
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "支撑":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            if source_id in keypoints:
+                relation_list.append({
+                    "来源节点": source_id,
+                    "目标节点": target_id,
+                    "关系类型": "支撑",
+                })
+
+    # Association relations between nodes (deduplicated; each pair recorded once)
+    seen_relations = set()
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "关联":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            # Deduplicate using the sorted node-ID pair as key
+            key = tuple(sorted([source_id, target_id]))
+            if key not in seen_relations:
+                seen_relations.add(key)
+                relation_list.append({
+                    "来源节点": source_id,
+                    "目标节点": target_id,
+                    "关系类型": "关联",
+                })
+
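+    # Illustrative shape of one analysis_nodes entry (placeholder values):
+    #   {"节点ID": "...", "节点名称": "...", "节点分类": "意图", "节点维度": "灵感点",
+    #    "节点类型": "标签", "节点描述": "...", "人设匹配": {...} or None}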
+    return analysis_nodes, relation_list
+
+
+def prepare_analysis_data(post_graph: Dict, persona_graph: Dict) -> Dict:
+    """
+    Prepare the complete analysis data.
+
+    Returns:
+        {
+            "帖子详情": {...},
+            "待分析节点列表": [...],
+            "可能的关系列表": [...]
+        }
+    """
+    analysis_nodes, relation_list = extract_analysis_nodes(post_graph, persona_graph)
+    return {
+        "帖子详情": extract_post_detail(post_graph),
+        "待分析节点列表": analysis_nodes,
+        "可能的关系列表": relation_list,
+    }
+
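+# A minimal standalone sketch of the data-preparation step (assumes the graph
+# files already exist under config.intermediate_dir; see main() for the full flow):
+#
+#     config = PathConfig()
+#     persona = load_json(config.intermediate_dir / "人设图谱.json")
+#     post = load_json(get_post_graph_files(config)[0])
+#     data = prepare_analysis_data(post, persona)
+#     print(len(data["待分析节点列表"]), "nodes,", len(data["可能的关系列表"]), "relations")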
+
+# ===== Display helpers =====
+
+def display_prepared_data(data: Dict):
+    """显示准备好的数据"""
+    post = data["帖子详情"]
+    nodes = data["待分析节点列表"]
+    relations = data["可能的关系列表"]
+
+    print(f"\n帖子: {post['postId']}")
+    print(f"标题: {post['postTitle']}")
+    print(f"正文: {post['body_text'][:100]}...")
+
+    print(f"\n待分析节点 ({len(nodes)} 个):")
+    for node in nodes:
+        match = node.get("人设匹配")
+        category = node.get('节点分类', '')
+        print(f"  - [{node['节点ID']}] {node['节点名称']} ({node['节点维度']}/{category})")
+
+        if match:
+            match_node = match.get("匹配节点", {})
+            category_node = match.get("所属分类", {})
+            print(f"    匹配: {match_node.get('节点名称', '')} ({match_node.get('节点类型', '')}, 全局占比={match_node.get('人设全局占比', 0):.2%})")
+            if category_node:
+                co_count = len(category_node.get("历史共现分类", []))
+                print(f"    所属分类: {category_node.get('节点名称', '')} (全局占比={category_node.get('人设全局占比', 0):.2%}, {co_count}个历史共现分类)")
+        else:
+            print(f"    人设: 无匹配")
+
+    print(f"\n可能的关系 ({len(relations)} 条):")
+    for rel in relations:
+        rel_type = rel["关系类型"]
+        if rel_type == "支撑":
+            print(f"  - {rel['来源节点']} → {rel['目标节点']} [支撑]")
+        else:
+            print(f"  - {rel['来源节点']} ↔ {rel['目标节点']} [关联]")
+
+
+# ===== Processing =====
+
+def process_single_post(
+    post_file: Path,
+    persona_graph: Dict,
+    config: PathConfig,
+    save: bool = True,
+) -> Dict:
+    """
+    处理单个帖子
+
+    Args:
+        post_file: 帖子图谱文件路径
+        persona_graph: 人设图谱数据
+        config: 路径配置
+        save: 是否保存结果
+
+    Returns:
+        准备好的分析数据
+    """
+    # Load the post graph
+    post_graph = load_json(post_file)
+    post_id = post_graph.get("meta", {}).get("postId", "unknown")
+
+    print(f"\n{'=' * 60}")
+    print(f"Processing post: {post_id}")
+    print("-" * 60)
+
+    # Prepare the analysis data
+    data = prepare_analysis_data(post_graph, persona_graph)
+
+    # Display a summary
+    display_prepared_data(data)
+
+    # Save the prepared data
+    if save:
+        output_dir = config.intermediate_dir / "origin_analysis_prepared"
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        output_file = output_dir / f"{post_id}_待分析数据.json"
+        with open(output_file, "w", encoding="utf-8") as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+
+        print(f"\nSaved: {output_file.name}")
+
+    return data
+
+
+# ===== Main =====
+
+def main(
+    post_id: Optional[str] = None,
+    all_posts: bool = False,
+    save: bool = True,
+):
+    """
+    Entry point for step 1 (data preparation).
+
+    Args:
+        post_id: optional post ID; when given, only the matching post is processed
+        all_posts: process all posts
+        save: whether to save the results
+    """
+    config = PathConfig()
+
+    print(f"账号: {config.account_name}")
+
+    # 加载人设图谱
+    persona_graph_file = config.intermediate_dir / "人设图谱.json"
+    if not persona_graph_file.exists():
+        print(f"错误: 人设图谱文件不存在: {persona_graph_file}")
+        return
+
+    persona_graph = load_json(persona_graph_file)
+    print(f"人设图谱节点数: {len(persona_graph.get('nodes', {}))}")
+
+    # 获取帖子图谱文件
+    post_graph_files = get_post_graph_files(config)
+    if not post_graph_files:
+        print("错误: 没有找到帖子图谱文件")
+        return
+
+    # Decide which posts to process
+    if post_id:
+        target_file = next(
+            (f for f in post_graph_files if post_id in f.name),
+            None,
+        )
+        if not target_file:
+            print(f"Error: post {post_id} not found")
+            return
+        files_to_process = [target_file]
+    elif all_posts:
+        files_to_process = post_graph_files
+    else:
+        # Default: process only the first post
+        files_to_process = [post_graph_files[0]]
+
+    print(f"Posts to process: {len(files_to_process)}")
+
+    # Process each post
+    results = []
+    for i, post_file in enumerate(files_to_process, 1):
+        print(f"\n{'#' * 60}")
+        print(f"# Processing post {i}/{len(files_to_process)}")
+        print(f"{'#' * 60}")
+
+        data = process_single_post(
+            post_file=post_file,
+            persona_graph=persona_graph,
+            config=config,
+            save=save,
+        )
+        results.append(data)
+
+    print(f"\n{'#' * 60}")
+    print(f"# 完成! 共处理 {len(results)} 个帖子")
+    print(f"{'#' * 60}")
+
+    return results
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Creation origin analysis - data preparation")
+    parser.add_argument("--post-id", type=str, help="post ID to process")
+    parser.add_argument("--all-posts", action="store_true", help="process all posts")
+    parser.add_argument("--no-save", action="store_true", help="do not save results")
+    args = parser.parse_args()
+
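+    # Example invocations (<POST_ID> is a placeholder):
+    #   python script/data_processing/analyze_creation_origin.py                  # first post only
+    #   python script/data_processing/analyze_creation_origin.py --all-posts
+    #   python script/data_processing/analyze_creation_origin.py --post-id <POST_ID> --no-save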
+    main(
+        post_id=args.post_id,
+        all_posts=args.all_posts,
+        save=not args.no_save,
+    )