@@ -0,0 +1,969 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Creation pattern analysis (full pipeline).
+
+Integrates a three-step flow:
+1. Data preparation: extract the data to analyze from the post graph + persona graph
+2. Origin analysis: LLM analysis of the creative starting points
+3. Pattern derivation: iterative derivation based on co-occurrence relations
+
+Input: post graph + persona graph
+Output: complete creation-pattern analysis result
+"""
+
+import asyncio
+import json
+from pathlib import Path
+from typing import Dict, List, Optional, Set
+import sys
+
+# Add the project root to sys.path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from lib.llm_cached import analyze, LLMConfig, AnalyzeResult
+from lib.my_trace import set_trace_smith as set_trace
+from script.data_processing.path_config import PathConfig
+
+
+# ===== Configuration =====
+TASK_NAME = "creation_pattern"  # cache task name
+
+MATCH_SCORE_THRESHOLD = 0.8   # persona match score threshold
+GLOBAL_RATIO_THRESHOLD = 0.8  # persona global-ratio threshold
+ORIGIN_SCORE_THRESHOLD = 0.8  # origin score threshold
+
+
+# ===== Data loading =====
+
+def load_json(file_path: Path) -> Dict:
+    """Load a JSON file."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def get_post_graph_files(config: PathConfig) -> List[Path]:
+    """Return all post graph files, sorted by name."""
+    post_graph_dir = config.intermediate_dir / "post_graph"
+    return sorted(post_graph_dir.glob("*_帖子图谱.json"))
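+
+# Post graphs are expected under config.intermediate_dir / "post_graph" as
+# "<postId>_帖子图谱.json" (the glob above); main() loads the persona graph
+# separately as "人设图谱.json" from the same intermediate directory.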
+
+
+# ===== Step 1: data preparation =====
+
+def extract_post_detail(post_graph: Dict) -> Dict:
+    """Extract the post detail fields."""
+    meta = post_graph.get("meta", {})
+    post_detail = meta.get("postDetail", {})
+
+    return {
+        "postId": meta.get("postId", ""),
+        "postTitle": meta.get("postTitle", ""),
+        "body_text": post_detail.get("body_text", ""),
+        "images": post_detail.get("images", []),
+        "video": post_detail.get("video"),
+        "publish_time": post_detail.get("publish_time", ""),
+        "like_count": post_detail.get("like_count", 0),
+        "collect_count": post_detail.get("collect_count", 0),
+    }
+
+
+def extract_analysis_nodes(post_graph: Dict, persona_graph: Dict) -> tuple:
+    """
+    Extract the list of nodes to analyze.
+
+    Nodes to analyze = inspiration points (灵感点) + purpose points (目的点) + key points (关键点).
+    """
+    nodes = post_graph.get("nodes", {})
+    edges = post_graph.get("edges", {})
+    persona_nodes = persona_graph.get("nodes", {})
+    persona_index = persona_graph.get("index", {})
+
+    # 1. Collect key-point info
+    keypoints = {}
+    for node_id, node in nodes.items():
+        if node.get("type") == "标签" and node.get("dimension") == "关键点":
+            keypoints[node_id] = {
+                "名称": node.get("name", ""),
+                "描述": node.get("detail", {}).get("description", ""),
+            }
+
+    # 2. Analyze "支撑" (support) edges
+    support_map = {}
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "支撑":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            if source_id in keypoints:
+                if target_id not in support_map:
+                    support_map[target_id] = []
+                support_map[target_id].append(keypoints[source_id])
+
+    # 3. Analyze "关联" (association) edges
+    relation_map = {}
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "关联":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            source_name = nodes.get(source_id, {}).get("name", "")
+            target_name = nodes.get(target_id, {}).get("name", "")
+
+            if source_id not in relation_map:
+                relation_map[source_id] = []
+            relation_map[source_id].append(target_name)
+
+            if target_id not in relation_map:
+                relation_map[target_id] = []
+            relation_map[target_id].append(source_name)
+
+    # 4. Analyze persona matches
+    match_map = {}
+    persona_out_edges = persona_index.get("outEdges", {})
+
+    def get_node_info(node_id: str) -> Optional[Dict]:
+        """Return the standard info dict for a persona node."""
+        node = persona_nodes.get(node_id, {})
+        if not node:
+            return None
+        detail = node.get("detail", {})
+        parent_path = detail.get("parentPath", [])
+        return {
+            "节点ID": node_id,
+            "节点名称": node.get("name", ""),
+            "节点分类": "/".join(parent_path) if parent_path else "",
+            "节点维度": node.get("dimension", ""),
+            "节点类型": node.get("type", ""),
+            "人设全局占比": detail.get("probGlobal", 0),
+            "父类下占比": detail.get("probToParent", 0),
+        }
+
+    def get_parent_category_id(node_id: str) -> Optional[str]:
+        """Follow the "属于" (belongs-to) edge to the parent category node ID."""
+        belong_edges = persona_out_edges.get(node_id, {}).get("属于", [])
+        for edge in belong_edges:
+            target_id = edge.get("target", "")
+            target_node = persona_nodes.get(target_id, {})
+            if target_node.get("type") == "分类":
+                return target_id
+        return None
+
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "匹配":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+
+            if source_id.startswith("帖子:") and target_id.startswith("人设:"):
+                match_score = edge.get("score", 0)
+                persona_node = persona_nodes.get(target_id, {})
+
+                if persona_node:
+                    node_type = persona_node.get("type", "")
+                    match_node_info = get_node_info(target_id)
+                    if not match_node_info:
+                        continue
+
+                    if node_type == "标签":
+                        category_id = get_parent_category_id(target_id)
+                    else:
+                        category_id = target_id
+
+                    category_info = None
+                    if category_id:
+                        category_node = persona_nodes.get(category_id, {})
+                        if category_node:
+                            category_detail = category_node.get("detail", {})
+                            category_path = category_detail.get("parentPath", [])
+                            category_info = {
+                                "节点ID": category_id,
+                                "节点名称": category_node.get("name", ""),
+                                "节点分类": "/".join(category_path) if category_path else "",
+                                "节点维度": category_node.get("dimension", ""),
+                                "节点类型": "分类",
+                                "人设全局占比": category_detail.get("probGlobal", 0),
+                                "父类下占比": category_detail.get("probToParent", 0),
+                                "历史共现分类": [],
+                            }
+
+                            co_occur_edges = persona_out_edges.get(category_id, {}).get("分类共现", [])
+                            co_occur_edges_sorted = sorted(co_occur_edges, key=lambda x: x.get("score", 0), reverse=True)
+                            for co_edge in co_occur_edges_sorted[:5]:
+                                co_target_id = co_edge.get("target", "")
+                                co_score = co_edge.get("score", 0)
+                                co_node = persona_nodes.get(co_target_id, {})
+                                if co_node:
+                                    co_detail = co_node.get("detail", {})
+                                    co_path = co_detail.get("parentPath", [])
+                                    category_info["历史共现分类"].append({
+                                        "节点ID": co_target_id,
+                                        "节点名称": co_node.get("name", ""),
+                                        "节点分类": "/".join(co_path) if co_path else "",
+                                        "节点维度": co_node.get("dimension", ""),
+                                        "节点类型": "分类",
+                                        "人设全局占比": co_detail.get("probGlobal", 0),
+                                        "父类下占比": co_detail.get("probToParent", 0),
+                                        "共现度": round(co_score, 4),
+                                    })
+
+                    if source_id not in match_map:
+                        match_map[source_id] = []
+                    match_map[source_id].append({
+                        "匹配节点": match_node_info,
+                        "匹配分数": round(match_score, 4),
+                        "所属分类": category_info,
+                    })
+
+    # 5. Build the list of nodes to analyze
+    analysis_nodes = []
+    for node_id, node in nodes.items():
+        if node.get("type") == "标签" and node.get("domain") == "帖子":
+            dimension = node.get("dimension", "")
+            if dimension in ["灵感点", "目的点", "关键点"]:
+                match_info = match_map.get(node_id)
+
+                analysis_nodes.append({
+                    "节点ID": node_id,
+                    "节点名称": node.get("name", ""),
+                    "节点分类": node.get("category", ""),
+                    "节点维度": dimension,
+                    "节点类型": node.get("type", ""),
+                    "节点描述": node.get("detail", {}).get("description", ""),
+                    "人设匹配": match_info,
+                })
+
+    # 6. Build the relation list
+    relation_list = []
+
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "支撑":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            if source_id in keypoints:
+                relation_list.append({
+                    "来源节点": source_id,
+                    "目标节点": target_id,
+                    "关系类型": "支撑",
+                })
+
+    seen_relations = set()
+    for edge_id, edge in edges.items():
+        if edge.get("type") == "关联":
+            source_id = edge.get("source", "")
+            target_id = edge.get("target", "")
+            key = tuple(sorted([source_id, target_id]))
+            if key not in seen_relations:
+                seen_relations.add(key)
+                relation_list.append({
+                    "来源节点": source_id,
+                    "目标节点": target_id,
+                    "关系类型": "关联",
+                })
+
+    return analysis_nodes, relation_list
+
+
+def prepare_analysis_data(post_graph: Dict, persona_graph: Dict) -> Dict:
+    """
+    Prepare the full analysis data.
+
+    Outputs a flattened node list plus standalone persona co-occurrence data.
+    """
+    analysis_nodes, relation_list = extract_analysis_nodes(post_graph, persona_graph)
+
+    # Flatten nodes and pull the persona co-occurrence data out
+    flat_nodes = []
+    persona_co_occur = {}  # {category ID: list of co-occurring category info dicts}
+
+    for node in analysis_nodes:
+        # Base node fields
+        flat_node = {
+            "节点ID": node["节点ID"],
+            "节点名称": node["节点名称"],
+            "节点分类": node.get("节点分类", ""),
+            "节点维度": node["节点维度"],
+            "节点描述": node.get("节点描述", ""),
+            "是否已知": False,
+            "发现编号": None,
+        }
+
+        # Extract persona match info (a list, so multiple matches are supported)
+        match_list = node.get("人设匹配") or []
+        if match_list:
+            flat_node["人设匹配"] = []
+            for match_info in match_list:
+                category_info = match_info.get("所属分类")
+                category_id = category_info.get("节点ID") if category_info else None
+
+                # Keep the full match info, but strip 历史共现分类 (moved out below)
+                clean_match = {
+                    "匹配节点": match_info.get("匹配节点"),
+                    "匹配分数": match_info.get("匹配分数", 0),
+                }
+                if category_info:
+                    # Copy the category info without 历史共现分类
+                    clean_category = {k: v for k, v in category_info.items() if k != "历史共现分类"}
+                    clean_match["所属分类"] = clean_category
+
+                flat_node["人设匹配"].append(clean_match)
+
+                # Collect persona co-occurrence relations (deduplicated), split out of 历史共现分类
+                if category_id and category_id not in persona_co_occur:
+                    co_occur_list = category_info.get("历史共现分类", [])
+                    if co_occur_list:
+                        persona_co_occur[category_id] = [
+                            {
+                                "节点ID": c.get("节点ID"),
+                                "节点名称": c.get("节点名称"),
+                                "节点分类": c.get("节点分类", ""),
+                                "节点维度": c.get("节点维度", ""),
+                                "节点类型": c.get("节点类型", ""),
+                                "人设全局占比": c.get("人设全局占比", 0),
+                                "父类下占比": c.get("父类下占比", 0),
+                                "共现度": c.get("共现度", 0),
+                            }
+                            for c in co_occur_list
+                            if c.get("节点ID")
+                        ]
+        else:
+            flat_node["人设匹配"] = []
+
+        flat_nodes.append(flat_node)
+
+    return {
+        "帖子详情": extract_post_detail(post_graph),
+        "节点列表": flat_nodes,
+        "关系列表": relation_list,
+        "人设共现关系": persona_co_occur,
+    }
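+
+# Illustrative shape of the dict returned by prepare_analysis_data (IDs and
+# values invented; keys follow the code above):
+#   {
+#     "帖子详情": {"postId": "...", "postTitle": "...", ...},
+#     "节点列表": [{"节点ID": "...", "节点名称": "...", "节点维度": "灵感点",
+#                  "是否已知": False, "发现编号": None, "人设匹配": [...]}, ...],
+#     "关系列表": [{"来源节点": "...", "目标节点": "...", "关系类型": "支撑"}, ...],
+#     "人设共现关系": {"<分类ID>": [{"节点ID": "...", "共现度": 0.62, ...}, ...]},
+#   }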
+
+
+# ===== Step 2: origin analysis =====
+
+def get_best_match(node: Dict) -> Optional[Dict]:
+    """Return the node's best persona match (highest score)."""
+    match_list = node.get("人设匹配") or []
+    if not match_list:
+        return None
+    return max(match_list, key=lambda m: m.get("匹配分数", 0))
+
+
+def get_match_score(node: Dict) -> float:
+    """Return the node's highest persona match score."""
+    best_match = get_best_match(node)
+    if best_match:
+        return best_match.get("匹配分数", 0)
+    return 0
+
+
+def get_category_id(node: Dict) -> Optional[str]:
+    """Return the category ID of the node's best match."""
+    best_match = get_best_match(node)
+    if best_match:
+        category = best_match.get("所属分类")
+        if category:
+            return category.get("节点ID")
+    return None
+
+
+def get_all_category_ids(node: Dict) -> List[str]:
+    """Return the category IDs of all of the node's matches."""
+    match_list = node.get("人设匹配") or []
+    result = []
+    for m in match_list:
+        category = m.get("所属分类")
+        if category and category.get("节点ID"):
+            result.append(category.get("节点ID"))
+    return result
+
+
+def get_category_global_ratio(node: Dict) -> float:
+    """Return the persona-global ratio of the best match's category."""
+    best_match = get_best_match(node)
+    if best_match:
+        category = best_match.get("所属分类")
+        if category:
+            return category.get("人设全局占比", 0)
+    return 0
+
+
+def is_persona_constant(node: Dict) -> bool:
+    """A node is a persona constant if match score >= 0.8 and the category's global ratio >= 0.8."""
+    match_score = get_match_score(node)
+    global_ratio = get_category_global_ratio(node)
+    return match_score >= MATCH_SCORE_THRESHOLD and global_ratio >= GLOBAL_RATIO_THRESHOLD
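+
+# Example: a node whose best match scores 0.85 against a category whose global
+# ratio is 0.9 is a persona constant; with a global ratio of 0.5 it is not,
+# since both thresholds (0.8 each) must hold.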
+
+
+def build_origin_context(nodes: List[Dict]) -> Dict:
+    """Build the context for the AI origin analysis."""
+
+    all_points = []
+    for node in nodes:
+        all_points.append({
+            "名称": node["节点名称"],
+            "分类": node.get("节点分类", ""),
+            "维度": node.get("节点维度", ""),
+            "描述": node.get("节点描述", ""),
+            "人设匹配度": round(get_match_score(node), 2),
+        })
+
+    # Origin candidate set (inspiration points + purpose points)
+    candidates = [
+        node["节点名称"]
+        for node in nodes
+        if node["节点维度"] in ["灵感点", "目的点"]
+    ]
+
+    # Persona constants (match score >= 0.8 and category global ratio >= 0.8)
+    constants = [
+        node["节点名称"]
+        for node in nodes
+        if is_persona_constant(node)
+    ]
+
+    return {
+        "all_points": all_points,
+        "candidates": candidates,
+        "constants": constants,
+    }
+
+
+def format_origin_prompt(context: Dict) -> str:
+    """Format the origin-analysis prompt (kept in Chinese, matching the analyzed content)."""
+    all_points = context["all_points"]
+    candidates = context["candidates"]
+    constants = context["constants"]
+
+    points_text = ""
+    for p in all_points:
+        points_text += f"- {p['名称']}\n"
+        points_text += f"  维度: {p['维度']} | 分类: {p['分类']}\n"
+        points_text += f"  描述: {p['描述']}\n"
+        points_text += f"  人设匹配度: {p['人设匹配度']}\n"
+        points_text += "\n"
+
+    candidates_text = "、".join(candidates)
+    constants_text = "、".join(constants) if constants else "无"
+
+    prompt = f"""# Role
+你是小红书爆款内容的"逆向工程"专家。你的核心能力是透过内容的表象(视觉/形式),还原创作者最初的脑回路(动机/实质)。
+
+# Task
+我提供一组笔记的【创意标签】和一个【起点候选集】。
+请推理出哪些选项是真正的**创意起点**。
+
+
+# Input Data
+
+## 全部创意点
+
+{points_text}
+
+## 起点候选集
+{candidates_text}
+
+## 来自人设的常量
+{constants_text}
+
+
+# 推理约束
+
+1. 实质推形式,而不是形式推实质,除非形式是一切创意的起点
+2. 因推果而不是果推因
+3. 无法被其他项或人设推理出的点,即为起点
+
+# Output Format
+
+请输出一个标准的 JSON 格式。
+- Key: 候选集中的词。
+- Value: 一个对象,包含:
+  - `score`: 0.0 到 1.0 的浮点数(代表是起点的可能性)。
+  - `analysis`: 一句话推理"""
+
+    return prompt
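+
+# Illustrative LLM response to the prompt above, as parsed by analyze(...,
+# parse_json=True) in analyze_origin below (names and values invented):
+#   {
+#     "旧物改造": {"score": 0.9, "analysis": "无法由其他点或人设推出"},
+#     "清单式封面": {"score": 0.2, "analysis": "形式点,可由目的点推出"}
+#   }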
+
+
+async def analyze_origin(nodes: List[Dict], force_llm: bool = False) -> Dict:
+    """
+    Run the origin analysis.
+
+    Input: node list.
+    Output: node list (with 起点分析 / 是否已知 / 发现编号 fields added) plus the raw LLM result.
+    """
+    context = build_origin_context(nodes)
+    prompt = format_origin_prompt(context)
+
+    print(f"\n  Origin candidates: {len(context['candidates'])}")
+    print(f"  Persona constants: {len(context['constants'])}")
+
+    result = await analyze(
+        prompt=prompt,
+        task_name=f"{TASK_NAME}/origin",
+        force=force_llm,
+        parse_json=True,
+    )
+
+    # Merge the analysis result back into the nodes
+    llm_result = result.data or {}
+    output_nodes = []
+    current_order = 1  # discovery-order counter for known nodes
+
+    for node in nodes:
+        new_node = dict(node)  # shallow copy of the original node
+        name = node["节点名称"]
+
+        if name in llm_result:
+            score = llm_result[name].get("score", 0)
+            analysis = llm_result[name].get("analysis", "")
+            # Attach the origin analysis
+            new_node["起点分析"] = {
+                "分数": score,
+                "说明": analysis,
+            }
+            # High-scoring origins are marked as known
+            if score >= ORIGIN_SCORE_THRESHOLD:
+                new_node["是否已知"] = True
+                new_node["发现编号"] = current_order
+                current_order += 1
+        else:
+            new_node["起点分析"] = None
+
+        output_nodes.append(new_node)
+
+    return {
+        "输入上下文": {
+            "起点候选": context["candidates"],
+            "人设常量": context["constants"],
+        },
+        "中间结果": llm_result,
+        "输出节点": output_nodes,
+        "cache_hit": result.cache_hit,
+        "model": result.model_name,
+        "log_url": result.log_url,
+    }
+
+
+# ===== Step 3: pattern derivation =====
+
+def derive_patterns(
+    nodes: List[Dict],
+    persona_co_occur: Dict[str, List[Dict]],
+) -> Dict:
+    """
+    Iterative derivation based on co-occurrence relations.
+
+    Input: node list with origin analysis + persona co-occurrence data.
+    Output: node list (with 是否已知 / 发现编号 updated) plus the derived edge list.
+    """
+    node_by_name: Dict[str, Dict] = {n["节点名称"]: n for n in nodes}
+
+    # Build a co-occurrence lookup {category ID: {co-occurring category ID: score}}
+    co_occur_lookup = {}
+    for cat_id, co_occur_list in persona_co_occur.items():
+        co_occur_lookup[cat_id] = {
+            c["节点ID"]: c["共现度"]
+            for c in co_occur_list
+        }
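+
+    # Example lookup shape (IDs invented):
+    #   {"人设:分类A": {"人设:分类B": 0.62, "人设:分类C": 0.41}}
+    # From a known node's category this gives every category it historically
+    # co-occurs with, along with the stored co-occurrence score.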
+
+    # 1. Initialize the known set (nodes already marked known)
+    known_names: Set[str] = set()
+    node_round: Dict[str, int] = {}  # {node name: round in which it became known}
+
+    for node in nodes:
+        if node.get("是否已知"):
+            known_names.add(node["节点名称"])
+            node_round[node["节点名称"]] = 0
+
+    unknown_names: Set[str] = set(node_by_name.keys()) - known_names
+    edges: List[Dict] = []
+
+    # 2. Iterative derivation
+    round_num = 0
+    new_known_this_round = known_names.copy()
+
+    while new_known_this_round:
+        round_num += 1
+        new_known_next_round: Set[str] = set()
+
+        for known_name in new_known_this_round:
+            known_node = node_by_name.get(known_name)
+            if not known_node:
+                continue
+
+            if get_match_score(known_node) < MATCH_SCORE_THRESHOLD:
+                continue
+
+            # Get the co-occurrence list of this node's category
+            known_cat_id = get_category_id(known_node)
+            if not known_cat_id or known_cat_id not in co_occur_lookup:
+                continue
+
+            co_occur_map = co_occur_lookup[known_cat_id]
+
+            for unknown_name in list(unknown_names):
+                unknown_node = node_by_name.get(unknown_name)
+                if not unknown_node:
+                    continue
+
+                if get_match_score(unknown_node) < MATCH_SCORE_THRESHOLD:
+                    continue
+
+                # Is the unknown node's category in the known node's co-occurrence list?
+                unknown_cat_id = get_category_id(unknown_node)
+                if unknown_cat_id and unknown_cat_id in co_occur_map:
+                    co_occur_score = co_occur_map[unknown_cat_id]
+                    new_known_next_round.add(unknown_name)
+                    node_round[unknown_name] = round_num
+
+                    edges.append({
+                        "来源": known_node["节点ID"],
+                        "目标": unknown_node["节点ID"],
+                        "关系类型": "共现推导",
+                        "推导轮次": round_num,
+                        "共现分类ID": unknown_cat_id,
+                        "共现度": co_occur_score,
+                    })
+
+        known_names.update(new_known_next_round)
+        unknown_names -= new_known_next_round
+        new_known_this_round = new_known_next_round
+
+        if not new_known_next_round:
+            break
+
+    # 3. Build the output nodes (only 是否已知 / 发现编号 are updated)
+    # First find the current max discovery order
+    max_order = 0
+    for node in nodes:
+        if node.get("发现编号") and node["发现编号"] > max_order:
+            max_order = node["发现编号"]
+
+    # Group newly derived nodes by round
+    new_known_by_round = {}
+    for name, r in node_round.items():
+        if r > 0:  # skip origins (round 0)
+            if r not in new_known_by_round:
+                new_known_by_round[r] = []
+            new_known_by_round[r].append(name)
+
+    # Assign discovery orders in round order
+    order_map = {}
+    current_order = max_order + 1
+    for r in sorted(new_known_by_round.keys()):
+        for name in new_known_by_round[r]:
+            order_map[name] = current_order
+            current_order += 1
+
+    output_nodes = []
+    for node in nodes:
+        new_node = dict(node)
+        name = node["节点名称"]
+
+        # If the node was newly derived (not an origin), mark it known and set its order
+        if name in node_round and node_round[name] > 0:
+            new_node["是否已知"] = True
+            new_node["发现编号"] = order_map.get(name)
+
+        output_nodes.append(new_node)
+
+    return {
+        "输出节点": output_nodes,
+        "推导边列表": edges,
+        "推导轮次": round_num,
+    }
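+
+# Worked example (names invented): suppose origins A and B are known after the
+# origin analysis. Round 1: A's category co-occurs with unknown node C's
+# category (score 0.6), so C becomes known and an A -> C edge with 推导轮次=1
+# is recorded. Round 2: C's category co-occurs with D's, so D joins next.
+# The loop stops on the first round that derives nothing new; nodes still
+# unknown keep 是否已知=False.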
+
+
+# ===== Full pipeline =====
+
+def save_result(post_id: str, post_detail: Dict, steps: List, config: PathConfig) -> Path:
+    """Save the result to a file."""
+    output_dir = config.intermediate_dir / "creation_pattern"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_file = output_dir / f"{post_id}_创作模式.json"
+
+    result = {
+        "帖子详情": post_detail,
+        "步骤列表": steps,
+    }
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(result, f, ensure_ascii=False, indent=2)
+
+    print(f"  [saved] {output_file.name}")
+    return output_file
+
+
+async def process_single_post(
+    post_file: Path,
+    persona_graph: Dict,
+    config: PathConfig,
+    force_llm: bool = False,
+    max_step: int = 3,
+) -> Dict:
+    """
+    Process a single post.
+
+    Args:
+        force_llm: force a fresh LLM call (skip the LLM cache)
+        max_step: last step to run (1=data preparation, 2=origin analysis, 3=pattern derivation)
+    """
+    post_graph = load_json(post_file)
+    post_id = post_graph.get("meta", {}).get("postId", "unknown")
+
+    print(f"\n{'=' * 60}")
+    print(f"Processing post: {post_id}")
+    print("-" * 60)
+
+    steps = []
+
+    # ===== Step 1: data preparation =====
+    print("\n[Step 1] Data preparation...")
+    data = prepare_analysis_data(post_graph, persona_graph)
+    post_detail = data["帖子详情"]
+    nodes_step1 = data["节点列表"]
+    relations_step1 = data["关系列表"]
+    persona_co_occur = data["人设共现关系"]
+
+    # After step 1 every known node is new
+    new_known_step1 = [n["节点名称"] for n in nodes_step1 if n.get("是否已知")]
+
+    step1 = {
+        "步骤": "数据准备",
+        "输入": {
+            "帖子图谱": str(post_file.name),
+            "人设图谱": "人设图谱.json",
+        },
+        "输出": {
+            "新的已知节点": new_known_step1,
+            "新的边": [],
+            "节点列表": nodes_step1,
+            "边列表": relations_step1,
+        },
+        "人设共现关系": persona_co_occur,
+        "摘要": {
+            "节点数": len(nodes_step1),
+            "边数": len(relations_step1),
+            "人设共现数": len(persona_co_occur),
+        },
+    }
+    steps.append(step1)
+    print(f"  Nodes: {len(nodes_step1)}")
+    print(f"  Relations: {len(relations_step1)}")
+    print(f"  Persona co-occurrences: {len(persona_co_occur)}")
+
+    # Step 1 done, save
+    save_result(post_id, post_detail, steps, config)
+
+    if max_step == 1:
+        return {"帖子详情": post_detail, "步骤列表": steps}
+
+    # ===== Step 2: origin analysis =====
+    print("\n[Step 2] Origin analysis...")
+    origin_result = await analyze_origin(nodes_step1, force_llm=force_llm)
+    nodes_step2 = origin_result["输出节点"]
+
+    # Count high-scoring origins
+    def get_origin_score(node):
+        analysis = node.get("起点分析")
+        if analysis:
+            return analysis.get("分数", 0)
+        return 0
+
+    high_score_origins = [
+        (n["节点名称"], get_origin_score(n))
+        for n in nodes_step2
+        if get_origin_score(n) >= 0.7
+    ]
+
+    # Newly discovered known nodes (the origins)
+    new_known_nodes = [n["节点名称"] for n in nodes_step2 if n.get("是否已知")]
+
+    step2 = {
+        "步骤": "起点分析",
+        "输入": {
+            "节点列表": nodes_step1,
+            "起点候选": origin_result["输入上下文"]["起点候选"],
+            "人设常量": origin_result["输入上下文"]["人设常量"],
+        },
+        "中间结果": origin_result["中间结果"],
+        "输出": {
+            "新的已知节点": new_known_nodes,
+            "新的边": [],
+            "节点列表": nodes_step2,
+            "边列表": relations_step1,  # edges unchanged in this step
+        },
+        "摘要": {
+            "新已知数": len(new_known_nodes),
+            "model": origin_result["model"],
+            "cache_hit": origin_result["cache_hit"],
+            "log_url": origin_result.get("log_url"),
+        },
+    }
+    steps.append(step2)
+
+    print(f"  High-scoring origins (>=0.7): {len(high_score_origins)}")
+    for name, score in sorted(high_score_origins, key=lambda x: -x[1]):
+        print(f"    ★ {name}: {score:.2f}")
+
+    # Step 2 done, save
+    save_result(post_id, post_detail, steps, config)
+
+    if max_step == 2:
+        return {"帖子详情": post_detail, "步骤列表": steps}
+
+    # ===== Step 3: pattern derivation =====
+    print("\n[Step 3] Pattern derivation...")
+    derivation_result = derive_patterns(nodes_step2, persona_co_occur)
+    nodes_step3 = derivation_result["输出节点"]
+    edges = derivation_result["推导边列表"]
+
+    # Statistics
+    known_count = sum(1 for n in nodes_step3 if n.get("是否已知"))
+    unknown_count = len(nodes_step3) - known_count
+
+    # Newly derived known nodes (excluding the origins from step 2)
+    prev_known = {n["节点名称"] for n in nodes_step2 if n.get("是否已知")}
+    new_known_nodes = [n["节点名称"] for n in nodes_step3 if n.get("是否已知") and n["节点名称"] not in prev_known]
+
+    # Merge edge lists (original edges + derived edges)
+    all_edges = relations_step1 + edges
+
+    step3 = {
+        "步骤": "模式推导",
+        "输入": {
+            "节点列表": nodes_step2,
+            "人设共现关系": persona_co_occur,
+        },
+        "输出": {
+            "新的已知节点": new_known_nodes,
+            "新的边": edges,
+            "节点列表": nodes_step3,
+            "边列表": all_edges,
+        },
+        "摘要": {
+            "已知点数": known_count,
+            "新已知数": len(new_known_nodes),
+            "新边数": len(edges),
+            "未知点数": unknown_count,
+        },
+    }
+    steps.append(step3)
+
+    print(f"  Known points: {known_count}")
+    print(f"  Derived edges: {len(edges)}")
+    print(f"  Unknown points: {unknown_count}")
+
+    # Step 3 done, save
+    save_result(post_id, post_detail, steps, config)
+
+    return {"帖子详情": post_detail, "步骤列表": steps}
+
+
+# ===== Main =====
+
+async def main(
+    post_id: Optional[str] = None,
+    all_posts: bool = False,
+    force_llm: bool = False,
+    max_step: int = 3,
+):
+    """Entry point."""
+    _, log_url = set_trace()
+
+    config = PathConfig()
+
+    print(f"Account: {config.account_name}")
+    print(f"Trace URL: {log_url}")
+
+    # Load the persona graph
+    persona_graph_file = config.intermediate_dir / "人设图谱.json"
+    if not persona_graph_file.exists():
+        print(f"Error: persona graph file not found: {persona_graph_file}")
+        return
+
+    persona_graph = load_json(persona_graph_file)
+    print(f"Persona graph nodes: {len(persona_graph.get('nodes', {}))}")
+
+    # Collect the post graph files
+    post_graph_files = get_post_graph_files(config)
+    if not post_graph_files:
+        print("Error: no post graph files found")
+        return
+
+    # Decide which posts to process
+    if post_id:
+        target_file = next(
+            (f for f in post_graph_files if post_id in f.name),
+            None
+        )
+        if not target_file:
+            print(f"Error: post {post_id} not found")
+            return
+        files_to_process = [target_file]
+    elif all_posts:
+        files_to_process = post_graph_files
+    else:
+        files_to_process = [post_graph_files[0]]
+
+    print(f"Posts to process: {len(files_to_process)}")
+
+    # Process
+    results = []
+    for i, post_file in enumerate(files_to_process, 1):
+        print(f"\n{'#' * 60}")
+        print(f"# Processing post {i}/{len(files_to_process)}")
+        print(f"{'#' * 60}")
+
+        result = await process_single_post(
+            post_file=post_file,
+            persona_graph=persona_graph,
+            config=config,
+            force_llm=force_llm,
+            max_step=max_step,
+        )
+        results.append(result)
+
+    # Summary
+    print(f"\n{'#' * 60}")
+    print(f"# Done! Processed {len(results)} post(s)")
+    print(f"{'#' * 60}")
+    print(f"Trace: {log_url}")
+
+    print("\nSummary:")
+    for result in results:
+        post_id = result["帖子详情"]["postId"]
+        steps = result.get("步骤列表", [])
+        num_steps = len(steps)
+
+        if num_steps == 1:
+            step1_summary = steps[0].get("摘要", {})
+            print(f"  {post_id}: nodes={step1_summary.get('节点数', 0)} (data preparation only)")
+        elif num_steps == 2:
+            # Note: step 2's summary stores the origin count under 新已知数
+            step2_summary = steps[1].get("摘要", {})
+            print(f"  {post_id}: origins={step2_summary.get('新已知数', 0)} (not derived)")
+        elif num_steps >= 3:
+            step2_summary = steps[1].get("摘要", {})
+            step3_summary = steps[2].get("摘要", {})
+            print(f"  {post_id}: origins={step2_summary.get('新已知数', 0)}, "
+                  f"known={step3_summary.get('已知点数', 0)}, "
+                  f"derived_edges={step3_summary.get('新边数', 0)}")
+        else:
+            print(f"  {post_id}: no step data")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Creation pattern analysis")
+    parser.add_argument("--post-id", type=str, help="post ID")
+    parser.add_argument("--all-posts", action="store_true", help="process all posts")
+    parser.add_argument("--force-llm", action="store_true", help="force a fresh LLM call (skip the LLM cache)")
+    parser.add_argument("--step", type=int, default=3, choices=[1, 2, 3],
+                        help="run up to this step (1=data preparation, 2=origin analysis, 3=pattern derivation)")
+    args = parser.parse_args()
+
+    asyncio.run(main(
+        post_id=args.post_id,
+        all_posts=args.all_posts,
+        force_llm=args.force_llm,
+        max_step=args.step,
+    ))
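+
+# Example invocations (script name assumed; adjust to the actual file name):
+#   python creation_pattern.py                       # first post only, all 3 steps
+#   python creation_pattern.py --post-id <postId>    # a single post by ID
+#   python creation_pattern.py --all-posts --step 2  # all posts, stop after origin analysis
+#   python creation_pattern.py --force-llm           # bypass the LLM response cache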