|
|
@@ -0,0 +1,766 @@
|
|
|
+"""
|
|
|
+长篇叙事 SFT 数据集构建工具
|
|
|
+============================
|
|
|
+
|
|
|
+三个核心任务:
|
|
|
+ Task 1: structure_planning - 给定上文,规划下一个 Scene-Sequel 结构
|
|
|
+ Task 2: scene_continuation - 给定上文+规划,续写正文(CoT + 正文)
|
|
|
+ Task 3: shuang_injection - 给定平淡草稿,注入爽点
|
|
|
+
|
|
|
+用法:
|
|
|
+ # 处理单个文件,生成所有任务的训练数据
|
|
|
+ python build_dataset.py --input ../input_1/大奉打更人.txt --tasks all
|
|
|
+
|
|
|
+ # 只生成续写任务数据
|
|
|
+ python build_dataset.py --input ../input_1/大奉打更人.txt --tasks task2
|
|
|
+
|
|
|
+ # 处理多个文件
|
|
|
+ python build_dataset.py --input ../input_1/ --tasks all --max-samples 50
|
|
|
+
|
|
|
+ # 指定模型(默认 gemini-2.0-flash-001,快速便宜)
|
|
|
+ python build_dataset.py --input ../input_1/大奉打更人.txt --model google/gemini-2.5-flash-preview
|
|
|
+"""
|
|
|
+
|
|
|
+import os
|
|
|
+import sys
|
|
|
+import json
|
|
|
+import re
|
|
|
+import argparse
|
|
|
+import time
|
|
|
+from pathlib import Path
|
|
|
+from typing import Iterator
|
|
|
+
|
|
|
+import requests
|
|
|
+from dotenv import load_dotenv
|
|
|
+
|
|
|
# ── Path setup ────────────────────────────────────────────────────────────────
# Resolve the repository root three levels above this file so that sibling
# packages become importable and the shared .env (API keys) can be loaded.
HERE = Path(__file__).parent
ROOT = HERE.parent.parent.parent
sys.path.insert(0, str(ROOT))
load_dotenv(ROOT / ".env")

# The OpenRouter key is mandatory for every task — fail fast at import time
# rather than after chapters have already been parsed.
OPEN_ROUTER_KEY = os.environ.get("OPEN_ROUTER_API_KEY", "")
if not OPEN_ROUTER_KEY:
    raise RuntimeError("请在 .env 中设置 OPEN_ROUTER_API_KEY")

# Default output directory for generated JSONL datasets; can be overridden
# with the --output CLI flag.
OUTPUT_DIR = HERE / "output"
OUTPUT_DIR.mkdir(exist_ok=True)
|
|
|
+
|
|
|
+# ── LLM 调用(单次,同步)────────────────────────────────────────────────────
|
|
|
+
|
|
|
def llm_call(
    messages: list[dict],
    model: str = "google/gemini-2.0-flash-001",
    temperature: float = 0.7,
    max_tokens: int = 4096,
    retry: int = 3,
) -> str:
    """Call the OpenRouter chat-completions API and return the assistant text.

    Retries up to ``retry`` times with exponential backoff (1s, 2s, 4s, ...);
    the final failure is re-raised to the caller.
    """
    # Request parts are invariant across attempts, so build them once.
    request_headers = {
        "Authorization": f"Bearer {OPEN_ROUTER_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/narrative-sft",
    }
    request_body = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    for attempt in range(retry):
        try:
            resp = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers=request_headers,
                json=request_body,
                timeout=120,
            )
            resp.raise_for_status()
            return resp.json()["choices"][0]["message"]["content"]
        except Exception as e:
            # Out of attempts: propagate the original error to the caller.
            if attempt >= retry - 1:
                raise
            wait = 2 ** attempt
            print(f"  [LLM] 重试 {attempt+1}/{retry},等待 {wait}s... ({e})")
            time.sleep(wait)
|
|
|
+
|
|
|
+
|
|
|
+# ── 文本解析:从小说文件中切分场景单元 ───────────────────────────────────────
|
|
|
+
|
|
|
def detect_encoding(path: Path) -> str:
    """Detect the text encoding of *path* (UTF-8, GBK, or GB18030).

    Returns the first candidate encoding that decodes the whole file without
    error, falling back to "utf-8" when none succeed (the caller then reads
    with errors="replace").
    """
    # Fix: the original re-read the entire file from disk once per candidate
    # encoding (up to three full reads of a multi-megabyte novel).  Read the
    # raw bytes once and try decoding them in memory instead.
    data = path.read_bytes()
    for enc in ("utf-8", "gbk", "gb18030"):
        try:
            data.decode(enc)
            return enc
        except UnicodeDecodeError:
            continue
    return "utf-8"
|
|
|
+
|
|
|
+
|
|
|
def load_novel(path: Path) -> str:
    """Read a novel file and return its plain text.

    The file is decoded with the detected encoding (undecodable bytes are
    replaced), and header blocks fenced by long runs of '=' characters
    (copyright banners etc.) are stripped out.
    """
    raw = path.read_text(encoding=detect_encoding(path), errors="replace")
    # Remove "========== ... ==========" banner blocks anywhere in the text.
    cleaned = re.sub(r"={10,}.*?={10,}", "", raw, flags=re.DOTALL)
    return cleaned.strip()
|
|
|
+
|
|
|
+
|
|
|
def split_chapters(text: str) -> list[dict]:
    """
    Split *text* into chapters.

    Supported heading formats (heading on its own line):
      - "第N章 标题"
      - "第N章" immediately followed by body text

    Returns [{"chapter": heading, "content": body, "index": N}, ...].
    When no headings are found, the whole text is returned as one pseudo
    chapter named "全文".  Chapters whose body is 300 characters or shorter
    are dropped as stubs.
    """
    # Match common chapter headings.  `\s*` already covers full-width
    # (U+3000) indentation in Python 3.  Fix: the numeral class previously
    # omitted 两/万/〇, so headings like "第两百章" or "第一万章" were missed.
    pattern = re.compile(
        r"^\s*(第[零〇一二三四五六七八九十百千万两\d]+[章节回卷][^\n]{0,40})\s*$",
        re.MULTILINE,
    )
    matches = list(pattern.finditer(text))
    if not matches:
        return [{"chapter": "全文", "content": text, "index": 0}]

    chapters = []
    for i, m in enumerate(matches):
        start = m.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        content = text[start:end].strip()
        if len(content) > 300:  # skip stub/empty chapters
            chapters.append({
                "chapter": m.group(1).strip(),
                "content": content,
                "index": i,
            })
    return chapters
|
|
|
+
|
|
|
+
|
|
|
def normalize_paragraphs(text: str) -> list[str]:
    """Normalize novel text into a list of non-empty paragraphs.

    Handles common web-novel conventions: paragraphs separated by single
    newlines, full-width-space (U+3000) indentation, and blank lines
    (which are dropped).
    """
    stripped = (raw.strip().lstrip("　").strip() for raw in text.split("\n"))
    return [para for para in stripped if para]
|
|
|
+
|
|
|
+
|
|
|
def extract_scene_units(
    chapter_content: str,
    min_context_chars: int = 500,
    target_context_chars: int = 1000,
    target_continuation_chars: int = 600,
) -> list[dict]:
    """
    Slide a window over the chapter and emit Scene-Sequel candidate units.

    Each unit is a (context, continuation) pair of consecutive paragraph
    runs.  Windows overlap by about half a context (step = half the context
    paragraph count) to increase sample diversity.

    Returns [{"context": str, "continuation": str,
              "context_words": int, "continuation_words": int}, ...].
    """
    paragraphs = normalize_paragraphs(chapter_content)
    total = len(paragraphs)
    if total < 4:
        return []

    def take(start: int, budget: int) -> tuple[list[str], int, int]:
        # Greedily collect paragraphs from `start` until `budget` characters
        # are accumulated (or the chapter runs out); returns the run, its
        # character count, and the index just past it.
        run, chars, pos = [], 0, start
        while pos < total and chars < budget:
            run.append(paragraphs[pos])
            chars += len(paragraphs[pos])
            pos += 1
        return run, chars, pos

    units: list[dict] = []
    i = 0
    while i < total - 3:
        ctx_paras, ctx_chars, cont_start = take(i, target_context_chars)
        # Advance by half the context window regardless of acceptance, so
        # consecutive units overlap.
        step = max(1, len(ctx_paras) // 2)
        if ctx_chars >= min_context_chars and cont_start < total:
            cont_paras, cont_chars, _ = take(cont_start, target_continuation_chars)
            if cont_chars >= 150:
                units.append({
                    "context": "\n　".join(ctx_paras),  # restore web-novel indent
                    "continuation": "\n　".join(cont_paras),
                    "context_words": ctx_chars,
                    "continuation_words": cont_chars,
                })
        i += step

    return units
|
|
|
+
|
|
|
+
|
|
|
# ── Prompt templates ──────────────────────────────────────────────────────────

# System prompt for Task 1 (structure planning): analyze the preceding text
# and plan the next Scene-Sequel unit — CoT in <think> plus a JSON plan.
SYSTEM_STRUCTURE_PLANNING = """你是一位专业的长篇小说结构分析师,精通以下叙事理论:
- Scene-Sequel 结构(Dwight V. Swain):Scene = Goal→Conflict→Disaster;Sequel = Reaction→Dilemma→Decision
- MICE Quotient(Orson Scott Card):Milieu / Idea / Character / Event 四类线程
- Save the Cat 节拍(Blake Snyder):15个关键节拍
- 网文爽点理论:打脸、升级、装逼、获得、碾压五类爽点

你的任务是:分析给定的上文,规划下一个 Scene-Sequel 单元的结构。
输出必须包含:
1. <think> 标签内的叙事分析(真实的决策推理,不是事后解释)
2. 结构化 JSON 规划"""

# System prompt for Task 2 (scene continuation): given context + plan,
# write the next passage — CoT in <think> plus the prose itself.
SYSTEM_SCENE_CONTINUATION = """你是一位专业的网文作家,擅长写节奏紧凑、爽点密集的长篇小说。
你精通 Scene-Sequel 结构,知道如何在续写中:
- 自然衔接上文的叙事状态
- 在正确位置植入爽点(铺垫→爆发→反应)
- 在章节末尾设置钩子
- 保持与原文一致的文风和节奏

你的任务是:根据上文和结构规划,续写下一段正文。
输出必须包含:
1. <think> 标签内的写法决策(真实的创作思考过程)
2. 续写正文"""

# System prompt for Task 3 (shuang injection): upgrade a flat draft by
# adding setup→payoff→reaction "shuang" beats.
SYSTEM_SHUANG_INJECTION = """你是一位专业的网文编辑,擅长识别和设计爽点。
你知道爽点的三要素:铺垫(建立期待/对比)→ 爆发(核心爽感)→ 反应(放大效果)。

你的任务是:分析给定的平淡草稿,注入爽点使其升级。
输出必须包含:
1. <think> 标签内的爽点设计分析
2. 注入爽点后的增强版正文
3. 简要的修改说明"""
|
|
|
+
|
|
|
+
|
|
|
def make_structure_planning_prompt(
    title: str,
    chapter: str,
    position_pct: float,
    context: str,
) -> list[dict]:
    """Build the [system, user] chat messages for Task 1 (structure planning).

    Args:
        title: book title (shown as context in the prompt).
        chapter: current chapter heading.
        position_pct: relative chapter position in the book, 0.0-1.0.
        context: the preceding narrative text the model analyzes.

    Returns:
        The message list ready for ``llm_call``.
    """
    # The doubled braces ({{ / }}) in the schema below are f-string escapes
    # producing literal braces in the prompt text.
    user_content = f"""## 书名
{title}

## 当前位置
{chapter},约 {position_pct:.0%} 处

## 上文(最近约 {len(context)} 字)
{context}

## 任务
请分析上文的叙事状态,规划下一个 Scene-Sequel 单元的结构。

**要求**:
1. 在 <think> 中分析:
   - 上文最后一个 Scene 的 Goal/Conflict/Disaster 是什么
   - 上文最后一个 Sequel 的 Reaction/Dilemma/Decision 是什么(如果有)
   - 当前激活的 MICE 线程(M/I/C/E)及其状态
   - 当前处于 Save the Cat 的哪个节拍
   - 下一步应该推进哪个线程,为什么
2. 输出 JSON 格式的结构规划(严格按照下面的 schema)

**JSON Schema**:
```json
{{
  "scene": {{
    "goal": "主角在这个场景想要达成什么(具体、可衡量)",
    "conflict_type": "人物冲突|环境冲突|内心冲突|信息冲突",
    "conflict_description": "具体的障碍是什么",
    "disaster": "结果比预期更糟,具体是什么",
    "pacing": "fast|medium|slow",
    "dialogue_ratio": 0.6
  }},
  "sequel": {{
    "reaction": "主角的情感反应",
    "dilemma": "面临的两难选择",
    "decision": "做出的决定(成为下一个 Scene 的 Goal)"
  }},
  "shuang_point": {{
    "has_shuang": true,
    "type": "打脸|升级|装逼|获得|碾压",
    "setup": "铺垫内容",
    "payoff": "爆发内容",
    "reaction": "旁观者/对手的反应"
  }},
  "hooks": [
    {{"type": "chapter_end", "content": "章末钩子的具体内容"}}
  ],
  "mice_advancement": "E",
  "estimated_words": 1500
}}
```"""
    return [
        {"role": "system", "content": SYSTEM_STRUCTURE_PLANNING},
        {"role": "user", "content": user_content},
    ]
|
|
|
+
|
|
|
+
|
|
|
def make_scene_continuation_prompt(
    context: str,
    structure_plan: str,
    target_words: int = 1200,
) -> list[dict]:
    """Build the [system, user] chat messages for Task 2 (scene continuation).

    Args:
        context: the preceding narrative text to continue from.
        structure_plan: the (already generated) Scene-Sequel plan, inlined
            verbatim into the prompt.
        target_words: approximate target length of the continuation.

    Returns:
        The message list ready for ``llm_call``.
    """
    user_content = f"""## 上文
{context}

## 结构规划
{structure_plan}

## 任务
请根据上文和结构规划,续写下一段正文(目标约 {target_words} 字)。

**要求**:
1. 在 <think> 中说明:
   - 如何衔接上文(直接延续/场景切换/时间跳跃)
   - 爽点在哪里植入,具体怎么写
   - 钩子如何设置
   - 对话设计(谁说什么,潜台词)
   - 节奏控制(哪里快,哪里慢)
2. 输出续写正文,风格与上文保持一致
3. 正文中不要出现任何结构标注或括号说明"""
    return [
        {"role": "system", "content": SYSTEM_SCENE_CONTINUATION},
        {"role": "user", "content": user_content},
    ]
|
|
|
+
|
|
|
+
|
|
|
def make_shuang_injection_prompt(
    draft: str,
    shuang_type: str = "智商碾压",
    intensity: str = "high",
) -> list[dict]:
    """Build the [system, user] chat messages for Task 3 (shuang injection).

    Args:
        draft: the "flat" draft text to be enhanced.
        shuang_type: which category of shuang beat to inject.
        intensity: "low" / "medium" / "high" — emotional impact level,
            explained inline in the prompt.

    Returns:
        The message list ready for ``llm_call``.
    """
    user_content = f"""## 平淡草稿
{draft}

## 注入要求
- 爽点类型:{shuang_type}
- 强度:{intensity}(low=轻微惊讶 / medium=明显震惊 / high=三观崩塌)
- 不改变核心情节走向,只增强情感冲击力

## 任务
请注入爽点,输出增强版本。

**要求**:
1. 在 <think> 中分析:
   - 草稿的问题:哪里平淡,缺少什么
   - 爽点设计:铺垫在哪里,爆发在哪里,反应怎么写
   - 关键改动:具体改了哪些句子,为什么
2. 输出增强版正文
3. 在正文后附上简要修改说明(3-5条)"""
    return [
        {"role": "system", "content": SYSTEM_SHUANG_INJECTION},
        {"role": "user", "content": user_content},
    ]
|
|
|
+
|
|
|
+
|
|
|
+# ── 数据生成函数 ──────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
def generate_task1_sample(
    unit: dict,
    title: str,
    chapter: str,
    position_pct: float,
    model: str,
) -> dict | None:
    """Generate one Task 1 (structure planning) training sample.

    Runs the planning prompt through the LLM and packages the exchange as a
    messages+metadata record.  Returns None when the call fails or the
    reply lacks the required <think> block / JSON plan.
    """
    prompt = make_structure_planning_prompt(
        title=title,
        chapter=chapter,
        position_pct=position_pct,
        context=unit["context"],
    )
    try:
        reply = llm_call(prompt, model=model, temperature=0.3, max_tokens=2048)
    except Exception as e:
        print(f"  [Task1] LLM 调用失败: {e}")
        return None

    # A usable sample needs both the CoT block and (the start of) JSON.
    if not ("<think>" in reply and "{" in reply):
        print(f"  [Task1] 输出格式不符,跳过")
        return None

    record = {
        "messages": prompt + [{"role": "assistant", "content": reply}],
        "metadata": {
            "task_type": "structure_planning",
            "source_file": title,
            "chapter": chapter,
            "position_percent": round(position_pct, 3),
            "context_words": unit["context_words"],
            "model": model,
        },
    }
    return record
|
|
|
+
|
|
|
+
|
|
|
def generate_task2_sample(
    unit: dict,
    title: str,
    chapter: str,
    position_pct: float,
    model: str,
    use_original_as_output: bool = True,
) -> dict | None:
    """
    Generate a Task 2 (scene continuation) training sample.

    use_original_as_output=True:
        First generate a structure plan with the LLM, then have the LLM
        explain "why the original author wrote it this way" (reverse CoT);
        the final output = CoT + the original continuation text (gold standard).

    use_original_as_output=False:
        Have the LLM continue the scene directly; output = CoT + LLM text.

    Returns the sample dict (messages + metadata), or None when any LLM
    step fails or the output fails validation.
    """
    # Step 1: generate the structure plan (used to build the user prompt).
    plan_messages = make_structure_planning_prompt(
        title=title,
        chapter=chapter,
        position_pct=position_pct,
        context=unit["context"],
    )
    try:
        plan_response = llm_call(plan_messages, model=model, temperature=0.3, max_tokens=1500)
    except Exception as e:
        print(f"  [Task2] 规划生成失败: {e}")
        return None

    if use_original_as_output:
        # Step 2a: ask the LLM to explain the original author's choices
        # (reverse CoT) and then reproduce the original continuation verbatim.
        explain_messages = [
            {"role": "system", "content": SYSTEM_SCENE_CONTINUATION},
            {"role": "user", "content": f"""## 上文
{unit["context"]}

## 结构规划(已分析)
{plan_response}

## 原著续写
{unit["continuation"]}

## 任务
请分析:原著作者在续写这段时,做了哪些写法决策?
请用 <think> 标签写出你的分析,然后直接输出原著续写文本(不要修改)。

格式:
<think>
[分析原著的写法决策:如何衔接、爽点设计、钩子设置、节奏控制等]
</think>

[原著续写文本,原文照抄]"""},
        ]
        try:
            cot_response = llm_call(explain_messages, model=model, temperature=0.3, max_tokens=3000)
        except Exception as e:
            print(f"  [Task2] CoT 生成失败: {e}")
            return None

        # Sanity check: the reply should contain the original text (probe
        # with its first 50 characters).
        # NOTE(review): with `and`, a long reply that omits the original text
        # still passes — confirm whether `or` was intended here.
        original_snippet = unit["continuation"][:50].strip()
        if original_snippet not in cot_response and len(cot_response) < 200:
            print(f"  [Task2] 输出未包含原著文本,跳过")
            return None

        assistant_content = cot_response
    else:
        # Step 2b: have the LLM write the continuation itself.
        cont_messages = make_scene_continuation_prompt(
            context=unit["context"],
            structure_plan=plan_response,
            target_words=unit["continuation_words"],
        )
        try:
            assistant_content = llm_call(cont_messages, model=model, temperature=0.7, max_tokens=3000)
        except Exception as e:
            print(f"  [Task2] 续写生成失败: {e}")
            return None

    # Build the final user prompt (context + plan) that the trained model
    # will actually see; the explain/continue prompts above are scaffolding.
    final_user_content = f"""## 上文
{unit["context"]}

## 结构规划
{plan_response}

## 任务
请根据上文和结构规划,续写下一段正文(目标约 {unit["continuation_words"]} 字)。
在 <think> 中说明写法决策,然后输出续写正文。"""

    return {
        "messages": [
            {"role": "system", "content": SYSTEM_SCENE_CONTINUATION},
            {"role": "user", "content": final_user_content},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "task_type": "scene_continuation",
            "source_file": title,
            "chapter": chapter,
            "position_percent": round(position_pct, 3),
            "context_words": unit["context_words"],
            "continuation_words": unit["continuation_words"],
            "use_original": use_original_as_output,
            "model": model,
        },
    }
|
|
|
+
|
|
|
+
|
|
|
def generate_task3_sample(
    unit: dict,
    title: str,
    chapter: str,
    model: str,
) -> dict | None:
    """
    Generate a Task 3 (shuang injection) training sample.

    Strategy: first have the LLM produce a "flat" version of the original
    continuation (shuang beats removed), then inject shuang beats back in;
    the original text is stored in metadata for comparison.

    Returns the sample dict (messages + metadata), or None when any LLM
    step fails or the output fails validation.
    """
    # Step 1: have the LLM flatten the original (strip the shuang beats).
    flatten_messages = [
        {"role": "system", "content": "你是一位文字编辑,擅长识别和移除文本中的爽点元素。"},
        {"role": "user", "content": f"""请将以下文本改写成"平淡版":
- 去掉所有让读者感到爽快的元素(打脸、碾压、震惊反应等)
- 保留核心情节和信息
- 改写后应该是一个"能用但不精彩"的版本
- 字数可以减少,但不要少于原文的 60%

## 原文
{unit["continuation"]}

## 要求
直接输出平淡版文本,不要任何解释。"""},
    ]
    try:
        draft = llm_call(flatten_messages, model=model, temperature=0.3, max_tokens=2000)
    except Exception as e:
        print(f"  [Task3] 平淡版生成失败: {e}")
        return None

    # Discard degenerate drafts (empty or near-empty replies).
    if len(draft) < 100:
        return None

    # Step 2: inject shuang beats (aiming to recover something close to
    # the original).
    inject_messages = make_shuang_injection_prompt(
        draft=draft,
        shuang_type="智商碾压",  # could be inferred from content instead
        intensity="high",
    )
    try:
        enhanced = llm_call(inject_messages, model=model, temperature=0.7, max_tokens=3000)
    except Exception as e:
        print(f"  [Task3] 爽点注入失败: {e}")
        return None

    # The enhanced output must include the CoT analysis block.
    if "<think>" not in enhanced:
        return None

    return {
        "messages": inject_messages + [{"role": "assistant", "content": enhanced}],
        "metadata": {
            "task_type": "shuang_injection",
            "source_file": title,
            "chapter": chapter,
            "original_text": unit["continuation"],  # keep the original for comparison
            "draft_text": draft,
            "model": model,
        },
    }
|
|
|
+
|
|
|
+
|
|
|
+# ── 主流程 ────────────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
def process_file(
    input_path: Path,
    tasks: list[str],
    model: str,
    max_samples: int,
    output_dir: Path,
) -> dict[str, list]:
    """Process a single novel/script file and generate training data.

    Iterates chapters, then scene units within each chapter, generating up
    to ``max_samples`` samples per requested task.  Returns a mapping of
    task name -> list of sample dicts; saving is done by the caller.

    NOTE(review): ``output_dir`` is accepted but never used in this
    function — writing happens in ``save_results`` at the call site;
    confirm and either use or drop the parameter.
    """
    print(f"\n{'='*60}")
    print(f"处理文件: {input_path.name}")
    print(f"{'='*60}")

    title = input_path.stem
    text = load_novel(input_path)
    chapters = split_chapters(text)
    print(f"  检测到 {len(chapters)} 个章节,总字数约 {len(text):,}")

    # Per-task accumulators; generation stops once every task hits its cap.
    results: dict[str, list] = {t: [] for t in tasks}
    sample_count = {t: 0 for t in tasks}
    total_chapters = len(chapters)

    for ch_idx, chapter in enumerate(chapters):
        if all(sample_count[t] >= max_samples for t in tasks):
            break

        # Relative position of the chapter in the book (0.0-1.0), fed to
        # the planning prompt for beat estimation.
        position_pct = ch_idx / max(total_chapters - 1, 1)
        units = extract_scene_units(chapter["content"])

        print(f"\n  章节 [{ch_idx+1}/{total_chapters}] {chapter['chapter'][:30]} "
              f"({len(units)} 个场景单元)")

        for unit_idx, unit in enumerate(units):
            if all(sample_count[t] >= max_samples for t in tasks):
                break

            print(f"    单元 {unit_idx+1}: 上文 {unit['context_words']}字 "
                  f"/ 续写 {unit['continuation_words']}字")

            if "task1" in tasks and sample_count["task1"] < max_samples:
                print(f"      → Task1 结构规划...", end="", flush=True)
                sample = generate_task1_sample(
                    unit, title, chapter["chapter"], position_pct, model
                )
                if sample:
                    results["task1"].append(sample)
                    sample_count["task1"] += 1
                    print(f" ✓ (共 {sample_count['task1']})")
                else:
                    print(f" ✗")

            if "task2" in tasks and sample_count["task2"] < max_samples:
                print(f"      → Task2 场景续写...", end="", flush=True)
                sample = generate_task2_sample(
                    unit, title, chapter["chapter"], position_pct, model,
                    use_original_as_output=True,
                )
                if sample:
                    results["task2"].append(sample)
                    sample_count["task2"] += 1
                    print(f" ✓ (共 {sample_count['task2']})")
                else:
                    print(f" ✗")

            if "task3" in tasks and sample_count["task3"] < max_samples:
                print(f"      → Task3 爽点注入...", end="", flush=True)
                sample = generate_task3_sample(
                    unit, title, chapter["chapter"], model
                )
                if sample:
                    results["task3"].append(sample)
                    sample_count["task3"] += 1
                    print(f" ✓ (共 {sample_count['task3']})")
                else:
                    print(f" ✗")

            # Throttle to stay under the API rate limit.
            time.sleep(0.5)

    return results
|
|
|
+
|
|
|
+
|
|
|
def save_results(results: dict[str, list], output_dir: Path, prefix: str = ""):
    """Append generated samples to per-task JSONL files.

    Files are named ``{prefix}_{task}.jsonl`` (or ``{task}.jsonl`` with no
    prefix) and opened in append mode, so repeated runs accumulate lines.
    Returns a mapping of task name -> number of samples written; tasks
    with no samples are skipped entirely.
    """
    saved = {}
    for task_name, samples in results.items():
        if not samples:
            continue
        stem = f"{prefix}_{task_name}" if prefix else task_name
        out_path = output_dir / f"{stem}.jsonl"
        lines = [json.dumps(sample, ensure_ascii=False) + "\n" for sample in samples]
        with open(out_path, "a", encoding="utf-8") as fh:
            fh.writelines(lines)
        saved[task_name] = len(samples)
        print(f"  ✓ {task_name}: {len(samples)} 条 → {out_path}")
    return saved
|
|
|
+
|
|
|
+
|
|
|
def main():
    """CLI entry point: parse arguments, generate samples, save JSONL files."""
    parser = argparse.ArgumentParser(description="长篇叙事 SFT 数据集构建工具")
    parser.add_argument(
        "--input", "-i", required=True,
        help="输入文件或目录(支持 .txt / .pdf / .docx)",
    )
    parser.add_argument(
        "--tasks", "-t", default="all",
        help="要生成的任务(all / task1 / task2 / task3 / task1,task2)",
    )
    parser.add_argument(
        "--model", "-m", default="google/gemini-2.0-flash-001",
        help="使用的模型(默认 google/gemini-2.0-flash-001)",
    )
    parser.add_argument(
        "--max-samples", "-n", type=int, default=10,
        help="每个任务每个文件最多生成多少条(默认 10)",
    )
    parser.add_argument(
        "--output", "-o", default=str(OUTPUT_DIR),
        help="输出目录(默认 sft_v2/output/)",
    )
    args = parser.parse_args()

    # Resolve the task list ("all" expands to the three known tasks).
    if args.tasks == "all":
        tasks = ["task1", "task2", "task3"]
    else:
        tasks = [t.strip() for t in args.tasks.split(",")]

    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect input files: a single file, or every candidate in a directory.
    input_path = Path(args.input)
    if input_path.is_dir():
        files = list(input_path.glob("*.txt")) + list(input_path.glob("*.pdf"))
    elif input_path.is_file():
        files = [input_path]
    else:
        print(f"错误:找不到输入文件或目录: {input_path}")
        sys.exit(1)

    # Only .txt is supported for now (.pdf is collected above but filtered
    # out here until a PDF extractor is wired in).
    supported = [f for f in files if f.suffix.lower() == ".txt"]
    if not supported:
        print(f"错误:没有找到支持的文件(.txt)")
        sys.exit(1)

    print(f"\n{'='*60}")
    print(f"长篇叙事 SFT 数据集构建工具")
    print(f"{'='*60}")
    print(f"  输入文件: {len(supported)} 个")
    print(f"  任务类型: {tasks}")
    print(f"  模型: {args.model}")
    print(f"  每任务最大样本数: {args.max_samples}")
    print(f"  输出目录: {output_dir}")
    print(f"{'='*60}")

    total_saved = {t: 0 for t in tasks}

    for file_path in supported:
        results = process_file(
            input_path=file_path,
            tasks=tasks,
            model=args.model,
            max_samples=args.max_samples,
            output_dir=output_dir,
        )
        print(f"\n保存结果...")
        # save_results returns counts only for tasks that produced samples,
        # so the keys are always a subset of `tasks`.
        saved = save_results(results, output_dir, prefix=file_path.stem)
        for t, n in saved.items():
            total_saved[t] += n

    print(f"\n{'='*60}")
    print(f"完成!总计生成:")
    for t, n in total_saved.items():
        print(f"  {t}: {n} 条")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()
|