|
|
@@ -0,0 +1,784 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+"""
|
|
|
+步骤2:从分析结果生成三类 SFT 训练数据
|
|
|
+
|
|
|
+三个任务(参考 00_task_definition.md):
|
|
|
+
|
|
|
+ Task 1 - 结构规划(Structure Planning)
|
|
|
+ 输入:故事状态(MICE线程、last disaster/decision、位置)+ 上文
|
|
|
+ 输出:<think>叙事状态分析 + 续写决策</think> + 结构规划 JSON
|
|
|
+ 目标:让模型学会规划下一个 Scene-Sequel 单元的结构
|
|
|
+
|
|
|
+ Task 2 - 场景续写(Scene Continuation)
|
|
|
+ 输入:上文 + 结构规划(Task 1 的输出)
|
|
|
+ 输出:<think>上文理解 + 写法决策</think> + 续写正文
|
|
|
+ 目标:让模型学会根据规划生成高质量正文
|
|
|
+
|
|
|
+ Task 3 - 爽点注入(Shuang Point Injection)
|
|
|
+ 输入:平淡草稿 + 爽点类型 + 强度要求
|
|
|
+ 输出:<think>草稿分析 + 爽点设计</think> + 增强版正文 + 修改说明
|
|
|
+ 目标:让模型学会识别并注入爽点
|
|
|
+
|
|
|
+用法:
|
|
|
+ python step2_build_sft.py \\
|
|
|
+ --analysis analysis_w0.json \\
|
|
|
+ --novel input/大奉打更人.txt \\
|
|
|
+ --output-dir sft/dafeng/ \\
|
|
|
+ [--context-chars 800] \\
|
|
|
+ [--skip-task 3] \\
|
|
|
+ [--concurrency 5] \\
|
|
|
+ [--model qwen-plus]
|
|
|
+
|
|
|
+输出文件:
|
|
|
+ sft/dafeng/task1_structure_planning.jsonl
|
|
|
+ sft/dafeng/task2_scene_continuation.jsonl
|
|
|
+ sft/dafeng/task3_shuang_injection.jsonl
|
|
|
+ sft/dafeng/stats.json
|
|
|
+"""
|
|
|
+
|
|
|
+import os
|
|
|
+import re
|
|
|
+import json
|
|
|
+import asyncio
|
|
|
+import argparse
|
|
|
+from copy import deepcopy
|
|
|
+from pathlib import Path
|
|
|
+from openai import AsyncOpenAI
|
|
|
+from dotenv import load_dotenv
|
|
|
+
|
|
|
+load_dotenv()
|
|
|
+
|
|
|
# Shared async OpenAI-compatible client used by all LLM calls in this script.
# Credentials/endpoint come from the environment (loaded via dotenv above):
#   ALI_API_KEY  — required API key
#   ALI_BASE_URL — optional; defaults to Alibaba DashScope's compat endpoint.
client = AsyncOpenAI(
    api_key=os.getenv("ALI_API_KEY"),
    base_url=os.getenv(
        "ALI_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"
    ),
)
|
|
|
+
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+# 基础工具
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
+
|
|
|
def load_text(path: str) -> str:
    """Read a text file, trying common Chinese encodings in order.

    Returns the content decoded by the first encoding that succeeds;
    raises ValueError when none of the candidates can decode the file.
    """
    candidates = ("utf-8", "gbk", "gb2312", "gb18030")
    source = Path(path)
    for encoding in candidates:
        try:
            return source.read_text(encoding=encoding)
        except UnicodeDecodeError:
            pass
    raise ValueError(f"无法解码文件: {path}")
|
|
|
+
|
|
|
+
|
|
|
async def llm_call(
    messages: list,
    model: str,
    temperature: float = 0.6,
    max_tokens: int = 4096,
) -> str:
    """Send one chat-completion request and return the assistant's text."""
    request = dict(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    completion = await client.chat.completions.create(**request)
    return completion.choices[0].message.content
|
|
|
+
|
|
|
+
|
|
|
def extract_json_block(text: str) -> dict:
    """Extract and parse a JSON object from an LLM reply.

    Accepts a fenced code block — either ```json ... ``` or a bare
    ``` ... ``` fence (models frequently omit the language tag) — or raw
    JSON text.  On a first parse failure, strips trailing commas before
    closing brackets (a common LLM artifact) and retries.

    Raises json.JSONDecodeError if parsing still fails after the retry.
    """
    # The "json" fence tag is optional; take the first fenced block if any.
    m = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
    json_str = m.group(1) if m else text.strip()
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # Repair trailing commas like {"a": 1,} / [1, 2,] and retry once.
        json_str = re.sub(r",\s*([}\]])", r"\1", json_str)
        return json.loads(json_str)
|
|
|
+
|
|
|
+
|
|
|
def write_jsonl(samples: list[dict], path: Path) -> None:
    """Serialize samples as JSON Lines, dropping private ``_*`` keys."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as fh:
        for sample in samples:
            # Internal bookkeeping fields (prefixed with "_") never reach disk.
            public = {key: value for key, value in sample.items() if not key.startswith("_")}
            fh.write(json.dumps(public, ensure_ascii=False) + "\n")
    print(f" 写入 {len(samples)} 条 → {path}")
|
|
|
+
|
|
|
+
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+# 故事状态累积
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
+
|
|
|
def apply_state_changes(state: dict, changes: dict) -> dict:
    """Apply one beat's state_changes to a state snapshot; return a new snapshot.

    The input snapshot is never mutated.  Unknown plot lines are appended
    with a placeholder MICE type; changes for unknown characters are ignored.
    """
    snapshot = deepcopy(state)

    for update in changes.get("plot_lines", []):
        existing = next(
            (line for line in snapshot["plot_lines"] if line["name"] == update["name"]),
            None,
        )
        if existing is not None:
            existing["status"] = update["new_state"]
        else:
            # New plot line: register it with an unknown ("?") MICE type.
            snapshot["plot_lines"].append({
                "name": update["name"],
                "status": update["new_state"],
                "mice_type": "?",
                "description": update.get("new_state", ""),
            })

    for update in changes.get("characters", []):
        target = next(
            (char for char in snapshot["characters"] if char["name"] == update["name"]),
            None,
        )
        if target is not None:
            history = target.setdefault("recent_changes", [])
            history.append(update["change"])
            # Keep only the three most recent changes per character.
            target["recent_changes"] = history[-3:]

    return snapshot
|
|
|
+
|
|
|
+
|
|
|
def build_state_snapshot(analysis: dict, beat_index: int) -> dict:
    """Return the story-state snapshot as of just before ``beats[beat_index]``."""
    outline = analysis.get("outline", {})
    snapshot = {
        "plot_lines": deepcopy(outline.get("plot_lines", [])),
        "characters": deepcopy(analysis.get("characters", [])),
    }
    # Replay every earlier beat's state_changes on top of the outline state.
    for earlier in analysis.get("beats", [])[:beat_index]:
        snapshot = apply_state_changes(snapshot, earlier.get("state_changes", {}))
    return snapshot
|
|
|
+
|
|
|
+
|
|
|
def get_last_disaster_decision(beats: list[dict], before_index: int) -> tuple[str, str]:
    """Return (last scene's disaster, last sequel's decision) before ``before_index``.

    Both default to the "story opening" placeholder when no such beat exists.
    """
    disaster = "无(故事开局)"
    decision = "无(故事开局)"
    for beat in beats[:before_index]:
        kind = beat["type"]
        if kind == "scene":
            disaster = beat.get("disaster", "")
        elif kind == "sequel":
            decision = beat.get("decision", "")
    return disaster, decision
|
|
|
+
|
|
|
+
|
|
|
def format_mice_threads(plot_lines: list[dict]) -> str:
    """Render the still-open plot lines as an indented, one-per-line list."""
    closed_states = ("已解决", "已关闭")
    active = [pl for pl in plot_lines if pl.get("status") not in closed_states]
    if not active:
        return "(无活跃线程)"
    return "\n".join(
        f" [{pl.get('mice_type', '?')}] {pl['name']}({pl['status']}):{pl.get('description', '')}"
        for pl in active
    )
|
|
|
+
|
|
|
+
|
|
|
def format_characters(characters: list[dict]) -> str:
    """Render each character's role, goal, and recent changes, one per line."""
    rendered = []
    for person in characters:
        changes = "、".join(person.get("recent_changes", []))
        suffix = f"近期:{changes}" if changes else ""
        rendered.append(
            f" {person['name']}({person.get('role', '?')})目标:{person.get('goal', '')} {suffix}"
        )
    return "\n".join(rendered)
|
|
|
+
|
|
|
+
|
|
|
def calc_position_percent(beat: dict, total_chars: int) -> float:
    """Return the beat's start offset as a percentage of the novel, to 0.1%."""
    start = beat.get("position_start", 0)
    denominator = max(total_chars, 1)  # guard against a zero-length novel
    return round(start / denominator * 100, 1)
|
|
|
+
|
|
|
+
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+# Task 1:结构规划(Structure Planning)
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
# System prompt shared by Task-1 generation and the stored training sample.
TASK1_SYSTEM = (
    "你是一位专业的长篇小说结构规划师,精通 Scene-Sequel 结构、MICE 线程理论、"
    "以及中国网文爽点与钩子设计。请严格按指定格式输出。"
)

# User turn stored in the Task-1 training sample: story state + context only.
# Placeholders: title, chapter, position_pct, mice_threads, last_disaster,
# last_decision, characters, context_chars, context_text.
TASK1_USER_TMPL = """\
## 故事状态

- 书名:{title}
- 当前位置:第 {chapter} 章,约 {position_pct}% 处
- 已激活的 MICE 线程:
{mice_threads}
- 上一个 Scene 的 Disaster:{last_disaster}
- 上一个 Sequel 的 Decision:{last_decision}

## 当前人物状态

{characters}

## 上文(最近 {context_chars} 字)

{context_text}

## 任务

请规划下一个 Scene-Sequel 单元的结构。"""

# Generation-only prompt: same state as TASK1_USER_TMPL plus hints about the
# beat's real content (beat_type/beat_summary/beat_core/shuang_info) so the
# LLM can write a plausible "planned in advance" CoT + planning JSON.
# This hinted prompt is never stored in the training data.
TASK1_COT_GEN_TMPL = """\
## 故事状态

- 书名:{title}
- 当前位置:第 {chapter} 章,约 {position_pct}% 处
- 已激活的 MICE 线程:
{mice_threads}
- 上一个 Scene 的 Disaster:{last_disaster}
- 上一个 Sequel 的 Decision:{last_decision}

## 当前人物状态

{characters}

## 上文(最近 {context_chars} 字)

{context_text}

## 参考信息(该节拍的实际内容摘要,仅用于帮你构建 CoT,禁止直接引用)

类型:{beat_type}
摘要:{beat_summary}
核心要素:{beat_core}
爽点:{shuang_info}

---

请以"事前规划"的视角写出你的思考过程和最终规划。

**输出格式**:

<think>
## 叙事状态分析
[分析当前处于哪个 MICE 线程、节拍、读者情绪积累]
[分析上一个 Disaster/Decision 对下一步的约束]

## 续写决策
[决定下一个 Scene 的 Goal、Conflict 类型、Disaster 方向]
[决定是否需要爽点/钩子,类型和强度]
[决定节奏:快/慢,对话比例]
</think>

```json
{{
  "scene": {{
    "goal": "...",
    "conflict_type": "人物冲突|环境冲突|内心冲突|信息冲突",
    "conflict_description": "...",
    "disaster": "...",
    "pacing": "fast|medium|slow",
    "dialogue_ratio": 0.4
  }},
  "sequel": {{
    "reaction": "...",
    "dilemma": "...",
    "decision": "..."
  }},
  "hooks": [
    {{"type": "chapter_end|mid_chapter", "content": "..."}}
  ],
  "shuang_point": {{
    "has_shuang": true,
    "type": "打脸|升级|装逼|获得|碾压",
    "position": "scene_start|scene_mid|scene_end"
  }},
  "mice_advancement": "M|I|C|E",
  "estimated_words": 2000
}}
```"""
|
|
|
+
|
|
|
+
|
|
|
+def _beat_core_str(beat: dict) -> str:
|
|
|
+ if beat["type"] == "scene":
|
|
|
+ return (
|
|
|
+ f"goal={beat.get('goal', '')} "
|
|
|
+ f"conflict={beat.get('conflict_description', '')} "
|
|
|
+ f"disaster={beat.get('disaster', '')}"
|
|
|
+ )
|
|
|
+ return (
|
|
|
+ f"reaction={beat.get('reaction', '')} "
|
|
|
+ f"dilemma={beat.get('dilemma', '')} "
|
|
|
+ f"decision={beat.get('decision', '')}"
|
|
|
+ )
|
|
|
+
|
|
|
+
|
|
|
+def _shuang_str(beat: dict) -> str:
|
|
|
+ sp = beat.get("shuang_point", {})
|
|
|
+ if not sp.get("has_shuang"):
|
|
|
+ return "无"
|
|
|
+ return f"{sp.get('type', '')}({sp.get('intensity', '')}):{sp.get('description', '')}"
|
|
|
+
|
|
|
+
|
|
|
async def gen_task1_sample(
    i: int,
    beat: dict,
    analysis: dict,
    novel_text: str,
    context_chars: int,
    model: str,
    sem: asyncio.Semaphore,
) -> dict | None:
    """Build one Task-1 (structure planning) SFT sample for ``beats[i]``.

    The generation prompt shows the LLM the real beat's summary/core fields
    as hints so it can write a plausible "planned in advance" CoT plus a
    structure-planning JSON.  The stored sample's user turn contains only
    the story state and preceding context — never the beat's real content.
    Returns None when the LLM call fails.
    """
    async with sem:
        meta = analysis.get("_meta", {})
        title = meta.get("novel_title", "未知")
        total_chars = meta.get("total_chars", len(novel_text))
        beats = analysis.get("beats", [])

        # Story state accumulated from all beats before this one.
        state = build_state_snapshot(analysis, i)
        last_disaster, last_decision = get_last_disaster_decision(beats, i)
        mice_threads = format_mice_threads(state["plot_lines"])
        characters = format_characters(state["characters"])

        chapter = beat.get("chapter_start", "?")
        position_pct = calc_position_percent(beat, total_chars)

        # Up to `context_chars` characters of novel text right before the beat.
        ctx_start = max(0, beat["position_start"] - context_chars)
        context_text = novel_text[ctx_start: beat["position_start"]].strip()

        # Fields shared by the generation prompt and the stored training prompt.
        shared_kwargs = dict(
            title=title,
            chapter=chapter,
            position_pct=position_pct,
            mice_threads=mice_threads,
            last_disaster=last_disaster,
            last_decision=last_decision,
            characters=characters,
            context_chars=context_chars,
            context_text=context_text,
        )

        # Generate the CoT + planning JSON.
        cot_prompt = TASK1_COT_GEN_TMPL.format(
            beat_type=beat["type"],
            beat_summary=beat.get("summary", ""),
            beat_core=_beat_core_str(beat),
            shuang_info=_shuang_str(beat),
            **shared_kwargs,
        )
        messages = [
            {"role": "system", "content": TASK1_SYSTEM},
            {"role": "user", "content": cot_prompt},
        ]
        try:
            assistant_content = await llm_call(messages, model=model)
        except Exception as e:
            print(f" [Task1] beat {i+1} LLM 调用失败:{e}")
            return None

        # Training sample: the user only sees story_state + context and does
        # not know the beat's actual content.
        user_content = TASK1_USER_TMPL.format(**shared_kwargs)

        return {
            "messages": [
                {"role": "system", "content": TASK1_SYSTEM},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content},
            ],
            "metadata": {
                "task_type": "structure_planning",
                "source_file": meta.get("novel_title", ""),
                "chapter": f"第{chapter}章",
                "position_percent": position_pct,
                "mice_thread": beat.get("mice_thread", ""),
                "beat_id": beat.get("id", ""),
                "beat_type": beat["type"],
                "word_count": beat["position_end"] - beat["position_start"],
            },
        }
|
|
|
+
|
|
|
+
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+# Task 2:场景续写(Scene Continuation)
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
# System prompt shared by Task-2 generation and the stored training sample.
TASK2_SYSTEM = (
    "你是一位专业的网文作家,擅长写爽文、悬疑和情感类长篇小说,"
    "能够根据结构规划生成节奏流畅、爽点鲜明的正文。"
)

# User turn stored in the Task-2 training sample: preceding context plus the
# Task-1 structure plan and a target word count.
TASK2_USER_TMPL = """\
## 上文

{context_text}

## 结构规划

{structure_plan}

## 任务

请续写下一段(约 {target_words} 字),风格与上文保持一致。"""

# Generation-only prompt: adds a truncated hint of the real continuation
# (beat_text_hint) and the full gold text (actual_text) so the LLM writes an
# "in advance" CoT; never stored in training data.
TASK2_COT_GEN_TMPL = """\
## 上文

{context_text}

## 结构规划

{structure_plan}

## 参考信息(该节拍的实际续写内容,仅用于帮你构建 CoT,禁止逐句引用)

{beat_text_hint}

---

请以"事前决策"的视角写出写作思考过程,然后直接输出实际续写内容。

**输出格式**:

<think>
## 上文理解
[识别上文的叙事状态:最后一个 Scene/Sequel 的位置,主角的情绪状态]
[识别关键信息:哪些细节需要在续写中呼应]

## 写法决策
[开头如何衔接:直接延续/场景切换/时间跳跃]
[爽点如何植入:在哪个位置,用什么方式]
[钩子如何设置:章末悬念的具体内容]
[对话设计:谁说什么,潜台词是什么]
</think>

{actual_text}"""
|
|
|
+
|
|
|
+
|
|
|
async def gen_task2_sample(
    i: int,
    beat: dict,
    analysis: dict,
    novel_text: str,
    task1_samples: list,
    context_chars: int,
    model: str,
    sem: asyncio.Semaphore,
) -> dict | None:
    """Build one Task-2 (scene continuation) SFT sample for ``beats[i]``.

    The user turn pairs the preceding context with the Task-1 structure plan;
    the assistant turn is an LLM-written CoT followed by the novel's actual
    beat text (the gold continuation).  Returns None for empty beats or when
    the LLM call fails.
    """
    async with sem:
        meta = analysis.get("_meta", {})
        total_chars = meta.get("total_chars", len(novel_text))

        # Preceding context and the beat's real text (the gold continuation).
        ctx_start = max(0, beat["position_start"] - context_chars)
        context_text = novel_text[ctx_start: beat["position_start"]].strip()

        beat_text = novel_text[beat["position_start"]: beat["position_end"]].strip()
        if not beat_text:
            return None

        # Pull the structure plan (assistant output) from the Task-1 sample.
        structure_plan = ""
        if i < len(task1_samples) and task1_samples[i]:
            for msg in task1_samples[i]["messages"]:
                if msg["role"] == "assistant":
                    structure_plan = msg["content"]
                    break
        if not structure_plan:
            # Fallback when Task 1 was skipped or failed for this beat.
            structure_plan = f"(Task1 未生成,beat 摘要:{beat.get('summary', '')})"

        target_words = max(500, (beat["position_end"] - beat["position_start"]) // 2)

        # Only show the LLM the first 300 characters as a hint, to avoid
        # leaking too much of the gold text into the generated CoT.
        beat_hint = beat_text[:300] + "..." if len(beat_text) > 300 else beat_text

        cot_prompt = TASK2_COT_GEN_TMPL.format(
            context_text=context_text,
            structure_plan=structure_plan,
            beat_text_hint=beat_hint,
            actual_text=beat_text,
        )
        messages = [
            {"role": "system", "content": TASK2_SYSTEM},
            {"role": "user", "content": cot_prompt},
        ]
        try:
            cot_part = await llm_call(messages, model=model)
        except Exception as e:
            print(f" [Task2] beat {i+1} LLM 调用失败:{e}")
            return None

        # Enforce the output shape: <think>...</think>\n\n{actual text}.
        if "<think>" in cot_part and beat_text not in cot_part:
            # The LLM produced only the CoT; append the gold text ourselves.
            think_end = cot_part.find("</think>")
            if think_end != -1:
                think_block = cot_part[: think_end + len("</think>")]
                assistant_content = f"{think_block}\n\n{beat_text}"
            else:
                assistant_content = f"{cot_part}\n\n{beat_text}"
        else:
            assistant_content = cot_part

        user_content = TASK2_USER_TMPL.format(
            context_text=context_text,
            structure_plan=structure_plan,
            target_words=target_words,
        )

        return {
            "messages": [
                {"role": "system", "content": TASK2_SYSTEM},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content},
            ],
            "metadata": {
                "task_type": "scene_continuation",
                "source_file": meta.get("novel_title", ""),
                "chapter": f"第{beat.get('chapter_start', '?')}章",
                "position_percent": calc_position_percent(beat, total_chars),
                "mice_thread": beat.get("mice_thread", ""),
                "beat_id": beat.get("id", ""),
                "beat_type": beat["type"],
                "word_count": len(beat_text),
            },
        }
|
|
|
+
|
|
|
+
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+# Task 3:爽点注入(Shuang Point Injection)
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
# System prompt shared by Task-3 generation and the stored training sample.
TASK3_SYSTEM = (
    "你是一位专业的网文编辑,擅长识别和设计爽点(打脸、升级、装逼、获得、碾压),"
    "能在不改变核心情节的前提下大幅提升情感冲击力。"
)

# Generation-only prompt: given the original (already-enhanced) text, ask the
# LLM to (1) classify the shuang point, (2) produce a flattened draft, and
# (3) write the editor CoT + modification notes, all as strict JSON.
TASK3_GEN_TMPL = """\
## 原文(包含爽点的增强版)

{beat_text}

---

## 任务

1. 判断这段文字是否包含明显爽点(打脸/升级/装逼/获得/碾压)
2. 如果有,生成去掉爽点后的"平淡草稿"(保留核心情节事件,但去掉爽感设计)
3. 以编辑视角,写出重新注入爽点的完整思考过程(CoT)和修改说明

**输出格式(严格 JSON)**:

```json
{{
  "has_shuang": true,
  "shuang_type": "打脸|升级|装逼|获得|碾压",
  "intensity": "low|medium|high",
  "flat_draft": "去掉爽点后的平淡版本(完整文字)",
  "cot": "<think>\\n## 草稿分析\\n[识别草稿问题]\\n\\n## 爽点设计\\n[注入方案]\\n</think>",
  "modification_notes": "注入位置:...\\n爽点类型:...\\n关键改动:..."
}}
```

如果不包含明显爽点,输出:`{{"has_shuang": false}}`"""

# User turn stored in the Task-3 training sample: the flattened draft plus
# the requested shuang type and intensity.
TASK3_USER_TMPL = """\
## 平淡草稿

{flat_draft}

## 要求

- 爽点类型:{shuang_type}
- 强度:{intensity}(low=轻微强化 | medium=明显提升 | high=大幅改写)
- 不改变核心情节,只增强情感冲击力

## 任务

请注入爽点,输出增强版本。"""
|
|
|
+
|
|
|
+
|
|
|
async def gen_task3_sample(
    i: int,
    beat: dict,
    analysis: dict,
    novel_text: str,
    model: str,
    sem: asyncio.Semaphore,
) -> dict | None:
    """Build one Task-3 (shuang-point injection) SFT sample for ``beats[i]``.

    Works backwards: the LLM is shown the original (already "enhanced") text
    and asked for a flattened draft plus an editor CoT; the training sample
    then maps flat draft -> CoT + original text + modification notes.
    Returns None for beats without a shuang point, too-short beats, LLM
    failures, or unusable LLM output.
    """
    # Only process beats annotated with a shuang point.
    sp = beat.get("shuang_point", {})
    if not sp.get("has_shuang"):
        return None

    async with sem:
        meta = analysis.get("_meta", {})
        total_chars = meta.get("total_chars", len(novel_text))

        beat_text = novel_text[beat["position_start"]: beat["position_end"]].strip()
        if len(beat_text) < 200:
            # Too little text to carry a meaningful shuang point.
            return None

        # Generate the flat draft + CoT.
        gen_prompt = TASK3_GEN_TMPL.format(beat_text=beat_text)
        messages = [
            {"role": "system", "content": TASK3_SYSTEM},
            {"role": "user", "content": gen_prompt},
        ]
        try:
            raw = await llm_call(messages, model=model)
        except Exception as e:
            print(f" [Task3] beat {i+1} LLM 调用失败:{e}")
            return None

        try:
            result = extract_json_block(raw)
        except Exception:
            print(f" [Task3] beat {i+1} JSON 解析失败,跳过")
            return None

        if not result.get("has_shuang"):
            # The LLM disagreed with the step-1 annotation; skip this beat.
            return None

        flat_draft = result.get("flat_draft", "")
        cot = result.get("cot", "")
        modification_notes = result.get("modification_notes", "")
        # Prefer the LLM's classification; fall back to step-1 annotations.
        shuang_type = result.get("shuang_type", sp.get("type", ""))
        intensity = result.get("intensity", sp.get("intensity", "medium"))

        if not flat_draft or not cot:
            return None

        # Training sample.
        user_content = TASK3_USER_TMPL.format(
            flat_draft=flat_draft,
            shuang_type=shuang_type,
            intensity=intensity,
        )

        # Output: CoT + enhanced version (the original text) + modification notes.
        assistant_content = (
            f"{cot}\n\n"
            f"{beat_text}\n\n"
            f"---\n**修改说明**:\n{modification_notes}"
        )

        return {
            "messages": [
                {"role": "system", "content": TASK3_SYSTEM},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content},
            ],
            "metadata": {
                "task_type": "shuang_injection",
                "source_file": meta.get("novel_title", ""),
                "chapter": f"第{beat.get('chapter_start', '?')}章",
                "position_percent": calc_position_percent(beat, total_chars),
                "shuang_type": shuang_type,
                "intensity": intensity,
                "beat_id": beat.get("id", ""),
                "word_count": len(beat_text),
            },
        }
|
|
|
+
|
|
|
+
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+# 主流程
|
|
|
+# ──────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
+
|
|
|
async def build_all(
    analysis_path: str,
    novel_path: str,
    output_dir: str,
    context_chars: int,
    skip_tasks: set[int],
    model: str,
    concurrency: int,
):
    """Load the step-1 analysis and novel text, then build all three SFT datasets.

    Any of the three tasks can be disabled via ``skip_tasks``.  LLM calls
    across beats run concurrently, bounded by a semaphore of ``concurrency``.
    Writes one JSONL file per task plus a ``stats.json`` summary to
    ``output_dir``.
    """
    with open(analysis_path, encoding="utf-8") as f:
        analysis = json.load(f)

    novel_text = load_text(novel_path)
    beats = analysis.get("beats", [])
    out = Path(output_dir)
    sem = asyncio.Semaphore(concurrency)

    print(f"\n分析文件:{analysis_path}")
    print(f"节拍数:{len(beats)}")
    print(f"输出目录:{out}")
    print(f"并发数:{concurrency}\n")

    stats = {}

    # ── Task 1 ──────────────────────────────────
    # Keep Task-1 results index-aligned with beats: Task 2 reuses each beat's
    # structure plan, and stays on the placeholder fallback if Task 1 is skipped.
    task1_samples: list[dict | None] = [None] * len(beats)
    if 1 not in skip_tasks:
        print("[Task 1] 结构规划(Structure Planning)...")
        tasks = [
            gen_task1_sample(i, b, analysis, novel_text, context_chars, model, sem)
            for i, b in enumerate(beats)
        ]
        results = await asyncio.gather(*tasks)
        task1_samples = list(results)
        valid = [s for s in task1_samples if s]
        write_jsonl(valid, out / "task1_structure_planning.jsonl")
        stats["task1"] = {"total": len(beats), "valid": len(valid)}
        print(f" Task1 完成:{len(valid)}/{len(beats)} 条有效\n")

    # ── Task 2 ──────────────────────────────────
    if 2 not in skip_tasks:
        print("[Task 2] 场景续写(Scene Continuation)...")
        tasks = [
            gen_task2_sample(
                i, b, analysis, novel_text, task1_samples, context_chars, model, sem
            )
            for i, b in enumerate(beats)
        ]
        results = await asyncio.gather(*tasks)
        valid = [s for s in results if s]
        write_jsonl(valid, out / "task2_scene_continuation.jsonl")
        stats["task2"] = {"total": len(beats), "valid": len(valid)}
        print(f" Task2 完成:{len(valid)}/{len(beats)} 条有效\n")

    # ── Task 3 ──────────────────────────────────
    if 3 not in skip_tasks:
        # gen_task3_sample filters non-shuang beats internally; this count is
        # only used for logging and the stats denominator.
        shuang_beats = [b for b in beats if b.get("shuang_point", {}).get("has_shuang")]
        print(f"[Task 3] 爽点注入(Shuang Point Injection)... (共 {len(shuang_beats)} 个有爽点的 beat)")
        tasks = [
            gen_task3_sample(i, b, analysis, novel_text, model, sem)
            for i, b in enumerate(beats)
        ]
        results = await asyncio.gather(*tasks)
        valid = [s for s in results if s]
        write_jsonl(valid, out / "task3_shuang_injection.jsonl")
        stats["task3"] = {
            "total": len(shuang_beats),
            "valid": len(valid),
        }
        print(f" Task3 完成:{len(valid)}/{len(shuang_beats)} 条有效\n")

    # ── Statistics ──────────────────────────────
    stats_path = out / "stats.json"
    stats_path.write_text(json.dumps(stats, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"统计信息 → {stats_path}")

    total_valid = sum(v.get("valid", 0) for v in stats.values())
    print(f"\n全部完成。总有效样本数:{total_valid}")
|
|
|
+
|
|
|
+
|
|
|
def main():
    """CLI entry point: parse arguments and run the async build pipeline."""
    ap = argparse.ArgumentParser(description="步骤2:生成三类 SFT 训练数据")
    ap.add_argument("--analysis", required=True, help="step1 输出的 analysis JSON")
    ap.add_argument("--novel", required=True, help="小说 txt 文件路径")
    ap.add_argument("--output-dir", required=True, help="输出目录")
    ap.add_argument(
        "--context-chars",
        type=int,
        default=800,
        help="Task1/2 的上文字符数(默认 800)",
    )
    ap.add_argument(
        "--skip-task",
        type=int,
        action="append",
        default=[],
        metavar="N",
        help="跳过某个任务(1/2/3),可多次指定",
    )
    ap.add_argument(
        "--concurrency",
        type=int,
        default=5,
        help="并发 LLM 调用数(默认 5)",
    )
    ap.add_argument("--model", default="qwen-plus", help="使用的模型名称")
    ns = ap.parse_args()

    pipeline = build_all(
        ns.analysis,
        ns.novel,
        ns.output_dir,
        ns.context_chars,
        set(ns.skip_task),
        ns.model,
        ns.concurrency,
    )
    asyncio.run(pipeline)
|
|
|
+
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ main()
|