#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
lint-case.py — workflow.json 轻量 lint + 自动 record 新 type 到 type_suggestions.md.

设计哲学: **不严格**.
  - 不分 error/warning 等级, 不卡 exit code (都返 0)
  - 主要副作用是 record 新 type 到 spec/taxonomy/type_suggestions.md
  - 检测项打 stdout 给 Agent / 用户看, 决定要不要回去修

用法:
    python spec/tools/lint-case.py --workflow outputs/case-{N}/workflow.json --case-id {N}
    python spec/tools/lint-case.py --workflow outputs/case-{N}/workflow.json --case-id {N} --no-record   # 只校验不写
    python spec/tools/lint-case.py --workflow ... --case-id {N} --json   # 机器可读输出 (runner 完成度判据消费)

退出码:
    0  始终 (不阻塞流程)
    2  CLI 参数错误 / 文件不存在
"""
from __future__ import annotations

import argparse
import json
import re
import sys
from pathlib import Path

# This file lives at spec/tools/lint-case.py, so three parents up is the procedure-dsl/ root.
DSL_ROOT = Path(__file__).resolve().parent.parent.parent
# Taxonomy file whose `$leaves` list is read by load_type_leaves().
TYPE_JSON = DSL_ROOT / 'spec' / 'taxonomy' / 'type.json'
# Markdown file that record_new_types() appends case-specific types to (its default target).
SUGGESTIONS = DSL_ROOT / 'spec' / 'taxonomy' / 'type_suggestions.md'


# UTF-8 for the Windows console: switch stdout/stderr to UTF-8 (undecodable characters are
# replaced) so the non-ASCII lint messages can be printed. Best-effort by design: streams
# without `reconfigure` are skipped and any failure is ignored.
for _s in (sys.stdout, sys.stderr):
    if hasattr(_s, 'reconfigure'):
        try:
            _s.reconfigure(encoding='utf-8', errors='replace')
        except Exception:
            pass


def load_type_leaves() -> set[str]:
    """Return the `$leaves` entries of spec/taxonomy/type.json as a set.

    An absent type.json yields an empty set, and so does a document without a
    `$leaves` key.
    """
    if TYPE_JSON.exists():
        raw = TYPE_JSON.read_text(encoding='utf-8')
        taxonomy = json.loads(raw)
        return set(taxonomy.get('$leaves', []))
    return set()


# ===========================================================================
# Check 1: type 完整性提示
# ===========================================================================

def _iter_procedures(case_data: dict):
    """Iterate over the procedures of a workflow.json document.

    Malformed input is skipped instead of raising, in line with how the checks
    already skip non-dict steps and IO items: a missing / null / non-list
    `procedures` value yields nothing, and non-dict entries are ignored.

    Yields:
        (procedure_label, procedure_dict) pairs. The label is the procedure's
        `id`, falling back to `name`, then to '?'. The dict is the procedure
        itself (the checks read its `steps` and `type_registry`).
    """
    procedures = case_data.get('procedures') if isinstance(case_data, dict) else None
    if not isinstance(procedures, (list, tuple)):
        return
    for proc in procedures:
        if not isinstance(proc, dict):
            continue   # a stray string/number/null entry cannot be linted
        label = proc.get('id') or proc.get('name') or '?'
        yield (label, proc)


def check_type_completeness(case_data: dict) -> list[str]:
    """Hint about IO types that are case-specific but poorly registered.

    For every input/output whose `type` is not a taxonomy leaf:
      - no entry in the procedure's `type_registry` -> one hint for that IO;
      - a dict entry lacking `extends` and/or `desc` -> one hint per missing field.
    Hints only, nothing fails. Each hint is prefixed with `[procedure label]`
    so the offending procedure can be identified.
    """
    known_leaves = load_type_leaves()
    found: list[str] = []

    for proc_label, proc in _iter_procedures(case_data):
        registry = proc.get('type_registry') or {}

        for step_idx, step in enumerate(proc.get('steps') or []):
            if not isinstance(step, dict):
                continue
            for io_kind in ('inputs', 'outputs'):
                for io_idx, io in enumerate(step.get(io_kind) or []):
                    if not isinstance(io, dict):
                        continue
                    type_name = io.get('type', '') or ''
                    # Untyped IOs and taxonomy leaves need no registry entry.
                    if not type_name or type_name in known_leaves:
                        continue
                    if type_name in registry:
                        entry = registry[type_name]
                        if not isinstance(entry, dict):
                            continue
                        if not entry.get('extends'):
                            found.append(f"[{proc_label}] type_registry[{type_name!r}] 缺 extends 字段")
                        if not entry.get('desc'):
                            found.append(f"[{proc_label}] type_registry[{type_name!r}] 缺 desc 字段 (renderer drawer 显示需要)")
                        continue
                    found.append(
                        f"[{proc_label}] step[{step_idx}].{io_kind}[{io_idx}].type={type_name!r} 是 case-specific "
                        f"但 type_registry 没注册"
                    )
    return found


# ===========================================================================
# Check 2: value / directive 自包含性 (禁止引用占位)
# ===========================================================================

# value/directive should contain the data itself, not a reference (references belong in anchor).
# A hit means the field was never really back-filled. Matches "同 sN" / "见 sN" / "← sN" / "同上".
META_REF = re.compile(r'[（(]?\s*同\s*s[\d]|见\s*s[\d]|←\s*s[\d]|同上')


def check_value_selfcontained(case_data: dict) -> list[str]:
    """Flag IO values and step directives that are "reference placeholders".

    A string `value` or `directive` matching META_REF (same-as sX / see sX /
    <- sX / same-as-above wording) holds a pointer instead of the data. Per the
    spec, value carries the data verbatim and references live in anchor, and
    schema/type checks cannot see this. Hints only;
    `wf-patch.py --resolve-passthrough` can back-fill from the source IO.
    """
    found: list[str] = []
    for proc_label, proc in _iter_procedures(case_data):
        for step_idx, step in enumerate(proc.get('steps') or []):
            if not isinstance(step, dict):
                continue
            for io_kind in ('inputs', 'outputs'):
                for io_idx, io in enumerate(step.get(io_kind) or []):
                    if not isinstance(io, dict):
                        continue
                    value = io.get('value')
                    if not isinstance(value, str):
                        continue
                    if META_REF.search(value) is None:
                        continue
                    found.append(f"[{proc_label}] step[{step_idx}].{io_kind}[{io_idx}].value 是引用占位 {value[:24]!r} — 应逐字回填数据本身")
            directive = step.get('directive')
            if isinstance(directive, str) and META_REF.search(directive) is not None:
                found.append(f"[{proc_label}] step[{step_idx}].directive 是引用占位 {directive[:24]!r} — 应填实际 prompt 原文")
    return found


# ===========================================================================
# Check 2b: anchor 闭合 (透传输入回填了没 + anchor 格式对不对)
# ===========================================================================

# JSON-path style anchor (wrong): ← p1.s1.outputs[0] / ← s3.inputs[1] / ...outputs[0].id;
# the correct form is an output *number*: ← s1o1 (or ← 工序输入 / ← sNoM[i]). Only the explicit
# path serialisation `.outputs[` / `.inputs[` is matched, so by-name references such as
# ← s2.正向提示词 and indexed numbers such as ← s5o1[-1] are not caught by mistake.
_ANCHOR_JSONPATH = re.compile(r'\.(?:outputs|inputs)\[')


def check_anchor_closure(case_data: dict) -> list[str]:
    """Check that pass-through inputs (anchor starting with ←) are closed.

    Two deterministic findings per non-inferred input whose anchor starts with ←:
      B. the anchor is written as a JSON path (`← p1.s1.outputs[0]`) rather
         than an output number (`← s1o1`), which the resolver can never match;
      A. the anchor points at an upstream step output but `value` and/or
         `type` is still empty, i.e. the pass-through was never back-filled.
    Anchors that reference the procedure input (← 工序输入 / ← 输入) are
    parameters with nothing upstream to copy, so an empty value is accepted.
    Only inputs are inspected; by-name (`← s2.正向提示词`) and indexed
    (`← s5o1[-1]`) references pass the format check.
    """
    def _blank(field) -> bool:
        # None, or a string that is empty / whitespace only.
        return field is None or (isinstance(field, str) and not field.strip())

    found: list[str] = []
    for proc_label, proc in _iter_procedures(case_data):
        for step_idx, step in enumerate(proc.get('steps') or []):
            if not isinstance(step, dict):
                continue
            for io_idx, io in enumerate(step.get('inputs') or []):
                if not isinstance(io, dict) or io.get('inferred'):
                    continue
                anchor = (io.get('anchor') or '').strip()
                if not anchor.startswith('←'):
                    continue
                target = anchor[1:].strip()

                # B. JSON-path anchor: reported on its own, the emptiness check is skipped.
                if _ANCHOR_JSONPATH.search(target):
                    found.append(
                        f"[{proc_label}] step[{step_idx}].inputs[{io_idx}] anchor={anchor[:32]!r} 是 JSON 路径写法 — "
                        f"数据流来源要用**输出编号**(如 ← s1o1), 不是 ← p1.s1.outputs[0]; "
                        f"否则 --resolve-passthrough 匹配不到、value 永远空")
                    continue

                # External / procedure input: value may legitimately stay empty.
                target_base = target.split('[')[0].strip()
                if target_base == '输入' or target_base.startswith('工序输入'):
                    continue

                # A. upstream output referenced but value/type not back-filled.
                empty_fields = [name for name in ('value', 'type') if _blank(io.get(name))]
                if not empty_fields:
                    continue
                empty_txt = '/'.join(empty_fields)
                found.append(
                    f"[{proc_label}] step[{step_idx}].inputs[{io_idx}] 有 ← anchor({anchor[:20]!r}) 但 {empty_txt} 空 — "
                    f"透传没回填: 跑 `wf-patch.py --resolve-passthrough` 顺编号自动抄上游内容, "
                    f"或确认 anchor 指向的输出本身非空")
    return found


def check_skeleton_filled(case_data: dict) -> list[str]:
    """Report Phase 1 skeleton fields (via / value / anchor) that were never filled.

    The schema accepts empty strings, the placeholder check only sees `<...>`
    and the anchor check only sees inputs with ←, so a bare skeleton would
    otherwise slip through. Findings, in the order they are emitted per step:
      - kind step/nested with empty `via` (control blocks, kind=block, are not checked);
      - kind step/nested with an empty `inputs` or `outputs` array. Entry
        existence is checked as well as entry quality, otherwise deleting
        the offending entry would silence the per-entry checks;
      - any IO with an empty `type`;
      - a non-inferred input whose `value` and `anchor` are both empty;
      - a non-inferred output whose `value` is empty (an output's → anchor is a
        destination, not content, so it cannot stand in for the value).
    The last two findings exempt IOs flagged `inferred`.
    """
    def _blank(field) -> bool:
        # None, or a string that is empty / whitespace only.
        return field is None or (isinstance(field, str) and not field.strip())

    found: list[str] = []
    for proc_label, proc in _iter_procedures(case_data):
        for step_idx, step in enumerate(proc.get('steps') or []):
            if not isinstance(step, dict):
                continue
            step_kind = step.get('kind', 'step')
            tool = (step.get('via') or '').strip()
            step_id = step.get('id')

            if step_kind in ('step', 'nested'):
                if not tool:
                    found.append(
                        f"[{proc_label}] step[{step_idx}](id={step_id}) via 空 — 步骤要写用的工具"
                        f"(如 nano_banana / human / 剪映); 只有控制块 kind=block 才用 via='-'")
                for io_kind, label in (('inputs', '输入'), ('outputs', '输出')):
                    if step.get(io_kind):
                        continue
                    found.append(
                        f"[{proc_label}] step[{step_idx}](id={step_id}) {io_kind} 为空数组 — "
                        f"步骤必有{label}(对已有数据操作→产生新产物); 用 wf-patch 补上这条 IO "
                        f"(原文没明写就按工艺推断, 标 inferred:true + inferred_reason), **不要**为过校验而删 IO")

            # Every IO needs a type label.
            for io_kind in ('inputs', 'outputs'):
                for io_idx, io in enumerate(step.get(io_kind) or []):
                    if not isinstance(io, dict):
                        continue
                    if str(io.get('type') or '').strip():
                        continue
                    found.append(
                        f"[{proc_label}] step[{step_idx}].{io_kind}[{io_idx}] type 空 — 每个 IO 都要有类型标签"
                        f"(Phase 1 粗略标签即可, Phase 2 归一到词表)")

            # Inputs: need either a literal value or an anchor.
            for io_idx, io in enumerate(step.get('inputs') or []):
                if not isinstance(io, dict) or io.get('inferred'):
                    continue
                value = io.get('value')
                anchor = (io.get('anchor') or '').strip()
                if _blank(value) and not anchor:
                    io_type = io.get('type', '')
                    found.append(
                        f"[{proc_label}] step[{step_idx}].inputs[{io_idx}] type={io_type!r} 的 value 和 anchor 都空 — "
                        f"输入要么填字面量 value(@quote 拽原文), 要么用 anchor ← 上游编号 引数据流; 二者必有其一")

            # Outputs: must carry a value.
            for io_idx, io in enumerate(step.get('outputs') or []):
                if not isinstance(io, dict) or io.get('inferred'):
                    continue
                if not _blank(io.get('value')):
                    continue
                io_type = io.get('type', '')
                found.append(
                    f"[{proc_label}] step[{step_idx}].outputs[{io_idx}] type={io_type!r} 的 value 空 — "
                    f"输出是这步的产物, 必须有值: 文本类填逐字内容、媒体类填 <描述>; "
                    f"原文确无则用 <占位>(原文未提供) 或标 inferred:true")
    return found


def check_dataflow_connected(case_data: dict) -> list[str]:
    """Report procedures with two or more steps and not a single anchor.

    A multi-step procedure is a data pipeline, so at least one IO is expected to
    carry a non-empty anchor; zero anchors means the "connect the data flow"
    stage was skipped entirely. Single-step procedures are exempt and any
    non-empty anchor, on an input or an output, counts as connected.
    """
    found: list[str] = []
    for proc_label, proc in _iter_procedures(case_data):
        steps = [s for s in (proc.get('steps') or []) if isinstance(s, dict)]
        if len(steps) < 2:
            continue

        anchor_count = 0
        for step in steps:
            for io_kind in ('inputs', 'outputs'):
                for io in (step.get(io_kind) or []):
                    if isinstance(io, dict) and (io.get('anchor') or '').strip():
                        anchor_count += 1

        if anchor_count != 0:
            continue
        found.append(
            f"[{proc_label}] {len(steps)} 个步骤却 0 个 anchor — 数据流(2.0.2 连来源/去处)整段没做: "
            f"下游输入用 `← 上游输出编号`(如 ← s1o1)引数据、别把内容当字面量重抄; "
            f"至少把工序内的传递链连起来, 否则 HTML 里来源/去处全空")
    return found


# ===========================================================================
# Check 3: value 占位 / directive 缺失 (提示用 quote-source 回填真内容)
# ===========================================================================

# A value that is nothing but a <...> placeholder (value should hold real content;
# <...> is only meant for text-free images/videos).
PLACEHOLDER_RE = re.compile(r'^\s*<[^>]*>\s*$')

# Escape marker for "the source really does not contain this" → the placeholder and verbatim
# checks let the value through (same effect as inferred). Common wordings are all recognised:
# 原文未提供 / 原文确无 / 原文中没有 / 原帖里无 ... render-case.py's _NOSRC_RE is meant to stay in sync.
NOSRC_RE = re.compile(r'原[文帖].{0,2}(未提供|未给出|未写|没有|没写|确无|无)')

# Modality keywords (TEXT is tested before MEDIA, because a name such as "配音文案"
# contains a media word and is nevertheless text).
_TEXT_KW = ('提示词', '描述', '参数', '评', '大纲', '脚本', '文案', '歌词', '字幕',
            '标题', '正文', '词', '知识', '工作流', '对标', '规格', '批处理', '模板', '版式',
            '数据', '分析', '报告', '记录', '方案', '思路', '设定', '依据', '标准', '清单', '列表', '文本', '文字')
_MEDIA_KW = ('图', '视频', '音频', '帧', '片段', '截图', '蒙版', '音效', '配音', 'BGM',
             '数字人', '滤镜', '海报', '封面')


def _type_modality(type_name: str, type_reg: dict) -> str:
    """Classify a type name as 'text', 'media' or 'unknown' by keyword.

    A case-specific type is first resolved through `type_registry[...].extends`
    (following the chain, guarding against cycles) and the keywords are matched
    against the resolved name. Text keywords are tested before media keywords.

    Returns:
        'text' when a _TEXT_KW keyword occurs in the name, else 'media' when a
        _MEDIA_KW keyword occurs, else 'unknown'.
    """
    registry = type_reg or {}
    current = type_name
    visited = set()
    while True:
        if current not in registry or current in visited:
            break
        visited.add(current)
        entry = registry[current]
        parent = entry.get('extends') if isinstance(entry, dict) else None
        if not parent:
            break
        current = parent

    resolved = current or type_name or ''
    for modality, keywords in (('text', _TEXT_KW), ('media', _MEDIA_KW)):
        if any(kw in resolved for kw in keywords):
            return modality
    return 'unknown'


def check_placeholder_content(case_data: dict) -> list[str]:
    """Audit placeholder values per IO and missing step-level fields.

    Per IO: a string `value` that is purely a `<...>` placeholder is reported
    unless the IO is flagged `inferred`, the value carries a "not in the
    source" marker (NOSRC_RE), or the IO's type resolves to the media modality
    (where a `<description>` is acceptable).
    Per step, in this order:
      - kind 'step' with a concrete tool (`via` not empty, not 'human', not '-')
        and an empty `directive`;
      - kind step/nested without the `substance` / `form` keys (an explicit
        null is accepted, a missing key is not);
      - kind step/block/nested with an empty `intent`.
    Hints only.
    """
    found: list[str] = []
    for proc_label, proc in _iter_procedures(case_data):
        registry = proc.get('type_registry') or {}
        for step_idx, step in enumerate(proc.get('steps') or []):
            if not isinstance(step, dict):
                continue

            for io_kind in ('inputs', 'outputs'):
                for io_idx, io in enumerate(step.get(io_kind) or []):
                    # Explicitly inferred IOs are waved through.
                    if not isinstance(io, dict) or io.get('inferred'):
                        continue
                    value = io.get('value')
                    # Non-strings and "source does not provide it" markers are waved through.
                    if not isinstance(value, str) or NOSRC_RE.search(value):
                        continue
                    if not PLACEHOLDER_RE.match(value):
                        continue   # real content already filled in
                    type_name = io.get('type', '') or ''
                    modality = _type_modality(type_name, registry)
                    if modality == 'media':
                        continue   # <description> is fine for image/video/audio
                    if modality == 'text':
                        label = '文本类'
                    else:
                        label = '非媒体(疑似数据/文本)'
                    # A placeholder output means the step's product was never back-filled.
                    if io_kind == 'outputs':
                        extra = ('；这是步骤**产出物**, 原文/配图 OCR 里常紧跟 prompt 展示了它, '
                                 '用 quote-source --from/--to 把那段产出也捞进 value')
                    else:
                        extra = ''
                    found.append(
                        f"[{proc_label}] step[{step_idx}].{io_kind}[{io_idx}] type={type_name!r}({label}) value={value.strip()!r} 仍是占位 "
                        f"—— 你即便已 quote 到原文也**必须把真实内容替换进 value**(别只填 directive){extra}; "
                        f"原文确无则标 inferred:true + inferred_reason; 若其实是无文字图/视频, 让类型/描述体现"
                    )

            tool = (step.get('via') or '').strip()
            directive = (step.get('directive') or '').strip()
            step_kind = step.get('kind', 'step')

            uses_concrete_tool = bool(tool) and tool not in ('human', '-')
            if step_kind == 'step' and uses_concrete_tool and not directive:
                found.append(
                    f"[{proc_label}] step[{step_idx}](via={tool!r}) directive 空 — 若原文有给工具的提示词/指令, "
                    f"用 quote-source 捞原文那段填进 directive"
                )

            # substance/form: an explicit null is fine, a missing key is not.
            if step_kind in ('step', 'nested'):
                absent = [key for key in ('substance', 'form') if key not in step]
                if absent:
                    absent_txt = '/'.join(absent)
                    found.append(
                        f"[{proc_label}] step[{step_idx}] 缺 {absent_txt} — Phase 2 漏做了实质/形式提炼; "
                        f"读懂这步内容提炼元素点填上(纯技术步可显式设 null, 但别漏掉字段)"
                    )

            # intent: every step needs a one-sentence purpose.
            if step_kind in ('step', 'block', 'nested'):
                if not (step.get('intent') or '').strip():
                    found.append(
                        f"[{proc_label}] step[{step_idx}] 缺 intent — Phase 2 每步都要填目的列(一句话概括这步在做什么, ≤25 字)"
                    )
    return found


# ===========================================================================
# Check 3a: 未解析的 @quote 标记残留
# ===========================================================================

def check_unresolved_quotes(case_data: dict) -> list[str]:
    """Report `@quote|start|end` markers left in an IO value or a step directive.

    `@quote` is an intermediate form meant to be resolved by
    `wf-patch --resolve-quotes`; a marker that is still present means the anchors
    did not match. It is not a `<...>` placeholder and the verbatim check skips
    it, so this check is the one that reports it. A string counts as a marker
    when, ignoring leading whitespace, it starts with `@quote`.
    """
    def _is_marker(text) -> bool:
        return isinstance(text, str) and text.lstrip().startswith('@quote')

    found: list[str] = []
    for proc_label, proc in _iter_procedures(case_data):
        for step_idx, step in enumerate(proc.get('steps') or []):
            if not isinstance(step, dict):
                continue
            for io_kind in ('inputs', 'outputs'):
                for io_idx, io in enumerate(step.get(io_kind) or []):
                    if not isinstance(io, dict):
                        continue
                    value = io.get('value')
                    if not _is_marker(value):
                        continue
                    found.append(
                        f"[{proc_label}] step[{step_idx}].{io_kind}[{io_idx}] value 是未解析的 @quote 标记 "
                        f"({value.strip()[:48]!r}…) — 锚点没匹配上原文/OCR; 改锚点重跑 "
                        f"`wf-patch --resolve-quotes --source <原文> [--ocr <ocr.txt>]`, 或直接填真实内容")
            directive = step.get('directive')
            if _is_marker(directive):
                found.append(
                    f"[{proc_label}] step[{step_idx}].directive 是未解析的 @quote 标记 ({directive.strip()[:48]!r}…) — "
                    f"改锚点重跑 --resolve-quotes, 或直接填真实内容")
    return found


# ===========================================================================
# Check 3b: 归类完成度 (Phase 2.1 做完没 — effect/action 填了、intent 是标记格式)
# ===========================================================================

# intent marker format (README "purpose column"): the legal marker categories are
# effect / via / act / in-type / out-type. check_classification_done() requires at least
# one of these prefixes (plus a '{') in the intent of every step that has IO.
_INTENT_MARKERS = ('in-type:', 'out-type:', 'act:', 'via:', 'effect:')


def check_classification_done(case_data: dict) -> list[str]:
    """Check whether Phase 2.1 classification is complete.

    Every step that is not a control block (kind != 'block') needs a non-empty
    `effect` and `action`. A step that has any input or output additionally
    needs an `intent` in marker format: it must contain '{' and at least one of
    the _INTENT_MARKERS prefixes.

    Returns:
        At most two aggregated hints (missing effect/action, malformed intent),
        each listing up to eight `procedure.step_id` identifiers.
    """
    def _preview(ids: list[str]) -> str:
        # First eight ids, with an ellipsis when more were collected.
        return ', '.join(ids[:8]) + (' …' if len(ids) > 8 else '')

    unclassified: list[str] = []
    malformed_intent: list[str] = []
    for proc_label, proc in _iter_procedures(case_data):
        for step in proc.get('steps') or []:
            if not isinstance(step, dict) or step.get('kind') == 'block':
                continue   # control blocks need no effect/action
            step_ref = f"{proc_label}.{step.get('id')}"

            effect = (step.get('effect') or '').strip()
            if not effect or not (step.get('action') or '').strip():
                unclassified.append(step_ref)

            has_io = bool(step.get('inputs') or step.get('outputs'))
            intent = (step.get('intent') or '').strip()
            well_formed = '{' in intent and any(marker in intent for marker in _INTENT_MARKERS)
            if has_io and not well_formed:
                malformed_intent.append(step_ref)

    found: list[str] = []
    if unclassified:
        found.append(f"{len(unclassified)} 个步骤缺 effect/action (Phase 2.1 没做完): "
                     f"{_preview(unclassified)}")
    if malformed_intent:
        found.append(f"{len(malformed_intent)} 个步骤的 intent 没用标记格式 (README「目的列」: 写成带 "
                     f"{{in-type:X}}/{{out-type:Y}}/{{act:Z}} 的句子): "
                     f"{_preview(malformed_intent)}")
    return found


# ===========================================================================
# Check 4: 章节覆盖 (结构强制 — 需 --source) + value 逐字 (值强制 — 需 --source)
# ===========================================================================
#
# 弱模型在 Phase 1 骨架阶段走两条最省力的路, 都靠"看原文"才抓得到:
#   (结构) 只挑两个最显眼的工序就收工, 整段章节(框架/附加案例/总结)漏抽
#   (值)   挑中的 value 也打字缩写成标题纲要, 不是逐字原文 (能过 render 门禁因为不是 <占位>)
# 这两条 check 都需要原文 (--source input/case-N.json [--ocr ocr.txt]) 才能比对.

# Comparison noise: whitespace plus the various quote characters. The same content is often
# written with a different quote style in the source and in the skeleton; that difference
# must not count as an abbreviation.
_QUOTE_NOISE = {ord(ch): None for ch in '「」『』“”‘’"\'＂＇'}


def _norm(s: str) -> str:
    """Normalise text for substring comparison.

    Removes every whitespace run (the source often splits a word across two
    lines) and deletes the quote characters listed in _QUOTE_NOISE. A falsy
    argument normalises to the empty string.
    """
    without_whitespace = re.sub(r'\s+', '', s or '')
    return without_whitespace.translate(_QUOTE_NOISE)


def _load_source_corpus(source_path: Path | None, ocr_path: Path | None) -> tuple[str, str]:
    """Load the source corpus used by the source-based checks.

    The source file is parsed as JSON and contributes its `title` plus
    `body_text` (falling back to `content`); if that fails for any reason the
    file's raw text is used instead. The optional OCR file is appended as plain
    text. Paths that are None or do not exist contribute nothing.

    Returns:
        (raw, normed): `raw` is the newline-joined text (used to split
        sections), `normed` is `_norm(raw)` (used for substring comparison).
    """
    chunks: list[str] = []
    if source_path and source_path.exists():
        try:
            doc = json.loads(source_path.read_text(encoding='utf-8'))
            title = doc.get('title', '') or ''
            body = doc.get('body_text', '') or doc.get('content', '') or ''
            chunks += [title, body]
        except Exception:
            # Not JSON (or not a JSON object): treat the whole file as text.
            chunks.append(source_path.read_text(encoding='utf-8'))
    if ocr_path and ocr_path.exists():
        chunks.append(ocr_path.read_text(encoding='utf-8'))
    raw = '\n'.join(chunks)
    return raw, _norm(raw)


def _sections(body: str) -> list[tuple[str, str, str]]:
    """Split the source text into numbered sections.

    A section starts at a line that begins (after optional whitespace) with a
    two-digit number followed by `|` or `｜`. Requiring the number at the start
    of the line keeps figure captions such as `图 0N |` from matching. Each
    section runs up to the next section mark or the end of the text.

    Returns:
        A list of (number, title, segment) tuples. `title` is the first
        non-blank line after the first pipe, stripped and cut to 24 characters
        ('' when there is none); `segment` is the section's full text.
    """
    body = body or ''
    marks = [(m.start(), m.group(1)) for m in re.finditer(r'(?m)^\s*(\d{2})\s*[|｜]', body)]
    out: list[tuple[str, str, str]] = []
    for idx, (pos, num) in enumerate(marks):
        end = marks[idx + 1][0] if idx + 1 < len(marks) else len(body)
        seg = body[pos:end]
        # maxsplit is passed by keyword: the positional form is deprecated since Python 3.13.
        after = re.split(r'[|｜]', seg, maxsplit=1)
        tail = after[-1] if len(after) > 1 else seg
        title = next((ln.strip()[:24] for ln in tail.splitlines() if ln.strip()), '')
        out.append((num, title, seg))
    return out


# "Point markers" inside a section body: 思路X / 第X层 / 第X步 / 案例X at the start of a line,
# and short colon labels (e.g. 人物特征：…).
_POINT_MARKER = re.compile(
    r'(?m)^\s*(思路[一二三四五]|第[一二三四五六七八九十]+[层步]|案例[一二三四五六七八九十]+)')
_POINT_COLON = re.compile(r'(?m)^\s*([^\n：:（(]{2,12})\s*[：:]')
# A colon label that is nothing but a structural ordinal (used with fullmatch).
_POINT_ORDINAL_ONLY = re.compile(
    r'(思路[一二三四五]|第[一二三四五六七八九十]+[层步]|案例[一二三四五六七八九十]+)')


def _section_points(seg: str) -> list[str]:
    """Extract the key-point phrases of one section body.

    Two sources, in this order:
      1. lines starting with an ordinal marker (_POINT_MARKER): the stripped
         line, cut to 16 characters;
      2. short colon labels (_POINT_COLON) that contain at least one CJK
         character. A label that is only an ordinal is skipped: it is a
         structural number, not content.

    Returns:
        The points in first-seen order, without duplicates or empty strings.
    """
    pts: list[str] = []
    for m in _POINT_MARKER.finditer(seg):
        # Anchor on the marker itself (group 1), not on the match start: the leading
        # `^\s*` may begin on a preceding blank line, and slicing from there would make
        # the "first line" empty and silently drop the point.
        line = seg[m.start(1):].splitlines()[0].strip()
        pts.append(line[:16])
    for m in _POINT_COLON.finditer(seg):
        lab = m.group(1).strip()
        if _POINT_ORDINAL_ONLY.fullmatch(lab):
            continue
        if re.search(r'[一-龥]', lab):
            pts.append(lab)
    # dict.fromkeys keeps insertion order, giving an order-preserving de-duplication.
    return list(dict.fromkeys(p for p in pts if p))


def _point_covered(point: str, wf_norm: str) -> bool:
    """Tell whether a key point is covered by the workflow text (lenient).

    The leading structural ordinal is removed and the rest normalised; if that
    leaves nothing, the normalised full point is used. A core shorter than four
    characters must occur in `wf_norm` as a whole, otherwise any one of its
    4-character windows occurring in `wf_norm` counts as covered.
    """
    without_prefix = re.sub(
        r'^(思路[一二三四五]|第[一二三四五六七八九十]+[层步]|案例[一二三四五六七八九十]+)', '', point)
    core = _norm(without_prefix) or _norm(point)
    if len(core) < 4:
        return core in wf_norm
    for offset in range(len(core) - 3):
        if core[offset:offset + 4] in wf_norm:
            return True
    return False


def check_section_coverage(case_data: dict, source_raw: str, wf_norm: str) -> list[str]:
    """Structural check: report source sections the workflow barely covers.

    The source is split into numbered sections; for each section with at least
    two key points the share of points found in `wf_norm` is computed. A
    section below 40% is reported, together with up to five missing points.
    Sections with fewer than two points are not judged. `case_data` is not
    read by this function.
    """
    found: list[str] = []
    for num, title, seg in _sections(source_raw):
        points = _section_points(seg)
        if len(points) < 2:
            continue   # too few points to judge
        missed = [p for p in points if not _point_covered(p, wf_norm)]
        ratio = 1 - len(missed) / len(points)
        if not ratio < 0.40:
            continue
        sample = '、'.join(missed[:5])
        more = ' …' if len(missed) > 5 else ''
        found.append(
            f"章节『{num} {title}』覆盖率 {ratio:.0%} ({len(points) - len(missed)}/{len(points)} 要点) "
            f"—— 疑似整段漏抽; 缺: {sample}{more}; "
            f"回去为它补 procedure/step (每个 0N 章节至少对应一个工序或子步骤)"
        )
    return found


def _longest_run(v_norm: str, source_norm: str) -> int:
    """Length of the longest substring of `v_norm` that occurs in `source_norm`.

    A verbatim quote is one contiguous stretch of the source, so its longest
    run is (close to) its full length; a value stitched together from scattered
    fragments has a short one. For each start offset the longest matching
    length is found by binary search, which is valid because every prefix of a
    matching substring matches too.
    """
    total = len(v_norm)
    longest = 0
    start = 0
    # Stop as soon as the remaining tail cannot beat the best run found so far.
    while start < total and total - start > longest:
        low, high = 0, total - start
        while low < high:
            probe = (low + high + 1) // 2
            if v_norm[start:start + probe] in source_norm:
                low = probe
            else:
                high = probe - 1
        longest = max(longest, low)
        start += 1
    return longest


def check_value_verbatim(case_data: dict, source_norm: str) -> list[str]:
    """Value check: a text value must be one contiguous passage of the source.

    For each eligible value the longest run that occurs verbatim in
    `source_norm` is measured (both sides normalised); a run below 80% of the
    value is reported as abbreviated / rewritten / truncated. A run of 80 or
    more characters is always accepted, whatever the ratio.

    Skipped: placeholder `<...>` values (handled by the placeholder check),
    inferred IOs, "not in the source" markers, unresolved `@quote` markers,
    media-typed IOs, and values shorter than 12 normalised characters.

    The longest contiguous run is used rather than per-clause coverage, because
    a list of source headings would pass the latter without being a passage of
    the source.

    Returns:
        One hint per offending IO; an empty list when `source_norm` is empty.
    """
    hints: list[str] = []
    if not source_norm:
        return hints
    for proc_label, proc in _iter_procedures(case_data):
        type_reg = proc.get('type_registry') or {}
        for i, step in enumerate(proc.get('steps') or []):
            if not isinstance(step, dict):
                continue
            for kind in ('inputs', 'outputs'):
                for j, item in enumerate(step.get(kind) or []):
                    if not isinstance(item, dict):
                        continue
                    if item.get('inferred'):
                        continue
                    v = item.get('value')
                    if not isinstance(v, str) or not v.strip():
                        continue
                    if PLACEHOLDER_RE.match(v):
                        continue                          # placeholder: handled elsewhere
                    # Same test as check_unresolved_quotes (leading whitespace ignored), so a
                    # padded marker is reported once as unresolved, not again as a paraphrase.
                    if v.lstrip().startswith('@quote'):
                        continue
                    if NOSRC_RE.search(v):
                        continue
                    if _type_modality(item.get('type', '') or '', type_reg) == 'media':
                        continue                          # media descriptions need not be verbatim
                    vn = _norm(v)
                    if len(vn) < 12:
                        continue                          # short labels are not checked
                    run = _longest_run(vn, source_norm)
                    ratio = run / len(vn)
                    # Absolute guard: 80+ consecutive matching characters is a genuine quote,
                    # even if one small deviation pulls the overall ratio under 80%.
                    if run >= 80:
                        continue
                    if ratio < 0.80:
                        hints.append(
                            f"[{proc_label}] step[{i}].{kind}[{j}] value 最长连续命中原文仅 {run}/{len(vn)} 字"
                            f"({ratio:.0%}) —— 整体不是一整段原文(疑似开头逐字后就缩写/改写); value={v[:40]!r}…; "
                            f"用 @quote|起锚|止锚 + wf-patch --resolve-quotes 把整段原文逐字拽进来"
                        )
    return hints


# ===========================================================================
# Side effect: record 新 type 到 type_suggestions.md
# ===========================================================================

def record_new_types(case_data: dict, suggestions_path: Path = SUGGESTIONS) -> list[str]:
    """Append the case-specific types of a workflow to the suggestions file.

    The `type_registry` dicts of all procedures are merged (later procedures
    win on name clashes). Every dict entry whose name is not a taxonomy leaf is
    written as one markdown list item, unless the same (type name, case id)
    pair is already present in the file, which makes repeated runs idempotent.
    Existing pairs are recognised by scanning the file for list items of the
    form "- `name`: ... (来自 case-N".

    Malformed input is tolerated: a null / missing `procedures`, non-dict
    procedures and non-dict registries are skipped.

    Args:
        case_data: parsed workflow document; `case_id` is read from it
            ('?' when absent or empty).
        suggestions_path: markdown file to append to; created with a small
            header (parent directories included) when it does not exist.

    Returns:
        The lines written by this call (empty list when nothing was new).
    """
    # Merge the type_registry of every procedure.
    type_reg: dict = {}
    for p in case_data.get('procedures') or []:
        if not isinstance(p, dict):
            continue
        reg = p.get('type_registry')
        if isinstance(reg, dict):
            type_reg.update(reg)
    if not type_reg:
        return []

    leaves = load_type_leaves()
    case_id = case_data.get('case_id')
    if case_id is None or case_id == '':
        case_id = '?'   # keep a legitimate id of 0 instead of turning it into '?'
    text = suggestions_path.read_text(encoding='utf-8') if suggestions_path.exists() else ''

    # (type_name, case_id) pairs that are already recorded.
    existing = set(re.findall(
        r'^- `([^`]+)`:.*?\(来自 case-([^,)\s]+)', text, re.M
    ))

    new_lines: list[str] = []
    for tname, entry in type_reg.items():
        if not isinstance(entry, dict):
            continue
        if tname in leaves:
            continue   # already a taxonomy leaf, not a new type
        if (tname, str(case_id)) in existing:
            continue
        ext = entry.get('extends') or '?'
        # Each entry must stay on one line: a line break inside desc would split the list
        # item, the dedup regex above would never match it, and every later run would
        # append the entry again.
        desc = re.sub(r'\s*[\r\n]+\s*', ' ', str(entry.get('desc') or '')).strip() or '(无 desc)'
        new_lines.append(f'- `{tname}`: {desc}  (来自 case-{case_id}, extends `{ext}`)')

    if new_lines:
        # Make sure the suggestions file exists (create an empty skeleton otherwise).
        if not suggestions_path.exists():
            suggestions_path.parent.mkdir(parents=True, exist_ok=True)
            suggestions_path.write_text(
                '# Type 字典扩展建议\n\n## 累积条目\n\n', encoding='utf-8'
            )
        # Append at the end of the file.
        with suggestions_path.open('a', encoding='utf-8') as f:
            f.write('\n' + '\n'.join(new_lines) + '\n')

    return new_lines


# ===========================================================================
# main
# ===========================================================================

def main() -> None:
    """CLI entry point: run the lint checks on a workflow.json and report.

    Flow: parse the arguments, load the workflow, run the always-on checks, run
    the two source-based checks when a readable `--source` is given, record new
    types unless `--no-record`, then print either JSON (`--json`) or a
    human-readable report.

    Exit codes: 0 after a completed run, whatever the checks found; 2 when the
    workflow file is missing, unreadable, not valid JSON, or not a JSON object
    (argparse itself also exits with 2 on bad arguments).
    """
    ap = argparse.ArgumentParser(
        prog='lint-case.py',
        description='workflow 轻量 lint + 自动 record 新 type 到 type_suggestions.md',
    )
    ap.add_argument('--workflow', type=Path, required=True,
                    help='workflow.json (含 procedures 数组). lint 内部读 procedures + type_registry')
    ap.add_argument('--case-id', type=str, default=None,
                    help='record suggestions 用的 case_id. 不传就 fallback workflow.case_id 或 ?')
    ap.add_argument('--source', type=Path, default=None,
                    help='原文 input/case-N.json. 传了才启用「章节覆盖」+「value 逐字」两条结构/值强制校验')
    ap.add_argument('--ocr', type=Path, default=None,
                    help='配图 OCR 文本 (可选). 并入原文语料, 让逐字校验也认配图里的文字')
    ap.add_argument('--no-record', action='store_true',
                    help='只校验, 不写 suggestions')
    ap.add_argument('--json', action='store_true',
                    help='输出机器可读 JSON ({"checks": {名: [提示...]}}); runner 的完成度判据消费它')
    args = ap.parse_args()

    target_path = args.workflow
    if not target_path.exists():
        print(f'lint-case: 文件不存在 {target_path}', file=sys.stderr)
        sys.exit(2)

    try:
        case_data = json.loads(target_path.read_text(encoding='utf-8'))
    except json.JSONDecodeError as e:
        print(f'lint-case: {target_path} 不是合法 JSON: {e}', file=sys.stderr)
        sys.exit(2)
    except (OSError, UnicodeDecodeError) as e:
        print(f'lint-case: 读不了 {target_path}: {e}', file=sys.stderr)
        sys.exit(2)
    # Every check calls .get() on the document, so a JSON array/scalar root is an
    # input error, not something to lint.
    if not isinstance(case_data, dict):
        print(f'lint-case: {target_path} 顶层不是 JSON 对象 (应为含 procedures 的 dict)', file=sys.stderr)
        sys.exit(2)

    # Inject case_id from the CLI when the workflow has none (needed for recording).
    if args.case_id is not None and 'case_id' not in case_data:
        try:
            case_data['case_id'] = int(args.case_id)
        except ValueError:
            case_data['case_id'] = args.case_id

    case_id = case_data.get('case_id', '?')

    # All checks go into `results` (check name -> hints); shared by --json and the report.
    # NOTE(review): check_anchor_closure / check_skeleton_filled / check_dataflow_connected
    # are defined in this file but not run here — presumably another tool calls them
    # (their docstrings mention a pre-render gate); confirm before wiring them in.
    results: dict[str, list[str]] = {
        'type_completeness': check_type_completeness(case_data),
        'value_selfcontained': check_value_selfcontained(case_data),
        'placeholder_content': check_placeholder_content(case_data),
        'unresolved_quotes': check_unresolved_quotes(case_data),
        'classification_done': check_classification_done(case_data),
    }

    # The source-based checks only count as "run" when the source can actually be read;
    # otherwise they would compare against an empty corpus and report a misleading OK.
    source_checked = args.source is not None and args.source.is_file()
    if args.source is not None and not source_checked:
        print(f'lint-case: --source 文件不存在 {args.source}, 跳过章节覆盖 / value 逐字校验', file=sys.stderr)
    if args.ocr is not None and not args.ocr.is_file():
        print(f'lint-case: --ocr 文件不存在 {args.ocr}, 忽略', file=sys.stderr)
    if source_checked:
        source_raw, source_norm = _load_source_corpus(args.source, args.ocr)
        wf_norm = _norm(json.dumps(case_data, ensure_ascii=False))
        results['section_coverage'] = check_section_coverage(case_data, source_raw, wf_norm)
        results['value_verbatim'] = check_value_verbatim(case_data, source_norm)

    # Side effect: record new types.
    recorded: list[str] = []
    if not args.no_record:
        recorded = record_new_types(case_data)

    if args.json:
        print(json.dumps({'case_id': case_id, 'checks': results,
                          'source_checked': source_checked,
                          'recorded': len(recorded)}, ensure_ascii=False, indent=1))
        sys.exit(0)

    print(f'[lint] case-{case_id} ({target_path.name})')
    _HEADERS = {
        'type_completeness': ('type 完整性', '个提示'),
        'value_selfcontained': ('value 自包含', '个引用占位 (跑 wf-patch.py --resolve-passthrough 自动回填)'),
        'placeholder_content': ('value/directive 真实性', '处占位/缺失 (用 quote-source.py 从原文/配图 OCR 捞真内容回填)'),
        'unresolved_quotes': ('@quote 残留', '处未解析标记 (改锚点重跑 --resolve-quotes 或直接填真实内容)'),
        'classification_done': ('归类完成度', '项未完成 (Phase 2.1: effect/action 对词表填, intent 用标记格式)'),
        'section_coverage': ('章节覆盖(结构强制)', '个章节疑似漏抽 —— 骨架要覆盖原文每个章节'),
        'value_verbatim': ('value 逐字(值强制)', '处疑似缩写/改写 —— 文本类 value 要逐字搬原文(用 @quote)'),
    }
    for name, hints in results.items():
        title, suffix = _HEADERS[name]
        if hints:
            print(f'  · {title}: {len(hints)} {suffix}')
            for h in hints:
                print(f'      - {h}')
        else:
            print(f'  · {title}: OK')
    if not source_checked:
        print('  · 章节覆盖 + value 逐字: skipped (传 --source input/case-N.json [--ocr ocr.txt] 启用结构/值强制)')

    if not args.no_record:
        if recorded:
            print(f'  · 已 record {len(recorded)} 条新 type 到 {SUGGESTIONS.name}:')
            for ln in recorded:
                print(f'      {ln}')
        else:
            # Tell "no registry at all" apart from "everything was recorded before".
            merged_reg: dict = {}
            for p in case_data.get('procedures') or []:
                reg = p.get('type_registry') if isinstance(p, dict) else None
                if isinstance(reg, dict):
                    merged_reg.update(reg)
            if not merged_reg:
                print('  · 无新 type 可 record (type_registry 为空 — 全部 type 命中字典叶子)')
            else:
                print('  · 无新 type 可 record (type_registry 里的项已全部 record 过)')

    # The exit code never reflects lint findings.
    sys.exit(0)


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()