howard
/
Agent


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
							"""
从帖子解构内容中提取选题相关词：
- 新格式：灵感点/目的点/关键点 下每项「选题点元素」中的「元素名称」（不提取「选题点」字段本身）
- 旧格式（兼容）：分词结果 中每项的「词」
去重后输出。
"""
import json
from pathlib import Path


def _append_from_deconstruct_item(item: dict, topics: list[str]) -> None:
    """
    Append the topic terms found in one deconstructed item to ``topics``.

    Two sources are read, in this order:

    - ``item["选题点元素"]``: the ``"元素名称"`` value of each entry;
    - ``item["分词结果"]``: the ``"词"`` value of each entry.

    Entries that are not dicts are skipped, as are values that are not
    strings or that are empty after stripping whitespace. Terms are appended
    stripped. No de-duplication happens here.

    :param item: one deconstructed item; anything that is not a dict is ignored
    :param topics: output list, extended in place
    """
    if not isinstance(item, dict):
        return
    # (key of the list inside the item, key of the term inside each entry)
    sources = (("选题点元素", "元素名称"), ("分词结果", "词"))
    for list_key, term_key in sources:
        # A missing key or an explicit null is treated as an empty list.
        for entry in item.get(list_key) or []:
            if not isinstance(entry, dict):
                continue
            term = entry.get(term_key)
            if not isinstance(term, str):
                continue
            cleaned = term.strip()
            if cleaned:
                topics.append(cleaned)


def extract_post_topic(account_name: str, post_id: str) -> list[str]:
    """
    Collect the de-duplicated topic terms of one post.

    Loads ``input/<account_name>/原始数据/解构内容/<post_id>.json``, where
    ``input`` sits in the parent of the directory containing this file. Every
    item listed under 「灵感点」, 「目的点」 and 「关键点」 is handed to
    ``_append_from_deconstruct_item``, and the gathered terms are then
    de-duplicated. The 「选题点」 field itself is not extracted.

    :param account_name: account name (directory under ``input``)
    :param post_id: post ID (stem of the JSON file name)
    :return: unique terms, in order of first appearance
    """
    project_root = Path(__file__).resolve().parent.parent
    source_file = project_root.joinpath(
        "input", account_name, "原始数据", "解构内容", f"{post_id}.json"
    )
    with source_file.open("r", encoding="utf-8") as fh:
        payload = json.load(fh)

    collected: list[str] = []
    for section in ("灵感点", "目的点", "关键点"):
        # A missing section or an explicit null counts as "no items".
        entries = payload.get(section, []) or []
        for entry in entries:
            _append_from_deconstruct_item(entry, collected)

    # dict keys keep insertion order, so this removes duplicates while
    # preserving the position of each term's first occurrence.
    return list(dict.fromkeys(collected))


def _load_post_id_list_from_exclude_note_ids(account_name: str) -> list[str]:
    """
    Read the post ID list from ``input/<account_name>/原始数据/exclude_note_ids.json``.

    ``input`` sits in the parent of the directory containing this file. The
    JSON document must be a list; elements that are not strings, or that are
    blank after stripping, are dropped, and the remaining IDs are returned
    stripped.

    :param account_name: account name (directory under ``input``)
    :return: stripped, non-empty post IDs in file order
    :raises FileNotFoundError: if the JSON file does not exist
    :raises ValueError: if the top-level JSON value is not a list
    """
    project_root = Path(__file__).resolve().parent.parent
    id_file = project_root.joinpath(
        "input", account_name, "原始数据", "exclude_note_ids.json"
    )
    if not id_file.is_file():
        raise FileNotFoundError(f"未找到帖子 ID 列表文件: {id_file}")

    with id_file.open("r", encoding="utf-8") as fh:
        raw_ids = json.load(fh)
    if not isinstance(raw_ids, list):
        raise ValueError(f"exclude_note_ids.json 应为字符串数组: {id_file}")

    return [
        entry.strip()
        for entry in raw_ids
        if isinstance(entry, str) and entry.strip()
    ]


def main(account_name: str, post_id: str):
    """
    Extract the topic terms of one post and store them as a JSON file.

    The terms returned by ``extract_post_topic`` are written, as a JSON array
    with non-ASCII characters kept as-is and a 2-space indent, to
    ``input/<account_name>/处理后数据/post_topic/<post_id>.json``. ``input``
    sits in the parent of the directory containing this file, and the output
    directory is created when missing. A one-line summary is printed when done.

    :param account_name: account name (directory under ``input``)
    :param post_id: post ID (stem of both the input and the output file)
    """
    topics = extract_post_topic(account_name, post_id)

    project_root = Path(__file__).resolve().parent.parent
    target_dir = project_root.joinpath(
        "input", account_name, "处理后数据", "post_topic"
    )
    target_dir.mkdir(parents=True, exist_ok=True)
    target_file = target_dir / f"{post_id}.json"

    with target_file.open("w", encoding="utf-8") as fh:
        json.dump(topics, fh, ensure_ascii=False, indent=2)

    print(f"已写入 {len(topics)} 个选题点元素到 {target_file}")


if __name__ == "__main__":
    # Batch run: process every post ID listed in the account's
    # exclude_note_ids.json, writing one output file per post.
    account_name = "空间点阵设计研究室"
    for post_id in _load_post_id_list_from_exclude_note_ids(account_name):
        main(account_name=account_name, post_id=post_id)