""" 从帖子解构内容中提取选题相关词: - 新格式:灵感点/目的点/关键点 下每项「选题点元素」中的「元素名称」(不提取「选题点」字段本身) - 旧格式(兼容):分词结果 中每项的「词」 去重后输出。 """ import json from pathlib import Path def _append_from_deconstruct_item(item: dict, topics: list[str]) -> None: if not isinstance(item, dict): return for el in item.get("选题点元素") or []: if not isinstance(el, dict): continue name = el.get("元素名称") if name and isinstance(name, str) and name.strip(): topics.append(name.strip()) for seg in item.get("分词结果") or []: if not isinstance(seg, dict): continue word = seg.get("词") if word and isinstance(word, str) and word.strip(): topics.append(word.strip()) def extract_post_topic(account_name: str, post_id: str) -> list[str]: """ 从解构内容中提取选题点元素(元素名称)并去重;不提取「选题点」字段。 :param account_name: 账号名 :param post_id: 帖子ID :return: 去重后的字符串列表 """ overall_derivation_dir = Path(__file__).resolve().parent.parent input_path = ( overall_derivation_dir / "input" / account_name / "原始数据" / "解构内容" / f"{post_id}.json" ) with open(input_path, "r", encoding="utf-8") as f: data = json.load(f) topics: list[str] = [] for key in ("灵感点", "目的点", "关键点"): for item in data.get(key, []) or []: _append_from_deconstruct_item(item, topics) # 去重且保持首次出现顺序 seen = set() unique_topics: list[str] = [] for w in topics: if w not in seen: seen.add(w) unique_topics.append(w) return unique_topics def _load_post_id_list_from_exclude_note_ids(account_name: str) -> list[str]: """从 input/{account_name}/原始数据/exclude_note_ids.json 读取帖子 ID 列表(字符串数组)。""" overall_derivation_dir = Path(__file__).resolve().parent.parent path = overall_derivation_dir / "input" / account_name / "原始数据" / "exclude_note_ids.json" if not path.is_file(): raise FileNotFoundError(f"未找到帖子 ID 列表文件: {path}") with open(path, "r", encoding="utf-8") as f: data = json.load(f) if not isinstance(data, list): raise ValueError(f"exclude_note_ids.json 应为字符串数组: {path}") out: list[str] = [] for x in data: if isinstance(x, str) and x.strip(): out.append(x.strip()) return out def main(account_name: str, post_id: str): # parser = argparse.ArgumentParser(description="从解构内容中提取选题点") # parser.add_argument("account_name", help="账号名") # parser.add_argument("post_id", help="帖子ID") # args = parser.parse_args() topics = extract_post_topic(account_name, post_id) overall_derivation_dir = Path(__file__).resolve().parent.parent out_dir = overall_derivation_dir / "input" / account_name / "处理后数据" / "post_topic" out_dir.mkdir(parents=True, exist_ok=True) out_path = out_dir / f"{post_id}.json" with open(out_path, "w", encoding="utf-8") as f: json.dump(topics, f, ensure_ascii=False, indent=2) print(f"已写入 {len(topics)} 个选题点元素到 {out_path}") if __name__ == "__main__": account_name = "空间点阵设计研究室" post_id_list = _load_post_id_list_from_exclude_note_ids(account_name) for post_id in post_id_list: main(account_name=account_name, post_id=post_id)