- """
- 从帖子解构内容中提取选题相关词:
- - 新格式:灵感点/目的点/关键点 下每项「选题点元素」中的「元素名称」(不提取「选题点」字段本身)
- - 旧格式(兼容):分词结果 中每项的「词」
- 去重后输出。
- """
- import json
- from pathlib import Path
- def _append_from_deconstruct_item(item: dict, topics: list[str]) -> None:
- if not isinstance(item, dict):
- return
- for el in item.get("选题点元素") or []:
- if not isinstance(el, dict):
- continue
- name = el.get("元素名称")
- if name and isinstance(name, str) and name.strip():
- topics.append(name.strip())
- for seg in item.get("分词结果") or []:
- if not isinstance(seg, dict):
- continue
- word = seg.get("词")
- if word and isinstance(word, str) and word.strip():
- topics.append(word.strip())
def extract_post_topic(account_name: str, post_id: str) -> list[str]:
    """Extract deduplicated topic words from a post's deconstruction JSON.

    Reads ``input/{account_name}/原始数据/解构内容/{post_id}.json`` (relative
    to the package root, i.e. two levels above this file) and collects topic
    element names — and legacy segment words — from the "灵感点" / "目的点" /
    "关键点" sections via :func:`_append_from_deconstruct_item`.

    :param account_name: account name (directory under ``input/``)
    :param post_id: post ID (stem of the deconstruction JSON file)
    :return: topic words, deduplicated, first-occurrence order preserved
    :raises FileNotFoundError: if the deconstruction file does not exist
    :raises json.JSONDecodeError: if the file is not valid JSON
    """
    overall_derivation_dir = Path(__file__).resolve().parent.parent
    input_path = (
        overall_derivation_dir
        / "input"
        / account_name
        / "原始数据"
        / "解构内容"
        / f"{post_id}.json"
    )
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    topics: list[str] = []
    for key in ("灵感点", "目的点", "关键点"):
        # `or []` guards against an explicit null value under the key.
        for item in data.get(key, []) or []:
            _append_from_deconstruct_item(item, topics)

    # dict preserves insertion order (Python 3.7+), so this dedupes while
    # keeping first-occurrence order — replaces the manual seen-set loop.
    return list(dict.fromkeys(topics))
def _load_post_id_list_from_exclude_note_ids(account_name: str) -> list[str]:
    """Read the post-ID list for *account_name* as a JSON string array.

    Loads ``input/{account_name}/原始数据/exclude_note_ids.json`` relative to
    the package root and returns the non-blank entries, whitespace-stripped.

    :param account_name: account name (directory under ``input/``)
    :return: list of post IDs (possibly empty)
    :raises FileNotFoundError: if the file is missing
    :raises ValueError: if the JSON payload is not a list
    """
    base_dir = Path(__file__).resolve().parent.parent
    id_file = base_dir / "input" / account_name / "原始数据" / "exclude_note_ids.json"
    if not id_file.is_file():
        raise FileNotFoundError(f"未找到帖子 ID 列表文件: {id_file}")
    with open(id_file, "r", encoding="utf-8") as f:
        payload = json.load(f)
    if not isinstance(payload, list):
        raise ValueError(f"exclude_note_ids.json 应为字符串数组: {id_file}")
    # Keep only non-blank string entries, stripped of surrounding whitespace.
    return [entry.strip() for entry in payload if isinstance(entry, str) and entry.strip()]
def main(account_name: str, post_id: str) -> None:
    """Extract one post's topic words and write them out as JSON.

    Writes the deduplicated topic list to
    ``input/{account_name}/处理后数据/post_topic/{post_id}.json`` (relative to
    the package root), creating parent directories as needed, then prints a
    one-line summary.

    :param account_name: account name (directory under ``input/``)
    :param post_id: post ID (stem of both the input and output JSON files)
    """
    # Removed the commented-out argparse scaffolding — this entry point is
    # driven programmatically (see the __main__ batch loop), not via CLI args.
    topics = extract_post_topic(account_name, post_id)
    overall_derivation_dir = Path(__file__).resolve().parent.parent
    out_dir = overall_derivation_dir / "input" / account_name / "处理后数据" / "post_topic"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{post_id}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(topics, f, ensure_ascii=False, indent=2)
    print(f"已写入 {len(topics)} 个选题点元素到 {out_path}")
- if __name__ == "__main__":
- account_name = "空间点阵设计研究室"
- post_id_list = _load_post_id_list_from_exclude_note_ids(account_name)
- for post_id in post_id_list:
- main(account_name=account_name, post_id=post_id)