extract_post_topic.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
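"""
Extract topic-related words from a post's deconstruction content:
- New format: for every item under 灵感点/目的点/关键点, take the 「元素名称」 of each
  entry in 「选题点元素」 (the 「选题点」 field itself is not extracted).
- Legacy format (kept for compatibility): take the 「词」 of each entry in 「分词结果」.
The result is de-duplicated before it is output.
"""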
  7. import json
  8. from pathlib import Path
  9. def _append_from_deconstruct_item(item: dict, topics: list[str]) -> None:
  10. if not isinstance(item, dict):
  11. return
  12. for el in item.get("选题点元素") or []:
  13. if not isinstance(el, dict):
  14. continue
  15. name = el.get("元素名称")
  16. if name and isinstance(name, str) and name.strip():
  17. topics.append(name.strip())
  18. for seg in item.get("分词结果") or []:
  19. if not isinstance(seg, dict):
  20. continue
  21. word = seg.get("词")
  22. if word and isinstance(word, str) and word.strip():
  23. topics.append(word.strip())
  24. def extract_post_topic(account_name: str, post_id: str) -> list[str]:
  25. """
  26. 从解构内容中提取选题点元素(元素名称)并去重;不提取「选题点」字段。
  27. :param account_name: 账号名
  28. :param post_id: 帖子ID
  29. :return: 去重后的字符串列表
  30. """
  31. overall_derivation_dir = Path(__file__).resolve().parent.parent
  32. input_path = (
  33. overall_derivation_dir
  34. / "input"
  35. / account_name
  36. / "原始数据"
  37. / "解构内容"
  38. / f"{post_id}.json"
  39. )
  40. with open(input_path, "r", encoding="utf-8") as f:
  41. data = json.load(f)
  42. topics: list[str] = []
  43. for key in ("灵感点", "目的点", "关键点"):
  44. for item in data.get(key, []) or []:
  45. _append_from_deconstruct_item(item, topics)
  46. # 去重且保持首次出现顺序
  47. seen = set()
  48. unique_topics: list[str] = []
  49. for w in topics:
  50. if w not in seen:
  51. seen.add(w)
  52. unique_topics.append(w)
  53. return unique_topics
  54. def _load_post_id_list_from_exclude_note_ids(account_name: str) -> list[str]:
  55. """从 input/{account_name}/原始数据/exclude_note_ids.json 读取帖子 ID 列表(字符串数组)。"""
  56. overall_derivation_dir = Path(__file__).resolve().parent.parent
  57. path = overall_derivation_dir / "input" / account_name / "原始数据" / "exclude_note_ids.json"
  58. if not path.is_file():
  59. raise FileNotFoundError(f"未找到帖子 ID 列表文件: {path}")
  60. with open(path, "r", encoding="utf-8") as f:
  61. data = json.load(f)
  62. if not isinstance(data, list):
  63. raise ValueError(f"exclude_note_ids.json 应为字符串数组: {path}")
  64. out: list[str] = []
  65. for x in data:
  66. if isinstance(x, str) and x.strip():
  67. out.append(x.strip())
  68. return out
  69. def main(account_name: str, post_id: str):
  70. # parser = argparse.ArgumentParser(description="从解构内容中提取选题点")
  71. # parser.add_argument("account_name", help="账号名")
  72. # parser.add_argument("post_id", help="帖子ID")
  73. # args = parser.parse_args()
  74. topics = extract_post_topic(account_name, post_id)
  75. overall_derivation_dir = Path(__file__).resolve().parent.parent
  76. out_dir = overall_derivation_dir / "input" / account_name / "处理后数据" / "post_topic"
  77. out_dir.mkdir(parents=True, exist_ok=True)
  78. out_path = out_dir / f"{post_id}.json"
  79. with open(out_path, "w", encoding="utf-8") as f:
  80. json.dump(topics, f, ensure_ascii=False, indent=2)
  81. print(f"已写入 {len(topics)} 个选题点元素到 {out_path}")
  82. if __name__ == "__main__":
  83. account_name = "空间点阵设计研究室"
  84. post_id_list = _load_post_id_list_from_exclude_note_ids(account_name)
  85. for post_id in post_id_list:
  86. main(account_name=account_name, post_id=post_id)