"""
从帖子解构内容中提取选题点(灵感点/目的点/关键点 下 分词结果 中的 词),去重后输出。
"""
import json
import argparse
from pathlib import Path


  7. def extract_post_topic(account_name: str, post_id: str) -> list[str]:
  8. """
  9. 从解构内容中提取选题点并去重。
  10. :param account_name: 账号名
  11. :param post_id: 帖子ID
  12. :return: 去重后的选题点字符串列表
  13. """
  14. base = Path(__file__).resolve().parent
  15. input_path = base / "input" / account_name / "原始数据" / "解构内容" / f"{post_id}.json"
  16. with open(input_path, "r", encoding="utf-8") as f:
  17. data = json.load(f)
  18. topics: list[str] = []
  19. for key in ("灵感点", "目的点", "关键点"):
  20. for item in data.get(key, []):
  21. for seg in item.get("分词结果", []):
  22. word = seg.get("词")
  23. if word and isinstance(word, str) and word.strip():
  24. topics.append(word.strip())
  25. # 去重且保持首次出现顺序
  26. seen = set()
  27. unique_topics: list[str] = []
  28. for w in topics:
  29. if w not in seen:
  30. seen.add(w)
  31. unique_topics.append(w)
  32. return unique_topics
  33. def main(account_name: str, post_id: str):
  34. # parser = argparse.ArgumentParser(description="从解构内容中提取选题点")
  35. # parser.add_argument("account_name", help="账号名")
  36. # parser.add_argument("post_id", help="帖子ID")
  37. # args = parser.parse_args()
  38. topics = extract_post_topic(account_name, post_id)
  39. base = Path(__file__).resolve().parent
  40. out_dir = base / "input" / account_name / "post_topic"
  41. out_dir.mkdir(parents=True, exist_ok=True)
  42. out_path = out_dir / f"{post_id}.json"
  43. with open(out_path, "w", encoding="utf-8") as f:
  44. json.dump(topics, f, ensure_ascii=False, indent=2)
  45. print(f"已写入 {len(topics)} 个选题点到 {out_path}")
  46. if __name__ == "__main__":
  47. main(account_name="家有大志", post_id="69185d49000000000d00f94e")