| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- """
- 从帖子解构内容中提取选题点(灵感点/目的点/关键点 下 分词结果 中的 词),去重后输出。
- """
- import json
- import argparse
- from pathlib import Path
def extract_post_topic(account_name: str, post_id: str) -> list[str]:
    """Extract topic words from a post's deconstruction file, de-duplicated.

    Reads ``input/<account_name>/原始数据/解构内容/<post_id>.json`` relative to
    this file and collects every word (``词``) found under the segmentation
    results (``分词结果``) of the 灵感点 / 目的点 / 关键点 sections.

    :param account_name: Account name (directory under ``input``).
    :param post_id: Post ID (stem of the JSON file).
    :return: Stripped topic words, de-duplicated, in first-seen order.
    :raises FileNotFoundError: if the deconstruction file does not exist.
    """
    base = Path(__file__).resolve().parent
    input_path = base / "input" / account_name / "原始数据" / "解构内容" / f"{post_id}.json"
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Flatten every segmented word from the three topic sections; missing
    # sections or missing "分词结果" keys simply contribute nothing.
    raw_words = (
        seg.get("词")
        for key in ("灵感点", "目的点", "关键点")
        for item in data.get(key, [])
        for seg in item.get("分词结果", [])
    )
    cleaned = (w.strip() for w in raw_words if isinstance(w, str) and w.strip())
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(cleaned))
def main(account_name: str, post_id: str) -> None:
    """Extract topic words for one post and write them as a JSON array.

    Output is written to ``input/<account_name>/post_topic/<post_id>.json``
    (directories are created as needed), then a summary line is printed.

    :param account_name: Account name (directory under ``input``).
    :param post_id: Post ID.
    """
    topics = extract_post_topic(account_name, post_id)
    base = Path(__file__).resolve().parent
    out_dir = base / "input" / account_name / "post_topic"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{post_id}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(topics, f, ensure_ascii=False, indent=2)
    print(f"已写入 {len(topics)} 个选题点到 {out_path}")
if __name__ == "__main__":
    # Hard-coded account/post for ad-hoc runs; CLI argument parsing appears
    # intentionally disabled (note the unused argparse import at the top).
    main(account_name="家有大志", post_id="69185d49000000000d00f94e")
|