""" 从帖子解构内容中提取选题点(灵感点/目的点/关键点 下 分词结果 中的 词),去重后输出。 """ import json import argparse from pathlib import Path def extract_post_topic(account_name: str, post_id: str) -> list[str]: """ 从解构内容中提取选题点并去重。 :param account_name: 账号名 :param post_id: 帖子ID :return: 去重后的选题点字符串列表 """ base = Path(__file__).resolve().parent input_path = base / "input" / account_name / "原始数据" / "解构内容" / f"{post_id}.json" with open(input_path, "r", encoding="utf-8") as f: data = json.load(f) topics: list[str] = [] for key in ("灵感点", "目的点", "关键点"): for item in data.get(key, []): for seg in item.get("分词结果", []): word = seg.get("词") if word and isinstance(word, str) and word.strip(): topics.append(word.strip()) # 去重且保持首次出现顺序 seen = set() unique_topics: list[str] = [] for w in topics: if w not in seen: seen.add(w) unique_topics.append(w) return unique_topics def main(account_name: str, post_id: str): # parser = argparse.ArgumentParser(description="从解构内容中提取选题点") # parser.add_argument("account_name", help="账号名") # parser.add_argument("post_id", help="帖子ID") # args = parser.parse_args() topics = extract_post_topic(account_name, post_id) base = Path(__file__).resolve().parent out_dir = base / "input" / account_name / "post_topic" out_dir.mkdir(parents=True, exist_ok=True) out_path = out_dir / f"{post_id}.json" with open(out_path, "w", encoding="utf-8") as f: json.dump(topics, f, ensure_ascii=False, indent=2) print(f"已写入 {len(topics)} 个选题点到 {out_path}") if __name__ == "__main__": main(account_name="家有大志", post_id="69185d49000000000d00f94e")