""" 构建帖子ID到点和分类的完整映射 从人设.json和what解构结果中提取每个帖子的: 1. 所属的灵感分类、目的分类、关键点分类(来自人设.json) 2. 具体的灵感点、目的点、关键点(来自what解构结果) 3. 帖子详情 使用方式: python build_note_to_all_index.py --persona-dir data/阿里多多酱/out/人设_1110 """ import os import json import argparse from typing import Dict, List, Any from glob import glob from script.detail import get_xiaohongshu_detail def build_category_mapping(persona_data: Dict[str, Any]) -> Dict[str, List[str]]: """构建帖子ID到分类的映射 Args: persona_data: 人设数据 Returns: {note_id: [分类名称列表]} """ note_to_categories = {} # 遍历所有视角 for perspective in persona_data.get("灵感点列表", []): perspective_name = perspective.get("视角名称", "") # 遍历一级分类 for category_l1 in perspective.get("模式列表", []): category_l1_name = category_l1.get("分类名称", "") # 遍历二级分类 for category_l2 in category_l1.get("二级细分", []): category_l2_name = category_l2.get("分类名称", "") category_l2_def = category_l2.get("分类定义", "") note_ids = category_l2.get("帖子ID列表", []) # 去重帖子ID unique_note_ids = list(dict.fromkeys(note_ids)) # 为每个帖子添加分类信息 for note_id in unique_note_ids: if note_id not in note_to_categories: note_to_categories[note_id] = [] note_to_categories[note_id].append({ "分类类型": "灵感分类", "视角名称": perspective_name, "一级分类": category_l1_name, "二级分类": category_l2_name, "分类定义": category_l2_def }) return note_to_categories def extract_points_from_what(what_dir: str) -> Dict[str, Dict[str, Any]]: """从what解构结果提取所有点 Args: what_dir: what解构结果目录 Returns: {note_id: {灵感点列表, 目的点列表, 关键点列表}} """ note_to_points = {} # 读取所有what解构文件 what_files = glob(os.path.join(what_dir, "*_with_history_*.json")) for what_file in what_files: # 从文件名提取note_id filename = os.path.basename(what_file) note_id = filename.split("_with_history_")[0] with open(what_file, 'r', encoding='utf-8') as f: data = json.load(f) three_points = data.get("三点解构", {}) # 提取灵感点 inspiration_points = [] inspiration_data = three_points.get("灵感点", {}) for field in ["全新内容", "共性差异", "共性内容"]: items = inspiration_data.get(field, []) for item in items: point = item.get("灵感点", "") if point: inspiration_points.append({ "灵感点": point, "来源字段": field, "维度": item.get("维度", ""), "描述": item.get("描述", "") }) # 提取目的点 purpose_points = [] purpose_data = three_points.get("目的点", {}) # 主目的 main_purpose = purpose_data.get("main_purpose", {}) if main_purpose: point = main_purpose.get("目的点", "") if point: purpose_points.append({ "目的点": point, "类型": "主目的", "维度": main_purpose.get("维度", ""), "描述": main_purpose.get("描述", "") }) # 次要目的 secondary_purposes = purpose_data.get("secondary_purposes", []) for sec_purpose in secondary_purposes: point = sec_purpose.get("目的点", "") if point: purpose_points.append({ "目的点": point, "类型": "次要目的", "维度": sec_purpose.get("维度", ""), "描述": sec_purpose.get("描述", "") }) # 提取关键点 key_points = [] keypoint_data = three_points.get("关键点", {}) kp_list = keypoint_data.get("key_points", []) for kp in kp_list: point = kp.get("关键点", "") if point: key_points.append({ "关键点": point, "维度大类": kp.get("维度大类", ""), "维度细分": kp.get("维度细分", ""), "描述": kp.get("描述", "") }) note_to_points[note_id] = { "灵感点列表": inspiration_points, "目的点列表": purpose_points, "关键点列表": key_points } return note_to_points def build_note_to_all_index( persona_dir: str, fetch_details: bool = True ) -> Dict[str, Any]: """构建帖子ID到点和分类的完整映射 Args: persona_dir: 人设目录路径 fetch_details: 是否获取帖子详情 Returns: 完整的映射索引 """ persona_file = os.path.join(persona_dir, "人设.json") what_dir = os.path.join(persona_dir, "what解构结果") print(f"{'=' * 80}") print(f"构建帖子ID到点和分类的完整映射") print(f"{'=' * 80}") print(f"人设文件: {persona_file}") print(f"解构目录: {what_dir}\n") # 读取人设数据 with 
    with open(persona_file, 'r', encoding='utf-8') as f:
        persona_data = json.load(f)

    # Build the category mapping
    print("─" * 80)
    print("1. Extracting category info (from 人设.json)")
    print("─" * 80 + "\n")
    note_to_categories = build_category_mapping(persona_data)
    print(f"✓ Extracted category info for {len(note_to_categories)} notes from 人设.json\n")

    # Extract point info
    print("─" * 80)
    print("2. Extracting point info (from the what解构结果)")
    print("─" * 80 + "\n")
    note_to_points = extract_points_from_what(what_dir)
    print(f"✓ Extracted point info for {len(note_to_points)} notes from the what解构结果\n")

    # Merge all note IDs
    all_note_ids = set(note_to_categories.keys()) | set(note_to_points.keys())
    print(f"✓ {len(all_note_ids)} unique notes in total\n")

    # Build the complete mapping
    note_index = {}
    for note_id in all_note_ids:
        note_index[note_id] = {
            "帖子ID": note_id,
            "所属分类": note_to_categories.get(note_id, []),
            "灵感点列表": note_to_points.get(note_id, {}).get("灵感点列表", []),
            "目的点列表": note_to_points.get(note_id, {}).get("目的点列表", []),
            "关键点列表": note_to_points.get(note_id, {}).get("关键点列表", []),
        }

    # Fetch note details
    if fetch_details:
        print("=" * 80)
        print("Fetching note details...")
        print("=" * 80 + "\n")

        for i, note_id in enumerate(sorted(all_note_ids), 1):
            try:
                print(f"[{i}/{len(all_note_ids)}] Fetching detail: {note_id}")
                detail = get_xiaohongshu_detail(note_id)
                note_index[note_id]["帖子详情"] = detail
            except Exception as e:
                print(f"  ⚠️ Fetch failed: {e}")
                note_index[note_id]["帖子详情"] = {
                    "channel_content_id": note_id,
                    "error": str(e),
                }

        print("\n✓ Note details fetched\n")

    return note_index


def save_index(index_data: Dict[str, Any], output_file: str):
    """Save the index to a file.

    Args:
        index_data: Index data.
        output_file: Output file path.
    """
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(index_data, f, ensure_ascii=False, indent=2)

    print(f"✓ Index saved: {output_file}")


def print_statistics(index_data: Dict[str, Any]):
    """Print summary statistics.

    Args:
        index_data: The complete index data.
    """
    print("\n" + "=" * 80)
    print("Index statistics")
    print("=" * 80 + "\n")

    total_notes = len(index_data)
    print(f"Total notes: {total_notes}")

    # Notes that have category info
    notes_with_categories = sum(1 for v in index_data.values() if v.get("所属分类"))
    print(f"Notes with category info: {notes_with_categories}")

    # Notes that have point info
    notes_with_inspiration = sum(1 for v in index_data.values() if v.get("灵感点列表"))
    notes_with_purpose = sum(1 for v in index_data.values() if v.get("目的点列表"))
    notes_with_keypoint = sum(1 for v in index_data.values() if v.get("关键点列表"))
    print(f"Notes with inspiration points: {notes_with_inspiration}")
    print(f"Notes with purpose points: {notes_with_purpose}")
    print(f"Notes with key points: {notes_with_keypoint}")

    # Per-note averages
    total_categories = sum(len(v.get("所属分类", [])) for v in index_data.values())
    total_inspiration = sum(len(v.get("灵感点列表", [])) for v in index_data.values())
    total_purpose = sum(len(v.get("目的点列表", [])) for v in index_data.values())
    total_keypoint = sum(len(v.get("关键点列表", [])) for v in index_data.values())
    if total_notes > 0:
        print("\nPer-note averages:")
        print(f"  Categories: {total_categories / total_notes:.1f}")
        print(f"  Inspiration points: {total_inspiration / total_notes:.1f}")
        print(f"  Purpose points: {total_purpose / total_notes:.1f}")
        print(f"  Key points: {total_keypoint / total_notes:.1f}")


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(
        description="Build the complete index from note ID to points and categories",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage
  python build_note_to_all_index.py --persona-dir data/阿里多多酱/out/人设_1110

  # Build the index only, without fetching note details
  python build_note_to_all_index.py --persona-dir data/阿里多多酱/out/人设_1110 --no-details

  # Custom output file
  python build_note_to_all_index.py --persona-dir data/阿里多多酱/out/人设_1110 --output custom.json
"""
    )
    parser.add_argument(
        "--persona-dir",
        required=True,
        help="Persona directory (must contain 人设.json and the what解构结果/ directory)"
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Output file path (default: {persona_dir}/帖子到分类和点映射.json)"
    )
    parser.add_argument(
        "--no-details",
        action="store_true",
        help="Skip fetching note details (only build the index structure)"
    )

    args = parser.parse_args()
    persona_dir = args.persona_dir
    fetch_details = not args.no_details

    # Check that the required inputs exist
    persona_file = os.path.join(persona_dir, "人设.json")
    what_dir = os.path.join(persona_dir, "what解构结果")

    if not os.path.exists(persona_file):
        print(f"❌ Error: persona file not found: {persona_file}")
        return
    if not os.path.exists(what_dir):
        print(f"❌ Error: what解构结果 directory not found: {what_dir}")
        return

    # Build the index
    index_data = build_note_to_all_index(persona_dir, fetch_details=fetch_details)

    # Determine the output file path
    if args.output:
        output_file = args.output
    else:
        output_file = os.path.join(persona_dir, "帖子到分类和点映射.json")

    # Save the index and print statistics
    save_index(index_data, output_file)
    print_statistics(index_data)


if __name__ == "__main__":
    main()
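
# A sketch of one entry in the generated index, using the field names the code
# above assembles. The values shown here are illustrative placeholders, not real
# data, and "帖子详情" is present only when details are fetched (no --no-details):
#
# "note_id_xxx": {
#     "帖子ID": "note_id_xxx",
#     "所属分类": [
#         {"分类类型": "灵感分类", "视角名称": "...", "一级分类": "...",
#          "二级分类": "...", "分类定义": "..."}
#     ],
#     "灵感点列表": [{"灵感点": "...", "来源字段": "共性内容", "维度": "...", "描述": "..."}],
#     "目的点列表": [{"目的点": "...", "类型": "主目的", "维度": "...", "描述": "..."}],
#     "关键点列表": [{"关键点": "...", "维度大类": "...", "维度细分": "...", "描述": "..."}],
#     "帖子详情": {...}  # on fetch failure: {"channel_content_id": "...", "error": "..."}
# }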