"""Build reverse indexes for persona categories.

Converts persona data and "what"-deconstruction results into reverse indexes
keyed by category name. Three indexes are produced:

- inspiration categories (from 人设.json)
- purpose categories (from the what-deconstruction results)
- key-point categories (from the what-deconstruction results)

Usage:
    python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110
"""

import os
import json
import argparse
from typing import Dict, List, Any, Iterable, Iterator, Optional, Tuple
from glob import escape as _glob_escape
from glob import glob

try:
    from script.detail import get_xiaohongshu_detail
except ImportError as _exc:
    # The detail fetcher is only required when note details are requested.
    # Deferring the failure keeps the pure index builders (and --no-details)
    # usable when the project package is not importable.
    get_xiaohongshu_detail = None
    _DETAIL_IMPORT_ERROR: Optional[ImportError] = _exc
else:
    _DETAIL_IMPORT_ERROR = None


# Key under which every index entry stores its list of note IDs.
_NOTE_IDS_KEY = "帖子ID列表"


def _ordered_unique(*id_lists: Optional[Iterable[Any]]) -> List[Any]:
    """Concatenate the given ID lists, dropping duplicates, keeping first-seen order.

    ``None`` entries are treated as empty lists.
    """
    merged: Dict[Any, None] = {}
    for ids in id_lists:
        merged.update(dict.fromkeys(ids or []))
    return list(merged)


def _upsert_category(
    index: Dict[str, Any],
    name: str,
    level: str,
    definition: str,
    path: List[Dict[str, str]],
    note_ids: List[Any],
) -> None:
    """Create the entry for ``name`` or merge ``note_ids`` into the existing one.

    When the name already exists, only its note ID list is extended; the
    metadata of the first occurrence is kept.
    """
    existing = index.get(name)
    if existing is not None:
        existing[_NOTE_IDS_KEY] = _ordered_unique(existing[_NOTE_IDS_KEY], note_ids)
        return
    index[name] = {
        "分类层级": level,
        "分类名称": name,
        "分类定义": definition,
        "分类路径": path,
        "帖子ID列表": list(note_ids),
    }


def build_inspiration_index(persona_data: Dict[str, Any]) -> Dict[str, Any]:
    """Build the inspiration-category index.

    Walks perspectives -> first-level categories -> second-level categories
    and registers every first- and second-level category under its name.
    Note IDs are de-duplicated in first-seen order so the output is
    deterministic across runs.

    Args:
        persona_data: Persona data containing the "灵感点列表" list.

    Returns:
        Mapping from category name to its index entry.
    """
    index: Dict[str, Any] = {}

    for perspective in persona_data.get("灵感点列表") or []:
        perspective_node = {
            "视角名称": perspective.get("视角名称", ""),
            "视角描述": perspective.get("视角描述", ""),
        }

        for category_l1 in perspective.get("模式列表") or []:
            l1_name = category_l1.get("分类名称", "")
            l1_def = category_l1.get("核心定义", "")
            l1_node = {"分类名称": l1_name, "分类定义": l1_def}

            # ID lists of every second-level category under this first-level one.
            l1_id_lists: List[List[Any]] = []

            for category_l2 in category_l1.get("二级细分") or []:
                l2_name = category_l2.get("分类名称", "")
                l2_def = category_l2.get("分类定义", "")
                l2_note_ids = _ordered_unique(category_l2.get("帖子ID列表"))
                l1_id_lists.append(l2_note_ids)

                # Fresh dict copies so index entries never share path nodes.
                l2_path = [
                    dict(perspective_node),
                    dict(l1_node),
                    {"分类名称": l2_name, "分类定义": l2_def},
                ]
                _upsert_category(index, l2_name, "二级分类", l2_def, l2_path, l2_note_ids)

            l1_path = [dict(perspective_node), dict(l1_node)]
            _upsert_category(
                index,
                l1_name,
                "一级分类",
                l1_def,
                l1_path,
                _ordered_unique(*l1_id_lists),
            )

    return index


def _iter_what_files(what_dir: str) -> Iterator[Tuple[str, Dict[str, Any]]]:
    """Yield ``(note_id, parsed_json)`` for each what-deconstruction file.

    Files are matched by ``*_with_history_*.json`` and visited in sorted order
    so that the resulting indexes are deterministic. The note ID is the part
    of the file name before ``_with_history_``.
    """
    # Escape the directory so glob metacharacters in the path are literal.
    pattern = os.path.join(_glob_escape(what_dir), "*_with_history_*.json")
    for what_file in sorted(glob(pattern)):
        note_id = os.path.basename(what_file).split("_with_history_")[0]
        with open(what_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        yield note_id, data


def _deconstruction_section(data: Dict[str, Any], section: str) -> Dict[str, Any]:
    """Return ``data["三点解构"][section]``, or ``{}`` if missing or null."""
    return (data.get("三点解构") or {}).get(section) or {}


def _add_note(
    index: Dict[str, Any],
    name: str,
    base_fields: Dict[str, Any],
    note_id: str,
) -> None:
    """Ensure ``index[name]`` exists and contains ``note_id`` exactly once.

    ``base_fields`` is only used when the entry is created.
    """
    entry = index.get(name)
    if entry is None:
        entry = {**base_fields, "帖子ID列表": []}
        index[name] = entry
    if note_id not in entry[_NOTE_IDS_KEY]:
        entry[_NOTE_IDS_KEY].append(note_id)


def build_purpose_index(what_dir: str) -> Dict[str, Any]:
    """Build the purpose-category index.

    Args:
        what_dir: Directory containing the what-deconstruction result files.

    Returns:
        Mapping from purpose name to its index entry. An entry keeps the
        type ("主目的" / "次要目的") and metadata of its first occurrence.
    """
    index: Dict[str, Any] = {}

    for note_id, data in _iter_what_files(what_dir):
        purpose_data = _deconstruction_section(data, "目的点")

        # Main purpose first, then the secondary ones, as (type, purpose) pairs.
        tagged_purposes = []
        main_purpose = purpose_data.get("main_purpose") or {}
        if main_purpose:
            tagged_purposes.append(("主目的", main_purpose))
        tagged_purposes.extend(
            ("次要目的", sec_purpose)
            for sec_purpose in purpose_data.get("secondary_purposes") or []
        )

        for purpose_type, purpose in tagged_purposes:
            purpose_name = purpose.get("目的点", "")
            if not purpose_name:
                continue
            _add_note(
                index,
                purpose_name,
                {
                    "分类类型": purpose_type,
                    "目的点": purpose_name,
                    "维度": purpose.get("维度", ""),
                    "描述": purpose.get("描述", ""),
                },
                note_id,
            )

    return index


def build_keypoint_index(what_dir: str) -> Dict[str, Any]:
    """Build the key-point index.

    Args:
        what_dir: Directory containing the what-deconstruction result files.

    Returns:
        Mapping from key-point name to its index entry.
    """
    index: Dict[str, Any] = {}

    for note_id, data in _iter_what_files(what_dir):
        keypoint_data = _deconstruction_section(data, "关键点")

        for kp in keypoint_data.get("key_points") or []:
            kp_name = kp.get("关键点", "")
            if not kp_name:
                continue
            _add_note(
                index,
                kp_name,
                {
                    "关键点": kp_name,
                    "维度大类": kp.get("维度大类", ""),
                    "维度细分": kp.get("维度细分", ""),
                    "描述": kp.get("描述", ""),
                },
                note_id,
            )

    return index


def fetch_note_details(
    category_data: Dict[str, Any],
    cache: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Attach note details to every category entry.

    Each entry gains a "帖子详情列表" list holding one detail object per ID in
    its "帖子ID列表". A failed fetch is recorded as an object carrying the note
    ID and the error text instead of aborting the run.

    Args:
        category_data: Index whose entries contain "帖子ID列表".
        cache: Optional mapping of note ID -> detail. IDs already present are
            not fetched again and newly fetched details are stored into it,
            so one cache can be shared across several calls.

    Returns:
        The same ``category_data`` object, updated in place.

    Raises:
        ImportError: If details must be fetched but ``script.detail`` could
            not be imported.
    """
    if cache is None:
        cache = {}

    all_note_ids = _ordered_unique(
        *(category_info.get(_NOTE_IDS_KEY) for category_info in category_data.values())
    )
    pending_ids = [note_id for note_id in all_note_ids if note_id not in cache]

    print(f"\n{'=' * 80}")
    print("开始获取帖子详情...")
    print(f"{'=' * 80}\n")
    print(f"共有 {len(all_note_ids)} 个唯一帖子\n")

    if pending_ids and get_xiaohongshu_detail is None:
        raise ImportError(
            "script.detail.get_xiaohongshu_detail is required to fetch note details"
        ) from _DETAIL_IMPORT_ERROR

    for i, note_id in enumerate(pending_ids, 1):
        print(f"[{i}/{len(pending_ids)}] 获取详情: {note_id}")
        try:
            cache[note_id] = get_xiaohongshu_detail(note_id)
        except Exception as e:
            # Deliberately best-effort: one failing note must not abort the run.
            print(f" ⚠️ 获取失败: {e}")
            cache[note_id] = {
                "channel_content_id": note_id,
                "error": str(e),
            }

    print("\n✓ 帖子详情获取完成\n")

    for category_info in category_data.values():
        category_info["帖子详情列表"] = [
            cache.get(note_id, {"channel_content_id": note_id})
            for note_id in category_info.get(_NOTE_IDS_KEY) or []
        ]

    return category_data


def save_index(index_data: Dict[str, Any], output_file: str):
    """Write the index to ``output_file`` as indented UTF-8 JSON.

    The parent directory is created when it does not exist.

    Args:
        index_data: Index data to serialize.
        output_file: Destination file path.
    """
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(index_data, f, ensure_ascii=False, indent=2)

    print(f"✓ 索引已保存: {output_file}")


def print_statistics(index_data: Dict[str, Any]):
    """Print category and note counts for each index.

    Args:
        index_data: Mapping of index type -> index (category name -> entry).
    """
    print(f"\n{'=' * 80}")
    print("索引统计信息")
    print(f"{'=' * 80}\n")

    for index_type, categories in index_data.items():
        total_categories = len(categories)

        # Distinct notes across all categories of this index.
        all_note_ids = set()
        for cat_info in categories.values():
            all_note_ids.update(cat_info.get(_NOTE_IDS_KEY, []))
        total_notes = len(all_note_ids)

        avg_notes = total_notes / total_categories if total_categories > 0 else 0

        print(f"{index_type}:")
        print(f" 分类数量: {total_categories}")
        print(f" 帖子总数: {total_notes}")
        print(f" 平均每分类帖子数: {avg_notes:.1f}\n")


def main():
    """Command-line entry point.

    Returns:
        Process exit status: 0 on success, 1 when a required input is missing.
    """
    parser = argparse.ArgumentParser(
        description="构建人设分类的反向索引(灵感+目的+关键点)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
  # 基本使用
  python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110

  # 只构建索引,不获取帖子详情
  python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110 --no-details

  # 自定义输出文件
  python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110 --output custom_index.json
        """
    )

    parser.add_argument(
        "--persona-dir",
        required=True,
        help="人设目录路径(包含人设.json和what解构结果/的目录)"
    )
    parser.add_argument(
        "--output",
        default=None,
        help="输出文件路径(默认: {persona_dir}/分类索引_完整.json)"
    )
    parser.add_argument(
        "--no-details",
        action="store_true",
        help="不获取帖子详情(只构建索引结构)"
    )

    args = parser.parse_args()

    persona_dir = args.persona_dir
    fetch_details = not args.no_details

    # Validate the required inputs before doing any work.
    persona_file = os.path.join(persona_dir, "人设.json")
    what_dir = os.path.join(persona_dir, "what解构结果")

    if not os.path.isfile(persona_file):
        print(f"❌ 错误: 找不到人设文件: {persona_file}")
        return 1

    if not os.path.isdir(what_dir):
        print(f"❌ 错误: 找不到what解构目录: {what_dir}")
        return 1

    print(f"{'=' * 80}")
    print("构建人设分类反向索引(灵感+目的+关键点)")
    print(f"{'=' * 80}")
    print(f"人设文件: {persona_file}")
    print(f"解构目录: {what_dir}")
    print(f"获取详情: {'是' if fetch_details else '否'}\n")

    with open(persona_file, "r", encoding="utf-8") as f:
        persona_data = json.load(f)

    # Build the three indexes.
    print(f"{'─' * 80}")
    print("1. 构建灵感分类索引...")
    print(f"{'─' * 80}\n")
    inspiration_index = build_inspiration_index(persona_data)
    print(f"✓ 灵感分类: {len(inspiration_index)} 个分类\n")

    print(f"{'─' * 80}")
    print("2. 构建目的分类索引...")
    print(f"{'─' * 80}\n")
    purpose_index = build_purpose_index(what_dir)
    print(f"✓ 目的分类: {len(purpose_index)} 个分类\n")

    print(f"{'─' * 80}")
    print("3. 构建关键点分类索引...")
    print(f"{'─' * 80}\n")
    keypoint_index = build_keypoint_index(what_dir)
    print(f"✓ 关键点分类: {len(keypoint_index)} 个分类\n")

    full_index = {
        "灵感分类": inspiration_index,
        "目的分类": purpose_index,
        "关键点分类": keypoint_index
    }

    if fetch_details:
        # One cache for all three indexes so a note that appears in several
        # of them is fetched only once.
        details_cache: Dict[str, Any] = {}
        for index_type in list(full_index):
            full_index[index_type] = fetch_note_details(
                full_index[index_type], cache=details_cache
            )

    if args.output:
        output_file = args.output
    else:
        output_file = os.path.join(persona_dir, "分类索引_完整.json")

    save_index(full_index, output_file)
    print_statistics(full_index)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())