"""Build a point-to-note mapping index.

Reads every "what" deconstruction result in a directory, extracts each note's
inspiration points (灵感点), purpose points (目的点) and key points (关键点),
and builds a two-way mapping between those points and the notes.

Usage:
    python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果
"""

import os
import json
import argparse
from typing import Dict, List, Any
from glob import glob

try:
    from script.detail import get_xiaohongshu_detail
except ImportError as import_error:
    # The fetcher is only needed when note details are requested. Keeping the
    # module importable lets --no-details run without that dependency; the
    # original error is re-raised (chained) as soon as details are asked for.
    get_xiaohongshu_detail = None
    _DETAIL_IMPORT_ERROR = import_error
else:
    _DETAIL_IMPORT_ERROR = None


# Key under which every index entry stores the ids of the notes it occurs in.
_NOTE_IDS_KEY = "帖子ID列表"
# File name used when --output is not given.
_DEFAULT_OUTPUT_NAME = "点到帖子映射.json"


def _as_dict(value: Any) -> Dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _dict_items(value: Any) -> List[Dict[str, Any]]:
    """Return the dict elements of *value* when it is a list, otherwise []."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, dict)]


def extract_points_from_what_file(what_file: str) -> Dict[str, Any]:
    """Extract all points from a single "what" deconstruction file.

    The note id is the part of the file name before ``_with_history_``.
    Sections that are missing, ``null`` or of an unexpected type are treated
    as empty instead of raising, and entries whose point name is empty are
    skipped.

    Args:
        what_file: Path of the deconstruction JSON file.

    Returns:
        A dict with the keys ``note_id``, ``灵感点列表``, ``目的点列表`` and
        ``关键点列表``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    filename = os.path.basename(what_file)
    note_id = filename.split("_with_history_")[0]

    with open(what_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    three_points = _as_dict(_as_dict(data).get("三点解构"))

    # Inspiration points are spread over three source fields.
    inspiration_points = []
    inspiration_data = _as_dict(three_points.get("灵感点"))
    for field in ("全新内容", "共性差异", "共性内容"):
        for item in _dict_items(inspiration_data.get(field)):
            point = item.get("灵感点", "")
            if point:
                inspiration_points.append({
                    "灵感点": point,
                    "来源字段": field,
                    "维度": item.get("维度", ""),
                    "描述": item.get("描述", ""),
                })

    # Purpose points: one optional main purpose followed by secondary ones.
    purpose_points = []
    purpose_data = _as_dict(three_points.get("目的点"))
    tagged_purposes = [("主目的", _as_dict(purpose_data.get("main_purpose")))]
    tagged_purposes.extend(
        ("次要目的", item)
        for item in _dict_items(purpose_data.get("secondary_purposes"))
    )
    for purpose_type, item in tagged_purposes:
        point = item.get("目的点", "")
        if point:
            purpose_points.append({
                "目的点": point,
                "类型": purpose_type,
                "维度": item.get("维度", ""),
                "描述": item.get("描述", ""),
            })

    # Key points.
    key_points = []
    keypoint_data = _as_dict(three_points.get("关键点"))
    for item in _dict_items(keypoint_data.get("key_points")):
        point = item.get("关键点", "")
        if point:
            key_points.append({
                "关键点": point,
                "维度大类": item.get("维度大类", ""),
                "维度细分": item.get("维度细分", ""),
                "描述": item.get("描述", ""),
            })

    return {
        "note_id": note_id,
        "灵感点列表": inspiration_points,
        "目的点列表": purpose_points,
        "关键点列表": key_points,
    }


def _register_point(index: Dict[str, Any], point: Dict[str, Any],
                    name_key: str, meta_keys: List[str], note_id: str) -> None:
    """Record in *index* that *note_id* contains *point*.

    The first occurrence of a point name creates its entry (name, metadata
    copied from that occurrence, empty note-id list); every occurrence then
    adds the note id once.
    """
    point_name = point[name_key]
    entry = index.get(point_name)
    if entry is None:
        entry = {name_key: point_name}
        for key in meta_keys:
            entry[key] = point[key]
        entry[_NOTE_IDS_KEY] = []
        index[point_name] = entry
    if note_id not in entry[_NOTE_IDS_KEY]:
        entry[_NOTE_IDS_KEY].append(note_id)


def _fetch_note_details(note_ids: List[str]) -> Dict[str, Any]:
    """Fetch the detail of every note, recording failures instead of raising."""
    details = {}
    total = len(note_ids)
    for i, note_id in enumerate(note_ids, 1):
        print(f"[{i}/{total}] 获取详情: {note_id}")
        try:
            details[note_id] = get_xiaohongshu_detail(note_id)
        except Exception as e:
            # Best effort: one failing note must not abort the whole run, so
            # the error is stored in place of the detail.
            print(f" ⚠️ 获取失败: {e}")
            details[note_id] = {
                "channel_content_id": note_id,
                "error": str(e),
            }
    return details


def _attach_details(index: Dict[str, Any], details: Dict[str, Any]) -> None:
    """Add a ``帖子详情列表`` to every entry of *index*, parallel to its note ids."""
    for point_info in index.values():
        point_info["帖子详情列表"] = [
            details.get(note_id, {"channel_content_id": note_id})
            for note_id in point_info.get(_NOTE_IDS_KEY, [])
        ]


def build_point_to_note_index(what_dir: str, fetch_details: bool = True) -> Dict[str, Any]:
    """Build the point-to-note mapping index.

    Files matching ``*_with_history_*.json`` in *what_dir* are processed in
    sorted order so that the result is reproducible: the metadata stored for
    a point comes from its first occurrence, and when two files share a note
    id the later file's point lists replace the earlier one's.

    Args:
        what_dir: Directory containing the deconstruction result files.
        fetch_details: Whether to fetch and attach the detail of every note.

    Returns:
        A dict with ``点到帖子映射`` (one index per point type) and
        ``帖子到点映射`` (point lists per note id).

    Raises:
        ImportError: If *fetch_details* is true but the detail fetcher could
            not be imported.
    """
    if fetch_details and get_xiaohongshu_detail is None:
        # Fail before doing any work rather than once per note.
        raise ImportError(
            "无法获取帖子详情: 未能导入 script.detail (可使用 --no-details 跳过)"
        ) from _DETAIL_IMPORT_ERROR

    # glob() returns files in arbitrary order; sort for deterministic output.
    what_files = sorted(glob(os.path.join(what_dir, "*_with_history_*.json")))

    print(f"{'=' * 80}")
    print("开始构建点到帖子的映射索引")
    print(f"{'=' * 80}")
    print(f"解构文件数量: {len(what_files)}\n")

    # Each index maps a point name to its entry (metadata + note ids).
    inspiration_index = {}
    purpose_index = {}
    keypoint_index = {}
    # Reverse mapping: note id -> the three point lists of that note.
    note_to_points = {}

    for what_file in what_files:
        points_data = extract_points_from_what_file(what_file)
        note_id = points_data["note_id"]

        note_to_points[note_id] = {
            "灵感点列表": points_data["灵感点列表"],
            "目的点列表": points_data["目的点列表"],
            "关键点列表": points_data["关键点列表"],
        }

        for point in points_data["灵感点列表"]:
            _register_point(inspiration_index, point, "灵感点",
                            ["维度", "描述"], note_id)
        for point in points_data["目的点列表"]:
            _register_point(purpose_index, point, "目的点",
                            ["类型", "维度", "描述"], note_id)
        for point in points_data["关键点列表"]:
            _register_point(keypoint_index, point, "关键点",
                            ["维度大类", "维度细分", "描述"], note_id)

    print(f"✓ 灵感点: {len(inspiration_index)} 个")
    print(f"✓ 目的点: {len(purpose_index)} 个")
    print(f"✓ 关键点: {len(keypoint_index)} 个")
    print(f"✓ 帖子: {len(note_to_points)} 个\n")

    if fetch_details:
        print(f"{'=' * 80}")
        print("开始获取帖子详情...")
        print(f"{'=' * 80}\n")

        # Every note is fetched once and the result reused by all indexes.
        details = _fetch_note_details(list(note_to_points))
        print("\n✓ 帖子详情获取完成\n")

        for index in (inspiration_index, purpose_index, keypoint_index):
            _attach_details(index, details)
        for note_id, note_points in note_to_points.items():
            note_points["帖子详情"] = details.get(
                note_id, {"channel_content_id": note_id}
            )

    return {
        "点到帖子映射": {
            "灵感点": inspiration_index,
            "目的点": purpose_index,
            "关键点": keypoint_index,
        },
        "帖子到点映射": note_to_points,
    }


def save_index(index_data: Dict[str, Any], output_file: str):
    """Write the index to *output_file* as indented UTF-8 JSON.

    The parent directory is created when it does not exist yet.

    Args:
        index_data: Index data to serialise.
        output_file: Destination file path.
    """
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(index_data, f, ensure_ascii=False, indent=2)

    print(f"✓ 索引已保存: {output_file}")


def print_statistics(index_data: Dict[str, Any]):
    """Print summary statistics of the index to stdout.

    Args:
        index_data: Complete index as returned by
            ``build_point_to_note_index``.
    """
    print(f"\n{'=' * 80}")
    print("索引统计信息")
    print(f"{'=' * 80}\n")

    point_to_note = index_data.get("点到帖子映射", {})
    note_to_point = index_data.get("帖子到点映射", {})

    print("点到帖子映射:")
    for point_type, points in point_to_note.items():
        total_points = len(points)
        all_note_ids = set()
        total_links = 0
        for point_info in points.values():
            note_ids = point_info.get(_NOTE_IDS_KEY, [])
            all_note_ids.update(note_ids)
            total_links += len(note_ids)
        total_notes = len(all_note_ids)
        # The per-point average is links / points. Dividing the number of
        # distinct notes by the number of points would understate it whenever
        # several points share a note.
        avg_notes = total_links / total_points if total_points > 0 else 0

        print(f" {point_type}:")
        print(f" 点的数量: {total_points}")
        print(f" 关联帖子数: {total_notes}")
        print(f" 平均每个点关联帖子数: {avg_notes:.1f}")

    print("\n帖子到点映射:")
    print(f" 帖子数量: {len(note_to_point)}")

    # Average number of points of each type per note.
    note_count = len(note_to_point)
    if note_count > 0:
        total_insp = sum(len(v.get("灵感点列表", [])) for v in note_to_point.values())
        total_purp = sum(len(v.get("目的点列表", [])) for v in note_to_point.values())
        total_kp = sum(len(v.get("关键点列表", [])) for v in note_to_point.values())
        print(f" 平均每个帖子的灵感点数: {total_insp / note_count:.1f}")
        print(f" 平均每个帖子的目的点数: {total_purp / note_count:.1f}")
        print(f" 平均每个帖子的关键点数: {total_kp / note_count:.1f}")


def _default_output_path(what_dir: str) -> str:
    """Return the default output path: the index file next to *what_dir*.

    The directory is made absolute first so that trailing separators and
    relative forms such as ``.`` resolve to the real parent directory.
    """
    parent_dir = os.path.dirname(os.path.abspath(what_dir))
    return os.path.join(parent_dir, _DEFAULT_OUTPUT_NAME)


def main() -> int:
    """Command-line entry point.

    Returns:
        Process exit status: 0 on success, 1 when the input directory does
        not exist.
    """
    parser = argparse.ArgumentParser(
        description="构建点到帖子的映射索引",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
  # 基本使用
  python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果

  # 只构建索引,不获取帖子详情
  python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果 --no-details

  # 自定义输出文件
  python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果 --output custom.json
        """
    )
    parser.add_argument(
        "--what-dir",
        required=True,
        help="what解构结果目录路径"
    )
    parser.add_argument(
        "--output",
        default=None,
        help="输出文件路径(默认: {what_dir}/../点到帖子映射.json)"
    )
    parser.add_argument(
        "--no-details",
        action="store_true",
        help="不获取帖子详情(只构建索引结构)"
    )

    args = parser.parse_args()
    what_dir = args.what_dir

    # A regular file would pass an exists() check and silently yield an
    # empty index, so require a directory.
    if not os.path.isdir(what_dir):
        print(f"❌ 错误: 找不到what解构目录: {what_dir}")
        return 1

    index_data = build_point_to_note_index(
        what_dir, fetch_details=not args.no_details
    )

    output_file = args.output or _default_output_path(what_dir)
    save_index(index_data, output_file)
    print_statistics(index_data)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())