#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract the deconstruction task list from the current posts directory.

Supports the new data layout (inspiration_final_result,
purpose_final_result, keypoint_final).
"""

import json
from pathlib import Path
from typing import Dict, List, Optional
import sys

# Add the project root to the import path so the `script` package resolves.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.detail import get_xiaohongshu_detail
from script.data_processing.path_config import PathConfig


def extract_post_id_from_filename(filename: str) -> str:
    """Return the post ID encoded in a result file name.

    Expected format: ``68a6b96f000000001d006058.json``.

    Only a trailing ``.json`` extension is removed; any other occurrence of
    ``.json`` inside the name is left untouched. A name without the
    extension is returned unchanged.
    """
    suffix = ".json"
    if filename.endswith(suffix):
        return filename[: -len(suffix)]
    return filename


def get_post_detail(post_id: str) -> Optional[Dict]:
    """Fetch the detail record for a post.

    Returns whatever ``get_xiaohongshu_detail`` returns, or ``None`` when
    that call raises (a warning is printed and the error is not propagated,
    so one failing post does not abort the caller).
    """
    try:
        return get_xiaohongshu_detail(post_id)
    except Exception as e:
        print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
        return None


def _final_items(data: Dict, section_key: str, list_key: str) -> List[Dict]:
    """Return ``data[section_key][list_key]``, or ``[]`` if absent or null."""
    if section_key not in data:
        return []
    # A JSON ``null`` section or list is treated the same as a missing one.
    section = data[section_key] or {}
    return section.get(list_key) or []


def process_inspiration_points(data: Dict) -> List[Dict]:
    """Convert the inspiration-point section into normalized point records.

    Reads ``data["inspiration_final_result"]["最终灵感点列表"]`` and returns
    one dict per entry with the keys ``ID``, ``名称``, ``类型``, ``描述``,
    ``置信度``, ``支撑的ID`` and ``关联的ID``. Missing fields default to an
    empty string; the two ID lists are always empty for inspiration points.
    Returns ``[]`` when the section or list is missing or null.
    """
    return [
        {
            "ID": item.get("id", ""),
            "名称": item.get("灵感点", ""),
            "类型": item.get("类型", ""),
            "描述": item.get("描述", ""),
            "置信度": item.get("置信度", ""),
            "支撑的ID": [],
            "关联的ID": [],
        }
        for item in _final_items(data, "inspiration_final_result", "最终灵感点列表")
    ]


def process_purpose_points(data: Dict) -> List[Dict]:
    """Convert the purpose-point section (intents + substances) into records.

    Reads ``data["purpose_final_result"]``. Entries of ``最终意图列表`` come
    first and get the fixed type ``"意图"``; entries of ``最终实质列表``
    follow with the fixed type ``"实质"`` and carry their ``关联意图ID`` (when
    non-empty) in ``关联的ID``. Returns ``[]`` when the section is missing or
    null; a missing or null list contributes no entries.
    """
    result = []

    # Intent entries.
    for item in _final_items(data, "purpose_final_result", "最终意图列表"):
        result.append({
            "ID": item.get("意图ID", ""),
            "名称": item.get("目的点", ""),
            "类型": "意图",
            "描述": item.get("描述", ""),
            "置信度": item.get("置信度", ""),
            "支撑的ID": [],
            "关联的ID": [],
        })

    # Substance entries, each optionally linked to one intent.
    for item in _final_items(data, "purpose_final_result", "最终实质列表"):
        related_id = item.get("关联意图ID", "")
        result.append({
            "ID": item.get("实质ID", ""),
            "名称": item.get("目的点", ""),
            "类型": "实质",
            "描述": item.get("描述", ""),
            "置信度": item.get("置信度", ""),
            "支撑的ID": [],
            "关联的ID": [related_id] if related_id else [],
        })

    return result
def process_key_points(data: Dict) -> List[Dict]:
    """Convert the key-point section into normalized point records.

    Reads ``data["keypoint_final"]["最终关键点列表"]`` and returns one dict
    per entry with the keys ``ID``, ``名称``, ``类型``, ``描述``, ``置信度``,
    ``支撑的ID`` and ``关联的ID``. Missing text fields default to an empty
    string. Returns ``[]`` when the section or list is missing or null.
    """
    if "keypoint_final" not in data:
        return []

    # A JSON ``null`` section or list is treated the same as a missing one.
    keypoint_data = data["keypoint_final"] or {}
    items = keypoint_data.get("最终关键点列表") or []

    return [
        {
            "ID": item.get("关键点ID", ""),
            "名称": item.get("关键点", ""),
            "类型": item.get("类型", ""),
            "描述": item.get("描述", ""),
            "置信度": item.get("置信度", ""),
            # Normalize a null/empty value so consumers always get a list.
            "支撑的ID": item.get("支撑的ID") or [],
            "关联的ID": [],
        }
        for item in items
    ]


def process_single_file(file_path: Path) -> Optional[Dict]:
    """Build one task item from a single result JSON file.

    The post ID is taken from the file name, the JSON content is loaded, the
    post detail is fetched via ``get_post_detail`` and the three point lists
    are extracted. A missing post detail only produces a warning and is
    stored as ``{}``.

    Returns the task dict, or ``None`` when no post ID can be derived from
    the file name or when reading/processing the file raises.
    """
    post_id = extract_post_id_from_filename(file_path.name)
    if not post_id:
        print(f" 警告: 无法从文件名提取帖子ID: {file_path.name}")
        return None

    # Best-effort per file: any failure is reported and the file is skipped
    # so the rest of the batch can still be processed.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        print(f" 获取帖子 {post_id} 的详情...")
        post_detail = get_post_detail(post_id)
        if not post_detail:
            print(f" 警告: 未能获取帖子 {post_id} 的详情")

        return {
            "帖子id": post_id,
            "帖子详情": post_detail or {},
            "what解构结果": {
                "灵感点列表": process_inspiration_points(data),
                "目的点列表": process_purpose_points(data),
                "关键点列表": process_key_points(data),
            },
        }
    except Exception as e:
        print(f" 错误: 处理文件 {file_path.name} 时出错: {e}")
        return None


def main():
    """Scan the current posts directory and write the task list file.

    Paths come from ``PathConfig``. Every ``*.json`` file in the input
    directory is processed, per-category totals are printed, and the
    collected task items are dumped as UTF-8 JSON to the output file.
    """
    config = PathConfig()
    config.ensure_dirs()

    input_dir = config.current_posts_dir
    output_file = config.task_list_file

    print(f"账号: {config.account_name}")
    print(f"当前帖子目录: {input_dir}")
    print(f"输出文件: {output_file}")

    print(f"\n正在扫描目录: {input_dir}")
    # glob() yields entries in arbitrary order; sort so the generated task
    # list is reproducible between runs.
    json_files = sorted(input_dir.glob("*.json"))
    print(f"找到 {len(json_files)} 个JSON文件")

    task_list = []
    for i, file_path in enumerate(json_files, 1):
        print(f"\n处理文件 [{i}/{len(json_files)}]: {file_path.name}")
        task_item = process_single_file(file_path)
        if task_item:
            task_list.append(task_item)
            print(" ✓ 成功提取")

    # Summary statistics.
    total_inspiration = sum(len(t["what解构结果"]["灵感点列表"]) for t in task_list)
    total_purpose = sum(len(t["what解构结果"]["目的点列表"]) for t in task_list)
    total_key = sum(len(t["what解构结果"]["关键点列表"]) for t in task_list)

    print("\n提取统计:")
    print(f" 总帖子数: {len(task_list)}")
    print(f" 总灵感点: {total_inspiration} 个")
    print(f" 总目的点: {total_purpose} 个")
    print(f" 总关键点: {total_key} 个")

    print(f"\n正在保存结果到: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(task_list, f, ensure_ascii=False, indent=2)

    print("完成!")


if __name__ == "__main__":
    main()