#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract the deconstruction task list from the "当前帖子_what解构结果" directory.

Every ``*.json`` file in that directory is read, the post ID is taken from
the file name, the post detail is fetched, and the inspiration / purpose /
key points found under the "三点解构" key are flattened into one task entry.
All entries are written to "当前帖子_解构任务列表.json".
"""

import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional

# Add the project root to sys.path before importing from the `script` package.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.detail import get_xiaohongshu_detail  # noqa: E402

# Post ID = the non-empty run of characters before the first underscore.
_POST_ID_PATTERN = re.compile(r'^([^_]+)_')

# Dimensions of the inspiration-point section, in output order.
_INSPIRATION_DIMENSIONS = ("全新内容", "共性差异", "共性内容")


def extract_post_id_from_filename(filename: str) -> str:
    """Return the post ID encoded at the start of a file name.

    Args:
        filename: File name whose leading part, up to the first underscore,
            is the post ID.

    Returns:
        The post ID, or an empty string when the name does not start with
        at least one non-underscore character followed by an underscore.
    """
    match = _POST_ID_PATTERN.match(filename)
    return match.group(1) if match else ""


def get_post_detail(post_id: str) -> Optional[Dict]:
    """Fetch the detail of a post via ``get_xiaohongshu_detail``.

    Args:
        post_id: ID of the post to look up.

    Returns:
        Whatever ``get_xiaohongshu_detail`` returns, or ``None`` when the
        call raises. The failure is reported on stdout instead of being
        propagated, so one failing lookup does not abort a batch run.
    """
    try:
        return get_xiaohongshu_detail(post_id)
    except Exception as e:  # best-effort boundary: report and carry on
        print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
        return None


def extract_features_from_point(point_data: Dict) -> List[str]:
    """Collect the feature names attached to a single point.

    Args:
        point_data: Point data; feature entries are read from its
            "提取的特征" field.

    Returns:
        The "特征名称" value of every feature entry that is a dict and has
        that key, in original order. An empty list is returned when
        ``point_data`` is not a dict or the field is missing or not a list.
        Malformed feature entries are skipped rather than raising.
    """
    if not isinstance(point_data, dict):
        return []
    raw_features = point_data.get("提取的特征")
    if not isinstance(raw_features, list):
        return []
    return [
        feature["特征名称"]
        for feature in raw_features
        if isinstance(feature, dict) and "特征名称" in feature
    ]


def _build_point_items(raw_items, name_key: str) -> List[Dict]:
    """Convert a raw list of points into the normalized output entries.

    Args:
        raw_items: Value expected to be a list of point dicts; anything else
            yields an empty result.
        name_key: Key under which each raw point stores its name.

    Returns:
        One dict per valid point with the keys "名称", "描述" and "特征列表".
        Entries that are not dicts are skipped so that a single malformed
        item does not cause the whole file to be discarded.
    """
    if not isinstance(raw_items, list):
        return []
    return [
        {
            "名称": item.get(name_key, ""),
            "描述": item.get("描述", ""),
            "特征列表": extract_features_from_point(item),
        }
        for item in raw_items
        if isinstance(item, dict)
    ]


def process_inspiration_points(inspiration_data: Dict) -> List[Dict]:
    """Flatten the inspiration-point section into a single list.

    The three dimensions "全新内容", "共性差异" and "共性内容" are visited in
    that order and their points are concatenated.

    Args:
        inspiration_data: Inspiration-point data keyed by dimension.

    Returns:
        List of normalized inspiration points (name read from "灵感点").
        Empty when ``inspiration_data`` is not a dict.
    """
    if not isinstance(inspiration_data, dict):
        return []
    result: List[Dict] = []
    for dimension in _INSPIRATION_DIMENSIONS:
        result.extend(
            _build_point_items(inspiration_data.get(dimension), "灵感点")
        )
    return result


def process_purpose_points(purpose_data: Dict) -> List[Dict]:
    """Normalize the purpose-point section.

    Args:
        purpose_data: Purpose-point data; points are read from "purposes".

    Returns:
        List of normalized purpose points (name read from "目的点").
        Empty when ``purpose_data`` is not a dict.
    """
    if not isinstance(purpose_data, dict):
        return []
    return _build_point_items(purpose_data.get("purposes"), "目的点")


def process_key_points(key_data: Dict) -> List[Dict]:
    """Normalize the key-point section.

    Args:
        key_data: Key-point data; points are read from "key_points".

    Returns:
        List of normalized key points (name read from "关键点").
        Empty when ``key_data`` is not a dict.
    """
    if not isinstance(key_data, dict):
        return []
    return _build_point_items(key_data.get("key_points"), "关键点")


def process_single_file(file_path: Path) -> Optional[Dict]:
    """Build one deconstruction task from a single JSON result file.

    Args:
        file_path: Path of the JSON file; its name must start with
            ``<post_id>_``.

    Returns:
        A dict with the keys "帖子id", "帖子详情" (empty dict when the detail
        could not be fetched) and "what解构结果", or ``None`` when the post ID
        cannot be extracted or the file cannot be processed.
    """
    # The post ID comes from the file name, not from the file content.
    post_id = extract_post_id_from_filename(file_path.name)
    if not post_id:
        print(f" 警告: 无法从文件名提取帖子ID: {file_path.name}")
        return None

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        print(f" 获取帖子 {post_id} 的详情...")
        post_detail = get_post_detail(post_id)
        if not post_detail:
            print(f" 警告: 未能获取帖子 {post_id} 的详情")

        # A null / non-dict section is treated like a missing one.
        three_points = data.get("三点解构", {})
        if not isinstance(three_points, dict):
            three_points = {}

        return {
            "帖子id": post_id,
            "帖子详情": post_detail or {},
            "what解构结果": {
                "灵感点列表": process_inspiration_points(
                    three_points.get("灵感点")
                ),
                "目的点列表": process_purpose_points(
                    three_points.get("目的点")
                ),
                "关键点列表": process_key_points(three_points.get("关键点")),
            },
        }
    except Exception as e:  # per-file boundary: one bad file must not stop the batch
        print(f" 错误: 处理文件 {file_path.name} 时出错: {e}")
        return None


def main() -> None:
    """Process every JSON file in the input directory and save the task list.

    Input and output live under ``<project_root>/data/data_1117``. Progress
    and summary statistics are printed to stdout.
    """
    data_dir = project_root / "data" / "data_1117"
    input_dir = data_dir / "当前帖子_what解构结果"
    output_file = data_dir / "当前帖子_解构任务列表.json"

    print(f"正在扫描目录: {input_dir}")

    # Sort so that the output order does not depend on the file system.
    json_files = sorted(input_dir.glob("*.json"))
    total_files = len(json_files)
    print(f"找到 {total_files} 个JSON文件\n")

    task_list = []
    for index, file_path in enumerate(json_files, 1):
        print(f"处理文件 [{index}/{total_files}]: {file_path.name}")
        task_item = process_single_file(file_path)
        if task_item:
            task_list.append(task_item)
            print(" ✓ 成功提取")
        print()

    final_result = {
        "解构任务列表": task_list
    }

    # Summary statistics.
    what_results = [task["what解构结果"] for task in task_list]
    total_inspiration = sum(len(r["灵感点列表"]) for r in what_results)
    total_purpose = sum(len(r["目的点列表"]) for r in what_results)
    total_key = sum(len(r["关键点列表"]) for r in what_results)

    print("提取统计:")
    print(f" 总帖子数: {len(task_list)}")
    print(f" 总灵感点: {total_inspiration} 个")
    print(f" 总目的点: {total_purpose} 个")
    print(f" 总关键点: {total_key} 个")

    print(f"\n正在保存结果到: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_result, f, ensure_ascii=False, indent=4)

    print("完成!")


if __name__ == "__main__":
    main()