|
|
@@ -0,0 +1,242 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+从当前帖子_what解构结果目录中提取解构任务列表
|
|
|
+"""
|
|
|
+
|
|
|
+import json
|
|
|
+from pathlib import Path
|
|
|
+from typing import Dict, List, Optional
|
|
|
+import sys
|
|
|
+import re
|
|
|
+
|
|
|
+# 添加项目根目录到路径
|
|
|
+project_root = Path(__file__).parent.parent.parent
|
|
|
+sys.path.insert(0, str(project_root))
|
|
|
+
|
|
|
+from script.detail import get_xiaohongshu_detail
|
|
|
+
|
|
|
+
|
|
|
def extract_post_id_from_filename(filename: str) -> str:
    """Return the post ID that prefixes ``filename``.

    The ID is the non-empty run of characters that precedes the first
    underscore. An empty string is returned when the name contains no
    underscore or starts with one.
    """
    prefix, underscore, _rest = filename.partition("_")
    if underscore and prefix:
        return prefix
    return ""
|
|
|
+
|
|
|
+
|
|
|
def get_post_detail(post_id: str) -> Optional[Dict]:
    """Fetch the detail record for a post, tolerating lookup failures.

    Args:
        post_id: ID passed through to ``get_xiaohongshu_detail``.

    Returns:
        Whatever ``get_xiaohongshu_detail`` returns, or ``None`` when it
        raises; in that case a warning is printed.
    """
    try:
        return get_xiaohongshu_detail(post_id)
    except Exception as err:
        # Best-effort lookup: report the failure and let the caller go on.
        print(f" 警告: 获取帖子 {post_id} 详情失败: {err}")
        return None
|
|
|
+
|
|
|
+
|
|
|
def extract_features_from_point(point_data: Dict) -> List[str]:
    """Collect the feature names attached to a single point.

    Args:
        point_data: Point record; feature entries are read from its
            "提取的特征" list when that field is present.

    Returns:
        The "特征名称" value of every dict entry that has one, in the
        original order. An empty list is returned when ``point_data`` is
        not a dict, or when the field is missing or is not a list.
    """
    if not isinstance(point_data, dict):
        return []

    raw_features = point_data.get("提取的特征")
    if not isinstance(raw_features, list):
        return []

    # Entries come from parsed JSON and may be malformed (strings, null,
    # numbers). Skip anything that is not a dict instead of raising
    # TypeError, the same way entries without a name are already skipped.
    return [
        feature["特征名称"]
        for feature in raw_features
        if isinstance(feature, dict) and "特征名称" in feature
    ]
|
|
|
+
|
|
|
+
|
|
|
def process_inspiration_points(inspiration_data: Dict) -> List[Dict]:
    """Flatten the inspiration points of every dimension into one list.

    Args:
        inspiration_data: Mapping that may hold the lists "全新内容",
            "共性差异" and "共性内容".

    Returns:
        One dict per item, in dimension order, with the keys "名称",
        "描述" and "特征列表". Dimensions that are missing or are not
        lists contribute nothing.
    """
    collected = []

    for dimension in ("全新内容", "共性差异", "共性内容"):
        if dimension not in inspiration_data:
            continue
        entries = inspiration_data[dimension]
        if not isinstance(entries, list):
            continue
        collected.extend(
            {
                "名称": entry.get("灵感点", ""),
                "描述": entry.get("描述", ""),
                "特征列表": extract_features_from_point(entry),
            }
            for entry in entries
        )

    return collected
|
|
|
+
|
|
|
+
|
|
|
def process_purpose_points(purpose_data: Dict) -> List[Dict]:
    """Convert the "purposes" list into normalized point dicts.

    Args:
        purpose_data: Mapping that may hold a "purposes" list.

    Returns:
        One dict per item with the keys "名称", "描述" and "特征列表";
        an empty list when "purposes" is missing or is not a list.
    """
    if "purposes" not in purpose_data:
        return []

    entries = purpose_data["purposes"]
    if not isinstance(entries, list):
        return []

    return [
        {
            "名称": entry.get("目的点", ""),
            "描述": entry.get("描述", ""),
            "特征列表": extract_features_from_point(entry),
        }
        for entry in entries
    ]
|
|
|
+
|
|
|
+
|
|
|
def process_key_points(key_data: Dict) -> List[Dict]:
    """Convert the "key_points" list into normalized point dicts.

    Args:
        key_data: Mapping that may hold a "key_points" list.

    Returns:
        One dict per item with the keys "名称", "描述" and "特征列表";
        an empty list when "key_points" is missing or is not a list.
    """
    if "key_points" not in key_data:
        return []

    entries = key_data["key_points"]
    if not isinstance(entries, list):
        return []

    return [
        {
            "名称": entry.get("关键点", ""),
            "描述": entry.get("描述", ""),
            "特征列表": extract_features_from_point(entry),
        }
        for entry in entries
    ]
|
|
|
+
|
|
|
+
|
|
|
def process_single_file(file_path: Path) -> Optional[Dict]:
    """Turn one result JSON file into a deconstruction task entry.

    Args:
        file_path: Path of the JSON file; the post ID is taken from the
            part of its name before the first underscore.

    Returns:
        A dict with the keys "帖子id", "帖子详情" and "what解构结果", or
        ``None`` when no post ID can be extracted or processing raises.
    """
    post_id = extract_post_id_from_filename(file_path.name)
    if not post_id:
        print(f" 警告: 无法从文件名提取帖子ID: {file_path.name}")
        return None

    try:
        with file_path.open("r", encoding="utf-8") as handle:
            data = json.load(handle)

        # The detail lookup is best-effort: a failure is reported but the
        # task is still emitted with an empty detail dict.
        print(f" 获取帖子 {post_id} 的详情...")
        post_detail = get_post_detail(post_id)
        if not post_detail:
            print(f" 警告: 未能获取帖子 {post_id} 的详情")

        three_points = data.get("三点解构", {})

        # Each section is optional; a missing one yields an empty list.
        inspiration_points = (
            process_inspiration_points(three_points["灵感点"])
            if "灵感点" in three_points
            else []
        )
        purpose_points = (
            process_purpose_points(three_points["目的点"])
            if "目的点" in three_points
            else []
        )
        key_points = (
            process_key_points(three_points["关键点"])
            if "关键点" in three_points
            else []
        )

        return {
            "帖子id": post_id,
            "帖子详情": post_detail or {},
            "what解构结果": {
                "灵感点列表": inspiration_points,
                "目的点列表": purpose_points,
                "关键点列表": key_points,
            },
        }

    except Exception as err:
        print(f" 错误: 处理文件 {file_path.name} 时出错: {err}")
        return None
|
|
|
+
|
|
|
+
|
|
|
def main():
    """Build the deconstruction task list from the per-post result files.

    Scans ``data/data_1117/当前帖子_what解构结果`` (resolved two directories
    above this script's directory) for ``*.json`` files, converts each one
    with ``process_single_file``, prints summary counts and writes the
    combined result to ``data/data_1117/当前帖子_解构任务列表.json``.
    Files that fail to process are left out of the output.
    """
    # Input/output paths (default: data/data_1117 under the project root).
    script_dir = Path(__file__).parent
    project_root = script_dir.parent.parent
    data_dir = project_root / "data" / "data_1117"

    input_dir = data_dir / "当前帖子_what解构结果"
    output_file = data_dir / "当前帖子_解构任务列表.json"

    print(f"正在扫描目录: {input_dir}")

    # glob() does not guarantee any ordering; sort so that both the progress
    # log and the order of entries in the output file are reproducible.
    json_files = sorted(input_dir.glob("*.json"))
    print(f"找到 {len(json_files)} 个JSON文件\n")

    # Process every file, keeping only the ones that produced a task.
    task_list = []
    for i, file_path in enumerate(json_files, 1):
        print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
        task_item = process_single_file(file_path)
        if task_item:
            task_list.append(task_item)
            print(" ✓ 成功提取")
        print()

    final_result = {
        "解构任务列表": task_list
    }

    # Summary statistics across all extracted tasks.
    print("提取统计:")
    print(f" 总帖子数: {len(task_list)}")
    total_inspiration = sum(len(task["what解构结果"]["灵感点列表"]) for task in task_list)
    total_purpose = sum(len(task["what解构结果"]["目的点列表"]) for task in task_list)
    total_key = sum(len(task["what解构结果"]["关键点列表"]) for task in task_list)
    print(f" 总灵感点: {total_inspiration} 个")
    print(f" 总目的点: {total_purpose} 个")
    print(f" 总关键点: {total_key} 个")

    # Persist the result; ensure_ascii=False keeps the Chinese text readable.
    print(f"\n正在保存结果到: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_result, f, ensure_ascii=False, indent=4)

    print("完成!")


if __name__ == "__main__":
    main()
|