extract_current_posts.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 从当前帖子_what解构结果目录中提取解构任务列表
  5. """
  6. import json
  7. from pathlib import Path
  8. from typing import Dict, List, Optional
  9. import sys
  10. import re
  11. # 添加项目根目录到路径
  12. project_root = Path(__file__).parent.parent.parent
  13. sys.path.insert(0, str(project_root))
  14. from script.detail import get_xiaohongshu_detail
  15. def extract_post_id_from_filename(filename: str) -> str:
  16. """从文件名中提取帖子ID"""
  17. match = re.match(r'^([^_]+)_', filename)
  18. if match:
  19. return match.group(1)
  20. return ""
  21. def get_post_detail(post_id: str) -> Optional[Dict]:
  22. """获取帖子详情"""
  23. try:
  24. detail = get_xiaohongshu_detail(post_id)
  25. return detail
  26. except Exception as e:
  27. print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
  28. return None
  29. def extract_features_from_point(point_data: Dict) -> List[str]:
  30. """
  31. 从点数据中提取特征名称列表
  32. Args:
  33. point_data: 点的数据(包含"提取的特征"字段)
  34. Returns:
  35. 特征名称列表
  36. """
  37. features = []
  38. if "提取的特征" in point_data and isinstance(point_data["提取的特征"], list):
  39. for feature in point_data["提取的特征"]:
  40. if "特征名称" in feature:
  41. features.append(feature["特征名称"])
  42. return features
  43. def process_inspiration_points(inspiration_data: Dict) -> List[Dict]:
  44. """
  45. 处理灵感点数据
  46. Args:
  47. inspiration_data: 灵感点数据
  48. Returns:
  49. 灵感点列表
  50. """
  51. result = []
  52. # 处理三个维度:全新内容、共性差异、共性内容
  53. for dimension in ["全新内容", "共性差异", "共性内容"]:
  54. if dimension in inspiration_data and isinstance(inspiration_data[dimension], list):
  55. for item in inspiration_data[dimension]:
  56. point_item = {
  57. "名称": item.get("灵感点", ""),
  58. "描述": item.get("描述", ""),
  59. "特征列表": extract_features_from_point(item)
  60. }
  61. result.append(point_item)
  62. return result
  63. def process_purpose_points(purpose_data: Dict) -> List[Dict]:
  64. """
  65. 处理目的点数据
  66. Args:
  67. purpose_data: 目的点数据
  68. Returns:
  69. 目的点列表
  70. """
  71. result = []
  72. if "purposes" in purpose_data and isinstance(purpose_data["purposes"], list):
  73. for item in purpose_data["purposes"]:
  74. point_item = {
  75. "名称": item.get("目的点", ""),
  76. "描述": item.get("描述", ""),
  77. "特征列表": extract_features_from_point(item)
  78. }
  79. result.append(point_item)
  80. return result
  81. def process_key_points(key_data: Dict) -> List[Dict]:
  82. """
  83. 处理关键点数据
  84. Args:
  85. key_data: 关键点数据
  86. Returns:
  87. 关键点列表
  88. """
  89. result = []
  90. if "key_points" in key_data and isinstance(key_data["key_points"], list):
  91. for item in key_data["key_points"]:
  92. point_item = {
  93. "名称": item.get("关键点", ""),
  94. "描述": item.get("描述", ""),
  95. "特征列表": extract_features_from_point(item)
  96. }
  97. result.append(point_item)
  98. return result
  99. def process_single_file(file_path: Path) -> Optional[Dict]:
  100. """
  101. 处理单个JSON文件
  102. Args:
  103. file_path: JSON文件路径
  104. Returns:
  105. 解构任务字典,如果处理失败则返回None
  106. """
  107. # 从文件名提取帖子ID
  108. post_id = extract_post_id_from_filename(file_path.name)
  109. if not post_id:
  110. print(f" 警告: 无法从文件名提取帖子ID: {file_path.name}")
  111. return None
  112. try:
  113. # 读取文件
  114. with open(file_path, "r", encoding="utf-8") as f:
  115. data = json.load(f)
  116. # 获取帖子详情
  117. print(f" 获取帖子 {post_id} 的详情...")
  118. post_detail = get_post_detail(post_id)
  119. if not post_detail:
  120. print(f" 警告: 未能获取帖子 {post_id} 的详情")
  121. # 提取三点解构数据
  122. three_points = data.get("三点解构", {})
  123. # 处理灵感点
  124. inspiration_points = []
  125. if "灵感点" in three_points:
  126. inspiration_points = process_inspiration_points(three_points["灵感点"])
  127. # 处理目的点
  128. purpose_points = []
  129. if "目的点" in three_points:
  130. purpose_points = process_purpose_points(three_points["目的点"])
  131. # 处理关键点
  132. key_points = []
  133. if "关键点" in three_points:
  134. key_points = process_key_points(three_points["关键点"])
  135. # 构建结果
  136. task_item = {
  137. "帖子id": post_id,
  138. "帖子详情": post_detail if post_detail else {},
  139. "what解构结果": {
  140. "灵感点列表": inspiration_points,
  141. "目的点列表": purpose_points,
  142. "关键点列表": key_points
  143. }
  144. }
  145. return task_item
  146. except Exception as e:
  147. print(f" 错误: 处理文件 {file_path.name} 时出错: {e}")
  148. return None
  149. def main():
  150. # 输入输出路径(默认使用项目根目录下的 data/data_1117 目录)
  151. script_dir = Path(__file__).parent
  152. project_root = script_dir.parent.parent
  153. data_dir = project_root / "data" / "data_1118"
  154. input_dir = data_dir / "当前帖子_what解构结果"
  155. output_file = data_dir / "当前帖子_解构任务列表.json"
  156. print(f"正在扫描目录: {input_dir}")
  157. # 获取所有JSON文件
  158. json_files = list(input_dir.glob("*.json"))
  159. print(f"找到 {len(json_files)} 个JSON文件\n")
  160. # 处理所有文件
  161. task_list = []
  162. for i, file_path in enumerate(json_files, 1):
  163. print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
  164. task_item = process_single_file(file_path)
  165. if task_item:
  166. task_list.append(task_item)
  167. print(f" ✓ 成功提取")
  168. print()
  169. # 构建最终结果
  170. final_result = {
  171. "解构任务列表": task_list
  172. }
  173. # 统计信息
  174. print(f"提取统计:")
  175. print(f" 总帖子数: {len(task_list)}")
  176. total_inspiration = sum(len(task["what解构结果"]["灵感点列表"]) for task in task_list)
  177. total_purpose = sum(len(task["what解构结果"]["目的点列表"]) for task in task_list)
  178. total_key = sum(len(task["what解构结果"]["关键点列表"]) for task in task_list)
  179. print(f" 总灵感点: {total_inspiration} 个")
  180. print(f" 总目的点: {total_purpose} 个")
  181. print(f" 总关键点: {total_key} 个")
  182. # 保存结果
  183. print(f"\n正在保存结果到: {output_file}")
  184. with open(output_file, "w", encoding="utf-8") as f:
  185. json.dump(final_result, f, ensure_ascii=False, indent=4)
  186. print("完成!")
  187. if __name__ == "__main__":
  188. main()