extract_current_posts.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 从当前帖子_what解构结果目录中提取解构任务列表
  5. """
  6. import json
  7. from pathlib import Path
  8. from typing import Dict, List, Optional
  9. import sys
  10. import re
  11. # 添加项目根目录到路径
  12. project_root = Path(__file__).parent.parent.parent
  13. sys.path.insert(0, str(project_root))
  14. from script.detail import get_xiaohongshu_detail
  15. def extract_post_id_from_filename(filename: str) -> str:
  16. """从文件名中提取帖子ID"""
  17. match = re.match(r'^([^_]+)_', filename)
  18. if match:
  19. return match.group(1)
  20. return ""
  21. def get_post_detail(post_id: str) -> Optional[Dict]:
  22. """获取帖子详情"""
  23. try:
  24. detail = get_xiaohongshu_detail(post_id)
  25. return detail
  26. except Exception as e:
  27. print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
  28. return None
  29. def extract_features_from_point(point_data: Dict) -> List[Dict]:
  30. """
  31. 从点数据中提取特征信息列表(包含名称和权重)
  32. Args:
  33. point_data: 点的数据(包含"提取的特征"字段)
  34. Returns:
  35. 特征信息列表,每项包含 {"特征名称": str, "权重": float}
  36. """
  37. features = []
  38. if "提取的特征" in point_data and isinstance(point_data["提取的特征"], list):
  39. for feature in point_data["提取的特征"]:
  40. if "特征名称" in feature:
  41. feature_item = {
  42. "特征名称": feature["特征名称"],
  43. "权重": feature.get("权重", 1.0) # 默认权重为1.0
  44. }
  45. features.append(feature_item)
  46. return features
  47. def process_inspiration_points(inspiration_data: Dict) -> List[Dict]:
  48. """
  49. 处理灵感点数据
  50. Args:
  51. inspiration_data: 灵感点数据
  52. Returns:
  53. 灵感点列表
  54. """
  55. result = []
  56. # 处理三个维度:全新内容、共性差异、共性内容
  57. for dimension in ["全新内容", "共性差异", "共性内容"]:
  58. if dimension in inspiration_data and isinstance(inspiration_data[dimension], list):
  59. for item in inspiration_data[dimension]:
  60. point_item = {
  61. "名称": item.get("灵感点", ""),
  62. "描述": item.get("描述", ""),
  63. "特征列表": extract_features_from_point(item)
  64. }
  65. result.append(point_item)
  66. return result
  67. def process_purpose_points(purpose_data: Dict) -> List[Dict]:
  68. """
  69. 处理目的点数据
  70. Args:
  71. purpose_data: 目的点数据
  72. Returns:
  73. 目的点列表
  74. """
  75. result = []
  76. if "purposes" in purpose_data and isinstance(purpose_data["purposes"], list):
  77. for item in purpose_data["purposes"]:
  78. point_item = {
  79. "名称": item.get("目的点", ""),
  80. "描述": item.get("描述", ""),
  81. "特征列表": extract_features_from_point(item)
  82. }
  83. result.append(point_item)
  84. return result
  85. def process_key_points(key_data: Dict) -> List[Dict]:
  86. """
  87. 处理关键点数据
  88. Args:
  89. key_data: 关键点数据
  90. Returns:
  91. 关键点列表
  92. """
  93. result = []
  94. if "key_points" in key_data and isinstance(key_data["key_points"], list):
  95. for item in key_data["key_points"]:
  96. point_item = {
  97. "名称": item.get("关键点", ""),
  98. "描述": item.get("描述", ""),
  99. "特征列表": extract_features_from_point(item)
  100. }
  101. result.append(point_item)
  102. return result
  103. def process_single_file(file_path: Path) -> Optional[Dict]:
  104. """
  105. 处理单个JSON文件
  106. Args:
  107. file_path: JSON文件路径
  108. Returns:
  109. 解构任务字典,如果处理失败则返回None
  110. """
  111. # 从文件名提取帖子ID
  112. post_id = extract_post_id_from_filename(file_path.name)
  113. if not post_id:
  114. print(f" 警告: 无法从文件名提取帖子ID: {file_path.name}")
  115. return None
  116. try:
  117. # 读取文件
  118. with open(file_path, "r", encoding="utf-8") as f:
  119. data = json.load(f)
  120. # 获取帖子详情
  121. print(f" 获取帖子 {post_id} 的详情...")
  122. post_detail = get_post_detail(post_id)
  123. if not post_detail:
  124. print(f" 警告: 未能获取帖子 {post_id} 的详情")
  125. # 提取三点解构数据
  126. three_points = data.get("三点解构", {})
  127. # 处理灵感点
  128. inspiration_points = []
  129. if "灵感点" in three_points:
  130. inspiration_points = process_inspiration_points(three_points["灵感点"])
  131. # 处理目的点
  132. purpose_points = []
  133. if "目的点" in three_points:
  134. purpose_points = process_purpose_points(three_points["目的点"])
  135. # 处理关键点
  136. key_points = []
  137. if "关键点" in three_points:
  138. key_points = process_key_points(three_points["关键点"])
  139. # 构建结果
  140. task_item = {
  141. "帖子id": post_id,
  142. "帖子详情": post_detail if post_detail else {},
  143. "what解构结果": {
  144. "灵感点列表": inspiration_points,
  145. "目的点列表": purpose_points,
  146. "关键点列表": key_points
  147. }
  148. }
  149. return task_item
  150. except Exception as e:
  151. print(f" 错误: 处理文件 {file_path.name} 时出错: {e}")
  152. return None
  153. def main():
  154. # 输入输出路径(默认使用项目根目录下的 data/data_1117 目录)
  155. script_dir = Path(__file__).parent
  156. project_root = script_dir.parent.parent
  157. data_dir = project_root / "data" / "data_1118"
  158. input_dir = data_dir / "当前帖子_what解构结果"
  159. output_file = data_dir / "当前帖子_解构任务列表.json"
  160. print(f"正在扫描目录: {input_dir}")
  161. # 获取所有JSON文件
  162. json_files = list(input_dir.glob("*.json"))
  163. print(f"找到 {len(json_files)} 个JSON文件\n")
  164. # 处理所有文件
  165. task_list = []
  166. for i, file_path in enumerate(json_files, 1):
  167. print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
  168. task_item = process_single_file(file_path)
  169. if task_item:
  170. task_list.append(task_item)
  171. print(f" ✓ 成功提取")
  172. print()
  173. # 构建最终结果
  174. final_result = {
  175. "解构任务列表": task_list
  176. }
  177. # 统计信息
  178. print(f"提取统计:")
  179. print(f" 总帖子数: {len(task_list)}")
  180. total_inspiration = sum(len(task["what解构结果"]["灵感点列表"]) for task in task_list)
  181. total_purpose = sum(len(task["what解构结果"]["目的点列表"]) for task in task_list)
  182. total_key = sum(len(task["what解构结果"]["关键点列表"]) for task in task_list)
  183. print(f" 总灵感点: {total_inspiration} 个")
  184. print(f" 总目的点: {total_purpose} 个")
  185. print(f" 总关键点: {total_key} 个")
  186. # 保存结果
  187. print(f"\n正在保存结果到: {output_file}")
  188. with open(output_file, "w", encoding="utf-8") as f:
  189. json.dump(final_result, f, ensure_ascii=False, indent=4)
  190. print("完成!")
  191. if __name__ == "__main__":
  192. main()