extract_current_posts.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 从当前帖子_what解构结果目录中提取解构任务列表
  5. """
  6. import json
  7. from pathlib import Path
  8. from typing import Dict, List, Optional
  9. import sys
  10. import re
  11. # 添加项目根目录到路径
  12. project_root = Path(__file__).parent.parent.parent
  13. sys.path.insert(0, str(project_root))
  14. from script.detail import get_xiaohongshu_detail
  15. from script.data_processing.path_config import PathConfig
  16. def extract_post_id_from_filename(filename: str) -> str:
  17. """从文件名中提取帖子ID"""
  18. match = re.match(r'^([^_]+)_', filename)
  19. if match:
  20. return match.group(1)
  21. return ""
  22. def get_post_detail(post_id: str) -> Optional[Dict]:
  23. """获取帖子详情"""
  24. try:
  25. detail = get_xiaohongshu_detail(post_id)
  26. return detail
  27. except Exception as e:
  28. print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
  29. return None
  30. def extract_features_from_point(point_data: Dict) -> List[Dict]:
  31. """
  32. 从点数据中提取特征信息列表(包含名称和权重)
  33. Args:
  34. point_data: 点的数据(包含"提取的特征"字段)
  35. Returns:
  36. 特征信息列表,每项包含 {"特征名称": str, "权重": float}
  37. """
  38. features = []
  39. if "提取的特征" in point_data and isinstance(point_data["提取的特征"], list):
  40. for feature in point_data["提取的特征"]:
  41. if "特征名称" in feature:
  42. feature_item = {
  43. "特征名称": feature["特征名称"],
  44. "权重": feature.get("权重", 1.0) # 默认权重为1.0
  45. }
  46. features.append(feature_item)
  47. return features
  48. def process_inspiration_points(inspiration_data: Dict) -> List[Dict]:
  49. """
  50. 处理灵感点数据
  51. Args:
  52. inspiration_data: 灵感点数据
  53. Returns:
  54. 灵感点列表
  55. """
  56. result = []
  57. # 处理三个维度:全新内容、共性差异、共性内容
  58. for dimension in ["全新内容", "共性差异", "共性内容"]:
  59. if dimension in inspiration_data and isinstance(inspiration_data[dimension], list):
  60. for item in inspiration_data[dimension]:
  61. point_item = {
  62. "名称": item.get("灵感点", ""),
  63. "描述": item.get("描述", ""),
  64. "特征列表": extract_features_from_point(item)
  65. }
  66. result.append(point_item)
  67. return result
  68. def process_purpose_points(purpose_data: Dict) -> List[Dict]:
  69. """
  70. 处理目的点数据
  71. Args:
  72. purpose_data: 目的点数据
  73. Returns:
  74. 目的点列表
  75. """
  76. result = []
  77. if "purposes" in purpose_data and isinstance(purpose_data["purposes"], list):
  78. for item in purpose_data["purposes"]:
  79. point_item = {
  80. "名称": item.get("目的点", ""),
  81. "描述": item.get("描述", ""),
  82. "特征列表": extract_features_from_point(item)
  83. }
  84. result.append(point_item)
  85. return result
  86. def process_key_points(key_data: Dict) -> List[Dict]:
  87. """
  88. 处理关键点数据
  89. Args:
  90. key_data: 关键点数据
  91. Returns:
  92. 关键点列表
  93. """
  94. result = []
  95. if "key_points" in key_data and isinstance(key_data["key_points"], list):
  96. for item in key_data["key_points"]:
  97. point_item = {
  98. "名称": item.get("关键点", ""),
  99. "描述": item.get("描述", ""),
  100. "特征列表": extract_features_from_point(item)
  101. }
  102. result.append(point_item)
  103. return result
  104. def process_single_file(file_path: Path) -> Optional[Dict]:
  105. """
  106. 处理单个JSON文件
  107. Args:
  108. file_path: JSON文件路径
  109. Returns:
  110. 解构任务字典,如果处理失败则返回None
  111. """
  112. # 从文件名提取帖子ID
  113. post_id = extract_post_id_from_filename(file_path.name)
  114. if not post_id:
  115. print(f" 警告: 无法从文件名提取帖子ID: {file_path.name}")
  116. return None
  117. try:
  118. # 读取文件
  119. with open(file_path, "r", encoding="utf-8") as f:
  120. data = json.load(f)
  121. # 获取帖子详情
  122. print(f" 获取帖子 {post_id} 的详情...")
  123. post_detail = get_post_detail(post_id)
  124. if not post_detail:
  125. print(f" 警告: 未能获取帖子 {post_id} 的详情")
  126. # 提取三点解构数据
  127. three_points = data.get("三点解构", {})
  128. # 处理灵感点
  129. inspiration_points = []
  130. if "灵感点" in three_points:
  131. inspiration_points = process_inspiration_points(three_points["灵感点"])
  132. # 处理目的点
  133. purpose_points = []
  134. if "目的点" in three_points:
  135. purpose_points = process_purpose_points(three_points["目的点"])
  136. # 处理关键点
  137. key_points = []
  138. if "关键点" in three_points:
  139. key_points = process_key_points(three_points["关键点"])
  140. # 构建结果
  141. task_item = {
  142. "帖子id": post_id,
  143. "帖子详情": post_detail if post_detail else {},
  144. "what解构结果": {
  145. "灵感点列表": inspiration_points,
  146. "目的点列表": purpose_points,
  147. "关键点列表": key_points
  148. }
  149. }
  150. return task_item
  151. except Exception as e:
  152. print(f" 错误: 处理文件 {file_path.name} 时出错: {e}")
  153. return None
  154. def main():
  155. # 使用路径配置
  156. config = PathConfig()
  157. # 确保输出目录存在
  158. config.ensure_dirs()
  159. # 获取路径
  160. input_dir = config.current_posts_dir
  161. output_file = config.task_list_file
  162. print(f"账号: {config.account_name}")
  163. print(f"当前帖子目录: {input_dir}")
  164. print(f"输出文件: {output_file}")
  165. print()
  166. print(f"正在扫描目录: {input_dir}")
  167. # 获取所有JSON文件
  168. json_files = list(input_dir.glob("*.json"))
  169. print(f"找到 {len(json_files)} 个JSON文件\n")
  170. # 处理所有文件
  171. task_list = []
  172. for i, file_path in enumerate(json_files, 1):
  173. print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
  174. task_item = process_single_file(file_path)
  175. if task_item:
  176. task_list.append(task_item)
  177. print(f" ✓ 成功提取")
  178. print()
  179. # 构建最终结果
  180. final_result = {
  181. "解构任务列表": task_list
  182. }
  183. # 统计信息
  184. print(f"提取统计:")
  185. print(f" 总帖子数: {len(task_list)}")
  186. total_inspiration = sum(len(task["what解构结果"]["灵感点列表"]) for task in task_list)
  187. total_purpose = sum(len(task["what解构结果"]["目的点列表"]) for task in task_list)
  188. total_key = sum(len(task["what解构结果"]["关键点列表"]) for task in task_list)
  189. print(f" 总灵感点: {total_inspiration} 个")
  190. print(f" 总目的点: {total_purpose} 个")
  191. print(f" 总关键点: {total_key} 个")
  192. # 保存结果
  193. print(f"\n正在保存结果到: {output_file}")
  194. with open(output_file, "w", encoding="utf-8") as f:
  195. json.dump(final_result, f, ensure_ascii=False, indent=4)
  196. print("完成!")
  197. if __name__ == "__main__":
  198. main()