#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract the deconstruction task list from the current-posts directory.

Supports the new data layout (inspiration_final_result, purpose_final_result,
keypoint_final).
"""
import json
from pathlib import Path
from typing import Dict, List, Optional
import sys

# Add the project root directory to the import path so that the ``script``
# package imports below resolve when this file is run directly.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.detail import get_xiaohongshu_detail
from script.data_processing.path_config import PathConfig


  16. def extract_post_id_from_filename(filename: str) -> str:
  17. """从文件名中提取帖子ID
  18. 格式: 68a6b96f000000001d006058.json
  19. """
  20. return filename.replace('.json', '')
  21. def get_post_detail(post_id: str) -> Optional[Dict]:
  22. """获取帖子详情"""
  23. try:
  24. detail = get_xiaohongshu_detail(post_id)
  25. return detail
  26. except Exception as e:
  27. print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
  28. return None
  29. def process_inspiration_points(data: Dict) -> List[Dict]:
  30. """处理灵感点数据"""
  31. result = []
  32. if "inspiration_final_result" not in data:
  33. return result
  34. inspiration_data = data["inspiration_final_result"]
  35. for item in inspiration_data.get("最终灵感点列表", []):
  36. point_item = {
  37. "ID": item.get("id", ""),
  38. "名称": item.get("灵感点", ""),
  39. "类型": item.get("类型", ""),
  40. "描述": item.get("描述", ""),
  41. "置信度": item.get("置信度", ""),
  42. "支撑的ID": [],
  43. "关联的ID": []
  44. }
  45. result.append(point_item)
  46. return result
  47. def process_purpose_points(data: Dict) -> List[Dict]:
  48. """处理目的点数据(意图+实质)"""
  49. result = []
  50. if "purpose_final_result" not in data:
  51. return result
  52. purpose_data = data["purpose_final_result"]
  53. # 处理意图列表
  54. for item in purpose_data.get("最终意图列表", []):
  55. point_item = {
  56. "ID": item.get("意图ID", ""),
  57. "名称": item.get("目的点", ""),
  58. "类型": "意图",
  59. "描述": item.get("描述", ""),
  60. "置信度": item.get("置信度", ""),
  61. "支撑的ID": [],
  62. "关联的ID": []
  63. }
  64. result.append(point_item)
  65. # 处理实质列表
  66. for item in purpose_data.get("最终实质列表", []):
  67. related_id = item.get("关联意图ID", "")
  68. point_item = {
  69. "ID": item.get("实质ID", ""),
  70. "名称": item.get("目的点", ""),
  71. "类型": "实质",
  72. "描述": item.get("描述", ""),
  73. "置信度": item.get("置信度", ""),
  74. "支撑的ID": [],
  75. "关联的ID": [related_id] if related_id else []
  76. }
  77. result.append(point_item)
  78. return result
  79. def process_key_points(data: Dict) -> List[Dict]:
  80. """处理关键点数据"""
  81. result = []
  82. if "keypoint_final" not in data:
  83. return result
  84. keypoint_data = data["keypoint_final"]
  85. for item in keypoint_data.get("最终关键点列表", []):
  86. point_item = {
  87. "ID": item.get("关键点ID", ""),
  88. "名称": item.get("关键点", ""),
  89. "类型": item.get("类型", ""),
  90. "描述": item.get("描述", ""),
  91. "置信度": item.get("置信度", ""),
  92. "支撑的ID": item.get("支撑的ID", []),
  93. "关联的ID": []
  94. }
  95. result.append(point_item)
  96. return result
  97. def process_single_file(file_path: Path) -> Optional[Dict]:
  98. """处理单个JSON文件"""
  99. post_id = extract_post_id_from_filename(file_path.name)
  100. if not post_id:
  101. print(f" 警告: 无法从文件名提取帖子ID: {file_path.name}")
  102. return None
  103. try:
  104. with open(file_path, "r", encoding="utf-8") as f:
  105. data = json.load(f)
  106. print(f" 获取帖子 {post_id} 的详情...")
  107. post_detail = get_post_detail(post_id)
  108. if not post_detail:
  109. print(f" 警告: 未能获取帖子 {post_id} 的详情")
  110. # 提取三点数据
  111. inspiration_points = process_inspiration_points(data)
  112. purpose_points = process_purpose_points(data)
  113. key_points = process_key_points(data)
  114. task_item = {
  115. "帖子id": post_id,
  116. "帖子详情": post_detail if post_detail else {},
  117. "what解构结果": {
  118. "灵感点列表": inspiration_points,
  119. "目的点列表": purpose_points,
  120. "关键点列表": key_points
  121. }
  122. }
  123. return task_item
  124. except Exception as e:
  125. print(f" 错误: 处理文件 {file_path.name} 时出错: {e}")
  126. return None
  127. def main():
  128. config = PathConfig()
  129. config.ensure_dirs()
  130. input_dir = config.current_posts_dir
  131. output_file = config.task_list_file
  132. print(f"账号: {config.account_name}")
  133. print(f"当前帖子目录: {input_dir}")
  134. print(f"输出文件: {output_file}")
  135. print(f"\n正在扫描目录: {input_dir}")
  136. json_files = list(input_dir.glob("*.json"))
  137. print(f"找到 {len(json_files)} 个JSON文件")
  138. task_list = []
  139. for i, file_path in enumerate(json_files, 1):
  140. print(f"\n处理文件 [{i}/{len(json_files)}]: {file_path.name}")
  141. task_item = process_single_file(file_path)
  142. if task_item:
  143. task_list.append(task_item)
  144. print(f" ✓ 成功提取")
  145. # 统计
  146. total_inspiration = sum(len(t["what解构结果"]["灵感点列表"]) for t in task_list)
  147. total_purpose = sum(len(t["what解构结果"]["目的点列表"]) for t in task_list)
  148. total_key = sum(len(t["what解构结果"]["关键点列表"]) for t in task_list)
  149. print(f"\n提取统计:")
  150. print(f" 总帖子数: {len(task_list)}")
  151. print(f" 总灵感点: {total_inspiration} 个")
  152. print(f" 总目的点: {total_purpose} 个")
  153. print(f" 总关键点: {total_key} 个")
  154. print(f"\n正在保存结果到: {output_file}")
  155. with open(output_file, "w", encoding="utf-8") as f:
  156. json.dump(task_list, f, ensure_ascii=False, indent=2)
  157. print("完成!")
  158. if __name__ == "__main__":
  159. main()