# extract_features_from_posts.py (10 KB)
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 从过去帖子解构结果目录中提取特征名称及其来源信息
  5. 仅支持新版数据结构(inspiration_final_result, purpose_final_result, keypoint_final)
  6. """
  7. import json
  8. from pathlib import Path
  9. from typing import Dict, List, Optional, Set
  10. import re
  11. import sys
  12. # 添加项目根目录到路径
  13. project_root = Path(__file__).parent.parent.parent
  14. sys.path.insert(0, str(project_root))
  15. from script.detail import get_xiaohongshu_detail
  16. from script.data_processing.path_config import PathConfig
  17. def extract_post_id_from_filename(filename: str) -> str:
  18. """从文件名中提取帖子ID
  19. 支持格式: 68a6b96f000000001d006058.json
  20. """
  21. return filename.replace('.json', '')
  22. def get_post_detail(post_id: str) -> Optional[Dict]:
  23. """获取帖子详情"""
  24. try:
  25. detail = get_xiaohongshu_detail(post_id)
  26. return detail
  27. except Exception as e:
  28. print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
  29. return None
  30. def process_single_file(file_path: Path) -> Dict[str, Dict[str, List[Dict]]]:
  31. """
  32. 处理单个JSON文件,提取所有特征信息
  33. Args:
  34. file_path: JSON文件路径
  35. Returns:
  36. 包含灵感点、目的点、关键点的特征字典
  37. """
  38. result = {
  39. "灵感点": {},
  40. "目的点": {},
  41. "关键点": {}
  42. }
  43. post_id = extract_post_id_from_filename(file_path.name)
  44. try:
  45. with open(file_path, "r", encoding="utf-8") as f:
  46. data = json.load(f)
  47. # 处理灵感点
  48. if "inspiration_final_result" in data:
  49. inspiration_data = data["inspiration_final_result"]
  50. for item in inspiration_data.get("最终灵感点列表", []):
  51. feature_name = item.get("灵感点", "")
  52. if not feature_name:
  53. continue
  54. if feature_name not in result["灵感点"]:
  55. result["灵感点"][feature_name] = []
  56. result["灵感点"][feature_name].append({
  57. "点的名称": feature_name,
  58. "点的描述": item.get("描述", ""),
  59. "帖子id": post_id,
  60. "点ID": item.get("id", ""),
  61. "类型": item.get("类型", "")
  62. })
  63. # 处理目的点(意图+实质)
  64. if "purpose_final_result" in data:
  65. purpose_data = data["purpose_final_result"]
  66. # 处理意图列表
  67. for item in purpose_data.get("最终意图列表", []):
  68. feature_name = item.get("目的点", "")
  69. if not feature_name:
  70. continue
  71. if feature_name not in result["目的点"]:
  72. result["目的点"][feature_name] = []
  73. result["目的点"][feature_name].append({
  74. "点的名称": feature_name,
  75. "点的描述": item.get("描述", ""),
  76. "帖子id": post_id,
  77. "点ID": item.get("意图ID", ""),
  78. "类型": "意图"
  79. })
  80. # 处理实质列表
  81. for item in purpose_data.get("最终实质列表", []):
  82. feature_name = item.get("目的点", "")
  83. if not feature_name:
  84. continue
  85. if feature_name not in result["目的点"]:
  86. result["目的点"][feature_name] = []
  87. result["目的点"][feature_name].append({
  88. "点的名称": feature_name,
  89. "点的描述": item.get("描述", ""),
  90. "帖子id": post_id,
  91. "点ID": item.get("实质ID", ""),
  92. "类型": "实质",
  93. "关联意图ID": item.get("关联意图ID", "")
  94. })
  95. # 处理关键点
  96. if "keypoint_final" in data:
  97. keypoint_data = data["keypoint_final"]
  98. for item in keypoint_data.get("最终关键点列表", []):
  99. feature_name = item.get("关键点", "")
  100. if not feature_name:
  101. continue
  102. if feature_name not in result["关键点"]:
  103. result["关键点"][feature_name] = []
  104. result["关键点"][feature_name].append({
  105. "点的名称": feature_name,
  106. "点的描述": item.get("描述", ""),
  107. "帖子id": post_id,
  108. "点ID": item.get("关键点ID", ""),
  109. "类型": item.get("类型", ""),
  110. "支撑的ID": item.get("支撑的ID", [])
  111. })
  112. except Exception as e:
  113. print(f"处理文件 {file_path.name} 时出错: {e}")
  114. return result
  115. def merge_results(all_results: List[Dict]) -> Dict:
  116. """合并所有文件的提取结果"""
  117. merged = {
  118. "灵感点": {},
  119. "目的点": {},
  120. "关键点": {}
  121. }
  122. for result in all_results:
  123. for category in ["灵感点", "目的点", "关键点"]:
  124. for feature_name, sources in result[category].items():
  125. if feature_name not in merged[category]:
  126. merged[category][feature_name] = {"来源": []}
  127. merged[category][feature_name]["来源"].extend(sources)
  128. return merged
  129. def convert_to_array_format(
  130. merged_dict: Dict,
  131. fetch_details: bool = True,
  132. exclude_post_ids: Optional[Set[str]] = None
  133. ) -> Dict:
  134. """将字典格式转换为数组格式,并添加帖子详情"""
  135. result = {
  136. "灵感点": [],
  137. "目的点": [],
  138. "关键点": []
  139. }
  140. # 收集所有需要获取详情的帖子ID
  141. post_ids = set()
  142. if fetch_details:
  143. for category in ["灵感点", "目的点", "关键点"]:
  144. for feature_name, data in merged_dict[category].items():
  145. for source in data["来源"]:
  146. post_ids.add(source["帖子id"])
  147. # 批量获取帖子详情
  148. print(f"\n正在获取 {len(post_ids)} 个帖子的详情...")
  149. post_details = {}
  150. for i, post_id in enumerate(post_ids, 1):
  151. print(f"[{i}/{len(post_ids)}] 获取帖子 {post_id} 的详情...")
  152. detail = get_post_detail(post_id)
  153. if detail:
  154. post_details[post_id] = detail
  155. print(f"成功获取 {len(post_details)} 个帖子详情")
  156. # 应用帖子ID过滤
  157. if exclude_post_ids:
  158. print(f"\n正在应用帖子ID过滤,排除 {len(exclude_post_ids)} 个当前帖子...")
  159. before_count = len(post_details)
  160. post_details = {pid: detail for pid, detail in post_details.items() if pid not in exclude_post_ids}
  161. filtered_count = before_count - len(post_details)
  162. if filtered_count > 0:
  163. print(f" ⚠️ 过滤掉 {filtered_count} 个当前帖子")
  164. print(f"保留 {len(post_details)} 个帖子")
  165. # 转换为数组格式并添加帖子详情
  166. for category in ["灵感点", "目的点", "关键点"]:
  167. for feature_name, data in merged_dict[category].items():
  168. enhanced_sources = []
  169. for source in data["来源"]:
  170. if fetch_details and exclude_post_ids and source["帖子id"] not in post_details:
  171. continue
  172. enhanced_source = source.copy()
  173. if fetch_details and source["帖子id"] in post_details:
  174. enhanced_source["帖子详情"] = post_details[source["帖子id"]]
  175. enhanced_sources.append(enhanced_source)
  176. if enhanced_sources:
  177. result[category].append({
  178. "特征名称": feature_name,
  179. "特征来源": enhanced_sources
  180. })
  181. return result
  182. def get_current_post_ids(current_posts_dir: Path) -> Set[str]:
  183. """获取当前帖子目录中的所有帖子ID"""
  184. if not current_posts_dir.exists():
  185. print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
  186. return set()
  187. json_files = list(current_posts_dir.glob("*.json"))
  188. if not json_files:
  189. print(f"警告: 当前帖子目录为空: {current_posts_dir}")
  190. return set()
  191. print(f"\n正在获取当前帖子ID...")
  192. print(f"找到 {len(json_files)} 个当前帖子")
  193. post_ids = set()
  194. for file_path in json_files:
  195. post_id = extract_post_id_from_filename(file_path.name)
  196. if post_id:
  197. post_ids.add(post_id)
  198. print(f"提取到 {len(post_ids)} 个帖子ID")
  199. return post_ids
  200. def main():
  201. config = PathConfig()
  202. config.ensure_dirs()
  203. input_dir = config.historical_posts_dir
  204. current_posts_dir = config.current_posts_dir
  205. output_file = config.feature_source_mapping_file
  206. print(f"账号: {config.account_name}")
  207. print(f"过滤模式: {config.filter_mode}")
  208. print(f"过去帖子目录: {input_dir}")
  209. print(f"当前帖子目录: {current_posts_dir}")
  210. print(f"输出文件: {output_file}")
  211. print()
  212. print(f"\n正在扫描目录: {input_dir}")
  213. json_files = list(input_dir.glob("*.json"))
  214. print(f"找到 {len(json_files)} 个JSON文件")
  215. all_results = []
  216. for i, file_path in enumerate(json_files, 1):
  217. print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
  218. result = process_single_file(file_path)
  219. all_results.append(result)
  220. print("\n正在合并结果...")
  221. merged_result = merge_results(all_results)
  222. # 过滤当前帖子
  223. exclude_post_ids = None
  224. if config.filter_mode == "exclude_current_posts":
  225. print("\n应用过滤规则: 排除当前帖子ID")
  226. exclude_post_ids = get_current_post_ids(current_posts_dir)
  227. elif config.filter_mode == "none":
  228. print("\n过滤模式: none,不应用任何过滤")
  229. print("正在转换为数组格式...")
  230. final_result = convert_to_array_format(
  231. merged_result,
  232. fetch_details=True,
  233. exclude_post_ids=exclude_post_ids
  234. )
  235. print(f"\n提取统计:")
  236. for category in ["灵感点", "目的点", "关键点"]:
  237. feature_count = len(final_result[category])
  238. source_count = sum(len(item["特征来源"]) for item in final_result[category])
  239. print(f" {category}: {feature_count} 个特征, {source_count} 个来源")
  240. print(f"\n正在保存结果到: {output_file}")
  241. with open(output_file, "w", encoding="utf-8") as f:
  242. json.dump(final_result, f, ensure_ascii=False, indent=4)
  243. print("完成!")
# Run the extraction pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()