#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Extract feature names and their source-post information from the
"past posts what-deconstruction" (过去帖子_what解构结果) result directory."""
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional

# Make the project root importable so `script.detail` resolves.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.detail import get_xiaohongshu_detail
  15. def extract_post_id_from_filename(filename: str) -> str:
  16. """从文件名中提取帖子ID"""
  17. match = re.match(r'^([^_]+)_', filename)
  18. if match:
  19. return match.group(1)
  20. return ""
  21. def get_post_detail(post_id: str) -> Optional[Dict]:
  22. """
  23. 获取帖子详情
  24. Args:
  25. post_id: 帖子ID
  26. Returns:
  27. 帖子详情字典,如果获取失败则返回None
  28. """
  29. try:
  30. detail = get_xiaohongshu_detail(post_id)
  31. return detail
  32. except Exception as e:
  33. print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
  34. return None
  35. def extract_features_from_point(point_data: Dict, post_id: str, point_name: str, point_description: str) -> List[Dict]:
  36. """
  37. 从单个点(灵感点/目的点/关键点)中提取特征信息
  38. Args:
  39. point_data: 点的数据
  40. post_id: 帖子ID
  41. point_name: 点的名称
  42. point_description: 点的描述
  43. Returns:
  44. 特征列表
  45. """
  46. features = []
  47. # 检查是否有"提取的特征"字段
  48. if "提取的特征" in point_data and isinstance(point_data["提取的特征"], list):
  49. for feature in point_data["提取的特征"]:
  50. if "特征名称" in feature:
  51. features.append({
  52. "特征名称": feature["特征名称"],
  53. "点的名称": point_name,
  54. "点的描述": point_description,
  55. "帖子id": post_id
  56. })
  57. return features
  58. def process_single_file(file_path: Path) -> Dict[str, Dict[str, List[Dict]]]:
  59. """
  60. 处理单个JSON文件,提取所有特征信息
  61. Args:
  62. file_path: JSON文件路径
  63. Returns:
  64. 包含灵感点、目的点、关键点的特征字典
  65. """
  66. result = {
  67. "灵感点": {},
  68. "目的点": {},
  69. "关键点": {}
  70. }
  71. # 从文件名提取帖子ID
  72. post_id = extract_post_id_from_filename(file_path.name)
  73. try:
  74. with open(file_path, "r", encoding="utf-8") as f:
  75. data = json.load(f)
  76. # 提取三点解构数据
  77. if "三点解构" not in data:
  78. return result
  79. three_points = data["三点解构"]
  80. # 处理灵感点
  81. if "灵感点" in three_points:
  82. inspiration = three_points["灵感点"]
  83. # 处理全新内容
  84. if "全新内容" in inspiration and isinstance(inspiration["全新内容"], list):
  85. for item in inspiration["全新内容"]:
  86. point_name = item.get("灵感点", "")
  87. point_desc = item.get("描述", "")
  88. features = extract_features_from_point(item, post_id, point_name, point_desc)
  89. for feature in features:
  90. feature_name = feature["特征名称"]
  91. if feature_name not in result["灵感点"]:
  92. result["灵感点"][feature_name] = []
  93. result["灵感点"][feature_name].append({
  94. "点的名称": feature["点的名称"],
  95. "点的描述": feature["点的描述"],
  96. "帖子id": feature["帖子id"]
  97. })
  98. # 处理共性差异
  99. if "共性差异" in inspiration and isinstance(inspiration["共性差异"], list):
  100. for item in inspiration["共性差异"]:
  101. point_name = item.get("灵感点", "")
  102. point_desc = item.get("描述", "")
  103. features = extract_features_from_point(item, post_id, point_name, point_desc)
  104. for feature in features:
  105. feature_name = feature["特征名称"]
  106. if feature_name not in result["灵感点"]:
  107. result["灵感点"][feature_name] = []
  108. result["灵感点"][feature_name].append({
  109. "点的名称": feature["点的名称"],
  110. "点的描述": feature["点的描述"],
  111. "帖子id": feature["帖子id"]
  112. })
  113. # 处理共性内容
  114. if "共性内容" in inspiration and isinstance(inspiration["共性内容"], list):
  115. for item in inspiration["共性内容"]:
  116. point_name = item.get("灵感点", "")
  117. point_desc = item.get("描述", "")
  118. features = extract_features_from_point(item, post_id, point_name, point_desc)
  119. for feature in features:
  120. feature_name = feature["特征名称"]
  121. if feature_name not in result["灵感点"]:
  122. result["灵感点"][feature_name] = []
  123. result["灵感点"][feature_name].append({
  124. "点的名称": feature["点的名称"],
  125. "点的描述": feature["点的描述"],
  126. "帖子id": feature["帖子id"]
  127. })
  128. # 处理目的点
  129. if "目的点" in three_points:
  130. purpose = three_points["目的点"]
  131. if "purposes" in purpose and isinstance(purpose["purposes"], list):
  132. for item in purpose["purposes"]:
  133. point_name = item.get("目的点", "")
  134. point_desc = item.get("描述", "")
  135. features = extract_features_from_point(item, post_id, point_name, point_desc)
  136. for feature in features:
  137. feature_name = feature["特征名称"]
  138. if feature_name not in result["目的点"]:
  139. result["目的点"][feature_name] = []
  140. result["目的点"][feature_name].append({
  141. "点的名称": feature["点的名称"],
  142. "点的描述": feature["点的描述"],
  143. "帖子id": feature["帖子id"]
  144. })
  145. # 处理关键点
  146. if "关键点" in three_points:
  147. key_points = three_points["关键点"]
  148. if "key_points" in key_points and isinstance(key_points["key_points"], list):
  149. for item in key_points["key_points"]:
  150. point_name = item.get("关键点", "")
  151. point_desc = item.get("描述", "")
  152. features = extract_features_from_point(item, post_id, point_name, point_desc)
  153. for feature in features:
  154. feature_name = feature["特征名称"]
  155. if feature_name not in result["关键点"]:
  156. result["关键点"][feature_name] = []
  157. result["关键点"][feature_name].append({
  158. "点的名称": feature["点的名称"],
  159. "点的描述": feature["点的描述"],
  160. "帖子id": feature["帖子id"]
  161. })
  162. except Exception as e:
  163. print(f"处理文件 {file_path.name} 时出错: {e}")
  164. return result
  165. def merge_results(all_results: List[Dict]) -> Dict:
  166. """
  167. 合并所有文件的提取结果
  168. Args:
  169. all_results: 所有文件的结果列表
  170. Returns:
  171. 合并后的结果
  172. """
  173. merged = {
  174. "灵感点": {},
  175. "目的点": {},
  176. "关键点": {}
  177. }
  178. for result in all_results:
  179. for category in ["灵感点", "目的点", "关键点"]:
  180. for feature_name, sources in result[category].items():
  181. if feature_name not in merged[category]:
  182. merged[category][feature_name] = {"来源": []}
  183. merged[category][feature_name]["来源"].extend(sources)
  184. return merged
  185. def convert_to_array_format(merged_dict: Dict, fetch_details: bool = True, time_filter: Optional[str] = None) -> Dict:
  186. """
  187. 将字典格式转换为数组格式,并添加帖子详情
  188. Args:
  189. merged_dict: 字典格式的结果
  190. fetch_details: 是否获取帖子详情,默认为True
  191. time_filter: 时间过滤阈值,只保留发布时间>=该时间的帖子,格式为 "YYYY-MM-DD HH:MM:SS"
  192. Returns:
  193. 数组格式的结果
  194. """
  195. result = {
  196. "灵感点": [],
  197. "目的点": [],
  198. "关键点": []
  199. }
  200. # 收集所有需要获取详情的帖子ID
  201. post_ids = set()
  202. if fetch_details:
  203. for category in ["灵感点", "目的点", "关键点"]:
  204. for feature_name, data in merged_dict[category].items():
  205. for source in data["来源"]:
  206. post_ids.add(source["帖子id"])
  207. # 批量获取帖子详情
  208. print(f"\n正在获取 {len(post_ids)} 个帖子的详情...")
  209. post_details = {}
  210. for i, post_id in enumerate(post_ids, 1):
  211. print(f"[{i}/{len(post_ids)}] 获取帖子 {post_id} 的详情...")
  212. detail = get_post_detail(post_id)
  213. if detail:
  214. post_details[post_id] = detail
  215. print(f"成功获取 {len(post_details)} 个帖子详情")
  216. # 如果启用时间过滤,过滤帖子(过滤掉发布时间晚于等于阈值的帖子,避免穿越)
  217. if time_filter:
  218. print(f"\n正在应用时间过滤 (< {time_filter}),避免使用晚于当前帖子的数据...")
  219. filtered_post_ids = set()
  220. filtered_count = 0
  221. for post_id, detail in post_details.items():
  222. publish_time = detail.get('publish_time', '')
  223. if publish_time < time_filter:
  224. filtered_post_ids.add(post_id)
  225. else:
  226. filtered_count += 1
  227. print(f" ⚠️ 过滤掉帖子 {post_id} (发布时间: {publish_time},晚于阈值)")
  228. print(f"过滤掉 {filtered_count} 个帖子(穿越),保留 {len(filtered_post_ids)} 个帖子")
  229. # 更新post_details,只保留符合时间条件的
  230. post_details = {pid: detail for pid, detail in post_details.items() if pid in filtered_post_ids}
  231. # 转换为数组格式并添加帖子详情
  232. for category in ["灵感点", "目的点", "关键点"]:
  233. for feature_name, data in merged_dict[category].items():
  234. # 为每个来源添加帖子详情
  235. enhanced_sources = []
  236. for source in data["来源"]:
  237. # 如果启用时间过滤,跳过不符合时间条件的帖子
  238. if fetch_details and time_filter and source["帖子id"] not in post_details:
  239. continue
  240. enhanced_source = source.copy()
  241. if fetch_details and source["帖子id"] in post_details:
  242. enhanced_source["帖子详情"] = post_details[source["帖子id"]]
  243. enhanced_sources.append(enhanced_source)
  244. # 只添加有来源的特征
  245. if enhanced_sources:
  246. result[category].append({
  247. "特征名称": feature_name,
  248. "特征来源": enhanced_sources
  249. })
  250. return result
  251. def get_earliest_publish_time(current_posts_dir: Path) -> Optional[str]:
  252. """
  253. 获取当前帖子目录中最早的发布时间
  254. Args:
  255. current_posts_dir: 当前帖子目录路径
  256. Returns:
  257. 最早的发布时间字符串,格式为 "YYYY-MM-DD HH:MM:SS"
  258. """
  259. if not current_posts_dir.exists():
  260. print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
  261. return None
  262. json_files = list(current_posts_dir.glob("*.json"))
  263. if not json_files:
  264. print(f"警告: 当前帖子目录为空: {current_posts_dir}")
  265. return None
  266. print(f"\n正在获取当前帖子的发布时间...")
  267. print(f"找到 {len(json_files)} 个当前帖子")
  268. earliest_time = None
  269. for file_path in json_files:
  270. post_id = extract_post_id_from_filename(file_path.name)
  271. if not post_id:
  272. continue
  273. try:
  274. detail = get_post_detail(post_id)
  275. if detail and 'publish_time' in detail:
  276. publish_time = detail['publish_time']
  277. if earliest_time is None or publish_time < earliest_time:
  278. earliest_time = publish_time
  279. print(f" 更新最早时间: {publish_time} (帖子: {post_id})")
  280. except Exception as e:
  281. print(f" 警告: 获取帖子 {post_id} 发布时间失败: {e}")
  282. if earliest_time:
  283. print(f"\n当前帖子最早发布时间: {earliest_time}")
  284. else:
  285. print("\n警告: 未能获取到任何当前帖子的发布时间")
  286. return earliest_time
  287. def main():
  288. # 输入输出路径(默认使用项目根目录下的 data/data_1117 目录)
  289. script_dir = Path(__file__).parent
  290. project_root = script_dir.parent.parent
  291. data_dir = project_root / "data" / "data_1117"
  292. input_dir = data_dir / "过去帖子_what解构结果"
  293. current_posts_dir = data_dir / "当前帖子_what解构结果"
  294. output_file = data_dir / "特征名称_帖子来源.json"
  295. # 获取当前帖子的最早发布时间
  296. earliest_time = get_earliest_publish_time(current_posts_dir)
  297. print(f"\n正在扫描目录: {input_dir}")
  298. # 获取所有JSON文件
  299. json_files = list(input_dir.glob("*.json"))
  300. print(f"找到 {len(json_files)} 个JSON文件")
  301. # 处理所有文件
  302. all_results = []
  303. for i, file_path in enumerate(json_files, 1):
  304. print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
  305. result = process_single_file(file_path)
  306. all_results.append(result)
  307. # 合并结果
  308. print("\n正在合并结果...")
  309. merged_result = merge_results(all_results)
  310. # 转换为数组格式(带时间过滤)
  311. print("正在转换为数组格式...")
  312. final_result = convert_to_array_format(merged_result, fetch_details=True, time_filter=earliest_time)
  313. # 统计信息
  314. if earliest_time:
  315. print(f"\n提取统计 (已过滤掉发布时间 >= {earliest_time} 的帖子):")
  316. else:
  317. print(f"\n提取统计:")
  318. for category in ["灵感点", "目的点", "关键点"]:
  319. feature_count = len(final_result[category])
  320. source_count = sum(len(item["特征来源"]) for item in final_result[category])
  321. print(f" {category}: {feature_count} 个特征, {source_count} 个来源")
  322. # 保存结果
  323. print(f"\n正在保存结果到: {output_file}")
  324. with open(output_file, "w", encoding="utf-8") as f:
  325. json.dump(final_result, f, ensure_ascii=False, indent=4)
  326. print("完成!")
  327. if __name__ == "__main__":
  328. main()