build_point_to_note_index.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
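  """
  Build an index that maps points to notes.

  Extracts the inspiration points (灵感点), purpose points (目的点) and key
  points (关键点) of every note from the "what解构结果" directory and builds
  the mapping from each individual point to the details of the notes it
  appears in.

  Usage:
      python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果
  """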
  8. import os
  9. import json
  10. import argparse
  11. from typing import Dict, List, Any
  12. from glob import glob
  13. from script.detail import get_xiaohongshu_detail
  def _dict_or_empty(value: Any) -> Dict[str, Any]:
      """Return *value* when it is a dict, otherwise an empty dict."""
      return value if isinstance(value, dict) else {}


  def _dict_entries(value: Any) -> List[Dict[str, Any]]:
      """Return the dict elements of *value* when it is a list, otherwise []."""
      if not isinstance(value, list):
          return []
      return [entry for entry in value if isinstance(entry, dict)]


  def extract_points_from_what_file(what_file: str) -> Dict[str, Any]:
      """Extract all inspiration, purpose and key points from one "what" file.

      The file is read as UTF-8 JSON. Sections that are missing, ``null`` or of
      an unexpected type are treated as empty instead of raising, and list
      entries that are not objects are skipped.

      Args:
          what_file: Path of the "what" deconstruction JSON file. The note id is
              the part of the file name that precedes ``_with_history_``.

      Returns:
          A dict with the keys ``note_id``, ``灵感点列表``, ``目的点列表`` and
          ``关键点列表``. Each list holds one dict per point whose name is
          non-empty.

      Raises:
          OSError: If the file cannot be opened.
          json.JSONDecodeError: If the file is not valid JSON.
      """
      # The note id is taken from the file name, not from the file content.
      filename = os.path.basename(what_file)
      note_id = filename.split("_with_history_")[0]

      with open(what_file, 'r', encoding='utf-8') as f:
          data = json.load(f)

      three_points = _dict_or_empty(_dict_or_empty(data).get("三点解构"))

      # Inspiration points are spread over three source fields; remember which
      # field each one came from.
      inspiration_points = []
      inspiration_data = _dict_or_empty(three_points.get("灵感点"))
      for field in ("全新内容", "共性差异", "共性内容"):
          for item in _dict_entries(inspiration_data.get(field)):
              point = item.get("灵感点", "")
              if point:
                  inspiration_points.append({
                      "灵感点": point,
                      "来源字段": field,
                      "维度": item.get("维度", ""),
                      "描述": item.get("描述", ""),
                  })

      # Purpose points: a single main purpose followed by secondary purposes.
      purpose_points = []
      purpose_data = _dict_or_empty(three_points.get("目的点"))
      main_purpose = _dict_or_empty(purpose_data.get("main_purpose"))
      labelled_purposes = [("主目的", main_purpose)] if main_purpose else []
      labelled_purposes.extend(
          ("次要目的", secondary)
          for secondary in _dict_entries(purpose_data.get("secondary_purposes"))
      )
      for purpose_type, purpose in labelled_purposes:
          point = purpose.get("目的点", "")
          if point:
              purpose_points.append({
                  "目的点": point,
                  "类型": purpose_type,
                  "维度": purpose.get("维度", ""),
                  "描述": purpose.get("描述", ""),
              })

      # Key points.
      key_points = []
      keypoint_data = _dict_or_empty(three_points.get("关键点"))
      for kp in _dict_entries(keypoint_data.get("key_points")):
          point = kp.get("关键点", "")
          if point:
              key_points.append({
                  "关键点": point,
                  "维度大类": kp.get("维度大类", ""),
                  "维度细分": kp.get("维度细分", ""),
                  "描述": kp.get("描述", ""),
              })

      return {
          "note_id": note_id,
          "灵感点列表": inspiration_points,
          "目的点列表": purpose_points,
          "关键点列表": key_points,
      }
  def _register_point(index: Dict[str, Any], note_id: str, point: Dict[str, Any],
                      name_key: str, meta_keys: List[str]) -> None:
      """Record in *index* that the note *note_id* contains *point*.

      The index entry is created on first sight of the point name, copying the
      fields named in *meta_keys* from that first occurrence; later occurrences
      only add their note id (once).
      """
      name = point[name_key]
      entry = index.get(name)
      if entry is None:
          entry = {name_key: name}
          for key in meta_keys:
              entry[key] = point[key]
          entry["帖子ID列表"] = []
          index[name] = entry
      if note_id not in entry["帖子ID列表"]:
          entry["帖子ID列表"].append(note_id)


  def _attach_note_details(index: Dict[str, Any], details: Dict[str, Any]) -> None:
      """Add a ``帖子详情列表`` to every entry of *index*, in note-id order."""
      for point_info in index.values():
          point_info["帖子详情列表"] = [
              details.get(note_id, {"channel_content_id": note_id})
              for note_id in point_info.get("帖子ID列表", [])
          ]


  def build_point_to_note_index(what_dir: str, fetch_details: bool = True) -> Dict[str, Any]:
      """Build the point-to-note and note-to-point mapping index.

      Every ``*_with_history_*.json`` file directly inside *what_dir* is parsed
      with ``extract_points_from_what_file``. Files are processed in sorted
      path order so that the result does not depend on the arbitrary order in
      which ``glob`` lists them.

      Args:
          what_dir: Directory holding the "what" deconstruction result files.
          fetch_details: When true, ``get_xiaohongshu_detail`` is called once per
              note and the result is attached to both mappings. A failed fetch
              is reported and replaced by a placeholder carrying the error text.

      Returns:
          ``{"点到帖子映射": {"灵感点": ..., "目的点": ..., "关键点": ...},
          "帖子到点映射": ...}``.
      """
      # sorted(): glob() yields files in arbitrary order, and the order decides
      # whose metadata is kept for a point shared by several notes.
      what_files = sorted(glob(os.path.join(what_dir, "*_with_history_*.json")))

      print("=" * 80)
      print("开始构建点到帖子的映射索引")
      print("=" * 80)
      print(f"解构文件数量: {len(what_files)}\n")

      inspiration_index = {}  # {inspiration point name: entry with note ids}
      purpose_index = {}      # {purpose point name: entry with note ids}
      keypoint_index = {}     # {key point name: entry with note ids}
      note_to_points = {}     # {note_id: the three point lists of that note}

      for what_file in what_files:
          points_data = extract_points_from_what_file(what_file)
          note_id = points_data["note_id"]

          # NOTE(review): if two files share a note id, the later file (in
          # sorted order) replaces the earlier one here, while the point
          # indexes keep the points of both - confirm this is intended.
          note_to_points[note_id] = {
              "灵感点列表": points_data["灵感点列表"],
              "目的点列表": points_data["目的点列表"],
              "关键点列表": points_data["关键点列表"],
          }

          for insp in points_data["灵感点列表"]:
              _register_point(inspiration_index, note_id, insp,
                              "灵感点", ["维度", "描述"])
          for purp in points_data["目的点列表"]:
              _register_point(purpose_index, note_id, purp,
                              "目的点", ["类型", "维度", "描述"])
          for kp in points_data["关键点列表"]:
              _register_point(keypoint_index, note_id, kp,
                              "关键点", ["维度大类", "维度细分", "描述"])

      print(f"✓ 灵感点: {len(inspiration_index)} 个")
      print(f"✓ 目的点: {len(purpose_index)} 个")
      print(f"✓ 关键点: {len(keypoint_index)} 个")
      print(f"✓ 帖子: {len(note_to_points)} 个\n")

      if fetch_details:
          all_note_ids = list(note_to_points)

          print("=" * 80)
          print("开始获取帖子详情...")
          print("=" * 80 + "\n")

          # Fetch each note once and reuse the result for every mapping.
          note_details_cache = {}
          for i, note_id in enumerate(all_note_ids, 1):
              print(f"[{i}/{len(all_note_ids)}] 获取详情: {note_id}")
              try:
                  note_details_cache[note_id] = get_xiaohongshu_detail(note_id)
              except Exception as e:
                  # Best effort: one failing note must not abort the whole
                  # build, so keep a placeholder that records the error.
                  print(f" ⚠️ 获取失败: {e}")
                  note_details_cache[note_id] = {
                      "channel_content_id": note_id,
                      "error": str(e),
                  }

          print("\n✓ 帖子详情获取完成\n")

          for index in (inspiration_index, purpose_index, keypoint_index):
              _attach_note_details(index, note_details_cache)

          for note_id, points in note_to_points.items():
              points["帖子详情"] = note_details_cache.get(
                  note_id,
                  {"channel_content_id": note_id},
              )

      return {
          "点到帖子映射": {
              "灵感点": inspiration_index,
              "目的点": purpose_index,
              "关键点": keypoint_index,
          },
          "帖子到点映射": note_to_points,
      }
  def save_index(index_data: Dict[str, Any], output_file: str) -> None:
      """Write the index to *output_file* as indented UTF-8 JSON.

      Missing parent directories are created. Non-ASCII characters are written
      as-is rather than as escape sequences.

      Args:
          index_data: The index to serialize.
          output_file: Path of the JSON file to write.

      Raises:
          TypeError: If *index_data* contains a value that JSON cannot encode.
              Nothing is created or overwritten in that case.
          OSError: If the directory or file cannot be written.
      """
      # Serialize before touching the file system so that a failing encode
      # cannot leave a truncated, half-written index behind.
      payload = json.dumps(index_data, ensure_ascii=False, indent=2)

      output_dir = os.path.dirname(output_file)
      if output_dir:
          os.makedirs(output_dir, exist_ok=True)

      with open(output_file, 'w', encoding='utf-8') as f:
          f.write(payload)

      print(f"✓ 索引已保存: {output_file}")
  def print_statistics(index_data: Dict[str, Any]) -> None:
      """Print summary statistics of the index to standard output.

      For each point type this reports the number of points, the number of
      distinct notes linked to them, and the average number of notes linked to
      one point. For the note-to-point mapping it reports the number of notes
      and the average number of points of each type per note.

      Args:
          index_data: The complete index, with the keys ``点到帖子映射`` and
              ``帖子到点映射``. A missing or ``None`` section counts as empty.
      """
      separator = "=" * 80
      print(f"\n{separator}")
      print("索引统计信息")
      print(f"{separator}\n")

      point_to_note = index_data.get("点到帖子映射") or {}
      note_to_point = index_data.get("帖子到点映射") or {}

      print("点到帖子映射:")
      for point_type, points in point_to_note.items():
          total_points = len(points)
          linked_notes = set()
          total_links = 0
          for point_info in points.values():
              note_ids = point_info.get("帖子ID列表", [])
              linked_notes.update(note_ids)
              total_links += len(note_ids)

          # The per-point average must count every point->note link; dividing
          # the number of *distinct* notes by the number of points would
          # under-report whenever several points share a note.
          avg_notes = total_links / total_points if total_points > 0 else 0

          print(f" {point_type}:")
          print(f" 点的数量: {total_points}")
          print(f" 关联帖子数: {len(linked_notes)}")
          print(f" 平均每个点关联帖子数: {avg_notes:.1f}")

      print("\n帖子到点映射:")
      note_count = len(note_to_point)
      print(f" 帖子数量: {note_count}")

      # Average number of points of each type per note.
      if note_count > 0:
          for label in ("灵感点", "目的点", "关键点"):
              total = sum(
                  len(points.get(f"{label}列表", []))
                  for points in note_to_point.values()
              )
              print(f" 平均每个帖子的{label}数: {total / note_count:.1f}")
  def main(argv=None) -> int:
      """Command-line entry point: build, save and summarize the index.

      Args:
          argv: Optional list of command-line arguments (without the program
              name). ``None`` makes argparse read ``sys.argv[1:]``.

      Returns:
          ``0`` on success, ``1`` when ``--what-dir`` is not an existing
          directory.
      """
      # Built from explicit lines so the help text does not depend on how this
      # source file is indented.
      epilog = (
          "\n"
          "使用示例:\n"
          "# 基本使用\n"
          "python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果\n"
          "# 只构建索引,不获取帖子详情\n"
          "python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果 --no-details\n"
          "# 自定义输出文件\n"
          "python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果 --output custom.json\n"
      )
      parser = argparse.ArgumentParser(
          description="构建点到帖子的映射索引",
          formatter_class=argparse.RawDescriptionHelpFormatter,
          epilog=epilog,
      )
      parser.add_argument(
          "--what-dir",
          required=True,
          help="what解构结果目录路径"
      )
      parser.add_argument(
          "--output",
          default=None,
          help="输出文件路径(默认: {what_dir}/../点到帖子映射.json)"
      )
      parser.add_argument(
          "--no-details",
          action="store_true",
          help="不获取帖子详情(只构建索引结构)"
      )
      args = parser.parse_args(argv)

      what_dir = args.what_dir
      fetch_details = not args.no_details

      # isdir rather than exists: a regular file is not a usable input, it
      # would only produce an empty index.
      if not os.path.isdir(what_dir):
          print(f"❌ 错误: 找不到what解构目录: {what_dir}")
          return 1

      index_data = build_point_to_note_index(what_dir, fetch_details=fetch_details)

      if args.output:
          output_file = args.output
      else:
          # "{what_dir}/.." as documented for --output; normpath copes with
          # trailing separators, "." and platform-specific separators.
          parent_dir = os.path.normpath(os.path.join(what_dir, os.pardir))
          output_file = os.path.join(parent_dir, "点到帖子映射.json")

      save_index(index_data, output_file)
      print_statistics(index_data)
      return 0


  if __name__ == "__main__":
      # Propagate main()'s status so a failed run exits non-zero.
      raise SystemExit(main())