build_note_to_all_index.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362
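"""
Build a complete mapping from note ID to its points and categories.

For every note, the following is collected from 人设.json and the
what解构结果 directory:
1. The inspiration / purpose / key-point categories it belongs to (from 人设.json).
   NOTE(review): build_category_mapping currently reads only the "灵感点列表"
   branch of 人设.json - confirm whether purpose and key-point categories
   should be indexed as well.
2. Its concrete inspiration points, purpose points and key points
   (from the what解构结果 files).
3. The note details.

Usage:
    python build_note_to_all_index.py --persona-dir data/阿里多多酱/out/人设_1110
"""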
  10. import os
  11. import json
  12. import argparse
  13. from typing import Dict, List, Any
  14. from glob import glob
  15. from script.detail import get_xiaohongshu_detail
  16. def build_category_mapping(persona_data: Dict[str, Any]) -> Dict[str, List[str]]:
  17. """构建帖子ID到分类的映射
  18. Args:
  19. persona_data: 人设数据
  20. Returns:
  21. {note_id: [分类名称列表]}
  22. """
  23. note_to_categories = {}
  24. # 遍历所有视角
  25. for perspective in persona_data.get("灵感点列表", []):
  26. perspective_name = perspective.get("视角名称", "")
  27. # 遍历一级分类
  28. for category_l1 in perspective.get("模式列表", []):
  29. category_l1_name = category_l1.get("分类名称", "")
  30. # 遍历二级分类
  31. for category_l2 in category_l1.get("二级细分", []):
  32. category_l2_name = category_l2.get("分类名称", "")
  33. category_l2_def = category_l2.get("分类定义", "")
  34. note_ids = category_l2.get("帖子ID列表", [])
  35. # 去重帖子ID
  36. unique_note_ids = list(dict.fromkeys(note_ids))
  37. # 为每个帖子添加分类信息
  38. for note_id in unique_note_ids:
  39. if note_id not in note_to_categories:
  40. note_to_categories[note_id] = []
  41. note_to_categories[note_id].append({
  42. "分类类型": "灵感分类",
  43. "视角名称": perspective_name,
  44. "一级分类": category_l1_name,
  45. "二级分类": category_l2_name,
  46. "分类定义": category_l2_def
  47. })
  48. return note_to_categories
  49. def extract_points_from_what(what_dir: str) -> Dict[str, Dict[str, Any]]:
  50. """从what解构结果提取所有点
  51. Args:
  52. what_dir: what解构结果目录
  53. Returns:
  54. {note_id: {灵感点列表, 目的点列表, 关键点列表}}
  55. """
  56. note_to_points = {}
  57. # 读取所有what解构文件
  58. what_files = glob(os.path.join(what_dir, "*_with_history_*.json"))
  59. for what_file in what_files:
  60. # 从文件名提取note_id
  61. filename = os.path.basename(what_file)
  62. note_id = filename.split("_with_history_")[0]
  63. with open(what_file, 'r', encoding='utf-8') as f:
  64. data = json.load(f)
  65. three_points = data.get("三点解构", {})
  66. # 提取灵感点
  67. inspiration_points = []
  68. inspiration_data = three_points.get("灵感点", {})
  69. for field in ["全新内容", "共性差异", "共性内容"]:
  70. items = inspiration_data.get(field, [])
  71. for item in items:
  72. point = item.get("灵感点", "")
  73. if point:
  74. inspiration_points.append({
  75. "灵感点": point,
  76. "来源字段": field,
  77. "维度": item.get("维度", ""),
  78. "描述": item.get("描述", "")
  79. })
  80. # 提取目的点
  81. purpose_points = []
  82. purpose_data = three_points.get("目的点", {})
  83. # 主目的
  84. main_purpose = purpose_data.get("main_purpose", {})
  85. if main_purpose:
  86. point = main_purpose.get("目的点", "")
  87. if point:
  88. purpose_points.append({
  89. "目的点": point,
  90. "类型": "主目的",
  91. "维度": main_purpose.get("维度", ""),
  92. "描述": main_purpose.get("描述", "")
  93. })
  94. # 次要目的
  95. secondary_purposes = purpose_data.get("secondary_purposes", [])
  96. for sec_purpose in secondary_purposes:
  97. point = sec_purpose.get("目的点", "")
  98. if point:
  99. purpose_points.append({
  100. "目的点": point,
  101. "类型": "次要目的",
  102. "维度": sec_purpose.get("维度", ""),
  103. "描述": sec_purpose.get("描述", "")
  104. })
  105. # 提取关键点
  106. key_points = []
  107. keypoint_data = three_points.get("关键点", {})
  108. kp_list = keypoint_data.get("key_points", [])
  109. for kp in kp_list:
  110. point = kp.get("关键点", "")
  111. if point:
  112. key_points.append({
  113. "关键点": point,
  114. "维度大类": kp.get("维度大类", ""),
  115. "维度细分": kp.get("维度细分", ""),
  116. "描述": kp.get("描述", "")
  117. })
  118. note_to_points[note_id] = {
  119. "灵感点列表": inspiration_points,
  120. "目的点列表": purpose_points,
  121. "关键点列表": key_points
  122. }
  123. return note_to_points
  124. def build_note_to_all_index(
  125. persona_dir: str,
  126. fetch_details: bool = True
  127. ) -> Dict[str, Any]:
  128. """构建帖子ID到点和分类的完整映射
  129. Args:
  130. persona_dir: 人设目录路径
  131. fetch_details: 是否获取帖子详情
  132. Returns:
  133. 完整的映射索引
  134. """
  135. persona_file = os.path.join(persona_dir, "人设.json")
  136. what_dir = os.path.join(persona_dir, "what解构结果")
  137. print(f"{'=' * 80}")
  138. print(f"构建帖子ID到点和分类的完整映射")
  139. print(f"{'=' * 80}")
  140. print(f"人设文件: {persona_file}")
  141. print(f"解构目录: {what_dir}\n")
  142. # 读取人设数据
  143. with open(persona_file, 'r', encoding='utf-8') as f:
  144. persona_data = json.load(f)
  145. # 构建分类映射
  146. print(f"{'─' * 80}")
  147. print(f"1. 提取分类信息(来自人设.json)")
  148. print(f"{'─' * 80}\n")
  149. note_to_categories = build_category_mapping(persona_data)
  150. print(f"✓ 从人设.json中提取了 {len(note_to_categories)} 个帖子的分类信息\n")
  151. # 提取点信息
  152. print(f"{'─' * 80}")
  153. print(f"2. 提取点信息(来自what解构结果)")
  154. print(f"{'─' * 80}\n")
  155. note_to_points = extract_points_from_what(what_dir)
  156. print(f"✓ 从what解构结果中提取了 {len(note_to_points)} 个帖子的点信息\n")
  157. # 合并所有帖子ID
  158. all_note_ids = set(note_to_categories.keys()) | set(note_to_points.keys())
  159. print(f"✓ 共有 {len(all_note_ids)} 个唯一帖子\n")
  160. # 构建完整映射
  161. note_index = {}
  162. for note_id in all_note_ids:
  163. note_index[note_id] = {
  164. "帖子ID": note_id,
  165. "所属分类": note_to_categories.get(note_id, []),
  166. "灵感点列表": note_to_points.get(note_id, {}).get("灵感点列表", []),
  167. "目的点列表": note_to_points.get(note_id, {}).get("目的点列表", []),
  168. "关键点列表": note_to_points.get(note_id, {}).get("关键点列表", [])
  169. }
  170. # 获取帖子详情
  171. if fetch_details:
  172. print(f"{'=' * 80}")
  173. print(f"开始获取帖子详情...")
  174. print(f"{'=' * 80}\n")
  175. for i, note_id in enumerate(sorted(all_note_ids), 1):
  176. try:
  177. print(f"[{i}/{len(all_note_ids)}] 获取详情: {note_id}")
  178. detail = get_xiaohongshu_detail(note_id)
  179. note_index[note_id]["帖子详情"] = detail
  180. except Exception as e:
  181. print(f" ⚠️ 获取失败: {e}")
  182. note_index[note_id]["帖子详情"] = {
  183. "channel_content_id": note_id,
  184. "error": str(e)
  185. }
  186. print(f"\n✓ 帖子详情获取完成\n")
  187. return note_index
  188. def save_index(index_data: Dict[str, Any], output_file: str):
  189. """保存索引到文件
  190. Args:
  191. index_data: 索引数据
  192. output_file: 输出文件路径
  193. """
  194. output_dir = os.path.dirname(output_file)
  195. if output_dir:
  196. os.makedirs(output_dir, exist_ok=True)
  197. with open(output_file, 'w', encoding='utf-8') as f:
  198. json.dump(index_data, f, ensure_ascii=False, indent=2)
  199. print(f"✓ 索引已保存: {output_file}")
  200. def print_statistics(index_data: Dict[str, Any]):
  201. """打印统计信息
  202. Args:
  203. index_data: 完整的索引数据
  204. """
  205. print(f"\n{'=' * 80}")
  206. print(f"索引统计信息")
  207. print(f"{'=' * 80}\n")
  208. total_notes = len(index_data)
  209. print(f"帖子总数: {total_notes}")
  210. # 统计有分类的帖子
  211. notes_with_categories = sum(1 for v in index_data.values() if v.get("所属分类"))
  212. print(f"有分类信息的帖子: {notes_with_categories}")
  213. # 统计有点信息的帖子
  214. notes_with_inspiration = sum(1 for v in index_data.values() if v.get("灵感点列表"))
  215. notes_with_purpose = sum(1 for v in index_data.values() if v.get("目的点列表"))
  216. notes_with_keypoint = sum(1 for v in index_data.values() if v.get("关键点列表"))
  217. print(f"有灵感点信息的帖子: {notes_with_inspiration}")
  218. print(f"有目的点信息的帖子: {notes_with_purpose}")
  219. print(f"有关键点信息的帖子: {notes_with_keypoint}")
  220. # 统计平均数量
  221. total_categories = sum(len(v.get("所属分类", [])) for v in index_data.values())
  222. total_inspiration = sum(len(v.get("灵感点列表", [])) for v in index_data.values())
  223. total_purpose = sum(len(v.get("目的点列表", [])) for v in index_data.values())
  224. total_keypoint = sum(len(v.get("关键点列表", [])) for v in index_data.values())
  225. if total_notes > 0:
  226. print(f"\n平均每个帖子:")
  227. print(f" 分类数: {total_categories / total_notes:.1f}")
  228. print(f" 灵感点数: {total_inspiration / total_notes:.1f}")
  229. print(f" 目的点数: {total_purpose / total_notes:.1f}")
  230. print(f" 关键点数: {total_keypoint / total_notes:.1f}")
  231. def main():
  232. """主函数"""
  233. parser = argparse.ArgumentParser(
  234. description="构建帖子ID到点和分类的完整映射",
  235. formatter_class=argparse.RawDescriptionHelpFormatter,
  236. epilog="""
  237. 使用示例:
  238. # 基本使用
  239. python build_note_to_all_index.py --persona-dir data/阿里多多酱/out/人设_1110
  240. # 只构建索引,不获取帖子详情
  241. python build_note_to_all_index.py --persona-dir data/阿里多多酱/out/人设_1110 --no-details
  242. # 自定义输出文件
  243. python build_note_to_all_index.py --persona-dir data/阿里多多酱/out/人设_1110 --output custom.json
  244. """
  245. )
  246. parser.add_argument(
  247. "--persona-dir",
  248. required=True,
  249. help="人设目录路径(包含人设.json和what解构结果/的目录)"
  250. )
  251. parser.add_argument(
  252. "--output",
  253. default=None,
  254. help="输出文件路径(默认: {persona_dir}/帖子到分类和点映射.json)"
  255. )
  256. parser.add_argument(
  257. "--no-details",
  258. action="store_true",
  259. help="不获取帖子详情(只构建索引结构)"
  260. )
  261. args = parser.parse_args()
  262. persona_dir = args.persona_dir
  263. fetch_details = not args.no_details
  264. # 检查必要文件
  265. persona_file = os.path.join(persona_dir, "人设.json")
  266. what_dir = os.path.join(persona_dir, "what解构结果")
  267. if not os.path.exists(persona_file):
  268. print(f"❌ 错误: 找不到人设文件: {persona_file}")
  269. return
  270. if not os.path.exists(what_dir):
  271. print(f"❌ 错误: 找不到what解构目录: {what_dir}")
  272. return
  273. # 构建索引
  274. index_data = build_note_to_all_index(persona_dir, fetch_details=fetch_details)
  275. # 确定输出文件路径
  276. if args.output:
  277. output_file = args.output
  278. else:
  279. output_file = os.path.join(persona_dir, "帖子到分类和点映射.json")
  280. # 保存索引
  281. save_index(index_data, output_file)
  282. # 打印统计信息
  283. print_statistics(index_data)
  284. if __name__ == "__main__":
  285. main()