# build_category_index.py (14 KB)
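"""
Build reverse indexes over the persona's categories.

Converts the persona data and the "what" deconstruction data into
reverse-index structures keyed by category name, covering:
- inspiration categories (from 人设.json)
- purpose categories (from the what解构 results)
- key-point categories (from the what解构 results)

Usage:
    python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110
"""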
  10. import os
  11. import json
  12. import argparse
  13. from typing import Dict, List, Any
  14. from glob import glob
  15. from script.detail import get_xiaohongshu_detail
  16. def build_inspiration_index(persona_data: Dict[str, Any]) -> Dict[str, Any]:
  17. """构建灵感点索引
  18. Args:
  19. persona_data: 人设数据(包含灵感点列表)
  20. Returns:
  21. 灵感分类索引
  22. """
  23. index = {}
  24. # 遍历所有视角
  25. for perspective in persona_data.get("灵感点列表", []):
  26. perspective_name = perspective.get("视角名称", "")
  27. perspective_desc = perspective.get("视角描述", "")
  28. # 遍历一级分类(模式列表)
  29. for category_l1 in perspective.get("模式列表", []):
  30. category_l1_name = category_l1.get("分类名称", "")
  31. category_l1_def = category_l1.get("核心定义", "")
  32. # 收集一级分类下所有二级分类的帖子
  33. category_l1_note_ids = set()
  34. # 遍历二级分类(二级细分)
  35. for category_l2 in category_l1.get("二级细分", []):
  36. category_l2_name = category_l2.get("分类名称", "")
  37. category_l2_def = category_l2.get("分类定义", "")
  38. note_ids = category_l2.get("帖子ID列表", [])
  39. # 去重帖子ID
  40. unique_note_ids = list(dict.fromkeys(note_ids))
  41. # 添加到一级分类的帖子集合
  42. category_l1_note_ids.update(unique_note_ids)
  43. # 构建二级分类路径
  44. category_l2_path = [
  45. {
  46. "视角名称": perspective_name,
  47. "视角描述": perspective_desc
  48. },
  49. {
  50. "分类名称": category_l1_name,
  51. "分类定义": category_l1_def
  52. },
  53. {
  54. "分类名称": category_l2_name,
  55. "分类定义": category_l2_def
  56. }
  57. ]
  58. # 如果二级分类已存在,合并帖子列表
  59. if category_l2_name in index:
  60. existing_ids = set(index[category_l2_name]["帖子ID列表"])
  61. new_ids = set(unique_note_ids)
  62. index[category_l2_name]["帖子ID列表"] = list(existing_ids | new_ids)
  63. else:
  64. # 创建新的二级分类索引
  65. index[category_l2_name] = {
  66. "分类层级": "二级分类",
  67. "分类名称": category_l2_name,
  68. "分类定义": category_l2_def,
  69. "分类路径": category_l2_path,
  70. "帖子ID列表": unique_note_ids
  71. }
  72. # 构建一级分类路径
  73. category_l1_path = [
  74. {
  75. "视角名称": perspective_name,
  76. "视角描述": perspective_desc
  77. },
  78. {
  79. "分类名称": category_l1_name,
  80. "分类定义": category_l1_def
  81. }
  82. ]
  83. # 添加一级分类索引
  84. if category_l1_name in index:
  85. existing_ids = set(index[category_l1_name]["帖子ID列表"])
  86. index[category_l1_name]["帖子ID列表"] = list(existing_ids | category_l1_note_ids)
  87. else:
  88. index[category_l1_name] = {
  89. "分类层级": "一级分类",
  90. "分类名称": category_l1_name,
  91. "分类定义": category_l1_def,
  92. "分类路径": category_l1_path,
  93. "帖子ID列表": list(category_l1_note_ids)
  94. }
  95. return index
  96. def build_purpose_index(what_dir: str) -> Dict[str, Any]:
  97. """构建目的点索引
  98. Args:
  99. what_dir: what解构结果目录路径
  100. Returns:
  101. 目的分类索引
  102. """
  103. index = {}
  104. # 读取所有what解构文件
  105. what_files = glob(os.path.join(what_dir, "*_with_history_*.json"))
  106. for what_file in what_files:
  107. # 从文件名提取note_id
  108. filename = os.path.basename(what_file)
  109. note_id = filename.split("_with_history_")[0]
  110. with open(what_file, 'r', encoding='utf-8') as f:
  111. data = json.load(f)
  112. purpose_data = data.get("三点解构", {}).get("目的点", {})
  113. # 主目的
  114. main_purpose = purpose_data.get("main_purpose", {})
  115. if main_purpose:
  116. purpose_name = main_purpose.get("目的点", "")
  117. if purpose_name:
  118. if purpose_name not in index:
  119. index[purpose_name] = {
  120. "分类类型": "主目的",
  121. "目的点": purpose_name,
  122. "维度": main_purpose.get("维度", ""),
  123. "描述": main_purpose.get("描述", ""),
  124. "帖子ID列表": []
  125. }
  126. if note_id not in index[purpose_name]["帖子ID列表"]:
  127. index[purpose_name]["帖子ID列表"].append(note_id)
  128. # 次要目的
  129. secondary_purposes = purpose_data.get("secondary_purposes", [])
  130. for sec_purpose in secondary_purposes:
  131. purpose_name = sec_purpose.get("目的点", "")
  132. if purpose_name:
  133. if purpose_name not in index:
  134. index[purpose_name] = {
  135. "分类类型": "次要目的",
  136. "目的点": purpose_name,
  137. "维度": sec_purpose.get("维度", ""),
  138. "描述": sec_purpose.get("描述", ""),
  139. "帖子ID列表": []
  140. }
  141. if note_id not in index[purpose_name]["帖子ID列表"]:
  142. index[purpose_name]["帖子ID列表"].append(note_id)
  143. return index
  144. def build_keypoint_index(what_dir: str) -> Dict[str, Any]:
  145. """构建关键点索引
  146. Args:
  147. what_dir: what解构结果目录路径
  148. Returns:
  149. 关键点分类索引
  150. """
  151. index = {}
  152. # 读取所有what解构文件
  153. what_files = glob(os.path.join(what_dir, "*_with_history_*.json"))
  154. for what_file in what_files:
  155. # 从文件名提取note_id
  156. filename = os.path.basename(what_file)
  157. note_id = filename.split("_with_history_")[0]
  158. with open(what_file, 'r', encoding='utf-8') as f:
  159. data = json.load(f)
  160. keypoint_data = data.get("三点解构", {}).get("关键点", {})
  161. key_points = keypoint_data.get("key_points", [])
  162. for kp in key_points:
  163. kp_name = kp.get("关键点", "")
  164. if kp_name:
  165. if kp_name not in index:
  166. index[kp_name] = {
  167. "关键点": kp_name,
  168. "维度大类": kp.get("维度大类", ""),
  169. "维度细分": kp.get("维度细分", ""),
  170. "描述": kp.get("描述", ""),
  171. "帖子ID列表": []
  172. }
  173. if note_id not in index[kp_name]["帖子ID列表"]:
  174. index[kp_name]["帖子ID列表"].append(note_id)
  175. return index
  176. def fetch_note_details(category_data: Dict[str, Any]) -> Dict[str, Any]:
  177. """获取帖子详情
  178. Args:
  179. category_data: 分类数据(包含帖子ID列表)
  180. Returns:
  181. 更新后的分类数据(包含帖子详情)
  182. """
  183. # 收集所有unique的note_ids
  184. all_note_ids = set()
  185. for category_info in category_data.values():
  186. all_note_ids.update(category_info.get("帖子ID列表", []))
  187. all_note_ids = list(all_note_ids)
  188. print(f"\n{'=' * 80}")
  189. print(f"开始获取帖子详情...")
  190. print(f"{'=' * 80}\n")
  191. print(f"共有 {len(all_note_ids)} 个唯一帖子\n")
  192. # 获取所有帖子详情(缓存到内存)
  193. note_details_cache = {}
  194. for i, note_id in enumerate(all_note_ids, 1):
  195. try:
  196. print(f"[{i}/{len(all_note_ids)}] 获取详情: {note_id}")
  197. detail = get_xiaohongshu_detail(note_id)
  198. note_details_cache[note_id] = detail
  199. except Exception as e:
  200. print(f" ⚠️ 获取失败: {e}")
  201. note_details_cache[note_id] = {
  202. "channel_content_id": note_id,
  203. "error": str(e)
  204. }
  205. print(f"\n✓ 帖子详情获取完成\n")
  206. # 填充详情到每个分类
  207. for category_info in category_data.values():
  208. note_ids = category_info.get("帖子ID列表", [])
  209. category_info["帖子详情列表"] = [
  210. note_details_cache.get(note_id, {"channel_content_id": note_id})
  211. for note_id in note_ids
  212. ]
  213. return category_data
  214. def save_index(index_data: Dict[str, Any], output_file: str):
  215. """保存索引到文件
  216. Args:
  217. index_data: 索引数据
  218. output_file: 输出文件路径
  219. """
  220. output_dir = os.path.dirname(output_file)
  221. if output_dir:
  222. os.makedirs(output_dir, exist_ok=True)
  223. with open(output_file, 'w', encoding='utf-8') as f:
  224. json.dump(index_data, f, ensure_ascii=False, indent=2)
  225. print(f"✓ 索引已保存: {output_file}")
  226. def print_statistics(index_data: Dict[str, Any]):
  227. """打印统计信息
  228. Args:
  229. index_data: 完整的索引数据
  230. """
  231. print(f"\n{'=' * 80}")
  232. print(f"索引统计信息")
  233. print(f"{'=' * 80}\n")
  234. for index_type, categories in index_data.items():
  235. total_categories = len(categories)
  236. all_note_ids = set()
  237. for cat_info in categories.values():
  238. all_note_ids.update(cat_info.get("帖子ID列表", []))
  239. total_notes = len(all_note_ids)
  240. avg_notes = total_notes / total_categories if total_categories > 0 else 0
  241. print(f"{index_type}:")
  242. print(f" 分类数量: {total_categories}")
  243. print(f" 帖子总数: {total_notes}")
  244. print(f" 平均每分类帖子数: {avg_notes:.1f}\n")
  245. def main():
  246. """主函数"""
  247. parser = argparse.ArgumentParser(
  248. description="构建人设分类的反向索引(灵感+目的+关键点)",
  249. formatter_class=argparse.RawDescriptionHelpFormatter,
  250. epilog="""
  251. 使用示例:
  252. # 基本使用
  253. python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110
  254. # 只构建索引,不获取帖子详情
  255. python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110 --no-details
  256. # 自定义输出文件
  257. python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110 --output custom_index.json
  258. """
  259. )
  260. parser.add_argument(
  261. "--persona-dir",
  262. required=True,
  263. help="人设目录路径(包含人设.json和what解构结果/的目录)"
  264. )
  265. parser.add_argument(
  266. "--output",
  267. default=None,
  268. help="输出文件路径(默认: {persona_dir}/分类索引_完整.json)"
  269. )
  270. parser.add_argument(
  271. "--no-details",
  272. action="store_true",
  273. help="不获取帖子详情(只构建索引结构)"
  274. )
  275. args = parser.parse_args()
  276. persona_dir = args.persona_dir
  277. fetch_details = not args.no_details
  278. # 检查必要文件
  279. persona_file = os.path.join(persona_dir, "人设.json")
  280. what_dir = os.path.join(persona_dir, "what解构结果")
  281. if not os.path.exists(persona_file):
  282. print(f"❌ 错误: 找不到人设文件: {persona_file}")
  283. return
  284. if not os.path.exists(what_dir):
  285. print(f"❌ 错误: 找不到what解构目录: {what_dir}")
  286. return
  287. print(f"{'=' * 80}")
  288. print(f"构建人设分类反向索引(灵感+目的+关键点)")
  289. print(f"{'=' * 80}")
  290. print(f"人设文件: {persona_file}")
  291. print(f"解构目录: {what_dir}")
  292. print(f"获取详情: {'是' if fetch_details else '否'}\n")
  293. # 读取人设数据
  294. with open(persona_file, 'r', encoding='utf-8') as f:
  295. persona_data = json.load(f)
  296. # 构建三种索引
  297. print(f"{'─' * 80}")
  298. print(f"1. 构建灵感分类索引...")
  299. print(f"{'─' * 80}\n")
  300. inspiration_index = build_inspiration_index(persona_data)
  301. print(f"✓ 灵感分类: {len(inspiration_index)} 个分类\n")
  302. print(f"{'─' * 80}")
  303. print(f"2. 构建目的分类索引...")
  304. print(f"{'─' * 80}\n")
  305. purpose_index = build_purpose_index(what_dir)
  306. print(f"✓ 目的分类: {len(purpose_index)} 个分类\n")
  307. print(f"{'─' * 80}")
  308. print(f"3. 构建关键点分类索引...")
  309. print(f"{'─' * 80}\n")
  310. keypoint_index = build_keypoint_index(what_dir)
  311. print(f"✓ 关键点分类: {len(keypoint_index)} 个分类\n")
  312. # 合并为完整索引
  313. full_index = {
  314. "灵感分类": inspiration_index,
  315. "目的分类": purpose_index,
  316. "关键点分类": keypoint_index
  317. }
  318. # 获取帖子详情
  319. if fetch_details:
  320. full_index["灵感分类"] = fetch_note_details(inspiration_index)
  321. full_index["目的分类"] = fetch_note_details(purpose_index)
  322. full_index["关键点分类"] = fetch_note_details(keypoint_index)
  323. # 确定输出文件路径
  324. if args.output:
  325. output_file = args.output
  326. else:
  327. output_file = os.path.join(persona_dir, "分类索引_完整.json")
  328. # 保存索引
  329. save_index(full_index, output_file)
  330. # 打印统计信息
  331. print_statistics(full_index)
  332. if __name__ == "__main__":
  333. main()