search_with_detail.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. #!/usr/bin/env python3
  2. """
  3. 小红书搜索 + 详情补充 - 端到端工具
  4. 先调用搜索API获取笔记列表,再批量调用详情API补充完整信息
  5. """
  6. import json
  7. import os
  8. import argparse
  9. import sys
  10. from datetime import datetime
  11. from typing import List, Dict, Any
  12. # 添加项目根目录到路径
  13. script_dir = os.path.dirname(os.path.abspath(__file__))
  14. project_root = os.path.dirname(os.path.dirname(script_dir))
  15. sys.path.insert(0, project_root)
  16. from script.search.xiaohongshu_search import XiaohongshuSearch
  17. from script.search.xiaohongshu_detail import XiaohongshuDetail
  18. from script.search.enrichment_helper import (
  19. enrich_posts_batch,
  20. create_enriched_summary,
  21. print_enrichment_stats
  22. )
  23. from knowledge_search_traverse import Post, process_note_data
  24. def search_and_enrich(
  25. keyword: str,
  26. content_type: str = "不限",
  27. sort_type: str = "综合",
  28. publish_time: str = "不限",
  29. cursor: str = "",
  30. enable_detail: bool = True,
  31. detail_delay: int = 1,
  32. results_dir: str = None
  33. ) -> tuple[List[Post], str]:
  34. """
  35. 搜索并补充详情的主流程
  36. Args:
  37. keyword: 搜索关键词
  38. content_type: 内容类型
  39. sort_type: 排序方式
  40. publish_time: 发布时间筛选
  41. cursor: 翻页游标
  42. enable_detail: 是否启用详情补充
  43. detail_delay: 详情请求间隔(秒)
  44. results_dir: 结果输出目录
  45. Returns:
  46. (Post对象列表, 保存的文件路径)
  47. """
  48. print("\n" + "=" * 80)
  49. print(f"小红书搜索 + 详情补充工具")
  50. print("=" * 80)
  51. print(f"关键词: {keyword}")
  52. print(f"内容类型: {content_type}")
  53. print(f"排序方式: {sort_type}")
  54. print(f"发布时间: {publish_time}")
  55. print(f"详情补充: {'启用' if enable_detail else '禁用'}")
  56. print("=" * 80 + "\n")
  57. # 1. 执行搜索
  58. print("步骤 1/3: 执行搜索...")
  59. print("-" * 80)
  60. search_client = XiaohongshuSearch(results_dir=results_dir)
  61. search_result = search_client.search(
  62. keyword=keyword,
  63. content_type=content_type,
  64. sort_type=sort_type,
  65. publish_time=publish_time,
  66. cursor=cursor
  67. )
  68. # 解析搜索结果
  69. notes_data = search_result.get("data", {}).get("data", [])
  70. print(f"✓ 搜索完成,获得 {len(notes_data)} 条结果\n")
  71. if not notes_data:
  72. print("未找到任何结果")
  73. return [], ""
  74. # 2. 转换为Post对象
  75. print("步骤 2/3: 解析搜索结果...")
  76. print("-" * 80)
  77. posts: List[Post] = []
  78. for note in notes_data:
  79. try:
  80. post = process_note_data(note)
  81. posts.append(post)
  82. except Exception as e:
  83. print(f" ✗ 解析失败: {e}")
  84. print(f"✓ 成功解析 {len(posts)}/{len(notes_data)} 条结果\n")
  85. # 3. 补充详情(如果启用)
  86. if enable_detail and posts:
  87. print("步骤 3/3: 补充详情信息...")
  88. print("-" * 80)
  89. detail_client = XiaohongshuDetail(results_dir=results_dir)
  90. success, fail = enrich_posts_batch(
  91. posts,
  92. detail_client,
  93. show_progress=True,
  94. delay=detail_delay
  95. )
  96. print(f"\n✓ 详情补充完成: 成功 {success}/{len(posts)}, 失败 {fail}")
  97. print_enrichment_stats(posts)
  98. else:
  99. print("步骤 3/3: 跳过详情补充\n")
  100. # 4. 保存结果
  101. filepath = save_enriched_results(keyword, posts, search_result, results_dir)
  102. return posts, filepath
  103. def save_enriched_results(
  104. keyword: str,
  105. posts: List[Post],
  106. search_result: Dict[str, Any],
  107. results_dir: str = None
  108. ) -> str:
  109. """
  110. 保存增强后的结果
  111. Args:
  112. keyword: 搜索关键词
  113. posts: Post对象列表
  114. search_result: 原始搜索结果
  115. results_dir: 结果输出目录
  116. Returns:
  117. 保存的文件路径
  118. """
  119. # 确定输出目录
  120. if results_dir:
  121. base_dir = results_dir
  122. else:
  123. script_dir = os.path.dirname(os.path.abspath(__file__))
  124. project_root = os.path.dirname(os.path.dirname(script_dir))
  125. base_dir = os.path.join(project_root, "data", "search")
  126. # 创建目录
  127. result_dir = os.path.join(base_dir, "enriched", keyword)
  128. os.makedirs(result_dir, exist_ok=True)
  129. # 构建结果数据
  130. enriched_data = {
  131. "metadata": {
  132. "keyword": keyword,
  133. "search_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
  134. "total_posts": len(posts),
  135. "enriched_posts": sum(1 for p in posts if p.detail_fetched),
  136. "video_posts": sum(1 for p in posts if p.type == "video"),
  137. "image_posts": sum(1 for p in posts if p.type != "video"),
  138. },
  139. "posts": [create_enriched_summary(p) for p in posts],
  140. "original_search_result": search_result # 保留原始搜索结果供参考
  141. }
  142. # 保存文件
  143. timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  144. filename = f"{timestamp}_enriched.json"
  145. filepath = os.path.join(result_dir, filename)
  146. with open(filepath, 'w', encoding='utf-8') as f:
  147. json.dump(enriched_data, f, ensure_ascii=False, indent=2)
  148. print(f"\n✓ 结果已保存: {filepath}\n")
  149. return filepath
  150. def main():
  151. """命令行入口"""
  152. parser = argparse.ArgumentParser(
  153. description='小红书搜索 + 详情补充工具',
  154. formatter_class=argparse.RawDescriptionHelpFormatter,
  155. epilog="""
  156. 使用示例:
  157. # 基础搜索并补充详情
  158. python3 search_with_detail.py --keyword "健身教程"
  159. # 搜索视频内容
  160. python3 search_with_detail.py --keyword "化妆教程" --content-type "视频"
  161. # 仅搜索不补充详情
  162. python3 search_with_detail.py --keyword "美食" --no-detail
  163. # 自定义输出目录
  164. python3 search_with_detail.py --keyword "旅游" --results-dir "custom/output"
  165. """
  166. )
  167. # 搜索参数
  168. parser.add_argument(
  169. '--keyword',
  170. type=str,
  171. required=True,
  172. help='搜索关键词(必填)'
  173. )
  174. parser.add_argument(
  175. '--content-type',
  176. type=str,
  177. default='不限',
  178. choices=['不限', '视频', '图文'],
  179. help='内容类型(默认: 不限)'
  180. )
  181. parser.add_argument(
  182. '--sort-type',
  183. type=str,
  184. default='综合',
  185. choices=['综合', '最新', '最多点赞', '最多评论'],
  186. help='排序方式(默认: 综合)'
  187. )
  188. parser.add_argument(
  189. '--publish-time',
  190. type=str,
  191. default='不限',
  192. choices=['不限', '一天内', '一周内', '半年内'],
  193. help='发布时间筛选(默认: 不限)'
  194. )
  195. parser.add_argument(
  196. '--cursor',
  197. type=str,
  198. default='',
  199. help='翻页游标(默认为空,即第一页)'
  200. )
  201. # 详情补充参数
  202. parser.add_argument(
  203. '--no-detail',
  204. action='store_true',
  205. help='禁用详情补充(仅搜索)'
  206. )
  207. parser.add_argument(
  208. '--detail-delay',
  209. type=int,
  210. default=1,
  211. help='详情请求间隔时间(秒),默认1秒'
  212. )
  213. # 输出参数
  214. parser.add_argument(
  215. '--results-dir',
  216. type=str,
  217. default=None,
  218. help='结果输出目录(默认: data/search)'
  219. )
  220. args = parser.parse_args()
  221. # 执行搜索和补充
  222. try:
  223. posts, filepath = search_and_enrich(
  224. keyword=args.keyword,
  225. content_type=args.content_type,
  226. sort_type=args.sort_type,
  227. publish_time=args.publish_time,
  228. cursor=args.cursor,
  229. enable_detail=not args.no_detail,
  230. detail_delay=args.detail_delay,
  231. results_dir=args.results_dir
  232. )
  233. # 打印摘要
  234. print("=" * 80)
  235. print("执行完成")
  236. print("=" * 80)
  237. print(f"关键词: {args.keyword}")
  238. print(f"获得帖子: {len(posts)} 条")
  239. if not args.no_detail:
  240. enriched = sum(1 for p in posts if p.detail_fetched)
  241. print(f"详情补充: {enriched}/{len(posts)} 条")
  242. print(f"结果文件: {filepath}")
  243. print("=" * 80)
  244. except Exception as e:
  245. print(f"\n✗ 执行失败: {e}", file=sys.stderr)
  246. import traceback
  247. traceback.print_exc()
  248. sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()