| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292 |
- #!/usr/bin/env python3
- """
- 小红书搜索 + 详情补充 - 端到端工具
- 先调用搜索API获取笔记列表,再批量调用详情API补充完整信息
- """
- import json
- import os
- import argparse
- import sys
- from datetime import datetime
- from typing import List, Dict, Any
- # 添加项目根目录到路径
- script_dir = os.path.dirname(os.path.abspath(__file__))
- project_root = os.path.dirname(os.path.dirname(script_dir))
- sys.path.insert(0, project_root)
- from script.search.xiaohongshu_search import XiaohongshuSearch
- from script.search.xiaohongshu_detail import XiaohongshuDetail
- from script.search.enrichment_helper import (
- enrich_posts_batch,
- create_enriched_summary,
- print_enrichment_stats
- )
- from knowledge_search_traverse import Post, process_note_data
def search_and_enrich(
    keyword: str,
    content_type: str = "不限",
    sort_type: str = "综合",
    publish_time: str = "不限",
    cursor: str = "",
    enable_detail: bool = True,
    detail_delay: int = 1,
    results_dir: str = None
) -> tuple[List[Post], str]:
    """
    Search for notes, optionally enrich them with detail data, and save them.

    The flow is: run the search through ``XiaohongshuSearch``, convert every
    returned note with ``process_note_data``, optionally pass the parsed posts
    to ``enrich_posts_batch``, then persist everything with
    ``save_enriched_results``.

    Args:
        keyword: Search keyword.
        content_type: Content type filter, forwarded to the search client.
        sort_type: Sort order, forwarded to the search client.
        publish_time: Publish-time filter, forwarded to the search client.
        cursor: Pagination cursor, forwarded to the search client.
        enable_detail: Whether to run the detail-enrichment step.
        detail_delay: Interval between detail requests (seconds), forwarded
            to ``enrich_posts_batch`` as ``delay``.
        results_dir: Output directory, forwarded to both clients and to
            ``save_enriched_results``.

    Returns:
        A ``(posts, filepath)`` tuple. When the search yields no notes the
        result is ``([], "")`` and nothing is saved.
    """
    divider = "=" * 80
    step_rule = "-" * 80

    print("\n" + divider)
    print("小红书搜索 + 详情补充工具")
    print(divider)
    print(f"关键词: {keyword}")
    print(f"内容类型: {content_type}")
    print(f"排序方式: {sort_type}")
    print(f"发布时间: {publish_time}")
    print(f"详情补充: {'启用' if enable_detail else '禁用'}")
    print(divider + "\n")

    # Step 1: run the search.
    print("步骤 1/3: 执行搜索...")
    print(step_rule)

    search_client = XiaohongshuSearch(results_dir=results_dir)
    search_result = search_client.search(
        keyword=keyword,
        content_type=content_type,
        sort_type=sort_type,
        publish_time=publish_time,
        cursor=cursor
    )

    # The notes are read from search_result["data"]["data"]. A missing key was
    # already treated as "no results"; an explicit null at either level must
    # behave the same way instead of raising AttributeError/TypeError.
    payload = search_result.get("data") or {}
    notes_data = payload.get("data") or []

    print(f"✓ 搜索完成,获得 {len(notes_data)} 条结果\n")

    if not notes_data:
        print("未找到任何结果")
        return [], ""

    # Step 2: convert the raw notes into Post objects.
    print("步骤 2/3: 解析搜索结果...")
    print(step_rule)

    posts: List[Post] = []
    for note in notes_data:
        # Best effort: one malformed note must not abort the whole batch, so
        # the failure is reported and the note is skipped.
        try:
            posts.append(process_note_data(note))
        except Exception as e:
            print(f"  ✗ 解析失败: {e}")

    print(f"✓ 成功解析 {len(posts)}/{len(notes_data)} 条结果\n")

    # Step 3: enrich with detail data (only when enabled and there is
    # something to enrich).
    if enable_detail and posts:
        print("步骤 3/3: 补充详情信息...")
        print(step_rule)

        detail_client = XiaohongshuDetail(results_dir=results_dir)
        success, fail = enrich_posts_batch(
            posts,
            detail_client,
            show_progress=True,
            delay=detail_delay
        )

        print(f"\n✓ 详情补充完成: 成功 {success}/{len(posts)}, 失败 {fail}")
        print_enrichment_stats(posts)
    else:
        print("步骤 3/3: 跳过详情补充\n")

    # Step 4: persist the posts together with the raw search response.
    filepath = save_enriched_results(keyword, posts, search_result, results_dir)

    return posts, filepath
def _keyword_to_dirname(keyword: str) -> str:
    """Map a free-form keyword to a single, safe directory name.

    Path separators, characters reserved on common filesystems and control
    characters are replaced with ``_``; surrounding spaces and dots are
    stripped so the result can never be ``.`` or ``..``. Ordinary keywords
    are returned unchanged. An empty result falls back to ``_``.
    """
    unsafe = set('<>:"/\\|?*')
    cleaned = "".join(
        "_" if (ch in unsafe or ord(ch) < 32) else ch
        for ch in keyword
    ).strip(" .")
    return cleaned or "_"


def save_enriched_results(
    keyword: str,
    posts: List[Post],
    search_result: Dict[str, Any],
    results_dir: str = None
) -> str:
    """
    Write the enriched posts and the raw search response to a JSON file.

    The file is created at
    ``<base>/enriched/<keyword directory>/<YYYYmmdd_HHMMSS>_enriched.json``
    where ``<base>`` is ``results_dir`` when given, otherwise
    ``<project root>/data/search`` (project root = two levels above this
    file's directory).

    Args:
        keyword: Search keyword. Stored verbatim in the metadata; a sanitised
            form is used for the directory name so the keyword cannot create
            nested directories or escape the output root.
        posts: Post objects to save.
        search_result: Raw search response, stored under
            ``original_search_result`` for reference.
        results_dir: Base output directory; falsy means use the default.

    Returns:
        Path of the file that was written.
    """
    # Resolve the base output directory.
    if results_dir:
        base_dir = results_dir
    else:
        here = os.path.dirname(os.path.abspath(__file__))
        repo_root = os.path.dirname(os.path.dirname(here))
        base_dir = os.path.join(repo_root, "data", "search")

    # The keyword is user input, so it is never used as a path component as-is.
    result_dir = os.path.join(base_dir, "enriched", _keyword_to_dirname(keyword))
    os.makedirs(result_dir, exist_ok=True)

    # Sample the clock once so the metadata timestamp and the filename agree.
    now = datetime.now()
    video_count = sum(1 for p in posts if p.type == "video")

    enriched_data = {
        "metadata": {
            "keyword": keyword,
            "search_time": now.strftime("%Y-%m-%d %H:%M:%S"),
            "total_posts": len(posts),
            "enriched_posts": sum(1 for p in posts if p.detail_fetched),
            "video_posts": video_count,
            # Everything that is not a video is counted as an image post.
            "image_posts": len(posts) - video_count,
        },
        "posts": [create_enriched_summary(p) for p in posts],
        "original_search_result": search_result  # kept for reference
    }

    filename = f"{now.strftime('%Y%m%d_%H%M%S')}_enriched.json"
    filepath = os.path.join(result_dir, filename)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(enriched_data, f, ensure_ascii=False, indent=2)

    print(f"\n✓ 结果已保存: {filepath}\n")

    return filepath
def main():
    """Command-line entry point.

    Parses the command-line options, runs ``search_and_enrich`` with them and
    prints a short summary. Any exception raised along the way is reported on
    stderr together with its traceback, and the process exits with status 1.
    """
    usage_examples = """
使用示例:
# 基础搜索并补充详情
python3 search_with_detail.py --keyword "健身教程"
# 搜索视频内容
python3 search_with_detail.py --keyword "化妆教程" --content-type "视频"
# 仅搜索不补充详情
python3 search_with_detail.py --keyword "美食" --no-detail
# 自定义输出目录
python3 search_with_detail.py --keyword "旅游" --results-dir "custom/output"
"""
    parser = argparse.ArgumentParser(
        description='小红书搜索 + 详情补充工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples
    )

    # Options are declared as data and registered in order: search filters
    # first, then the detail-enrichment switches, then the output location.
    option_specs = [
        ('--keyword', dict(
            type=str,
            required=True,
            help='搜索关键词(必填)',
        )),
        ('--content-type', dict(
            type=str,
            default='不限',
            choices=['不限', '视频', '图文'],
            help='内容类型(默认: 不限)',
        )),
        ('--sort-type', dict(
            type=str,
            default='综合',
            choices=['综合', '最新', '最多点赞', '最多评论'],
            help='排序方式(默认: 综合)',
        )),
        ('--publish-time', dict(
            type=str,
            default='不限',
            choices=['不限', '一天内', '一周内', '半年内'],
            help='发布时间筛选(默认: 不限)',
        )),
        ('--cursor', dict(
            type=str,
            default='',
            help='翻页游标(默认为空,即第一页)',
        )),
        ('--no-detail', dict(
            action='store_true',
            help='禁用详情补充(仅搜索)',
        )),
        ('--detail-delay', dict(
            type=int,
            default=1,
            help='详情请求间隔时间(秒),默认1秒',
        )),
        ('--results-dir', dict(
            type=str,
            default=None,
            help='结果输出目录(默认: data/search)',
        )),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    detail_enabled = not args.no_detail
    banner = "=" * 80

    try:
        posts, filepath = search_and_enrich(
            keyword=args.keyword,
            content_type=args.content_type,
            sort_type=args.sort_type,
            publish_time=args.publish_time,
            cursor=args.cursor,
            enable_detail=detail_enabled,
            detail_delay=args.detail_delay,
            results_dir=args.results_dir
        )

        # Summary of the run.
        total = len(posts)
        print(banner)
        print("执行完成")
        print(banner)
        print(f"关键词: {args.keyword}")
        print(f"获得帖子: {total} 条")
        if detail_enabled:
            enriched = len([p for p in posts if p.detail_fetched])
            print(f"详情补充: {enriched}/{total} 条")
        print(f"结果文件: {filepath}")
        print(banner)

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"\n✗ 执行失败: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
|