#!/usr/bin/env python3
"""
Xiaohongshu search + detail enrichment - end-to-end tool.

First calls the search API to get a list of notes, then batch-calls the
detail API to fill in the full information for each note.
"""

import json
import os
import argparse
import sys
from datetime import datetime
from typing import List, Dict, Any, Optional

# Add the project root to the import path
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(os.path.dirname(script_dir))
sys.path.insert(0, project_root)

from script.search.xiaohongshu_search import XiaohongshuSearch
from script.search.xiaohongshu_detail import XiaohongshuDetail
from script.search.enrichment_helper import (
    enrich_posts_batch,
    create_enriched_summary,
    print_enrichment_stats
)
from knowledge_search_traverse import Post, process_note_data


def search_and_enrich(
    keyword: str,
    content_type: str = "不限",
    sort_type: str = "综合",
    publish_time: str = "不限",
    cursor: str = "",
    enable_detail: bool = True,
    detail_delay: int = 1,
    results_dir: Optional[str] = None
) -> tuple[List[Post], str]:
    """
    Main flow: search, then enrich the results with details.

    Args:
        keyword: Search keyword
        content_type: Content type filter
        sort_type: Sort order
        publish_time: Publish-time filter
        cursor: Pagination cursor
        enable_detail: Whether to enrich results with per-note details
        detail_delay: Delay between detail requests (seconds)
        results_dir: Output directory for results

    Returns:
        (list of Post objects, path of the saved file)
    """
    print("\n" + "=" * 80)
    print("Xiaohongshu search + detail enrichment tool")
    print("=" * 80)
    print(f"Keyword: {keyword}")
    print(f"Content type: {content_type}")
    print(f"Sort order: {sort_type}")
    print(f"Publish time: {publish_time}")
    print(f"Detail enrichment: {'enabled' if enable_detail else 'disabled'}")
    print("=" * 80 + "\n")

    # 1. Run the search
    print("Step 1/3: Running search...")
    print("-" * 80)
    search_client = XiaohongshuSearch(results_dir=results_dir)
    search_result = search_client.search(
        keyword=keyword,
        content_type=content_type,
        sort_type=sort_type,
        publish_time=publish_time,
        cursor=cursor
    )

    # Parse the search response
    notes_data = search_result.get("data", {}).get("data", [])
    print(f"✓ Search finished, got {len(notes_data)} results\n")

    if not notes_data:
        print("No results found")
        return [], ""

    # 2. Convert raw notes into Post objects
    print("Step 2/3: Parsing search results...")
    print("-" * 80)
    posts: List[Post] = []
    for note in notes_data:
        try:
            post = process_note_data(note)
            posts.append(post)
        except Exception as e:
            print(f"  ✗ Failed to parse note: {e}")
    print(f"✓ Parsed {len(posts)}/{len(notes_data)} results\n")

    # 3. Enrich with details (if enabled)
    if enable_detail and posts:
        print("Step 3/3: Enriching with details...")
        print("-" * 80)
        detail_client = XiaohongshuDetail(results_dir=results_dir)
        success, fail = enrich_posts_batch(
            posts,
            detail_client,
            show_progress=True,
            delay=detail_delay
        )
        print(f"\n✓ Detail enrichment finished: {success}/{len(posts)} succeeded, {fail} failed")
        print_enrichment_stats(posts)
    else:
        print("Step 3/3: Skipping detail enrichment\n")

    # 4. Save the results
    filepath = save_enriched_results(keyword, posts, search_result, results_dir)

    return posts, filepath


def save_enriched_results(
    keyword: str,
    posts: List[Post],
    search_result: Dict[str, Any],
    results_dir: Optional[str] = None
) -> str:
    """
    Save the enriched results to a JSON file.

    Args:
        keyword: Search keyword
        posts: List of Post objects
        search_result: Raw search response
        results_dir: Output directory for results

    Returns:
        Path of the saved file
    """
    # Determine the output directory
    if results_dir:
        base_dir = results_dir
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        project_root = os.path.dirname(os.path.dirname(script_dir))
        base_dir = os.path.join(project_root, "data", "search")

    # Create the directory
    result_dir = os.path.join(base_dir, "enriched", keyword)
    os.makedirs(result_dir, exist_ok=True)

    # Build the result payload
    enriched_data = {
        "metadata": {
            "keyword": keyword,
            "search_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "total_posts": len(posts),
            "enriched_posts": sum(1 for p in posts if p.detail_fetched),
            "video_posts": sum(1 for p in posts if p.type == "video"),
            "image_posts": sum(1 for p in posts if p.type != "video"),
        },
        "posts": [create_enriched_summary(p) for p in posts],
        "original_search_result": search_result  # keep the raw search response for reference
    }

    # Write the file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{timestamp}_enriched.json"
    filepath = os.path.join(result_dir, filename)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(enriched_data, f, ensure_ascii=False, indent=2)

    print(f"\n✓ Results saved to: {filepath}\n")

    return filepath


def main():
    """Command-line entry point."""
    parser = argparse.ArgumentParser(
        description='Xiaohongshu search + detail enrichment tool',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic search with detail enrichment
  python3 search_with_detail.py --keyword "健身教程"

  # Search for video content
  python3 search_with_detail.py --keyword "化妆教程" --content-type "视频"

  # Search only, without detail enrichment
  python3 search_with_detail.py --keyword "美食" --no-detail

  # Custom output directory
  python3 search_with_detail.py --keyword "旅游" --results-dir "custom/output"
        """
    )

    # Search arguments
    parser.add_argument(
        '--keyword',
        type=str,
        required=True,
        help='Search keyword (required)'
    )
    parser.add_argument(
        '--content-type',
        type=str,
        default='不限',
        choices=['不限', '视频', '图文'],
        help='Content type (default: 不限)'
    )
    parser.add_argument(
        '--sort-type',
        type=str,
        default='综合',
        choices=['综合', '最新', '最多点赞', '最多评论'],
        help='Sort order (default: 综合)'
    )
    parser.add_argument(
        '--publish-time',
        type=str,
        default='不限',
        choices=['不限', '一天内', '一周内', '半年内'],
        help='Publish-time filter (default: 不限)'
    )
    parser.add_argument(
        '--cursor',
        type=str,
        default='',
        help='Pagination cursor (default: empty, i.e. first page)'
    )

    # Detail-enrichment arguments
    parser.add_argument(
        '--no-detail',
        action='store_true',
        help='Disable detail enrichment (search only)'
    )
    parser.add_argument(
        '--detail-delay',
        type=int,
        default=1,
        help='Delay between detail requests in seconds (default: 1)'
    )

    # Output arguments
    parser.add_argument(
        '--results-dir',
        type=str,
        default=None,
        help='Output directory for results (default: data/search)'
    )

    args = parser.parse_args()

    # Run search and enrichment
    try:
        posts, filepath = search_and_enrich(
            keyword=args.keyword,
            content_type=args.content_type,
            sort_type=args.sort_type,
            publish_time=args.publish_time,
            cursor=args.cursor,
            enable_detail=not args.no_detail,
            detail_delay=args.detail_delay,
            results_dir=args.results_dir
        )

        # Print a summary
        print("=" * 80)
        print("Done")
        print("=" * 80)
        print(f"Keyword: {args.keyword}")
        print(f"Posts retrieved: {len(posts)}")
        if not args.no_detail:
            enriched = sum(1 for p in posts if p.detail_fetched)
            print(f"Details enriched: {enriched}/{len(posts)}")
        print(f"Result file: {filepath}")
        print("=" * 80)

    except Exception as e:
        print(f"\n✗ Execution failed: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
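# ---------------------------------------------------------------------------
# Illustrative programmatic usage (a sketch, not part of the tool itself):
# it assumes this file is importable as `search_with_detail` and that the
# package layout matches the imports at the top of this script.
#
#   from search_with_detail import search_and_enrich
#
#   posts, filepath = search_and_enrich(
#       keyword="健身教程",   # search keyword
#       content_type="视频",  # restrict to video notes
#       enable_detail=True,    # also fetch per-note details
#       detail_delay=2,        # wait 2 seconds between detail requests
#   )
#   print(f"{len(posts)} posts saved to {filepath}")
# ---------------------------------------------------------------------------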