| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217 |
- #!/usr/bin/env python3
- """
- 小红书笔记搜索工具
- 根据关键词搜索小红书笔记,支持多种筛选条件
- """
- import requests
- import json
- import os
- import argparse
- from datetime import datetime
- from typing import Dict, Any
class XiaohongshuSearch:
    """Client for the Xiaohongshu note-search tool endpoint.

    Sends search requests to ``{BASE_URL}/tools/call/{TOOL_NAME}``, flattens
    the ``image_list`` entries of each returned note into plain URL strings,
    and can persist responses as JSON files under a results directory.
    """

    BASE_URL = "http://47.84.182.56:8001"
    TOOL_NAME = "xhs_note_search"
    # Platform identifier; not referenced by the methods of this class.
    PLATFORM = "xiaohongshu"

    def __init__(self, results_dir: str = None):
        """Initialise the API client.

        Args:
            results_dir: Base directory for saved results. When empty or
                ``None``, ``<project_root>/data/search`` is used, where the
                project root is taken to be two directory levels above the
                directory containing this script.
        """
        self.api_url = f"{self.BASE_URL}/tools/call/{self.TOOL_NAME}"

        if results_dir:
            self.results_base_dir = results_dir
        else:
            script_dir = os.path.dirname(os.path.abspath(__file__))
            project_root = os.path.dirname(os.path.dirname(script_dir))
            self.results_base_dir = os.path.join(project_root, "data", "search")

    def search(
        self,
        keyword: str,
        content_type: str = "不限",
        sort_type: str = "综合",
        publish_time: str = "不限",
        cursor: str = "",
        timeout: int = 30
    ) -> Dict[str, Any]:
        """Search Xiaohongshu notes by keyword.

        Args:
            keyword: Search keyword.
            content_type: Content type filter; one of '不限' (any),
                '视频' (video), '图文' (image/text). Defaults to '不限'.
            sort_type: Sort order; one of '综合' (general), '最新' (latest),
                '最多点赞' (most liked), '最多评论' (most commented).
                Defaults to '综合'.
            publish_time: Publish-time filter; one of '不限' (any),
                '一天内' (within a day), '一周内' (within a week),
                '半年内' (within half a year). Defaults to '不限'.
            cursor: Pagination cursor. Empty for the first page; the cursor
                for the next page is taken from the previous response.
            timeout: Request timeout in seconds. Defaults to 30.

        Returns:
            The decoded JSON response, with every note's ``image_list``
            already converted to a list of URL strings.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                the server answers with an HTTP error status. The error is
                printed before being re-raised.
        """
        payload = {
            "keyword": keyword,
            "content_type": content_type,
            "sort_type": sort_type,
            "publish_time": publish_time,
            "cursor": cursor
        }

        try:
            response = requests.post(
                self.api_url,
                json=payload,
                timeout=timeout,
                headers={"Content-Type": "application/json"}
            )
            response.raise_for_status()
            result = response.json()
            # Flatten image_list entries to URL strings before returning.
            self._preprocess_response(result)
            return result
        except requests.exceptions.RequestException as e:
            print(f"请求失败: {e}")
            raise

    def _preprocess_response(self, result: Dict[str, Any]) -> None:
        """Convert each note's ``image_list`` into a list of URL strings.

        The notes are read from ``result["data"]["data"]``. For every note
        whose ``note_card`` is a dict, ``image_list`` is rewritten in place:
        dict entries contribute their ``image_url`` value, string entries are
        kept as-is, and anything else is dropped. A missing or ``None``
        ``image_list`` becomes an empty list.

        Missing, ``None`` or unexpectedly typed containers are skipped
        instead of raising, so a partially populated response does not
        abort the whole search.

        Args:
            result: Raw response dictionary; modified in place.
        """
        if not isinstance(result, dict):
            return
        data = result.get("data")
        if not isinstance(data, dict):
            return
        notes = data.get("data")
        if not isinstance(notes, list):
            return

        for note in notes:
            if not isinstance(note, dict):
                continue
            note_card = note.get("note_card")
            if not isinstance(note_card, dict):
                continue

            image_list_raw = note_card.get("image_list")
            if image_list_raw is None:
                image_list_raw = []
            elif not isinstance(image_list_raw, list):
                # Unknown shape: leave the value untouched rather than guess.
                continue

            image_list = []
            for img in image_list_raw:
                if isinstance(img, dict) and "image_url" in img:
                    image_list.append(img["image_url"])
                elif isinstance(img, str):
                    image_list.append(img)
            note_card["image_list"] = image_list

    @staticmethod
    def _safe_dir_name(name: str) -> str:
        """Return *name* made safe for use as a single directory name.

        Path separators (``/`` and ``\\``) and NUL are replaced by ``_`` so
        the name cannot add path levels or turn into an absolute path. A
        leading drive specifier (as recognised by ``os.path.splitdrive``)
        has its ``:`` replaced as well. Names that would resolve to the
        parent or current directory (``..``, ``.``, empty) become ``_``.
        Names without any of these features are returned unchanged.
        """
        cleaned = "".join("_" if ch in "/\\\0" else ch for ch in name)
        if os.path.splitdrive(cleaned)[0]:
            cleaned = cleaned.replace(":", "_")
        if cleaned in ("", ".", ".."):
            return "_"
        return cleaned

    def save_result(self, keyword: str, result: Dict[str, Any], page: int = 1) -> str:
        """Write a search result to a JSON file and return its path.

        The file is created at
        ``<results_base_dir>/xiaohongshu_search/<keyword>/<timestamp>_page<page>.json``
        where the timestamp has the form ``YYYYMMDD_HHMMSS``. The keyword is
        passed through ``_safe_dir_name`` first, so it always maps to exactly
        one directory below ``xiaohongshu_search``.

        Args:
            keyword: Search keyword; used as the directory name.
            result: Response data to serialise.
            page: Page number, used only in the file name.

        Returns:
            Path of the file that was written.
        """
        result_dir = os.path.join(
            self.results_base_dir,
            "xiaohongshu_search",
            self._safe_dir_name(keyword),
        )
        os.makedirs(result_dir, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{timestamp}_page{page}.json"
        filepath = os.path.join(result_dir, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        return filepath
def main():
    """Command-line entry point: run one search and save the response.

    Parses the command-line options, performs a single search through
    ``XiaohongshuSearch`` and writes the JSON response below the results
    directory. On success the saved path is printed to stdout as
    ``Output: <path>``. On failure ``Error: <message>`` is printed to stderr
    and the process exits with status 1 so callers can detect the error.
    """
    # Local import: `sys` is not part of this file's module-level imports.
    import sys

    parser = argparse.ArgumentParser(description='小红书笔记搜索工具')
    # NOTE: this default is a non-empty string, so the client's own
    # project-root fallback is never used when running from the CLI; the
    # path is interpreted relative to the current working directory.
    parser.add_argument(
        '--results-dir',
        type=str,
        default='data/search',
        help='结果输出目录 (默认: data/search)'
    )
    parser.add_argument(
        '--keyword',
        type=str,
        required=True,
        help='搜索关键词 (必填)'
    )
    parser.add_argument(
        '--content-type',
        type=str,
        default='不限',
        choices=['不限', '视频', '图文'],
        help='内容类型 (默认: 不限)'
    )
    parser.add_argument(
        '--sort-type',
        type=str,
        default='综合',
        choices=['综合', '最新', '最多点赞', '最多评论'],
        help='排序方式 (默认: 综合)'
    )
    parser.add_argument(
        '--publish-time',
        type=str,
        default='不限',
        choices=['不限', '一天内', '一周内', '半年内'],
        help='发布时间筛选 (默认: 不限)'
    )
    parser.add_argument(
        '--cursor',
        type=str,
        default='',
        help='翻页游标 (默认为空,即第一页)'
    )
    # --page only labels the output file; pagination itself is driven by
    # --cursor.
    parser.add_argument(
        '--page',
        type=int,
        default=1,
        help='页码标识,用于保存文件名 (默认: 1)'
    )
    args = parser.parse_args()

    client = XiaohongshuSearch(results_dir=args.results_dir)

    try:
        result = client.search(
            keyword=args.keyword,
            content_type=args.content_type,
            sort_type=args.sort_type,
            publish_time=args.publish_time,
            cursor=args.cursor,
        )
        filepath = client.save_result(args.keyword, result, args.page)
    except Exception as e:
        # Top-level boundary: report the failure and signal it through the
        # exit status instead of finishing with status 0.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"Output: {filepath}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|