#!/usr/bin/env python3
# yangxiaohui/kg_agent
"""
小红书笔记搜索工具
根据关键词搜索小红书笔记，支持多种筛选条件
"""

import requests
import json
import os
import argparse
from datetime import datetime
from typing import Dict, Any


class XiaohongshuSearch:
    """Client wrapper around the Xiaohongshu note-search tool endpoint.

    The client POSTs search parameters to ``{BASE_URL}/tools/call/{TOOL_NAME}``,
    flattens each note's ``image_list`` into a list of URL strings, and can
    persist the response as a JSON file under a per-keyword directory.
    """

    BASE_URL = "http://47.84.182.56:8001"
    TOOL_NAME = "xhs_note_search"
    PLATFORM = "xiaohongshu"

    def __init__(self, results_dir: str = None):
        """
        Initialise the API client.

        Args:
            results_dir: Directory where results are written. When omitted
                (or empty), ``data/search`` under the directory two levels
                above this file's directory is used.
        """
        self.api_url = f"{self.BASE_URL}/tools/call/{self.TOOL_NAME}"

        if results_dir:
            self.results_base_dir = results_dir
        else:
            # Default: <two levels above this file's directory>/data/search
            script_dir = os.path.dirname(os.path.abspath(__file__))
            project_root = os.path.dirname(os.path.dirname(script_dir))
            self.results_base_dir = os.path.join(project_root, "data", "search")

    def search(
        self,
        keyword: str,
        content_type: str = "不限",
        sort_type: str = "综合",
        publish_time: str = "不限",
        cursor: str = "",
        timeout: int = 30
    ) -> Dict[str, Any]:
        """
        Search Xiaohongshu notes.

        Args:
            keyword: Search keyword.
            content_type: Content type filter; one of ``不限``, ``视频``,
                ``图文``. Defaults to ``不限``.
            sort_type: Sort order; one of ``综合``, ``最新``, ``最多点赞``,
                ``最多评论``. Defaults to ``综合``.
            publish_time: Publish-time filter; one of ``不限``, ``一天内``,
                ``一周内``, ``半年内``. Defaults to ``不限``.
            cursor: Pagination cursor. Empty for the first page; the cursor
                for the next page is taken from the previous response.
            timeout: Request timeout in seconds. Defaults to 30.

        Returns:
            The decoded JSON response, with each note's ``image_list``
            already flattened to a list of URL strings.

        Raises:
            requests.exceptions.RequestException: If the request fails or the
                server answers with an error status. The error is printed
                before being re-raised.
        """
        payload = {
            "keyword": keyword,
            "content_type": content_type,
            "sort_type": sort_type,
            "publish_time": publish_time,
            "cursor": cursor
        }

        try:
            response = requests.post(
                self.api_url,
                json=payload,
                timeout=timeout,
                headers={"Content-Type": "application/json"}
            )
            response.raise_for_status()
            result = response.json()

            # Normalise image_list entries to plain URL strings (in place).
            self._preprocess_response(result)

            return result
        except requests.exceptions.RequestException as e:
            print(f"请求失败: {e}")
            raise

    def _preprocess_response(self, result: Dict[str, Any]) -> None:
        """
        Flatten every note's ``image_list`` into a list of URL strings.

        Dict entries contribute their ``image_url`` value, string entries are
        kept unchanged, and anything else is dropped. The result is modified
        in place. Responses that do not have the expected
        ``result["data"]["data"]`` list (for example an error payload with
        ``"data": null``) are left untouched instead of raising.

        Args:
            result: Raw response dictionary returned by the API.
        """
        if not isinstance(result, dict):
            return
        data = result.get("data")
        notes = data.get("data") if isinstance(data, dict) else None
        if not isinstance(notes, list):
            return

        for note in notes:
            if not isinstance(note, dict):
                continue
            note_card = note.get("note_card")
            if not isinstance(note_card, dict):
                # Missing or null note_card: nothing to normalise.
                continue

            raw_images = note_card.get("image_list")
            if raw_images is None:
                raw_images = []
            elif not isinstance(raw_images, (list, tuple)):
                # Unexpected shape: leave this note as the API returned it.
                continue

            urls = []
            for img in raw_images:
                if isinstance(img, dict) and "image_url" in img:
                    urls.append(img["image_url"])
                elif isinstance(img, str):
                    # Already a URL string.
                    urls.append(img)

            note_card["image_list"] = urls

    @staticmethod
    def _safe_dir_name(keyword: str) -> str:
        """Turn a keyword into a single path component.

        Path separators (and NUL) are replaced with ``_`` and the special
        names ``""``, ``.`` and ``..`` map to ``_``, so the keyword can never
        address a directory outside its parent. Other keywords are unchanged.
        """
        name = keyword
        for sep in (os.sep, os.altsep, "\0"):
            if sep:  # os.altsep is None on POSIX
                name = name.replace(sep, "_")
        if name in ("", ".", ".."):
            return "_"
        return name

    def save_result(self, keyword: str, result: Dict[str, Any], page: int = 1) -> str:
        """
        Save a search result to a JSON file.

        Layout:
        ``<results_base_dir>/xiaohongshu_search/<keyword>/<timestamp>_page<page>.json``
        where ``<keyword>`` is the keyword reduced to a single safe path
        component (see ``_safe_dir_name``).

        Args:
            keyword: Search keyword, used as the directory name.
            result: Result returned by the API.
            page: Page number used in the file name.

        Returns:
            Path of the file that was written.
        """
        result_dir = os.path.join(
            self.results_base_dir,
            "xiaohongshu_search",
            self._safe_dir_name(keyword),
        )
        os.makedirs(result_dir, exist_ok=True)

        # File name is the local timestamp plus the page number.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{timestamp}_page{page}.json"
        filepath = os.path.join(result_dir, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        return filepath


def main():
    """Command-line entry point: run one search and save the result.

    Parses the command-line options, performs a single search request and
    writes the response to a JSON file. On success the saved path is printed
    as ``Output: <path>``. On failure the error is printed to stderr and the
    process exits with status 1 so that calling scripts can detect it.
    """
    # Local import: only main() needs sys.
    import sys

    parser = argparse.ArgumentParser(description='小红书笔记搜索工具')
    parser.add_argument(
        '--results-dir',
        type=str,
        default='data/search',
        help='结果输出目录 (默认: data/search)'
    )
    parser.add_argument(
        '--keyword',
        type=str,
        required=True,
        help='搜索关键词 (必填)'
    )
    parser.add_argument(
        '--content-type',
        type=str,
        default='不限',
        choices=['不限', '视频', '图文'],
        help='内容类型 (默认: 不限)'
    )
    parser.add_argument(
        '--sort-type',
        type=str,
        default='综合',
        choices=['综合', '最新', '最多点赞', '最多评论'],
        help='排序方式 (默认: 综合)'
    )
    parser.add_argument(
        '--publish-time',
        type=str,
        default='不限',
        choices=['不限', '一天内', '一周内', '半年内'],
        help='发布时间筛选 (默认: 不限)'
    )
    parser.add_argument(
        '--cursor',
        type=str,
        default='',
        help='翻页游标 (默认为空，即第一页)'
    )
    parser.add_argument(
        '--page',
        type=int,
        default=1,
        help='页码标识，用于保存文件名 (默认: 1)'
    )
    args = parser.parse_args()

    client = XiaohongshuSearch(results_dir=args.results_dir)

    # Top-level boundary: report any failure and signal it via the exit code.
    try:
        result = client.search(
            args.keyword,
            args.content_type,
            args.sort_type,
            args.publish_time,
            args.cursor
        )
        filepath = client.save_result(args.keyword, result, args.page)
        print(f"Output: {filepath}")
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()