fetch_xhs_data.py

#!/usr/bin/env python3
"""
Xiaohongshu (小红书) post data fetching script.

Given a post URL, fetches the post's details and the author's historical
posts, and saves them to a local directory.
"""
import json
import re
from pathlib import Path
from typing import Dict
import sys
import argparse
import shutil

# Shared utility module
from xhs_utils import (
    get_note_detail,
    get_author_history_notes,
    merge_note_data,
    transform_note_data,
)


def extract_note_id_from_url(url: str) -> str:
    """
    Extract the note_id from a Xiaohongshu URL.

    Args:
        url: Xiaohongshu post URL

    Returns:
        note_id: the post ID

    Example:
        https://www.xiaohongshu.com/explore/68c6a924000000001b0336d0?xsec_token=...
        returns: 68c6a924000000001b0336d0
    """
    # Try to extract the ID from the URL path
    pattern = r'/explore/([a-f0-9]+)'
    match = re.search(pattern, url)
    if match:
        return match.group(1)
    # If a bare note_id was passed in, return it as-is
    if re.match(r'^[a-f0-9]{24}$', url):
        return url
    raise ValueError(f"无法从URL中提取note_id: {url}")
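
# Illustrative check of extract_note_id_from_url (uses the script's built-in
# example note_id; both input forms resolve to the same ID):
#
#   extract_note_id_from_url(
#       "https://www.xiaohongshu.com/explore/68c6a924000000001b0336d0?xsec_token=...")
#   # -> "68c6a924000000001b0336d0"
#   extract_note_id_from_url("68c6a924000000001b0336d0")  # bare 24-char hex ID
#   # -> "68c6a924000000001b0336d0"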


def save_note_to_file(note_data: Dict, file_path: Path):
    """
    Save note data to a JSON file.

    Args:
        note_data: the note data
        file_path: destination file path
    """
    # Make sure the directory exists
    file_path.parent.mkdir(parents=True, exist_ok=True)
    # Write the JSON file
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(note_data, f, ensure_ascii=False, indent=2)
    print(f"已保存: {file_path}")


def check_note_data_integrity(note_data: dict) -> bool:
    """
    Check the integrity of note data.

    Args:
        note_data: note data dict

    Returns:
        bool: True if at least one of the "images" or "video" fields is
        non-empty, otherwise False.
    """
    images = note_data.get("images", [])
    video = note_data.get("video")
    # "images" must be a non-empty list
    has_images = isinstance(images, list) and len(images) > 0
    # "video" must exist and be non-empty (either a string or a dict)
    has_video = video is not None and video != "" and video != {}
    return has_images or has_video
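
# Expected behavior on some hypothetical payloads (not real API responses):
#
#   check_note_data_integrity({"images": ["a.jpg"]})                   # -> True
#   check_note_data_integrity({"video": "https://example.com/v.mp4"})  # -> True
#   check_note_data_integrity({"images": [], "video": {}})             # -> False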


def check_data_exists(note_id: str, output_dir: str = "examples") -> dict:
    """
    Check whether the data already exists and is complete.

    Args:
        note_id: post ID
        output_dir: output root directory

    Returns:
        dict: check results (existence, completeness, paths, incomplete files)
    """
    result = {
        "exists": False,
        "complete": False,
        "target_note_path": None,
        "history_notes_path": None,
        "incomplete_files": [],
        "note_id": note_id,
    }
    # Build the paths
    input_dir = Path(output_dir) / note_id / "输入"
    target_note_path = input_dir / "待解构帖子.json"
    history_notes_path = input_dir / "作者历史帖子"
    result["target_note_path"] = target_note_path
    result["history_notes_path"] = history_notes_path
    # Does the input directory exist?
    if not input_dir.exists():
        return result
    result["exists"] = True
    # Does the target note exist, and is it complete?
    if not target_note_path.exists():
        result["incomplete_files"].append(str(target_note_path))
        return result
    try:
        with open(target_note_path, 'r', encoding='utf-8') as f:
            target_note_data = json.load(f)
        if not check_note_data_integrity(target_note_data):
            result["incomplete_files"].append(str(target_note_path))
    except Exception as e:
        result["incomplete_files"].append(f"{target_note_path} (读取错误: {e})")
    # Check the history notes directory
    if not history_notes_path.exists():
        result["incomplete_files"].append(str(history_notes_path))
        return result
    # Check the integrity of each history note file
    history_files = list(history_notes_path.glob("*.json"))
    if len(history_files) == 0:
        result["incomplete_files"].append(f"{history_notes_path} (没有历史帖子文件)")
    else:
        # Count the valid history notes
        valid_history_count = 0
        for history_file in history_files:
            try:
                with open(history_file, 'r', encoding='utf-8') as f:
                    history_note_data = json.load(f)
                if not check_note_data_integrity(history_note_data):
                    result["incomplete_files"].append(str(history_file))
                else:
                    valid_history_count += 1
            except Exception as e:
                result["incomplete_files"].append(f"{history_file} (读取错误: {e})")
        # Require more than 4 valid history notes
        if valid_history_count <= 4:
            result["incomplete_files"].append(
                f"{history_notes_path} (有效历史帖子数量 {valid_history_count} ≤ 4,不满足要求)"
            )
    # The data is complete if no incomplete files were recorded
    result["complete"] = len(result["incomplete_files"]) == 0
    return result
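
# Sketch of how a caller might consume the result dict (keys as defined
# above; the note_id is the script's built-in example):
#
#   result = check_data_exists("68c6a924000000001b0336d0")
#   if result["exists"] and not result["complete"]:
#       for path in result["incomplete_files"]:
#           print(f"incomplete: {path}")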


def delete_incomplete_data(note_id: str, output_dir: str = "examples") -> bool:
    """
    Delete an incomplete data directory.

    Args:
        note_id: post ID
        output_dir: output root directory

    Returns:
        bool: True if the deletion succeeded, otherwise False.
    """
    try:
        target_dir = Path(output_dir) / note_id
        if target_dir.exists():
            shutil.rmtree(target_dir)
            print(f"  ✓ 已删除不完整数据目录: {target_dir}")
            return True
        else:
            print(f"  ⚠️ 目录不存在: {target_dir}")
            return False
    except Exception as e:
        print(f"  ✗ 删除目录失败: {e}")
        return False


def fetch_and_save_xhs_data(url: str, output_dir: str = "examples",
                            check_only: bool = False, skip_if_exists: bool = True,
                            clean_incomplete: bool = False):
    """
    Fetch Xiaohongshu post data and save it locally.

    Args:
        url: Xiaohongshu post URL
        output_dir: output directory, defaults to "examples"
        check_only: if True, only check whether the data exists; do not fetch
        skip_if_exists: if True and the data already exists and is complete, skip fetching
        clean_incomplete: if True, automatically delete incomplete data when detected
    """
    print(f"\n{'='*80}")
    print(f"{'[检查模式]' if check_only else '[处理模式]'} 根据帖子URL获取数据")
    print(f"{'='*80}")

    # 1. Extract the note_id
    print(f"正在解析URL: {url}")
    note_id = extract_note_id_from_url(url)
    print(f"提取到note_id: {note_id}")

    # Check whether the data already exists
    check_result = check_data_exists(note_id, output_dir=output_dir)
    if check_result["exists"]:
        if check_result["complete"]:
            print(f"\n✓ 数据已存在且完整")
            print(f"  待解构帖子: {check_result['target_note_path']}")
            print(f"  历史帖子目录: {check_result['history_notes_path']}")
            if check_only or skip_if_exists:
                print('  [检查模式] 跳过获取' if check_only else '  [跳过] 数据已完整')
                return
        else:
            print(f"\n⚠️ 数据存在但不完整")
            print(f"  不完整的文件:")
            for incomplete_file in check_result["incomplete_files"]:
                print(f"    - {incomplete_file}")
            # Optionally clean up the incomplete data
            if clean_incomplete:
                print(f"  [清理模式] 删除不完整数据...")
                delete_incomplete_data(note_id, output_dir)
            if check_only:
                print(f"  [检查模式] 需要重新获取")
                return
            else:
                print(f"  将重新获取数据...")
    else:
        print(f"\nℹ️ 数据不存在")
        if check_only:
            print(f"  [检查模式] 需要获取")
            return

    # In check-only mode, stop here
    if check_only:
        return

    # 2. Fetch the post details
    print(f"正在获取帖子详情...")
    note_detail = get_note_detail(note_id)

    # 3. Transform the data format
    transformed_note = transform_note_data(note_detail)
    account_id = transformed_note["channel_account_id"]

    # 4. Create the directory structure
    base_path = Path(output_dir) / note_id / "输入"
    history_path = base_path / "作者历史帖子"

    # 5. Save the target note
    target_note_path = base_path / "待解构帖子.json"
    save_note_to_file(transformed_note, target_note_path)

    # 6. Fetch the author's historical posts
    if account_id:
        print(f"正在获取作者历史帖子 (账号ID: {account_id})...")
        history_notes = get_author_history_notes(account_id)
        # 7. Process and save each historical post
        if isinstance(history_notes, list):
            print(f"找到 {len(history_notes)} 个历史帖子,正在处理...")
            saved_count = 0
            for idx, note in enumerate(history_notes, 1):
                # Extract the note_id from the history list entry
                history_note_id = note.get("note_id", "")
                if history_note_id:
                    print(f"  [{idx}/{len(history_notes)}] 处理帖子: {history_note_id}")
                    try:
                        # If the history API payload lacks the key text fields
                        # (mainly body_text), fall back to the detail API
                        need_detail = not (note.get("desc") or note.get("note_text") or note.get("body_text"))
                        detail_data = None
                        if need_detail:
                            print(f"    → 缺少正文,调用详情API补充...")
                            detail_data = get_note_detail(history_note_id)
                        # Merge the history API and detail API data
                        merged_note = merge_note_data(note, detail_data)
                        # Save to file
                        history_note_path = history_path / f"{history_note_id}.json"
                        save_note_to_file(merged_note, history_note_path)
                        saved_count += 1
                        # If the target note also appears in the history list,
                        # overwrite it with the merged (richer) version too
                        if transformed_note['channel_content_id'] == merged_note['channel_content_id']:
                            save_note_to_file(merged_note, target_note_path)
                    except Exception as e:
                        print(f"    ⚠️ 处理帖子 {history_note_id} 失败: {e}")
                        continue
            print(f"\n共成功保存 {saved_count} 个历史帖子")
        else:
            print("历史帖子数据格式不正确")
    else:
        print("未找到账号ID,跳过获取历史帖子")

    print(f"\n✓ 数据获取完成!")
    print(f"输出目录: {base_path}")
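
# Minimal programmatic usage sketch, mirroring the CLI defaults below (the
# URL is the script's built-in example; check_only=True is a dry run that
# only reports existence/completeness without fetching):
#
#   fetch_and_save_xhs_data(
#       "https://www.xiaohongshu.com/explore/68c6a924000000001b0336d0",
#       output_dir="examples",
#       check_only=True,
#   )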


def main():
    """Main entry point."""
    # Parse the command-line arguments
    parser = argparse.ArgumentParser(
        description='小红书帖子数据获取脚本',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
  # 获取帖子数据
  python fetch_xhs_data.py <帖子URL>

  # 只检查数据是否存在且完整
  python fetch_xhs_data.py <帖子URL> --check-only

  # 检查并清理不完整数据
  python fetch_xhs_data.py <帖子URL> --check-only --clean-incomplete

  # 强制重新获取(即使数据已存在)
  python fetch_xhs_data.py <帖子URL> --no-skip-if-exists
"""
    )
    parser.add_argument(
        'url',
        nargs='?',
        default='https://www.xiaohongshu.com/explore/68c6a924000000001b0336d0',
        help='小红书帖子URL(可选,默认使用示例URL)'
    )
    parser.add_argument(
        '--check-only',
        action='store_true',
        help='只检查数据是否存在且完整,不执行获取操作'
    )
    parser.add_argument(
        '--no-skip-if-exists',
        action='store_true',
        help='即使数据已存在且完整也重新获取'
    )
    parser.add_argument(
        '--clean-incomplete',
        action='store_true',
        help='自动删除检测到的不完整数据目录'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='examples',
        help='输出根目录 (默认: examples)'
    )
    args = parser.parse_args()

    url = args.url
    check_only = args.check_only
    skip_if_exists = not args.no_skip_if_exists
    clean_incomplete = args.clean_incomplete
    output_dir = args.output_dir

    print(f"{'='*80}")
    print(f"小红书帖子数据{'检查' if check_only else '获取'}脚本")
    print(f"{'='*80}")
    print(f"帖子URL: {url}")
    print(f"模式: {'只检查' if check_only else '获取数据'}")
    print(f"跳过已存在: {'是' if skip_if_exists else '否'}")
    print(f"清理不完整数据: {'是' if clean_incomplete else '否'}")
    print(f"输出目录: {output_dir}")
    print(f"{'='*80}")

    try:
        fetch_and_save_xhs_data(
            url,
            output_dir=output_dir,
            check_only=check_only,
            skip_if_exists=skip_if_exists,
            clean_incomplete=clean_incomplete
        )
    except Exception as e:
        print(f"错误: {e}")
        import traceback
        traceback.print_exc()
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())