#!/usr/bin/env python3
# yangxiaohui / kg_agent
"""
帖子详情补充工具
用于将搜索结果与详情API结果合并，补充高清图片、视频URL、作者信息等
"""

import json
from typing import Dict, Any, List
from datetime import datetime


def parse_detail_result(detail_response: Dict[str, Any]) -> Dict[str, Any] | None:
    """
    解析详情API返回的结果

    Args:
        detail_response: 详情API的完整响应

    Returns:
        解析后的数据字典，失败返回None
    """
    try:
        # 检查success字段
        if not detail_response.get("success"):
            print(f"    ⚠️  详情API返回失败")
            return None

        # 解析result字段（可能是JSON字符串）
        result = detail_response.get("result", "")
        if isinstance(result, str):
            result = json.loads(result)

        # 提取data
        if isinstance(result, list) and len(result) > 0:
            return result[0].get("data", {})
        elif isinstance(result, dict):
            return result.get("data", {})

        return None

    except Exception as e:
        print(f"    ✗ 解析详情结果失败: {e}")
        return None


def enrich_post_with_detail(post: Any, detail_response: Dict[str, Any]) -> bool:
    """
    Merge detail-API data into a Post object.

    Fields written on ``post``: ``body_text`` (only when the detail has a
    non-empty one), ``author_name``, ``author_id``, ``publish_time``, the
    ``like_count`` / ``collect_count`` entries of ``interact_info``, then
    ``video`` for video posts or ``cdn_images`` for all other posts, and
    finally ``detail_fetched``.

    Args:
        post: Post object; it is mutated in place.
        detail_response: Full response returned by the detail API.

    Returns:
        ``True`` when the post was enriched. ``False`` when the response could
        not be parsed or carried no data, or when an error occurred while
        copying fields (fields assigned before the error stay modified).
    """
    detail_data = parse_detail_result(detail_response)
    if not detail_data:
        return False

    def field_or(key: str, default: Any) -> Any:
        # ``dict.get(key, default)`` only falls back when the key is missing;
        # an explicit JSON null would otherwise be stored as None and break
        # later numeric comparisons such as ``post.publish_time > 0``.
        value = detail_data.get(key)
        return default if value is None else value

    try:
        # 1. Body text: the detail API's full text replaces the existing one.
        body_text = detail_data.get("body_text", "")
        if body_text:
            post.body_text = body_text

        # 2. Author information.
        post.author_name = field_or("channel_account_name", "")
        post.author_id = field_or("channel_account_id", "")

        # 3. Publish time (stored exactly as the API reports it).
        post.publish_time = field_or("publish_timestamp", 0)

        # 4. Interaction counters taken from the detail API.
        post.interact_info.update({
            "like_count": field_or("like_count", 0),
            "collect_count": field_or("collect_count", 0),
        })

        # 5. Media depends on the post type.
        if post.type == "video":
            # Video post: add the video URL; ``images`` is left untouched.
            video_url = detail_data.get("video", "")
            if video_url:
                post.video = video_url
        else:
            # Image post: store CDN URLs in ``cdn_images`` without replacing
            # ``images``.
            images_data = detail_data.get("images", [])
            if images_data:
                cdn_urls = []
                for img in images_data:
                    if isinstance(img, dict):
                        cdn_url = img.get("cdn_url", "")
                        if cdn_url:
                            cdn_urls.append(cdn_url)
                    elif isinstance(img, str):
                        # Plain strings are taken to be URLs already.
                        cdn_urls.append(img)

                post.cdn_images = cdn_urls

        # 6. Mark the post as enriched.
        post.detail_fetched = True

        return True

    except Exception as e:
        # Best-effort: a malformed detail payload must not abort the caller.
        print(f"    ✗ 补充详情失败: {e}")
        return False


def enrich_posts_batch(
    posts: List[Any],
    detail_client: Any,
    show_progress: bool = True,
    delay: int = 1
) -> tuple[int, int]:
    """
    Fetch and merge details for every post in a list.

    For each post, ``detail_client.get_detail(post.note_id)`` is called and the
    response is merged with ``enrich_post_with_detail``. Any exception raised
    while doing so counts as a failure and the loop moves on.

    Args:
        posts: Post objects; they are mutated in place.
        detail_client: Object exposing ``get_detail(note_id)``.
        show_progress: Whether to print per-post progress lines.
        delay: Pause between requests, in seconds.

    Returns:
        ``(succeeded, failed)`` counts.
    """
    import time

    total = len(posts)
    succeeded = 0
    failed = 0

    for position, post in enumerate(posts, start=1):
        if show_progress:
            print(f"补充详情 ({position}/{total}): {post.note_id}")

        try:
            # Request the detail, then merge it into the post.
            response = detail_client.get_detail(post.note_id)
            if not enrich_post_with_detail(post, response):
                failed += 1
                if show_progress:
                    print("  ✗ 补充失败")
            else:
                succeeded += 1
                if show_progress:
                    print("  ✓ 成功补充")
        except Exception as exc:
            failed += 1
            if show_progress:
                print(f"  ✗ 请求失败: {exc}")

        # Throttle between requests; nothing follows the last post, so it
        # needs no pause.
        if position < total and delay > 0:
            time.sleep(delay)

    return succeeded, failed


def create_enriched_summary(post: Any) -> Dict[str, Any]:
    """
    Build a plain-dict summary of a post, including detail fields, for saving.

    ``publish_time`` is treated as milliseconds since the epoch: it is divided
    by 1000 and rendered in local time as ``YYYY-MM-DD HH:MM:SS``. When it is
    missing, non-numeric, not positive, or outside the range the platform can
    convert, ``publish_time_readable`` is an empty string instead of raising.

    Args:
        post: Post object to summarise.

    Returns:
        Summary dict. ``author`` is ``{}`` unless ``post.detail_fetched`` is
        truthy.
    """
    publish_time = post.publish_time
    publish_time_readable = ""
    if isinstance(publish_time, (int, float)) and publish_time > 0:
        try:
            publish_time_readable = datetime.fromtimestamp(
                publish_time / 1000
            ).strftime("%Y-%m-%d %H:%M:%S")
        except (OverflowError, OSError, ValueError):
            # Timestamp outside the convertible range: keep the raw value in
            # the summary and leave the readable form blank.
            publish_time_readable = ""

    # Author fields are only meaningful once the detail API has filled them.
    if post.detail_fetched:
        author = {"name": post.author_name, "id": post.author_id}
    else:
        author = {}

    summary = {
        # Basic information
        "note_id": post.note_id,
        "note_url": post.note_url,
        "title": post.title,
        "body_text": post.body_text,
        "type": post.type,

        # Media
        "images": post.images,
        "cdn_images": post.cdn_images,
        "video": post.video,

        # Author (filled from the detail API)
        "author": author,

        # Interaction counters
        "interact_info": post.interact_info,

        # Time
        "publish_time": publish_time,
        "publish_time_readable": publish_time_readable,

        # Metadata
        "detail_fetched": post.detail_fetched,
    }

    return summary


def print_enrichment_stats(posts: List[Any]) -> None:
    """
    Print a summary of how many posts were enriched with detail data.

    Reports the total number of posts, the split between video posts
    (``type == "video"``) and all others, and the share with
    ``detail_fetched`` set. When at least one post was enriched, it also
    reports how many posts carry an author name, a positive publish time,
    CDN images and (for video posts) a video URL.

    Args:
        posts: Post objects to summarise.
    """
    rule = "=" * 60

    total = len(posts)
    enriched = len([p for p in posts if p.detail_fetched])
    video_count = len([p for p in posts if p.type == "video"])
    image_count = total - video_count
    # Integer percentage; guarded so an empty list reports 0%.
    percent = enriched * 100 // total if total > 0 else 0

    print("\n" + rule)
    print("详情补充统计")
    print(rule)
    print(f"总帖子数: {total}")
    print(f"  - 图文帖: {image_count}")
    print(f"  - 视频帖: {video_count}")
    print(f"\n已补充详情: {enriched}/{total} ({percent}%)")
    print(f"未补充详情: {total - enriched}")

    if enriched > 0:
        print("\n详情字段统计:")
        # Each count is taken over all posts, not only the enriched ones.
        with_author = len([p for p in posts if p.author_name])
        with_publish_time = len([p for p in posts if p.publish_time > 0])
        with_cdn_images = len([p for p in posts if p.cdn_images])
        with_video_url = len(
            [p for p in posts if p.video and p.type == "video"]
        )

        field_lines = (
            f"  - 作者信息: {with_author}/{enriched}",
            f"  - 发布时间: {with_publish_time}/{enriched}",
            f"  - 高清图片: {with_cdn_images}/{image_count} (图文帖)",
            f"  - 视频URL: {with_video_url}/{video_count} (视频帖)",
        )
        for field_line in field_lines:
            print(field_line)

    print(rule + "\n")


# ============================================================================
# 使用示例
# ============================================================================

if __name__ == "__main__":
    # Run directly, this helper module only prints a pointer to the real entry
    # script and a list of the functions it provides.
    usage_lines = (
        "这是一个辅助模块，请通过 search_with_detail.py 使用",
        "\n主要功能：",
        "1. parse_detail_result() - 解析详情API响应",
        "2. enrich_post_with_detail() - 补充单个帖子详情",
        "3. enrich_posts_batch() - 批量补充详情",
        "4. create_enriched_summary() - 创建详情摘要",
        "5. print_enrichment_stats() - 打印统计信息",
    )
    for usage_line in usage_lines:
        print(usage_line)