#!/usr/bin/env python3
"""
Post detail enrichment helpers.

Merges search results with detail-API responses: fills in the full body
text, author info, publish time, like/collect counts, CDN image URLs and
video URLs on Post objects.
"""

from __future__ import annotations

import json
import time
from datetime import datetime
from typing import Any, Dict, List


def parse_detail_result(detail_response: Dict[str, Any]) -> Dict[str, Any] | None:
    """
    Extract the ``data`` payload from a detail-API response.

    The ``result`` field may be a JSON string, a list (its first element is
    used) or a dict.

    Args:
        detail_response: The complete detail-API response.

    Returns:
        The ``data`` dict ({} when the key is absent), or None when the
        response is unsuccessful, malformed, or not shaped as expected.
    """
    try:
        # The API signals failure through a falsy/missing "success" field.
        if not detail_response.get("success"):
            print(" ⚠️ 详情API返回失败")
            return None

        # "result" may arrive JSON-encoded as a string.
        result = detail_response.get("result", "")
        if isinstance(result, str):
            result = json.loads(result)

        # A non-empty list wraps the payload in its first element.
        if isinstance(result, list) and result:
            result = result[0]
        if not isinstance(result, dict):
            return None

        data = result.get("data", {})
        return data if isinstance(data, dict) else None
    except (AttributeError, TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; AttributeError/TypeError
        # cover a response object that is not dict-like.
        print(f" ✗ 解析详情结果失败: {e}")
        return None


def _extract_cdn_urls(images_data: Any) -> List[str]:
    """Collect non-empty CDN URLs from a list of dicts (``cdn_url``) or strings."""
    cdn_urls: List[str] = []
    for img in images_data:
        if isinstance(img, dict):
            url = img.get("cdn_url", "")
        elif isinstance(img, str):
            url = img
        else:
            continue
        if url:
            cdn_urls.append(url)
    return cdn_urls


def enrich_post_with_detail(post: Any, detail_response: Dict[str, Any]) -> bool:
    """
    Enrich a Post object with data from a detail-API response.

    Fields missing from the detail payload keep the value the post already
    has (falling back to ""/0 when the post has none), so a sparse detail
    response never wipes data obtained from the search result.

    Args:
        post: Post object (modified in place).
        detail_response: The complete detail-API response.

    Returns:
        True if the post was enriched, False if the response could not be
        parsed or applying it failed.
    """
    detail_data = parse_detail_result(detail_response)
    if not detail_data:
        return False

    try:
        # 1. Body text: overwrite with the full text from the detail API.
        body_text = detail_data.get("body_text", "")
        if body_text:
            post.body_text = body_text

        # 2. Author info: keep the existing value when the detail lacks it.
        post.author_name = (
            detail_data.get("channel_account_name")
            or getattr(post, "author_name", "")
        )
        post.author_id = (
            detail_data.get("channel_account_id")
            or getattr(post, "author_id", "")
        )

        # 3. Publish time.
        post.publish_time = (
            detail_data.get("publish_timestamp")
            or getattr(post, "publish_time", 0)
        )

        # 4. Interaction info: a count present in the detail wins (a real 0
        #    included); otherwise keep the existing count, defaulting to 0 so
        #    the key always exists.
        interact_info = getattr(post, "interact_info", None)
        if interact_info is None:
            interact_info = {}
            post.interact_info = interact_info
        updates = {}
        for key in ("like_count", "collect_count"):
            value = detail_data.get(key)
            updates[key] = value if value is not None else interact_info.get(key, 0)
        interact_info.update(updates)

        # 5. Media, depending on post type.
        if post.type == "video":
            # Video post: add the video URL (images are left untouched).
            video_url = detail_data.get("video", "")
            if video_url:
                post.video = video_url
        else:
            # Image post: store CDN URLs in cdn_images only; images is
            # never overwritten.
            images_data = detail_data.get("images", [])
            if images_data:
                post.cdn_images = _extract_cdn_urls(images_data)

        # 6. Mark the post as enriched.
        post.detail_fetched = True
        return True
    except Exception as e:
        # Best-effort boundary: the post is an opaque project object, so any
        # failure while applying the detail is reported rather than raised.
        print(f" ✗ 补充详情失败: {e}")
        return False


def enrich_posts_batch(
    posts: List[Any],
    detail_client: Any,
    show_progress: bool = True,
    delay: float = 1,
) -> tuple[int, int]:
    """
    Enrich a list of posts, one detail-API request per post.

    Args:
        posts: List of Post objects (modified in place).
        detail_client: Object exposing ``get_detail(note_id)``.
        show_progress: Whether to print per-post progress.
        delay: Seconds to wait between requests; values <= 0 disable the wait.

    Returns:
        (number enriched, number failed)
    """
    success_count = 0
    fail_count = 0
    total = len(posts)

    for idx, post in enumerate(posts, 1):
        if show_progress:
            print(f"补充详情 ({idx}/{total}): {post.note_id}")

        try:
            detail_response = detail_client.get_detail(post.note_id)
            enriched = enrich_post_with_detail(post, detail_response)
        except Exception as e:
            # A failing request must not abort the rest of the batch.
            fail_count += 1
            if show_progress:
                print(f" ✗ 请求失败: {e}")
        else:
            if enriched:
                success_count += 1
                if show_progress:
                    print(" ✓ 成功补充")
            else:
                fail_count += 1
                if show_progress:
                    print(" ✗ 补充失败")

        # Throttle requests; no wait is needed after the last post.
        if idx < total and delay > 0:
            time.sleep(delay)

    return success_count, fail_count


def _to_positive_millis(value: Any) -> float:
    """Return ``value`` as a positive float, or 0.0 if missing/invalid/non-positive."""
    try:
        millis = float(value)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    # "millis > 0" is False for NaN as well as for zero/negative values.
    return millis if millis > 0 else 0.0


def _format_publish_time(publish_time: Any) -> str:
    """Format a millisecond timestamp as local 'YYYY-MM-DD HH:MM:SS', or ''."""
    millis = _to_positive_millis(publish_time)
    if not millis:
        return ""
    try:
        return datetime.fromtimestamp(millis / 1000).strftime("%Y-%m-%d %H:%M:%S")
    except (OverflowError, OSError, ValueError):
        # Timestamp outside the range the platform can represent.
        return ""


def create_enriched_summary(post: Any) -> Dict[str, Any]:
    """
    Build a summary dict of a post, including detail fields (for saving).

    ``publish_time`` is treated as a millisecond timestamp and rendered in
    local time; a missing, non-numeric, non-positive or out-of-range value
    produces an empty ``publish_time_readable``.

    Args:
        post: Post object.

    Returns:
        The summary dict.
    """
    summary = {
        # Basic info
        "note_id": post.note_id,
        "note_url": post.note_url,
        "title": post.title,
        "body_text": post.body_text,
        "type": post.type,

        # Media
        "images": post.images,
        "cdn_images": post.cdn_images,
        "video": post.video,

        # Author info is only reported once the detail has been fetched.
        "author": {
            "name": post.author_name,
            "id": post.author_id,
        } if post.detail_fetched else {},

        # Interaction info
        "interact_info": post.interact_info,

        # Time info
        "publish_time": post.publish_time,
        "publish_time_readable": _format_publish_time(post.publish_time),

        # Metadata
        "detail_fetched": post.detail_fetched,
    }

    return summary


def print_enrichment_stats(posts: List[Any]) -> None:
    """
    Print enrichment statistics for a list of posts.

    Author and publish-time counts are taken over enriched posts only, so
    they match the denominator they are printed against.

    Args:
        posts: List of Post objects.
    """
    total = len(posts)
    enriched_posts = [p for p in posts if p.detail_fetched]
    enriched = len(enriched_posts)
    video_count = sum(1 for p in posts if p.type == "video")
    image_count = total - video_count

    print("\n" + "=" * 60)
    print("详情补充统计")
    print("=" * 60)
    print(f"总帖子数: {total}")
    print(f" - 图文帖: {image_count}")
    print(f" - 视频帖: {video_count}")
    print(f"\n已补充详情: {enriched}/{total} ({enriched*100//total if total > 0 else 0}%)")
    print(f"未补充详情: {total - enriched}")

    if enriched > 0:
        print("\n详情字段统计:")

        has_author = sum(1 for p in enriched_posts if p.author_name)
        has_publish_time = sum(
            1 for p in enriched_posts if _to_positive_millis(p.publish_time)
        )
        has_cdn_images = sum(
            1 for p in posts if p.type != "video" and p.cdn_images
        )
        has_video_url = sum(1 for p in posts if p.video and p.type == "video")

        print(f" - 作者信息: {has_author}/{enriched}")
        print(f" - 发布时间: {has_publish_time}/{enriched}")
        print(f" - 高清图片: {has_cdn_images}/{image_count} (图文帖)")
        print(f" - 视频URL: {has_video_url}/{video_count} (视频帖)")

    print("=" * 60 + "\n")


# ============================================================================
# Usage example
# ============================================================================

if __name__ == "__main__":
    print("这是一个辅助模块,请通过 search_with_detail.py 使用")
    print("\n主要功能:")
    print("1. parse_detail_result() - 解析详情API响应")
    print("2. enrich_post_with_detail() - 补充单个帖子详情")
    print("3. enrich_posts_batch() - 批量补充详情")
    print("4. create_enriched_summary() - 创建详情摘要")
    print("5. print_enrichment_stats() - 打印统计信息")