| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261 |
- #!/usr/bin/env python3
- """
- 帖子详情补充工具
- 用于将搜索结果与详情API结果合并,补充高清图片、视频URL、作者信息等
- """
- import json
- from typing import Dict, Any, List
- from datetime import datetime
- def parse_detail_result(detail_response: Dict[str, Any]) -> Dict[str, Any] | None:
- """
- 解析详情API返回的结果
- Args:
- detail_response: 详情API的完整响应
- Returns:
- 解析后的数据字典,失败返回None
- """
- try:
- # 检查success字段
- if not detail_response.get("success"):
- print(f" ⚠️ 详情API返回失败")
- return None
- # 解析result字段(可能是JSON字符串)
- result = detail_response.get("result", "")
- if isinstance(result, str):
- result = json.loads(result)
- # 提取data
- if isinstance(result, list) and len(result) > 0:
- return result[0].get("data", {})
- elif isinstance(result, dict):
- return result.get("data", {})
- return None
- except Exception as e:
- print(f" ✗ 解析详情结果失败: {e}")
- return None
def enrich_post_with_detail(post: Any, detail_response: Dict[str, Any]) -> bool:
    """Merge detail-API data into ``post`` in place.

    Fields missing (or null/empty) in the detail payload never blank out a
    value the post already has; the post's existing value is kept, with
    ``""``/``0`` used only when the post has none either.

    Args:
        post: Post object to update (mutated directly). It is expected to
            expose ``type`` and a dict-like ``interact_info``.
        detail_response: Full response returned by the detail API.

    Returns:
        ``True`` when the post was enriched, ``False`` when the response
        could not be parsed, was empty, or applying it raised.
    """
    detail_data = parse_detail_result(detail_response)
    if not detail_data:
        return False

    try:
        # 1. Body text: the detail API carries the full text, so it wins
        #    whenever it is non-empty.
        body_text = detail_data.get("body_text", "")
        if body_text:
            post.body_text = body_text

        # 2. Author info: prefer the detail value, otherwise keep what the
        #    post already has. The attributes are always (re)assigned so
        #    they exist afterwards.
        post.author_name = (
            detail_data.get("channel_account_name")
            or getattr(post, "author_name", "")
            or ""
        )
        post.author_id = (
            detail_data.get("channel_account_id")
            or getattr(post, "author_id", "")
            or ""
        )

        # 3. Publish time: a JSON null must not end up stored as None,
        #    because later code compares this value with ``> 0``.
        post.publish_time = (
            detail_data.get("publish_timestamp")
            or getattr(post, "publish_time", 0)
            or 0
        )

        # 4. Interaction counts: take the detail value when present,
        #    otherwise keep the current count (0 if there is none).
        interact_info = post.interact_info
        refreshed = {}
        for key in ("like_count", "collect_count"):
            value = detail_data.get(key)
            if value is None:
                value = interact_info.get(key)
            refreshed[key] = 0 if value is None else value
        interact_info.update(refreshed)

        # 5. Media, depending on the post type.
        if post.type == "video":
            # Video post: add the video URL; images are left untouched.
            video_url = detail_data.get("video", "")
            if video_url:
                post.video = video_url
        else:
            # Image post: store CDN URLs in ``cdn_images`` only; the
            # original ``images`` list is never overwritten.
            images_data = detail_data.get("images", [])
            if images_data:
                cdn_urls = []
                for img in images_data:
                    if isinstance(img, dict):
                        cdn_url = img.get("cdn_url", "")
                        if cdn_url:
                            cdn_urls.append(cdn_url)
                    elif isinstance(img, str):
                        cdn_urls.append(img)
                post.cdn_images = cdn_urls

        # 6. Mark the post as enriched.
        post.detail_fetched = True
        return True
    except Exception as e:
        # Deliberate best-effort boundary: ``post`` is an arbitrary object,
        # so any failure while applying the data is reported, not raised.
        print(f" ✗ 补充详情失败: {e}")
        return False
def enrich_posts_batch(
    posts: List[Any],
    detail_client: Any,
    show_progress: bool = True,
    delay: float = 1,
) -> tuple[int, int]:
    """Fetch and merge detail data for every post, one request at a time.

    Args:
        posts: Post objects to enrich (mutated directly). Each must expose
            ``note_id``.
        detail_client: Object providing ``get_detail(note_id)``.
        show_progress: Whether to print per-post progress lines.
        delay: Seconds to sleep between requests; values ``<= 0`` disable
            the pause.

    Returns:
        ``(success_count, fail_count)``.
    """
    # Imported once per call instead of on every loop iteration.
    import time

    success_count = 0
    fail_count = 0
    total = len(posts)

    for idx, post in enumerate(posts, 1):
        if show_progress:
            print(f"补充详情 ({idx}/{total}): {post.note_id}")

        try:
            # Fetch the detail record and merge it into the post.
            detail_response = detail_client.get_detail(post.note_id)
            enriched = enrich_post_with_detail(post, detail_response)
        except Exception as e:
            # One failing request must not abort the rest of the batch.
            fail_count += 1
            if show_progress:
                print(f" ✗ 请求失败: {e}")
        else:
            if enriched:
                success_count += 1
                if show_progress:
                    print(" ✓ 成功补充")
            else:
                fail_count += 1
                if show_progress:
                    print(" ✗ 补充失败")

        # Throttle between requests; no pause is needed after the last one.
        if idx < total and delay > 0:
            time.sleep(delay)

    return success_count, fail_count
def _format_publish_time(publish_time: Any) -> str:
    """Render a millisecond timestamp as local ``YYYY-MM-DD HH:MM:SS``, or ``""`` if unusable."""
    try:
        millis = float(publish_time)
    except (TypeError, ValueError):
        # None, non-numeric strings and other unsupported types.
        return ""
    if not millis > 0:  # also rejects NaN
        return ""
    try:
        return datetime.fromtimestamp(millis / 1000).strftime("%Y-%m-%d %H:%M:%S")
    except (OverflowError, OSError, ValueError):
        # Timestamp outside the range the platform can represent.
        return ""


def create_enriched_summary(post: Any) -> Dict[str, Any]:
    """Build a plain-dict summary of a post, suitable for saving.

    Args:
        post: Post object to summarise.

    Returns:
        A dict with the post's basic, media, author, interaction and time
        fields. ``author`` is empty unless ``post.detail_fetched`` is
        truthy. ``publish_time_readable`` is ``""`` when the publish time
        is missing, non-positive, non-numeric or out of range.
    """
    if post.detail_fetched:
        author = {"name": post.author_name, "id": post.author_id}
    else:
        # Author info is only reported for posts that were enriched.
        author = {}

    return {
        # Basic info
        "note_id": post.note_id,
        "note_url": post.note_url,
        "title": post.title,
        "body_text": post.body_text,
        "type": post.type,
        # Media
        "images": post.images,
        "cdn_images": post.cdn_images,
        "video": post.video,
        # Author
        "author": author,
        # Interaction counts
        "interact_info": post.interact_info,
        # Time: the raw value is kept as-is next to a readable rendering.
        "publish_time": post.publish_time,
        "publish_time_readable": _format_publish_time(post.publish_time),
        # Metadata
        "detail_fetched": post.detail_fetched,
    }
def print_enrichment_stats(posts: List[Any]) -> None:
    """Print a summary of how many posts were enriched and with which fields.

    Args:
        posts: Post objects to report on. Each must expose
            ``detail_fetched``, ``type``, ``author_name``, ``publish_time``,
            ``cdn_images`` and ``video``.
    """
    total = len(posts)
    enriched = sum(1 for p in posts if p.detail_fetched)
    video_count = sum(1 for p in posts if p.type == "video")
    # Every post that is not a video is counted as an image post.
    image_count = total - video_count
    # Guard the percentage against an empty list.
    percent = enriched * 100 // total if total > 0 else 0

    print("\n" + "=" * 60)
    print("详情补充统计")
    print("=" * 60)
    print(f"总帖子数: {total}")
    print(f" - 图文帖: {image_count}")
    print(f" - 视频帖: {video_count}")
    print(f"\n已补充详情: {enriched}/{total} ({percent}%)")
    print(f"未补充详情: {total - enriched}")

    if enriched > 0:
        print("\n详情字段统计:")
        has_author = sum(1 for p in posts if p.author_name)
        # A missing (None) or non-numeric publish time counts as absent
        # rather than raising TypeError on the comparison.
        has_publish_time = sum(
            1
            for p in posts
            if isinstance(p.publish_time, (int, float)) and p.publish_time > 0
        )
        has_cdn_images = sum(1 for p in posts if p.cdn_images)
        has_video_url = sum(1 for p in posts if p.video and p.type == "video")
        print(f" - 作者信息: {has_author}/{enriched}")
        print(f" - 发布时间: {has_publish_time}/{enriched}")
        print(f" - 高清图片: {has_cdn_images}/{image_count} (图文帖)")
        print(f" - 视频URL: {has_video_url}/{video_count} (视频帖)")

    print("=" * 60 + "\n")
# ============================================================================
# Usage
# ============================================================================
if __name__ == "__main__":
    # This module is a helper library; running it directly only prints a
    # short description of the functions it offers.
    banner_lines = (
        "这是一个辅助模块,请通过 search_with_detail.py 使用",
        "\n主要功能:",
        "1. parse_detail_result() - 解析详情API响应",
        "2. enrich_post_with_detail() - 补充单个帖子详情",
        "3. enrich_posts_batch() - 批量补充详情",
        "4. create_enriched_summary() - 创建详情摘要",
        "5. print_enrichment_stats() - 打印统计信息",
    )
    for banner_line in banner_lines:
        print(banner_line)
|