enrichment_helper.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. #!/usr/bin/env python3
  2. """
  3. 帖子详情补充工具
  4. 用于将搜索结果与详情API结果合并,补充高清图片、视频URL、作者信息等
  5. """
  6. import json
  7. from typing import Dict, Any, List
  8. from datetime import datetime
  9. def parse_detail_result(detail_response: Dict[str, Any]) -> Dict[str, Any] | None:
  10. """
  11. 解析详情API返回的结果
  12. Args:
  13. detail_response: 详情API的完整响应
  14. Returns:
  15. 解析后的数据字典,失败返回None
  16. """
  17. try:
  18. # 检查success字段
  19. if not detail_response.get("success"):
  20. print(f" ⚠️ 详情API返回失败")
  21. return None
  22. # 解析result字段(可能是JSON字符串)
  23. result = detail_response.get("result", "")
  24. if isinstance(result, str):
  25. result = json.loads(result)
  26. # 提取data
  27. if isinstance(result, list) and len(result) > 0:
  28. return result[0].get("data", {})
  29. elif isinstance(result, dict):
  30. return result.get("data", {})
  31. return None
  32. except Exception as e:
  33. print(f" ✗ 解析详情结果失败: {e}")
  34. return None
  35. def enrich_post_with_detail(post: Any, detail_response: Dict[str, Any]) -> bool:
  36. """
  37. 使用详情API的数据补充Post对象
  38. Args:
  39. post: Post对象(会被直接修改)
  40. detail_response: 详情API的完整响应
  41. Returns:
  42. 是否成功补充
  43. """
  44. # 解析详情数据
  45. detail_data = parse_detail_result(detail_response)
  46. if not detail_data:
  47. return False
  48. try:
  49. # 1. 正文内容 - 使用详情API的完整正文覆盖
  50. body_text = detail_data.get("body_text", "")
  51. if body_text:
  52. post.body_text = body_text
  53. # 2. 作者信息
  54. post.author_name = detail_data.get("channel_account_name", "")
  55. post.author_id = detail_data.get("channel_account_id", "")
  56. # 3. 发布时间
  57. post.publish_time = detail_data.get("publish_timestamp", 0)
  58. # 4. 互动信息 - 使用详情API的精确数据更新
  59. post.interact_info.update({
  60. "like_count": detail_data.get("like_count", 0), # 详情API字段
  61. "collect_count": detail_data.get("collect_count", 0), # 详情API字段
  62. })
  63. # 5. 根据类型处理图片/视频
  64. if post.type == "video":
  65. # 视频帖:补充视频URL(images保持不变)
  66. video_url = detail_data.get("video", "")
  67. if video_url:
  68. post.video = video_url
  69. else:
  70. # 图文帖:仅保存CDN图片到cdn_images字段,不覆盖images
  71. images_data = detail_data.get("images", [])
  72. if images_data:
  73. # 提取CDN URL
  74. cdn_urls = []
  75. for img in images_data:
  76. if isinstance(img, dict):
  77. cdn_url = img.get("cdn_url", "")
  78. if cdn_url:
  79. cdn_urls.append(cdn_url)
  80. elif isinstance(img, str):
  81. cdn_urls.append(img)
  82. # 仅保存CDN图片列表,不覆盖images
  83. post.cdn_images = cdn_urls
  84. # 6. 标记已获取详情
  85. post.detail_fetched = True
  86. return True
  87. except Exception as e:
  88. print(f" ✗ 补充详情失败: {e}")
  89. return False
  90. def enrich_posts_batch(
  91. posts: List[Any],
  92. detail_client: Any,
  93. show_progress: bool = True,
  94. delay: int = 1
  95. ) -> tuple[int, int]:
  96. """
  97. 批量补充帖子详情
  98. Args:
  99. posts: Post对象列表(会被直接修改)
  100. detail_client: XiaohongshuDetail实例
  101. show_progress: 是否显示进度
  102. delay: 请求间隔(秒)
  103. Returns:
  104. (成功数量, 失败数量)
  105. """
  106. success_count = 0
  107. fail_count = 0
  108. total = len(posts)
  109. for idx, post in enumerate(posts, 1):
  110. if show_progress:
  111. print(f"补充详情 ({idx}/{total}): {post.note_id}")
  112. try:
  113. # 调用详情API
  114. detail_response = detail_client.get_detail(post.note_id)
  115. # 合并数据
  116. if enrich_post_with_detail(post, detail_response):
  117. success_count += 1
  118. if show_progress:
  119. print(f" ✓ 成功补充")
  120. else:
  121. fail_count += 1
  122. if show_progress:
  123. print(f" ✗ 补充失败")
  124. except Exception as e:
  125. fail_count += 1
  126. if show_progress:
  127. print(f" ✗ 请求失败: {e}")
  128. # 避免请求过快(最后一个不需要延迟)
  129. if idx < total and delay > 0:
  130. import time
  131. time.sleep(delay)
  132. return success_count, fail_count
  133. def create_enriched_summary(post: Any) -> Dict[str, Any]:
  134. """
  135. 创建包含详情的帖子摘要(用于保存)
  136. Args:
  137. post: Post对象
  138. Returns:
  139. 摘要字典
  140. """
  141. summary = {
  142. # 基础信息
  143. "note_id": post.note_id,
  144. "note_url": post.note_url,
  145. "title": post.title,
  146. "body_text": post.body_text,
  147. "type": post.type,
  148. # 媒体信息
  149. "images": post.images,
  150. "cdn_images": post.cdn_images,
  151. "video": post.video,
  152. # 作者信息(详情补充)
  153. "author": {
  154. "name": post.author_name,
  155. "id": post.author_id
  156. } if post.detail_fetched else {},
  157. # 互动信息
  158. "interact_info": post.interact_info,
  159. # 时间信息
  160. "publish_time": post.publish_time,
  161. "publish_time_readable": datetime.fromtimestamp(
  162. post.publish_time / 1000
  163. ).strftime("%Y-%m-%d %H:%M:%S") if post.publish_time > 0 else "",
  164. # 元数据
  165. "detail_fetched": post.detail_fetched
  166. }
  167. return summary
  168. def print_enrichment_stats(posts: List[Any]) -> None:
  169. """
  170. 打印详情补充统计信息
  171. Args:
  172. posts: Post对象列表
  173. """
  174. total = len(posts)
  175. enriched = sum(1 for p in posts if p.detail_fetched)
  176. video_count = sum(1 for p in posts if p.type == "video")
  177. image_count = total - video_count
  178. print("\n" + "=" * 60)
  179. print("详情补充统计")
  180. print("=" * 60)
  181. print(f"总帖子数: {total}")
  182. print(f" - 图文帖: {image_count}")
  183. print(f" - 视频帖: {video_count}")
  184. print(f"\n已补充详情: {enriched}/{total} ({enriched*100//total if total > 0 else 0}%)")
  185. print(f"未补充详情: {total - enriched}")
  186. if enriched > 0:
  187. print("\n详情字段统计:")
  188. has_author = sum(1 for p in posts if p.author_name)
  189. has_publish_time = sum(1 for p in posts if p.publish_time > 0)
  190. has_cdn_images = sum(1 for p in posts if p.cdn_images)
  191. has_video_url = sum(1 for p in posts if p.video and p.type == "video")
  192. print(f" - 作者信息: {has_author}/{enriched}")
  193. print(f" - 发布时间: {has_publish_time}/{enriched}")
  194. print(f" - 高清图片: {has_cdn_images}/{image_count} (图文帖)")
  195. print(f" - 视频URL: {has_video_url}/{video_count} (视频帖)")
  196. print("=" * 60 + "\n")
  197. # ============================================================================
  198. # 使用示例
  199. # ============================================================================
  200. if __name__ == "__main__":
  201. print("这是一个辅助模块,请通过 search_with_detail.py 使用")
  202. print("\n主要功能:")
  203. print("1. parse_detail_result() - 解析详情API响应")
  204. print("2. enrich_post_with_detail() - 补充单个帖子详情")
  205. print("3. enrich_posts_batch() - 批量补充详情")
  206. print("4. create_enriched_summary() - 创建详情摘要")
  207. print("5. print_enrichment_stats() - 打印统计信息")