weixin_video_crawler.py 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. """
  2. @author: luojunhui
  3. 抓取视频
  4. """
  5. import json
  6. import time
  7. import traceback
  8. from typing import List, Dict
  9. from tqdm import tqdm
  10. from applications import bot
  11. from applications import log
  12. from applications import Functions
  13. from applications import WeixinSpider
  14. from applications import longArticlesMySQL
  15. from applications.const import WeixinVideoCrawlerConst
  16. spider = WeixinSpider()
  17. const = WeixinVideoCrawlerConst()
  18. functions = Functions()
  19. class WeixinVideoCrawler(object):
  20. """
  21. 微信视频抓取
  22. """
  23. def __init__(self):
  24. self.db_client = longArticlesMySQL()
  25. def update_account_latest_crawler_timestamp(self, gh_id: str) -> int:
  26. """
  27. 更新最新抓取时间戳
  28. :param gh_id:
  29. :return:
  30. """
  31. update_sql = f"""
  32. UPDATE weixin_account_for_videos
  33. SET latest_crawler_timestamp = (
  34. SELECT max(publish_timestamp)
  35. FROM publish_single_video_source
  36. WHERE out_account_id = %s
  37. )
  38. WHERE gh_id = %s;
  39. """
  40. affected_rows = self.db_client.update(
  41. sql=update_sql,
  42. params=(gh_id, gh_id)
  43. )
  44. return affected_rows
  45. def get_crawler_accounts(self) -> List[Dict]:
  46. """
  47. 获取微信公众号列表
  48. :return:
  49. """
  50. select_sql = f"""
  51. SELECT gh_id, account_name, latest_crawler_timestamp
  52. FROM weixin_account_for_videos
  53. WHERE status = {const.ACCOUNT_CRAWL_STATUS};
  54. """
  55. response = self.db_client.select_json(select_sql)
  56. return response
  57. def crawler_article_video_list(self, account_obj: Dict, cursor=None):
  58. """
  59. 抓取单个账号的文章列表,获取视频
  60. :param cursor:
  61. :param account_obj:
  62. :return: 返回待下载的视频列表
  63. """
  64. gh_id = account_obj["gh_id"]
  65. account_name = account_obj["account_name"]
  66. latest_crawler_timestamp = account_obj["latest_crawler_timestamp"]
  67. if latest_crawler_timestamp is None:
  68. latest_crawler_timestamp = const.DEFAULT_TIMESTAMP
  69. # 调用爬虫接口
  70. response = spider.update_msg_list(gh_id, index=cursor)
  71. if response['code'] == const.REQUEST_SUCCESS:
  72. # 一般返回最近10天的msg_list
  73. msg_list = response.get('data', {}).get("data", [])
  74. if msg_list:
  75. last_msg = msg_list[-1]
  76. last_msg_base_info = last_msg['AppMsg']['BaseInfo']
  77. last_msg_create_timestamp = last_msg_base_info['CreateTime']
  78. self.insert_msg_list(account_name=account_name, gh_id=gh_id, msg_list=msg_list)
  79. if last_msg_create_timestamp > latest_crawler_timestamp:
  80. next_cursor = response['data']['next_cursor']
  81. return self.crawler_article_video_list(account_obj=account_obj, cursor=next_cursor)
  82. else:
  83. return []
  84. else:
  85. return []
  86. return []
  87. def is_downloaded(self, url_unique: str) -> bool:
  88. """
  89. 判断该视频是否已经下载
  90. :param url_unique:
  91. :return:
  92. """
  93. select_sql = f"""
  94. SELECT id
  95. FROM publish_single_video_source
  96. WHERE url_unique_md5 = '{url_unique}';
  97. """
  98. response = self.db_client.select(select_sql)
  99. if response:
  100. return True
  101. else:
  102. return False
  103. def insert_msg_list(self, account_name, gh_id, msg_list: List[Dict]) -> None:
  104. """
  105. 插入视频信息
  106. :param gh_id:
  107. :param account_name:
  108. :param msg_list:
  109. :return:
  110. """
  111. for info in msg_list:
  112. create_time = info.get("AppMsg", {}).get("BaseInfo", {}).get("CreateTime", None)
  113. publish_type = info.get("AppMsg", {}).get("BaseInfo", {}).get("Type", None)
  114. detail_article_list = info.get("AppMsg", {}).get("DetailInfo", [])
  115. if detail_article_list:
  116. for article in tqdm(detail_article_list, desc="crawler_in_msg_list"):
  117. article_url = article.get("ContentUrl", None)
  118. url_unique = functions.generateGzhId(article_url)
  119. # 判断该视频链接是否下载,若已经下载则直接跳过
  120. if self.is_downloaded(url_unique):
  121. continue
  122. try:
  123. download_path = functions.download_gzh_video(article_url)
  124. if download_path:
  125. oss_path = functions.upload_to_oss(local_video_path=download_path)
  126. title = article.get("Title", None)
  127. position = article.get("ItemIndex", None)
  128. cover_url = article.get("CoverImgUrl", None)
  129. show_desc = article.get("ShowDesc", None)
  130. show_stat = functions.show_desc_to_sta(show_desc)
  131. read_cnt = show_stat.get("show_view_count", 0)
  132. like_cnt = show_stat.get("show_like_count", 0)
  133. insert_sql = f"""
  134. INSERT INTO publish_single_video_source
  135. (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_index, article_publish_type, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5)
  136. values
  137. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  138. """
  139. try:
  140. self.db_client.update(
  141. sql=insert_sql,
  142. params=(
  143. "video" + url_unique,
  144. title,
  145. gh_id,
  146. account_name,
  147. read_cnt,
  148. like_cnt,
  149. position,
  150. publish_type,
  151. article_url,
  152. cover_url,
  153. oss_path,
  154. create_time,
  155. int(time.time()),
  156. url_unique
  157. )
  158. )
  159. log(
  160. task='weixin_video_crawler',
  161. function="insert_msg_list",
  162. message="插入一条视频",
  163. data={"account_name": account_name, "url": article_url}
  164. )
  165. except Exception as e:
  166. try:
  167. update_sql = f"""
  168. UPDATE publish_single_video_source
  169. SET read_cnt = %s, like_cnt = %s
  170. WHERE url_unique_md5 = %s;
  171. """
  172. self.db_client.update(
  173. sql=update_sql,
  174. params=(read_cnt, like_cnt, functions.generateGzhId(article_url))
  175. )
  176. except Exception as e:
  177. error_stack = traceback.format_exc()
  178. log(
  179. task='weixin_video_crawler',
  180. function="update_msg_list",
  181. status="fail",
  182. message="更新内容失败",
  183. data={"error": str(e), "error_stack": error_stack, "url": article_url}
  184. )
  185. else:
  186. continue
  187. except Exception as e:
  188. error_stack = traceback.format_exc()
  189. log(
  190. task='weixin_video_crawler',
  191. function="update_msg_list",
  192. status="fail",
  193. message="更新内容失败",
  194. data={"error": str(e), "error_stack": error_stack, "url": article_url}
  195. )
  196. def crawler_task(self):
  197. """
  198. 抓取任务
  199. :return:
  200. """
  201. account_list = self.get_crawler_accounts()
  202. for account_obj in tqdm(account_list, desc="crawler_video_for_each_account"):
  203. self.crawler_article_video_list(account_obj)
  204. self.update_account_latest_crawler_timestamp(gh_id=account_obj["gh_id"])
  205. time.sleep(const.SLEEP_SECONDS)
  206. def mention(self, start_timestamp):
  207. """
  208. 飞书发送消息
  209. :param start_timestamp:
  210. :return:
  211. """
  212. sql = f"""select count(1) from publish_single_video_source where crawler_timestamp >= {start_timestamp};"""
  213. response = self.db_client.select(sql)
  214. new_articles_count = response[0][0]
  215. bot(
  216. title='微信抓取任务执行完成',
  217. detail={
  218. "新增视频数量": new_articles_count
  219. }
  220. )
  221. def run(self):
  222. """
  223. 执行任务
  224. :return:
  225. """
  226. start_timestamp = int(time.time())
  227. self.crawler_task()
  228. self.mention(start_timestamp)