@@ -0,0 +1,287 @@
+"""
+@author: luojunhui
+Crawl videos from WeChat official accounts.
+"""
+import json
+import time
+import traceback
+from typing import List, Dict
+
+from tqdm import tqdm
+from pymysql.cursors import DictCursor
+
+from config import apolloConfig
+from applications import bot
+from applications import log
+from applications import Functions
+from applications import WeixinSpider
+from applications import longArticlesMySQL
+from applications.const import WeixinVideoCrawlerConst
+
+spider = WeixinSpider()
+functions = Functions()
+config = apolloConfig(env="prod")
+const = WeixinVideoCrawlerConst()
+
+
+class WeixinVideoCrawler(object):
+    """
+    WeChat video crawler
+    """
+
+    def __init__(self):
+        self.db_client = longArticlesMySQL()
+        self.festival_list = json.loads(config.getConfigValue("festival"))
+
+    def is_festival(self, title: str) -> bool:
+        """
+        Check whether the title mentions a festival.
+        :param title:
+        :return:
+        """
+        for festival in self.festival_list:
+            if festival in title:
+                return True
+        return False
+
+    def get_title_status(self, title: str) -> int:
+        """
+        Determine the article's status from its title.
+        :param title:
+        :return:
+        """
+        if self.is_festival(title):
+            return const.TITLE_FESTIVAL_STATUS
+        elif len(title) < const.TITLE_MIN_LENGTH:
+            return const.TITLE_SHORT_STATUS
+        else:
+            return const.TITLE_DEFAULT_STATUS
+
+    def update_account_latest_crawler_timestamp(self, gh_id: str) -> int:
+        """
+        Update the account's latest crawl timestamp to the newest
+        publish_timestamp already stored for it.
+        :param gh_id:
+        :return:
+        """
+        update_sql = """
+            UPDATE weixin_account_for_videos
+            SET latest_crawler_timestamp = (
+                SELECT max(publish_timestamp)
+                FROM publish_single_video_source
+                WHERE out_account_id = %s
+            )
+            WHERE gh_id = %s;
+        """
+        affected_rows = self.db_client.update(
+            sql=update_sql,
+            params=(gh_id, gh_id)
+        )
+        return affected_rows
+
+    def get_crawler_accounts(self) -> List[Dict]:
+        """
+        Fetch the list of WeChat official accounts to crawl.
+        :return:
+        """
+        select_sql = f"""
+            SELECT gh_id, account_name, latest_crawler_timestamp
+            FROM weixin_account_for_videos
+            WHERE status = {const.ACCOUNT_CRAWL_STATUS};
+        """
+        response = self.db_client.select(select_sql, DictCursor)
+        return response
+
+    def crawler_article_video_list(self, account_obj: Dict, cursor=None):
+        """
+        Crawl one account's article list and collect its videos.
+        Pages are fetched recursively until a message older than the
+        account's last crawl is reached. Videos are persisted by
+        insert_msg_list, so the return value is always an empty list.
+        :param cursor:
+        :param account_obj:
+        :return:
+        """
+        gh_id = account_obj["gh_id"]
+        account_name = account_obj["account_name"]
+        latest_crawler_timestamp = account_obj["latest_crawler_timestamp"]
+        if latest_crawler_timestamp is None:
+            latest_crawler_timestamp = const.DEFAULT_TIMESTAMP
+        # Call the spider API
+        response = spider.update_msg_list(gh_id, index=cursor)
+        if response['code'] == const.REQUEST_SUCCESS:
+            # Each page usually covers roughly the last 10 days of messages
+            msg_list = response.get('data', {}).get("data", [])
+            if msg_list:
+                last_msg = msg_list[-1]
+                last_msg_base_info = last_msg['AppMsg']['BaseInfo']
+                last_msg_create_timestamp = last_msg_base_info['CreateTime']
+                self.insert_msg_list(account_name=account_name, gh_id=gh_id, msg_list=msg_list)
+                if last_msg_create_timestamp > latest_crawler_timestamp:
+                    next_cursor = response['data']['next_cursor']
+                    return self.crawler_article_video_list(account_obj=account_obj, cursor=next_cursor)
+                else:
+                    return []
+            else:
+                return []
+        return []
+
+    def is_downloaded(self, url_unique: str) -> bool:
+        """
+        Check whether this video has already been downloaded.
+        :param url_unique:
+        :return:
+        """
+        select_sql = f"""
+            SELECT id
+            FROM publish_single_video_source
+            WHERE url_unique_md5 = '{url_unique}';
+        """
+        response = self.db_client.select(select_sql)
+        return bool(response)
+
+    def insert_msg_list(self, account_name, gh_id, msg_list: List[Dict]) -> None:
+        """
+        Download each video in the message list and insert its metadata.
+        :param gh_id:
+        :param account_name:
+        :param msg_list:
+        :return:
+        """
+        for info in msg_list:
+            create_time = info.get("AppMsg", {}).get("BaseInfo", {}).get("CreateTime", None)
+            publish_type = info.get("AppMsg", {}).get("BaseInfo", {}).get("Type", None)
+            detail_article_list = info.get("AppMsg", {}).get("DetailInfo", [])
+            if detail_article_list:
+                for article in tqdm(detail_article_list, desc="{}: crawler_in_msg_list".format(account_name)):
+                    article_url = article.get("ContentUrl", None)
+                    url_unique = functions.generateGzhId(article_url)
+                    # Skip videos that have already been downloaded
+                    if self.is_downloaded(url_unique):
+                        continue
+                    try:
+                        download_path = functions.download_gzh_video(article_url)
+                        if download_path:
+                            oss_path = functions.upload_to_oss(local_video_path=download_path)
+                            title = article.get("Title", None)
+                            position = article.get("ItemIndex", None)
+                            cover_url = article.get("CoverImgUrl", None)
+                            show_desc = article.get("ShowDesc", None)
+                            show_stat = functions.show_desc_to_sta(show_desc)
+                            read_cnt = show_stat.get("show_view_count", 0)
+                            like_cnt = show_stat.get("show_like_count", 0)
+                            title_status = self.get_title_status(title)
+                            insert_sql = """
+                                INSERT INTO publish_single_video_source
+                                (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_index, article_publish_type, article_url, cover_url, video_oss_path, bad_status, publish_timestamp, crawler_timestamp, url_unique_md5)
+                                VALUES
+                                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                            """
+                            try:
+                                self.db_client.update(
+                                    sql=insert_sql,
+                                    params=(
+                                        "video" + url_unique,
+                                        title,
+                                        gh_id,
+                                        account_name,
+                                        read_cnt,
+                                        like_cnt,
+                                        position,
+                                        publish_type,
+                                        article_url,
+                                        cover_url,
+                                        oss_path,
+                                        title_status,
+                                        create_time,
+                                        int(time.time()),
+                                        url_unique
+                                    )
+                                )
+                                log(
+                                    task='weixin_video_crawler',
+                                    function="insert_msg_list",
+                                    message="inserted one video",
+                                    data={"account_name": account_name, "url": article_url}
+                                )
+                            except Exception as e:
+                                # The insert fails when the record exists (duplicate
+                                # url_unique_md5); fall back to refreshing the counts.
+                                try:
+                                    update_sql = """
+                                        UPDATE publish_single_video_source
+                                        SET read_cnt = %s, like_cnt = %s
+                                        WHERE url_unique_md5 = %s;
+                                    """
+                                    self.db_client.update(
+                                        sql=update_sql,
+                                        params=(read_cnt, like_cnt, url_unique)
+                                    )
+                                except Exception as e:
+                                    error_stack = traceback.format_exc()
+                                    log(
+                                        task='weixin_video_crawler',
+                                        function="update_msg_list",
+                                        status="fail",
+                                        message="failed to update the record",
+                                        data={"error": str(e), "error_stack": error_stack, "url": article_url}
+                                    )
+                    except Exception as e:
+                        error_stack = traceback.format_exc()
+                        log(
+                            task='weixin_video_crawler',
+                            function="insert_msg_list",
+                            status="fail",
+                            message="failed to process the article",
+                            data={"error": str(e), "error_stack": error_stack, "url": article_url}
+                        )
+
+    def crawler_task(self):
+        """
+        Crawl task: iterate over all accounts and crawl each in turn.
+        :return:
+        """
+        account_list = self.get_crawler_accounts()
+        for account_obj in tqdm(account_list, desc="crawler_video_for_each_account"):
+            try:
+                self.crawler_article_video_list(account_obj)
+                self.update_account_latest_crawler_timestamp(gh_id=account_obj["gh_id"])
+                time.sleep(const.SLEEP_SECONDS)
+            except Exception as e:
+                error_stack = traceback.format_exc()
+                log(
+                    task='weixin_video_crawler',
+                    function="crawler_task",
+                    status="fail",
+                    message="crawl task failed for a single account",
+                    data={"error": str(e), "error_stack": error_stack, "account_name": account_obj["account_name"]}
+                )
+
+    def mention(self, start_timestamp):
+        """
+        Send a completion notice via Feishu.
+        :param start_timestamp:
+        :return:
+        """
+        sql = f"""select count(1) from publish_single_video_source where crawler_timestamp >= {start_timestamp};"""
+        response = self.db_client.select(sql)
+        new_articles_count = response[0][0]
+        bot(
+            title='WeChat video crawl task finished',
+            detail={
+                "new videos crawled": new_articles_count
+            }
+        )
+
+    def run(self):
+        """
+        Entry point: crawl, then send the notification.
+        :return:
+        """
+        start_timestamp = int(time.time())
+        self.crawler_task()
+        self.mention(start_timestamp)
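+
+
+# A minimal entry-point sketch, assuming the module is meant to be run
+# directly; the repository may instead invoke WeixinVideoCrawler.run()
+# from a scheduler.
+if __name__ == "__main__":
+    weixin_video_crawler = WeixinVideoCrawler()
+    weixin_video_crawler.run()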