|
@@ -93,6 +93,23 @@ class WeixinVideoCrawler(object):
|
|
|
return []
|
|
|
return []
|
|
|
|
|
|
def is_downloaded(self, url_unique: str) -> bool:
    """
    Check whether a video has already been downloaded.

    A video counts as downloaded when ``publish_single_video_source``
    already holds a row whose ``url_unique_md5`` equals ``url_unique``.

    :param url_unique: unique identifier of the article URL, compared
        against the ``url_unique_md5`` column.
    :return: True if a matching row exists, otherwise False.
    """
    # Select the matching row itself rather than ``count(1)``: an aggregate
    # without GROUP BY yields one row even when nothing matches, so the
    # truthiness of its result set cannot tell "present" from "absent".
    # LIMIT 1 stops the lookup at the first hit.
    #
    # NOTE(review): the value is interpolated into the SQL text. The visible
    # caller passes the output of functions.generateGzhId (presumably a hex
    # digest -- confirm); switch to bound parameters if select_json supports
    # them.
    select_sql = f"""
        SELECT 1
        FROM publish_single_video_source
        WHERE url_unique_md5 = '{url_unique}'
        LIMIT 1;
    """
    # Assumes select_json returns a sequence of rows that is empty when
    # nothing matched -- TODO confirm against the db client.
    response = self.db_client.select_json(select_sql)
    return bool(response)
|
|
|
+
|
|
|
def insert_msg_list(self, account_name, gh_id, msg_list: List[Dict]) -> None:
|
|
|
"""
|
|
|
插入视频信息
|
|
@@ -108,73 +125,88 @@ class WeixinVideoCrawler(object):
|
|
|
if detail_article_list:
|
|
|
for article in detail_article_list:
|
|
|
article_url = article.get("ContentUrl", None)
|
|
|
- download_path = functions.download_gzh_video(article_url)
|
|
|
- if download_path:
|
|
|
- oss_path = functions.upload_to_oss(local_video_path=download_path)
|
|
|
- title = article.get("Title", None)
|
|
|
- position = article.get("ItemIndex", None)
|
|
|
- cover_url = article.get("CoverImgUrl", None)
|
|
|
- show_desc = article.get("ShowDesc", None)
|
|
|
- show_stat = functions.show_desc_to_sta(show_desc)
|
|
|
- read_cnt = show_stat.get("show_view_count", 0)
|
|
|
- like_cnt = show_stat.get("show_like_count", 0)
|
|
|
- url_unique = functions.generateGzhId(article_url)
|
|
|
- insert_sql = f"""
|
|
|
- INSERT INTO publish_single_video_source
|
|
|
- (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_index, article_publish_type, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5)
|
|
|
- values
|
|
|
- (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
|
- """
|
|
|
- try:
|
|
|
- self.db_client.update(
|
|
|
- sql=insert_sql,
|
|
|
- params=(
|
|
|
- "video" + url_unique,
|
|
|
- title,
|
|
|
- gh_id,
|
|
|
- account_name,
|
|
|
- read_cnt,
|
|
|
- like_cnt,
|
|
|
- position,
|
|
|
- publish_type,
|
|
|
- article_url,
|
|
|
- cover_url,
|
|
|
- oss_path,
|
|
|
- create_time,
|
|
|
- int(time.time()),
|
|
|
- url_unique
|
|
|
- )
|
|
|
- )
|
|
|
- log(
|
|
|
- task='weixin_video_crawler',
|
|
|
- function="insert_msg_list",
|
|
|
- message="插入一条视频",
|
|
|
- data={"account_name": account_name, "url": article_url}
|
|
|
- )
|
|
|
- except Exception as e:
|
|
|
- print(str(e))
|
|
|
+ url_unique = functions.generateGzhId(article_url)
|
|
|
+ # 判断该视频链接是否下载,若已经下载则直接跳过
|
|
|
+ if self.is_downloaded(url_unique):
|
|
|
+ continue
|
|
|
+
|
|
|
+ try:
|
|
|
+ download_path = functions.download_gzh_video(article_url)
|
|
|
+ if download_path:
|
|
|
+ oss_path = functions.upload_to_oss(local_video_path=download_path)
|
|
|
+ title = article.get("Title", None)
|
|
|
+ position = article.get("ItemIndex", None)
|
|
|
+ cover_url = article.get("CoverImgUrl", None)
|
|
|
+ show_desc = article.get("ShowDesc", None)
|
|
|
+ show_stat = functions.show_desc_to_sta(show_desc)
|
|
|
+ read_cnt = show_stat.get("show_view_count", 0)
|
|
|
+ like_cnt = show_stat.get("show_like_count", 0)
|
|
|
+ insert_sql = f"""
|
|
|
+ INSERT INTO publish_single_video_source
|
|
|
+ (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_index, article_publish_type, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5)
|
|
|
+ values
|
|
|
+ (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
|
+ """
|
|
|
try:
|
|
|
- update_sql = f"""
|
|
|
- UPDATE publish_single_video_source
|
|
|
- SET read_cnt = %s, like_cnt = %s
|
|
|
- WHERE url_unique_md5 = %s;
|
|
|
- """
|
|
|
self.db_client.update(
|
|
|
- sql=update_sql,
|
|
|
- params=(read_cnt, like_cnt, functions.generateGzhId(article_url))
|
|
|
+ sql=insert_sql,
|
|
|
+ params=(
|
|
|
+ "video" + url_unique,
|
|
|
+ title,
|
|
|
+ gh_id,
|
|
|
+ account_name,
|
|
|
+ read_cnt,
|
|
|
+ like_cnt,
|
|
|
+ position,
|
|
|
+ publish_type,
|
|
|
+ article_url,
|
|
|
+ cover_url,
|
|
|
+ oss_path,
|
|
|
+ create_time,
|
|
|
+ int(time.time()),
|
|
|
+ url_unique
|
|
|
+ )
|
|
|
)
|
|
|
- except Exception as e:
|
|
|
- error_stack = traceback.format_exc()
|
|
|
log(
|
|
|
task='weixin_video_crawler',
|
|
|
- function="update_msg_list",
|
|
|
- status="fail",
|
|
|
- message="更新内容失败",
|
|
|
- data={"error": str(e), "error_stack": error_stack, "url": article_url}
|
|
|
-
|
|
|
+ function="insert_msg_list",
|
|
|
+ message="插入一条视频",
|
|
|
+ data={"account_name": account_name, "url": article_url}
|
|
|
)
|
|
|
- else:
|
|
|
- continue
|
|
|
+ except Exception as e:
|
|
|
+ print(str(e))
|
|
|
+ try:
|
|
|
+ update_sql = f"""
|
|
|
+ UPDATE publish_single_video_source
|
|
|
+ SET read_cnt = %s, like_cnt = %s
|
|
|
+ WHERE url_unique_md5 = %s;
|
|
|
+ """
|
|
|
+ self.db_client.update(
|
|
|
+ sql=update_sql,
|
|
|
+ params=(read_cnt, like_cnt, functions.generateGzhId(article_url))
|
|
|
+ )
|
|
|
+ except Exception as e:
|
|
|
+ error_stack = traceback.format_exc()
|
|
|
+ log(
|
|
|
+ task='weixin_video_crawler',
|
|
|
+ function="update_msg_list",
|
|
|
+ status="fail",
|
|
|
+ message="更新内容失败",
|
|
|
+ data={"error": str(e), "error_stack": error_stack, "url": article_url}
|
|
|
+
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ continue
|
|
|
+ except Exception as e:
|
|
|
+ error_stack = traceback.format_exc()
|
|
|
+ log(
|
|
|
+ task='weixin_video_crawler',
|
|
|
+ function="update_msg_list",
|
|
|
+ status="fail",
|
|
|
+ message="更新内容失败",
|
|
|
+ data={"error": str(e), "error_stack": error_stack, "url": article_url}
|
|
|
+
|
|
|
+ )
|
|
|
|
|
|
def crawler_task(self):
|
|
|
"""
|