Browse Source

上线账号抓取v2

luojunhui 6 months ago
parent
commit
25ce8926fd
1 changed file with 93 additions and 61 deletions
  1. 93 61
      coldStartTasks/crawler/weixin_video_crawler.py

+ 93 - 61
coldStartTasks/crawler/weixin_video_crawler.py

@@ -93,6 +93,23 @@ class WeixinVideoCrawler(object):
             return []
         return []
 
+    def is_downloaded(self, url_unique: str) -> bool:
+        """
+        Return True when a row with this url_unique_md5 already exists.
+
+        :param url_unique: value matched against the url_unique_md5 column
+        :return: True if the query yields at least one row, otherwise False
+        """
+        # An aggregate count(1) query returns one row even when nothing
+        # matches, so select the row itself and test for an empty result.
+        # NOTE(review): url_unique is interpolated; bind it if supported.
+        select_sql = f"""
+            SELECT url_unique_md5
+            FROM publish_single_video_source
+            WHERE url_unique_md5 = '{url_unique}' LIMIT 1;
+        """
+        return bool(self.db_client.select_json(select_sql))
+
     def insert_msg_list(self, account_name, gh_id, msg_list: List[Dict]) -> None:
         """
         插入视频信息
@@ -108,73 +125,88 @@ class WeixinVideoCrawler(object):
             if detail_article_list:
                 for article in detail_article_list:
                     article_url = article.get("ContentUrl", None)
-                    download_path = functions.download_gzh_video(article_url)
-                    if download_path:
-                        oss_path = functions.upload_to_oss(local_video_path=download_path)
-                        title = article.get("Title", None)
-                        position = article.get("ItemIndex", None)
-                        cover_url = article.get("CoverImgUrl", None)
-                        show_desc = article.get("ShowDesc", None)
-                        show_stat = functions.show_desc_to_sta(show_desc)
-                        read_cnt = show_stat.get("show_view_count", 0)
-                        like_cnt = show_stat.get("show_like_count", 0)
-                        url_unique = functions.generateGzhId(article_url)
-                        insert_sql = f"""
-                            INSERT INTO publish_single_video_source
-                            (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_index, article_publish_type, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5)
-                            values
-                            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
-                        """
-                        try:
-                            self.db_client.update(
-                                sql=insert_sql,
-                                params=(
-                                    "video" + url_unique,
-                                    title,
-                                    gh_id,
-                                    account_name,
-                                    read_cnt,
-                                    like_cnt,
-                                    position,
-                                    publish_type,
-                                    article_url,
-                                    cover_url,
-                                    oss_path,
-                                    create_time,
-                                    int(time.time()),
-                                    url_unique
-                                )
-                            )
-                            log(
-                                task='weixin_video_crawler',
-                                function="insert_msg_list",
-                                message="插入一条视频",
-                                data={"account_name": account_name, "url": article_url}
-                            )
-                        except Exception as e:
-                            print(str(e))
+                    url_unique = functions.generateGzhId(article_url)
+                    # 判断该视频链接是否下载,若已经下载则直接跳过
+                    if self.is_downloaded(url_unique):
+                        continue
+
+                    try:
+                        download_path = functions.download_gzh_video(article_url)
+                        if download_path:
+                            oss_path = functions.upload_to_oss(local_video_path=download_path)
+                            title = article.get("Title", None)
+                            position = article.get("ItemIndex", None)
+                            cover_url = article.get("CoverImgUrl", None)
+                            show_desc = article.get("ShowDesc", None)
+                            show_stat = functions.show_desc_to_sta(show_desc)
+                            read_cnt = show_stat.get("show_view_count", 0)
+                            like_cnt = show_stat.get("show_like_count", 0)
+                            insert_sql = f"""
+                                INSERT INTO publish_single_video_source
+                                (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_index, article_publish_type, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5)
+                                values
+                                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                            """
                             try:
-                                update_sql = f"""
-                                    UPDATE publish_single_video_source
-                                    SET read_cnt = %s, like_cnt = %s
-                                    WHERE url_unique_md5 = %s;
-                                """
                                 self.db_client.update(
-                                    sql=update_sql,
-                                    params=(read_cnt, like_cnt, functions.generateGzhId(article_url))
+                                    sql=insert_sql,
+                                    params=(
+                                        "video" + url_unique,
+                                        title,
+                                        gh_id,
+                                        account_name,
+                                        read_cnt,
+                                        like_cnt,
+                                        position,
+                                        publish_type,
+                                        article_url,
+                                        cover_url,
+                                        oss_path,
+                                        create_time,
+                                        int(time.time()),
+                                        url_unique
+                                    )
                                 )
-                            except Exception as e:
-                                error_stack = traceback.format_exc()
                                 log(
                                     task='weixin_video_crawler',
-                                    function="update_msg_list",
-                                    status="fail",
-                                    message="更新内容失败",
-                                    data={"error": str(e), "error_stack": error_stack, "url": article_url}
-
+                                    function="insert_msg_list",
+                                    message="插入一条视频",
+                                    data={"account_name": account_name, "url": article_url}
                                 )
-                    else:
-                        continue
+                            except Exception as e:
+                                print(str(e))
+                                try:
+                                    update_sql = f"""
+                                        UPDATE publish_single_video_source
+                                        SET read_cnt = %s, like_cnt = %s
+                                        WHERE url_unique_md5 = %s;
+                                    """
+                                    self.db_client.update(
+                                        sql=update_sql,
+                                        params=(read_cnt, like_cnt, functions.generateGzhId(article_url))
+                                    )
+                                except Exception as e:
+                                    error_stack = traceback.format_exc()
+                                    log(
+                                        task='weixin_video_crawler',
+                                        function="update_msg_list",
+                                        status="fail",
+                                        message="更新内容失败",
+                                        data={"error": str(e), "error_stack": error_stack, "url": article_url}
+
+                                    )
+                        else:
+                            continue
+                    except Exception as e:
+                        error_stack = traceback.format_exc()
+                        log(
+                            task='weixin_video_crawler',
+                            function="update_msg_list",
+                            status="fail",
+                            message="更新内容失败",
+                            data={"error": str(e), "error_stack": error_stack, "url": article_url}
+
+                        )
 
     def crawler_task(self):
         """