Parcourir la source

测试账号抓取

luojunhui il y a 10 mois
Parent
commit
66800efb69
1 fichier modifié avec 6 ajouts et 10 suppressions
  1. 6 10
      coldStartTasks/crawler/weixin_video_crawler.py

+ 6 - 10
coldStartTasks/crawler/weixin_video_crawler.py

@@ -75,10 +75,8 @@ class WeixinVideoCrawler(object):
         latest_crawler_timestamp = account_obj["latest_crawler_timestamp"]
         if latest_crawler_timestamp is None:
             latest_crawler_timestamp = const.DEFAULT_TIMESTAMP
-        print(gh_id, account_name, latest_crawler_timestamp)
         # 调用爬虫接口
         response = spider.update_msg_list(gh_id, index=cursor)
-        print(json.dumps(response, ensure_ascii=False, indent=4))
         if response['code'] == const.REQUEST_SUCCESS:
             # 一般返回最近10天的msg_list
             msg_list = response.get('data', {}).get("data", [])
@@ -103,11 +101,11 @@ class WeixinVideoCrawler(object):
         :return:
         """
         select_sql = f"""
-            SELECT count(1)
+            SELECT id
             FROM publish_single_video_source
             WHERE url_unique_md5 = '{url_unique}';
         """
-        response = self.db_client.select_json(select_sql)
+        response = self.db_client.select(select_sql)
         if response:
             return True
         else:
@@ -126,13 +124,12 @@ class WeixinVideoCrawler(object):
             publish_type = info.get("AppMsg", {}).get("BaseInfo", {}).get("Type", None)
             detail_article_list = info.get("AppMsg", {}).get("DetailInfo", [])
             if detail_article_list:
-                for article in detail_article_list:
+                for article in tqdm(detail_article_list, desc="crawler_in_msg_list"):
                     article_url = article.get("ContentUrl", None)
                     url_unique = functions.generateGzhId(article_url)
                     # 判断该视频链接是否下载,若已经下载则直接跳过
                     if self.is_downloaded(url_unique):
                         continue
-
                     try:
                         download_path = functions.download_gzh_video(article_url)
                         if download_path:
@@ -177,7 +174,6 @@ class WeixinVideoCrawler(object):
                                     data={"account_name": account_name, "url": article_url}
                                 )
                             except Exception as e:
-                                print(str(e))
                                 try:
                                     update_sql = f"""
                                         UPDATE publish_single_video_source
@@ -217,7 +213,7 @@ class WeixinVideoCrawler(object):
         :return:
         """
         account_list = self.get_crawler_accounts()
-        for account_obj in tqdm(account_list[3:4], desc="crawler_video_for_each_account"):
+        for account_obj in tqdm(account_list, desc="crawler_video_for_each_account"):
             self.crawler_article_video_list(account_obj)
             self.update_account_latest_crawler_timestamp(gh_id=account_obj["gh_id"])
             time.sleep(const.SLEEP_SECONDS)
@@ -243,9 +239,9 @@ class WeixinVideoCrawler(object):
         执行任务
         :return:
         """
-        # start_timestamp = int(time.time())
+        start_timestamp = int(time.time())
         self.crawler_task()
-        # self.mention(start_timestamp)
+        self.mention(start_timestamp)