Sfoglia il codice sorgente

中青看点用户数据爬取每次从第一个用户开始爬

zhangliang 1 settimana fa
parent
commit
aeecf86c22
1 file modificato con 9 aggiunte e 14 eliminazioni
  1. 9 14
      spider/crawler_author/zhongqingkandian_author.py

+ 9 - 14
spider/crawler_author/zhongqingkandian_author.py

@@ -57,9 +57,9 @@ class ZhongQingKanDianAuthor:
         self.LocalLog = Local.logger(self.platform, self.mode)
         self.curses = 1
         result = self.redis_ops.get_last_scanned_id()
-        self.last_scanned_id = 0 if result is None else int(result)
-        self.zqkd_user_list = self.db_ops.select_user(self.last_scanned_id)
-        self.LocalLog.info(f"获取到的用户列表:{self.zqkd_user_list} \n 昨天最后扫描的用户ID{self.last_scanned_id}")
+        # self.last_scanned_id = 0 if result is None else int(result)
+        # self.zqkd_user_list = self.db_ops.select_user(self.last_scanned_id)
+        # self.LocalLog.info(f"获取到的用户列表:{self.zqkd_user_list} \n 昨天最后扫描的用户ID{self.last_scanned_id}")
         self.session = requests.session()
 
     def __del__(self):
@@ -193,11 +193,11 @@ class ZhongQingKanDianAuthor:
                     self.LocalLog.info(f"当日视频已达到最大爬取量{self.download_cnt}")
                     return
                 self.LocalLog.info(f"开始用户视频列表的请求和处理流程,今日已爬 {self.download_cnt} 个视频")
-                if not self.zqkd_user_list:
+                if not self.db_ops.select_user(0):
                     self.LocalLog.info("没有用户数据")
                     time.sleep(10)
                     continue
-                for user_info in self.zqkd_user_list:
+                for user_info in self.db_ops.select_user(0):
                     if not self.limit_flag:
                         self.aliyun_log.logging(
                             code="2010",
@@ -228,8 +228,8 @@ class ZhongQingKanDianAuthor:
                         if video_content_link:
                             self.req_detail(video_content_link, **video_obj)
                             time.sleep(random.randint(5,10))
-                self.redis_ops.set_last_scanned_id(0)
-                self.zqkd_user_list = self.db_ops.select_user(0)
+                # self.redis_ops.set_last_scanned_id(0)
+                # self.zqkd_user_list = self.db_ops.select_user(0)
                 self.curses += 1
             except Exception as e:
                 tb_info = traceback.format_exc()
@@ -351,7 +351,7 @@ class ZhongQingKanDianAuthor:
             )
             # 保存视频ID
             self.redis_ops.save_video_id(video_obj['channel_content_id'])
-            if self.download_cnt >= self.rule_dict.get("videos_cnt", {}).get("min", 300) and self.zqkd_user_list:
+            if self.download_cnt >= self.rule_dict.get("videos_cnt", {}).get("min", 300):
                 self.aliyun_log.logging(
                     code="2010",
                     message=f"今日已经达到最大量",
@@ -361,11 +361,6 @@ class ZhongQingKanDianAuthor:
                 # 判断视频数量达到预期且用户列表没有轮训完
                 # self.redis_ops.set_last_scanned_id(self.last_scanned_id)
                 self.limit_flag = False
-            elif not self.zqkd_user_list:
-                # 如果数据没达到预期数量,则重新开始扫用户数据,扫所有用户下一页的数据,直到数量达到预期
-                self.LocalLog.info("扫描到最后一个用户")
-                self.redis_ops.set_last_scanned_id(0)
-                self.curses += 1
 
         except Exception as e:
             tb_info = traceback.format_exc()
@@ -420,6 +415,6 @@ if __name__ == '__main__':
     ZhongQingKanDianAuthor(
         platform="zhongqingkandian",
         mode="author",
-        rule_dict={'videos_cnt': {'min': 2, 'max': 0}, 'duration': {'min': 30, 'max': 1200}},
+        rule_dict={'videos_cnt': {'min': 500, 'max': 0}, 'duration': {'min': 30, 'max': 1200}},
         user_list=[{"uid": 81525568, "link": "中青看点推荐", "nick_name": "芸芸众生"}]
     ).run()