wangkun 1 year ago
parent
commit
9c27360196

+ 1 - 1
haitunzhufu/haitunzhufu_main/run_htzf_dev.py

@@ -12,7 +12,7 @@ from haitunzhufu.haitunzhufu_recommend.haitunzhufu_recommend2 import HTZFRecomme
 class HTZFMain:
     @classmethod
     def main(cls, log_type, crawler, env):
-        videos_cnt = 50
+        videos_cnt = 100
         Common.logger(log_type, crawler).info('开始抓取"海豚祝福"')
         HTZFRecommend.start_wechat(log_type, crawler, videos_cnt, env)
         Common.logger(log_type, crawler).info("抓取一轮结束")

+ 1 - 1
haitunzhufu/haitunzhufu_main/run_htzf_recommend.py

@@ -13,7 +13,7 @@ from haitunzhufu.haitunzhufu_recommend.haitunzhufu_recommend2 import HTZFRecomme
 class HTZFMain:
     @classmethod
     def main(cls, log_type, crawler, env):
-        videos_cnt = 50
+        videos_cnt = 100
         Common.logger(log_type, crawler).info('开始抓取"海豚祝福"')
         HTZFRecommend.start_wechat(log_type=log_type,
                                    crawler=crawler,

+ 20 - 2
haitunzhufu/haitunzhufu_recommend/haitunzhufu_recommend2.py

@@ -26,6 +26,8 @@ from common.scheduling_db import MysqlHelper
 
 class HTZFRecommend:
     platform = "海豚祝福"
+    i = 0
+    element_list = []
 
     @classmethod
     def today_download_cnt(cls, log_type, crawler, env):
@@ -144,10 +146,12 @@ class HTZFRecommend:
         Common.logger(log_type, crawler).info('点击"推荐"列表成功\n')
 
         # while True:
-        for page in range(200):
+        for page in range(500):
             Common.logger(log_type, crawler).info(f"正在抓取第{page+1}页")
             if cls.search_elements(driver, '//*[@class="list"]') is None:
                 Common.logger(log_type, crawler).info("列表页窗口已销毁\n")
+                cls.element_list = []
+                cls.i = 0
                 return
             for i in range(1):
                 cls.swipe_up(driver)
@@ -158,15 +162,29 @@ class HTZFRecommend:
             soup.prettify()
 
             video_list_elements = soup.findAll("wx-view", class_="img_bf")
+            # video_list_elements 有,cls.element_list 中没有的元素
+            video_list_elements = list(set(video_list_elements).difference(set(cls.element_list)))
+            # video_list_elements 与 cls.element_list 的并集
+            cls.element_list = list(set(video_list_elements) | set(cls.element_list))
             Common.logger(log_type, crawler).info(f"第{page+1}页共:{len(video_list_elements)}条视频\n")
 
+            if len(video_list_elements) == 0:
+                for i in range(10):
+                    Common.logger(log_type, crawler).info(f"向上滑动第{i + 1}次")
+                    cls.swipe_up(driver)
+                    time.sleep(0.5)
+                continue
+
             for i, video_element in enumerate(video_list_elements):
                 try:
                     today_download = cls.today_download_cnt(log_type, crawler, env)
                     if today_download >= videos_cnt:
                         Common.logger(log_type, crawler).info(f"今日已下载视频数:{today_download}")
+                        cls.element_list = []
+                        cls.i = 0
                         return
-                    Common.logger(log_type, crawler).info(f"第{i+1}条视频")
+                    cls.i += 1
+                    Common.logger(log_type, crawler).info(f"第{cls.i}条视频")
                     video_title = video_element.find("wx-view", class_="title").text
                     play_str = video_element.find("wx-view", class_="wan").text
                     play_cnt = int(re.sub(r"\D", "", play_str)) * 10000 if "万" in play_str else play_str