wangkun 1 year ago
parent
commit
1b67be1170
1 changed file with 56 additions and 56 deletions
xigua/xigua_search/xigua_search_scheduling.py

@@ -596,52 +596,52 @@ class XiguasearchScheduling:
                 driver.quit()
                 return
             for i, video_element in enumerate(video_element_temp):
-                try:
-                    if cls.download_cnt >= int(rule_dict.get("videos_cnt", {}).get("min", 30)):
-                        Common.logger(log_type, crawler).info(f"Search term: {user_dict['link']}, videos downloaded: {cls.download_cnt}\n")
-                        driver.quit()
-                        return
-                    if video_element is None:
-                        Common.logger(log_type, crawler).info('Reached the end~\n')
-                        driver.quit()
-                        return
-                    num += 1
-                    Common.logger(log_type, crawler).info(f'Scrolling item {num} of the "videos" list to the center of the screen')
-                    driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})", video_element)
-                    time.sleep(3)
-                    driver.get_screenshot_as_file(f"./{crawler}/logs/{num}.jpg")
-                    item_id = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')[index+i].get_attribute('href')
-                    item_id = item_id.split("com/")[-1].split("?&")[0]
-                    video_dict = cls.get_video_info(log_type, crawler, item_id)
-                    if video_dict is None:
-                        Common.logger(log_type, crawler).info("Invalid video\n")
-                        continue
-                    for k, v in video_dict.items():
-                        Common.logger(log_type, crawler).info(f"{k}:{v}")
-                    if int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
-                        Common.logger(log_type, crawler).info(f'Published more than {int(rule_dict.get("period", {}).get("max", 1000))} days ago\n')
-                        driver.quit()
-                        return
-                    if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
-                        Common.logger(log_type, crawler).info("Does not meet the crawl rules\n")
-                    elif any(str(word) if str(word) in video_dict["video_title"] else False
-                             for word in get_config_from_mysql(log_type=log_type,
-                                                               source=crawler,
-                                                               env=env,
-                                                               text="filter",
-                                                               action="")) is True:
-                        Common.logger(log_type, crawler).info('Hit a filter word\n')
-                    elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
-                        Common.logger(log_type, crawler).info('Video already downloaded\n')
-                    else:
-                        cls.download_publish(log_type=log_type,
-                                             crawler=crawler,
-                                             user_dict=user_dict,
-                                             video_dict=video_dict,
-                                             rule_dict=rule_dict,
-                                             env=env)
-                except Exception as e:
-                    Common.logger(log_type, crawler).warning(f"Exception while crawling a single video: {e}\n")
+                # try:
+                if cls.download_cnt >= int(rule_dict.get("videos_cnt", {}).get("min", 30)):
+                    Common.logger(log_type, crawler).info(f"Search term: {user_dict['link']}, videos downloaded: {cls.download_cnt}\n")
+                    driver.quit()
+                    return
+                if video_element is None:
+                    Common.logger(log_type, crawler).info('Reached the end~\n')
+                    driver.quit()
+                    return
+                num += 1
+                Common.logger(log_type, crawler).info(f'Scrolling item {num} of the "videos" list to the center of the screen')
+                driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})", video_element)
+                time.sleep(3)
+                driver.get_screenshot_as_file(f"./{crawler}/logs/{num}.jpg")
+                item_id = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')[index+i].get_attribute('href')
+                item_id = item_id.split("com/")[-1].split("?&")[0]
+                video_dict = cls.get_video_info(log_type, crawler, item_id)
+                if video_dict is None:
+                    Common.logger(log_type, crawler).info("Invalid video\n")
+                    continue
+                for k, v in video_dict.items():
+                    Common.logger(log_type, crawler).info(f"{k}:{v}")
+                if int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
+                    Common.logger(log_type, crawler).info(f'Published more than {int(rule_dict.get("period", {}).get("max", 1000))} days ago\n')
+                    driver.quit()
+                    return
+                if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
+                    Common.logger(log_type, crawler).info("Does not meet the crawl rules\n")
+                elif any(str(word) if str(word) in video_dict["video_title"] else False
+                         for word in get_config_from_mysql(log_type=log_type,
+                                                           source=crawler,
+                                                           env=env,
+                                                           text="filter",
+                                                           action="")) is True:
+                    Common.logger(log_type, crawler).info('Hit a filter word\n')
+                elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
+                    Common.logger(log_type, crawler).info('Video already downloaded\n')
+                else:
+                    cls.download_publish(log_type=log_type,
+                                         crawler=crawler,
+                                         user_dict=user_dict,
+                                         video_dict=video_dict,
+                                         rule_dict=rule_dict,
+                                         env=env)
+                # except Exception as e:
+                #     Common.logger(log_type, crawler).warning(f"Exception while crawling a single video: {e}\n")
 
             Common.logger(log_type, crawler).info('Finished crawling one batch of videos, sleeping 10 seconds\n')
             time.sleep(10)
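
A note on the filter-word check retained in this hunk: the expression str(word) if str(word) in video_dict["video_title"] else False and the trailing is True comparison are redundant, since any() already evaluates each item's truthiness and returns a plain bool. A minimal equivalent sketch (assuming, as the call above suggests, that get_config_from_mysql returns an iterable of filter words):

# Equivalent, more readable form of the filter-word check above.
# any() returns a bool on its own, so the conditional expression in the
# generator and the trailing "is True" comparison add nothing.
filter_words = get_config_from_mysql(log_type=log_type,
                                     source=crawler,
                                     env=env,
                                     text="filter",
                                     action="")
if any(str(word) in video_dict["video_title"] for word in filter_words):
    Common.logger(log_type, crawler).info('Hit a filter word\n')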
@@ -778,16 +778,16 @@ class XiguasearchScheduling:
     @classmethod
     def get_search_videos(cls, log_type, crawler, user_list, rule_dict, env):
         for user_dict in user_list:
-            try:
-                cls.download_cnt = 0
-                Common.logger(log_type, crawler).info(f"Start crawling videos for {user_dict['link']}\n")
-                cls.get_videoList(log_type=log_type,
-                                  crawler=crawler,
-                                  user_dict=user_dict,
-                                  rule_dict=rule_dict,
-                                  env=env)
-            except Exception as e:
-                Common.logger(log_type, crawler).error(f"Exception while crawling videos for {user_dict['link']}: {e}\n")
+            # try:
+            cls.download_cnt = 0
+            Common.logger(log_type, crawler).info(f"Start crawling videos for {user_dict['link']}\n")
+            cls.get_videoList(log_type=log_type,
+                              crawler=crawler,
+                              user_dict=user_dict,
+                              rule_dict=rule_dict,
+                              env=env)
+            # except Exception as e:
+            #     Common.logger(log_type, crawler).error(f"Exception while crawling videos for {user_dict['link']}: {e}\n")
 
 
 if __name__ == '__main__':
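
With the per-term try/except commented out in the second hunk, an exception raised while crawling one search term now propagates out of get_search_videos and stops the remaining terms in user_list. A hypothetical caller-side guard (not part of this commit; the argument values are illustrative) could restore the old per-term isolation without hiding errors inside the class:

# Hypothetical caller-side guard, not part of this commit: one failing
# search term is logged and skipped instead of aborting the whole batch.
for user_dict in user_list:
    try:
        XiguasearchScheduling.get_search_videos(log_type=log_type,
                                                crawler=crawler,
                                                user_list=[user_dict],
                                                rule_dict=rule_dict,
                                                env=env)
    except Exception as e:
        Common.logger(log_type, crawler).error(f"Exception while crawling videos for {user_dict['link']}: {e}\n")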