@@ -597,56 +597,56 @@ class XiguasearchScheduling:
                 driver.quit()
                 return
             for i, video_element in enumerate(video_element_temp):
-                # try:
-                if cls.download_cnt >= int(rule_dict.get("videos_cnt", {}).get("min", 30)):
-                    Common.logger(log_type, crawler).info(f"搜索词: {user_dict['link']},已下载视频数: {cls.download_cnt}\n")
-                    driver.quit()
-                    return
-                if video_element is None:
-                    Common.logger(log_type, crawler).info('到底啦~\n')
-                    driver.quit()
-                    return
-                num += 1
-                Common.logger(log_type, crawler).info(f"len_videos:{len(video_element_temp)}")
-                Common.logger(log_type, crawler).info(f"index:{index}")
-                Common.logger(log_type, crawler).info(f"i:{i}")
-                Common.logger(log_type, crawler).info(f"index+i:{index+i}")
-                Common.logger(log_type, crawler).info(f'拖动"视频"列表第{num}个至屏幕中间')
-                driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})", video_element)
-                time.sleep(3)
-                driver.get_screenshot_as_file(f"./{crawler}/logs/{num}.jpg")
-                item_id = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')[index+i].get_attribute('href')
-                item_id = item_id.split("com/")[-1].split("?&")[0]
-                video_dict = cls.get_video_info(log_type, crawler, item_id)
-                if video_dict is None:
-                    Common.logger(log_type, crawler).info("无效视频\n")
-                    continue
-                for k, v in video_dict.items():
-                    Common.logger(log_type, crawler).info(f"{k}:{v}")
-                # if int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
-                #     Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
-                #     driver.quit()
-                #     return
-                if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
-                    Common.logger(log_type, crawler).info("不满足抓取规则\n")
-                elif any(str(word) if str(word) in video_dict["video_title"] else False
-                         for word in get_config_from_mysql(log_type=log_type,
-                                                           source=crawler,
-                                                           env=env,
-                                                           text="filter",
-                                                           action="")) is True:
-                    Common.logger(log_type, crawler).info('已中过滤词\n')
-                elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
-                    Common.logger(log_type, crawler).info('视频已下载\n')
-                else:
-                    cls.download_publish(log_type=log_type,
-                                         crawler=crawler,
-                                         user_dict=user_dict,
-                                         video_dict=video_dict,
-                                         rule_dict=rule_dict,
-                                         env=env)
-                # except Exception as e:
-                #     Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
+                try:
+                    if cls.download_cnt >= int(rule_dict.get("videos_cnt", {}).get("min", 30)):
+                        Common.logger(log_type, crawler).info(f"搜索词: {user_dict['link']},已下载视频数: {cls.download_cnt}\n")
+                        driver.quit()
+                        return
+                    if video_element is None:
+                        Common.logger(log_type, crawler).info('到底啦~\n')
+                        driver.quit()
+                        return
+                    num += 1
+                    # Common.logger(log_type, crawler).info(f"len_videos:{len(video_element_temp)}")
+                    # Common.logger(log_type, crawler).info(f"index:{index}")
+                    # Common.logger(log_type, crawler).info(f"i:{i}")
+                    # Common.logger(log_type, crawler).info(f"index+i:{index+i}")
+                    Common.logger(log_type, crawler).info(f'拖动"视频"列表第{num}个至屏幕中间')
+                    driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})", video_element)
+                    time.sleep(3)
+                    driver.get_screenshot_as_file(f"./{crawler}/logs/{num}.jpg")
+                    item_id = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')[index+i].get_attribute('href')
+                    item_id = item_id.split("com/")[-1].split("?&")[0]
+                    video_dict = cls.get_video_info(log_type, crawler, item_id)
+                    if video_dict is None:
+                        Common.logger(log_type, crawler).info("无效视频\n")
+                        continue
+                    for k, v in video_dict.items():
+                        Common.logger(log_type, crawler).info(f"{k}:{v}")
+                    # if int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
+                    #     Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
+                    #     driver.quit()
+                    #     return
+                    if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
+                        Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                    elif any(str(word) if str(word) in video_dict["video_title"] else False
+                             for word in get_config_from_mysql(log_type=log_type,
+                                                               source=crawler,
+                                                               env=env,
+                                                               text="filter",
+                                                               action="")) is True:
+                        Common.logger(log_type, crawler).info('已中过滤词\n')
+                    elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
+                        Common.logger(log_type, crawler).info('视频已下载\n')
+                    else:
+                        cls.download_publish(log_type=log_type,
+                                             crawler=crawler,
+                                             user_dict=user_dict,
+                                             video_dict=video_dict,
+                                             rule_dict=rule_dict,
+                                             env=env)
+                except Exception as e:
+                    Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
 
             Common.logger(log_type, crawler).info('已抓取完一组视频,休眠10秒\n')
             time.sleep(10)
@@ -783,16 +783,16 @@ class XiguasearchScheduling:
     @classmethod
     def get_search_videos(cls, log_type, crawler, user_list, rule_dict, env):
         for user_dict in user_list:
-            # try:
-            cls.download_cnt = 0
-            Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['link']} 视频\n")
-            cls.get_videoList(log_type=log_type,
-                              crawler=crawler,
-                              user_dict=user_dict,
-                              rule_dict=rule_dict,
-                              env=env)
-            # except Exception as e:
-            #     Common.logger(log_type, crawler).error(f"抓取{user_dict['link']}视频时异常:{e}\n")
+            try:
+                cls.download_cnt = 0
+                Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['link']} 视频\n")
+                cls.get_videoList(log_type=log_type,
+                                  crawler=crawler,
+                                  user_dict=user_dict,
+                                  rule_dict=rule_dict,
+                                  env=env)
+            except Exception as e:
+                Common.logger(log_type, crawler).error(f"抓取{user_dict['link']}视频时异常:{e}\n")
 
 
 if __name__ == '__main__':