|
@@ -568,24 +568,23 @@ class XiguasearchScheduling:
|
|
|
Common.logging(log_type, crawler, env, f"打开搜索页:{user_dict['link']}")
|
|
|
driver.get(f"https://www.ixigua.com/search/{user_dict['link']}/")
|
|
|
time.sleep(3)
|
|
|
- # driver.get_screenshot_as_file(f"./{crawler}/logs/打开搜索页.jpg")
|
|
|
- # if len(driver.find_elements(By.XPATH, '//*[@class="xg-notification-close"]')) != 0:
|
|
|
- # driver.find_element(By.XPATH, '//*[@class="xg-notification-close"]').click()
|
|
|
- # time.sleep(1)
|
|
|
- # Common.logger(log_type, crawler).info("点击筛选")
|
|
|
- # driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
|
|
|
- # time.sleep(1)
|
|
|
- # Common.logger(log_type, crawler).info("点击最新排序")
|
|
|
- # driver.find_element(By.XPATH, '//*[@class="searchPageV2-category__wrapper"]/*[2]/*[1]').click()
|
|
|
- # time.sleep(5)
|
|
|
- # Common.logger(log_type, crawler).info("收回筛选")
|
|
|
- # driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
|
|
|
- # time.sleep(1)
|
|
|
- # # 点击列表形式//div[@class="searchPageV2__header-icons"]/*[3]
|
|
|
- # Common.logger(log_type, crawler).info("点击列表形式展示")
|
|
|
- # driver.find_element(By.XPATH, '//div[@class="searchPageV2__header-icons"]/*[3]').click()
|
|
|
- # time.sleep(3)
|
|
|
- # driver.get_screenshot_as_file(f"./{crawler}/logs/已点击最新排序.jpg")
|
|
|
+ Common.logger(log_type, crawler).info("关闭登录弹框")
|
|
|
+ Common.logging(log_type, crawler, env, "关闭登录弹框")
|
|
|
+ if len(driver.find_elements(By.XPATH, '//*[@class="xg-notification-close"]')) != 0:
|
|
|
+ driver.find_element(By.XPATH, '//*[@class="xg-notification-close"]').click()
|
|
|
+ # driver.get_screenshot_as_file(f"./{crawler}/photos/{user_dict['link']}-关闭登录弹框.png")
|
|
|
+ Common.logger(log_type, crawler).info("展开筛选按钮")
|
|
|
+ Common.logging(log_type, crawler, env, "展开筛选按钮")
|
|
|
+ driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
|
|
|
+ Common.logger(log_type, crawler).info("点击最新排序")
|
|
|
+ Common.logging(log_type, crawler, env, "点击最新排序")
|
|
|
+ driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-categories-wrapper"]/*[1]/*[2]/*[1]').click()
|
|
|
+ time.sleep(3)
|
|
|
+ # driver.get_screenshot_as_file(f"./{crawler}/photos/{user_dict['link']}-最新排序.png")
|
|
|
+ Common.logger(log_type, crawler).info("收起筛选按钮\n")
|
|
|
+ Common.logging(log_type, crawler, env, "收起筛选按钮\n")
|
|
|
+ driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
|
|
|
+ time.sleep(1)
|
|
|
|
|
|
index = 0
|
|
|
num = 0
|
|
@@ -615,8 +614,20 @@ class XiguasearchScheduling:
|
|
|
Common.logging(log_type, crawler, env, f'拖动"视频"列表第{num}个至屏幕中间')
|
|
|
driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})", video_element)
|
|
|
time.sleep(3)
|
|
|
- # driver.get_screenshot_as_file(f"./{crawler}/logs/{num}.jpg")
|
|
|
- item_id = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')[index+i].get_attribute('href')
|
|
|
+ # driver.get_screenshot_as_file(f"./{crawler}/photos/{user_dict['link']}-{num}.png")
|
|
|
+ title = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')[index+i-1].get_attribute('title')
|
|
|
+ publish_day = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard-accessories-bottomInfo__statistics"]')[index+i-1].text.split('· ')[-1]
|
|
|
+ Common.logger(log_type, crawler).info(f"标题:{title}")
|
|
|
+ Common.logging(log_type, crawler, env, f"标题:{title}")
|
|
|
+ Common.logger(log_type, crawler).info(f"发布时间:{publish_day}")
|
|
|
+ Common.logging(log_type, crawler, env, f"发布时间:{publish_day}")
|
|
|
+ if "年" in publish_day:
|
|
|
+ Common.logger(log_type, crawler).info("发布时间超过 1 年\n")
|
|
|
+ Common.logging(log_type, crawler, env, "发布时间超过 1 年\n")
|
|
|
+ driver.quit()
|
|
|
+ return
|
|
|
+
|
|
|
+ item_id = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')[index+i-1].get_attribute('href')
|
|
|
item_id = item_id.split("com/")[-1].split("?&")[0]
|
|
|
video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
|
if video_dict is None:
|
|
@@ -626,10 +637,12 @@ class XiguasearchScheduling:
|
|
|
for k, v in video_dict.items():
|
|
|
Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
Common.logging(log_type, crawler, env, f"{video_dict}")
|
|
|
+
|
|
|
# if int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
|
|
|
# Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
|
|
|
# driver.quit()
|
|
|
# return
|
|
|
+
|
|
|
if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
|
|
|
Common.logger(log_type, crawler).info("不满足抓取规则\n")
|
|
|
Common.logging(log_type, crawler, env, "不满足抓取规则\n")
|
|
@@ -805,6 +818,8 @@ class XiguasearchScheduling:
|
|
|
|
|
|
@classmethod
|
|
|
def get_search_videos(cls, log_type, crawler, user_list, rule_dict, env):
|
|
|
+ Common.logger(log_type, crawler).info(f"搜索词总数:{len(user_list)}\n")
|
|
|
+ Common.logging(log_type, crawler, env, f"搜索词总数:{len(user_list)}\n")
|
|
|
for user_dict in user_list:
|
|
|
try:
|
|
|
cls.download_cnt = 0
|