@@ -565,6 +565,7 @@ class XiguasearchScheduling:
         driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(chromedriver))
         driver.implicitly_wait(10)
         Common.logger(log_type, crawler).info(f"打开搜索页:{user_dict['link']}")
+        Common.logging(log_type, crawler, env, f"打开搜索页:{user_dict['link']}")
         driver.get(f"https://www.ixigua.com/search/{user_dict['link']}/")
         time.sleep(3)
         # driver.get_screenshot_as_file(f"./{crawler}/logs/打开搜索页.jpg")
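
Every hunk in this diff applies the same pattern: each process-local Common.logger(...).info(...) call gains a sibling Common.logging(log_type, crawler, env, ...) call that also carries the environment. The helper's body is not visible in the diff, so the sketch below is an assumption: a thin wrapper that tags each record with env so dev and prod streams can be separated by whatever sink sits behind it.

    # Hypothetical sketch of the Common.logging helper added throughout
    # this diff; only its call signature is visible here. The body below
    # is an illustration, not the repo's actual implementation.
    import logging

    class Common:
        @staticmethod
        def logging(log_type, crawler, env, message):
            # Tag every record with the environment so downstream
            # aggregation can split dev/prod traffic.
            logging.getLogger(f"{crawler}.{log_type}").info("[%s] %s", env, message)
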
@@ -594,20 +595,24 @@ class XiguasearchScheduling:
             video_element_temp = video_elements[index:]
             if len(video_element_temp) == 0:
                 Common.logger(log_type, crawler).info('到底啦~~~~~~~~~~~~~\n')
+                Common.logging(log_type, crawler, env, '到底啦~~~~~~~~~~~~~\n')
                 driver.quit()
                 return
             for i, video_element in enumerate(video_element_temp):
                 try:
                     if cls.download_cnt >= int(rule_dict.get("videos_cnt", {}).get("min", 30)):
                         Common.logger(log_type, crawler).info(f"搜索词: {user_dict['link']},已下载视频数: {cls.download_cnt}\n")
+                        Common.logging(log_type, crawler, env, f"搜索词: {user_dict['link']},已下载视频数: {cls.download_cnt}\n")
                         driver.quit()
                         return
                     if video_element is None:
                         Common.logger(log_type, crawler).info('到底啦~\n')
+                        Common.logging(log_type, crawler, env, '到底啦~\n')
                         driver.quit()
                         return
                     num += 1
                     Common.logger(log_type, crawler).info(f'拖动"视频"列表第{num}个至屏幕中间')
+                    Common.logging(log_type, crawler, env, f'拖动"视频"列表第{num}个至屏幕中间')
                     driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})", video_element)
                     time.sleep(3)
                     # driver.get_screenshot_as_file(f"./{crawler}/logs/{num}.jpg")
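
For context, the loop this hunk instruments walks Xigua's lazily loaded result list: slice the already-collected cards from index, scroll each one to the viewport centre so the page fetches the next batch, and stop when a pass yields nothing new. A self-contained sketch of that pattern, assuming a generic Chrome session; the URL and CSS selector are illustrative, not the crawler's actual ones.

    # Standalone sketch of the scroll-driven pagination instrumented
    # above; the URL and CSS selector are illustrative assumptions.
    import time
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()
    driver.implicitly_wait(10)
    driver.get("https://www.ixigua.com/search/example/")
    index = 0
    while True:
        cards = driver.find_elements(By.CSS_SELECTOR, "div.HorizontalFeedCard")
        batch = cards[index:]
        if not batch:
            break  # "到底啦" (reached the bottom): no new cards appeared
        for card in batch:
            # Centering a card in the viewport triggers the lazy loader.
            driver.execute_script(
                "arguments[0].scrollIntoView({block:'center',inline:'center'})", card)
            time.sleep(3)
        index += len(batch)
    driver.quit()
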
@@ -616,15 +621,18 @@ class XiguasearchScheduling:
                     video_dict = cls.get_video_info(log_type, crawler, item_id)
                     if video_dict is None:
                         Common.logger(log_type, crawler).info("无效视频\n")
+                        Common.logging(log_type, crawler, env, "无效视频\n")
                         continue
                     for k, v in video_dict.items():
                         Common.logger(log_type, crawler).info(f"{k}:{v}")
+                    Common.logging(log_type, crawler, env, f"{video_dict}")
                     # if int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
                     #     Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
                     #     driver.quit()
                     #     return
                     if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                         Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                        Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                     elif any(str(word) if str(word) in video_dict["video_title"] else False
                              for word in get_config_from_mysql(log_type=log_type,
                                                                source=crawler,
@@ -632,8 +640,10 @@ class XiguasearchScheduling:
                                                                text="filter",
                                                                action="")) is True:
                         Common.logger(log_type, crawler).info('已中过滤词\n')
+                        Common.logging(log_type, crawler, env, '已中过滤词\n')
                     elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                         Common.logger(log_type, crawler).info('视频已下载\n')
+                        Common.logging(log_type, crawler, env, '视频已下载\n')
                     else:
                         cls.download_publish(log_type=log_type,
                                              crawler=crawler,
@@ -643,8 +653,10 @@ class XiguasearchScheduling:
                                              env=env)
                 except Exception as e:
                     Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
+                    Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")

             Common.logger(log_type, crawler).info('已抓取完一组视频,休眠10秒\n')
+            Common.logging(log_type, crawler, env, '已抓取完一组视频,休眠10秒\n')
             time.sleep(10)
             index = index + len(video_element_temp)

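
Two of the elif guards in the hunks above are worth unpacking. The filter-word test ("已中过滤词", hit a filter word) is a roundabout spelling of a plain substring scan; barring empty filter words it behaves like the direct form below, where filter_words stands in for the get_config_from_mysql result.

    # Direct equivalent of the filter-word guard above; filter_words
    # stands in for the list get_config_from_mysql returns.
    filter_words = ["广告", "引流"]  # illustrative values ("ad", "traffic bait")
    video_title = "某个视频标题"      # illustrative title ("some video title")

    if any(str(word) in video_title for word in filter_words):
        print("已中过滤词")           # hit a filter word, skip this video

The dedup guard, cls.repeat_video(...) != 0, presumably counts rows already stored for this video id. A hedged sketch of such a check; both the MysqlHelper.get_values companion to the update_values call used later in the diff and the crawler_video table are assumptions inferred from the surrounding code.

    # Hypothetical dedup check in the spirit of repeat_video; helper and
    # schema names are assumptions, not confirmed repo internals.
    from common.db import MysqlHelper  # import path assumed

    def repeat_video(log_type, crawler, video_id, env):
        sql = f""" select * from crawler_video where out_video_id="{video_id}"; """
        rows = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(rows)  # non-zero means the video was seen before
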
@@ -672,11 +684,13 @@ class XiguasearchScheduling:
                 # 删除视频文件夹
                 shutil.rmtree(f"./{crawler}/videos/{md_title}")
                 Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
+                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
                 return
         except FileNotFoundError:
             # 删除视频文件夹
             shutil.rmtree(f"./{crawler}/videos/{md_title}")
             Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
+            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
             return
         # 下载封面
         Common.download_method(log_type=log_type, crawler=crawler, text='cover',
@@ -686,6 +700,7 @@ class XiguasearchScheduling:

         # 上传视频
         Common.logger(log_type, crawler).info("开始上传视频...")
+        Common.logging(log_type, crawler, env, "开始上传视频...")
         if env == "dev":
             oss_endpoint = "out"
             our_video_id = Publish.upload_and_publish(log_type=log_type,
@@ -746,9 +761,11 @@ class XiguasearchScheduling:
                                                 {int(video_dict['video_width'])},
                                                 {int(video_dict['video_height'])}) """
         Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
         MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
         cls.download_cnt += 1
         Common.logger(log_type, crawler).info("视频信息写入数据库完成")
+        Common.logging(log_type, crawler, env, "视频信息写入数据库完成")

         # 视频信息写入飞书
         Feishu.insert_columns(log_type, crawler, "BUNvGC", "ROWS", 1, 2)
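
The insert_sql above is assembled by f-string interpolation before being handed to MysqlHelper.update_values. For comparison only, the same kind of insert written against plain PyMySQL with bound parameters, which sidesteps quoting and injection pitfalls; the connection details, table and column names here are illustrative, not the repo's confirmed schema.

    # Parameterized-insert sketch using plain PyMySQL, shown for
    # comparison; all names and values below are illustrative.
    import pymysql

    conn = pymysql.connect(host="127.0.0.1", user="user", password="pass", database="piaoquan_crawler")
    with conn.cursor() as cur:
        cur.execute(
            "insert into crawler_video (out_video_id, video_title, video_width, video_height) "
            "values (%s, %s, %s, %s)",
            ("v0300fg10000example", "示例标题", 1280, 720),  # illustrative row
        )
    conn.commit()
    conn.close()
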
@@ -775,6 +792,7 @@ class XiguasearchScheduling:
         time.sleep(0.5)
         Feishu.update_values(log_type, crawler, "BUNvGC", "E2:Z2", values)
         Common.logger(log_type, crawler).info('视频信息写入飞书完成\n')
+        Common.logging(log_type, crawler, env, '视频信息写入飞书完成\n')

     @classmethod
     def get_search_videos(cls, log_type, crawler, user_list, rule_dict, env):
@@ -782,6 +800,7 @@ class XiguasearchScheduling:
             try:
                 cls.download_cnt = 0
                 Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['link']} 视频\n")
+                Common.logging(log_type, crawler, env, f"开始抓取 {user_dict['link']} 视频\n")
                 cls.get_videoList(log_type=log_type,
                                   crawler=crawler,
                                   user_dict=user_dict,
@@ -789,6 +808,7 @@ class XiguasearchScheduling:
                                   env=env)
             except Exception as e:
                 Common.logger(log_type, crawler).error(f"抓取{user_dict['link']}视频时异常:{e}\n")
+                Common.logging(log_type, crawler, env, f"抓取{user_dict['link']}视频时异常:{e}\n")


if __name__ == '__main__':
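
Taken together, the last two hunks instrument the per-keyword driver: download_cnt is class-level state, so it must be reset before each keyword, and the broad except keeps one failing keyword from aborting the rest of user_list. A stripped-down skeleton of that orchestration, with the worker stubbed out (crawl_one_keyword stands in for cls.get_videoList).

    # Skeleton of the per-keyword orchestration in get_search_videos;
    # crawl_one_keyword is a stub standing in for cls.get_videoList.
    def crawl_one_keyword(user_dict, rule_dict, env):
        pass  # search, scroll, filter, download -- stubbed for illustration

    def get_search_videos(user_list, rule_dict, env):
        for user_dict in user_list:
            try:
                # The real code resets cls.download_cnt here, since the
                # counter is shared class-level state across keywords.
                crawl_one_keyword(user_dict, rule_dict, env)
            except Exception as e:
                # One failing keyword must not abort the remaining ones.
                print(f"抓取{user_dict['link']}视频时异常:{e}")
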