|
@@ -48,7 +48,7 @@ class ShipinhaoSearchScheduling:
|
|
|
if rule_duration_max == 0:
|
|
|
rule_duration_max = 100000000
|
|
|
|
|
|
- rule_period_min = rule_dict.get('period', {}).get('min', 0)
|
|
|
+ # rule_period_min = rule_dict.get('period', {}).get('min', 0)
|
|
|
# rule_period_max = rule_dict.get('period', {}).get('max', 100000000)
|
|
|
# if rule_period_max == 0:
|
|
|
# rule_period_max = 100000000
|
|
@@ -115,7 +115,7 @@ class ShipinhaoSearchScheduling:
|
|
|
Common.logger(log_type, crawler).info(
|
|
|
f'rule_height_max:{int(rule_height_max)} >= video_height:{int(video_dict["video_height"])} >= rule_height_min:{int(rule_height_min)}')
|
|
|
Common.logger(log_type, crawler).info(
|
|
|
- f'rule_publish_time_max:{int(rule_publish_time_max)} >= publish_time_stamp:{int(video_dict["publish_time_stamp"])} >= rule_publish_time_min:{int(rule_publish_time_min)}')
|
|
|
+ f'rule_publish_time_max:{int(rule_publish_time_max)} >= publish_time_stamp:{int(video_dict["publish_time_stamp"])*1000} >= rule_publish_time_min:{int(rule_publish_time_min)}')
|
|
|
|
|
|
if int(rule_duration_max) >= int(float(video_dict["duration"])) >= int(rule_duration_min) \
|
|
|
and int(rule_play_cnt_max) >= int(video_dict['play_cnt']) >= int(rule_play_cnt_min) \
|
|
@@ -125,7 +125,7 @@ class ShipinhaoSearchScheduling:
|
|
|
and int(rule_favorite_cnt_max) >= int(video_dict['favorite_cnt']) >= int(rule_favorite_cnt_min) \
|
|
|
and int(rule_width_max) >= int(video_dict['video_width']) >= int(rule_width_min) \
|
|
|
and int(rule_height_max) >= int(video_dict['video_height']) >= int(rule_height_min) \
|
|
|
- and int(rule_publish_time_max) >= int(video_dict['publish_time_stamp']) >= int(rule_publish_time_min):
|
|
|
+ and int(rule_publish_time_max) >= int(video_dict['publish_time_stamp'])*1000 >= int(rule_publish_time_min):
|
|
|
return True
|
|
|
else:
|
|
|
return False
|
|
@@ -262,10 +262,6 @@ class ShipinhaoSearchScheduling:
|
|
|
videos_cnt = rule_dict.get('videos_cnt', {}).get('min', 0)
|
|
|
index = 0
|
|
|
while True:
|
|
|
- if cls.download_cnt >= int(videos_cnt):
|
|
|
- Common.logger(log_type, crawler).info(f'搜索词:"{word}",已抓取视频数:{index}')
|
|
|
- cls.download_cnt = 0
|
|
|
- return
|
|
|
# try:
|
|
|
if cls.search_elements(driver, '//*[@class="double-rich double-rich_vertical"]') is None:
|
|
|
Common.logger(log_type, crawler).info('窗口已销毁\n')
|
|
@@ -283,9 +279,16 @@ class ShipinhaoSearchScheduling:
|
|
|
return
|
|
|
|
|
|
for i, video_element in enumerate(video_element_temp):
|
|
|
+ Common.logger(log_type, crawler).info(f"download_cnt:{cls.download_cnt}")
|
|
|
+ if cls.download_cnt >= int(videos_cnt):
|
|
|
+ Common.logger(log_type, crawler).info(f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
|
|
|
+ cls.download_cnt = 0
|
|
|
+ return
|
|
|
+
|
|
|
if video_element is None:
|
|
|
Common.logger(log_type, crawler).info('到底啦~\n')
|
|
|
return
|
|
|
+
|
|
|
cls.i += 1
|
|
|
cls.search_elements(driver, '//div[@class="vc active__mask"]')
|
|
|
|
|
@@ -371,7 +374,10 @@ class ShipinhaoSearchScheduling:
|
|
|
video_dict["video_height"] = ffmpeg_dict["height"]
|
|
|
|
|
|
# 规则判断
|
|
|
- if cls.download_rule(log_type, crawler, video_dict, rule_dict) is False:
|
|
|
+ if cls.download_rule(log_type=log_type,
|
|
|
+ crawler=crawler,
|
|
|
+ video_dict=video_dict,
|
|
|
+ rule_dict=rule_dict) is False:
|
|
|
md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
|
|
|
shutil.rmtree(f"./{crawler}/videos/{md_title}/")
|
|
|
Common.logger(log_type, crawler).info("不满足抓取规则,删除成功\n")
|
|
@@ -397,9 +403,14 @@ class ShipinhaoSearchScheduling:
|
|
|
Common.logger(log_type, crawler).info("视频上传完成")
|
|
|
|
|
|
if our_video_id is None:
|
|
|
- # 删除视频文件夹
|
|
|
- shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}")
|
|
|
- return
|
|
|
+ try:
|
|
|
+ # 删除视频文件夹
|
|
|
+ md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
|
|
|
+ shutil.rmtree(f"./{crawler}/videos/{md_title}")
|
|
|
+ Common.logger(log_type, crawler).warning(f"our_video_id:{our_video_id}, 删除成功\n")
|
|
|
+ return
|
|
|
+ except FileNotFoundError:
|
|
|
+ return
|
|
|
|
|
|
insert_sql = f""" insert into crawler_video(video_id,
|
|
|
out_user_id,
|
|
@@ -573,7 +584,7 @@ class ShipinhaoSearchScheduling:
|
|
|
continue
|
|
|
our_user_list = []
|
|
|
# for i in range(1, len(user_sheet)):
|
|
|
- for i in range(1, 4):
|
|
|
+ for i in range(1, 3):
|
|
|
search_word = user_sheet[i][4]
|
|
|
our_uid = user_sheet[i][6]
|
|
|
tag1 = user_sheet[i][8]
|
|
@@ -641,5 +652,9 @@ if __name__ == '__main__':
|
|
|
# oss_endpoint="out",
|
|
|
# env="dev")
|
|
|
# print(ShipinhaoSearchScheduling.get_users("search", "shipinhao", "wNgi6Z", "dev"))
|
|
|
- print((date.today() + timedelta(days=0)).strftime("%Y-%m-%d"))
|
|
|
+ # print((date.today() + timedelta(days=0)).strftime("%Y-%m-%d"))
|
|
|
+ print(ShipinhaoSearchScheduling.repeat_out_video_id(log_type="search",
|
|
|
+ crawler="shipinhao",
|
|
|
+ out_video_id="123",
|
|
|
+ env="dev"))
|
|
|
pass
|