|
@@ -668,6 +668,15 @@ class XiguaSearch:
|
|
if publish_time <= old_time:
|
|
if publish_time <= old_time:
|
|
Common.logger(log_type, crawler).error(f'关键词:{search_word},抓取完毕,退出抓取\n')
|
|
Common.logger(log_type, crawler).error(f'关键词:{search_word},抓取完毕,退出抓取\n')
|
|
return
|
|
return
|
|
|
|
+ filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
|
|
|
|
+ is_filter = False
|
|
|
|
+ for filter_word in filter_words:
|
|
|
|
+ if filter_word in video_dict['video_title']:
|
|
|
|
+ is_filter = True
|
|
|
|
+ break
|
|
|
|
+ if is_filter:
|
|
|
|
+ Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
|
|
|
|
+ continue
|
|
if v_type == 'video':
|
|
if v_type == 'video':
|
|
item_id = video_info['data']['group_id']
|
|
item_id = video_info['data']['group_id']
|
|
if video_info['data']['publish_time'] <= old_time:
|
|
if video_info['data']['publish_time'] <= old_time:
|
|
@@ -680,6 +689,8 @@ class XiguaSearch:
|
|
'video_time'] > rule_dict['max_duration']:
|
|
'video_time'] > rule_dict['max_duration']:
|
|
Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},不符合抓取规则\n')
|
|
Common.logger(log_type, crawler).error(f'关键词:{search_word},视频:{item_id},不符合抓取规则\n')
|
|
continue
|
|
continue
|
|
|
|
+
|
|
|
|
+
|
|
try:
|
|
try:
|
|
video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
|
|
video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
|
|
@@ -796,11 +807,7 @@ class XiguaSearch:
|
|
@classmethod
|
|
@classmethod
|
|
def download_publish(cls, log_type, crawler, search_word, strategy, video_dict, rule_dict, our_uid, oss_endpoint,
|
|
def download_publish(cls, log_type, crawler, search_word, strategy, video_dict, rule_dict, our_uid, oss_endpoint,
|
|
env, machine):
|
|
env, machine):
|
|
- filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
|
|
|
|
- for filter_word in filter_words:
|
|
|
|
- if filter_word in video_dict['video_title']:
|
|
|
|
- Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
|
|
|
|
- return
|
|
|
|
|
|
+
|
|
Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',
|
|
Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',
|
|
title=video_dict['video_title'], url=video_dict['video_url'])
|
|
title=video_dict['video_title'], url=video_dict['video_url'])
|
|
# 下载音频
|
|
# 下载音频
|