|
@@ -668,15 +668,7 @@ class XiguaSearch:
|
|
if publish_time <= old_time:
|
|
if publish_time <= old_time:
|
|
Common.logger(log_type, crawler).error(f'关键词:{search_word},抓取完毕,退出抓取\n')
|
|
Common.logger(log_type, crawler).error(f'关键词:{search_word},抓取完毕,退出抓取\n')
|
|
return
|
|
return
|
|
- filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
|
|
|
|
- is_filter = False
|
|
|
|
- for filter_word in filter_words:
|
|
|
|
- if filter_word in video_dict['video_title']:
|
|
|
|
- is_filter = True
|
|
|
|
- break
|
|
|
|
- if is_filter:
|
|
|
|
- Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
|
|
|
|
- continue
|
|
|
|
|
|
+
|
|
if v_type == 'video':
|
|
if v_type == 'video':
|
|
item_id = video_info['data']['group_id']
|
|
item_id = video_info['data']['group_id']
|
|
if video_info['data']['publish_time'] <= old_time:
|
|
if video_info['data']['publish_time'] <= old_time:
|
|
@@ -693,6 +685,15 @@ class XiguaSearch:
|
|
|
|
|
|
try:
|
|
try:
|
|
video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
|
|
+ filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
|
|
|
|
+ is_filter = False
|
|
|
|
+ for filter_word in filter_words:
|
|
|
|
+ if filter_word in video_dict['video_title']:
|
|
|
|
+ is_filter = True
|
|
|
|
+ break
|
|
|
|
+ if is_filter:
|
|
|
|
+ Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
|
|
|
|
+ continue
|
|
video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
|
|
video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
|
|
video_dict['video_width'] = video_url_dict["video_width"]
|
|
video_dict['video_width'] = video_url_dict["video_width"]
|
|
video_dict['video_height'] = video_url_dict["video_height"]
|
|
video_dict['video_height'] = video_url_dict["video_height"]
|
|
@@ -939,20 +940,5 @@ class XiguaSearch:
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
- # print(Follow.get_signature("follow", "xigua", "95420624045", "local"))
|
|
|
|
- # XiguaSearch.get_search_videos('search', 'xigua', 'xigua_search', 'inner', 'prod', 'aliyun')
|
|
|
|
-
|
|
|
|
- # Follow.get_videolist(log_type="follow",
|
|
|
|
- # crawler="xigua",
|
|
|
|
- # strategy="定向爬虫策略",
|
|
|
|
- # our_uid="6267141",
|
|
|
|
- # out_uid="95420624045",
|
|
|
|
- # oss_endpoint="out",
|
|
|
|
- # env="dev",
|
|
|
|
- # machine="local")
|
|
|
|
- # print(Follow.random_signature())
|
|
|
|
- # rule = Follow.get_rule("follow", "xigua")
|
|
|
|
- # print(type(rule))
|
|
|
|
- # print(type(json.dumps(rule)))
|
|
|
|
- # print(json.dumps(rule))
|
|
|
|
- pass
|
|
|
|
|
|
+ XiguaSearch.get_search_videos('search', 'xigua', 'xigua_search', 'inner', 'prod', 'aliyun')
|
|
|
|
+
|