|
@@ -2,16 +2,18 @@
|
|
|
# @Author: wangkun
|
|
|
# @Time: 2023/3/27
|
|
|
import os, sys
|
|
|
+import time
|
|
|
import random
|
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
|
from common.common import Common
|
|
|
from common.scheduling_db import MysqlHelper
|
|
|
+
|
|
|
+
|
|
|
# from common import Common
|
|
|
# from scheduling_db import MysqlHelper
|
|
|
|
|
|
|
|
|
-
|
|
|
# 过滤词库
|
|
|
def filter_word(log_type, crawler, source, env):
|
|
|
"""
|
|
@@ -93,6 +95,96 @@ def task_fun(task_str):
|
|
|
}
|
|
|
return task_dict
|
|
|
|
|
|
+
|
|
|
# Upper bound used when a rule has no 'max' or its 'max' is 0 (0 means "no limit").
_UNLIMITED_MAX = 100000000


def _check_range(logger, rule_dict, rule_key, video_dict, video_key, convert=int):
    """Log and evaluate one ``min <= value <= max`` rule for a single video field."""
    rule = rule_dict.get(rule_key, {})
    lower = int(rule.get('min', 0))
    upper = rule.get('max', _UNLIMITED_MAX)
    if upper == 0:
        upper = _UNLIMITED_MAX
    upper = int(upper)
    value = convert(video_dict[video_key])
    # The logged numbers are exactly the ones compared below.
    logger.info(f'rule_{rule_key}_max:{upper} >= {video_key}:{value} >= rule_{rule_key}_min:{lower}')
    return upper >= value >= lower


def _check_period(logger, rule_dict, video_dict):
    """Log and evaluate the publish-age rule (``period.min`` is a number of days)."""
    period_days = int(rule_dict.get('period', {}).get('min', 0))
    # Sample the clock once so the log line and the decision use the same instant.
    now = int(time.time())
    published = int(video_dict["publish_time_stamp"])
    if period_days == 0:
        # No period configured: treat it as "no age limit", like every other
        # rule's 0/missing bound, instead of rejecting every published video.
        logger.info(f'now:{now} - publish_time_stamp:{published} <= unlimited')
        return True
    limit = 3600 * 24 * period_days
    logger.info(f'now:{now} - publish_time_stamp:{published} <= {limit}')
    return now - published <= limit


def download_rule(log_type, crawler, video_dict, rule_dict):
    """
    Basic rules deciding whether a video should be downloaded.

    Every rule is logged and evaluated (no short-circuit), so the log always
    shows the complete comparison set for the video.

    :param log_type: log type, forwarded to ``Common.logger``
    :param crawler: crawler name, forwarded to ``Common.logger``
    :param video_dict: video info dict; must contain ``duration``, ``play_cnt``,
        ``publish_time_stamp``, ``like_cnt``, ``comment_cnt``, ``share_cnt``,
        ``video_width`` and ``video_height``
    :param rule_dict: rule info dict; each optional entry (``duration``,
        ``playCnt``, ``period``, ``like``, ``commentCnt``, ``shareCnt``,
        ``videoWidth``, ``videoHeight``) is a dict with ``min`` / ``max``.
        A missing rule, a missing bound, or a ``max`` of 0 means "no limit";
        a ``period.min`` of 0 means "no age limit".
    :return: True when the video satisfies every rule, otherwise False
    :raises KeyError: if ``video_dict`` lacks one of the required keys
    :raises ValueError: if a value cannot be converted to a number
    """
    logger = Common.logger(log_type, crawler)

    def in_range(rule_key, video_key, convert=int):
        return _check_range(logger, rule_dict, rule_key, video_dict, video_key, convert)

    # Evaluation order mirrors the order in which the rules are logged.
    results = [
        # Duration may arrive as a float or a float-like string, hence int(float(...)).
        in_range('duration', 'duration', lambda raw: int(float(raw))),
        in_range('playCnt', 'play_cnt'),
        _check_period(logger, rule_dict, video_dict),
        in_range('like', 'like_cnt'),
        in_range('commentCnt', 'comment_cnt'),
        in_range('shareCnt', 'share_cnt'),
        in_range('videoWidth', 'video_width'),
        in_range('videoHeight', 'video_height'),
    ]
    return all(results)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
|
|
|
# print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
|
|
|
print(get_config_from_mysql('hour', 'xiaoniangao', 'dev', 'emoji'))
|