|
@@ -4,7 +4,6 @@
|
|
|
import os, sys
|
|
|
import time
|
|
|
import random
|
|
|
-
|
|
|
sys.path.append(os.getcwd())
|
|
|
from common.common import Common
|
|
|
from common.scheduling_db import MysqlHelper
|
|
@@ -12,27 +11,6 @@ from common.scheduling_db import MysqlHelper
|
|
|
# from scheduling_db import MysqlHelper
|
|
|
|
|
|
|
|
|
-# 过滤词库
|
|
|
-def filter_word(log_type, crawler, source, env):
|
|
|
- """
|
|
|
- 过滤词库
|
|
|
- :param log_type: 日志
|
|
|
- :param crawler: 哪款爬虫,如:xiaoniangao
|
|
|
- :param source: 哪款爬虫,如:小年糕
|
|
|
- :param env: 环境
|
|
|
- :return: word_list
|
|
|
- """
|
|
|
- select_sql = f""" select * from crawler_filter_word where source="{source}" """
|
|
|
- words = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
|
|
|
- word_list = []
|
|
|
- if len(words) == 0:
|
|
|
- return word_list
|
|
|
- for word in words:
|
|
|
- word_list.append(word['filter_word'])
|
|
|
-
|
|
|
- return word_list
|
|
|
-
|
|
|
-
|
|
|
def get_user_from_mysql(log_type, crawler, source, env, action=''):
|
|
|
sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
|
|
|
results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
|
|
@@ -109,89 +87,38 @@ def download_rule(log_type, crawler, video_dict, rule_dict):
|
|
|
:param rule_dict: 规则信息,字典格式
|
|
|
:return: 满足规则,返回 True;反之,返回 False
|
|
|
"""
|
|
|
- rule_playCnt_min = rule_dict.get('playCnt', {}).get('min', 0)
|
|
|
- rule_playCnt_max = rule_dict.get('playCnt', {}).get('max', 100000000)
|
|
|
- if rule_playCnt_max == 0:
|
|
|
- rule_playCnt_max = 100000000
|
|
|
-
|
|
|
- rule_duration_min = rule_dict.get('duration', {}).get('min', 0)
|
|
|
- rule_duration_max = rule_dict.get('duration', {}).get('max', 100000000)
|
|
|
- if rule_duration_max == 0:
|
|
|
- rule_duration_max = 100000000
|
|
|
-
|
|
|
- rule_period_min = rule_dict.get('period', {}).get('min', 0)
|
|
|
- # rule_period_max = rule_dict.get('period', {}).get('max', 100000000)
|
|
|
- # if rule_period_max == 0:
|
|
|
- # rule_period_max = 100000000
|
|
|
- #
|
|
|
- # rule_fans_min = rule_dict.get('fans', {}).get('min', 0)
|
|
|
- # rule_fans_max = rule_dict.get('fans', {}).get('max', 100000000)
|
|
|
- # if rule_fans_max == 0:
|
|
|
- # rule_fans_max = 100000000
|
|
|
- #
|
|
|
- # rule_videos_min = rule_dict.get('videos', {}).get('min', 0)
|
|
|
- # rule_videos_max = rule_dict.get('videos', {}).get('max', 100000000)
|
|
|
- # if rule_videos_max == 0:
|
|
|
- # rule_videos_max = 100000000
|
|
|
-
|
|
|
- rule_like_min = rule_dict.get('like', {}).get('min', 0)
|
|
|
- rule_like_max = rule_dict.get('like', {}).get('max', 100000000)
|
|
|
- if rule_like_max == 0:
|
|
|
- rule_like_max = 100000000
|
|
|
-
|
|
|
- rule_videoWidth_min = rule_dict.get('videoWidth', {}).get('min', 0)
|
|
|
- rule_videoWidth_max = rule_dict.get('videoWidth', {}).get('max', 100000000)
|
|
|
- if rule_videoWidth_max == 0:
|
|
|
- rule_videoWidth_max = 100000000
|
|
|
-
|
|
|
- rule_videoHeight_min = rule_dict.get('videoHeight', {}).get('min', 0)
|
|
|
- rule_videoHeight_max = rule_dict.get('videoHeight', {}).get('max', 100000000)
|
|
|
- if rule_videoHeight_max == 0:
|
|
|
- rule_videoHeight_max = 100000000
|
|
|
-
|
|
|
- rule_shareCnt_min = rule_dict.get('shareCnt', {}).get('min', 0)
|
|
|
- rule_shareCnt_max = rule_dict.get('shareCnt', {}).get('max', 100000000)
|
|
|
- if rule_shareCnt_max == 0:
|
|
|
- rule_shareCnt_max = 100000000
|
|
|
-
|
|
|
- rule_commentCnt_min = rule_dict.get('commentCnt', {}).get('min', 0)
|
|
|
- rule_commentCnt_max = rule_dict.get('commentCnt', {}).get('max', 100000000)
|
|
|
- if rule_commentCnt_max == 0:
|
|
|
- rule_commentCnt_max = 100000000
|
|
|
-
|
|
|
- Common.logger(log_type, crawler).info(
|
|
|
- f'rule_duration_max:{rule_duration_max} >= duration:{int(float(video_dict["duration"]))} >= rule_duration_min:{int(rule_duration_min)}')
|
|
|
- Common.logger(log_type, crawler).info(
|
|
|
- f'rule_playCnt_max:{int(rule_playCnt_max)} >= play_cnt:{int(video_dict["play_cnt"])} >= rule_playCnt_min:{int(rule_playCnt_min)}')
|
|
|
- Common.logger(log_type, crawler).info(
|
|
|
- f'now:{int(time.time())} - publish_time_stamp:{int(video_dict["publish_time_stamp"])} <= {3600 * 24 * int(rule_period_min)}')
|
|
|
- Common.logger(log_type, crawler).info(
|
|
|
- f'rule_like_max:{int(rule_like_max)} >= like_cnt:{int(video_dict["like_cnt"])} >= rule_like_min:{int(rule_like_min)}')
|
|
|
- Common.logger(log_type, crawler).info(
|
|
|
- f'rule_commentCnt_max:{int(rule_commentCnt_max)} >= comment_cnt:{int(video_dict["comment_cnt"])} >= rule_commentCnt_min:{int(rule_commentCnt_min)}')
|
|
|
- Common.logger(log_type, crawler).info(
|
|
|
- f'rule_shareCnt_max:{int(rule_shareCnt_max)} >= share_cnt:{int(video_dict["share_cnt"])} >= rule_shareCnt_min:{int(rule_shareCnt_min)}')
|
|
|
- Common.logger(log_type, crawler).info(
|
|
|
- f'rule_videoWidth_max:{int(rule_videoWidth_max)} >= video_width:{int(video_dict["video_width"])} >= rule_videoWidth_min:{int(rule_videoWidth_min)}')
|
|
|
- Common.logger(log_type, crawler).info(
|
|
|
- f'rule_videoHeight_max:{int(rule_videoHeight_max)} >= video_height:{int(video_dict["video_height"])} >= rule_videoHeight_min:{int(rule_videoHeight_min)}')
|
|
|
-
|
|
|
- if int(rule_duration_max) >= int(float(video_dict["duration"])) >= int(rule_duration_min) \
|
|
|
- and int(rule_playCnt_max) >= int(video_dict['play_cnt']) >= int(rule_playCnt_min) \
|
|
|
- and int(time.time()) - int(video_dict["publish_time_stamp"]) <= 3600 * 24 * int(rule_period_min) \
|
|
|
- and int(rule_like_max) >= int(video_dict['like_cnt']) >= int(rule_like_min) \
|
|
|
- and int(rule_commentCnt_max) >= int(video_dict['comment_cnt']) >= int(rule_commentCnt_min) \
|
|
|
- and int(rule_shareCnt_max) >= int(video_dict['share_cnt']) >= int(rule_shareCnt_min) \
|
|
|
- and int(rule_videoWidth_max) >= int(video_dict['video_width']) >= int(rule_videoWidth_min) \
|
|
|
- and int(rule_videoHeight_max) >= int(video_dict['video_height']) >= int(rule_videoHeight_min):
|
|
|
- return True
|
|
|
- else:
|
|
|
- return False
|
|
|
+ # 格式化 video_dict:publish_time_stamp
|
|
|
+ if "publish_time_stamp" in video_dict.keys():
|
|
|
+ video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
|
|
|
+ # 格式化 video_dict:period
|
|
|
+ if "period" not in video_dict.keys() and "publish_time" in video_dict.keys():
|
|
|
+ video_dict["period"] = int((int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000))
|
|
|
+ # 格式化 rule_dict 最大值取值为 0 的问题
|
|
|
+ for rule_value in rule_dict.values():
|
|
|
+ if rule_value["max"] == 0:
|
|
|
+ rule_value["max"] = 999999999999999
|
|
|
+ # 格式化 rule_dict 有的 key,video_dict 中没有的问题
|
|
|
+ for rule_key in rule_dict.keys():
|
|
|
+ if rule_key not in video_dict.keys():
|
|
|
+ video_dict[rule_key] = int(rule_dict[rule_key]["max"] / 2)
|
|
|
+ # 比较结果,输出:True / False
|
|
|
+ for video_key, video_value in video_dict.items():
|
|
|
+ for rule_key, rule_value in rule_dict.items():
|
|
|
+ if video_key == rule_key:
|
|
|
+ result = rule_value["min"] <= video_value <= rule_value["max"]
|
|
|
+ # print(f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]},{result}')
|
|
|
+ Common.logger(log_type, crawler).info(f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]},{result}')
|
|
|
+ if result is False:
|
|
|
+ return False
|
|
|
+ else:
|
|
|
+ continue
|
|
|
+ return True
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
# print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
|
|
|
- print(get_config_from_mysql('hour', 'xiaoniangao', 'prod', 'emoji'))
|
|
|
+ # print(get_config_from_mysql('test', 'gongzhonghao', 'prod', 'filter'))
|
|
|
+ # print(filter_word('test', 'gongzhonghao', '公众号', 'prod'))
|
|
|
# task_str = "[('task_id','11')," \
|
|
|
# "('task_name','小年糕小时榜')," \
|
|
|
# "('source','xiaoniangao')," \
|