# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
"""Shared crawler helpers: filter words, user lists, config lookup and download rules."""
import os
import sys
import time
import random

# Make the current working directory importable before the project imports below.
sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper

# Alternative imports kept from the original (presumably for running this file
# from inside the package directory -- confirm before re-enabling):
# from common import Common
# from scheduling_db import MysqlHelper

# Upper bound substituted when a rule's "max" is missing or 0 (treated as "no limit").
_UNLIMITED_MAX = 100000000


# Filter-word lexicon
def filter_word(log_type, crawler, source, env) -> list:
    """
    Load the filter words configured for one source.

    :param log_type: log type, forwarded to MysqlHelper.get_values
    :param crawler: crawler identifier, e.g. xiaoniangao
    :param source: value matched against the ``source`` column of crawler_filter_word
    :param env: environment, forwarded to MysqlHelper.get_values
    :return: list of the ``filter_word`` values of the matching rows; empty when
        the query yields nothing
    """
    # SECURITY: `source` is interpolated straight into the SQL text. Switch to a
    # parameterized query if MysqlHelper supports one and `source` can ever come
    # from untrusted input.
    select_sql = f""" select * from crawler_filter_word where source="{source}" """
    words = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
    if not words:
        return []
    return [word['filter_word'] for word in words]


def get_user_from_mysql(log_type, crawler, source, env, action=''):
    """
    Fetch the crawl user list for a source and mode from crawler_user_v3.

    :param log_type: log type; also used as the ``mode`` filter in the query
    :param crawler: crawler identifier, used for the query helper and the logger
    :param source: value matched against the ``source`` column
    :param env: environment, forwarded to MysqlHelper.get_values
    :param action: forwarded to MysqlHelper.get_values
    :return: the query result when it is truthy; otherwise an empty list (a
        warning is logged in that case)
    """
    # SECURITY: values are interpolated straight into the SQL text (see filter_word).
    sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
    results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
    if results:
        return results
    Common.logger(log_type, crawler).warning(f"爬虫:{crawler},没有查到抓取名单")
    return []


def get_config_from_mysql(log_type, source, env, text, action=''):
    """
    Collect one kind of comma-separated config values for a source.

    Every matching crawler_config row carries a ``config`` string that is
    evaluated to a dict; the values stored under "title", "filter" and "emoji"
    are split on "," and accumulated across rows.

    :param log_type: log type, forwarded to MysqlHelper.get_values
    :param source: value matched against the ``source`` column; it is also passed
        to MysqlHelper.get_values in the crawler position
    :param env: environment, forwarded to MysqlHelper.get_values
    :param text: which list to return: "title", "filter" or "emoji"
    :param action: forwarded to MysqlHelper.get_values
    :return: the accumulated list for ``text``; None when ``text`` is not one of
        the three supported names
    """
    # SECURITY: `source` is interpolated straight into the SQL text (see filter_word).
    select_sql = f"""select * from crawler_config where source="{source}" """
    contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)

    collected = {"title": [], "filter": [], "emoji": []}
    for content in contents or []:
        # SECURITY: eval() executes whatever is stored in the config column.
        # If the column only ever holds Python literals, ast.literal_eval is a
        # safer drop-in -- confirm the stored format before switching.
        config_dict = eval(content['config'])
        for key, value in config_dict.items():
            if key in collected:
                collected[key].extend(value.split(","))
    return collected.get(text)


def random_title(log_type, crawler, env, text):
    """
    Pick one random entry from the config list selected by ``text``.

    :param log_type: log type, forwarded to get_config_from_mysql
    :param crawler: crawler identifier, used as the config ``source``
    :param env: environment
    :param text: "title", "filter" or "emoji"
    :return: one randomly chosen entry
    :raises IndexError: when the selected config list is empty
    :raises TypeError: when ``text`` is not a supported name (the lookup yields None)
    """
    random_title_list = get_config_from_mysql(log_type, crawler, env, text)
    return random.choice(random_title_list)


def task_fun(task_str) -> dict:
    """
    Parse a serialized task description into a task dict and a flat rule dict.

    ``task_str`` is the text of a list of ``(key, value)`` pairs in which the
    rule list is wrapped in quotes; see the sample in the ``__main__`` section.

    :param task_str: serialized task
    :return: ``{"task_dict": ..., "rule_dict": ...}`` where ``task_dict['rule']``
        and ``rule_dict`` are the same dict, built by merging every dict of the
        rule list (later entries win on duplicate keys)
    :raises KeyError: when the task has no 'rule' entry
    """
    # Strip the quotes around the rule list so it evaluates as a nested list of
    # dicts instead of a (malformed) string literal.
    task_str = task_str.replace("'[{", '[{').replace("}}]'", '}}]')
    # SECURITY: eval() executes the task text. If tasks only ever contain Python
    # literals, ast.literal_eval is a safer drop-in -- confirm before switching.
    task_dict = dict(eval(task_str))

    rule_dict = {}
    for item in task_dict['rule']:
        rule_dict.update(item)
    task_dict['rule'] = rule_dict

    return {
        "task_dict": task_dict,
        "rule_dict": rule_dict
    }


def _rule_range(rule_dict, key):
    """Return ``(min, max)`` for one rule key; a missing or 0 max means unlimited."""
    bounds = rule_dict.get(key, {})
    lower = bounds.get('min', 0)
    upper = bounds.get('max', _UNLIMITED_MAX)
    if upper == 0:
        upper = _UNLIMITED_MAX
    return lower, upper


def _count_in_range(log_type, crawler, video_dict, rule_dict, rule_key, field):
    """Log and evaluate ``max >= int(video_dict[field]) >= min`` for one rule key."""
    lower, upper = _rule_range(rule_dict, rule_key)
    upper = int(upper)
    value = int(video_dict[field])
    lower = int(lower)
    Common.logger(log_type, crawler).info(
        f'rule_{rule_key}_max:{upper} >= {field}:{value} >= rule_{rule_key}_min:{lower}')
    return upper >= value >= lower


def download_rule(log_type, crawler, video_dict, rule_dict) -> bool:
    """
    Basic rules deciding whether a video should be downloaded.

    Every comparison is logged before the combined result is returned, so all
    eight log lines are emitted even when an early rule already fails.

    :param log_type: log type, used for the logger
    :param crawler: crawler identifier, used for the logger
    :param video_dict: video info; must provide "duration", "play_cnt",
        "publish_time_stamp", "like_cnt", "comment_cnt", "share_cnt",
        "video_width" and "video_height"
    :param rule_dict: rule info; each key maps to a dict with optional 'min' / 'max'
    :return: True when every rule is satisfied, otherwise False
    :raises KeyError: when ``video_dict`` lacks one of the required fields
    """
    results = []

    # Duration may arrive as a float (or float-like string); it is truncated to int.
    # The log prints the raw max while the comparison uses int(max), as before.
    duration_min, duration_max = _rule_range(rule_dict, 'duration')
    duration = int(float(video_dict["duration"]))
    Common.logger(log_type, crawler).info(
        f'rule_duration_max:{duration_max} >= duration:{duration} >= rule_duration_min:{int(duration_min)}')
    results.append(int(duration_max) >= duration >= int(duration_min))

    results.append(_count_in_range(log_type, crawler, video_dict, rule_dict, 'playCnt', 'play_cnt'))

    # Period: the video must have been published within `period.min` days.
    # NOTE(review): with the default of 0 only videos whose publish timestamp is
    # not in the past can pass -- presumably every task sets period.min; confirm.
    # period.max, fans and videos rules are not enforced here.
    period_min = rule_dict.get('period', {}).get('min', 0)
    now = int(time.time())
    publish_time_stamp = int(video_dict["publish_time_stamp"])
    max_age = 3600 * 24 * int(period_min)
    Common.logger(log_type, crawler).info(
        f'now:{now} - publish_time_stamp:{publish_time_stamp} <= {max_age}')
    results.append(now - publish_time_stamp <= max_age)

    for rule_key, field in (
        ('like', 'like_cnt'),
        ('commentCnt', 'comment_cnt'),
        ('shareCnt', 'share_cnt'),
        ('videoWidth', 'video_width'),
        ('videoHeight', 'video_height'),
    ):
        results.append(_count_in_range(log_type, crawler, video_dict, rule_dict, rule_key, field))

    return all(results)


if __name__ == "__main__":
    # print(filter_word('public', 'xiaoniangao', '<source display name>', 'prod'))
    print(get_config_from_mysql('hour', 'xiaoniangao', 'dev', 'emoji'))
    # Sample input for task_fun:
    # task_str = "[('task_id','11')," \
    #            "('task_name','xiaoniangao hourly ranking')," \
    #            "('source','xiaoniangao')," \
    #            "('start_time','1681834560000')," \
    #            "('interval','1'),('mode','hour')," \
    #            "('rule','[{'duration':{'min':40,'max':0}},{'playCnt':{'min':4000,'max':0}},{'period':{'min':10,'max':0}},{'fans':{'min':0,'max':0}},{'videos':{'min':0,'max':0}},{'like':{'min':0,'max':0}},{'videoWidth':{'min':0,'max':0}},{'videoHeight':{'min':0,'max':0}}]')," \
    #            "('spider_name','')," \
    #            "('machine','')," \
    #            "('status','0')," \
    #            "('create_time','1681889875288')," \
    #            "('update_time','1681889904908')," \
    #            "('operator','wangkun')]"
    # print(task_fun(task_str))