# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import ast
import os
import random
import sys
import time

# The working directory must be on sys.path before the project imports below.
sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper
# from common import Common
# from scheduling_db import MysqlHelper
def get_user_from_mysql(log_type, crawler, source, env, action=''):
    """Fetch the crawl list for a source from the ``crawler_user_v3`` table.

    Rows are selected where ``source`` equals *source* and ``mode`` equals
    *log_type*.

    :param log_type: log type; also used as the ``mode`` filter in the query
    :param crawler: crawler name, used for logging and passed to MysqlHelper
    :param source: value matched against the ``source`` column
    :param env: environment, passed through to MysqlHelper.get_values
    :param action: passed through to MysqlHelper.get_values
    :return: the rows returned by MysqlHelper.get_values, or ``[]`` (after
        logging a warning) when the result is empty or falsy
    """
    # NOTE(review): the statement is assembled by string interpolation; if
    # source or log_type can ever come from outside, this is open to SQL
    # injection — confirm whether MysqlHelper supports bound parameters.
    query = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
    rows = MysqlHelper.get_values(log_type, crawler, query, env, action=action)
    if not rows:
        Common.logger(log_type, crawler).warning(f"爬虫:{crawler},没有查到抓取名单")
        return []
    return rows
def get_config_from_mysql(log_type, source, env, text, action=''):
    """Collect one category of configured words for a source.

    Every row of ``crawler_config`` whose ``source`` matches is read. Each
    row's ``config`` field is parsed as a dict literal, and the
    comma-separated string stored under the key *text* is split and appended
    to the result, in row order.

    :param log_type: log type, passed to MysqlHelper.get_values
    :param source: value matched against the ``source`` column; also passed
        to MysqlHelper.get_values as its second argument
    :param env: environment, passed through to MysqlHelper.get_values
    :param text: category to return: "title", "filter", "emoji" or
        "search_word"
    :param action: passed through to MysqlHelper.get_values
    :return: list of strings for the requested category (empty when nothing
        is configured), or ``None`` when *text* is not a known category
    """
    # NOTE(review): the statement is assembled by string interpolation —
    # confirm whether MysqlHelper supports bound parameters.
    select_sql = f"""select * from crawler_config where source="{source}" """
    contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)

    if text not in ("title", "filter", "emoji", "search_word"):
        # Unknown categories have always produced None; keep that contract.
        return None

    values = []
    # A falsy query result (None / empty) is treated as "no rows".
    for content in contents or ():
        # NOTE(review): this used to be eval(); literal_eval accepts the same
        # dict literals but refuses to execute arbitrary expressions stored
        # in the database.
        config_dict = ast.literal_eval(content['config'])
        if text in config_dict:
            values.extend(config_dict[text].split(","))
    return values
def random_title(log_type, crawler, env, text):
    """Return one randomly chosen configured value for a crawler.

    The candidates come from ``get_config_from_mysql(log_type, crawler, env,
    text)``; ``random.choice`` raises ``IndexError`` when that list is empty.

    :param log_type: log type, passed to get_config_from_mysql
    :param crawler: crawler name, used as the config source
    :param env: environment, passed to get_config_from_mysql
    :param text: config category to pick from
    :return: one element of the configured list
    """
    return random.choice(get_config_from_mysql(log_type, crawler, env, text))
def task_fun(task_str):
    """Parse a serialized task description into task and rule dictionaries.

    *task_str* is the text form of a list of ``(key, value)`` pairs. The
    value stored under ``'rule'`` is a list of single-entry dicts that is
    itself wrapped in quotes, e.g.
    ``('rule','[{'duration':{'min':40,'max':0}}, ...]')``.

    :param task_str: serialized task description
    :return: ``{"task_dict": ..., "rule_dict": ...}`` where ``task_dict`` is
        the parsed task with its ``'rule'`` entry flattened into one dict,
        and ``rule_dict`` is that same flattened dict object
    :raises KeyError: when the task has no ``'rule'`` entry
    :raises ValueError, SyntaxError: when *task_str* is not a plain literal
    """
    # Drop the quotes wrapping the rule list so the whole string becomes one
    # nested literal that can be parsed in a single pass.
    literal_text = task_str.replace("'[", '[').replace("]'", ']')
    # NOTE(review): this used to be eval(); literal_eval parses the same
    # literal structure but refuses to execute arbitrary expressions.
    task_dict = dict(ast.literal_eval(literal_text))

    # Merge the list of small dicts into one; later entries win on key clash.
    rule_dict = {
        key: val
        for item in task_dict['rule']
        for key, val in item.items()
    }
    task_dict['rule'] = rule_dict
    return {
        "task_dict": task_dict,
        "rule_dict": rule_dict,
    }
def download_rule(log_type, crawler, video_dict, rule_dict):
    """Check whether a video satisfies every min/max download rule.

    Both dictionaries are normalized IN PLACE before the comparison:

    * ``video_dict["publish_time"]`` is set to ``publish_time_stamp * 1000``
      when ``publish_time_stamp`` is present (presumably seconds to
      milliseconds — confirm with callers).
    * ``video_dict["period"]`` is filled in, when missing and
      ``publish_time`` is available, as the whole number of days between now
      and ``publish_time`` (treated as a millisecond timestamp).
    * A rule whose ``max`` is 0 is treated as unbounded: its ``max`` is
      replaced by 999999999999999.
    * A rule key that is missing from ``video_dict`` is added there with
      half of that rule's ``max``.

    Each video field that has a rule is then checked and logged. ``period``
    is checked against ``0 <= value <= max`` (its ``min`` is ignored); every
    other field against ``min <= value <= max``.

    :param log_type: log type, passed to Common.logger
    :param crawler: crawler name, passed to Common.logger
    :param video_dict: video information; mutated as described above
    :param rule_dict: maps a field name to a dict with "min" and "max";
        mutated as described above
    :return: True when every rule is satisfied, False at the first failure
    """
    ms_per_day = 3600 * 24 * 1000

    # Normalize video_dict: publish_time_stamp -> publish_time
    if "publish_time_stamp" in video_dict:
        video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000

    # Normalize video_dict: derive period (days) from publish_time
    if "period" not in video_dict and "publish_time" in video_dict:
        elapsed_ms = int(time.time() * 1000) - video_dict["publish_time"]
        video_dict["period"] = int(elapsed_ms / ms_per_day)

    for rule_key, rule_value in rule_dict.items():
        # A max of 0 means "no upper limit".
        if rule_value["max"] == 0:
            rule_value["max"] = 999999999999999
        # Give fields the video does not report a mid-range placeholder.
        if rule_key not in video_dict:
            video_dict[rule_key] = int(rule_value["max"] / 2)

    # Compare in video_dict order; fields without a rule always pass.
    for video_key, video_value in video_dict.items():
        if video_key not in rule_dict:
            continue
        rule_value = rule_dict[video_key]
        if video_key == "period":
            result = 0 <= int(video_value) <= int(rule_value["max"])
            # The upper bound shown is max, matching the comparison above.
            Common.logger(log_type, crawler).info(f'{video_key}: 0 <= {video_value} <= {rule_value["max"]}, {result}')
        else:
            result = int(rule_value["min"]) <= int(video_value) <= int(rule_value["max"])
            Common.logger(log_type, crawler).info(f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]},{result}')
        if not result:
            return False
    return True
if __name__ == "__main__":
    # Manual smoke tests. Everything below is commented out, so running this
    # module directly does nothing.
    #
    # NOTE(review): filter_word is not defined in the visible part of this
    # file — confirm it exists before re-enabling the calls that use it.
    # print(filter_word('public', 'xiaoniangao', 'Xiaoniangao', 'prod'))
    # print(get_config_from_mysql('test', 'gongzhonghao', 'prod', 'filter'))
    # print(filter_word('test', 'gongzhonghao', 'official account', 'prod'))
    #
    # Sample serialized task for task_fun (the final call originally read
    # task(task_str), a name that is not defined in this file):
    # task_str = "[('task_id','11')," \
    #            "('task_name','Xiaoniangao hourly ranking')," \
    #            "('source','xiaoniangao')," \
    #            "('start_time','1681834560000')," \
    #            "('interval','1'),('mode','hour')," \
    #            "('rule','[{'duration':{'min':40,'max':0}},{'playCnt':{'min':4000,'max':0}},{'period':{'min':10,'max':0}},{'fans':{'min':0,'max':0}},{'videos':{'min':0,'max':0}},{'like':{'min':0,'max':0}},{'videoWidth':{'min':0,'max':0}},{'videoHeight':{'min':0,'max':0}}]')," \
    #            "('spider_name','')," \
    #            "('machine','')," \
    #            "('status','0')," \
    #            "('create_time','1681889875288')," \
    #            "('update_time','1681889904908')," \
    #            "('operator','Wang Kun')]"
    # print(task_fun(task_str))
    pass