# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import ast
import os
import sys
import time
import random
import difflib
sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper
# from common import Common
# from scheduling_db import MysqlHelper


def get_user_from_mysql(log_type, crawler, source, env, action=''):
    """Fetch the crawl user list for a source/mode; return [] when none is configured."""
    sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
    results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
    if results:
        return results
    Common.logger(log_type, crawler).warning(f"crawler: {crawler}, no crawl user list found")
    return []


def title_like(log_type, crawler, platform, title, env):
    """
    Title similarity check.
    :param log_type: log type
    :param crawler: which crawler
    :param platform: crawl channel, e.g. 公众号 (WeChat Official Accounts) / 小年糕 (Xiaoniangao)
    :param title: video title
    :param env: environment
    :return: True if similarity >= 80%, otherwise False
    """
    select_sql = f""" select video_title from crawler_video where platform="{platform}" """
    video_list = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
    if len(video_list) == 0:
        return False
    for video_dict in video_list:
        video_title = video_dict["video_title"]
        if difflib.SequenceMatcher(None, title, video_title).quick_ratio() >= 0.8:
            return True
    return False


def get_config_from_mysql(log_type, source, env, text, action=''):
    """Read the crawler_config row(s) for a source and return the list named by
    `text` (one of "title" / "filter" / "emoji" / "search_word")."""
    select_sql = f"""select * from crawler_config where source="{source}" """
    contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
    title_list = []
    filter_list = []
    emoji_list = []
    search_word_list = []
    for content in contents:
        # The config column stores a plain Python/JSON literal; ast.literal_eval
        # parses it without the code-execution risk of eval()
        config_dict = ast.literal_eval(content['config'])
        for k, v in config_dict.items():
            if k == "title":
                title_list.extend(v.split(","))
            if k == "filter":
                filter_list.extend(v.split(","))
            if k == "emoji":
                emoji_list.extend(v.split(","))
            if k == "search_word":
                search_word_list.extend(v.split(","))
    if text == "title":
        return title_list
    elif text == "filter":
        return filter_list
    elif text == "emoji":
        return emoji_list
    elif text == "search_word":
        return search_word_list


def random_title(log_type, crawler, env, text):
    """Pick a random title from the crawler's configured title list."""
    random_title_list = get_config_from_mysql(log_type, crawler, env, text)
    return random.choice(random_title_list)


def task_fun(task_str):
    """Parse a task record serialized as a list of (key, value) tuples. The 'rule'
    field is itself a serialized list of single-key dicts, which is flattened into
    one rule_dict. Returns {"task_dict": ..., "rule_dict": ...}."""
    # Strip the quotes wrapping the embedded rule list so the whole string is one literal
    task_str = task_str.replace("'[", '[').replace("]'", ']')
    task_dict = dict(ast.literal_eval(task_str))
    rule = task_dict['rule']
    task_dict['rule'] = dict()
    for item in rule:
        for k, val in item.items():
            task_dict['rule'][k] = val
    rule_dict = task_dict['rule']
    return {
        "task_dict": task_dict,
        "rule_dict": rule_dict
    }


def download_rule(log_type, crawler, video_dict, rule_dict):
    """
    Basic rules for deciding whether to download a video.
    :param log_type: log type
    :param crawler: which crawler
    :param video_dict: video info, dict
    :param rule_dict: rule info, dict
    :return: True if every rule is satisfied, otherwise False
    """
    # Normalize video_dict: derive publish_time (ms) from publish_time_stamp (s)
    if "publish_time_stamp" in video_dict.keys():
        video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
    # Normalize video_dict: derive period (days since publish) from publish_time
    if "period" not in video_dict.keys() and "publish_time" in video_dict.keys():
        video_dict["period"] = int((int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000))
    # Normalize rule_dict: a max of 0 means "no upper bound"
    for rule_value in rule_dict.values():
        if rule_value["max"] == 0:
            rule_value["max"] = 999999999999999
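    # Example (hypothetical rule, not from the task table): {"play_cnt": {"min": 4000, "max": 0}}
    # becomes {"play_cnt": {"min": 4000, "max": 999999999999999}}, i.e. "at least
    # 4000 plays, no upper bound".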
    # Normalize rule_dict keys missing from video_dict: fill with a value that
    # always passes (the midpoint of the rule's range)
    for rule_key in rule_dict.keys():
        if rule_key not in video_dict.keys():
            video_dict[rule_key] = int(rule_dict[rule_key]["max"] / 2)
    # Compare each video field against its rule; log each check and fail fast
    for video_key, video_value in video_dict.items():
        for rule_key, rule_value in rule_dict.items():
            if video_key == rule_key == "period":
                result = 0 <= int(video_value) <= int(rule_value["max"])
                Common.logger(log_type, crawler).info(f'{video_key}: 0 <= {video_value} <= {rule_value["max"]}, {result}')
            elif video_key == rule_key:
                result = int(rule_value["min"]) <= int(video_value) <= int(rule_value["max"])
                Common.logger(log_type, crawler).info(f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]}, {result}')
            else:
                result = True
            if not result:
                return False
    return True


if __name__ == "__main__":
    # print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
    # print(get_config_from_mysql('test', 'gongzhonghao', 'prod', 'filter'))
    # print(filter_word('test', 'gongzhonghao', '公众号', 'prod'))
    # task_str = "[('task_id','11')," \
    #            "('task_name','小年糕小时榜')," \
    #            "('source','xiaoniangao')," \
    #            "('start_time','1681834560000')," \
    #            "('interval','1'),('mode','hour')," \
    #            "('rule','[{'duration':{'min':40,'max':0}},{'playCnt':{'min':4000,'max':0}},{'period':{'min':10,'max':0}},{'fans':{'min':0,'max':0}},{'videos':{'min':0,'max':0}},{'like':{'min':0,'max':0}},{'videoWidth':{'min':0,'max':0}},{'videoHeight':{'min':0,'max':0}}]')," \
    #            "('spider_name','')," \
    #            "('machine','')," \
    #            "('status','0')," \
    #            "('create_time','1681889875288')," \
    #            "('update_time','1681889904908')," \
    #            "('operator','王坤')]"
    # print(task_fun(task_str))
    pass
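
# A minimal usage sketch (hypothetical values, not taken from the production task
# table) of how download_rule decides: each rule's max of 0 is normalized to
# "no upper bound", so the video below clears both thresholds and would be kept.
#
#   video = {"play_cnt": 5000, "duration": 60,
#            "publish_time_stamp": int(time.time()) - 24 * 3600}
#   rules = {"play_cnt": {"min": 4000, "max": 0},
#            "duration": {"min": 40, "max": 0}}
#   print(download_rule("test", "demo", video, rules))  # -> True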