# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
"""Helpers that read crawler filter words, author lists and per-source
configuration from the scheduling MySQL database."""
import ast
import os
import random
import sys

sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper


def filter_word(log_type, crawler, source, env):
    """Return the filter-word list configured for *source*.

    :param log_type: log channel name, e.g. "public"
    :param crawler: crawler name, e.g. "xiaoniangao"
    :param source: data-source label, e.g. "小年糕"
    :param env: environment name, e.g. "prod"
    :return: list of filter-word strings; empty list when none configured
    """
    # NOTE(review): *source* is interpolated straight into the SQL text.
    # Acceptable only for trusted internal values — a parameterized query
    # via MysqlHelper would be safer if its API supports placeholders.
    select_sql = f""" select * from crawler_filter_word where source="{source}" """
    words = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
    # Rows appear to be dict-like (indexed by column name). Guarding with
    # truthiness also tolerates a None result, unlike the old len() check.
    if not words:
        return []
    return [word['filter_word'] for word in words]


def get_user_from_mysql(log_type, crawler, source, env, action=''):
    """Return the active author/user rows to crawl for *source*.

    :param log_type: task type, matched against crawler_author_map.task_type
    :param crawler: crawler name (used for logging and DB routing)
    :param source: data-source label
    :param env: environment name
    :param action: passed through to MysqlHelper.get_values
    :return: list of result rows; empty list (with a warning log) when none
    """
    # is_del=1 marks active (non-deleted) entries in this schema —
    # presumably; verify against how rows are flagged elsewhere.
    sql = f"select * from crawler_author_map where source='{source}' and task_type='{log_type}' and is_del=1"
    results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
    if results:
        return results
    Common.logger(log_type, crawler).warning(f"爬虫:{crawler},没有查到抓取名单")
    return []


def get_config_from_mysql(log_type, source, env, text, action=''):
    """Fetch the crawler_config rows for *source* and return either the
    accumulated title list or filter list.

    Each config row stores a dict literal as text, with comma-separated
    values under the "title" and "filter" keys.

    :param log_type: log channel name
    :param source: data-source label (also passed as the crawler argument
        to MysqlHelper.get_values — preserved from the original code)
    :param env: environment name
    :param text: "title" or "filter"; any other value yields None
    :param action: passed through to MysqlHelper.get_values
    :return: list of strings, or None when *text* is unrecognized
    """
    select_sql = f"""select * from crawler_config where source='{source}' """
    contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
    title_list = []
    filter_list = []
    for content in contents:
        # SECURITY: the original code ran eval() on the stored config,
        # which executes arbitrary code. ast.literal_eval parses the same
        # dict-literal syntax but cannot execute anything.
        config_dict = ast.literal_eval(content['config'])
        for key, value in config_dict.items():
            if key == "title":
                title_list.extend(value.split(","))
            elif key == "filter":
                filter_list.extend(value.split(","))
    if text == "title":
        return title_list
    elif text == "filter":
        return filter_list
    # Falls through to an implicit None for any other *text* (unchanged).


def random_title(log_type, crawler, env, text):
    """Return one random entry from the configured list selected by *text*.

    NOTE(review): *crawler* is forwarded as the *source* argument of
    get_config_from_mysql — preserved as-is; confirm callers rely on it.
    Raises IndexError if the resulting list is empty.
    """
    random_title_list = get_config_from_mysql(log_type, crawler, env, text)
    return random.choice(random_title_list)


if __name__ == "__main__":
    print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))