12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273 |
- # -*- coding: utf-8 -*-
- # @Author: wangkun
- # @Time: 2023/3/27
- import os, sys
- import random
- sys.path.append(os.getcwd())
- from common.common import Common
- from common.scheduling_db import MysqlHelper
- # 过滤词库
def filter_word(log_type, crawler, source, env):
    """
    Fetch the filter words stored for one source.

    :param log_type: log channel name, forwarded to MysqlHelper.get_values
    :param crawler: crawler identifier, e.g. xiaoniangao; forwarded to
        MysqlHelper.get_values
    :param source: value compared with the ``source`` column of
        crawler_filter_word
    :param env: environment selector, forwarded to MysqlHelper.get_values
    :return: list holding the ``filter_word`` field of every matching row;
        an empty list when the query yields nothing
    """
    # NOTE(review): `source` is interpolated directly into the SQL text.
    # Confirm callers never pass untrusted values, or move escaping /
    # parameter binding into MysqlHelper.
    select_sql = f""" select * from crawler_filter_word where source="{source}" """
    words = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
    # Truthiness instead of len(): also covers a None result, which
    # get_values presumably returns when the query fails -- TODO confirm.
    if not words:
        return []
    return [word['filter_word'] for word in words]
def get_user_from_mysql(log_type, crawler, source, env, action=''):
    """
    Load the author rows registered for one source and task type.

    Selects the rows of crawler_author_map whose ``source`` equals *source*,
    whose ``task_type`` equals *log_type* and whose ``is_del`` is 1.

    :param log_type: log channel name; also used as the task_type filter
    :param crawler: crawler identifier, used for the query helper and logging
    :param source: value compared with the ``source`` column
    :param env: environment selector, forwarded to MysqlHelper.get_values
    :param action: forwarded unchanged to MysqlHelper.get_values
    :return: the rows produced by the query; [] (after a warning is logged)
        when the helper returns an empty or otherwise falsy result
    """
    query = f"select * from crawler_author_map where source='{source}' and task_type='{log_type}' and is_del=1"
    rows = MysqlHelper.get_values(log_type, crawler, query, env, action=action)
    if not rows:
        # Nothing to crawl: record it and hand back an empty list so callers
        # can iterate without a None check.
        Common.logger(log_type, crawler).warning(f"爬虫:{crawler},没有查到抓取名单")
        return []
    return rows
def get_config_from_mysql(log_type, source, env, text, action=''):
    """
    Collect the comma-separated "title" or "filter" entries configured for
    one source.

    Every matching row of crawler_config carries a ``config`` field holding
    the textual form of a dict. The values under its "title" and "filter"
    keys are split on "," and accumulated across all rows.

    :param log_type: log channel name, forwarded to MysqlHelper.get_values
    :param source: value compared with the ``source`` column; also passed to
        MysqlHelper.get_values in the crawler position
    :param env: environment selector, forwarded to MysqlHelper.get_values
    :param text: "title" to get the title entries, "filter" to get the
        filter entries
    :param action: forwarded unchanged to MysqlHelper.get_values
    :return: list of str for the requested kind (possibly empty); None when
        *text* is neither "title" nor "filter"
    """
    # NOTE(review): `source` is interpolated directly into the SQL text.
    select_sql = f"""select * from crawler_config where source='{source}' """
    contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
    title_list = []
    filter_list = []
    # `or []` keeps a None result (presumably what get_values returns on a
    # failed query -- TODO confirm) from raising TypeError in the loop.
    for content in contents or []:
        # NOTE(review): eval() executes whatever text is stored in the config
        # column. If every stored config is a plain dict literal,
        # ast.literal_eval would be a safer drop-in -- verify the stored data
        # before switching.
        config_dict = eval(content['config'])
        for key, bucket in (("title", title_list), ("filter", filter_list)):
            if key in config_dict:
                bucket.extend(config_dict[key].split(","))
    if text == "title":
        return title_list
    if text == "filter":
        return filter_list
    # Unknown selector: same result as before, now explicit.
    return None
def random_title(log_type, crawler, env, text):
    """
    Return one randomly chosen entry from the configured list selected by
    *text*.

    :param log_type: log channel name, forwarded to get_config_from_mysql
    :param crawler: passed to get_config_from_mysql as its ``source`` argument
    :param env: environment selector, forwarded to get_config_from_mysql
    :param text: "title" or "filter", choosing which configured list to draw
        from
    :return: a single element of that list
    :raises IndexError: from random.choice when the list is empty
    :raises TypeError: from random.choice when get_config_from_mysql returns
        None (any other *text* value)
    """
    return random.choice(get_config_from_mysql(log_type, crawler, env, text))
if __name__ == "__main__":
    # Manual check: print the filter words stored for the xiaoniangao source
    # in the prod environment.
    filter_words = filter_word('public', 'xiaoniangao', '小年糕', 'prod')
    print(filter_words)
    # Earlier manual check, kept for reference. As written it omits the
    # required `text` argument:
    # get_config_from_mysql('author', 'xigua', 'prod')
|