public.py 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/3/27
  4. import os, sys
  5. import random
  6. sys.path.append(os.getcwd())
  7. from common.common import Common
  8. from common.scheduling_db import MysqlHelper
  9. # 过滤词库
  10. def filter_word(log_type, crawler, source, env):
  11. """
  12. 过滤词库
  13. :param log_type: 日志
  14. :param crawler: 哪款爬虫,如:xiaoniangao
  15. :param source: 哪款爬虫,如:小年糕
  16. :param env: 环境
  17. :return: word_list
  18. """
  19. select_sql = f""" select * from crawler_filter_word where source="{source}" """
  20. words = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
  21. word_list = []
  22. if len(words) == 0:
  23. return word_list
  24. for word in words:
  25. word_list.append(word['filter_word'])
  26. return word_list
  27. def get_user_from_mysql(log_type, crawler, source, env, action=''):
  28. sql = f"select * from crawler_author_map where source='{source}' and task_type='{log_type}' and is_del=1"
  29. results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
  30. if results:
  31. return results
  32. else:
  33. Common.logger(log_type, crawler).warning(f"爬虫:{crawler},没有查到抓取名单")
  34. return []
  35. def get_config_from_mysql(log_type, source, env, text, action=''):
  36. select_sql = f"""select * from crawler_config where source='{source}' """
  37. contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
  38. title_list = []
  39. filter_list = []
  40. for content in contents:
  41. config = content['config']
  42. config_dict = eval(config)
  43. for k, v in config_dict.items():
  44. if k == "title":
  45. title_list_config = v.split(",")
  46. for title in title_list_config:
  47. title_list.append(title)
  48. if k == "filter":
  49. filter_list_config = v.split(",")
  50. for filter_word in filter_list_config:
  51. filter_list.append(filter_word)
  52. if text == "title":
  53. return title_list
  54. elif text == "filter":
  55. return filter_list
  56. def random_title(log_type, crawler, env, text):
  57. random_title_list = get_config_from_mysql(log_type, crawler, env, text)
  58. return random.choice(random_title_list)
  59. if __name__ == "__main__":
  60. print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
  61. # get_config_from_mysql('author', 'xigua', 'prod')