public.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import ast
import os
import random
import sys

sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper
# from common import Common
# from scheduling_db import MysqlHelper


# Filter-word lexicon
def filter_word(log_type, crawler, source, env):
    """
    Fetch the filter-word lexicon for one platform.
    :param log_type: log type
    :param crawler: crawler name, e.g. xiaoniangao
    :param source: platform name, e.g. 小年糕
    :param env: environment (dev / prod)
    :return: word_list
    """
    select_sql = f""" select * from crawler_filter_word where source="{source}" """
    words = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
    word_list = []
    if len(words) == 0:
        return word_list
    for word in words:
        word_list.append(word['filter_word'])
    return word_list
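

# A minimal usage sketch (hypothetical values; assumes a reachable MySQL
# instance for the given env and populated crawler_filter_word rows):
#
#   banned = filter_word('public', 'xiaoniangao', '小年糕', 'dev')
#   if any(w in video_title for w in banned):
#       pass  # drop videos whose title hits the lexicon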


def get_user_from_mysql(log_type, crawler, source, env, action=''):
    """Fetch the list of accounts this crawler should scrape."""
    sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
    results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
    if results:
        return results
    Common.logger(log_type, crawler).warning(f"crawler: {crawler}, no user list found")
    return []
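

# Hypothetical call (the column names of crawler_user_v3 are an assumption):
#
#   users = get_user_from_mysql('recommend', 'xiaoniangao', 'xiaoniangao', 'dev')
#   for user in users:
#       print(user.get('uid'))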


def get_config_from_mysql(log_type, source, env, text, action=''):
    """Load the title / filter / emoji config lists for one platform."""
    select_sql = f"""select * from crawler_config where source="{source}" """
    contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
    title_list = []
    filter_list = []
    emoji_list = []
    for content in contents:
        # config is stored as a Python dict literal; parse it with the safe
        # literal evaluator rather than eval() on database-supplied text
        config_dict = ast.literal_eval(content['config'])
        for k, v in config_dict.items():
            if k == "title":
                title_list.extend(v.split(","))
            if k == "filter":
                filter_list.extend(v.split(","))
            if k == "emoji":
                emoji_list.extend(v.split(","))
    if text == "title":
        return title_list
    elif text == "filter":
        return filter_list
    elif text == "emoji":
        return emoji_list
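

# Sketch of the config payload this parser assumes: crawler_config.config
# holds a dict literal whose values are comma-separated strings, e.g.
#
#   {"title": "标题一,标题二", "filter": "广告,测试", "emoji": "🌸,🎉"}
#
#   get_config_from_mysql('hour', 'xiaoniangao', 'dev', 'emoji')
#   # -> ['🌸', '🎉']   (assuming the row above exists in the dev database)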


def random_title(log_type, crawler, env, text):
    """Pick one random title from the configured title list."""
    random_title_list = get_config_from_mysql(log_type, crawler, env, text)
    return random.choice(random_title_list)


def task_fun(task_str):
    """Parse a serialized task row into separate task and rule dicts."""
    # Strip the quotes wrapping the embedded rule list so the whole string
    # parses as one Python literal
    task_str = task_str.replace("'[{", '[{').replace("}}]'", '}}]')
    task_dict = dict(ast.literal_eval(task_str))
    # Flatten the rule list of single-key dicts into one dict keyed by field
    rule = task_dict['rule']
    task_dict['rule'] = dict()
    for item in rule:
        for k, val in item.items():
            task_dict['rule'][k] = val
    rule_dict = task_dict['rule']
    return {
        "task_dict": task_dict,
        "rule_dict": rule_dict
    }
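

# Worked example (derived from the commented task_str below): the rule
# column arrives as a list of single-key dicts and is flattened, e.g.
#
#   task_fun("[('task_id','11'),('rule','[{'duration':{'min':40,'max':0}}]')]")
#   # -> {'task_dict': {'task_id': '11',
#   #                   'rule': {'duration': {'min': 40, 'max': 0}}},
#   #     'rule_dict': {'duration': {'min': 40, 'max': 0}}}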


if __name__ == "__main__":
    # print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
    print(get_config_from_mysql('hour', 'xiaoniangao', 'dev', 'emoji'))
    # task_str = "[('task_id','11')," \
    #            "('task_name','小年糕小时榜')," \
    #            "('source','xiaoniangao')," \
    #            "('start_time','1681834560000')," \
    #            "('interval','1'),('mode','hour')," \
    #            "('rule','[{'duration':{'min':40,'max':0}},{'playCnt':{'min':4000,'max':0}},{'period':{'min':10,'max':0}},{'fans':{'min':0,'max':0}},{'videos':{'min':0,'max':0}},{'like':{'min':0,'max':0}},{'videoWidth':{'min':0,'max':0}},{'videoHeight':{'min':0,'max':0}}]')," \
    #            "('spider_name','')," \
    #            "('machine','')," \
    #            "('status','0')," \
    #            "('create_time','1681889875288')," \
    #            "('update_time','1681889904908')," \
    #            "('operator','王坤')]"
    # print(task_fun(task_str))
    pass