public.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/3/27
import ast
import os
import random
import sys
import time

sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper
# from common import Common
# from scheduling_db import MysqlHelper
  12. def get_user_from_mysql(log_type, crawler, source, env, action=''):
  13. sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
  14. results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
  15. if results:
  16. return results
  17. else:
  18. Common.logger(log_type, crawler).warning(f"爬虫:{crawler},没有查到抓取名单")
  19. return []
  20. def get_config_from_mysql(log_type, source, env, text, action=''):
  21. select_sql = f"""select * from crawler_config where source="{source}" """
  22. contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
  23. title_list = []
  24. filter_list = []
  25. emoji_list = []
  26. search_word_list = []
  27. for content in contents:
  28. config = content['config']
  29. config_dict = eval(config)
  30. for k, v in config_dict.items():
  31. if k == "title":
  32. title_list_config = v.split(",")
  33. for title in title_list_config:
  34. title_list.append(title)
  35. if k == "filter":
  36. filter_list_config = v.split(",")
  37. for filter_word in filter_list_config:
  38. filter_list.append(filter_word)
  39. if k == "emoji":
  40. emoji_list_config = v.split(",")
  41. for emoji in emoji_list_config:
  42. emoji_list.append(emoji)
  43. if k == "search_word":
  44. search_word_list_config = v.split(",")
  45. for search_word in search_word_list_config:
  46. search_word_list.append(search_word)
  47. if text == "title":
  48. return title_list
  49. elif text == "filter":
  50. return filter_list
  51. elif text == "emoji":
  52. return emoji_list
  53. elif text == "search_word":
  54. return search_word_list
  55. def random_title(log_type, crawler, env, text):
  56. random_title_list = get_config_from_mysql(log_type, crawler, env, text)
  57. return random.choice(random_title_list)
  58. def task_fun(task_str):
  59. task_str = task_str.replace("'[", '[').replace("]'", ']')
  60. task_dict = dict(eval(task_str))
  61. rule = task_dict['rule']
  62. task_dict['rule'] = dict()
  63. for item in rule:
  64. for k, val in item.items():
  65. task_dict['rule'][k] = val
  66. rule_dict = task_dict['rule']
  67. task_dict = {
  68. "task_dict": task_dict,
  69. "rule_dict": rule_dict
  70. }
  71. return task_dict
  72. def download_rule(log_type, crawler, video_dict, rule_dict):
  73. """
  74. 下载视频的基本规则
  75. :param log_type: 日志
  76. :param crawler: 哪款爬虫
  77. :param video_dict: 视频信息,字典格式
  78. :param rule_dict: 规则信息,字典格式
  79. :return: 满足规则,返回 True;反之,返回 False
  80. """
  81. # 格式化 video_dict:publish_time_stamp
  82. if "publish_time_stamp" in video_dict.keys():
  83. video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
  84. # 格式化 video_dict:period
  85. if "period" not in video_dict.keys() and "publish_time" in video_dict.keys():
  86. video_dict["period"] = int((int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000))
  87. # 格式化 rule_dict 最大值取值为 0 的问题
  88. for rule_value in rule_dict.values():
  89. if rule_value["max"] == 0:
  90. rule_value["max"] = 999999999999999
  91. # 格式化 rule_dict 有的 key,video_dict 中没有的问题
  92. for rule_key in rule_dict.keys():
  93. if rule_key not in video_dict.keys():
  94. video_dict[rule_key] = int(rule_dict[rule_key]["max"] / 2)
  95. # 比较结果,输出:True / False
  96. for video_key, video_value in video_dict.items():
  97. for rule_key, rule_value in rule_dict.items():
  98. if video_key == rule_key == "period":
  99. result = 0 <= int(video_value) <= int(rule_value["max"])
  100. Common.logger(log_type, crawler).info(f'{video_key}: 0 <= {video_value} <= {rule_value["min"]}, {result}')
  101. elif video_key == rule_key:
  102. result = int(rule_value["min"]) <= int(video_value) <= int(rule_value["max"])
  103. Common.logger(log_type, crawler).info(f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]},{result}')
  104. else:
  105. result = True
  106. if result is False:
  107. return False
  108. else:
  109. continue
  110. return True
if __name__ == "__main__":
    # Ad-hoc manual test calls, kept disabled for reference.
    # print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
    # print(get_config_from_mysql('test', 'gongzhonghao', 'prod', 'filter'))
    # print(filter_word('test', 'gongzhonghao', '公众号', 'prod'))
    # task_str = "[('task_id','11')," \
    #            "('task_name','小年糕小时榜')," \
    #            "('source','xiaoniangao')," \
    #            "('start_time','1681834560000')," \
    #            "('interval','1'),('mode','hour')," \
    #            "('rule','[{'duration':{'min':40,'max':0}},{'playCnt':{'min':4000,'max':0}},{'period':{'min':10,'max':0}},{'fans':{'min':0,'max':0}},{'videos':{'min':0,'max':0}},{'like':{'min':0,'max':0}},{'videoWidth':{'min':0,'max':0}},{'videoHeight':{'min':0,'max':0}}]')," \
    #            "('spider_name','')," \
    #            "('machine','')," \
    #            "('status','0')," \
    #            "('create_time','1681889875288')," \
    #            "('update_time','1681889904908')," \
    #            "('operator','王坤')]"
    # print(task(task_str))
    pass