
# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import os
import sys
import ast
import time
import random

sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper
# from common import Common
# from scheduling_db import MysqlHelper


# Filter-word lexicon
def filter_word(log_type, crawler, source, env):
    """
    Fetch the filter-word lexicon for a content source.
    :param log_type: log type
    :param crawler: which crawler, e.g. xiaoniangao
    :param source: which content source, e.g. 小年糕
    :param env: environment
    :return: word_list
    """
    select_sql = f""" select * from crawler_filter_word where source="{source}" """
    words = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
    word_list = []
    if len(words) == 0:
        return word_list
    for word in words:
        word_list.append(word['filter_word'])
    return word_list
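
# Usage sketch (hypothetical call and values; assumes the MySQL connection that
# MysqlHelper reads from its own config is reachable for the given env):
#   banned = filter_word('public', 'xiaoniangao', '小年糕', 'prod')
#   if any(w in video_title for w in banned):
#       ...skip this video...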


def get_user_from_mysql(log_type, crawler, source, env, action=''):
    sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
    results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
    if results:
        return results
    else:
        Common.logger(log_type, crawler).warning(f"crawler: {crawler}, no crawl user list found")
        return []
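
# Example (hypothetical arguments): get_user_from_mysql('hour', 'xiaoniangao',
# 'xiaoniangao', 'prod') returns the crawler_user_v3 rows whose mode matches the
# 'hour' log type, or [] (with a warning logged) when no users are configured.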


def get_config_from_mysql(log_type, source, env, text, action=''):
    select_sql = f"""select * from crawler_config where source="{source}" """
    contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
    title_list = []
    filter_list = []
    emoji_list = []
    search_word_list = []
    for content in contents:
        # The config column stores a Python-literal dict; ast.literal_eval parses
        # it without the arbitrary-code-execution risk of eval().
        config_dict = ast.literal_eval(content['config'])
        for k, v in config_dict.items():
            # Each config value is a comma-separated string.
            if k == "title":
                title_list.extend(v.split(","))
            if k == "filter":
                filter_list.extend(v.split(","))
            if k == "emoji":
                emoji_list.extend(v.split(","))
            if k == "search_word":
                search_word_list.extend(v.split(","))
    if text == "title":
        return title_list
    elif text == "filter":
        return filter_list
    elif text == "emoji":
        return emoji_list
    elif text == "search_word":
        return search_word_list
    # Unknown keys previously fell through and implicitly returned None.
    return []
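
# The config column is expected to hold a dict literal along these lines
# (hypothetical row value; only the four keys handled above are read):
#   "{'title': '晚风,岁月', 'filter': '测试,广告', 'emoji': '🌹,👍', 'search_word': '养生'}"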


def random_title(log_type, crawler, env, text):
    random_title_list = get_config_from_mysql(log_type, crawler, env, text)
    return random.choice(random_title_list)


def task_fun(task_str):
    # Strip the quotes wrapping the embedded rule list so the whole string
    # becomes one Python literal: a list of (key, value) tuples.
    task_str = task_str.replace("'[", '[').replace("]'", ']')
    task_dict = dict(ast.literal_eval(task_str))
    # Flatten the list of single-key rule dicts into one rule dict.
    rule = task_dict['rule']
    task_dict['rule'] = dict()
    for item in rule:
        for k, val in item.items():
            task_dict['rule'][k] = val
    rule_dict = task_dict['rule']
    return {
        "task_dict": task_dict,
        "rule_dict": rule_dict
    }
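
# Shape sketch (hypothetical input, mirroring the commented example under
# __main__): task_fun("[('task_id','11'),('rule','[{...}]')]") returns
#   {"task_dict": {..., 'rule': {merged rule dict}}, "rule_dict": {merged rule dict}}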


def download_rule(log_type, crawler, video_dict, rule_dict):
    """
    Basic rules for deciding whether to download a video.
    :param log_type: log type
    :param crawler: which crawler
    :param video_dict: video info, as a dict
    :param rule_dict: rule info, as a dict
    :return: True if every rule is satisfied, otherwise False
    """
    # For every metric, a missing rule defaults to the range [0, 100000000];
    # max == 0 also means "unbounded" and is widened to the same sentinel.
    rule_playCnt_min = rule_dict.get('playCnt', {}).get('min', 0)
    rule_playCnt_max = rule_dict.get('playCnt', {}).get('max', 100000000)
    if rule_playCnt_max == 0:
        rule_playCnt_max = 100000000

    rule_duration_min = rule_dict.get('duration', {}).get('min', 0)
    rule_duration_max = rule_dict.get('duration', {}).get('max', 100000000)
    if rule_duration_max == 0:
        rule_duration_max = 100000000

    rule_period_min = rule_dict.get('period', {}).get('min', 0)
    # rule_period_max = rule_dict.get('period', {}).get('max', 100000000)
    # if rule_period_max == 0:
    #     rule_period_max = 100000000
    #
    # rule_fans_min = rule_dict.get('fans', {}).get('min', 0)
    # rule_fans_max = rule_dict.get('fans', {}).get('max', 100000000)
    # if rule_fans_max == 0:
    #     rule_fans_max = 100000000
    #
    # rule_videos_min = rule_dict.get('videos', {}).get('min', 0)
    # rule_videos_max = rule_dict.get('videos', {}).get('max', 100000000)
    # if rule_videos_max == 0:
    #     rule_videos_max = 100000000

    rule_like_min = rule_dict.get('like', {}).get('min', 0)
    rule_like_max = rule_dict.get('like', {}).get('max', 100000000)
    if rule_like_max == 0:
        rule_like_max = 100000000

    rule_videoWidth_min = rule_dict.get('videoWidth', {}).get('min', 0)
    rule_videoWidth_max = rule_dict.get('videoWidth', {}).get('max', 100000000)
    if rule_videoWidth_max == 0:
        rule_videoWidth_max = 100000000

    rule_videoHeight_min = rule_dict.get('videoHeight', {}).get('min', 0)
    rule_videoHeight_max = rule_dict.get('videoHeight', {}).get('max', 100000000)
    if rule_videoHeight_max == 0:
        rule_videoHeight_max = 100000000

    rule_shareCnt_min = rule_dict.get('shareCnt', {}).get('min', 0)
    rule_shareCnt_max = rule_dict.get('shareCnt', {}).get('max', 100000000)
    if rule_shareCnt_max == 0:
        rule_shareCnt_max = 100000000

    rule_commentCnt_min = rule_dict.get('commentCnt', {}).get('min', 0)
    rule_commentCnt_max = rule_dict.get('commentCnt', {}).get('max', 100000000)
    if rule_commentCnt_max == 0:
        rule_commentCnt_max = 100000000

    Common.logger(log_type, crawler).info(
        f'rule_duration_max:{rule_duration_max} >= duration:{int(float(video_dict["duration"]))} >= rule_duration_min:{int(rule_duration_min)}')
    Common.logger(log_type, crawler).info(
        f'rule_playCnt_max:{int(rule_playCnt_max)} >= play_cnt:{int(video_dict["play_cnt"])} >= rule_playCnt_min:{int(rule_playCnt_min)}')
    Common.logger(log_type, crawler).info(
        f'now:{int(time.time())} - publish_time_stamp:{int(video_dict["publish_time_stamp"])} <= {3600 * 24 * int(rule_period_min)}')
    Common.logger(log_type, crawler).info(
        f'rule_like_max:{int(rule_like_max)} >= like_cnt:{int(video_dict["like_cnt"])} >= rule_like_min:{int(rule_like_min)}')
    Common.logger(log_type, crawler).info(
        f'rule_commentCnt_max:{int(rule_commentCnt_max)} >= comment_cnt:{int(video_dict["comment_cnt"])} >= rule_commentCnt_min:{int(rule_commentCnt_min)}')
    Common.logger(log_type, crawler).info(
        f'rule_shareCnt_max:{int(rule_shareCnt_max)} >= share_cnt:{int(video_dict["share_cnt"])} >= rule_shareCnt_min:{int(rule_shareCnt_min)}')
    Common.logger(log_type, crawler).info(
        f'rule_videoWidth_max:{int(rule_videoWidth_max)} >= video_width:{int(video_dict["video_width"])} >= rule_videoWidth_min:{int(rule_videoWidth_min)}')
    Common.logger(log_type, crawler).info(
        f'rule_videoHeight_max:{int(rule_videoHeight_max)} >= video_height:{int(video_dict["video_height"])} >= rule_videoHeight_min:{int(rule_videoHeight_min)}')

    if int(rule_duration_max) >= int(float(video_dict["duration"])) >= int(rule_duration_min) \
            and int(rule_playCnt_max) >= int(video_dict['play_cnt']) >= int(rule_playCnt_min) \
            and int(time.time()) - int(video_dict["publish_time_stamp"]) <= 3600 * 24 * int(rule_period_min) \
            and int(rule_like_max) >= int(video_dict['like_cnt']) >= int(rule_like_min) \
            and int(rule_commentCnt_max) >= int(video_dict['comment_cnt']) >= int(rule_commentCnt_min) \
            and int(rule_shareCnt_max) >= int(video_dict['share_cnt']) >= int(rule_shareCnt_min) \
            and int(rule_videoWidth_max) >= int(video_dict['video_width']) >= int(rule_videoWidth_min) \
            and int(rule_videoHeight_max) >= int(video_dict['video_height']) >= int(rule_videoHeight_min):
        return True
    else:
        return False
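
# Worked example (hypothetical): with
#   rule_dict = {'duration': {'min': 40, 'max': 0}, 'period': {'min': 10, 'max': 0}}
# a 60-second video published 2 days ago passes the duration check (max == 0
# widens the bound to 100000000) and the period check, since
# now - publish_time_stamp <= 3600 * 24 * 10. Note that period['min'] acts as a
# maximum age in days; all unspecified metrics default to the permissive range
# [0, 100000000], so download_rule returns True for this video.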


if __name__ == "__main__":
    # print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
    print(get_config_from_mysql('hour', 'xiaoniangao', 'prod', 'emoji'))
    # task_str = "[('task_id','11')," \
    #            "('task_name','小年糕小时榜')," \
    #            "('source','xiaoniangao')," \
    #            "('start_time','1681834560000')," \
    #            "('interval','1'),('mode','hour')," \
    #            "('rule','[{'duration':{'min':40,'max':0}},{'playCnt':{'min':4000,'max':0}},{'period':{'min':10,'max':0}},{'fans':{'min':0,'max':0}},{'videos':{'min':0,'max':0}},{'like':{'min':0,'max':0}},{'videoWidth':{'min':0,'max':0}},{'videoHeight':{'min':0,'max':0}}]')," \
    #            "('spider_name','')," \
    #            "('machine','')," \
    #            "('status','0')," \
    #            "('create_time','1681889875288')," \
    #            "('update_time','1681889904908')," \
    #            "('operator','王坤')]"
    # print(task_fun(task_str))
    pass
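
    # A minimal offline demo of task_fun() on a hypothetical task string (no
    # database needed). Double-quoted keys inside the rule value keep the whole
    # thing a valid Python literal after the quote-stripping step in task_fun.
    sample_task_str = ("[('task_id','11'),('mode','hour'),"
                       "('rule','[{\"duration\": {\"min\": 40, \"max\": 0}},"
                       "{\"playCnt\": {\"min\": 4000, \"max\": 0}}]')]")
    print(task_fun(sample_task_str))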