public.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import os
import sys
import time
import random
import difflib

from mq_http_sdk.mq_client import *
# from mq_http_sdk.mq_consumer import *
from mq_http_sdk.mq_exception import MQExceptionBase

sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper
# from common import Common
# from scheduling_db import MysqlHelper


def get_user_from_mysql(log_type, crawler, source, env, action=''):
    sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
    results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
    if results:
        return results
    else:
        Common.logger(log_type, crawler).warning(f"Crawler: {crawler}, no user list to crawl was found")
        return []


def title_like(log_type, crawler, platform, title, env):
    """
    Title similarity check.
    :param log_type: log type
    :param crawler: which crawler
    :param platform: crawl channel, e.g. 公众号 / 小年糕
    :param title: video title
    :param env: environment
    :return: True if similarity >= 80%, otherwise False
    """
    select_sql = f""" select video_title from crawler_video where platform="{platform}" """
    video_list = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
    if len(video_list) == 0:
        return False
    for video_dict in video_list:
        video_title = video_dict["video_title"]
        # quick_ratio() is a cheap upper bound on SequenceMatcher.ratio()
        if difflib.SequenceMatcher(None, title, video_title).quick_ratio() >= 0.8:
            return True
    return False
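

# A minimal standalone sketch of the same similarity check as title_like,
# without the DB round-trip (the threshold matches the code above; the
# titles are made up for illustration):
#
#     import difflib
#     existing_titles = ["广场舞教学,一步一步教你跳", "养生小知识分享"]
#     candidate = "广场舞教学,一步步教你跳"
#     is_duplicate = any(
#         difflib.SequenceMatcher(None, candidate, t).quick_ratio() >= 0.8
#         for t in existing_titles
#     )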


def get_config_from_mysql(log_type, source, env, text, action=''):
    select_sql = f"""select * from crawler_config where source="{source}" """
    contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
    title_list = []
    filter_list = []
    emoji_list = []
    search_word_list = []
    for content in contents:
        # config is stored as a Python-literal dict string; eval() here
        # trusts the database content
        config_dict = eval(content['config'])
        for k, v in config_dict.items():
            if k == "title":
                title_list.extend(v.split(","))
            elif k == "filter":
                filter_list.extend(v.split(","))
            elif k == "emoji":
                emoji_list.extend(v.split(","))
            elif k == "search_word":
                search_word_list.extend(v.split(","))
    if text == "title":
        return title_list
    elif text == "filter":
        return filter_list
    elif text == "emoji":
        return emoji_list
    elif text == "search_word":
        return search_word_list
    # Unknown text -> empty list instead of an implicit None
    return []
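

# For reference, a config row that the parser above can handle might store
# something like this (the values are made up for illustration):
#
#     config = "{'title': '生活,快乐', 'filter': '广告,测试', 'search_word': '养生,舞蹈'}"
#     # get_config_from_mysql('test', 'gongzhonghao', 'prod', 'filter')
#     # would then return ['广告', '测试']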


def random_title(log_type, crawler, env, text):
    random_title_list = get_config_from_mysql(log_type, crawler, env, text)
    return random.choice(random_title_list)


def task_fun(task_str):
    # Parse a task string into {"task_dict": ..., "rule_dict": ...}.
    # The rule field arrives as a quoted list of single-key dicts and is
    # flattened into one dict here.
    task_str = task_str.replace("'[", '[').replace("]'", ']')
    task_dict = dict(eval(task_str))
    rule = task_dict['rule']
    task_dict['rule'] = dict()
    for item in rule:
        for k, val in item.items():
            task_dict['rule'][k] = val
    rule_dict = task_dict['rule']
    task_dict = {
        "task_dict": task_dict,
        "rule_dict": rule_dict
    }
    return task_dict


def task_fun_mq(task_str):
    # Same as task_fun, but for task strings delivered via MQ, where the
    # rule list is double-quoted and backslash-escaped.
    task_str = task_str.replace('"[', '[').replace(']"', ']').replace('\\', '')
    task_dict = dict(eval(task_str))
    rule = task_dict['rule']
    task_dict['rule'] = dict()
    for item in rule:
        for k, val in item.items():
            task_dict['rule'][k] = val
    rule_dict = task_dict['rule']
    task_dict = {
        "task_dict": task_dict,
        "rule_dict": rule_dict
    }
    return task_dict
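

# For reference, both parsers take a stringified list of key/value tuples.
# A trimmed-down task_fun example (values borrowed from the commented-out
# task_str in the __main__ block below):
#
#     task_str = "[('task_id','11'),('source','xiaoniangao')," \
#                "('rule','[{'duration':{'min':40,'max':0}}]')]"
#     result = task_fun(task_str)
#     # result == {
#     #     "task_dict": {"task_id": "11", "source": "xiaoniangao",
#     #                   "rule": {"duration": {"min": 40, "max": 0}}},
#     #     "rule_dict": {"duration": {"min": 40, "max": 0}},
#     # }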


def get_consumer(topic_name, group_id):
    # Initialize the client.
    mq_client = MQClient(
        # HTTP endpoint, shown on the instance details page of the
        # ApsaraMQ for RocketMQ console.
        "http://1894469520484605.mqrest.cn-qingdao-public.aliyuncs.com",
        # AccessKey ID, the Alibaba Cloud credential identifier.
        "LTAI4G7puhXtLyHzHQpD6H7A",
        # AccessKey Secret, the Alibaba Cloud credential key.
        "nEbq3xWNQd1qLpdy2u71qFweHkZjSG"
    )
    # Topic the messages belong to, created in the ApsaraMQ for RocketMQ console.
    topic_name = str(topic_name)
    # Group ID created in the ApsaraMQ for RocketMQ console.
    group_id = str(group_id)
    # Instance ID of the topic. Required if the instance has a namespace;
    # pass an empty string if it does not. The namespace is shown on the
    # instance details page of the console.
    instance_id = "MQ_INST_1894469520484605_BXhXuzkZ"
    consumer = mq_client.get_consumer(instance_id, topic_name, group_id)
    return consumer


def ack_message(log_type, crawler, recv_msgs, consumer):
    # If a message is not acknowledged before msg.next_consume_time, it will
    # be delivered and consumed again. The receipt handle carries a timestamp,
    # so the same message gets a different handle on every delivery.
    try:
        receipt_handle_list = [msg.receipt_handle for msg in recv_msgs]
        consumer.ack_message(receipt_handle_list)
        Common.logger(log_type, crawler).info(f"Ack {len(receipt_handle_list)} Message Succeed.\n")
    except MQExceptionBase as err:
        Common.logger(log_type, crawler).info(f"Ack Message Fail! Exception:{err}\n")
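

# A minimal consume-loop sketch tying the two helpers above together. The
# topic/group names are placeholders, and the batch size / long-poll seconds
# (3, 3) are illustrative values, not settings from this repo:
#
#     consumer = get_consumer("some_topic", "GID_some_group")
#     while True:
#         try:
#             recv_msgs = consumer.consume_message(3, 3)  # batch size, wait seconds
#             for msg in recv_msgs:
#                 print(msg.message_body)
#             ack_message("recommend", "some_crawler", recv_msgs, consumer)
#         except MQExceptionBase as e:
#             # "MessageNotExist" just means the long poll timed out empty
#             if e.type == "MessageNotExist":
#                 continue
#             raise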


def download_rule(log_type, crawler, video_dict, rule_dict):
    """
    Basic rules for deciding whether to download a video.
    :param log_type: log type
    :param crawler: which crawler
    :param video_dict: video info, as a dict
    :param rule_dict: rule info, as a dict
    :return: True if the rules are satisfied, otherwise False
    """
    # Normalize video_dict: publish_time_stamp (seconds) -> publish_time (milliseconds)
    if "publish_time_stamp" in video_dict.keys():
        video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
    # Normalize video_dict: derive period (days since publish) if missing
    if "period" not in video_dict.keys() and "publish_time" in video_dict.keys():
        video_dict["period"] = int((int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000))
    # Normalize rule_dict: a max of 0 means "no upper bound"
    for rule_value in rule_dict.values():
        if rule_value["max"] == 0:
            rule_value["max"] = 999999999999999
    # Normalize keys present in rule_dict but missing from video_dict
    for rule_key in rule_dict.keys():
        if rule_key not in video_dict.keys():
            video_dict[rule_key] = int(rule_dict[rule_key]["max"] / 2)
    # Compare and return True / False
    for video_key, video_value in video_dict.items():
        for rule_key, rule_value in rule_dict.items():
            if video_key == rule_key == "period":
                result = 0 <= int(video_value) <= int(rule_value["max"])
                Common.logger(log_type, crawler).info(f'{video_key}: 0 <= {video_value} <= {rule_value["max"]}, {result}')
            elif video_key == rule_key:
                result = int(rule_value["min"]) <= int(video_value) <= int(rule_value["max"])
                Common.logger(log_type, crawler).info(f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]}, {result}')
            else:
                result = True
            if result is False:
                return False
    return True
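

# A hedged usage example for download_rule (key names follow the rule format
# in the commented-out task_str below; the numbers are made up):
#
#     video_dict = {"duration": 60, "playCnt": 5000, "publish_time_stamp": int(time.time()) - 86400}
#     rule_dict = {"duration": {"min": 40, "max": 0}, "playCnt": {"min": 4000, "max": 0}}
#     ok = download_rule("recommend", "xiaoniangao", video_dict, rule_dict)
#     # ok is True: 40 <= 60 and 4000 <= 5000; max == 0 means "no upper bound"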


if __name__ == "__main__":
    # print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
    # print(get_config_from_mysql('test', 'gongzhonghao', 'prod', 'filter'))
    # print(filter_word('test', 'gongzhonghao', '公众号', 'prod'))
    # task_str = "[('task_id','11')," \
    #            "('task_name','小年糕小时榜')," \
    #            "('source','xiaoniangao')," \
    #            "('start_time','1681834560000')," \
    #            "('interval','1'),('mode','hour')," \
    #            "('rule','[{'duration':{'min':40,'max':0}},{'playCnt':{'min':4000,'max':0}},{'period':{'min':10,'max':0}},{'fans':{'min':0,'max':0}},{'videos':{'min':0,'max':0}},{'like':{'min':0,'max':0}},{'videoWidth':{'min':0,'max':0}},{'videoHeight':{'min':0,'max':0}}]')," \
    #            "('spider_name','')," \
    #            "('machine','')," \
    #            "('status','0')," \
    #            "('create_time','1681889875288')," \
    #            "('update_time','1681889904908')," \
    #            "('operator','王坤')]"
    # print(task_fun(task_str))
    pass