# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import difflib
import json
import os
import random
import sys
import time

import jieba
import requests
from mq_http_sdk.mq_client import *
from mq_http_sdk.mq_exception import MQExceptionBase
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.scheduling_db import MysqlHelper

# from common import Common
# from feishu import Feishu
# from scheduling_db import MysqlHelper


def get_user_from_mysql(log_type, crawler, source, env, action=""):
    sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
    results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
    if results:
        return results
    else:
        Common.logger(log_type, crawler).warning(f"爬虫:{crawler},没有查到抓取名单")
        return []


def similarity(title1, title2):
    # Segment both titles with jieba
    seg1 = jieba.lcut(title1)
    seg2 = jieba.lcut(title2)
    # Build TF-IDF vectors; join the segments with spaces so the vectorizer
    # treats each segmented word as a separate token
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([" ".join(seg1), " ".join(seg2)])
    # Cosine similarity between the two title vectors
    similar = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
    return similar
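
# Example (illustrative sketch; the two titles below are made up):
#   similarity("夕阳下的老街", "老街的夕阳真美")
# returns a float in [0, 1]; values close to 1 mean the two titles share most
# of their jieba-segmented words under TF-IDF weighting.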


def title_like(log_type, crawler, platform, title, env):
    """
    Title similarity check against previously crawled videos.
    :param log_type: log type
    :param crawler: which crawler
    :param platform: crawler channel, e.g. 公众号 / 小年糕
    :param title: video title
    :param env: environment
    :return: True if similarity >= 80%, otherwise False
    """
    select_sql = (
        f""" select video_title from crawler_video where platform="{platform}" """
    )
    video_list = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
    # print(video_list)
    if len(video_list) == 0:
        return False
    for video_dict in video_list:
        video_title = video_dict["video_title"]
        # print(video_title)
        if difflib.SequenceMatcher(None, title, video_title).quick_ratio() >= 0.8:
            return True
        else:
            continue
    return False


def get_config_from_mysql(log_type, source, env, text, action=""):
    select_sql = f"""select * from crawler_config where source="{source}" """
    contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
    title_list = []
    filter_list = []
    emoji_list = []
    search_word_list = []
    for content in contents:
        config = content["config"]
        config_dict = eval(config)
        for k, v in config_dict.items():
            if k == "title":
                title_list_config = v.split(",")
                for title in title_list_config:
                    title_list.append(title)
            if k == "filter":
                filter_list_config = v.split(",")
                for filter_word in filter_list_config:
                    filter_list.append(filter_word)
            if k == "emoji":
                emoji_list_config = v.split(",")
                for emoji in emoji_list_config:
                    emoji_list.append(emoji)
            if k == "search_word":
                search_word_list_config = v.split(",")
                for search_word in search_word_list_config:
                    search_word_list.append(search_word)
    if text == "title":
        return title_list
    elif text == "filter":
        return filter_list
    elif text == "emoji":
        return emoji_list
    elif text == "search_word":
        return search_word_list
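
# Example (hedged sketch; the config row below is hypothetical): a
# crawler_config.config value stored as the dict-literal string
#   '{"title": "岁月,回忆", "filter": "广告,招聘", "emoji": "🌸,👍"}'
# is eval-ed and split on commas, so get_config_from_mysql(..., text="filter")
# would return ["广告", "招聘"].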


def get_rule_from_mysql(task_id, log_type, crawler, env):
    select_rule_sql = f"""select rule from crawler_task_v3 where id={task_id}"""
    rule_list = MysqlHelper.get_values(
        log_type, crawler, select_rule_sql, env, action=""
    )
    return json.loads(rule_list[0]["rule"])


def random_title(log_type, crawler, env, text):
    random_title_list = get_config_from_mysql(log_type, crawler, env, text)
    return random.choice(random_title_list)


def task_fun(task_str):
    task_str = task_str.replace("'[", "[").replace("]'", "]")
    task_dict = dict(eval(task_str))
    rule = task_dict["rule"]
    task_dict["rule"] = dict()
    for item in rule:
        for k, val in item.items():
            task_dict["rule"][k] = val
    rule_dict = task_dict["rule"]
    task_dict = {"task_dict": task_dict, "rule_dict": rule_dict}
    return task_dict


def task_fun_mq(task_str):
    task_str = task_str.replace('"[', "[").replace(']"', "]").replace("\\", "")
    task_dict = dict(eval(task_str))
    rule = task_dict["rule"]
    task_dict["rule"] = dict()
    for item in rule:
        for k, val in item.items():
            task_dict["rule"][k] = val
    rule_dict = task_dict["rule"]
    task_dict = {"task_dict": task_dict, "rule_dict": rule_dict}
    return task_dict
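
# Example (hedged sketch; the task string below is hypothetical): an MQ message
# body such as
#   {"id": 1, "taskName": "demo", "rule": "[{\"duration\": {\"min\": 30, \"max\": 0}}]"}
# has the quoting/escaping around the rule list stripped, is eval-ed into a dict,
# and the list of single-key dicts is flattened, so task_fun_mq returns
#   {"task_dict": {"id": 1, "taskName": "demo", "rule": {"duration": {"min": 30, "max": 0}}},
#    "rule_dict": {"duration": {"min": 30, "max": 0}}}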


def get_consumer(topic_name, group_id):
    # Initialize the client.
    mq_client = MQClient(
        # HTTP endpoint; shown on the instance details page of the ApsaraMQ for RocketMQ console.
        "http://1894469520484605.mqrest.cn-qingdao-public.aliyuncs.com",
        # AccessKey ID used for Alibaba Cloud authentication.
        "LTAI4G7puhXtLyHzHQpD6H7A",
        # AccessKey Secret used for Alibaba Cloud authentication.
        "nEbq3xWNQd1qLpdy2u71qFweHkZjSG",
    )
    # Topic the messages belong to, created in the ApsaraMQ for RocketMQ console.
    # topic_name = "${TOPIC}"
    topic_name = str(topic_name)
    # Group ID created in the ApsaraMQ for RocketMQ console.
    # group_id = "${GROUP_ID}"
    group_id = str(group_id)
    # Instance ID of the topic. Required when the instance has a namespace;
    # pass an empty string when it does not. The namespace is shown on the
    # instance details page of the console.
    instance_id = "MQ_INST_1894469520484605_BXhXuzkZ"
    consumer = mq_client.get_consumer(instance_id, topic_name, group_id)
    return consumer


def ack_message(log_type, crawler, recv_msgs, consumer):
    # If a message is not acknowledged before msg.next_consume_time, it will be delivered again.
    # The receipt handle carries a timestamp, so the same message gets a different handle on every delivery.
    try:
        receipt_handle_list = [msg.receipt_handle for msg in recv_msgs]
        consumer.ack_message(receipt_handle_list)
        Common.logger(log_type, crawler).info(
            f"Ack {len(receipt_handle_list)} Message Succeed.\n"
        )
    except MQExceptionBase as err:
        Common.logger(log_type, crawler).info(f"Ack Message Fail! Exception:{err}\n")
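
# Example (hedged sketch of a consume/ack loop; the topic and group names are
# hypothetical, and consume_message(batch, wait_seconds) / err.type follow the
# Aliyun mq_http_sdk consumer sample rather than anything defined in this file):
#   consumer = get_consumer("topic_crawler_etl_prod", "GID_crawler_etl_prod")
#   while True:
#       try:
#           recv_msgs = consumer.consume_message(1, 30)
#           for msg in recv_msgs:
#               task = task_fun_mq(msg.message_body)["task_dict"]
#               # ... run the crawl task ...
#           ack_message("recommend", "demo_crawler", recv_msgs, consumer)
#       except MQExceptionBase as err:
#           # "MessageNotExist" only means long polling timed out with no message.
#           if err.type == "MessageNotExist":
#               continue
#           raise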


def download_rule(log_type, crawler, video_dict, rule_dict):
    """
    Basic rules for deciding whether a video should be downloaded.
    :param log_type: log type
    :param crawler: which crawler
    :param video_dict: video info, dict
    :param rule_dict: rule info, dict
    :return: True if every rule is satisfied, otherwise False
    """
    # Normalize video_dict: publish_time_stamp -> publish_time (milliseconds)
    if "publish_time_stamp" in video_dict.keys():
        video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
    # Normalize video_dict: period (days since publish)
    if "period" not in video_dict.keys() and "publish_time" in video_dict.keys():
        video_dict["period"] = int(
            (int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000)
        )
    # Normalize rule_dict: a max of 0 means "no upper limit"
    for rule_value in rule_dict.values():
        if rule_value["max"] == 0:
            rule_value["max"] = 999999999999999
    # Fill keys that exist in rule_dict but are missing from video_dict
    for rule_key in rule_dict.keys():
        if rule_key not in video_dict.keys():
            video_dict[rule_key] = int(rule_dict[rule_key]["max"] / 2)
    # Compare and return True / False
    for video_key, video_value in video_dict.items():
        for rule_key, rule_value in rule_dict.items():
            if video_key == rule_key == "period":
                result = 0 <= int(video_value) <= int(rule_value["max"])
                Common.logger(log_type, crawler).info(
                    f'{video_key}: 0 <= {video_value} <= {rule_value["max"]}, {result}'
                )
            elif video_key == rule_key:
                result = (
                    int(rule_value["min"]) <= int(video_value) <= int(rule_value["max"])
                )
                Common.logger(log_type, crawler).info(
                    f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]},{result}'
                )
            else:
                result = True
            if result is False:
                return False
            else:
                continue
    return True


def download_rule_v2(log_type, crawler, video_dict, rule_dict):
    """
    Basic rules for deciding whether a video should be downloaded.
    :param log_type: log type
    :param crawler: which crawler
    :param video_dict: video info, dict
    :param rule_dict: rule info, dict
    :return: True if every rule is satisfied, otherwise False
    """
    # Normalize video_dict: publish_time_stamp -> publish_time (milliseconds)
    if video_dict.get("publish_time_stamp"):
        video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
    # Normalize video_dict: period (days since publish)
    if (
        video_dict.get("publish_time")
        and video_dict.get("period", "noperiod") == "noperiod"
    ):
        video_dict["period"] = int(
            (int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000)
        )
    # Normalize rule_dict: a max of 0 means "no upper limit"
    for key in video_dict:
        if rule_dict.get(key):
            max_value = (
                int(rule_dict[key]["max"])
                if int(rule_dict[key]["max"]) > 0
                else 999999999999999
            )
            if key == "period":
                flag = 0 <= int(video_dict[key]) <= max_value
                Common.logger(log_type, crawler).info(
                    "{}: 0 <= {} <= {}, {}".format(
                        key, video_dict[key], max_value, flag
                    )
                )
                if not flag:
                    return flag
            else:
                flag = int(rule_dict[key]["min"]) <= int(video_dict[key]) <= max_value
                Common.logger(log_type, crawler).info(
                    "{}: {} <= {} <= {}, {}".format(
                        key, rule_dict[key]["min"], video_dict[key], max_value, flag
                    )
                )
                if not flag:
                    return flag
        else:
            continue
    return True
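
# Example (hedged sketch; the dicts below are made up):
#   rule_dict = {"play_cnt": {"min": 500, "max": 0}, "duration": {"min": 30, "max": 600}}
#   video_dict = {"play_cnt": 1200, "duration": 45, "video_title": "..."}
# play_cnt's max of 0 is treated as "no upper limit", video_title has no matching
# rule and is skipped, so both checks pass and
# download_rule_v2("recommend", "demo_crawler", video_dict, rule_dict) returns True.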


def get_word_score(log_type, crawler, score_sheet, word):
    while True:
        score_sheet = Feishu.get_values_batch(log_type, crawler, score_sheet)
        if score_sheet is None:
            time.sleep(1)
            continue
        for i in range(1, len(score_sheet)):
            if word not in [y for x in score_sheet for y in x]:
                return 0
            if word == score_sheet[i][0]:
                word_score = score_sheet[i][8]
                return word_score


def get_title_score(log_type, crawler, stop_sheet, score_sheet, title):
    # Fetch the stop-word list
    # while True:
    stop_word_list = []
    stop_word_sheet = Feishu.get_values_batch(log_type, crawler, stop_sheet)
    if stop_word_sheet is None:
        return 0
        # time.sleep(1)
        # continue
    for x in stop_word_sheet:
        for y in x:
            if y is None:
                pass
            else:
                stop_word_list.append(y)
    # break
    # Segment the title text
    cut_word_list = []
    cut_list = jieba.lcut(title)
    for cut_item in cut_list:
        if cut_item == " ":
            continue
        if cut_item in stop_word_list:
            continue
        cut_word_list.append(cut_item)
    # Look up the weight score of each remaining word
    score_list = []
    for word in cut_word_list:
        word_score = get_word_score(log_type, crawler, score_sheet, word)
        score_list.append(word_score)
    # Total weight score of the title
    title_score = sum(score_list)
    return title_score


def task_unbind(log_type, crawler, taskid, uids, env):
    if env == "dev":
        url = "https://testadmin.piaoquantv.com/manager/crawler/v3/task/unbind"
    else:
        url = "https://admin.piaoquantv.com/manager/crawler/v3/task/unbind"
    params = {
        "taskId": taskid,  # task ID
        "uids": uids,  # uids to unbind, comma-separated, e.g. "3222121,213231"
        "operator": "",  # defaults to system
    }
    Common.logger(log_type, crawler).info(f"url:{url}")
    Common.logging(log_type, crawler, env, f"url:{url}")
    Common.logger(log_type, crawler).info(f"params:{params}")
    Common.logging(log_type, crawler, env, f"params:{params}")
    response = requests.post(url=url, json=params)
    Common.logger(log_type, crawler).info(f"task_unbind_response:{response.text}")
    Common.logging(log_type, crawler, env, f"task_unbind_response:{response.text}")
    if response.status_code == 200 and response.json()["code"] == 0:
        return "success"
    else:
        return response.text


def clean_title(strings):
    return (
        strings.strip()
        .replace("\n", "")
        .replace("/", "")
        .replace("\r", "")
        .replace("#", "")
        .replace(".", "。")
        .replace("\\", "")
        .replace("&NBSP", "")
        .replace(":", "")
        .replace("*", "")
        .replace("?", "")
        .replace("?", "")
        .replace('"', "")
        .replace("<", "")
        .replace(">", "")
        .replace("|", "")
        .replace(" ", "")
        .replace('"', "")
        .replace("'", "")
    )
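
# Example (illustrative; the title below is made up):
#   clean_title(' 春天来了/花开了? #风景 ')
# strips the surrounding whitespace and removes / ? # and the inner spaces,
# returning "春天来了花开了风景".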


if __name__ == "__main__":
    print(get_title_score("recommend", "kuaishou", "16QspO", "0usaDk", "像梦一场"))
    pass