gongzhonghao_author_lock.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/28
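"""
Crawler for WeChat Official Account (gongzhonghao) articles with shared-token
locking: workers check a token out of the `crawler_config` table, mark it
in use (status=1), and release it back (status=0) or flag it unusable
(status=-2) on expiry / frequency control. Video URLs are resolved either
directly from the article page or via the Tencent Video `getinfo` endpoint.
"""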
import datetime
import json
import os
import random
import sys
import time
import requests
import urllib3
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium import webdriver

sys.path.append(os.getcwd())
from common.mq import MQ
from common.common import Common
from common.feishu import Feishu
from common.scheduling_db import MysqlHelper
from common.public import get_config_from_mysql, title_like, task_unbind


class GongzhonghaoAuthor:
    platform = "gongzhonghao"

    @classmethod
    def get_token(cls, log_type, crawler, env):
        while True:
            select_sql = f""" select * from crawler_config where source="gongzhonghao" and status=0;"""
            sql_res_list = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
            token_list = []
            for sql_res in sql_res_list:
                # Keep only rows whose config actually contains a token
                if "token" in sql_res["config"]:
                    token_list.append(sql_res)
            if len(token_list) == 0:
                Common.logger(log_type, crawler).info("暂无可用的token\n")
                Common.logging(log_type, crawler, env, "暂无可用的token\n")
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(log_type, crawler, "暂无可用的token,请更新\n")
                time.sleep(60 * 15)
                continue
            token_info = random.choice(token_list)
            lock_time_stamp = cls.lock_token(log_type, crawler, env, token_info["id"])
            if lock_time_stamp is None:
                # Another worker grabbed this token first; pick again
                continue
            config = dict(eval(token_info["config"]))  # config is stored as a Python-literal string
            token_info_dict = {
                "token_id": token_info["id"],
                "title": token_info["title"].strip(),
                "status": token_info["status"],
                "token": config["token"].strip(),
                "cookie": config["cookie"].strip(),
                "update_time_stamp": lock_time_stamp,
                "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(lock_time_stamp / 1000))),
                "operator": token_info["operator"].strip()
            }
            # for k, v in token_info_dict.items():
            #     print(f"{k}:{type(v)}, {v}")
            return token_info_dict
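
    # Shape of the dict returned by get_token() (values below are purely
    # illustrative, not real credentials):
    # {
    #     "token_id": 3,
    #     "title": "某某公众号",
    #     "status": 0,
    #     "token": "1011071554",
    #     "cookie": "appmsglist_action_...=...",
    #     "update_time_stamp": 1680000000000,
    #     "update_time": "2023-03-28 12:00:00",
    #     "operator": "wangkun",
    # }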

    @classmethod
    def lock_token(cls, log_type, crawler, env, token_id):
        """
        Lock a token: set status=1 and update_time to the current epoch-millisecond timestamp.
        :param log_type: log type
        :param crawler: crawler name
        :param env: environment
        :param token_id: token_id
        :return: the lock timestamp in milliseconds on success, otherwise None
        """
        lock_time_stamp = int(time.time() * 1000)
        lock_sql = f""" update crawler_config set status=1, update_time={lock_time_stamp} WHERE id={token_id} and status=0 ; """
        lock_token = MysqlHelper.update_values(log_type, crawler, lock_sql, env, action="")
        # Common.logger(log_type, crawler).info(f"lock_token:{lock_token}")
        if lock_token == 1:
            return lock_time_stamp
        else:
            return None
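
    # The `and status=0` clause makes the UPDATE above an atomic compare-and-set:
    # the affected-row count is 1 only if this worker flipped the row from
    # available (0) to in-use (1), so concurrent workers can never lock the
    # same token twice.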

    @classmethod
    def release_token(cls, log_type, crawler, env, token_id, status):
        """
        Release a token.
        :param log_type: log type
        :param crawler: crawler name
        :param env: environment
        :param token_id: token_id
        :param status: 0 = available; 1 = in use; -2 = unusable (expired / frequency-controlled)
        :return: None
        """
        release_sql = f""" update crawler_config set status={status}, update_time={int(time.time() * 1000)} WHERE id={token_id} ; """
        MysqlHelper.update_values(log_type, crawler, release_sql, env, action="")

    # Fetch the Tencent Video download URL
    @classmethod
    def get_tencent_video_url(cls, video_id):
        url = 'https://vv.video.qq.com/getinfo?vids=' + str(video_id) + '&platform=101001&charge=0&otype=json'
        # The endpoint wraps its JSON payload as `QZOutputJson={...};`, so strip the wrapper
        response = requests.get(url=url).text.replace('QZOutputJson=', '').replace('"};', '"}')
        response = json.loads(response)
        url = response['vl']['vi'][0]['ul']['ui'][0]['url']
        fvkey = response['vl']['vi'][0]['fvkey']
        video_url = url + str(video_id) + '.mp4?vkey=' + fvkey
        return video_url
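
    # Usage sketch (the vid below is illustrative, not a real video id):
    #   GongzhonghaoAuthor.get_tencent_video_url("u3515kkqa5n")
    #   -> "https://.../u3515kkqa5n.mp4?vkey=..."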

    @classmethod
    def get_video_url(cls, article_url, env):
        # Capture browser performance logs in the request capabilities
        ca = DesiredCapabilities.CHROME
        ca["goog:loggingPrefs"] = {"performance": "ALL"}
        # Run headless (no browser window)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("headless")
        chrome_options.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
        chrome_options.add_argument("--no-sandbox")
        # Initialize the driver
        if env == "prod":
            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
        else:
            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
                '/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver'))
        driver.implicitly_wait(10)
        driver.get(article_url)
        time.sleep(1)
        if len(driver.find_elements(By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]')) != 0:
            video_url = driver.find_element(
                By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]').get_attribute('src')
        elif len(driver.find_elements(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]')) != 0:
            iframe = driver.find_element(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]').get_attribute(
                'src')
            video_id = iframe.split('vid=')[-1].split('&')[0]
            video_url = cls.get_tencent_video_url(video_id)
        else:
            video_url = 0
        driver.quit()
        return video_url
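
    # Two ways a video can appear on an article page: a native <video> element
    # under js_video_poster (its src is the direct URL), or an embedded Tencent
    # Video iframe under js_tx_video_container (resolved via get_tencent_video_url).
    # A return value of 0 means no playable video was found; the caller in
    # get_videoList skips such articles.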

    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, env):
        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(repeat_video)

    @classmethod
    def get_all_videos(cls, log_type, crawler, task_dict, rule_dict, user_list, env):
        for user_dict in user_list:
            Common.logger(log_type, crawler).info(f'抓取公众号:{user_dict["nick_name"]}\n')
            Common.logging(log_type, crawler, env, f'抓取公众号:{user_dict["nick_name"]}\n')
            try:
                cls.get_videoList(log_type=log_type,
                                  crawler=crawler,
                                  task_dict=task_dict,
                                  rule_dict=rule_dict,
                                  user_dict=user_dict,
                                  env=env)
                Common.logger(log_type, crawler).info('休眠 60 秒\n')
                Common.logging(log_type, crawler, env, '休眠 60 秒\n')
                time.sleep(60)
            except Exception as e:
                Common.logger(log_type, crawler).info(f'抓取公众号:{user_dict["nick_name"]}时异常:{e}\n')
                Common.logging(log_type, crawler, env, f'抓取公众号:{user_dict["nick_name"]}时异常:{e}\n')
        # token_dict is the module-level global set inside get_user_info();
        # release the most recently locked token back to the pool
        cls.release_token(log_type, crawler, env, token_dict["token_id"], 0)

    # Fetch the user's fakeid
    @classmethod
    def get_user_info(cls, log_type, crawler, task_dict, user_dict, env):
        Common.logger(log_type, crawler).info(f"获取站外用户信息:{user_dict['link']}")
        Common.logging(log_type, crawler, env, f"获取站外用户信息:{user_dict['link']}")
        while True:
            global token_dict
            token_dict = cls.get_token(log_type, crawler, env)
            Common.logger(log_type, crawler).info(f"get_user_info_token:{token_dict}")
            if int(time.time() * 1000) - token_dict["update_time_stamp"] >= 3600 * 24 * 1000:
                # if int(time.time()*1000) - token_dict["update_time_stamp"] >= 30000:
                Common.logger(log_type, crawler).info(
                    f"{int(time.time() * 1000)}-{token_dict['update_time_stamp']}={(int(time.time() * 1000) - token_dict['update_time_stamp'])}")
                Common.logger(log_type, crawler).info("token使用时长>=24小时,申请释放")
                Common.logging(log_type, crawler, env, "token使用时长>=24小时,申请释放")
                cls.release_token(log_type, crawler, env, token_dict["token_id"], 0)
            url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
            headers = {
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh-CN,zh;q=0.9",
                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
                           "t=media/appmsg_edit_v2&action=edit&isNew=1"
                           "&type=77&createType=5&token=1011071554&lang=zh_CN",
                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": '"Windows"',
                "sec-fetch-dest": "empty",
                "sec-fetch-mode": "cors",
                "sec-fetch-site": "same-origin",
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                              " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
                "x-requested-with": "XMLHttpRequest",
                'cookie': token_dict['cookie'],
            }
            params = {
                "action": "search_biz",
                "begin": "0",
                "count": "5",
                "query": str(user_dict['link']),
                "token": token_dict['token'],
                "lang": "zh_CN",
                "f": "json",
                "ajax": "1",
            }
            urllib3.disable_warnings()
            r = requests.get(url=url, headers=headers, params=params, verify=False)
            r.close()
            if r.json()["base_resp"]["err_msg"] == "invalid session":
                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
                cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(log_type, crawler,
                               f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
                time.sleep(60 * 15)
                continue
            if r.json()["base_resp"]["err_msg"] == "freq control":
                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
                cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(log_type, crawler,
                               f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
                time.sleep(60 * 15)
                continue
            if r.json()["base_resp"]["err_msg"] == "ok" and len(r.json()["list"]) == 0:
                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
                unbind_msg = task_unbind(log_type=log_type, crawler=crawler, taskid=task_dict['id'],
                                         uids=str(user_dict["uid"]), env=env)
                if unbind_msg == "success":
                    if 20 >= datetime.datetime.now().hour >= 10:
                        Feishu.bot(log_type, crawler,
                                   f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n")
                    Common.logging(log_type, crawler, env,
                                   f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n")
                else:
                    Common.logger(log_type, crawler).warning(f"unbind_msg:{unbind_msg}")
                    Common.logging(log_type, crawler, env, f"unbind_msg:{unbind_msg}")
                return None
            user_info_dict = {'user_name': r.json()["list"][0]["nickname"],
                              'user_id': r.json()["list"][0]["fakeid"],
                              'avatar_url': r.json()["list"][0]["round_head_img"]}
            return user_info_dict
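
    # get_user_info() returns e.g. (values illustrative):
    # {"user_name": "<nickname>", "user_id": "Mzxxxxxxxxx...", "avatar_url": "http://mmbiz.qpic.cn/..."}
    # where user_id is the account's `fakeid`, required by the appmsg list API below.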

    # Fetch the article list
    @classmethod
    def get_videoList(cls, log_type, crawler, task_dict, rule_dict, user_dict, env):
        mq = MQ(topic_name="topic_crawler_etl_" + env)
        user_info_dict = cls.get_user_info(log_type=log_type,
                                           crawler=crawler,
                                           task_dict=task_dict,
                                           user_dict=user_dict,
                                           env=env)
        if user_info_dict is None:
            return
        user_dict["user_id"] = user_info_dict["user_id"]
        user_dict["user_name"] = user_info_dict["user_name"]
        user_dict["avatar_url"] = user_info_dict["avatar_url"]
        begin = 0
        while True:
            Common.logger(log_type, crawler).info(f"get_videoList_token:{token_dict}")
            url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
            headers = {
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh-CN,zh;q=0.9",
                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
                           "t=media/appmsg_edit_v2&action=edit&isNew=1"
                           "&type=77&createType=5&token=" + str(token_dict['token']) + "&lang=zh_CN",
                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": '"Windows"',
                "sec-fetch-dest": "empty",
                "sec-fetch-mode": "cors",
                "sec-fetch-site": "same-origin",
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                              " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
                "x-requested-with": "XMLHttpRequest",
                'cookie': token_dict['cookie'],
            }
            params = {
                "action": "list_ex",
                "begin": str(begin),
                "count": "5",
                "fakeid": user_dict['user_id'],
                "type": "9",
                "query": "",
                "token": str(token_dict['token']),
                "lang": "zh_CN",
                "f": "json",
                "ajax": "1",
            }
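
            # Pagination: each page returns `count` (5) articles; `begin` is
            # advanced by 5 after every non-empty page (see `begin += 5` below).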
            urllib3.disable_warnings()
            r = requests.get(url=url, headers=headers, params=params, verify=False)
            r.close()
            if r.json()["base_resp"]["err_msg"] == "invalid session":
                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(log_type, crawler,
                               f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
                time.sleep(60 * 15)
                continue
            if r.json()["base_resp"]["err_msg"] == "freq control":
                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(log_type, crawler,
                               f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
                time.sleep(60 * 15)
                continue
            if r.json()["base_resp"]["err_msg"] == "invalid args" and r.json()["base_resp"]["ret"] == 200002:
                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                task_unbind(log_type=log_type, crawler=crawler, taskid=task_dict['id'], uids=str(user_dict["uid"]),
                            env=env)
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(log_type, crawler,
                               f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n")
                return
            if 'app_msg_list' not in r.json():
                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(log_type, crawler,
                               f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
                time.sleep(60 * 15)
                continue
            if len(r.json()['app_msg_list']) == 0:
                Common.logger(log_type, crawler).info('没有更多视频了\n')
                Common.logging(log_type, crawler, env, '没有更多视频了\n')
                return
            else:
                begin += 5
                app_msg_list = r.json()['app_msg_list']
                for article in app_msg_list:
                    try:
                        create_time = article.get('create_time', 0)
                        publish_time_stamp = int(create_time)
                        publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
                        article_url = article.get('link', '')
                        video_dict = {
                            'video_id': article.get('aid', ''),
                            'video_title': article.get("title", "").replace(' ', '').replace('"', '').replace("'", ""),
                            'publish_time_stamp': publish_time_stamp,
                            'publish_time_str': publish_time_str,
                            'user_name': user_dict["user_name"],
                            'play_cnt': 0,
                            'comment_cnt': 0,
                            'like_cnt': 0,
                            'share_cnt': 0,
                            'user_id': user_dict['user_id'],
                            'avatar_url': user_dict['avatar_url'],
                            'cover_url': article.get('cover', ''),
                            'article_url': article.get('link', ''),
                            'video_url': cls.get_video_url(article_url, env),
                            'session': f'gongzhonghao-author1-{int(time.time())}'
                        }
                        for k, v in video_dict.items():
                            Common.logger(log_type, crawler).info(f"{k}:{v}")
                        Common.logging(log_type, crawler, env, f'video_dict:{video_dict}')
                        if int(time.time()) - publish_time_stamp > 3600 * 24 * int(
                                rule_dict.get('period', {}).get('max', 1000)):
                            Common.logger(log_type, crawler).info(
                                f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
                            Common.logging(log_type, crawler, env,
                                           f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
                            return
                        if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
                            Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
                            Common.logging(log_type, crawler, env, "文章涉嫌违反相关法律法规和政策\n")
                        # Title banned-word filter (the empty-string guard keeps blank
                        # filter words from matching every title)
                        elif any(str(word) and str(word) in video_dict['video_title']
                                 for word in get_config_from_mysql(log_type=log_type,
                                                                   source=crawler,
                                                                   env=env,
                                                                   text="filter",
                                                                   action="")):
                            Common.logger(log_type, crawler).info("标题已中过滤词\n")
                            Common.logging(log_type, crawler, env, "标题已中过滤词\n")
                        # Already-downloaded check
                        elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
                            Common.logger(log_type, crawler).info("视频已下载\n")
                            Common.logging(log_type, crawler, env, "视频已下载\n")
                        # Title similarity
                        elif title_like(log_type, crawler, video_dict['video_title'], cls.platform, env) is True:
                            Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
                            Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
                        else:
                            video_dict["out_user_id"] = video_dict["user_id"]
                            video_dict["platform"] = crawler
                            video_dict["strategy"] = log_type
                            video_dict["out_video_id"] = video_dict["video_id"]
                            video_dict["width"] = 0
                            video_dict["height"] = 0
                            video_dict["crawler_rule"] = json.dumps(rule_dict)
                            video_dict["user_id"] = user_dict["uid"]
                            video_dict["publish_time"] = video_dict["publish_time_str"]
                            mq.send_msg(video_dict)
                    except Exception as e:
                        Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                        Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
                Common.logger(log_type, crawler).info('休眠 60 秒\n')
                Common.logging(log_type, crawler, env, '休眠 60 秒\n')
                time.sleep(60)


if __name__ == "__main__":
    GongzhonghaoAuthor.get_token("author", "gongzhonghao", "dev")
    # print(int(time.time()*1000))
    pass
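
    # Minimal end-to-end sketch (dev env; assumes the common.* helpers and DB
    # credentials are configured — the calls mirror the class's own API):
    # token = GongzhonghaoAuthor.get_token("author", "gongzhonghao", "dev")
    # GongzhonghaoAuthor.release_token("author", "gongzhonghao", "dev", token["token_id"], 0)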