gongzhonghao_author.py

import datetime
import json
import os
import random
import sys
import time
import uuid

import requests
import urllib3
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium import webdriver

sys.path.append(os.getcwd())
from common.mq import MQ
from common.common import Common
from common.feishu import Feishu
from common.scheduling_db import MysqlHelper
from common.public import get_config_from_mysql, download_rule, title_like, task_unbind
from common import AliyunLogger, PiaoQuanPipeline

class GongzhonghaoAuthor:
    platform = "公众号"

    # Fetch this crawler's token/cookie config from the crawler_config table
    @classmethod
    def get_token(cls, log_type, crawler, token_index, env):
        select_sql = f""" select * from crawler_config where source="{crawler}" and title LIKE "%公众号_{token_index}%";"""
        configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
        if len(configs) == 0:
            Feishu.bot(log_type, crawler, f"公众号_{token_index}:未配置token")
            time.sleep(60)
            return None
        token_dict = {
            "token_id": configs[0]["id"],
            "title": configs[0]["title"].strip(),
            # "config" is stored as a dict-literal string, hence the eval()
            "token": dict(eval(configs[0]["config"]))["token"].strip(),
            "cookie": dict(eval(configs[0]["config"]))["cookie"].strip(),
            "update_time": time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(int(configs[0]["update_time"] / 1000)),
            ),
            "operator": configs[0]["operator"].strip(),
        }
        return token_dict
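
    # Assumed config shape (inferred from the eval() calls above, not verified
    # against the live table): the "config" column holds a dict-literal string
    # such as '{"token": "1011071554", "cookie": "appmsglist_action_...=..."}'.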
    # Fetch the user's fakeid via the MP search_biz API
    @classmethod
    def get_user_info(cls, log_type, crawler, task_dict, user_dict, token_index, env):
        Common.logger(log_type, crawler).info(f"获取站外用户信息:{user_dict['link']}")
        Common.logging(log_type, crawler, env, f"获取站外用户信息:{user_dict['link']}")
        AliyunLogger.logging(
            code="1000",
            platform=crawler,
            mode=log_type,
            env=env,
            message=f"获取站外用户信息:{user_dict['link']}",
        )
        while True:
            token_dict = cls.get_token(log_type, crawler, token_index, env)
            url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
            headers = {
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh-CN,zh;q=0.9",
                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
                "t=media/appmsg_edit_v2&action=edit&isNew=1"
                "&type=77&createType=5&token=1011071554&lang=zh_CN",
                "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": '"Windows"',
                "sec-fetch-dest": "empty",
                "sec-fetch-mode": "cors",
                "sec-fetch-site": "same-origin",
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
                "x-requested-with": "XMLHttpRequest",
                "cookie": token_dict["cookie"],
            }
            params = {
                "action": "search_biz",
                "begin": "0",
                "count": "5",
                "query": str(user_dict["link"]),
                "token": token_dict["token"],
                "lang": "zh_CN",
                "f": "json",
                "ajax": "1",
            }
            urllib3.disable_warnings()
            r = requests.get(url=url, headers=headers, params=params, verify=False)
            r.close()
            if r.json()["base_resp"]["err_msg"] == "invalid session":
                Common.logger(log_type, crawler).warning(
                    f"status_code:{r.status_code}, get_fakeid:{r.text}\n"
                )
                Common.logging(
                    log_type,
                    crawler,
                    env,
                    f"status_code:{r.status_code}, get_fakeid:{r.text}\n",
                )
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"status_code:{r.status_code}, get_fakeid:{r.text}\n",
                )
                # Only page the operator during working hours (10:00-20:00)
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(
                        log_type,
                        crawler,
                        f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/",
                    )
                time.sleep(60 * 15)
                continue
            if r.json()["base_resp"]["err_msg"] == "freq control":
                Common.logger(log_type, crawler).warning(
                    f"status_code:{r.status_code}, get_fakeid:{r.text}\n"
                )
                Common.logging(
                    log_type,
                    crawler,
                    env,
                    f"status_code:{r.status_code}, get_fakeid:{r.text}\n",
                )
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"status_code:{r.status_code}, get_fakeid:{r.text}\n",
                )
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(
                        log_type,
                        crawler,
                        f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/",
                    )
                time.sleep(60 * 15)
                continue
            if r.json()["base_resp"]["err_msg"] == "ok" and len(r.json()["list"]) == 0:
                Common.logger(log_type, crawler).warning(
                    f"status_code:{r.status_code}, get_fakeid:{r.text}\n"
                )
                Common.logging(
                    log_type,
                    crawler,
                    env,
                    f"status_code:{r.status_code}, get_fakeid:{r.text}\n",
                )
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"status_code:{r.status_code}, get_fakeid:{r.text}\n",
                )
                # Account not found: unbind the task so it is not retried
                unbind_msg = task_unbind(
                    log_type=log_type,
                    crawler=crawler,
                    taskid=task_dict["id"],
                    uids=str(user_dict["uid"]),
                    env=env,
                )
                if unbind_msg == "success":
                    if 20 >= datetime.datetime.now().hour >= 10:
                        Feishu.bot(
                            log_type,
                            crawler,
                            f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n",
                        )
                    Common.logging(
                        log_type,
                        crawler,
                        env,
                        f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n",
                    )
                    AliyunLogger.logging(
                        code="2000",
                        platform=crawler,
                        mode=log_type,
                        env=env,
                        message=f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n",
                    )
                else:
                    Common.logger(log_type, crawler).warning(f"unbind_msg:{unbind_msg}")
                    Common.logging(log_type, crawler, env, f"unbind_msg:{unbind_msg}")
                    AliyunLogger.logging(
                        code="2000",
                        platform=crawler,
                        mode=log_type,
                        env=env,
                        message=f"unbind_msg: {unbind_msg}",
                    )
                return None
            user_info_dict = {
                "user_name": r.json()["list"][0]["nickname"],
                "user_id": r.json()["list"][0]["fakeid"],
                "avatar_url": r.json()["list"][0]["round_head_img"],
            }
            return user_info_dict
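
    # Note: searchbiz returns the account's "fakeid"; it is the value the
    # appmsg list_ex endpoint in get_videoList expects as its "fakeid" param,
    # so all pagination below depends on this lookup succeeding.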
    # Resolve a downloadable Tencent Video URL for a given vid
    @classmethod
    def get_tencent_video_url(cls, video_id):
        url = "https://h5vv.video.qq.com/getinfo?vid={}&platform=101001&charge=0&otype=json&defn=shd".format(
            video_id
        )
        headers = {
            "Host": "h5vv.video.qq.com",
            "xweb_xhr": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF XWEB/30817",
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "*/*",
            "Sec-Fetch-Site": "cross-site",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Dest": "empty",
            "Referer": "https://servicewechat.com/wx5fcd817f3f80aece/3/page-frame.html",
            "Accept-Language": "en",
        }
        response = requests.get(url, headers=headers)
        # The endpoint returns JSONP ("QZOutputJson={...};"), so strip the
        # wrapper and the trailing semicolon before parsing
        result = json.loads(response.text.replace("QZOutputJson=", "")[:-1])
        vl = result["vl"]["vi"][0]
        key = vl["fvkey"]
        name = vl["fn"]
        folder = vl["ul"]["ui"][0]["url"]
        video_url = folder + name + "?vkey=" + key
        time.sleep(random.randint(1, 5))
        return video_url
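
    # Hypothetical call sketch (the vid below is a placeholder, not a value
    # from this repo): cls.get_tencent_video_url("x12345abcde") returns
    # folder + filename + "?vkey=" + fvkey, assembled exactly as above.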
    @classmethod
    def get_video_url(cls, article_url, env):
        # Capture Chrome performance logs alongside the request
        ca = DesiredCapabilities.CHROME
        ca["goog:loggingPrefs"] = {"performance": "ALL"}
        # Run headless, without opening a browser window
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("headless")
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"
        )
        chrome_options.add_argument("--no-sandbox")
        # Initialize the driver
        if env == "prod":
            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
        else:
            driver = webdriver.Chrome(
                desired_capabilities=ca,
                options=chrome_options,
                service=Service(
                    "/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver"
                ),
            )
        driver.implicitly_wait(10)
        driver.get(article_url)
        time.sleep(1)
        if driver.find_elements(
            By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]'
        ):
            # Native WeChat video: read the src off the poster's second child
            video_url = driver.find_element(
                By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]'
            ).get_attribute("src")
        elif driver.find_elements(
            By.XPATH, '//span[@class="js_tx_video_container"]/*[1]'
        ):
            # Embedded Tencent Video iframe: extract the vid and resolve it
            iframe = driver.find_element(
                By.XPATH, '//span[@class="js_tx_video_container"]/*[1]'
            ).get_attribute("src")
            video_id = iframe.split("vid=")[-1].split("&")[0]
            video_url = cls.get_tencent_video_url(video_id)
        else:
            video_url = 0
        driver.quit()
        if "mpvideo.qpic.cn" in str(video_url):
            time.sleep(random.randint(1, 3))
        return video_url
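
    # The page probe above handles two layouts: a native WeChat player
    # (mpvideo.qpic.cn src) and an embedded Tencent Video iframe (vid= query
    # parameter resolved via get_tencent_video_url); 0 means no playable video.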
    # Fetch the account's article list, page by page
    @classmethod
    def get_videoList(
        cls, log_type, crawler, task_dict, token_index, rule_dict, user_dict, env
    ):
        mq = MQ(topic_name="topic_crawler_etl_" + env)
        user_info_dict = cls.get_user_info(
            log_type=log_type,
            crawler=crawler,
            task_dict=task_dict,
            user_dict=user_dict,
            token_index=token_index,
            env=env,
        )
        if user_info_dict is None:
            return
        user_dict["user_id"] = user_info_dict["user_id"]
        user_dict["user_name"] = user_info_dict["user_name"]
        user_dict["avatar_url"] = user_info_dict["avatar_url"]
        begin = 0
        while True:
            token_dict = cls.get_token(log_type, crawler, token_index, env)
            url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
            headers = {
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh-CN,zh;q=0.9",
                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
                "t=media/appmsg_edit_v2&action=edit&isNew=1"
                "&type=77&createType=5&token="
                + str(token_dict["token"])
                + "&lang=zh_CN",
                "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": '"Windows"',
                "sec-fetch-dest": "empty",
                "sec-fetch-mode": "cors",
                "sec-fetch-site": "same-origin",
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
                "x-requested-with": "XMLHttpRequest",
                "cookie": token_dict["cookie"],
            }
            params = {
                "action": "list_ex",
                "begin": str(begin),
                "count": "5",
                "fakeid": user_dict["user_id"],
                "type": "9",
                "query": "",
                "token": str(token_dict["token"]),
                "lang": "zh_CN",
                "f": "json",
                "ajax": "1",
            }
            urllib3.disable_warnings()
            r = requests.get(url=url, headers=headers, params=params, verify=False)
            r.close()
            if r.json()["base_resp"]["err_msg"] == "invalid session":
                Common.logger(log_type, crawler).warning(
                    f"status_code:{r.status_code}, get_videoList:{r.text}\n"
                )
                Common.logging(
                    log_type,
                    crawler,
                    env,
                    f"status_code:{r.status_code}, get_videoList:{r.text}\n",
                )
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"status_code:{r.status_code}, get_videoList:{r.text}\n",
                )
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(
                        log_type,
                        crawler,
                        f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/",
                    )
                time.sleep(60 * 15)
                continue
            if r.json()["base_resp"]["err_msg"] == "freq control":
                Common.logger(log_type, crawler).warning(
                    f"status_code:{r.status_code}, get_videoList:{r.text}\n"
                )
                Common.logging(
                    log_type,
                    crawler,
                    env,
                    f"status_code:{r.status_code}, get_videoList:{r.text}\n",
                )
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"status_code:{r.status_code}, get_videoList:{r.text}\n",
                )
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(
                        log_type,
                        crawler,
                        f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/",
                    )
                time.sleep(60 * 15)
                continue
            if (
                r.json()["base_resp"]["err_msg"] == "invalid args"
                and r.json()["base_resp"]["ret"] == 200002
            ):
                Common.logger(log_type, crawler).warning(
                    f"status_code:{r.status_code}, get_videoList:{r.text}\n"
                )
                Common.logging(
                    log_type,
                    crawler,
                    env,
                    f"status_code:{r.status_code}, get_videoList:{r.text}\n",
                )
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"status_code:{r.status_code}, get_videoList:{r.text}\n",
                )
                task_unbind(
                    log_type=log_type,
                    crawler=crawler,
                    taskid=task_dict["id"],
                    uids=str(user_dict["uid"]),
                    env=env,
                )
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(
                        log_type,
                        crawler,
                        f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n",
                    )
                return
            if "app_msg_list" not in r.json():
                Common.logger(log_type, crawler).warning(
                    f"status_code:{r.status_code}, get_videoList:{r.text}\n"
                )
                Common.logging(
                    log_type,
                    crawler,
                    env,
                    f"status_code:{r.status_code}, get_videoList:{r.text}\n",
                )
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"status_code:{r.status_code}, get_videoList:{r.text}\n",
                )
                if 20 >= datetime.datetime.now().hour >= 10:
                    Feishu.bot(
                        log_type,
                        crawler,
                        f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/",
                    )
                time.sleep(60 * 15)
                continue
            if len(r.json()["app_msg_list"]) == 0:
                Common.logger(log_type, crawler).info("没有更多视频了\n")
                Common.logging(log_type, crawler, env, "没有更多视频了\n")
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message="没有更多视频了\n",
                )
                return
            else:
                begin += 5
                app_msg_list = r.json()["app_msg_list"]
                for article in app_msg_list:
                    try:
                        trace_id = crawler + str(uuid.uuid1())
                        create_time = article.get("create_time", 0)
                        update_time = article.get("update_time", 0)
                        publish_time_stamp = int(create_time)
                        update_time_stamp = int(update_time)
                        publish_time_str = time.strftime(
                            "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
                        )
                        article_url = article.get("link", "")
                        video_dict = {
                            "video_id": article.get("aid", ""),
                            "video_title": article.get("title", "")
                            .replace(" ", "")
                            .replace('"', "")
                            .replace("'", ""),
                            "publish_time_stamp": publish_time_stamp,
                            "publish_time_str": publish_time_str,
                            "user_name": user_dict["user_name"],
                            "play_cnt": 0,
                            "comment_cnt": 0,
                            "like_cnt": 0,
                            "share_cnt": 0,
                            "user_id": user_dict["user_id"],
                            "avatar_url": user_dict["avatar_url"],
                            "cover_url": article.get("cover", ""),
                            "article_url": article.get("link", ""),
                            "video_url": cls.get_video_url(article_url, env),
                            "session": f"gongzhonghao-author1-{int(time.time())}",
                        }
                        for k, v in video_dict.items():
                            Common.logger(log_type, crawler).info(f"{k}:{v}")
                        Common.logging(
                            log_type, crawler, env, f"video_dict:{video_dict}"
                        )
                        AliyunLogger.logging(
                            code="1001",
                            trace_id=trace_id,
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message="扫描到一条视频",
                            data=video_dict,
                        )
                        # Stop paging once both publish and update times fall
                        # outside the rule's "period" window (in days)
                        if (
                            int(time.time()) - publish_time_stamp
                            > 3600
                            * 24
                            * int(rule_dict.get("period", {}).get("max", 1000))
                        ) and (
                            int(time.time()) - update_time_stamp
                            > 3600
                            * 24
                            * int(rule_dict.get("period", {}).get("max", 1000))
                        ):
                            Common.logger(log_type, crawler).info(
                                f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n"
                            )
                            Common.logging(
                                log_type,
                                crawler,
                                env,
                                f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n",
                            )
                            AliyunLogger.logging(
                                code="2004",
                                trace_id=trace_id,
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                data=video_dict,
                                message="发布时间超过{}天".format(
                                    int(rule_dict.get("period", {}).get("max", 1000))
                                ),
                            )
                            return
                        if (
                            video_dict["article_url"] == 0
                            or video_dict["video_url"] == 0
                        ):
                            Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
                            Common.logging(log_type, crawler, env, "文章涉嫌违反相关法律法规和政策\n")
                            AliyunLogger.logging(
                                code="2005",
                                trace_id=trace_id,
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                data=video_dict,
                                message="无效文章或视频",
                            )
                        # Filter-word check on the title
                        elif any(
                            str(word) in video_dict["video_title"]
                            for word in get_config_from_mysql(
                                log_type=log_type,
                                source=crawler,
                                env=env,
                                text="filter",
                                action="",
                            )
                        ):
                            Common.logger(log_type, crawler).info("标题已中过滤词\n")
                            Common.logging(log_type, crawler, env, "标题已中过滤词\n")
                            AliyunLogger.logging(
                                code="2003",
                                trace_id=trace_id,
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                data=video_dict,
                                message="标题已中过滤词\n",
                            )
                        # Skip if already downloaded
                        elif (
                            cls.repeat_video(
                                log_type,
                                crawler,
                                video_dict["video_id"],
                                video_dict["video_title"],
                                env,
                            )
                            != 0
                        ):
                            Common.logger(log_type, crawler).info("视频已下载\n")
                            Common.logging(log_type, crawler, env, "视频已下载\n")
                            AliyunLogger.logging(
                                code="2002",
                                trace_id=trace_id,
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                data=video_dict,
                                message="视频已下载",
                            )
                        # Title similarity check (disabled)
                        # elif title_like(log_type, crawler, video_dict['video_title'], cls.platform, env) is True:
                        #     Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
                        #     Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
                        else:
                            video_dict["out_user_id"] = video_dict["user_id"]
                            video_dict["platform"] = crawler
                            video_dict["strategy"] = log_type
                            video_dict["out_video_id"] = video_dict["video_id"]
                            video_dict["width"] = 0
                            video_dict["height"] = 0
                            video_dict["crawler_rule"] = json.dumps(rule_dict)
                            # On-site UID: the crawler can no longer fetch it
                            # (published randomly to the original 5 accounts)
                            video_dict["user_id"] = user_dict["uid"]
                            video_dict["publish_time"] = video_dict["publish_time_str"]
                            mq.send_msg(video_dict)
                            AliyunLogger.logging(
                                code="1002",
                                trace_id=trace_id,
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                data=video_dict,
                                message="成功发送 MQ 至 ETL",
                            )
                            time.sleep(random.randint(1, 8))
                    except Exception as e:
                        Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                        Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
                        AliyunLogger.logging(
                            code="3000",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message=f"抓取单条视频异常:{e}\n",
                        )
                Common.logger(log_type, crawler).info("休眠 60 秒\n")
                Common.logging(log_type, crawler, env, "休眠 60 秒\n")
                time.sleep(60)
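
    # Pagination note: "begin" advances by the page size of 5; the loop exits
    # when app_msg_list comes back empty, when an article falls outside the
    # rule's "period" window, or when the task is unbound on "invalid args".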
    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, video_title, env):
        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}" ; """
        # sql = f"""select * from crawler_video where platform = "{crawler}" and (out_video_id="{video_id}" or video_title="{video_title}") ;"""
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(repeat_video)
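
    # A non-zero return means this out_video_id already exists in
    # crawler_video for the platform pair, so get_videoList skips the article.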
    @classmethod
    def get_all_videos(
        cls, log_type, crawler, task_dict, token_index, rule_dict, user_list, env
    ):
        # Each crawler gets 12h per day: 8h of waiting plus 4h of crawling
        total_s = 8 * 60 * 60
        wait_average_time = int(total_s / len(user_list))
        for user_dict in user_list:
            Common.logger(log_type, crawler).info(f'抓取公众号:{user_dict["nick_name"]}\n')
            Common.logging(log_type, crawler, env, f'抓取公众号:{user_dict["nick_name"]}\n')
            AliyunLogger.logging(
                code="1003",
                platform=crawler,
                mode=log_type,
                env=env,
                message="开始抓取公众号: {}".format(user_dict["nick_name"]),
            )
            try:
                cls.get_videoList(
                    log_type=log_type,
                    crawler=crawler,
                    task_dict=task_dict,
                    token_index=token_index,
                    rule_dict=rule_dict,
                    user_dict=user_dict,
                    env=env,
                )
                sleep_time = random.randint(
                    wait_average_time - 120, wait_average_time - 60
                )
                Common.logger(log_type, crawler).info("休眠 {} 秒\n".format(sleep_time))
                Common.logging(log_type, crawler, env, "休眠 {} 秒\n".format(sleep_time))
                time.sleep(sleep_time)
            except Exception as e:
                Common.logger(log_type, crawler).info(
                    f'抓取公众号:{user_dict["nick_name"]}时异常:{e}\n'
                )
                Common.logging(
                    log_type, crawler, env, f'抓取公众号:{user_dict["nick_name"]}时异常:{e}\n'
                )
                AliyunLogger.logging(
                    code="3000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message="抓取公众号: {} 时异常".format(user_dict["nick_name"]),
                )
            AliyunLogger.logging(
                code="1004",
                platform=crawler,
                mode=log_type,
                env=env,
                message="完成抓取公众号: {}".format(user_dict["nick_name"]),
            )


# TODO: check whether this account produced new data within the last 10 days
if __name__ == "__main__":
    pass
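    # A minimal, hypothetical invocation sketch; every value below is a
    # placeholder for illustration, not taken from this repo's scheduler:
    # GongzhonghaoAuthor.get_all_videos(
    #     log_type="author",
    #     crawler="gongzhonghao",
    #     task_dict={"id": 0},
    #     token_index=1,
    #     rule_dict={"period": {"max": 15}},
    #     user_list=[{"uid": 0, "nick_name": "测试", "link": "公众号名称"}],
    #     env="dev",
    # )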