# gongzhonghao_url_author.py

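"""
Crawler for WeChat official accounts (公众号) resolved from article URLs.

For each configured account, this script resolves the account's gh id from an
article link, pulls its article list from an internal GetGh_Doc service,
extracts a playable video URL from each article page with headless Chrome,
and sends new videos to the ETL message queue.
"""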
import json
import os
import random
import re
import sys
import time
import uuid
from datetime import datetime

import requests

sys.path.append(os.getcwd())
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium import webdriver
from common.scheduling_db import MysqlHelper
from common.mq import MQ
from common.common import Common
from common.public import get_config_from_mysql
from common import AliyunLogger


class GongzhonghaoUrlAuthor:
    platform = "公众号"
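
    # Loop over every account in user_list, crawl it via get_videoList(), and
    # spread the daily time budget evenly across accounts by sleeping between them.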
    @classmethod
    def get_all_videos(cls, log_type, crawler, task_dict, rule_dict, user_list, env):
        total_s = 8 * 60 * 60  # each crawler gets 12h per day (8h waiting + 4h crawling)
        wait_average_time = int(total_s / len(user_list))
        for user_dict in user_list:
            Common.logger(log_type, crawler).info(f'抓取公众号:{user_dict["nick_name"]}\n')
            Common.logging(log_type, crawler, env, f'抓取公众号:{user_dict["nick_name"]}\n')
            AliyunLogger.logging(
                code="1003",
                platform=crawler,
                mode=log_type,
                env=env,
                message="开始抓取公众号: {}".format(user_dict["nick_name"]),
            )
            try:
                cls.get_videoList(
                    log_type=log_type,
                    crawler=crawler,
                    task_dict=task_dict,
                    rule_dict=rule_dict,
                    user_dict=user_dict,
                    env=env,
                )
                sleep_time = random.randint(
                    wait_average_time - 120, wait_average_time - 60
                )
                Common.logger(log_type, crawler).info("休眠 {} 秒\n".format(sleep_time))
                Common.logging(log_type, crawler, env, "休眠 {} 秒\n".format(sleep_time))
                time.sleep(sleep_time)
            except Exception as e:
                Common.logger(log_type, crawler).info(
                    f'抓取公众号:{user_dict["nick_name"]}时异常:{e}\n'
                )
                Common.logging(
                    log_type, crawler, env, f'抓取公众号:{user_dict["nick_name"]}时异常:{e}\n'
                )
                AliyunLogger.logging(
                    code="3000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message="抓取公众号: {} 时异常".format(user_dict["nick_name"]),
                )
            AliyunLogger.logging(
                code="1004",
                platform=crawler,
                mode=log_type,
                env=env,
                message="完成抓取公众号: {}".format(user_dict["nick_name"]),
            )
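
    # Open the article page in headless Chrome and extract a playable video URL:
    # either directly from the video poster element, or by reading the Tencent
    # Video vid from the embedded player iframe. Returns 0 if no video is found.
    # Note: get_tencent_video_url() is assumed to be provided elsewhere in the
    # project; it is not defined in this file.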
    @classmethod
    def get_video_url(cls, article_url, env):
        # enable Chrome performance logging for the request
        ca = DesiredCapabilities.CHROME
        ca["goog:loggingPrefs"] = {"performance": "ALL"}
        # run headless, without opening a browser window
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("headless")
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"
        )
        chrome_options.add_argument("--no-sandbox")
        # initialize the driver
        if env == "prod":
            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
        else:
            driver = webdriver.Chrome(
                desired_capabilities=ca,
                options=chrome_options,
                service=Service(
                    "/Users/tzld/Downloads/chromedriver_mac64/chromedriver"
                ),
            )
        driver.implicitly_wait(10)
        driver.get(article_url)
        time.sleep(1)
        if (
            len(
                driver.find_elements(
                    By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]'
                )
            )
            != 0
        ):
            video_url = driver.find_element(
                By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]'
            ).get_attribute("src")
        elif (
            len(
                driver.find_elements(
                    By.XPATH, '//span[@class="js_tx_video_container"]/*[1]'
                )
            )
            != 0
        ):
            iframe = driver.find_element(
                By.XPATH, '//span[@class="js_tx_video_container"]/*[1]'
            ).get_attribute("src")
            video_id = iframe.split("vid=")[-1].split("&")[0]
            video_url = cls.get_tencent_video_url(video_id)
        else:
            video_url = 0
        driver.quit()
        if "mpvideo.qpic.cn" in str(video_url):
            time.sleep(random.randint(1, 3))
        return video_url
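
    # Request a WeChat article page and parse the account's gh id (user_name)
    # out of the inline page script; returns None if the request or regex fails.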
    @classmethod
    def get_wechat_gh(cls, content_link):
        payload = {}
        headers = {
            'authority': 'mp.weixin.qq.com',
            'cookie': 'RK=kLuB01bYUa; ptcz=604f91ae284ed19ddcddda0c052312f03f096ccaa23994b0dc7aac856159a1d9; iip=0; rewardsn=; wxtokenkey=777; pac_uid=1_364544322; pgv_info=ssid=s5424148811; pgv_pvid=8423646400; o_cookie=364544322; wwapp.vid=; wwapp.cst=; wwapp.deviceid=; login_type=wxqrcode; tvfe_boss_uuid=97fe85e41c02f816; ua_id=FS5Q0DLf7QjeurnpAAAAAG3yjawqm2QVreYybCeE-bE=; wxuin=84408959445830; mm_lang=zh_CN; sig=h01d8310f2cf065f1baf641dec377a7cf209b3acd87e6f47e759a8eff53a83be44365d97fd1e013a7d4; uuid=626f86245f04d876538319d2b0ad00a8; xid=69967389815bcec44c878b4ec5f7d0cd; _clck=3891672333|1|fbt|0; qm_authimgs_id=1; qm_verifyimagesession=h014171884333ea321544e60514b652e6ea94b41703bea3db998ba552a68c3bb029407c558b3c658287; qqhxqqcomrouteLine=index; eas_sid=11q6j8S5i49225n2H271a3i9W6; ariaDefaultTheme=undefined; rewardsn=; wxtokenkey=777',
            'referer': 'https://weixin.sogou.com/link?url=dn9a_-gY295K0Rci_xozVXfdMkSQTLW6cwJThYulHEtVjXrGTiVgS-jLzX0QJsZc9LKGBXLDqu6hcy8W9YK4n1qXa8Fplpd9kqb1on3XUORxrmoftjAEj_GbEcfeUOWbw4CoyV3mfI6CnS5wEgRgloC4xjPDiE6GeHrvBBz3sVJJqopuR3-XqA0a-_G6lnkfM41cvBft-VHFr1bNo2EnzytenNSxFGs7t5_16x7SsuyAXBbT1gj0mwfbwdmomkYm6Wv3FtUFWt3zAjcIGepUqA..&type=2&query=%E7%83%AD%E7%82%B9&token=BE1F165D212650B2999C655336D9D740998E8E7A6475BD69&k=43&h=9',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
        }
        try:
            response = requests.request("GET", content_link, headers=headers, data=payload)
            wechat_gh = re.search(r'var user_name = "(.*?)"', response.text).group(1)
            return wechat_gh
        except Exception:
            return None
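
    # Count existing crawler_video rows with the same out_video_id on this
    # platform; a non-zero result means the video was already crawled.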
    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, env):
        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}" ; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(repeat_video)
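
    # Note: user_dict["nick_name"] is assumed to carry an article URL here
    # (this is the "url_author" variant); get_wechat_gh() resolves it to the gh id.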
    # fetch the article list for one account
    @classmethod
    def get_videoList(cls, log_type, crawler, task_dict, rule_dict, user_dict, env):
        mq = MQ(topic_name="topic_crawler_etl_" + env)
        wechat_gh = cls.get_wechat_gh(user_dict["nick_name"])
        if wechat_gh is None:
            Common.logging(
                log_type,
                crawler,
                env,
                f"获取用户主页id为空{task_dict}\n",
            )
            AliyunLogger.logging(
                code="2004",
                platform=crawler,
                mode=log_type,
                env=env,
                message=f"获取用户主页id为空{task_dict}",
            )
            return
        time.sleep(1)
        url = "http://61.48.133.26:30001/GetGh_Doc"
        payload = json.dumps({
            "appid": wechat_gh,
            "decode": "1"
        })
        headers = {
            'Content-Type': 'application/json'
        }
        r = requests.request("POST", url, headers=headers, data=payload)
        if "list" not in r.json():
            Common.logger(log_type, crawler).warning(
                f"status_code:{r.status_code}, get_videoList:{r.text}\n"
            )
            Common.logging(
                log_type,
                crawler,
                env,
                f"status_code:{r.status_code}, get_videoList:{r.text}\n",
            )
            AliyunLogger.logging(
                code="2000",
                platform=crawler,
                mode=log_type,
                env=env,
                message=f"status_code:{r.status_code}, get_videoList:{r.text}\n",
            )
            time.sleep(60 * 15)
            return
        if len(r.json()["list"]) == 0:
            Common.logger(log_type, crawler).info("没有更多视频了\n")
            Common.logging(log_type, crawler, env, "没有更多视频了\n")
            AliyunLogger.logging(
                code="2000",
                platform=crawler,
                mode=log_type,
                env=env,
                message="没有更多视频了\n",
            )
            return
        else:
            user_name = r.json().get("gh_name")
            app_msg_list = r.json()["list"]
            for article in app_msg_list:
                try:
                    trace_id = crawler + str(uuid.uuid1())
                    publish_time_str = article.get("published_time", 0)
                    date_format = "%Y-%m-%d %H:%M:%S"
                    date_time_obj = datetime.strptime(publish_time_str, date_format)
                    publish_time_stamp = int(date_time_obj.timestamp())
                    article_url = article.get("url", "")
                    video_dict = {
                        "video_id": article.get("aid", ""),
                        "video_title": article.get("title", "")
                        .replace(" ", "")
                        .replace('"', "")
                        .replace("'", ""),
                        "publish_time_stamp": publish_time_stamp,
                        "publish_time_str": publish_time_str,
                        "user_name": user_name,
                        "play_cnt": 0,
                        "comment_cnt": 0,
                        "like_cnt": 0,
                        "share_cnt": 0,
                        "user_id": user_dict["uid"],
                        "avatar_url": user_dict["avatar_url"],
                        "cover_url": article.get("head_pic", ""),
                        "article_url": article_url,
                        "video_url": cls.get_video_url(article_url, env),
                        "session": f"gongzhonghao-author1-{int(time.time())}",
                    }
                    for k, v in video_dict.items():
                        Common.logger(log_type, crawler).info(f"{k}:{v}")
                    Common.logging(
                        log_type, crawler, env, f"video_dict:{video_dict}"
                    )
                    AliyunLogger.logging(
                        code="1001",
                        trace_id=trace_id,
                        platform=crawler,
                        mode=log_type,
                        env=env,
                        message="扫描到一条视频",
                        data=video_dict,
                    )
                    if (
                        int(time.time()) - publish_time_stamp
                        > 3600
                        * 24
                        * int(rule_dict.get("period", {}).get("max", 1000))
                    ):
                        Common.logger(log_type, crawler).info(
                            f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n"
                        )
                        Common.logging(
                            log_type,
                            crawler,
                            env,
                            f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n",
                        )
                        AliyunLogger.logging(
                            code="2004",
                            trace_id=trace_id,
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            data=video_dict,
                            message="发布时间超过{}天".format(
                                int(rule_dict.get("period", {}).get("max", 1000))
                            ),
                        )
                        return
                    if (
                        video_dict["article_url"] == 0
                        or video_dict["video_url"] == 0
                    ):
                        Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
                        Common.logging(log_type, crawler, env, "文章涉嫌违反相关法律法规和政策\n")
                        AliyunLogger.logging(
                            code="2005",
                            trace_id=trace_id,
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            data=video_dict,
                            message="无效文章或视频",
                        )
                    # filter the title against the sensitive-word list
                    elif (
                        any(
                            str(word)
                            if str(word) in video_dict["video_title"]
                            else False
                            for word in get_config_from_mysql(
                                log_type=log_type,
                                source=crawler,
                                env=env,
                                text="filter",
                                action="",
                            )
                        )
                        is True
                    ):
                        Common.logger(log_type, crawler).info("标题已中过滤词\n")
                        Common.logging(log_type, crawler, env, "标题已中过滤词\n")
                        AliyunLogger.logging(
                            code="2003",
                            trace_id=trace_id,
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            data=video_dict,
                            message="标题已中过滤词\n",
                        )
                    # skip videos that were already downloaded
                    elif (
                        cls.repeat_video(
                            log_type,
                            crawler,
                            video_dict["video_id"],
                            env,
                        )
                        != 0
                    ):
                        Common.logger(log_type, crawler).info("视频已下载\n")
                        Common.logging(log_type, crawler, env, "视频已下载\n")
                        AliyunLogger.logging(
                            code="2002",
                            trace_id=trace_id,
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            data=video_dict,
                            message="视频已下载",
                        )
                    else:
                        video_dict["out_user_id"] = video_dict["user_id"]
                        video_dict["platform"] = crawler
                        video_dict["strategy"] = log_type
                        video_dict["out_video_id"] = video_dict["video_id"]
                        video_dict["width"] = 0
                        video_dict["height"] = 0
                        video_dict["crawler_rule"] = json.dumps(rule_dict)
                        video_dict["user_id"] = user_dict[
                            "uid"
                        ]  # in-site UID? the crawler can no longer obtain it (published randomly to one of the original 5 accounts)
                        video_dict["publish_time"] = video_dict["publish_time_str"]
                        mq.send_msg(video_dict)
                        AliyunLogger.logging(
                            code="1002",
                            trace_id=trace_id,
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            data=video_dict,
                            message="成功发送 MQ 至 ETL",
                        )
                        time.sleep(random.randint(1, 8))
                except Exception as e:
                    Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                    Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
                    AliyunLogger.logging(
                        code="3000",
                        platform=crawler,
                        mode=log_type,
                        env=env,
                        message=f"抓取单条视频异常:{e}\n",
                    )
                Common.logger(log_type, crawler).info("休眠 60 秒\n")
                Common.logging(log_type, crawler, env, "休眠 60 秒\n")
                time.sleep(60)
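

# Local test entry point: run the crawler for task 54 against the dev environment.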
if __name__ == '__main__':
    log_type = "author"
    crawler = "gongzhonghao"
    env = "dev"
    task_dict = {'createTime': 1688382816512, 'id': 54, 'interval': 200, 'machine': 'aliyun', 'mode': 'author',
                 'operator': '王坤', 'rule': {'period': {'min': 1, 'max': 1}, 'duration': {'min': 20, 'max': 2700}},
                 'source': 'gongzhonghao', 'spiderName': 'run_gzh_author', 'startTime': 1688456874000, 'status': 0,
                 'taskName': '公众号账号', 'updateTime': 1688456876643}
    rule_dict = {"period": {"min": 1, "max": 1}, "duration": {"min": 20, "max": 2700}}
    task_id = 54
    select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
    user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
    GongzhonghaoUrlAuthor.get_all_videos(log_type=log_type,
                                         crawler=crawler,
                                         task_dict=task_dict,
                                         rule_dict=rule_dict,
                                         user_list=user_list,
                                         env=env)