gongzhongxinhao_author.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. import datetime
  2. import json
  3. import os
  4. import random
  5. import sys
  6. import time
  7. import uuid
  8. import requests
  9. from common.feishu import Feishu
  10. sys.path.append(os.getcwd())
  11. from selenium.webdriver import DesiredCapabilities
  12. from selenium.webdriver.chrome.service import Service
  13. from selenium.webdriver.common.by import By
  14. from selenium import webdriver
  15. from common.mq import MQ
  16. from common import AliyunLogger, PiaoQuanPipeline
  17. from common.limit import AuthorLimit
  18. from datetime import datetime
  class GZXHAuthor:
      """Author-mode crawler for "gongzhongxinhao" WeChat official-account articles.

      For the account described by ``user_dict`` the crawler resolves the
      account's ``wx_gh`` id, fetches the account's article list, extracts
      each article's video URL with Chrome, runs the item through
      ``PiaoQuanPipeline`` and sends accepted items to the ETL message queue.
      """

      # Seconds to wait on any HTTP endpoint before giving up, so that a
      # stalled service cannot block the crawl indefinitely.
      REQUEST_TIMEOUT = 60

      # Deletion table for article titles: spaces and both quote characters.
      _TITLE_STRIP_TABLE = str.maketrans("", "", " \"'")

      def __init__(self, platform, mode, rule_dict, user_dict, env):
          """Store the crawl configuration and create the MQ client and limiter.

          Args:
              platform: Platform name used in logs, trace ids and video items.
              mode: Crawl strategy name used in logs and video items.
              rule_dict: Crawl rules handed to ``PiaoQuanPipeline``.
              user_dict: Account description; ``"link"`` and ``"uid"`` are read.
              env: Environment name; selects the MQ topic and the Chrome setup.
          """
          self.platform = platform
          self.mode = mode
          self.rule_dict = rule_dict
          self.user_dict = user_dict
          self.env = env
          self.download_cnt = 0
          self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
          self.limiter = AuthorLimit(platform=self.platform, mode=self.mode)

      def get_account_videos(self):
          """Crawl the configured account and log the start and end of the run.

          Any exception raised by ``get_videoList`` is logged with code 3000
          and swallowed; the closing 1004 log entry is always emitted.
          """
          AliyunLogger.logging(
              code="1003",
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              message="开始抓取公众新号: {}".format(self.user_dict['link']),
          )
          try:
              self.get_videoList()
          except Exception as e:
              AliyunLogger.logging(
                  code="3000",
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  message=f"抓取公众新号: {self.user_dict['link']} 时异常,异常信息: {e}",
              )
          AliyunLogger.logging(
              code="1004",
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              message="抓取公众新号: {}".format(self.user_dict['link']),
          )

      @staticmethod
      def _clean_title(title):
          """Return ``title`` with spaces, double quotes and single quotes removed."""
          return title.translate(GZXHAuthor._TITLE_STRIP_TABLE)

      @staticmethod
      def _extract_vid(iframe_src):
          """Return the value of the ``vid=`` query parameter of an iframe URL."""
          return iframe_src.split("vid=")[-1].split("&")[0]

      @staticmethod
      def _build_tencent_video_url(raw_text):
          """Build the download URL from the text of a ``getinfo`` response.

          The response is expected to look like ``QZOutputJson={...};``. Only
          a real trailing ``;`` is removed, so surrounding whitespace or a
          missing semicolon no longer corrupts the JSON payload.

          Raises:
              ValueError: If the remaining text is not valid JSON.
              KeyError / IndexError: If the payload lacks the expected fields.
          """
          body = raw_text.replace("QZOutputJson=", "").strip()
          if body.endswith(";"):
              body = body[:-1]
          info = json.loads(body)["vl"]["vi"][0]
          key = info["fvkey"]
          name = info["fn"]
          folder = info["ul"]["ui"][0]["url"]
          return folder + name + "?vkey=" + key

      def get_tencent_video_url(self, video_id):
          """Return the download URL of the Tencent Video identified by ``video_id``.

          Sleeps a random 1-5 seconds after the request before returning.
          """
          url = "https://h5vv.video.qq.com/getinfo?vid={}&platform=101001&charge=0&otype=json&defn=shd".format(
              video_id
          )
          headers = {
              "Host": "h5vv.video.qq.com",
              "xweb_xhr": "1",
              "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF XWEB/30817",
              "Content-Type": "application/x-www-form-urlencoded",
              "Accept": "*/*",
              "Sec-Fetch-Site": "cross-site",
              "Sec-Fetch-Mode": "cors",
              "Sec-Fetch-Dest": "empty",
              "Referer": "https://servicewechat.com/wx5fcd817f3f80aece/3/page-frame.html",
              "Accept-Language": "en",
          }
          response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
          video_url = self._build_tencent_video_url(response.text)
          time.sleep(random.randint(1, 5))
          return video_url

      def get_video_url(self, article_url):
          """Open ``article_url`` in Chrome and return the article's video URL.

          Returns:
              The ``src`` attribute of the native player element when present;
              otherwise a Tencent Video download URL when the article embeds a
              Tencent iframe; otherwise ``0``.
          """
          # DesiredCapabilities.CHROME is a shared class-level dict; work on a
          # copy so enabling performance logging does not leak into other users.
          ca = DesiredCapabilities.CHROME.copy()
          ca["goog:loggingPrefs"] = {"performance": "ALL"}

          # Run without opening a browser window.
          chrome_options = webdriver.ChromeOptions()
          chrome_options.add_argument("headless")
          chrome_options.add_argument(
              "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"
          )
          chrome_options.add_argument("--no-sandbox")

          if self.env == "prod":
              driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
          else:
              # Outside prod the capabilities/options are intentionally not
              # passed; only the local chromedriver path is configured.
              driver = webdriver.Chrome(
                  service=Service(
                      "/Users/tzld/Downloads/chromedriver_mac64/chromedriver"
                  ),
              )

          video_url = 0
          tencent_vid = None
          try:
              driver.implicitly_wait(10)
              driver.get(article_url)
              time.sleep(1)
              # Reuse the find_elements result instead of locating the same
              # node a second time with find_element.
              posters = driver.find_elements(
                  By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]'
              )
              if posters:
                  video_url = posters[0].get_attribute("src")
              else:
                  frames = driver.find_elements(
                      By.XPATH, '//span[@class="js_tx_video_container"]/*[1]'
                  )
                  if frames:
                      tencent_vid = self._extract_vid(frames[0].get_attribute("src"))
          finally:
              # Always release the browser, even when the page load or the
              # element lookup raised.
              driver.quit()

          if tencent_vid is not None:
              video_url = self.get_tencent_video_url(tencent_vid)
          if "mpvideo.qpic.cn" in str(video_url):
              time.sleep(random.randint(1, 3))
          return video_url

      def get_wechat_gh(self, link: str):
          """Resolve the account's ``wx_gh`` id from one of its article links.

          Returns:
              The ``wx_gh`` string, or ``None`` when the service does not
              answer with ``code == 0``.
          """
          url = "http://8.217.190.241:8888/crawler/wei_xin/account_info"
          payload = json.dumps({"content_link": link})
          headers = {'Content-Type': 'application/json'}
          response = requests.request(
              "POST", url, headers=headers, data=payload, timeout=self.REQUEST_TIMEOUT
          ).json()
          if response.get('code') == 0:
              return response['data']['data']['wx_gh']
          return None

      # Fetch the article list
      def get_videoList(self):
          """Fetch the account's article list and process each article in turn.

          Stops early when ``process_video_obj`` returns a falsy value. When
          the list service answers without a ``"list"`` key, the response is
          logged and the method sleeps 15 minutes before returning.
          """
          time.sleep(1)
          wechat_gh = self.get_wechat_gh(self.user_dict['link'])
          if wechat_gh is None:
              AliyunLogger.logging(
                  code="2004",
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  message=f"获取用主页为空{self.user_dict['link']}",
              )
              return
          time.sleep(1)

          url = "http://61.48.133.26:30001/GetGh_Doc"
          payload = json.dumps({
              "appid": wechat_gh,
              "decode": "1"
          })
          headers = {
              'Content-Type': 'application/json'
          }
          r = requests.request(
              "POST", url, headers=headers, data=payload, timeout=self.REQUEST_TIMEOUT
          )
          # Decode the body once instead of re-parsing it for every access.
          result = r.json()
          if "list" not in result:
              AliyunLogger.logging(
                  code="2000",
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  message=f"status_code:{r.status_code}, get_videoList:{r.text}\n",
              )
              time.sleep(60 * 15)
              return

          app_msg_list = result["list"]
          if not app_msg_list:
              AliyunLogger.logging(
                  code="2000",
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  message="没有更多视频了\n",
              )
              return

          user_name = result.get("gh_name")
          for article in app_msg_list:
              try:
                  AliyunLogger.logging(
                      code="1001",
                      platform=self.platform,
                      mode=self.mode,
                      message="扫描到一条视频",
                      env=self.env,
                      data=article,
                  )
                  repeat_flag = self.process_video_obj(article, user_name, wechat_gh)
                  if not repeat_flag:
                      return
              except Exception as e:
                  # NOTE(review): the pasted source lost its indentation, so the
                  # trailing ``return`` could not be placed with certainty. It is
                  # read here as the end of the method, i.e. a failing article
                  # is logged and the remaining articles are still processed.
                  AliyunLogger.logging(
                      code="3000",
                      platform=self.platform,
                      mode=self.mode,
                      env=self.env,
                      message=f"抓取单条视频异常:{e}\n",
                  )
          return

      def process_video_obj(self, article, user_name, wechat_gh):
          """Build a video item from ``article``, filter it and send it to ETL.

          Args:
              article: One entry of the article list; ``published_time``,
                  ``url``, ``head_pic`` and ``title`` are read.
              user_name: Account display name stored in the item.
              wechat_gh: Account ``wx_gh`` id, used as the video id prefix.

          Returns:
              ``False`` when ``pipeline.publish_time_flag()`` rejects the item
              (the caller then stops scanning), ``True`` in every other case.

          Raises:
              ValueError: If ``published_time`` is missing or not in
                  ``%Y-%m-%d %H:%M:%S`` format.
          """
          trace_id = self.platform + str(uuid.uuid1())
          publish_time_str = article.get("published_time")
          if not publish_time_str:
              # Fail with a clear message instead of the TypeError strptime
              # would raise for a non-string placeholder.
              raise ValueError("article has no published_time")
          date_time_obj = datetime.strptime(publish_time_str, "%Y-%m-%d %H:%M:%S")
          publish_time_stamp = int(date_time_obj.timestamp())
          article_url = article.get("url", "")
          video_id = wechat_gh + str(publish_time_stamp)
          cover_url = article.get("head_pic", "")
          video_title = self._clean_title(article.get("title", ""))
          video_url = self.get_video_url(article_url)
          video_dict = {
              "user_name": user_name,
              "video_id": video_id,
              "video_title": video_title,
              "out_video_id": video_id,
              "publish_time_stamp": publish_time_stamp,
              "update_time_stamp": 0,
              "publish_time_str": publish_time_str,
              "play_cnt": 0,
              "comment_cnt": 0,
              "like_cnt": 0,
              "share_cnt": 0,
              "user_id": self.user_dict["uid"],
              "cover_url": cover_url,
              "video_url": video_url,
              "width": 0,
              "height": 0,
              "duration": 0,
              "platform": self.platform,
              "strategy": self.mode,
              "crawler_rule": self.rule_dict,
              "session": f"gongzhongxinhao-author-{int(time.time())}",
          }
          AliyunLogger.logging(
              code="1001",
              trace_id=trace_id,
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              message="扫描到一条视频",
              data=video_dict,
          )
          pipeline = PiaoQuanPipeline(
              platform=self.platform,
              mode=self.mode,
              item=video_dict,
              rule_dict=self.rule_dict,
              env=self.env,
              trace_id=trace_id,
          )
          if not pipeline.publish_time_flag():
              return False
          if not pipeline.repeat_video():
              return True
          if not pipeline.download_rule_flag():
              return True
          if not pipeline.title_flag():
              return True

          # Record the accepted item as a new row 2 of the Feishu sheet.
          timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
          values = [[
              user_name,
              video_id,
              video_title,
              publish_time_str,
              timestamp,
              video_url,
              cover_url,
              self.user_dict['link']
          ]]
          Feishu.insert_columns('gongzhonghao', 'gongzhonghao', "9QU7wE", "ROWS", 1, 2)
          time.sleep(0.5)
          Feishu.update_values('gongzhonghao', 'gongzhonghao', "9QU7wE", "A2:Z2", values)

          video_dict["publish_time"] = video_dict["publish_time_str"]
          limit_flag = self.limiter.author_limitation(user_id=video_dict['user_id'])
          if limit_flag:
              self.mq.send_msg(video_dict)
              self.download_cnt += 1
              AliyunLogger.logging(
                  code="1002",
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  data=video_dict,
                  trace_id=trace_id,
                  message="成功发送 MQ 至 ETL",
              )
              time.sleep(5)
          # NOTE(review): indentation was lost in the pasted source; the final
          # ``return True`` is read as function-level, so a reached author
          # limit does not stop the caller's scan. Confirm against the repo.
          return True
  if __name__ == "__main__":
      # Manual run against the dev environment using a sample account link.
      sample_user = {
          "uid": "123456",
          "link": "https://mp.weixin.qq.com/s/Cwc2D3RUNDk30zv_s0IxTA",
          "user_id": "1234565",
      }
      sample_rule = {"period": {"min": 1, "max": 1}}
      crawler = GZXHAuthor(
          platform="gongzhonghao",
          mode="author",
          user_dict=sample_user,
          rule_dict=sample_rule,
          env="dev",
      )
      crawler.get_account_videos()