zhufuquanzi.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. # -*- coding: utf-8 -*-
  2. # @Time: 2023/12/20
  3. import json
  4. import multiprocessing
  5. import os
  6. import random
  7. import re
  8. import sys
  9. import time
  10. import uuid
  11. from urllib.parse import urlparse
  12. from hashlib import md5
  13. from appium import webdriver
  14. from appium.webdriver.extensions.android.nativekey import AndroidKey
  15. from bs4 import BeautifulSoup
  16. from selenium.common.exceptions import NoSuchElementException
  17. from selenium.webdriver.common.by import By
  18. from application.functions import get_redirect_url
  19. from application.common.log import AliyunLogger, Local
  20. sys.path.append(os.getcwd())
  21. from application.pipeline import PiaoQuanPipeline
  22. class ZhuFuQuanZiRecommend(object):
  23. def __init__(self, log_type, crawler, env, rule_dict, our_uid):
  24. self.mq = None
  25. self.platform = "zhufuquanzi"
  26. self.download_cnt = 0
  27. self.element_list = []
  28. self.count = 0
  29. self.swipe_count = 0
  30. self.log_type = log_type
  31. self.crawler = crawler
  32. self.env = env
  33. self.rule_dict = rule_dict
  34. self.our_uid = our_uid
  35. chromedriverExecutable = "/usr/bin/chromedriver"
  36. self.aliyun_log = AliyunLogger(platform=crawler, mode=log_type, env=env)
  37. Local.logger(self.log_type, self.crawler).info("启动微信")
  38. # 微信的配置文件
  39. caps = {
  40. "platformName": "Android",
  41. "devicesName": "Android",
  42. "appPackage": "com.tencent.mm",
  43. "appActivity": ".ui.LauncherUI",
  44. "autoGrantPermissions": "true",
  45. "noReset": True,
  46. "resetkeyboard": True,
  47. "unicodekeyboard": True,
  48. "showChromedriverLog": True,
  49. "printPageSourceOnFailure": True,
  50. "recreateChromeDriverSessions": True,
  51. "enableWebviewDetailsCollection": True,
  52. "setWebContentsDebuggingEnabled": True,
  53. "newCommandTimeout": 6000,
  54. "automationName": "UiAutomator2",
  55. "chromedriverExecutable": chromedriverExecutable,
  56. "chromeOptions": {"androidProcess": "com.tencent.mm:appbrand0"},
  57. }
  58. try:
  59. self.driver = webdriver.Remote("http://localhost:4750/wd/hub", caps)
  60. except Exception as e:
  61. self.aliyun_log.logging(
  62. code="3002",
  63. message=f'appium 启动异常: {e}'
  64. )
  65. return
  66. self.driver.implicitly_wait(30)
  67. for i in range(5):
  68. try:
  69. if self.driver.find_elements(By.ID, "com.tencent.mm:id/f2s"):
  70. print("启动微信成功")
  71. break
  72. elif self.driver.find_element(
  73. By.ID, "com.android.systemui:id/dismiss_view"
  74. ):
  75. print("发现并关闭系统下拉菜单")
  76. size = self.driver.get_window_size()
  77. self.driver.swipe(
  78. int(size["width"] * 0.5),
  79. int(size["height"] * 0.8),
  80. int(size["width"] * 0.5),
  81. int(size["height"] * 0.2),
  82. 200,
  83. )
  84. else:
  85. pass
  86. except Exception as e:
  87. self.aliyun_log.logging(
  88. code="3002",
  89. message=f'打开微信异常: {e}'
  90. )
  91. time.sleep(1)
  92. Local.logger(self.log_type, self.crawler).info("下滑,展示小程序选择面板")
  93. size = self.driver.get_window_size()
  94. self.driver.swipe(
  95. int(size["width"] * 0.5),
  96. int(size["height"] * 0.2),
  97. int(size["width"] * 0.5),
  98. int(size["height"] * 0.8),
  99. 200,
  100. )
  101. time.sleep(1)
  102. self.driver.find_elements(By.XPATH, '//*[@text="祝福圈子"]')[-1].click()
  103. self.aliyun_log.logging(
  104. code="1000",
  105. message="打开小程序祝福圈子成功"
  106. )
  107. time.sleep(5)
  108. self.get_videoList()
  109. time.sleep(1)
  110. self.driver.quit()
  111. def get_videoList(self):
  112. self.driver.implicitly_wait(20)
  113. # 切换到 web_view
  114. self.check_to_applet(xpath='//*[@class="tags--tag tags--tag-0 tags--checked"]')
  115. print("切换到 webview 成功")
  116. time.sleep(1)
  117. name = ["推荐", "搞笑", "大雪", "亲子"]
  118. selected_text = random.choice(name)
  119. try:
  120. self.driver.find_element(By.XPATH, f"//wx-button[contains(., '{selected_text}')]").click()
  121. time.sleep(2)
  122. except NoSuchElementException:
  123. pass
  124. if self.search_elements('//*[@class="bless--list"]') is None:
  125. print("窗口已销毁")
  126. self.count = 0
  127. self.download_cnt = 0
  128. self.element_list = []
  129. return
  130. print("开始获取视频信息")
  131. for i in range(50):
  132. print("下滑{}次".format(i))
  133. element = self.parse_detail(i)
  134. self.get_video_info(element)
  135. self.swipe_up()
  136. time.sleep(1)
  137. if self.swipe_count > 100:
  138. return
  139. print("已抓取完一组,休眠 5 秒\n")
  140. time.sleep(5)
  141. def parse_detail(self, index):
  142. page_source = self.driver.page_source
  143. soup = BeautifulSoup(page_source, "html.parser")
  144. soup.prettify()
  145. video_list = soup.findAll(
  146. name="wx-view", attrs={"class": "expose--adapt-parent"}
  147. )
  148. index = index + 1
  149. element_list = [i for i in video_list][index:]
  150. return element_list[0]
  151. def get_video_info(self, video_element):
  152. try:
  153. self.get_video_info_2(video_element)
  154. except Exception as e:
  155. self.driver.press_keycode(AndroidKey.BACK)
  156. Local.logger(self.log_type, self.crawler).error(f"抓取单条视频异常:{e}\n")
  157. self.aliyun_log.logging(
  158. code="3001",
  159. message=f"抓取单条视频异常:{e}\n"
  160. )
  161. def swipe_up(self):
  162. self.search_elements('//*[@class="bless--list"]')
  163. size = self.driver.get_window_size()
  164. self.driver.swipe(
  165. int(size["width"] * 0.5),
  166. int(size["height"] * 0.8),
  167. int(size["width"] * 0.5),
  168. int(size["height"] * 0.442),
  169. 200,
  170. )
  171. self.swipe_count += 1
  172. def search_elements(self, xpath):
  173. time.sleep(1)
  174. windowHandles = self.driver.window_handles
  175. for handle in windowHandles:
  176. self.driver.switch_to.window(handle)
  177. time.sleep(1)
  178. try:
  179. elements = self.driver.find_elements(By.XPATH, xpath)
  180. if elements:
  181. return elements
  182. except NoSuchElementException:
  183. pass
  184. def check_to_applet(self, xpath):
  185. time.sleep(1)
  186. webViews = self.driver.contexts
  187. self.driver.switch_to.context(webViews[-1])
  188. windowHandles = self.driver.window_handles
  189. for handle in windowHandles:
  190. self.driver.switch_to.window(handle)
  191. time.sleep(1)
  192. try:
  193. self.driver.find_element(By.XPATH, xpath)
  194. Local.logger(self.log_type, self.crawler).info("切换到WebView成功\n")
  195. self.aliyun_log.logging(
  196. code="1000",
  197. message="成功切换到 webview"
  198. )
  199. return
  200. except NoSuchElementException:
  201. time.sleep(1)
  202. def get_video_url(self, video_title_element):
  203. for i in range(3):
  204. self.search_elements('//*[@class="bless--list"]')
  205. time.sleep(1)
  206. self.driver.execute_script(
  207. "arguments[0].scrollIntoView({block:'center',inline:'center'});",
  208. video_title_element[0],
  209. )
  210. time.sleep(3)
  211. video_title_element[0].click()
  212. time.sleep(5)
  213. video_url_elements = self.search_elements(
  214. '//*[@class="index--video-item index--video"]'
  215. )
  216. return video_url_elements[0].get_attribute("src")
  217. def get_video_info_2(self, video_element):
  218. Local.logger(self.log_type, self.crawler).info(f"本轮已抓取{self.download_cnt}条视频\n")
  219. if self.download_cnt >= int(
  220. self.rule_dict.get("videos_cnt", {}).get("min", 10)
  221. ):
  222. self.count = 0
  223. self.download_cnt = 0
  224. self.element_list = []
  225. return
  226. self.count += 1
  227. Local.logger(self.log_type, self.crawler).info(f"第{self.count}条视频")
  228. # 获取 trace_id, 并且把该 id 当做视频生命周期唯一索引
  229. trace_id = self.crawler + str(uuid.uuid1())
  230. self.aliyun_log.logging(
  231. code="1001",
  232. trace_id=trace_id,
  233. message="扫描到一条视频",
  234. )
  235. video_title = video_element.find("wx-view", class_="dynamic--title").text
  236. play_str = video_element.find("wx-view", class_="dynamic--views").text
  237. like_str = video_element.findAll("wx-view", class_="dynamic--commerce-btn-text")[0].text
  238. comment_str = video_element.findAll("wx-view", class_="dynamic--commerce-btn-text")[1].text
  239. duration_str = video_element.find("wx-view", class_="dynamic--duration").text
  240. user_name = video_element.find("wx-view", class_="dynamic--nick-top").text
  241. avatar_url = video_element.find("wx-image", class_="avatar--avatar")["src"]
  242. cover_url = video_element.find("wx-image", class_="dynamic--bg-image")["src"]
  243. play_cnt = int(play_str.replace("+", "").replace("次播放", ""))
  244. duration = int(duration_str.split(":")[0].strip()) * 60 + int(duration_str.split(":")[-1].strip())
  245. if "点赞" in like_str:
  246. like_cnt = 0
  247. elif "万" in like_str:
  248. like_cnt = int(like_str.split("万")[0]) * 10000
  249. else:
  250. like_cnt = int(like_str)
  251. if "评论" in comment_str:
  252. comment_cnt = 0
  253. elif "万" in comment_str:
  254. comment_cnt = int(comment_str.split("万")[0]) * 10000
  255. else:
  256. comment_cnt = int(comment_str)
  257. out_video_id = md5(video_title.encode('utf8')).hexdigest()
  258. out_user_id = md5(user_name.encode('utf8')).hexdigest()
  259. video_dict = {
  260. "video_title": video_title,
  261. "video_id": out_video_id,
  262. 'out_video_id': out_video_id,
  263. "duration_str": duration_str,
  264. "duration": duration,
  265. "play_str": play_str,
  266. "play_cnt": play_cnt,
  267. "like_str": like_str,
  268. "like_cnt": like_cnt,
  269. "comment_cnt": comment_cnt,
  270. "share_cnt": 0,
  271. "user_name": user_name,
  272. "user_id": out_user_id,
  273. 'publish_time_stamp': int(time.time()),
  274. 'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))),
  275. 'update_time_stamp': int(time.time()),
  276. "avatar_url": avatar_url,
  277. "cover_url": cover_url,
  278. "session": f"zhufuquanzi-{int(time.time())}"
  279. }
  280. pipeline = PiaoQuanPipeline(
  281. platform=self.platform,
  282. mode=self.log_type,
  283. rule_dict=self.rule_dict,
  284. env=self.env,
  285. item=video_dict,
  286. trace_id=trace_id,
  287. )
  288. flag = pipeline.process_item()
  289. if flag:
  290. video_title_element = self.search_elements(f'//*[contains(text(), "{video_title}")]')
  291. if video_title_element is None:
  292. return
  293. Local.logger(self.log_type, self.crawler).info("点击标题,进入视频详情页")
  294. self.aliyun_log.logging(
  295. code="1000",
  296. message="点击标题,进入视频详情页",
  297. )
  298. video_url = self.get_video_url(video_title_element)
  299. video_url = get_redirect_url(video_url)
  300. if video_url is None:
  301. self.driver.press_keycode(AndroidKey.BACK)
  302. time.sleep(5)
  303. return
  304. video_dict['video_url'] = video_url
  305. video_dict["platform"] = self.crawler
  306. video_dict["strategy"] = self.log_type
  307. video_dict["out_video_id"] = video_dict["video_id"]
  308. video_dict["crawler_rule"] = json.dumps(self.rule_dict)
  309. video_dict["user_id"] = self.our_uid
  310. video_dict["publish_time"] = video_dict["publish_time_str"]
  311. self.mq.send_msg(video_dict)
  312. self.download_cnt += 1
  313. self.driver.press_keycode(AndroidKey.BACK)
  314. time.sleep(5)
  315. def run():
  316. rule_dict1 = {"period": {"min": 365, "max": 365},
  317. "duration": {"min": 30, "max": 1800},
  318. "favorite_cnt": {"min": 0, "max": 0},
  319. "videos_cnt": {"min": 5000, "max": 0},
  320. "share_cnt": {"min": 0, "max": 0}}
  321. ZhuFuQuanZiRecommend("recommend", "zhufuquanzi", "prod", rule_dict1, [64120158, 64120157, 63676778])
  322. if __name__ == "__main__":
  323. process = multiprocessing.Process(
  324. target=run
  325. )
  326. process.start()
  327. while True:
  328. if not process.is_alive():
  329. print("正在重启")
  330. process.terminate()
  331. time.sleep(60)
  332. os.system("adb forward --remove-all")
  333. process = multiprocessing.Process(target=run)
  334. process.start()
  335. time.sleep(60)